Reputation: 2293
I'm working on a hash table in the C language and I'm testing a hash function for strings.
The first function I've tried is to add up the ASCII codes and use modulo (% 100), but I got poor results with the first test of data: 40 collisions for 130 words.
The final input data will contain 8,000 words (it's a dictionary stored in a file). The hash table is declared as int table[10000] and contains the position of the word in a .txt file.
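For reference, the additive hash described above looks roughly like this (my reconstruction of the question's approach, not the exact code):
/* Reconstruction of the additive hash the question describes:
   sum the ASCII codes of the characters, then reduce modulo 100. */
unsigned naive_hash(const char *s)
{
    unsigned sum = 0;
    while (*s)
        sum += (unsigned char)*s++;
    return sum % 100;  /* only 100 possible buckets, so collisions are frequent */
}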
Upvotes: 175
Views: 376516
Reputation: 4209
I want to summarize it all for newbies to C like me. According to Andriy Makukha's precise measurements, MurmurHash3 is the best.
A decent C port can be found in murmurhash.c.
Upvotes: 0
Reputation: 8334
I wanted to verify Xiaoning Bian's answer, but unfortunately he didn't post his code. So I implemented a little test suite and ran different little hashing functions on the list of 466K English words to see number of collisions for each:
Hash function | Collisions | Time (words) | Time (file) | Avalanching
===============================================================================
CRC32 | 23 (0.005%) | 96.1 ms | 120.9 ms | 99.8%
MurmurOAAT | 26 (0.006%) | 17.2 ms | 19.4 ms | 100.0%
FNV hash | 32 (0.007%) | 16.4 ms | 14.0 ms | 98.6%
Jenkins OAAT | 36 (0.008%) | 19.9 ms | 18.5 ms | 100.0%
DJB2 hash | 344 (0.074%) | 18.2 ms | 12.7 ms | 92.9%
K&R V2 | 356 (0.076%) | 18.0 ms | 11.9 ms | 92.8%
Coffin | 763 (0.164%) | 16.3 ms | 11.7 ms | 91.4%
x17 hash | 2242 (0.481%) | 18.6 ms | 20.0 ms | 91.5%
-------------------------------------------------------------------------------
large_prime | 25 (0.005%) | 16.5 ms | 18.6 ms | 96.6%
MurmurHash3_x86_32 | 19 (0.004%) | 20.5 ms | 4.4 ms | 100.0%
I included times for both hashing all words individually and hashing the entire file of all English words once. I also included the more complex MurmurHash3_x86_32 in my test for reference. Additionally, I calculated "avalanching", which is a measure of how unpredictable the hashes of consecutive words are. Anything below 100% means "quite predictable" (which might be fine for hash tables, but is bad for other uses).
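One plausible way to compute such an avalanching score is to average the Hamming distance between consecutive hashes; this is my reading of the metric, not necessarily the exact formula behind the table:
#include <stdint.h>
#include <stddef.h>

/* Sketch: average bits flipped between consecutive 32-bit hashes (n >= 2).
   A perfectly unpredictable hash flips ~16 of 32 bits on average (= 100%). */
double avalanche_pct(const uint32_t *hashes, size_t n)
{
    uint64_t flipped = 0;
    for (size_t i = 1; i < n; i++)
        flipped += __builtin_popcount(hashes[i - 1] ^ hashes[i]); /* GCC/Clang builtin */
    double pct = 100.0 * ((double)flipped / (n - 1)) / 16.0;
    return pct > 100.0 ? 100.0 : pct;
}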
Conclusions:
- DJB2 performs surprisingly badly; examples of its collisions on this word list are Liz/MHz, Bon/COM and Rey/SEX.
- Multiplying by a large prime instead, as in hash = 17000069 * hash + s[i], produces only 25 collisions compared to DJB2's 344 collisions (see the large_prime row above).
Test code (compiled with gcc -O2):
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#define MODE 1 // 1 = hash each word individually; 2 = hash the entire file
#define MAXLEN 50 // maximum length of a word
#define MAXWORDS 500000 // maximum number of words in a file
#define SEED 0x12345678
#if MODE == 1
char words[MAXWORDS][MAXLEN];
#else
char file[MAXWORDS * MAXLEN];
#endif
uint32_t DJB2_hash(const uint8_t *str)
{
uint32_t hash = 5381;
uint8_t c;
while ((c = *str++))
hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
return hash;
}
uint32_t FNV(const void* key, uint32_t h)
{
// See: https://github.com/aappleby/smhasher/blob/master/src/Hashes.cpp
h ^= 2166136261UL;
const uint8_t* data = (const uint8_t*)key;
for(int i = 0; data[i] != '\0'; i++)
{
h ^= data[i];
h *= 16777619;
}
return h;
}
uint32_t MurmurOAAT_32(const char* str, uint32_t h)
{
// One-byte-at-a-time hash based on Murmur's mix
// Source: https://github.com/aappleby/smhasher/blob/master/src/Hashes.cpp
for (; *str; ++str) {
h ^= *str;
h *= 0x5bd1e995;
h ^= h >> 15;
}
return h;
}
uint32_t KR_v2_hash(const char *s)
{
// Source: https://stackoverflow.com/a/45641002/5407270
// a.k.a. Java String hashCode()
uint32_t hashval = 0;
for (hashval = 0; *s != '\0'; s++)
hashval = *s + 31*hashval;
return hashval;
}
uint32_t Jenkins_one_at_a_time_hash(const char *str)
{
uint32_t hash, i;
for (hash = i = 0; str[i] != '\0'; ++i)
{
hash += str[i];
hash += (hash << 10);
hash ^= (hash >> 6);
}
hash += (hash << 3);
hash ^= (hash >> 11);
hash += (hash << 15);
return hash;
}
uint32_t crc32b(const uint8_t *str) {
// Source: https://stackoverflow.com/a/21001712
unsigned int byte, crc, mask;
int i = 0, j;
crc = 0xFFFFFFFF;
while (str[i] != 0) {
byte = str[i];
crc = crc ^ byte;
for (j = 7; j >= 0; j--) {
mask = -(crc & 1);
crc = (crc >> 1) ^ (0xEDB88320 & mask);
}
i = i + 1;
}
return ~crc;
}
static inline uint32_t _rotl32(uint32_t x, int32_t bits)
{
return x<<bits | x>>(32-bits); // C idiom: compilers optimize this to a single rotate instruction
}
uint32_t Coffin_hash(char const *input) {
// Source: https://stackoverflow.com/a/7666668/5407270
uint32_t result = 0x55555555;
while (*input) {
result ^= *input++;
result = _rotl32(result, 5);
}
return result;
}
uint32_t x17(const void * key, uint32_t h)
{
// See: https://github.com/aappleby/smhasher/blob/master/src/Hashes.cpp
const uint8_t * data = (const uint8_t*)key;
for (int i = 0; data[i] != '\0'; ++i)
{
h = 17 * h + (data[i] - ' ');
}
return h ^ (h >> 16);
}
uint32_t large_prime(const uint8_t *s, uint32_t hash)
{
// Better alternative to x33: multiply by a large co-prime instead of a small one
while (*s != '\0')
hash = 17000069*hash + *s++;
return hash;
}
void readfile(FILE* fp, char* buffer) {
fseek(fp, 0, SEEK_END);
size_t size = ftell(fp);
rewind(fp);
fread(buffer, size, 1, fp);
buffer[size] = '\0';
}
struct timeval stop, start;
void timing_start() {
gettimeofday(&start, NULL);
}
void timing_end() {
gettimeofday(&stop, NULL);
printf("took %lu us\n", (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec);
}
uint32_t apply_hash(int hash, const char* line)
{
uint32_t result = 0; // initialized so an unknown algorithm number doesn't return an indeterminate value
#if MODE == 2
timing_start();
#endif
switch (hash) {
case 1: result = crc32b((const uint8_t*)line); break;
case 2: result = large_prime((const uint8_t *)line, SEED); break;
case 3: result = MurmurOAAT_32(line, SEED); break;
case 4: result = FNV(line, SEED); break;
case 5: result = Jenkins_one_at_a_time_hash(line); break;
case 6: result = DJB2_hash((const uint8_t*)line); break;
case 7: result = KR_v2_hash(line); break;
case 8: result = Coffin_hash(line); break;
case 9: result = x17(line, SEED); break;
default: break;
}
#if MODE == 2
timing_end();
#endif
return result;
}
int main(int argc, char* argv[])
{
// Read arguments
if (argc < 3) {
printf("Usage: %s <algorithm number 1-9> <file name>\n", argv[0]);
return 1;
}
const int hash_choice = atoi(argv[1]);
char const* const fname = argv[2];
printf("Algorithm: %d\n", hash_choice);
printf("File name: %s\n", fname);
// Open file
FILE* f = fopen(fname, "r");
if (!f) {
printf("Cannot open file: %s\n", fname);
return 1;
}
#if MODE == 1
char line[MAXLEN];
size_t word_count = 0;
uint32_t hashes[MAXWORDS];
// Read file line by line
while (fgets(line, sizeof(line), f)) {
line[strcspn(line, "\n")] = '\0'; // strip newline
strcpy(words[word_count], line);
word_count++;
}
fclose(f);
// Calculate hashes
timing_start();
for (size_t i = 0; i < word_count; i++) {
hashes[i] = apply_hash(hash_choice, words[i]);
}
timing_end();
// Output hashes
for (size_t i = 0; i < word_count; i++) {
printf("%08x\n", hashes[i]);
}
#else
// Read entire file
readfile(f, file);
fclose(f);
printf("Len: %lu\n", strlen(file));
// Calculate hash
uint32_t hash = apply_hash(hash_choice, file);
printf("%08x\n", hash);
#endif
return 0;
}
P.S. A more comprehensive review of the speed and quality of modern hash functions can be found in the SMHasher repository of Reini Urban (rurban). Notice the "Quality problems" column in the table.
Upvotes: 35
Reputation: 53105
djb2 is good. Though djb2, as presented on Stack Overflow by cnicutar, is almost certainly better, I think it's worth showing the K&R hashes too. The K&R 1st edition hash simply sums the bytes, and is truly terrible:
unsigned long hash(unsigned char *str)
{
unsigned int hash = 0;
int c;
while (c = *str++)
hash += c;
return hash;
}
The K&R 2nd edition hash is much better. You can omit the % HASHSIZE from the return statement if you plan on doing the modulus sizing-to-your-array-length outside the hash algorithm. Also, I recommend making the return and "hashval" types unsigned long, or even better uint32_t or uint64_t, instead of plain unsigned (int). This is a simple algorithm which takes into account the byte order of each byte in the string by doing this style of algorithm: hashvalue = new_byte + 31*hashvalue, for all bytes in the string:
unsigned hash(char *s)
{
unsigned hashval;
for (hashval = 0; *s != '\0'; s++)
hashval = *s + 31*hashval;
return hashval % HASHSIZE;
}
Note that it's clear from the two algorithms that one reason the 1st edition hash is so terrible is that it does NOT take string character order into account, so hash("ab") returns the same value as hash("ba"). This is not so with the 2nd edition hash, however, which (much better!) returns two different values for those strings.
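A quick side-by-side check makes the difference concrete (my own illustration, with the two functions renamed since both are called hash above):
#include <stdio.h>

/* Renamed copies of the two K&R hashes, for a side-by-side comparison. */
unsigned long kr1(const unsigned char *s) { unsigned long h = 0; while (*s) h += *s++; return h; }
unsigned kr2(const char *s) { unsigned h = 0; for (; *s; s++) h = *s + 31*h; return h; }

int main(void)
{
    printf("kr1: ab=%lu ba=%lu\n", kr1((const unsigned char *)"ab"),
                                   kr1((const unsigned char *)"ba")); /* both 195: collision */
    printf("kr2: ab=%u ba=%u\n", kr2("ab"), kr2("ba"));               /* 3105 vs 3135: distinct */
    return 0;
}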
std::unordered_map<>
template container hash table is excellent.The GCC C++11 hashing functions used for unordered_map
(a hash table template) and unordered_set
(a hash set template) appear to be as follows.
Code:
// Implementation of Murmur hash for 32-bit size_t.
size_t _Hash_bytes(const void* ptr, size_t len, size_t seed)
{
const size_t m = 0x5bd1e995;
size_t hash = seed ^ len;
const char* buf = static_cast<const char*>(ptr);
// Mix 4 bytes at a time into the hash.
while (len >= 4)
{
size_t k = unaligned_load(buf);
k *= m;
k ^= k >> 24;
k *= m;
hash *= m;
hash ^= k;
buf += 4;
len -= 4;
}
// Handle the last few bytes of the input array.
switch (len)
{
case 3:
hash ^= static_cast<unsigned char>(buf[2]) << 16;
[[gnu::fallthrough]];
case 2:
hash ^= static_cast<unsigned char>(buf[1]) << 8;
[[gnu::fallthrough]];
case 1:
hash ^= static_cast<unsigned char>(buf[0]);
hash *= m;
};
// Do a few final mixes of the hash.
hash ^= hash >> 13;
hash *= m;
hash ^= hash >> 15;
return hash;
}
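The quoted code relies on an internal unaligned_load helper; in libstdc++ it is defined along these lines (a memcpy-based sketch, shown here so the snippet is self-contained):
#include <stddef.h>
#include <string.h>

/* Sketch of the unaligned_load helper used above: read a size_t's worth of
   bytes from a possibly unaligned address. memcpy avoids alignment faults
   and compiles down to a single load on common targets. */
static size_t unaligned_load(const char *p)
{
    size_t result;
    memcpy(&result, p, sizeof(result));
    return result;
}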
std::unordered_map<>
hash used above.Not only is is the best of all of these, but Austin released MurmerHash3 into the public domain. See my other answer on this here: What is the default hash function used in C++ std::unordered_map?.
Upvotes: 12
Reputation: 29
I have tried these hash functions and got the following result. I have about 960^3 entries, each 64 bytes long, with the same 64 characters in different orders, and a 32-bit hash value. The code is from here.
Hash function    | Collision rate | Time to finish
==================================================
MurmurHash3      | 6.?%           | 4m15s
Jenkins One..    | 6.1%           | 6m54s
Bob, 1st in link | 6.16%          | 5m34s
SuperFastHash    | 10%            | 4m58s
bernstein        | 20%            | 14s (only finished 1/20)
one_at_a_time    | 6.16%          | 7m5s
crc              | 6.16%          | 7m56s
One strange thing is that almost all the hash functions have a 6% collision rate on my data.
Upvotes: 2
Reputation: 1741
djb2 has 317 collisions for this 466k English dictionary, while MurmurHash has none for 64-bit hashes and 21 for 32-bit hashes (around 25 is to be expected for 466k random 32-bit hashes). My recommendation is to use MurmurHash if available; it is very fast, because it takes in several bytes at a time. But if you need a simple and short hash function to copy and paste into your project, I'd recommend using Murmur's one-byte-at-a-time version:
static inline uint32_t MurmurOAAT32(const char *key)
{
uint32_t h = 3323198485ul;
for (; *key; ++key) {
h ^= *key;
h *= 0x5bd1e995;
h ^= h >> 15;
}
return h;
}
static inline uint64_t MurmurOAAT64(const char *key)
{
uint64_t h = 525201411107845655ull;
for (; *key; ++key) {
h ^= *key;
h *= 0x5bd1e9955bd1e995ULL;
h ^= h >> 47;
}
return h;
}
The optimal size of a hash table is, in short, as large as possible while still fitting into memory. Because we don't usually know or want to look up how much memory is available, and it might even change, the optimal hash table size is roughly 2x the expected number of elements to be stored in the table. Allocating much more than that will make your hash table faster, but at rapidly diminishing returns; making your hash table smaller than that will make it exponentially slower. This is because there is a non-linear trade-off between space and time complexity for hash tables, with a suggested optimal load factor of 2 - sqrt(2) ≈ 0.586.
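As a worked example of that rule of thumb (my own illustration): for the 8,000-word dictionary from the question, 2x gives 16,000, which rounds up to a 16,384-slot power-of-two table:
#include <stddef.h>

/* Sketch: size a table at ~2x the expected element count,
   rounded up to a power of two so indexing can use a bit-mask. */
size_t table_size(size_t expected_items)
{
    size_t size = 1;
    while (size < 2 * expected_items)
        size <<= 1;
    return size; /* e.g. 8000 items -> 16384 slots (load factor ~0.49) */
}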
Upvotes: 10
Reputation: 375
One thing I've used with good results is the following (I don't know if it's mentioned already because I can't remember its name).
You precompute a table T with a random number for each character in your key's alphabet [0,255]. You hash your key 'k0 k1 k2 ... kN' by taking T[k0] xor T[k1] xor ... xor T[kN]. You can easily show that this is as random as your random number generator, and it's computationally very feasible. If you really run into a very bad instance with lots of collisions, you can just repeat the whole thing using a fresh batch of random numbers.
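This scheme is known as tabulation (or Zobrist) hashing; a minimal sketch, with rand() standing in for any decent random source:
#include <stdint.h>
#include <stdlib.h>

static uint32_t T[256]; /* one random word per possible byte value */

void init_table(unsigned seed)
{
    srand(seed);
    for (int i = 0; i < 256; i++)
        T[i] = ((uint32_t)rand() << 16) ^ (uint32_t)rand();
}

uint32_t tabulation_hash(const unsigned char *key)
{
    uint32_t h = 0;
    for (; *key; key++)
        h ^= T[*key]; /* XOR one table entry per character */
    return h;
}
Note that XOR alone ignores character order (and repeated characters cancel out), so practical variants use position-dependent tables T[pos][c].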
Upvotes: 0
Reputation: 101149
There are a number of existing hash table implementations for C, from hcreate/hdestroy/hsearch in the POSIX standard library, to those in APR and glib, which also provide prebuilt hash functions. I'd highly recommend using those rather than inventing your own hash table or hash function; they've been optimized heavily for common use cases.
If your dataset is static, however, your best solution is probably to use a perfect hash. gperf will generate a perfect hash for you for a given dataset.
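As an illustration of the first suggestion, the POSIX hsearch family can be used like this (a minimal sketch storing a file position per word; error handling mostly omitted):
#include <stdio.h>
#include <search.h> /* POSIX hcreate/hsearch/hdestroy */

int main(void)
{
    if (!hcreate(20000)) /* roughly 2x the question's 8000 words */
        return 1;

    ENTRY item = { .key = "hello", .data = (void *)(long)1234 };
    hsearch(item, ENTER); /* insert word -> file position */

    ENTRY probe = { .key = "hello" };
    ENTRY *found = hsearch(probe, FIND);
    if (found)
        printf("position: %ld\n", (long)found->data);

    hdestroy();
    return 0;
}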
Upvotes: 9
Reputation: 490623
First, you generally do not want to use a cryptographic hash for a hash table. An algorithm that's very fast by cryptographic standards is still excruciatingly slow by hash table standards.
Second, you want to ensure that every bit of the input can/will affect the result. One easy way to do that is to rotate the current result by some number of bits, then XOR the current hash code with the current byte. Repeat until you reach the end of the string. Note that you generally do not want the rotation to be an even multiple of the byte size either.
For example, assuming the common case of 8 bit bytes, you might rotate by 5 bits:
// C has no standard rotate operator, so define rol here.
static int rol(int value, int bits) {
unsigned u = (unsigned)value;
return (int)((u << bits) | (u >> (32 - bits)));
}
int hash(char const *input) {
int result = 0x55555555;
while (*input) {
result ^= *input++;
result = rol(result, 5);
}
return result;
}
Edit: Also note that 10000 slots is rarely a good choice for a hash table size. You usually want one of two things: you either want a prime number as the size (required to ensure correctness with some types of hash resolution) or else a power of 2 (so reducing the value to the correct range can be done with a simple bit-mask).
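To make the last point concrete, here's a small sketch (my own illustration) of the two reduction styles:
#define PRIME_SIZE 10007 /* a prime near the question's 10000 */
#define POW2_SIZE  16384 /* 2^14 */

/* Reducing a 32-bit hash to a table index: */
unsigned index_prime(unsigned hash) { return hash % PRIME_SIZE; }      /* works for any size */
unsigned index_pow2(unsigned hash)  { return hash & (POW2_SIZE - 1); } /* single AND, power-of-2 sizes only */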
Upvotes: 35
Reputation: 4805
Wikipedia shows a nice string hash function called Jenkins One At A Time Hash. It also quotes improved versions of this hash.
uint32_t jenkins_one_at_a_time_hash(char *key, size_t len)
{
uint32_t hash, i;
for(hash = i = 0; i < len; ++i)
{
hash += key[i];
hash += (hash << 10);
hash ^= (hash >> 6);
}
hash += (hash << 3);
hash ^= (hash >> 11);
hash += (hash << 15);
return hash;
}
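Since the function takes an explicit length, a typical call on a C string computes it with strlen first (my usage sketch, assuming the function above is in scope):
#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Usage sketch: hash a NUL-terminated word, then reduce to a table slot. */
int main(void)
{
    char word[] = "dictionary";
    uint32_t h = jenkins_one_at_a_time_hash(word, strlen(word));
    printf("hash = %08x, slot = %u\n", h, h % 10000u);
    return 0;
}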
Upvotes: 11
Reputation: 80335
First, is 40 collisions for 130 words hashed to 0..99 bad? You can't expect perfect hashing if you are not taking steps specifically for it to happen. An ordinary hash function won't have fewer collisions than a random generator most of the time, and 130 random values thrown into 100 buckets already produce about 130 - 100*(1 - 0.99^130) ≈ 57 collisions on average, so 40 is actually on the good side.
A hash function with a good reputation is MurmurHash3.
Finally, regarding the size of the hash table, it really depends on what kind of hash table you have in mind, especially whether buckets are extensible or one-slot. If buckets are extensible, there is again a choice: you choose the average bucket length for the memory/speed constraints that you have; see the sketch below.
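For the question's use case, an extensible-bucket (chained) table entry might look like this (a sketch, not code from the answer):
/* Sketch: chained buckets for the dictionary described in the question. */
struct entry {
    char *word;          /* the dictionary word */
    long  file_pos;      /* its position in the .txt file */
    struct entry *next;  /* next entry in this bucket's chain */
};
struct entry *table[10000]; /* one chain head per slot */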
Upvotes: 2