Reputation: 191
I have 6-byte strings of the format cccnnn
where c
is a character A-Z (ASCII 65-90) and n
a character 0-9 (ASCII 48-57). There is a total of 26^3 * 10^3 = 17,576,000 different combinations.
I want to create a perfect hash function that maps each string of this type to an integer index and I want it to be as fast as possible. The function does not have to be minimal, but the range can not be too large. Twice the number of combinations might be okay, but preferably not more than that, because each string will be mapped to a bit in a bit array that is already ~2MB.
The most obvious, and so far best, solution I can think of is to interpret the string as a number in base 26 and 10 and do the required multiplications and subtractions to arrive at an integer in the range [0, 17576000-1]:
/*
 * Map a 6-byte "cccnnn" string (c in 'A'..'Z', n in '0'..'9') to an
 * index in [0, 17576000) by Horner evaluation in mixed radix 26/10.
 * `word` is a project typedef of uint64_t (see surrounding text).
 */
inline word hash1(unsigned char *buffer)
{
    word h = buffer[0];
    h = h * 26 + buffer[1];
    h = h * 26 + buffer[2];
    h = h * 10 + buffer[3];
    h = h * 10 + buffer[4];
    h = h * 10 + buffer[5];
    /* Single bias removes the ASCII offsets in one subtraction:
       45700328 == ((((65*26+65)*26+65)*10+48)*10+48)*10+48 */
    return h - 45700328;
}
Here buffer[0-5]
contains the character indexes, word
is a typedef
of uint64_t
and 45700328 = ((((65*26+65)*26+65)*10+48)*10+48)*10+48
, which converts the characters to the correct base instead of writing (buffer[0] - 65) * 26
etc. (It saves a few subtractions.)
I have thought of ways to improve this. One idea I had is to use the same principle but with bit shifting rather than multiplication. I had to mix around the order of the characters to find a solution with as few operations as possible. I found that multiplication by 260 and 10 only require two shifts and an addition each, (x << 8) + (x << 2)
and (x << 3) + (x << 1)
respectively, and that I could use that to calculate each multiplication separately in the expression ((x2*260+x1)*260+x0)*10+(x4*260+x3)*260+x5-47366978
, where xi = buffer[i]
. The implementation is:
/*
 * Shift-and-add variant of hash1: evaluates
 *   ((x2*260+x1)*260+x0)*10 + (x4*260+x3)*260 + x5 - 47366978
 * using y*260 == (y<<8)+(y<<2) and y*10 == (y<<3)+(y<<1).
 *
 * NOTE: the original snippet was (mis)named hash1, which both clashes
 * with the first function and contradicts the surrounding text that
 * calls it hash2 — renamed accordingly.
 */
inline word hash2(unsigned char *buffer)
{
    word x0 = buffer[0]; word x1 = buffer[1];
    word x2 = buffer[2]; word x3 = buffer[3];
    word x4 = buffer[4]; word x5 = buffer[5];

    word y0 = (x4 << 2) + (x4 << 8) + x3; /* x4*260 + x3            */
    word y1 = (y0 << 2) + (y0 << 8) + x5; /* (x4*260+x3)*260 + x5   */
    word y2 = (x2 << 2) + (x2 << 8) + x1; /* x2*260 + x1            */
    word y3 = (y2 << 2) + (y2 << 8) + x0; /* (x2*260+x1)*260 + x0   */
    word y4 = (y3 << 3) + (y3 << 1) + y1; /* y3*10 + y1             */

    /* Bias folding all the 'A'/'0' ASCII offsets into one constant. */
    return y4 - 47366978;
}
Unfortunately, hash2
is a little bit slower than hash1
. This is where I run out of good ideas. I could of course try making a function that simply shifts the significant bits of each character, stacking them on top of each other, forming a 227 bit number, but that would require a 16MB vector = too large.
So, whether it be using the same principle and changing the code or using an entirely different principle, how can I make my hash function faster according to the requirements I mentioned in the first paragraph?
Upvotes: 1
Views: 1109
Reputation: 154208
A simple method would use the 48-bit string as an integer and then mod by a particular number. Can work with the raw ASCII string. No need to subtract 26 or 10 from each character or even remove the '\n'
. No need for any multiplication. Just 1 %
operation.
/* 8-byte overlay: b[0..5] hold the code, b[6..7] are fixed filler. */
typedef union {
    unsigned char b[8];
    uint64_t u64;
} U;

/*
 * Perfect hash by modular reduction of the raw 8-byte buffer taken as a
 * little-endian uint64_t. Returns a value in [0, 0x1FFCC9A), i.e.
 * 0 to 33,541,273, which is less than 2*26*26*26*10*10*10.
 *
 * Fixes vs. the original snippet:
 *  - the body referenced an undeclared `x`; it now reads from `buf`
 *  - bytes are fetched with memcpy (no strict-aliasing/alignment UB)
 *  - `static` added so a C11 inline definition also links at -O0
 *
 * Requires 8 readable bytes at `buf`; bytes 6 and 7 must be constant
 * across all inputs (e.g. both 0) for the no-collision property.
 */
static inline uint32_t hash3x26_mod(const unsigned char *buf) {
    static const uint32_t mod = 0X1FFCC9A; // Determined by tests, assumes little endian.
    uint64_t u;
    memcpy(&u, buf, sizeof u);
    return (uint32_t) (u % mod);
}
Usage
fgets(&U.b, sizeof U.b, istream);
// Assume U.b[7] == 0
// Assume U.b[6] == 0 or `\n`, consistently
uint32_t perfect_AAA000_hash = hash3x26k_1(&U);
Alternatively, although OP does not want to use a wider index, the below quickly generates a 30-bit non-colliding hash with a *
, >>
, and &
/*
 * 30-bit non-colliding hash: multiply the raw little-endian 64-bit value
 * by a spreading constant (1 + 16 + 16*16 + 16*16*8 + 16*16*8*8 +
 * 16*16*8*8*8 = 149777), shift, and mask to 30 bits.
 *
 * Fixes vs. the original snippet:
 *  - the union declared b[6] yet 8 bytes were read through a cast
 *    pointer (strict-aliasing and possible misalignment UB); the value
 *    is now loaded with memcpy
 *  - `static` added so a C11 inline definition also links at -O0
 *
 * Requires 8 readable bytes at `buf`; bytes 6 and 7 must be constant
 * across all inputs.
 */
static inline size_t hash3x26k_1(const unsigned char *buf) {
    uint64_t u;
    memcpy(&u, buf, sizeof u);
    uint64_t y = (u * (1ull + 16 + 16*16 + 16*16*8 + 16ull*16*8*8 + 16ull*16*8*8*8))
            >> 17;
    return (size_t) (y & 0x3FFFFFFF);
}
I suspect a multiplication by some TBD constant and masking with 0x01FF_FFFF would also work.
Upvotes: 3
Reputation: 154208
Use the 5 least significant bits of the 3 A-Z characters and multiply the digits into a 10-bit product: 2^(15+10) = 33,554,432 < 2 * 17,576,000.
Expect this to be faster if <<
is faster than *
. YMMV
Using a const
pointer allows for optimizations that may not have all ready occurred.
/*
 * 25-bit perfect hash: the 5 low bits of each letter occupy bits
 * 20..24, 15..19 and 10..14, XORed with the 3 digits combined into a
 * 0..999 product in the low 10 bits; the mask keeps 25 bits.
 * Verified perfect by the exhaustive test below (max index 29,229,055).
 *
 * `static` added: a C11 `inline`-only definition in a .c file provides
 * no external definition and fails to link at -O0.
 */
static inline size_t hash3x26k(const unsigned char *buf) {
    return 0x1FFFFFF
            & (((buf[0] << 20) ^ (buf[1] << 15) ^ (buf[2] << 10))
            ^ ((buf[3] * 100 + buf[4] * 10 + buf[5])));
}
Test code to show perfect hash and not more than 2 * 26^3 * 10^3 entries needed.
/* One slot per possible 25-bit hash value; counts collisions. */
unsigned char z[0x1FFFFFF + 1u];

int main() {
    size_t max = 0;
    unsigned char b[7] = { 0 };
    /* Enumerate every AAA000..ZZZ999 code in lexicographic order. */
    for (long k = 0; k < 26L * 26 * 26 * 10 * 10 * 10; k++) {
        long r = k;
        b[5] = (unsigned char) ('0' + r % 10); r /= 10;
        b[4] = (unsigned char) ('0' + r % 10); r /= 10;
        b[3] = (unsigned char) ('0' + r % 10); r /= 10;
        b[2] = (unsigned char) ('A' + r % 26); r /= 26;
        b[1] = (unsigned char) ('A' + r % 26); r /= 26;
        b[0] = (unsigned char) ('A' + r % 26);

        size_t i = hash3x26k(b);
        if (i > max) max = i;
        //printf("%s %zu\n", b, i);
        if (z[i]++) {
            /* A second hit on slot i means the hash is not perfect. */
            printf("%s %zu\n", b, i);
            exit(-1);
        }
    }
    /* Highest index + 1 == number of buckets the bit array needs. */
    printf("%zu\n", max + 1);
    return 0;
}
29,229,056 buckets are needed.
Upvotes: 2
Reputation: 1178
Here's my take on the hashing problem. The approach is to use less intermediate values and more constants, to make it easy for the compiler to optimize the code.
#include <stdio.h>
#include <stdint.h>
/*
 * Mixed-radix (26,26,26,10,10,10) Horner evaluation of a "cccnnn"
 * string; yields an index in [0, 17576000).
 */
uint64_t hash1(unsigned char *buffer)
{
    uint64_t h = buffer[0];          /* widen once, up front */
    h = h * 26 + buffer[1];
    h = h * 26 + buffer[2];
    h = h * 10 + buffer[3];
    h = h * 10 + buffer[4];
    h = h * 10 + buffer[5];
    /* 45700328 == ((((65*26+65)*26+65)*10+48)*10+48)*10+48:
       removes all the ASCII 'A'/'0' offsets in one subtraction. */
    return h - 45700328;
}
/*
 * Same mapping as hash1, written as a flat weighted sum so the
 * compiler sees six independent multiply-adds instead of a
 * dependency chain. 676000 = 26*26*1000, 26000 = 26*1000.
 */
uint64_t hash2(const unsigned char *buffer)
{
    uint64_t res = buffer[0] * 676000;
    res += buffer[1] * 26000;
    res += buffer[2] * 1000;
    res += buffer[3] * 100;
    res += buffer[4] * 10;
    res += buffer[5];
    /* Same ASCII-offset bias as hash1. */
    return res - 45700328u;
}
/*
 * Exhaustively verifies that hash1 and hash2 agree on every one of
 * the 26^3 * 10^3 possible "cccnnn" strings. Returns 1 on the first
 * mismatch, 0 if the two functions are equivalent.
 */
int main(void)
{
    unsigned char buf[7] = { 0 }; /* trailing NUL keeps buf printable */

    for (buf[0] = 'A'; buf[0] <= 'Z'; buf[0]++)
    for (buf[1] = 'A'; buf[1] <= 'Z'; buf[1]++)
    for (buf[2] = 'A'; buf[2] <= 'Z'; buf[2]++)
    for (buf[3] = '0'; buf[3] <= '9'; buf[3]++)
    for (buf[4] = '0'; buf[4] <= '9'; buf[4]++)
    for (buf[5] = '0'; buf[5] <= '9'; buf[5]++) {
        uint64_t h1 = hash1(buf);
        uint64_t h2 = hash2(buf);
        if (h1 != h2) {
            printf("Meh: %s mismatch: %llx %llx\n", (const char *)buf,
                    (unsigned long long)h1, (unsigned long long)h2);
            return 1;
        }
    }
    return 0;
}
Some simple gprofing indicates that hash2() is faster, at least most of the time. The gprof results vary a bit for each run. You may want to experiment yourself.
Upvotes: 1