Reputation: 2582
I have a short string s
(max 8 characters) which I want to search in many strings. Actually I want to search the first occurrence in each string of a stream. Finding the first index of s
has to be as fast as possible for my use-case, because a massive amount of strings per second is processed and the latency is very critical. Of course, machines can be scaled, but a big point is to reduce costs (and latency).
In general I want to create a C (or C++) function which behaves like strstr
, but for a fixed "needle". The needle is not known at compile-time, but only at runtime (at the startup). However, it's okay to generate code at runtime and compile it (or any other "expensive" initialization is fine). Once the needle is known it won't change anymore.
Another detail: The needle will be in almost every string of the input stream. So it's okay if the algorithm is slower for the case that the needle is not available (because that will almost never happen). Also maybe important: The input strings have always extra 64 byte allocated at the end (which might be helpful for SIMD operations).
I was actually surprised that strstr
is already quite fast, but I guess there might be a more optimal algorithm for the case that the needle is does not change?
Thanks a lot
Upvotes: -3
Views: 487
Reputation: 145307
If the needle searched starts with a byte value that is rare in the haystack, a simplistic implementation will beat more complicated alternatives:
char *mystrstr_naive(const char *s, const char *needle) {
char *p = (char *)(uintptr_t)s;
int c = *needle++;
if (c == '\0')
return p;
if (*needle == '\0')
return strchr(p, c);
size_t len = strlen(needle);
while ((p = strchr(p, c)) != NULL) {
p++;
if (!memcmp(p, needle, len))
return p - 1;
}
return NULL;
}
And even faster if the string length is known:
char *mystrstr_naive_len(const char *s, size_t slen, const char *needle) {
char *p = (char *)(uintptr_t)s;
int c = *needle++;
if (c == '\0')
return p;
if (*needle == '\0')
return memchr(p, c, slen);
size_t len = strlen(needle);
if (len < slen) {
char *e = p + slen - len;
while ((p = memchr(p, c, e - p)) != NULL) {
p++;
if (!memcmp(p, needle, len))
return p - 1;
}
}
return NULL;
}
On my system, if the needle searched starts with a byte value that is rare in the haystack, this is 10 to 20 times faster than strstr
and the alternatives presented in my other answer.
To improve performance on specific data, you must study the data carefully. The needle being known in advance is a interesting clue, but some other characteristics might be more fruitful.
Upvotes: 1
Reputation: 145307
If your target handles unaligned reads gracefully, you could use this approach:
#include <stddef.h>
#include <stdint.h>
char *mystrstr8(const char *s, uint64_t str8, uint64_t mask8) {
for (const char *p = s; *p; p++) {
const uint64_t *p64 = (const uint64_t *)(uintptr_t)p;
if ((*p64 & mask8) == str8)
return (char *)(uintptr_t)p;
}
return NULL;
}
If the string is modifiable, has extra slack and its length is provided, you can remove the terminator test:
#include <stddef.h>
#include <stdint.h>
char *mystrstr8_len(char *s, size_t len, uint64_t str8, uint64_t mask8) {
char *end = s + len;
uint64_t *e64 = (uint64_t *)(uintptr_t)end;
uint64_t ee = *e64;
*e64 = str8;
for (const char *p = s;; p++) {
const uint64_t *p64 = (const uint64_t *)(uintptr_t)p;
if ((*p64 & mask8) == str8) {
*e64 = ee;
if (p < end)
return (char *)(uintptr_t)p;
else
return NULL;
}
}
}
str8
and mask8
must be precomputed from the bytes of the needle string and according to the target endianness. For example, to search for Hello
on a little endian machine str8
is 0x6f6c6c6548
and mask8
is 0xffffffffff
.
For short strings, this simplistic brute force approach might perform better than using a tailored Boyer Moore implementation, depending on your specific data: array and needle lengths and contents... You can start by comparing the performance with that of your standard library's strstr
function.
Here is a benchmark for various string lengths:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
char *mystrstr8(const char *s, uint64_t str8, uint64_t mask8) {
for (const char *p = s; *p; p++) {
const uint64_t *p64 = (const uint64_t *)(uintptr_t)p;
if ((*p64 & mask8) == str8)
return (char *)(uintptr_t)p;
}
return NULL;
}
char *mystrstr8_8(const char *s, uint64_t str8) {
for (const char *p = s; *p; p++) {
const uint64_t *p64 = (const uint64_t *)(uintptr_t)p;
if (*p64 == str8)
return (char *)(uintptr_t)p;
}
return NULL;
}
char *mystrstr8_len(char *s, size_t len, uint64_t str8, uint64_t mask8) {
char *end = s + len;
uint64_t *e64 = (uint64_t *)(uintptr_t)end;
uint64_t ee = *e64;
*e64 = str8;
for (char *p = s;; p++) {
uint64_t *p64 = (uint64_t *)(uintptr_t)p;
if ((*p64 & mask8) == str8) {
*e64 = ee;
if (p < end)
return p;
else
return NULL;
}
}
}
char *mystrstr8_len8(char *s, size_t len, uint64_t str8) {
char *end = s + len;
uint64_t *e64 = (uint64_t *)(uintptr_t)end;
uint64_t ee = *e64;
*e64 = str8;
for (char *p = s;; p++) {
uint64_t *p64 = (uint64_t *)(uintptr_t)p;
if (*p64 == str8) {
*e64 = ee;
if (p < end)
return p;
else
return NULL;
}
}
}
int benchmark(int len, const char *needle, char *a) {
char buf[9] = { 0 };
strncat(buf, needle, 8);
int needle_len = strlen(buf);
uint64_t mask8 = needle_len ? 0xFFFFFFFFFFFFFFFF >> (64 - needle_len * 8) : 0;
uint64_t str8;
memcpy(&str8, buf, 8);
memset(a, 'x', len);
a[len] = '\0';
int pos = len - needle_len;
if (pos >= 0 && pos <= len - needle_len)
memcpy(a + pos, needle, needle_len);
clock_t c;
long c1, c2, c3;
long b1 = 1000000, b2 = 1000000, b3 = 1000000;
long n1 = 0, n2 = 0, n3 = 0;
int rep = 100000 / len;
rep += rep == 0;
int res = 0;
void *p1[rep], *p2[rep], *p3[rep];
while (n1 < 10000) {
c = clock();
for (int i = 0; i < rep; i++)
p1[i] = strstr(a, needle);
c1 = clock() - c;
if (needle_len == 8) {
c = clock();
for (int i = 0; i < rep; i++)
p2[i] = mystrstr8_8(a, str8);
c2 = clock() - c;
c = clock();
for (int i = 0; i < rep; i++)
p3[i] = mystrstr8_len8(a, len, str8);
c3 = clock() - c;
} else {
c = clock();
for (int i = 0; i < rep; i++)
p2[i] = mystrstr8(a, str8, mask8);
c2 = clock() - c;
c = clock();
for (int i = 0; i < rep; i++)
p3[i] = mystrstr8_len(a, len, str8, mask8);
c3 = clock() - c;
}
n1 += c1;
n2 += c2;
n3 += c3;
b1 -= (b1 - c1) * (b1 > c1);
b2 -= (b2 - c2) * (b2 > c2);
b3 -= (b3 - c3) * (b3 > c3);
res = (p1[rep - 1] != p2[rep - 1] || p1[rep - 1] != p3[rep - 1]);
}
if (p2[0] != p1[0]) {
printf("bench(%d, '%s'): mystrstr8 failure: %p, expected %p\n",
len, needle, p2[0], p1[0]);
}
if (p3[0] != p1[0]) {
printf("bench(%d, '%s'): mystrstr8_len failure: %p, expected %p\n",
len, needle, p3[0], p1[0]);
}
if (res == 0) {
printf("%-8d %-8s %13.3f %13.3f %13.3f\n", len, needle,
(double)b1 / rep, (double)b2 / rep, (double)b3 / rep);
}
return res;
}
#define MAX_LEN 1000000
int main(int argc, char *argv[]) {
char *a = malloc(MAX_LEN + 8);
// ensure full output is buffered
setvbuf(stdout, NULL, _IOFBF, 16384);
printf("%-8s %-8s %13s %13s %13s\n",
"len", "needle", "strstr", "mystrstr8", "mystrstr8_len");
for (int len = 10; len <= MAX_LEN; len *= 10) {
benchmark(len, "a", a);
benchmark(len, "ab", a);
benchmark(len, "abc", a);
benchmark(len, "abcd", a);
benchmark(len, "abcde", a);
benchmark(len, "abcdef", a);
benchmark(len, "abcdefg", a);
benchmark(len, "abcdefgh", a);
}
free(a);
return 0;
}
Here are the results on my 2015 Mac x86_64 laptop:
len needle strstr mystrstr8 mystrstr8_len
10 a 0.013 0.005 0.008
10 ab 0.013 0.005 0.008
10 abc 0.014 0.005 0.008
10 abcd 0.013 0.004 0.007
10 abcde 0.013 0.004 0.007
10 abcdef 0.013 0.003 0.007
10 abcdefg 0.012 0.003 0.007
10 abcdefgh 0.012 0.002 0.002
100 a 0.076 0.057 0.046
100 ab 0.076 0.056 0.045
100 abc 0.077 0.056 0.045
100 abcd 0.076 0.055 0.044
100 abcde 0.077 0.055 0.044
100 abcdef 0.076 0.054 0.044
100 abcdefg 0.076 0.054 0.043
100 abcdefgh 0.076 0.045 0.040
1000 a 0.610 0.480 0.410
1000 ab 0.610 0.470 0.410
1000 abc 0.610 0.480 0.410
1000 abcd 0.610 0.480 0.410
1000 abcde 0.610 0.470 0.400
1000 abcdef 0.610 0.470 0.410
1000 abcdefg 0.610 0.470 0.400
1000 abcdefgh 0.610 0.400 0.370
10000 a 5.900 4.800 4.100
10000 ab 5.900 4.800 4.100
10000 abc 5.900 4.800 4.100
10000 abcd 5.900 4.800 4.100
10000 abcde 5.900 4.800 4.100
10000 abcdef 5.900 4.800 4.100
10000 abcdefg 5.900 4.800 4.100
10000 abcdefgh 5.900 4.000 3.800
100000 a 59.000 50.000 41.000
100000 ab 59.000 49.000 41.000
100000 abc 59.000 49.000 41.000
100000 abcd 59.000 49.000 41.000
100000 abcde 59.000 49.000 41.000
100000 abcdef 59.000 49.000 41.000
100000 abcdefg 59.000 50.000 41.000
100000 abcdefgh 59.000 40.000 39.000
1000000 a 593.000 493.000 415.000
1000000 ab 589.000 472.000 415.000
1000000 abc 592.000 496.000 413.000
1000000 abcd 590.000 496.000 416.000
1000000 abcde 589.000 495.000 415.000
1000000 abcdef 589.000 495.000 416.000
1000000 abcdefg 589.000 495.000 417.000
1000000 abcdefgh 589.000 406.000 385.000
This hack consistently improves performance by 15 to 30% on long strings and even more on shorter ones. I made a special case of 8 byte needles that could be adapted for 1, 2 and 4 byte needles too.
Upvotes: 1