Reputation: 3741
In some Python code I'm writing, I need to count the number of occurrences of any of a set of characters in a string. In other words, I need to count the total occurrences of characters [c1, c2, c3,...,cn] in a string.
In C, there is a function called strpbrk() that can be used to do this, often implemented with special instructions on x86 processors to make it faster.
In Python, I have written the following code, but it is the slowest part of my application.
haystack = <query string>
gc_characters = 0
for c in ['c', 'C', 'g', 'G']:
    gc_characters += haystack.count(c)
Is there a faster way to do this?
Upvotes: 2
Views: 664
Reputation: 169378
I may have gone a little overboard here, but the tl;dr is that the original code is actually faster than (EDIT: macOS's) strpbrk(), though some strpbrk() implementations may be faster!
str.count() uses a bundle of strange and beautiful magic in its guts (CPython's stringlib fast search) – no wonder it's fast.
The full code is available at https://github.com/akx/so55822235
These approaches are in pure Python, including the OP's original:
from collections import Counter

def gc_characters_original(haystack):
    gc_characters = 0
    for c in ("c", "C", "g", "G"):
        gc_characters += haystack.count(c)
    return gc_characters

def gc_characters_counter(haystack):
    counter = Counter(haystack)
    return sum(counter.get(c, 0) for c in ["c", "C", "g", "G"])

def gc_characters_manual(haystack):
    gc_characters = 0
    for x in haystack:
        if x in ("c", "C", "g", "G"):
            gc_characters += 1
    return gc_characters

def gc_characters_iters(haystack):
    gc_characters = haystack.count("c") + haystack.count("C") + haystack.count("g") + haystack.count("G")
    return gc_characters
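As a quick sanity check, all of these variants should agree on the same input; a minimal sketch (the sample string here is purely illustrative, not part of the benchmark):
# Quick sanity check: every pure-Python variant should give the same answer.
sample = "GATTACA gattaca CCGG xyz"               # illustrative sample
expected = sum(sample.count(c) for c in "cCgG")   # 8
assert gc_characters_original(sample) == expected
assert gc_characters_counter(sample) == expected
assert gc_characters_manual(sample) == expected
assert gc_characters_iters(sample) == expected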
A Cython module wrapping strpbrk():
from libc.string cimport strpbrk

cdef int _count(char* s, char *key):
    assert s is not NULL, "byte string value is NULL"
    cdef int n = 0
    cdef char* pch = strpbrk(s, key)
    while pch is not NULL:
        n += 1
        pch = strpbrk(pch + 1, key)
    return n

def count(s, key):
    return _count(s, key)
...
def gc_characters_cython(haystack_bytes):
    return charcount_cython.count(haystack_bytes, b"cCgG")
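To build the Cython module yourself, a setup script along these lines should work; this is only a sketch, and the file name charcount_cython.pyx plus the setuptools/Cython invocation are assumptions (the linked repository contains the actual build setup):
# setup.py (illustrative sketch; assumes the Cython source is named charcount_cython.pyx)
from setuptools import setup
from Cython.Build import cythonize

setup(name="charcount_cython", ext_modules=cythonize("charcount_cython.pyx"))
Building it in place with python setup.py build_ext --inplace makes it importable as charcount_cython.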
A hand-written C extension wrapping strpbrk():
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <string.h>

static unsigned int count(const char *str, const char *key) {
  unsigned int n = 0;
  char *pch = strpbrk(str, key);
  while (pch != NULL) {
    n++;
    pch = strpbrk(pch + 1, key);
  }
  return n;
}

static PyObject *charcount_count(PyObject *self, PyObject *args) {
  const char *str, *key;
  Py_ssize_t strl, keyl;
  if (!PyArg_ParseTuple(args, "s#s#", &str, &strl, &key, &keyl)) {
    PyErr_SetString(PyExc_RuntimeError, "invalid arguments");
    return NULL;
  }
  int n = count(str, key);
  return PyLong_FromLong(n);
}

static PyMethodDef CharCountMethods[] = {
    {"count", charcount_count, METH_VARARGS,
     "Count the total occurrences of any s2 characters in s1"},
    {NULL, NULL, 0, NULL},
};

static struct PyModuleDef spammodule = {PyModuleDef_HEAD_INIT, "charcount",
                                        NULL, -1, CharCountMethods};

PyMODINIT_FUNC PyInit_charcount(void) { return PyModule_Create(&spammodule); }
...
def gc_characters_cext_b(haystack_bytes):
    return charcount.count(haystack_bytes, b"cCgG")

def gc_characters_cext_u(haystack):
    return charcount.count(haystack, "cCgG")
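Once the extension is compiled (the module name charcount comes from the PyModuleDef above), a quick correctness check against plain str.count could look like this; the sample string is again just illustrative:
# Correctness check for the C extension; "s#" accepts both str and bytes-like objects.
import charcount

sample = "GATTACA gattaca CCGG xyz"               # illustrative sample
expected = sum(sample.count(c) for c in "cCgG")
assert charcount.count(sample, "cCgG") == expected
assert charcount.count(sample.encode(), b"cCgG") == expected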
On my Mac, counting cCgG in a one-million-character string of random letters, i.e.
haystack = "".join(random.choice(string.ascii_letters) for x in range(1_000_000))
haystack_bytes = haystack.encode()
print("original", timeit.timeit(lambda: gc_characters_original(haystack), number=100))
print("unrolled", timeit.timeit(lambda: gc_characters_iters(haystack), number=100))
print("cython", timeit.timeit(lambda: gc_characters_cython(haystack_bytes), number=100))
print("c extension, bytes", timeit.timeit(lambda: gc_characters_cext_b(haystack_bytes), number=100))
print("c extension, unicode", timeit.timeit(lambda: gc_characters_cext_u(haystack), number=100))
print("manual loop", timeit.timeit(lambda: gc_characters_manual(haystack), number=100))
print("counter", timeit.timeit(lambda: gc_characters_counter(haystack), number=100))
yields these results:
original 0.34033612700000004
unrolled 0.33661798900000006
cython 0.6542106270000001
c extension, bytes 0.46668797900000003
c extension, unicode 0.4761082090000004
manual loop 11.625232557
counter 7.0389275090000005
So, unless the strpbrk() implementation in my Mac's libc is horribly underpowered (EDIT: which it is), it's best to just use .count().
I added glibc's strcspn()/strpbrk() and it's startlingly faster than the naïve version of strpbrk() shipped with macOS:
original 0.329256
unrolled 0.333872
cython 0.433299
c extension, bytes 0.432552
c extension, unicode 0.437332
c extension glibc, bytes 0.169704 <-- new
c extension glibc, unicode 0.158153 <-- new
glibc also has SSE2 and SSE4 versions of these functions, which would probably be faster still.
I went back to this one more time since I had an epiphany about how the clever lookup table from glibc's strcspn() could be used for character counting:
size_t fastcharcount(const char *str, const char *haystack) {
  size_t count = 0;

  // Prepare lookup table.
  // It will contain 1 for all characters in the haystack.
  unsigned char table[256] = {0};
  unsigned char *ts = (unsigned char *)haystack;
  while (*ts) table[*ts++] = 1;

  unsigned char *s = (unsigned char *)str;
#define CHECK_CHAR(i) { if (!s[i]) break; count += table[s[i]]; }
  for (;;) {
    CHECK_CHAR(0);
    CHECK_CHAR(1);
    CHECK_CHAR(2);
    CHECK_CHAR(3);
    s += 4;
  }
#undef CHECK_CHAR
  return count;
}
The results are very impressive, outperforming the glibc implementation 4-fold and the original Python implementation 8.5-fold.
original | 6.463880 sec / 2000 iter | 309 iter/s
unrolled | 6.378582 sec / 2000 iter | 313 iter/s
cython libc | 8.443358 sec / 2000 iter | 236 iter/s
cython glibc | 2.936697 sec / 2000 iter | 681 iter/s
cython fast | 0.766082 sec / 2000 iter | 2610 iter/s
c extension, bytes | 8.373438 sec / 2000 iter | 238 iter/s
c extension, unicode | 8.394805 sec / 2000 iter | 238 iter/s
c extension glib, bytes | 2.988184 sec / 2000 iter | 669 iter/s
c extension glib, unicode | 2.992429 sec / 2000 iter | 668 iter/s
c extension fast, bytes | 0.754072 sec / 2000 iter | 2652 iter/s
c extension fast, unicode | 0.762074 sec / 2000 iter | 2624 iter/s
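As an aside that I have not timed against the numbers above, a related trick is available from pure Python: bytes.translate can delete the interesting characters in C, so the length difference gives their count. A minimal sketch:
# Untimed sketch: bytes.translate() drops the given characters in C,
# so the length difference is the number of c/C/g/G characters.
def gc_characters_translate(haystack_bytes):
    return len(haystack_bytes) - len(haystack_bytes.translate(None, delete=b"cCgG"))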
Upvotes: 3
Reputation: 7210
.count will iterate over haystack every time you call it, but it is heavily optimized compared to the alternatives I suggest here. How well each approach does depends on how many characters are in your real case. You can try:
from collections import Counter

cnt = Counter(haystack)
gc_characters = sum(cnt.get(e, 0) for e in ['c', 'C', 'g', 'G'])
as this will iterate over the string once and store the counts of every occurring character. It may be marginally faster to only look for the characters you care about, using a set for faster __contains__ checks:
gc_chars = {'c', 'C', 'g', 'G'}
counts = {e: 0 for e in gc_chars}
for c in haystack:
    if c in gc_chars:
        counts[c] += 1
gc_characters = sum(counts.values())
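The same set-membership idea can also be written as a single generator expression (not benchmarked here):
gc_chars = {'c', 'C', 'g', 'G'}
gc_characters = sum(1 for c in haystack if c in gc_chars)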
If you provide more details of the composition of haystack and how often this is called, we could try to help you more.
Another idea is that if haystack is frequently the same, you could perhaps keep an in-memory cache of the answer:
from functools import lru_cache

@lru_cache
def haystack_metric(haystack):
    return sum(haystack.count(c) for c in ['c', 'C', 'g', 'G'])
(with whatever implementation you settle on). You could also explore ctypes, but I have little experience with it; a rough sketch follows.
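This is only an illustration of what the ctypes route might look like; it assumes a POSIX-style libc that exposes strpbrk and has not been benchmarked against anything above:
# Rough ctypes sketch (assumption: a POSIX libc with strpbrk is available).
import ctypes
import ctypes.util

libc = ctypes.CDLL(ctypes.util.find_library("c"))
libc.strpbrk.restype = ctypes.c_void_p            # raw pointer so we can advance past each hit
libc.strpbrk.argtypes = [ctypes.c_void_p, ctypes.c_char_p]

def count_any(haystack: bytes, chars: bytes) -> int:
    buf = ctypes.create_string_buffer(haystack)   # NUL-terminated copy of the haystack
    pos = ctypes.addressof(buf)
    n = 0
    while True:
        pos = libc.strpbrk(pos, chars)
        if pos is None:                           # NULL means no more matches
            return n
        n += 1
        pos += 1                                  # continue scanning after the match

# e.g. count_any(b"GATTACA", b"cCgG") == 2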
Upvotes: 1