Reputation: 15040
Consider 8 digit characters, like 12345678, as a string. It can be converted to a number where every byte contains a digit, like this:
const char* const str = "12345678";
const char* const base = "00000000";
const uint64_t unpacked = *reinterpret_cast<const uint64_t*>(str)
                          - *reinterpret_cast<const uint64_t*>(base);
Then unpacked will be 0x0807060504030201 on a little-endian system.
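Equivalently, since every byte of "00000000" is just 0x30, the base string can be replaced by a single constant, e.g. with a memcpy-based load (needs <cstring>):

uint64_t raw;
memcpy(&raw, str, 8);
const uint64_t unpacked = raw - 0x3030303030303030ULL;   // 0x30 == '0'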
What is the fastest way to convert the number into 12345678, perhaps by multiplying it by some magic number or using SIMD up to AVX2?
UPDATE: 12345678 has to be a number stored in a 32-bit or 64-bit integer, not a string.
Upvotes: 4
Views: 1379
Reputation: 26115
On an older x86-64 system without AVX2, this simple version, based on gathering digits in tree fashion, is quite efficient, with performance on par with a simple SWAR-based implementation per my measurements. It does require a processor with plenty of instruction-level parallelism, however, as it comprises 50% more instructions than the SWAR-based code when compiled with full optimizations.
/* convert a string of exactly eight digit 'char' into a 32-bit unsigned integer */
uint32_t string_to_number (const char * s)
{
    /* gather neighbouring digits into pairs, pairs into quads, quads into the result */
    uint32_t t0 = s[0] * 10 + s[1];
    uint32_t t1 = s[2] * 10 + s[3];
    uint32_t t2 = s[4] * 10 + s[5];
    uint32_t t3 = s[6] * 10 + s[7];
    uint32_t s0 = t0 * 100 + t1;
    uint32_t s1 = t2 * 100 + t3;
    uint32_t num = s0 * 10000 + s1;
    /* accumulated ASCII bias: corr = '0' * 11111111 = 533333328 */
    uint32_t corr =
        '0' * 10000000 +
        '0' * 1000000 +
        '0' * 100000 +
        '0' * 10000 +
        '0' * 1000 +
        '0' * 100 +
        '0' * 10 +
        '0' * 1;
    return num - corr;
}
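A quick sanity check, assuming the function above sits in the same translation unit with <cstdint> and <cstdio> included:

int main (void)
{
    printf ("%u\n", string_to_number ("12345678"));   /* prints 12345678 */
    return 0;
}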
Upvotes: 1
Reputation: 1185
Multiplication in binary is just a series of shifts and adds, so a SWAR approach shouldn't be too hard to understand. For a detailed walk-through, see:
// http://govnokod.ru/13461
static inline
uint32_t parse_8digits_swar_classic (char* str) {
    uint64_t v;
    memcpy(&v, str, 8);
    // the AND strips the ASCII '0' bias from each byte; each multiply-and-shift
    // merges neighbouring fields into one field holding twice as many digits
    v = (v & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8;             // 2561 = 10*256 + 1
    v = (v & 0x00FF00FF00FF00FF) * 6553601 >> 16;         // 6553601 = 100*65536 + 1
    v = (v & 0x0000FFFF0000FFFF) * 42949672960001 >> 32;  // 42949672960001 = 10000*2^32 + 1
    return v;
}
// attempt to improve the latency
static inline
uint32_t parse_8digits_swar_aqrit (char* str) {
    const uint64_t mask = 0x000000FF000000FF;
    uint64_t v, t;
    memcpy(&v, str, 8);
    v = (v * 10) + (v >> 8);                      // fold neighbouring ASCII bytes into 2-digit fields
    t = (v & mask) * 0x000F424000000064;          // weight the 1st and 3rd digit pairs by 1000000 and 100
    v = ((v >> 16) & mask) * 0x0000271000000001;  // weight the 2nd and 4th digit pairs by 10000 and 1
    v = (v + t + 0xFF0915C600000000ULL) >> 32;    // the added constant cancels the leftover ASCII '0' bias
    return v;
}
// SSSE3 needs fewer `shift & mask` operations...
static inline
uint32_t parse_8digits_simd_ssse3 (char* str) {
    const __m128i mul1 = _mm_set_epi32(0, 0, 0x010A0A64, 0x14C814C8);
    const __m128i mul2 = _mm_set_epi32(0, 0, 0x0001000A, 0x00FA61A8);
    const __m128i mask = _mm_set1_epi8(0x0F);
    __m128i v;
    v = _mm_loadl_epi64((__m128i*)(void*)str);
    v = _mm_and_si128(v, mask);                                       // strip the ASCII '0' bias
    v = _mm_madd_epi16(_mm_maddubs_epi16(mul1, v), mul2);             // dword0 = upper-4-digit value * 5000, dword1 = lower-4-digit value
    v = _mm_add_epi32(_mm_add_epi32(v, v), _mm_shuffle_epi32(v, 1));  // 2*dword0 + dword1 = full 8-digit value
    return (uint32_t)_mm_cvtsi128_si32(v);
}
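A quick way to sanity-check all three (assuming they sit in one translation unit with <cstring>, <cstdint>, <cstdio> and <immintrin.h> included, built with e.g. -O2 -mssse3):

int main (void) {
    char digits[] = "12345678";
    printf("%u\n", parse_8digits_swar_classic(digits));  // 12345678
    printf("%u\n", parse_8digits_swar_aqrit(digits));    // 12345678
    printf("%u\n", parse_8digits_simd_ssse3(digits));    // 12345678
    return 0;
}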
Upvotes: 2
Reputation: 15040
I've implemented a small program to test some ideas. The AVX2 implementation is ~1.5 times faster than the naive one, with the table implementation in between:
AVX2: 12345678 in 3.42759
Naive: 12345678 in 5.12581
Table: 12345678 in 4.49478
Source code:
#include <cstdlib>
#include <cstdint>
#include <ctime>
#include <immintrin.h>
#include <iostream>
using namespace std;
const __m256i mask = _mm256_set1_epi32(0xf);
const __m256i mul = _mm256_setr_epi32(10000000, 1000000, 100000, 10000, 1000, 100, 10, 1);
const volatile char* str = "12345678";
volatile uint32_t h;
const int64_t nIter = 1000LL * 1000LL * 1000LL;
inline void parse_avx2() {
    const char* const base = "00000000";
    const uint64_t unpacked = *reinterpret_cast<const volatile uint64_t*>(str)
                            - *reinterpret_cast<const uint64_t*>(base);
    const __m128i a = _mm_set1_epi64x(unpacked);
    const __m256i b = _mm256_cvtepu8_epi32(a);     // widen the 8 digit bytes to 8 dwords
    const __m256i d = _mm256_mullo_epi32(b, mul);  // scale each digit by its power of ten
    // horizontal sum of the 8 products
    const __m128i e = _mm_add_epi32(_mm256_extractf128_si256(d, 0), _mm256_extractf128_si256(d, 1));
    const uint64_t f0 = _mm_extract_epi64(e, 0);
    const uint64_t f1 = _mm_extract_epi64(e, 1);
    const uint64_t g = f0 + f1;
    h = (g>>32) + (g&0xffffffff);
}
inline void parse_naive() {
    const char* const base = "00000000";
    const uint64_t unpacked = *reinterpret_cast<const volatile uint64_t*>(str)
                            - *reinterpret_cast<const uint64_t*>(base);
    const uint8_t* a = reinterpret_cast<const uint8_t*>(&unpacked);
    h = a[7] + a[6]*10 + a[5]*100 + a[4]*1000 + a[3]*10000 + a[2]*100000 + a[1]*1000000 + a[0]*10000000;
}
uint32_t summands[8][10];
inline void parse_table() {
    const char* const base = "00000000";
    const uint64_t unpacked = *reinterpret_cast<const volatile uint64_t*>(str)
                            - *reinterpret_cast<const uint64_t*>(base);
    const uint8_t* a = reinterpret_cast<const uint8_t*>(&unpacked);
    h = summands[7][a[0]] + summands[6][a[1]] + summands[5][a[2]] + summands[4][a[3]]
      + summands[3][a[4]] + summands[2][a[5]] + summands[1][a[6]] + summands[0][a[7]];
}
int main() {
    clock_t start = clock();
    for(int64_t i=0; i<nIter; i++) {
        parse_avx2();
    }
    clock_t end = clock();
    cout << "AVX2: " << h << " in " << double(end-start)/CLOCKS_PER_SEC << endl;

    start = clock();
    for(int64_t i=0; i<nIter; i++) {
        parse_naive();
    }
    end = clock();
    cout << "Naive: " << h << " in " << double(end-start)/CLOCKS_PER_SEC << endl;

    uint32_t mul=1;
    for(int i=0; i<8; i++, mul*=10) {
        for(int j=0; j<10; j++) {   // fill all ten digit values
            summands[i][j] = j*mul;
        }
    }
    start = clock();
    for(int64_t i=0; i<nIter; i++) {
        parse_table();
    }
    end = clock();
    cout << "Table: " << h << " in " << double(end-start)/CLOCKS_PER_SEC << endl;
    return 0;
}
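Since the code uses AVX2 intrinsics, it has to be built with AVX2 code generation enabled, e.g. with GCC or Clang:

g++ -O3 -mavx2 bench.cpp -o bench

(bench.cpp here is just a placeholder for whatever the file is called.)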
Upvotes: 0
Reputation: 11920
If you change your input format to breadth-first element order like this:
Sample: 9 numbers, interleaved. digit[] holds the first digit of all 9 numbers,
then the second digit of all 9, then the third, and so on:
digit[]: 1 1 1 1 1 1 1 1 1   2 2 2 2 2 2 2 2 2   3 3 3 3 3 3 3 3 3 ...
for(int j=0; j<num_parse; j+=9)   // j: start of one digit position across the 9 numbers
{
    for(int i=0; i<9; i++)        // i: which of the 9 numbers
    {
        value[i] = value[i]*10 + (digit[i+j]-'0');
    }
}
// value vector now holds the 9 parsed numbers:
// copy it to the output and clear it before the next batch
And if you convert more than just 9 values at a time, like 512 or 8192 (padded to a multiple of 32), the compiler should vectorize it.
To prepare the input, you can use 8 different channels, one per digit position of every parsed value.
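A compilable sketch of the same idea, assuming 8-digit numbers with the most significant digit stored in the first plane and N numbers per plane (the function name and layout details are just an example):

#include <cstdint>
#include <cstddef>

// digits points to 8 planes of N chars each: plane d holds digit position d
// (most significant first) of all N numbers
void parse_planes(const char* digits, uint32_t* out, std::size_t N)
{
    for (std::size_t i = 0; i < N; ++i)
        out[i] = 0;
    for (std::size_t d = 0; d < 8; ++d) {
        const char* plane = digits + d * N;
        for (std::size_t i = 0; i < N; ++i)       // independent lanes, SIMD friendly
            out[i] = out[i] * 10 + uint32_t(plane[i] - '0');
    }
}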
Upvotes: 0