Reputation: 23799
I've implemented a method for parsing an unsigned integer string of length <= 8 using SIMD intrinsics available in .NET as follows:
// Parses an unsigned decimal string of 1..8 ASCII digits using SSE intrinsics.
// NOTE(review): always reads 16 bytes from the pinned string data, so strings
// shorter than 8 chars rely on the bytes after the characters being readable
// (true for .NET strings: length prefix + null terminator + heap padding).
public unsafe static uint ParseUint(string text)
{
fixed (char* c = text)
{
// Load 8 UTF-16 code units (16 bytes) starting at the first character.
var parsed = Sse3.LoadDquVector128((byte*) c);
// Byte count to left-shift by so the digits end at the top of the vector;
// each char is 2 bytes, hence the * 2. Zero bytes shifted in at the bottom
// act as leading zero digits.
var shift = (8 - text.Length) * 2;
// NOTE(review): pslldq takes an imm8 — with a runtime `shift` the JIT emits
// a slow fallback helper call instead of the instruction (see answers below).
var shifted = Sse2.ShiftLeftLogical128BitLane(parsed,
(byte) (shift));
Vector128<byte> digit0 = Vector128.Create((byte) '0');
// Saturating subtract maps ASCII '0'..'9' to 0..9 per byte and clamps the
// zero-filled bytes (and the chars' zero high bytes) at 0.
var reduced = Sse2.SubtractSaturate(shifted, digit0);
// First pmaddwd: combine adjacent digit shorts into 10*hi + lo pairs,
// one two-digit value per 32-bit lane.
var shortMult = Vector128.Create(10, 1, 10, 1, 10, 1, 10, 1);
var collapsed2 = Sse2.MultiplyAddAdjacent(reduced.As<byte, short>(), shortMult);
// Narrow the four pair values back to 16-bit lanes for a second pmaddwd
// (packed into both halves; only words 4..7 are used below).
var repack = Sse41.PackUnsignedSaturate(collapsed2, collapsed2);
// Second pmaddwd: combine pairs into 4-digit groups (100*hiPair + loPair).
var intMult = Vector128.Create((short)0, 0, 0, 0, 100, 1, 100, 1);
var collapsed3 = Sse2.MultiplyAddAdjacent(repack.As<ushort,short>(), intMult);
// Scalar tail: element 2 holds the high 4 digits, element 3 the low 4.
var e1 = collapsed3.GetElement(2);
var e2 = collapsed3.GetElement(3);
return (uint) (e1 * 10000 + e2);
}
}
Sadly, a comparison with a baseline uint.Parse()
gives the following, rather unimpressive, result:
Method | Mean | Error | StdDev |
---|---|---|---|
Baseline | 15.157 ns | 0.0325 ns | 0.0304 ns |
ParseSimd | 3.269 ns | 0.0115 ns | 0.0102 ns |
What are some of the ways the above code can be improved? My particular areas of concern are:
text.Length
MultiplyAddAdjacent
involving a vector of 0s and 1s
GetElement() -- maybe there's some ToScalar() call that can happen somewhere?
Upvotes: 7
Views: 2198
Reputation: 1185
C# (thanks @aepot)
// Parses 1..8 decimal digits using pmaddubsw + pmaddwd, avoiding the slow
// 32-bit pmulld entirely. Always loads 16 bytes from the string start
// ("unsafe chunking"); trailing non-digit bytes are shifted off below.
public unsafe uint ParseUint(string text)
{
fixed (char* c = text)
{
// Per-byte place-value weights for pmaddubsw. Little-endian bytes are
// 200,20,200,20,100,10,10,1 — the high-digit group is halved so the
// pmaddwd weights below fit in signed 16-bit; doubled back at the end.
Vector128<byte> mul1 = Vector128.Create(0x14C814C8, 0x010A0A64, 0, 0).AsByte();
// pmaddwd weights: words 25000,250,10,1 (the exact 50000 would overflow short).
Vector128<short> mul2 = Vector128.Create(0x00FA61A8, 0x0001000A, 0, 0).AsInt16();
// Bit count of the trailing trash: (8 - len) * 8, moved into a vector for psllq.
// (`-` binds tighter than `<<`, so this is (8 - text.Length) << 3.)
Vector128<long> shift_amount = Sse2.ConvertScalarToVector128Int32(8 - text.Length << 3).AsInt64();
// Load 8 UTF-16 chars and narrow them to 8 ASCII bytes in the low qword.
Vector128<short> vs = Sse2.LoadVector128((short*)c);
Vector128<byte> vb = Sse2.PackUnsignedSaturate(vs, vs);
// ASCII -> digit values 0..9; saturation clamps anything below '0' to 0.
vb = Sse2.SubtractSaturate(vb, Vector128.Create((byte)'0'));
// psllq takes a variable count (no imm8 problem): left-align the digits in
// the low 64-bit lane, discarding bytes past the end of the number; zeros
// shifted in become leading zero digits.
vb = Sse2.ShiftLeftLogical(vb.AsInt64(), shift_amount).AsByte();
// Two multiply-add reductions: digit bytes -> two-digit words -> two dwords
// (dword 0 = high 4 digits at halved weight, dword 1 = low 4 digits exact).
Vector128<int> v = Sse2.MultiplyAddAdjacent(Ssse3.MultiplyAddAdjacent(mul1, vb.AsSByte()), mul2);
// v+v undoes the halving of the high group; Shuffle(v, 1) brings the
// low-group dword into lane 0 so the add produces the final value there.
v = Sse2.Add(Sse2.Add(v, v), Sse2.Shuffle(v, 1));
return (uint)v.GetElement(0);
}
}
C solution using SSSE3:
#include <uchar.h> // char16_t
#include <tmmintrin.h> // pmaddubsw
// Parse 1..8 UTF-16 decimal digits starting at ptr; len = digit count.
// Always loads 16 bytes from ptr, so the caller must guarantee readability.
// Same algorithm as the C# version above: pmaddubsw weights are halved for
// the high digit group so the pmaddwd weights fit in signed 16-bit.
unsigned ParseUint(char16_t* ptr, size_t len) {
const __m128i mul1 = _mm_set_epi32(0, 0, 0x010A0A64, 0x14C814C8); // bytes 200,20,200,20,100,10,10,1
const __m128i mul2 = _mm_set_epi32(0, 0, 0x0001000A, 0x00FA61A8); // words 25000,250,10,1
const __m128i shift_amount = _mm_cvtsi32_si128((8 - len) * 8); // bit count of trailing trash
__m128i v = _mm_loadu_si128((__m128i*)ptr); // unsafe chunking
v = _mm_packus_epi16(v,v); // convert digits from UTF16-LE to ASCII
v = _mm_subs_epu8(v, _mm_set1_epi8('0')); // ASCII -> 0..9, saturating at 0
v = _mm_sll_epi64(v, shift_amount); // shift off non-digit trash
// convert: digit bytes -> two-digit words -> dwords (hi group halved, lo exact)
v = _mm_madd_epi16(_mm_maddubs_epi16(mul1, v), mul2);
// double the halved hi group and fold in the lo group dword -> lane 0
v = _mm_add_epi32(_mm_add_epi32(v,v), _mm_shuffle_epi32(v, 1));
return (unsigned)_mm_cvtsi128_si32(v);
}
Regardless of how one shifts/aligns the string (see aepot's answer), we want to stay away from pmulld
. SSE basically has 16-bit integer multiplication and the 32-bit multiply has double the latency and uops. However, care must be taken around the sign-extension behavior of pmaddubsw
and pmaddwd
.
using scalar x64
:
// untested && I don't know C#
// Scalar SWAR variant: narrow the UTF-16 digits to bytes with SSE, then parse
// the 8 ASCII bytes with the classic multiply-accumulate bit trick.
// Assumes text.Length is 1..8 and digits only.
// NOTE(review): PackSignedSaturate expects Vector128<short>; this line likely
// needs an .AsInt16() to compile — the author marked the code untested.
// NOTE(review): for text.Length == 0 the shift amount is 64, which C# masks
// to 0, leaving garbage in val — callers must not pass an empty string.
public unsafe static uint ParseUint(string text)
{
fixed (char* c = text)
{
var xmm = Sse2.LoadVector128((ushort*)c); // unsafe chunking
var packed = Sse2.PackSignedSaturate(xmm,xmm); // convert digits from UTF16-LE to ASCII
ulong val = Sse2.X64.ConvertToUInt64(packed); // extract to scalar; byte 0 (LSB) = first (most significant) digit
val -= 0x3030303030303030; // subtract '0' from each digit
val <<= ((8 - text.Length) * 8); // shift off non-digit trash; zeros shifted in act as leading zeros
// convert
const ulong mask = 0x000000FF000000FF;
const ulong mul1 = 0x000F424000000064; // 100 + (1000000ULL << 32)
const ulong mul2 = 0x0000271000000001; // 1 + (10000ULL << 32)
// byte i becomes 10*d[i] + d[i+1]: adjacent digits combined into 2-digit values
val = (val * 10) + (val >> 8);
// mask selects the 2-digit values at bytes 0 and 4; the shifted mask those at
// bytes 2 and 6. The two multiplies weight them by 1e6/1e4/1e2/1 so the full
// 8-digit result accumulates in bits 32..63.
val = (((val & mask) * mul1) + (((val >> 16) & mask) * mul2)) >> 32;
return (uint)val;
}
}
What if we don't know the length of the number ahead of time?
// C pseudocode & assumes ascii text
// Finds the run of digits at p without knowing the length in advance,
// validates the first byte, and left-aligns the digits for parse_8_chars.
uint64_t v, m, len;
v = unaligned_load_little_endian_u64(p);
m = v + 0x4646464646464646; // roll '9' to 0x7F; any byte > '9' sets its bit 7
v -= 0x3030303030303030; // unpacked binary coded decimal; bytes < '0' borrow and set bit 7
m = (m | v) & 0x8080808080808080; // detect first non-digit
m = _tzcnt_u64(m >> 7); // count run of digits: m = 8 * (number of leading digit bytes); 64 if all 8 are digits
if (((uint8_t)v) > 9) return error_not_a_number; // first byte must itself be a digit
v <<= 64 - m; // shift off any "trailing" chars that are not digits
p += m >> 3; // consume bytes
v = parse_8_chars(v);
Or if we have a list of strings to process:
// assumes ascii text
// Parses four numbers at once: gathers a 64-bit chunk per string, validates
// and left-aligns the digits in each qword lane, then converts all four with
// one pmaddubsw + pmaddwd pass. Results land in the even dword of each qword.
__m256i parse_uint_x4(void* base_addr, __m256i offsets_64x4)
{
const __m256i x00 = _mm256_setzero_si256();
const __m256i x0A = _mm256_set1_epi8(0x0A);
const __m256i x30 = _mm256_set1_epi8(0x30);
// x08 = (0x30 >> 1) & 0x0A = 0x08 per byte, derived to save a constant load
const __m256i x08 = _mm256_and_si256(_mm256_srli_epi32(x30, 1), x0A);
// same halved place-value weights as the single-string version above
const __m256i mul1 = _mm256_set1_epi64x(0x010A0A6414C814C8);
const __m256i mul2 = _mm256_set1_epi64x(0x0001000A00FA61A8);
__m256i v, m;
// process 4 strings at once, up to 8 digits in each string...
// (the 64-bit chunks could be manually loaded using 3 shuffles)
v = _mm256_i64gather_epi64((long long*)base_addr, offsets_64x4, 1);
// rebase digits from 0x30..0x39 to 0x00..0x09
v = _mm256_xor_si256(v, x30);
// range check
// (unsigned gte compare)
v = _mm256_min_epu8(v, x0A);
m = _mm256_cmpeq_epi8(x0A, v); // 0xFF in every byte that was not a digit
// mask of lowest non-digit and above
m = _mm256_or_si256(m, _mm256_sub_epi64(x00, m));
// align the end of the digit-string to the top of the u64 lane
// (shift off masked bytes and insert leading zeros)
// psadbw sums (m & 0x08) per qword = 8 * count of masked bytes = shift amount
m = _mm256_sad_epu8(_mm256_and_si256(m, x08), x00);
v = _mm256_sllv_epi64(v, m);
// convert to binary
// (the `add(v,v)` allow us to keep `mul2` unsigned)
v = _mm256_madd_epi16(_mm256_maddubs_epi16(mul1, v), mul2);
v = _mm256_add_epi32(_mm256_shuffle_epi32(v, 0x31), _mm256_add_epi32(v,v));
// zero the hi-dwords of each qword
v = _mm256_blend_epi32(v, x00, 0xAA);
return v;
}
Upvotes: 7
Reputation: 4824
I've made some optimizations.
// Optimized variant of the question's ParseUint: works on 16-bit char lanes
// directly (fewer type conversions) and reduces with vphaddd.
// NOTE(review): the lane shift below uses a runtime count, but pslldq needs an
// imm8 — the JIT falls back to a slow helper call here (see text below).
public unsafe uint ParseUint2(string text)
{
fixed (char* c = text)
{
// Load 8 UTF-16 chars; shift digits to the top of the vector,
// (8 - len) * 2 bytes (zeros shifted in act as leading zeros).
Vector128<ushort> raw = Sse3.LoadDquVector128((ushort*)c);
raw = Sse2.ShiftLeftLogical128BitLane(raw, (byte)(8 - text.Length << 1));
// ASCII -> digit values per 16-bit lane, saturating at 0.
Vector128<ushort> digit0 = Vector128.Create('0');
raw = Sse2.SubtractSaturate(raw, digit0);
// pmaddwd: one two-digit value (10*hi + lo) per 32-bit lane.
Vector128<short> mul0 = Vector128.Create(10, 1, 10, 1, 10, 1, 10, 1);
Vector128<int> res = Sse2.MultiplyAddAdjacent(raw.AsInt16(), mul0);
// Weight the four two-digit groups by their decimal place...
Vector128<int> mul1 = Vector128.Create(1000000, 10000, 100, 1);
res = Sse41.MultiplyLow(res, mul1);
// ...and sum all four lanes with two horizontal adds.
res = Ssse3.HorizontalAdd(res, res);
res = Ssse3.HorizontalAdd(res, res);
return (uint)res.GetElement(0);
}
}
Reduced amount of type conversions and final calculations were made with vphaddd
. As a result it's ~10% faster.
But...imm8
must be a compile-time constant. It means you can't use a variable where imm8
is argument. Otherwise JIT compiler won't produce the intrinsic instruction for the operation. It will make an external method call
at this place (maybe some workaround is there). Thanks @PeterCordes for help.
This monster isn't significantly faster, but it is faster than the one above, regardless of text.Length
.
// Same pipeline as ParseUint2, but the variable-count lane shift is replaced
// by a switch over text.Length so every ShiftLeftLogical128BitLane call gets
// a compile-time imm8 and the JIT emits a real pslldq instead of a helper call.
public unsafe uint ParseUint3(string text)
{
fixed (char* c = text)
{
Vector128<ushort> raw = Sse3.LoadDquVector128((ushort*)c);
// Left-align the digits: shift by (8 - length) chars = (8 - length)*2 bytes.
switch (text.Length)
{
case 0: raw = Vector128<ushort>.Zero; break;
case 1: raw = Sse2.ShiftLeftLogical128BitLane(raw, 14); break;
case 2: raw = Sse2.ShiftLeftLogical128BitLane(raw, 12); break;
case 3: raw = Sse2.ShiftLeftLogical128BitLane(raw, 10); break;
case 4: raw = Sse2.ShiftLeftLogical128BitLane(raw, 8); break;
case 5: raw = Sse2.ShiftLeftLogical128BitLane(raw, 6); break;
case 6: raw = Sse2.ShiftLeftLogical128BitLane(raw, 4); break;
case 7: raw = Sse2.ShiftLeftLogical128BitLane(raw, 2); break;
};
// ASCII -> digit values per 16-bit lane, saturating at 0.
Vector128<ushort> digit0 = Vector128.Create('0');
raw = Sse2.SubtractSaturate(raw, digit0);
// pmaddwd: one two-digit value (10*hi + lo) per 32-bit lane.
Vector128<short> mul0 = Vector128.Create(10, 1, 10, 1, 10, 1, 10, 1);
Vector128<int> res = Sse2.MultiplyAddAdjacent(raw.AsInt16(), mul0);
// Weight the groups by decimal place, then sum all lanes via vphaddd.
Vector128<int> mul1 = Vector128.Create(1000000, 10000, 100, 1);
res = Sse41.MultiplyLow(res, mul1);
res = Ssse3.HorizontalAdd(res, res);
res = Ssse3.HorizontalAdd(res, res);
return (uint)res.GetElement(0);
}
}
Again, @PeterCordes doesn't allow me to write slow code. The following version has 2 improvements. Now the string is loaded already shifted, and the shifted mask (at the same offset) is then subtracted from it. This avoids the slow fallback for ShiftLeftLogical128BitLane
with a variable count.
The second improvement is replacing vphaddd
with pshufd
+ paddd
.
// Note that this loads up to 14 bytes before the data part of the string. (Or 16 for an empty string)
// This might or might not make it possible to read from an unmapped page and fault, beware.
// No shift instruction at all: the load window ends at the end of the string,
// and a mask string indexed by the same length both zeroes the leading junk
// chars (via saturating subtract against \uFFFF) and subtracts '0' from the
// digit chars in one operation.
public unsafe uint ParseUint4(string text)
{
// 8 chars of \uFFFF followed by 8 literal '0' chars; slicing this at
// text.Length lines the \uFFFF entries up with the junk before the digits.
const string mask = "\xffff\xffff\xffff\xffff\xffff\xffff\xffff\xffff00000000";
fixed (char* c = text, m = mask)
{
// Window of 8 chars ending exactly at the end of the string.
Vector128<ushort> raw = Sse3.LoadDquVector128((ushort*)c - 8 + text.Length);
Vector128<ushort> mask0 = Sse3.LoadDquVector128((ushort*)m + text.Length);
// junk - 0xFFFF saturates to 0 (leading zeros); digit - '0' gives 0..9.
raw = Sse2.SubtractSaturate(raw, mask0);
// pmaddwd: one two-digit value (10*hi + lo) per 32-bit lane.
Vector128<short> mul0 = Vector128.Create(10, 1, 10, 1, 10, 1, 10, 1);
Vector128<int> res = Sse2.MultiplyAddAdjacent(raw.AsInt16(), mul0);
// Weight by decimal place, then sum all four lanes with pshufd + paddd
// (cheaper than two vphaddd).
Vector128<int> mul1 = Vector128.Create(1000000, 10000, 100, 1);
res = Sse41.MultiplyLow(res, mul1);
Vector128<int> shuf = Sse2.Shuffle(res, 0x1b); // 0 1 2 3 => 3 2 1 0
res = Sse2.Add(shuf, res);
shuf = Sse2.Shuffle(res, 0x41); // 0 1 2 3 => 1 0 3 2
res = Sse2.Add(shuf, res);
// Lane 0 now holds the sum of all four weighted groups.
return (uint)res.GetElement(0);
}
}
~Twice as fast as the initial solution. (o_O) At least on my Haswell i7.
Upvotes: 8
Reputation: 21956
First of all, 5x improvement is not “rather unimpressive”.
I would not do the last step with scalar code, here’s an alternative:
// Vector replacement for the question's scalar tail (e1 * 10000 + e2):
// stays in the SIMD domain instead of extracting two elements.
// _mm_shuffle_epi32( x, _MM_SHUFFLE( 3, 3, 2, 2 ) ) — spread elements 2 and 3
// into the even dword lanes that pmuludq reads.
collapsed3 = Sse2.Shuffle( collapsed3, 0xFA );
// _mm_mul_epu32 — 32x32->64 products: lane 0 = e1 * 10000, lane 2 = e2 * 1.
var collapsed4 = Sse2.Multiply( collapsed3.As<int, uint>(), Vector128.Create( 10000u, 0, 1, 0 ) ).As<ulong, uint>();
// _mm_add_epi32( x, _mm_srli_si128( x, 8 ) ) — fold the high qword onto the
// low one, leaving e1 * 10000 + e2 in element 0.
collapsed4 = Sse2.Add( collapsed4, Sse2.ShiftRightLogical128BitLane( collapsed4, 8 ) );
return collapsed4.GetElement( 0 );
The C++ version is going to be way faster than what happens on my PC (.NET Core 3.1). The generated code is not good. They initialize constants like this:
00007FFAD10B11B6 xor ecx,ecx
00007FFAD10B11B8 mov dword ptr [rsp+20h],ecx
00007FFAD10B11BC mov dword ptr [rsp+28h],64h
00007FFAD10B11C4 mov dword ptr [rsp+30h],1
00007FFAD10B11CC mov dword ptr [rsp+38h],64h
00007FFAD10B11D4 mov dword ptr [rsp+40h],1
They use stack memory instead of another vector register. It looks like JIT developers forgot there’re 16 vector registers there, the complete function only uses xmm0
.
00007FFAD10B1230 vmovapd xmmword ptr [rbp-0C0h],xmm0
00007FFAD10B1238 vmovapd xmm0,xmmword ptr [rbp-0C0h]
00007FFAD10B1240 vpsrldq xmm0,xmm0,8
00007FFAD10B1245 vpaddd xmm0,xmm0,xmmword ptr [rbp-0C0h]
Upvotes: 3