Reputation: 15386
Given one or more __m128i
or __m256i
containing one nibble per 16-bit element, what's the fastest way to combine and pack those into one byte per 8-bit element (i.e. (hi << 4) | lo
for adjacent 16-bit elements)?
This is the best way I've come up with and is unfortunately comparable to scalar code:
// vpshufb control, per 32-bit lane: copy byte 2 (the low byte of the odd
// 16-bit element) down to byte position 0; entries with bit 7 set (255)
// produce zero bytes.
// NOTE(review): intrinsic calls in file-scope initializers rely on
// compiler constant folding (GCC/Clang extension) -- not strictly portable C.
const static __m256i shufmask = _mm256_setr_epi8(
2, 255, 255, 255, 6, 255, 255, 255, 10, 255, 255, 255, 14, 255, 255, 255,
2, 255, 255, 255, 6, 255, 255, 255, 10, 255, 255, 255, 14, 255, 255, 255);
// Keeps only byte 0 of every 32-bit lane (the assembled packed byte).
const static __m256i high4 = _mm256_setr_epi8(
255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0,
255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0
);
// Pack sixteen 16-bit elements, each holding one nibble in its low 4 bits,
// into eight bytes: result byte i = (elem[2i] << 4) | elem[2i+1]
// (element indices in memory order), returned in the low 64 bits.
inline static int64_t hnib2byte(__m256i nibbles) { // (a << 4) | b;
// Shift every 16-bit element left by 4: even elements now hold their
// nibble in bits 4..7 of the lane's byte 0.
// hi 0 lo 0, ...
__m256i upper = _mm256_slli_epi16(nibbles, 4);
// Align upper and lower halves so they can be ORed vertically:
// move each odd element's nibble down to bits 0..3 of byte 0.
// lo 0 0 0, ...
__m256i lower = _mm256_shuffle_epi8(nibbles, shufmask);
// ab x x x, ...
__m256i or = _mm256_or_si256(upper, lower);
// Pack into bytes: clear everything but byte 0 of each 32-bit lane, then
// two unsigned-saturating word->byte packs squeeze the result together.
or = _mm256_and_si256(or, high4);
__m256i pack16 = _mm256_packus_epi16(or, or);
// 0b1000 selects qwords 0 and 2: a cross-lane permute is needed because
// vpackuswb only packs within each 128-bit lane.
const int _3to2 = 0b00001000;
__m256i perm16 = _mm256_permute4x64_epi64(pack16, _3to2); // :(
__m256i pack8 = _mm256_packus_epi16(perm16, perm16);
// The eight packed bytes now sit in the lowest qword; extract it.
return _mm_cvtsi128_si64(_mm256_castsi256_si128(pack8));
}
Instructions up to and including AVX2 are fair game. Masked shift in AVX-512 opens up nicer options. This is called in a loop, so it's also fair game to do stuff like packing the nibbles into 8-bit elements early on.
Upvotes: 4
Views: 602
Reputation: 3998
The solution hnib2byte_v2
below should be a bit faster than your solution, at least on Intel processors.
Instruction vpermd
or intrinsic _mm256_permutevar8x32_epi32
is slow on AMD Ryzen. On that platform it is better to use _mm256_extracti128_si256
to extract the upper 128-bit lane of pck
, to use _mm256_castsi256_si128
to extract the lower 128-bit lane, and to combine these two with _mm256_or_si256
to get the answer in the lowest 64 bits.
/*
gcc -O3 -m64 -Wall -mavx2 -march=broadwell nibble2byte.c
*/
#include <immintrin.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
int print_avx2_hex(__m256i ymm);
/*
 * hnib2byte_v2: pack sixteen 16-bit elements, each holding one nibble in
 * its low 4 bits, into eight bytes; result byte i = (elem[2i] << 4) | elem[2i+1]
 * (element indices in memory order), returned in the low 64 bits.
 *
 * Per 32-bit lane: shift the even element's nibble (bits 0..3) up to
 * bits 20..23 and OR with the untouched odd element's nibble (bits 16..19),
 * so byte 2 of each lane holds the finished packed byte. vpshufb then
 * gathers those bytes within each 128-bit lane, and vpermd gathers across
 * lanes. Five instructions total: vpslld, vpor, vpshufb, vpermd, vmovq.
 * (Debug print calls removed; intermediate values shown in the comments.)
 */
inline static int64_t hnib2byte_v2(__m256i nibbles) {
    // Per 128-bit lane: collect bytes 2, 6, 10, 14 into the lane's low dword.
    __m256i shufmask8 = _mm256_set_epi8(-1,-1,-1,-1, -1,-1,-1,-1, 14,10,6,2, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, 14,10,6,2);
    // Cross-lane: dword 0 <- dword 0, dword 1 <- dword 5, rest <- dword 7 (zero here).
    __m256i shufmask32 = _mm256_set_epi32(7,7,7,7,7,7,5,0);
    __m256i lower = _mm256_slli_epi32(nibbles, 20);
    // 00E0000000C00000 00A0000000800000 0060000000400000 0020000000000000
    __m256i up_lo = _mm256_or_si256(lower, nibbles);
    // 00EF000E00CD000C 00AB000A00890008 0067000600450004 0023000200010000
    __m256i pck = _mm256_shuffle_epi8(up_lo, shufmask8);
    // 0000000000000000 EFCDAB8900000000 0000000000000000 0000000067452301
    __m256i pck64 = _mm256_permutevar8x32_epi32(pck, shufmask32);
    // 0000000000000000 0000000000000000 0000000000000000 EFCDAB8967452301
    return _mm_cvtsi128_si64(_mm256_castsi256_si128(pck64));
}
// Reference version from the question, kept for comparison in main().
// Packs sixteen nibble-valued 16-bit elements into eight bytes:
// result byte i = (elem[2i] << 4) | elem[2i+1], returned in the low 64 bits.
inline static int64_t hnib2byte(__m256i nibbles) {
    // vpshufb control: per 32-bit lane, bring byte 2 (the low byte of the
    // odd 16-bit element) down to byte 0; 255 entries yield zero bytes.
    const __m256i pick_odd_nibble = _mm256_setr_epi8(
        2, 255, 255, 255, 6, 255, 255, 255, 10, 255, 255, 255, 14, 255, 255, 255,
        2, 255, 255, 255, 6, 255, 255, 255, 10, 255, 255, 255, 14, 255, 255, 255);
    // Keep only byte 0 of every 32-bit lane.
    const __m256i lane_byte0 = _mm256_setr_epi8(
        255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0,
        255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0);

    // Even elements: nibble moves into bits 4..7 of its own word.
    __m256i evens_shifted = _mm256_slli_epi16(nibbles, 4);
    // Odd elements: nibble lands in bits 0..3 of the lane's byte 0.
    __m256i odds_aligned = _mm256_shuffle_epi8(nibbles, pick_odd_nibble);
    // Merge both halves and drop everything but the assembled byte per lane.
    __m256i merged = _mm256_and_si256(_mm256_or_si256(evens_shifted, odds_aligned), lane_byte0);

    // Two unsigned-saturating word->byte packs, with a cross-lane 64-bit
    // permute in between (vpackuswb only packs within 128-bit lanes),
    // funnel the eight bytes into the lowest qword.
    __m256i words = _mm256_packus_epi16(merged, merged);
    __m256i gathered = _mm256_permute4x64_epi64(words, 0b00001000); // qwords 0 and 2 :(
    __m256i packed = _mm256_packus_epi16(gathered, gathered);
    return _mm_cvtsi128_si64(_mm256_castsi256_si128(packed));
}
/*
 * Print the four 64-bit lanes of ymm in hex, most-significant lane first.
 * Always returns 0.
 *
 * Fix: the original used `long unsigned int x[4]`, which is only 16 bytes
 * on LLP64 targets (e.g. 64-bit Windows), so the 32-byte vpmovdqu store
 * overflowed the buffer, and %lX mismatched 64-bit values there. uint64_t
 * plus PRIX64 is correct everywhere and prints identically on LP64.
 */
int print_avx2_hex(__m256i ymm)
{
    uint64_t x[4];
    _mm256_storeu_si256((__m256i*)x, ymm);
    printf("%016" PRIX64 " %016" PRIX64 " %016" PRIX64 " %016" PRIX64 "\n",
           x[3], x[2], x[1], x[0]);
    return 0;
}
/*
 * Demo driver: build a vector with one nibble (0..15) per 16-bit element
 * and show that both packing routines produce the same 64-bit result,
 * 0xEFCDAB8967452301.
 */
int main()
{
    uint64_t x;
    // _mm256_set_epi16 lists elements high-to-low, so element i in memory
    // order holds the value i.
    __m256i nibble_x16 = _mm256_set_epi16(0x000F,0x000E,0x000D,0x000C, 0x000B,0x000A,0x0009,0x0008,
                                          0x0007,0x0006,0x0005,0x0004, 0x0003,0x0002,0x0001,0x0000);
    printf("AVX variable: \n");
    print_avx2_hex(nibble_x16);
    x = hnib2byte(nibble_x16);
    // PRIX64 matches uint64_t portably; plain %lX is UB where long is 32-bit.
    printf("With hnib2byte x = %016" PRIX64 " \n\n", x);
    printf("AVX variable: \n");
    print_avx2_hex(nibble_x16);
    x = hnib2byte_v2(nibble_x16);
    printf("With hnib2byte_v2 x = %016" PRIX64 " \n", x);
    return 0;
}
The output is:
$ ./a.out
AVX variable:
000F000E000D000C 000B000A00090008 0007000600050004 0003000200010000
With hnib2byte x = EFCDAB8967452301
AVX variable:
000F000E000D000C 000B000A00090008 0007000600050004 0003000200010000
With hnib2byte_v2 x = EFCDAB8967452301
The output of the two methods is equal for the input chosen here.
Apart from loading the shuffle constants, which should be done outside the loop, it compiles to only five instructions:
vpslld
,vpor
,vpshufb
, vpermd
,and vmovq
, which is three fewer than your solution.
Upvotes: 3