Reputation: 6774
__builtin_ia32_cvtb2mask512() is the GNU C builtin for vpmovb2m k, zmm. The Intel intrinsic for it is _mm512_movepi8_mask. It extracts the most-significant bit from each byte, producing an integer mask. The SSE2 and AVX2 instructions pmovmskb and vpmovmskb do the same thing for 16- or 32-byte vectors, producing the mask in a GPR instead of an AVX-512 mask register (_mm_movemask_epi8 and _mm256_movemask_epi8).
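For reference, a minimal usage sketch of those intrinsics (the function names here are just illustrative; the 64-byte form needs AVX-512BW):
#include <immintrin.h>
__mmask64 msb_mask_512(__m512i v) { return _mm512_movepi8_mask(v); }
int       msb_mask_128(__m128i v) { return _mm_movemask_epi8(v); }
int       msb_mask_256(__m256i v) { return _mm256_movemask_epi8(v); }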
I have attached a basic scalar implementation in C. For those trying to implement this on ARM: we care about the high bit of each byte, and in a 128-bit vector each byte's high bit can easily be shifted down to the low bit with the ARM NEON intrinsic vshrq_n_u8() (a sketch of that idea follows the scalar version below). Note that I would prefer not to store the bitmap to memory; it should just be the return value of the function, similar to the following function.
#define _(n) __attribute((vector_size(1<<n),aligned(1)))
typedef char V _(6); // 64 bytes, 512 bits
typedef unsigned long U;
#undef _
U generic_cvtb2mask512(V v) {
  U mask=0;
  // shift the mask left by 1 and OR in the MSB of byte v[i]
  // (note: byte 0 ends up in the most-significant bit of the result,
  //  the reverse of vpmovb2m/pmovmskb, which put byte 0 in bit 0)
  for(int i=0;i<64;i++)
    mask=(mask<<1)|((v[i]&0x80)>>7);
  return mask;
}
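Since vshrq_n_u8() is mentioned above, here is a minimal NEON sketch of one common way to pack the shifted bits of a 128-bit vector into a GPR without going through memory (plain arm_neon.h types and an illustrative function name rather than the vector_size typedefs; a well-known movemask substitute, not necessarily the fastest sequence on every core):
#include <arm_neon.h>
static unsigned short movemask_u8x16(uint8x16_t v) {
    // 1 in the low bit of every byte whose MSB was set
    uint16x8_t m1 = vreinterpretq_u16_u8(vshrq_n_u8(v, 7));
    // fold byte pairs, then 16-bit pairs, then 32-bit pairs,
    // accumulating the bits into the low byte of each 64-bit half
    uint32x4_t m2 = vreinterpretq_u32_u16(vsraq_n_u16(m1, m1, 7));
    uint64x2_t m3 = vreinterpretq_u64_u32(vsraq_n_u32(m2, m2, 14));
    uint8x16_t m4 = vreinterpretq_u8_u64(vsraq_n_u64(m3, m3, 28));
    // byte 0 holds bits 0..7, byte 8 holds bits 8..15 (element 0 -> bit 0, as pmovmskb does)
    return (unsigned short)(vgetq_lane_u8(m4, 0) | (vgetq_lane_u8(m4, 8) << 8));
}
For a 512-bit input this would be applied to each of the four 128-bit chunks and the upper results shifted into place.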
This is one possible algorithm for 16 bytes (a 128-bit vector); it would just need to be put into a loop for 64 bytes (a 512-bit vector):
#define _(n) __attribute((vector_size(1<<n),aligned(1)))
typedef char g4 _(4); // 16 bytes, 128 bits
typedef char g3 _(3); // 8 bytes, 64 bits
typedef unsigned long U;
#undef _
unsigned short get_16msb(g4 v) {
  unsigned short ret;
  // per byte, make every bit same as msb
  g4 msb = vdupq_n_u8(0x80);
  g4 filled = vceqq_u8(v, msb);
  // create a mask of each bit value
  g4 b = {0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01,
          0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};
  // and the vectors together
  g4 z = vandq_u8(filled, b);
  // extract the lower 8 bytes and the high 8 bytes
  g3 lo = vget_low_u8(z);
  g3 hi = vget_high_u8(z);
  // each surviving byte carries a distinct bit, so a horizontal add
  // is the same as OR-ing the 8 bytes together
  ret  = vaddv_u8(lo);                        // 'or' of lo goes in byte 1 of ret
  ret |= (unsigned short)vaddv_u8(hi) << 8;   // 'or' of hi goes in byte 2 of ret
  return ret;
}
Upvotes: 3
Views: 257
Reputation: 6774
Here is another solution using ARM NEON.
#include <stdio.h>
// clang -Ofast -ob0 b0.c -funsigned-char -fno-unwind-tables -w
#define _(n) __attribute((vector_size(1<<n),aligned(1)))
typedef char g4 _(4),g6 _(6); // 16 * 8b, 64 * 8b
typedef unsigned long U,j6 _(6); // 8 * 64b
#undef _
#define ATM __attribute((minsize,noinline))
ATM void p4(g4 v) { for(int i=0;i<sizeof(g4);i++) { printf("%u,%s", v[i], (i+1)%8?"":" ");} putchar('\n');}
ATM void pu(U u) {for(int i=63;i>=0;i--){putchar((u & (1ULL<<i))?'1':'0'); if(0==(i)%8) putchar(' ');} putchar('b'); putchar('\n');} //print
#define bu(f) __builtin_neon_v##f
#define xi ((g4*)&x)[i]
#define vadd bu(addv_u8)
#define vbsl bu(bslq_v)
#define vs __builtin_shufflevector
//static g4 MB={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80};
static g4 MC={0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80,0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80};
static g4 M0={0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0};
ATM static U getmsb_neon(g6 x) {
U u;g4 c;
for (int i = 0; i < 4; ++i) {
c = vbsl(xi>=128,MC,M0,48); // where the MSB is set (>=128), select that byte's bit value from MC //p4(c);
((char*)&u)[i*2] = vadd(vs(c,c,0,1,2,3,4,5,6,7));
((char*)&u)[i*2+1] = vadd(vs(c,c,8,9,10,11,12,13,14,15));
}
return u;
}
int main(int argc, char *argv[]) {
g6 a = {255,255,2,3,4,5,255,7, 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,7,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 255,255,255,128,128,128,128,128};
U u = getmsb_neon(a);
pu(u);
return 0;
}
Upvotes: 0
Reputation: 6774
//credit goes to Arthur Whitney
//clang -Ofast foo.c -o foo -funsigned-char -fno-unwind-tables -w
#include <stdio.h>
#define _(n) __attribute((vector_size(1<<n),aligned(1)))
typedef char g6 _(6); // 64 * 8b
typedef unsigned long U,j6 _(6); // 8 * 64b
void pu(U u) {for(int i=63;i>=0;i--){putchar((u & (1ULL<<i))?'1':'0'); if(0==(i)%8) putchar(' ');} putchar('b'); putchar('\n');} //print
// getmsb (for 8b) & pack bits
__attribute((minsize,noinline))
static U getmsb(g6 x) {
U u = 0; // resulting 64-bit mask
x=(j6)x & 0x8080808080808080; // get MSB of each byte
// 64 bytes: for each group of 8 bytes, multiply by the magic number,
// shift the 128-bit product right by 64 and keep the low 8 bits (% 256),
// then 'or' the packed byte into the result
for (int i = 0; i < 8; ++i) {
U bit = ((__uint128_t)0x204081020408100 * ((j6)x)[i] >> 64) % 256;
u |= bit << (8 * i);
}
return u;
}
int main (int argc, char **argv) {
g6 t = {0,1,2,3,4,255,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,255};
U u = getmsb(t);
pu(u);
return 0;
}
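To make the multiply step a bit more concrete, here is a hedged, standalone illustration for a single 8-byte group (the input value is made up): the MSBs sit at bit positions 7, 15, ..., 63, and each term of the magic constant shifts a copy of one of them so that they all land, carry-free, in bits 64..71 of the 128-bit product.
#include <stdio.h>
int main(void) {
    unsigned long long group = 0x8000000000000080ULL; // MSB set in byte 0 and byte 7
    unsigned long long packed =
        (unsigned long long)(((__uint128_t)0x204081020408100 * group) >> 64) % 256;
    printf("%02llx\n", packed); // prints 81: bit 0 for byte 0, bit 7 for byte 7
    return 0;
}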
Upvotes: 0
Reputation: 487
There's a difficulty in wanting to optimise the generic case when most of the best optimisations come from the specific one, especially from what you want to do with the results.
e.g. the code for "checking if any high bit is set" can be much cheaper than "checking which high bit is set".
// per byte, make every bit same as msb
g4 msb = vdupq_n_u8(0x80);
g4 filled = vceqq_u8(v, msb);
This won't make a difference in performance, but what it's really trying to do is check whether the sign bit is set, so just do vcltzq_s8(v); i.e. instead of v == 0x80, check whether the value is negative in a signed comparison.
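As a hedged sketch of that suggestion (assuming the input is a uint8x16_t and arm_neon.h is included; the function name is just illustrative):
uint8x16_t msb_set_bytes(uint8x16_t v) {
    // bytes become 0xFF where the sign bit is set, 0x00 otherwise
    return vcltzq_s8(vreinterpretq_s8_u8(v));
}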
If you only care about whether any value has the sign bit set, then for Adv. SIMD you can just use vpmaxq_s8 on the result of the comparison and do:
if (vgetq_lane_s64 (vreinterpretq_s64_s8 (res), 0))
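A minimal sketch of that check (assuming arm_neon.h; it uses the unsigned pairwise max, umaxp/vpmaxq_u8, as the strchr.S linked below does, so the all-ones lanes from the compare survive the reduction):
int any_msb_set(uint8x16_t v) {
    uint8x16_t cmp = vcltzq_s8(vreinterpretq_s8_u8(v)); // 0xFF where the MSB is set
    uint8x16_t red = vpmaxq_u8(cmp, cmp);               // low 64 bits now cover all 16 lanes
    return vgetq_lane_u64(vreinterpretq_u64_u8(red), 0) != 0;
}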
For SVE you don't need this, as the compare itself sets flags. You can do a ptest on the predicate result of the compare and branch on that; the compiler should be able to remove the ptest during optimization.
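A hedged SVE sketch of the same check (assuming arm_sve.h and a governing predicate pg):
int any_msb_set_sve(svuint8_t v, svbool_t pg) {
    svbool_t hit = svcmplt_n_s8(pg, svreinterpret_s8_u8(v), 0); // sign bit set => negative
    return svptest_any(pg, hit); // the compiler should fold this into the compare's flags
}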
If you need to know which element it is, there are various methods. As Peter Cordes says in the comments, for Adv. SIMD you can use an AND with a special mask followed by clz.
These patterns are common and are essentially strchr from the standard library, so for the best sequences I'd recommend checking what we have in Arm Optimized Routines, which we constantly update as we find better ways.
For Neon, https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/strchr.S is the file and does as described above.
For SVE, https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/strchr-sve.S: there's some additional code there because strchr needs to check for the null terminator, but the general idea is the same.
Upvotes: 1