AG1

Reputation: 6774

Pack high bit of every byte in ARM, for 64 bytes like AVX512 vpmovb2m?

__builtin_ia32_cvtb2mask512() is the GNU C builtin for vpmovb2m k, zmm.
The Intel intrinsic for it is _mm512_movepi8_mask.

It extracts the most-significant bit from each byte, producing an integer mask.

The SSE2 and AVX2 instructions pmovmskb and vpmovmskb do the same thing for 16 or 32-byte vectors, producing the mask in a GPR instead of an AVX-512 mask register. (_mm_movemask_epi8 and _mm256_movemask_epi8).
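
For concreteness, the x86 baseline being asked about looks roughly like this (a sketch; the wrapper name is mine, and it requires AVX-512BW):

#include <immintrin.h>

// byte i's most-significant bit becomes bit i of the returned 64-bit mask
__mmask64 cvtb2mask512(__m512i v) {
    return _mm512_movepi8_mask(v);
}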

  1. I would like an implementation for ARM that is faster than below
  2. I would like an implementation for ARM NEON
  3. I would like an implementation for ARM SVE

I have attached a basic scalar implementation in C. For those trying to implement this on ARM: we care about the high bit, and each byte's high bit (in a 128-bit vector) can easily be shifted down to the low bit using the ARM NEON intrinsic vshrq_n_u8(). Note that I would prefer not to store the bitmap to memory; it should just be the return value of the function, as in the following function.

#define _(n) __attribute((vector_size(1<<n),aligned(1)))
typedef char V  _(6); // 64 bytes, 512 bits
typedef unsigned long U;
#undef _
U generic_cvtb2mask512(V v) {
   U mask=0;int i=0; 
   while(i<64){
     // shift mask by 1 and OR with MSB of v[i] byte
     mask=(mask<<1)|((v[i]&0x80)>>7);
     i++;}
   return mask;
}

This is one possible algorithm for 16 bytes (a 128-bit vector); it would just need to be put into a loop for 64 bytes (a 512-bit vector):

#include <arm_neon.h> // for vdupq_n_u8, vceqq_u8, etc.

#define _(n) __attribute((vector_size(1<<n),aligned(1)))
typedef char g4 _(4); // 16 bytes, 128 bits
typedef char g3 _(3); // 8 bytes,   64 bits
typedef unsigned long U;
#undef _

unsigned short get_16msb(g4 v) {
  unsigned short ret = 0;

  // per byte, make every bit same as msb
  g4 msb = vdupq_n_u8(0x80);
  g4 filled = vceqq_u8(v, msb);

  // create a mask of each bit value
  g4 b = {0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01,
          0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};

  // and vectors together
  g4 z = vandq_u8 (filled,b);

  // extract lower 8 bytes, hi 8 bytes
  g3 lo = vget_low_u8(z);
  g3 hi = vget_high_u8(z);

  // 'or' the 8 bytes of lo together ...
  // put in byte 1 of ret
  // 'or' the 8 bytes of hi together ...  
  // put in byte 2 of ret

  return ret;
}
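
One way the missing reduction could be filled in, as a sketch (assumes AArch64 and <arm_neon.h>; vtstq_u8 sets a byte to 0xFF wherever the 0x80 bit is set, and because each lane of z then holds one distinct bit, adding the lanes with vaddv_u8 is equivalent to OR-ing them):

#include <arm_neon.h>

// sketch only; bit order follows the weight vector b below
unsigned short get_16msb_sketch(uint8x16_t v) {
  // 0xFF in every byte whose high (0x80) bit is set
  uint8x16_t filled = vtstq_u8(v, vdupq_n_u8(0x80));

  // per-lane bit weights: one distinct bit per lane within each 8-byte half
  const uint8x16_t b = {0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01,
                        0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};
  uint8x16_t z = vandq_u8(filled, b);

  // horizontal add of each 8-byte half == OR, since the set bits are distinct
  unsigned lo = vaddv_u8(vget_low_u8(z));
  unsigned hi = vaddv_u8(vget_high_u8(z));
  return (unsigned short)(lo | (hi << 8));
}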

Upvotes: 3

Views: 257

Answers (3)

AG1

Reputation: 6774

Here is another solution using ARM NEON.

#include <stdio.h>
// clang -Ofast -ob0 b0.c -funsigned-char -fno-unwind-tables -w
#define _(n) __attribute((vector_size(1<<n),aligned(1)))
typedef char g4 _(4),g6 _(6); // 64 * 8b
typedef unsigned long U,j6 _(6); // 8 * 64b
#undef _
#define ATM __attribute((minsize,noinline)) 
ATM void p4(g4 v) { for(int i=0;i<sizeof(g4);i++) { printf("%u,%s", v[i], (i+1)%8?"":" ");} putchar('\n');}
ATM void pu(U u) {for(int i=63;i>=0;i--){putchar((u & (1ULL<<i))?'1':'0'); if(0==(i)%8) putchar(' ');} putchar('b'); putchar('\n');} //print

#define bu(f) __builtin_neon_v##f
#define xi ((g4*)&x)[i]
#define vadd bu(addv_u8)
#define vbsl bu(bslq_v)
#define vs __builtin_shufflevector
//static g4 MB={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80};
static g4 MC={0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80,0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80};
static g4 M0={0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0};

ATM static U getmsb_neon(g6 x) {
  U u;g4 c;
  for (int i = 0; i < 4; ++i) {
    c = vbsl(xi>=128,MC,M0,48); // where the byte is >=128 (MSB set), select its bit weight //p4(c);
    ((char*)&u)[i*2]   = vadd(vs(c,c,0,1,2,3,4,5,6,7));
    ((char*)&u)[i*2+1] = vadd(vs(c,c,8,9,10,11,12,13,14,15));
  }
  return u;
}

int main(int argc, char *argv[]) {
 g6 a = {255,255,2,3,4,5,255,7, 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,7,
         0,0,0,0,0,0,0,0,       0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 255,255,255,128,128,128,128,128};

 U u = getmsb_neon(a);
 pu(u);
 return 0;
}

Upvotes: 0

AG1

Reputation: 6774

//credit goes to Arthur Whitney
//clang -Ofast -c foo.c -ofoo -funsigned-char -fno-unwind-tables -w
#include <stdio.h>
#define _(n) __attribute((vector_size(1<<n),aligned(1)))
typedef char g6 _(6); // 64 * 8b
typedef unsigned long U,j6 _(6); // 8 * 64b
void pu(U u) {for(int i=63;i>=0;i--){putchar((u & (1ULL<<i))?'1':'0'); if(0==(i)%8) putchar(' ');} putchar('b'); putchar('\n');} //print

// getmsb (for 8b) & pack bits
__attribute((minsize,noinline))
static U getmsb(g6 x) {
    U u = 0;  // resulting 64-bit mask
    x=(j6)x & 0x8080808080808080; // get MSB of each byte
    // 64 bytes: for each 8-byte word, multiply the masked word by a magic
    // constant so that the eight MSBs collect in bits 64..71 of the 128-bit
    // product (hence >> 64 and % 256), then OR the 8-bit result into place
    for (int i = 0; i < 8; ++i) {
        U bit = ((__uint128_t)0x204081020408100 * ((j6)x)[i] >> 64) % 256;
        u |= bit << (8 * i);
    }
    return u;
}

int main (int argc, char **argv) {
  g6 t = {0,1,2,3,4,255,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,255};
  U u = getmsb(t);
  pu(u);
  return 0;
}

Upvotes: 0

Ben Clark

Reputation: 487

There's a difficulty in wanting to optimise the generic case, when most of the best optimisations are specific to the situation, especially to what you want to do with the results.

E.g. the code for "checking if any high bit is set" can be much cheaper than the code for "checking which high bit is set".

  // per byte, make every bit same as msb
  g4 msb = vdupq_n_u8(0x80);
  g4 filled = vceqq_u8(v, msb);

won't make a difference in performance, but what it is really trying to do is check whether the sign bit is set, so just do vcltzq_s8(v); i.e. instead of comparing v == 0x80, check whether the value is negative in a signed comparison.

If you only care about whether any value has the sign bit set, for Adv. SIMD you can use an unsigned pairwise max (vpmaxq_u8, since the comparison result is all-ones or all-zeros per byte) on the result of the comparison and then just do:

if (vgetq_lane_u64 (vreinterpretq_u64_u8 (res), 0))
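
Putting that together, a minimal sketch of the "any high bit set" check (the helper name any_msb_set is made up, not from a library):

#include <arm_neon.h>
#include <stdbool.h>

// sketch: true if any of the 16 bytes has its high (sign) bit set
static inline bool any_msb_set(int8x16_t v) {
  uint8x16_t res = vcltzq_s8(v);          // 0xFF where the byte is negative
  uint8x16_t red = vpmaxq_u8(res, res);   // pairwise max folds 128 bits into the low 64
  return vgetq_lane_u64(vreinterpretq_u64_u8(red), 0) != 0;
}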

For SVE you don't need this, as the compare itself sets the condition flags. You can do a ptest on the predicate result of the compare and branch on that; the compiler should be able to remove the ptest during optimization.
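
A minimal SVE sketch of the same check (the helper name any_msb_set_sve is made up; svptest_any is the intrinsic the compiler lowers to a ptest or folds into the compare's flag setting):

#include <arm_sve.h>
#include <stdbool.h>

// sketch: true if any active byte has its high (sign) bit set
static inline bool any_msb_set_sve(svbool_t pg, svint8_t v) {
  svbool_t neg = svcmplt_n_s8(pg, v, 0);   // per-byte signed compare < 0
  return svptest_any(pg, neg);             // true if any active lane matched
}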

If you need to know which element it was, there are various methods. As Peter Cordes says in the comments, for Adv. SIMD you can use an AND with a special mask and clz.
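
A sketch of one such sequence (this uses a narrowing-shift ("shrn") trick rather than the exact AND+clz variant from the comments; the helper name first_msb_index is made up):

#include <arm_neon.h>
#include <stdint.h>

// sketch: index (0..15) of the first byte with its high bit set, or -1 if none
static inline int first_msb_index(uint8x16_t v) {
  uint8x16_t cmp = vcltzq_s8(vreinterpretq_s8_u8(v));            // 0xFF where MSB set
  // shift each 16-bit pair right by 4 and narrow: 4 result bits per input byte
  uint8x8_t narrowed = vshrn_n_u16(vreinterpretq_u16_u8(cmp), 4);
  uint64_t m = vget_lane_u64(vreinterpret_u64_u8(narrowed), 0);
  if (m == 0) return -1;
  return __builtin_ctzll(m) >> 2;                                // 4 bits per byte
}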

These patterns are common and are essentially what strchr from the standard library does. So for the best sequences I'd recommend checking what we have in Arm Optimized Routines, which we constantly update as we find better ways.

For Neon: https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/strchr.S is the relevant file and does as described above.

For SVE: https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/strchr-sve.S. There's some additional code there because strchr needs to check for the null terminator, but the general idea is the same.

Upvotes: 1
