Global bitwise shift of 128-, 256-, 512-bit registers using intrinsics?

Question

Consider an array of 64 bit unsigned integers, like:

std::array<std::uint64_t, N> a;

What is the fastest way, including Intel or compiler intrinsics (this or that) and using g++ 5.3, to perform a global bit shift (right or left), as if this array were a single big integer?

user1940376 · Accepted Answer

Here are some x86 left shift functions that use xmm and ymm registers through intrinsics. It shouldn't be too hard to make corresponding right shift functions. They are taken from a software lfsr benchmark:

//----------------------------------------------------------------------------
// bit shift left a 128-bit value using xmm registers
//          __m128i *data - data to shift, updated in place
//          int count     - number of bits to shift, 0..64
// return:  __m128i       - bits shifted out of the top of *data, right
//                          aligned in the low qword; the high qword is zero
//
// Both qwords are shifted independently, then the bits that left the low
// qword are ORed into the bottom of the high qword.  The carries are moved
// between lanes with whole-qword byte shifts so that all 64 bits of a carry
// survive and the vacated lane is really zero; a dword shuffle only gives
// the right answer while the carry fits in 32 bits (count <= 32).
// count == 0 leaves *data unchanged and returns zero, because a 64-bit
// logical shift by 64 produces zero.

static __m128i bitShiftLeft128xmm (__m128i *data, int count)
   {
   __m128i innerCarry, carryOut;

   innerCarry = _mm_srli_epi64 (*data, 64 - count);      // top 'count' bits of each qword, right aligned
   carryOut   = _mm_srli_si128 (innerCarry, 8);          // upper carry in low qword, high qword zero
   innerCarry = _mm_slli_si128 (innerCarry, 8);          // lower carry in high qword, low qword zero
   *data = _mm_slli_epi64 (*data, count);                // shift all qwords left
   *data = _mm_or_si128 (*data, innerCarry);             // propagate carry out from low qword
   return carryOut;
   }

//----------------------------------------------------------------------------
// bit shift left a 256-bit value using xmm registers
//          __m128i *data - data to shift, ls part stored first 
//          int count     - number of bits to shift
// return:  __m128i       - carry out bit(s)

static __m128i bitShiftLeft256xmm (__m128i *data, int count)
   {
   __m128i carryOut0, carryOut1;

   carryOut0 = bitShiftLeft128xmm (&data [0], count);
   carryOut1 = bitShiftLeft128xmm (&data [1], count);
   data [1] = _mm_or_si128 (data [1], carryOut0);
   return carryOut1;
   }

//----------------------------------------------------------------------------
// bit shift left a 512-bit value using xmm registers
//          __m128i *data - data to shift, ls part stored first 
//          int count     - number of bits to shift
// return:  __m128i       - carry out bit(s)

static __m128i bitShiftLeft512xmm (__m128i *data, int count)
   {
   __m128i carryOut0, carryOut1;

   carryOut0 = bitShiftLeft256xmm (&data [0], count);
   carryOut1 = bitShiftLeft256xmm (&data [2], count);
   data [2] = _mm_or_si128 (data [2], carryOut0);
   return carryOut1;
   }


//----------------------------------------------------------------------------
// bit shift left a 256-bit value using ymm registers
//          __m256i *data - data to shift, updated in place
//          int count     - number of bits to shift
// return:  __m256i       - bits shifted out of the top qword, right aligned
//                          in qword 0; qwords 1..3 are zero

static __m256i bitShiftLeft256ymm (__m256i *data, int count)
   {
   const __m256i source  = *data;
   const __m256i spilled = _mm256_srli_epi64 (source, 64 - count);                        // bits leaving the top of each qword
   const __m256i rotated = _mm256_permute4x64_epi64 (spilled, 0x93);                      // qword i -> i+1, qword 3 -> qword 0
   const __m256i carryIn = _mm256_blend_epi32 (_mm256_setzero_si256 (), rotated, 0xFC);   // zero qword 0, keep qwords 1..3

   // shift every qword, then drop in the carry from the qword below it
   *data = _mm256_or_si256 (_mm256_slli_epi64 (source, count), carryIn);

   // qwords 1..3 are equal in both operands and cancel; only qword 0 survives
   return _mm256_xor_si256 (carryIn, rotated);
   }

//----------------------------------------------------------------------------
// bit shift left a 512-bit value using ymm registers
//          __m256i *data - data to shift, ls part stored first 
//          int count     - number of bits to shift
// return:  __m256i       - carry out bit(s)

static __m256i bitShiftLeft512ymm (__m256i *data, int count)
   {
   __m256i carryOut0, carryOut1;

   carryOut0 = bitShiftLeft256ymm (&data [0], count);
   carryOut1 = bitShiftLeft256ymm (&data [1], count);
   data [1] = _mm256_or_si256 (data [1], carryOut0);
   return carryOut1;
   }

//----------------------------------------------------------------------------

Global bitwise shift of 128-, 256-, 512-bit registers using intrinsics?

Answers (2)

Related Questions