Marcin Badtke
Marcin Badtke

Reputation: 759

How to build 32bit integers from array of 8bit integers using Intel intrinsics?

I have an array which consists of 32 bytes. I need to build 8 4-byte integers out of this array. E.g. the four 8-bit ints 0x00,0x11,0x22,0x33 need to become one 32-bit int, 0x00112233. I decided to use AVX instructions because I can load the whole array into a register with one command.

Code I wrote:

#include <stdio.h>
#include "immintrin.h"

typedef unsigned int        uint32_t;
typedef unsigned char       uint8_t;

/*
 * Demo: pack 32 bytes into eight 32-bit words via one AVX register.
 *
 * Fixes vs. the original:
 *  - `main()` with an implicit int return type is invalid since C99:
 *    declare it as `int main(void)` and return 0.
 *  - `_mm256_store_si256` expects a `(__m256i *)`; storing through
 *    `&m[0]` without a cast is an incompatible-pointer-type violation.
 *  - `__attribute` -> `__attribute__` (the canonical GCC spelling).
 */
int main(void) {
  const uint8_t block[32] __attribute__((aligned(32))) = {
   0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77,0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff
  ,0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77,0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff
  };
  uint32_t m[8] __attribute__((aligned(32)));

  /* NOTE: _mm256_set_epi8 takes its arguments most-significant (e31)
     first, so block[0] becomes the top byte of the HIGHEST dword:
     m[7] == 0x00112233, m[0] == 0xccddeeff. */
  __m256i ymm9 = _mm256_set_epi8(
        block[ 0],block[ 1],block[ 2],block[ 3],block[ 4],block[ 5],block[ 6],block[ 7],
        block[ 8],block[ 9],block[10],block[11],block[12],block[13],block[14],block[15],
        block[16],block[17],block[18],block[19],block[20],block[21],block[22],block[23],
        block[24],block[25],block[26],block[27],block[28],block[29],block[30],block[31]);
  _mm256_store_si256((__m256i*)&m[0], ymm9);   /* m is 32-byte aligned */

  for (int i = 0; i < 32; ++i) printf("i=%d, 0x%02x\n", i, block[i]);
  for (int i = 0; i < 8; ++i)  printf("i=%d, 0x%08x\n", i, m[i]);
  return 0;
}

Do you think it is optimal in terms of performance? Can it be done better and run faster? I use Linux @x86_64 and gcc 4.8.2.

I am a beginner in the world of Intel intrinsics. Thanks for your help.

Upvotes: 3

Views: 454

Answers (2)

Marcin Badtke
Marcin Badtke

Reputation: 759

Thanks all for comments. Especially harold's and Zboson's.

This is my second try:

const uint8_t block[32] __attribute__((aligned(32))) = {
  0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77,
  0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff,
  0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77,
  0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff};
uint32_t m[8] __attribute__((aligned(32)));
/* pshufb control: destination byte i takes source byte maska[i], i.e.
   each 16-byte half is fully byte-reversed — the same dword layout the
   earlier _mm256_set_epi8 version produced. */
const uint8_t maska[16] __attribute__((aligned(16))) = {
  0x0F,0x0E,0x0D,0x0C,0x0B,0x0A,0x09,0x08,
  0x07,0x06,0x05,0x04,0x03,0x02,0x01,0x00};
/* FIX: _mm_load_si128 takes a (__m128i const *); the raw uint8_t pointers
   must be cast explicitly, otherwise this is an incompatible-pointer-type
   error (see the intrinsic's own prototype). */
__m128i mask = _mm_load_si128((const __m128i*)maska);
__m128i xmm0 = _mm_load_si128((const __m128i*)block);
_mm_store_si128((__m128i*)&(m[0]), _mm_shuffle_epi8(xmm0, mask));
xmm0 = _mm_load_si128((const __m128i*)(block+16));
_mm_store_si128((__m128i*)&(m[4]), _mm_shuffle_epi8(xmm0, mask));

What do you think about that? I am pretty sure there is room for improvement. I do not know if _mm_load_si128 is the best way to copy data from memory to a register. Assembly for the first iteration:

/* Create a vector with element 0 as *P and the rest zero.  */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_si128 (__m128i const *__P)
{
return *__P;
mov    0x8(%rsp),%rax
vmovdqa (%rax),%xmm0
    0x0F,0x0E,0x0D,0x0C,0x0B,0x0A,0x09,0x08,
    0x07,0x06,0x05,0x04,0x03,0x02,0x01,0x00};
__m128i mask = _mm_load_si128(maska);
vmovdqa %xmm0,0x30(%rsp)
lea    0xf0(%rsp),%rax
mov    %rax,0x10(%rsp)
mov    0x10(%rsp),%rax
vmovdqa (%rax),%xmm0
__m128i xmm0 = _mm_load_si128(block);
vmovdqa %xmm0,0x40(%rsp)
vmovdqa 0x40(%rsp),%xmm0
vmovdqa %xmm0,0x50(%rsp)
vmovdqa 0x30(%rsp),%xmm0
vmovdqa %xmm0,0x60(%rsp)
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8 (__m128i __X, __m128i __Y)
{
return (__m128i) __builtin_ia32_pshufb128 ((__v16qi)__X, (__v16qi)__Y);
vmovdqa 0x60(%rsp),%xmm1
vmovdqa 0x50(%rsp),%xmm0
vpshufb %xmm1,%xmm0,%xmm0
lea    0xb0(%rsp),%rax
mov    %rax,0x18(%rsp)
vmovdqa %xmm0,0x70(%rsp)
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_si128 (__m128i *__P, __m128i __B)
{
*__P = __B;
mov    0x18(%rsp),%rax
vmovdqa 0x70(%rsp),%xmm0
vmovdqa %xmm0,(%rax)
lea    0xf0(%rsp),%rax
add    $0x10,%rax
mov    %rax,0x20(%rsp)

What do you think?

Upvotes: 1

user555045
user555045

Reputation: 64913

As usual, check the disassembly. As it turns out — with the compiler I used, anyway — it relies on that data being a compile-time constant, and it rearranges it so that it can be loaded easily. If that is actually the case in your real code, this is fine (but then why not use an array of uints to begin with?). But if, as I suspect, this is just an example and the actual array will be variable, this is a disaster — just look at it:

movzx   eax, BYTE PTR [rsp+95]
xor ebx, ebx
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+93]
vmovd   xmm7, DWORD PTR [rsp]
vpinsrb xmm7, xmm7, BYTE PTR [rsp+94], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+91]
vmovd   xmm3, DWORD PTR [rsp]
vpinsrb xmm3, xmm3, BYTE PTR [rsp+92], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+89]
vmovd   xmm1, DWORD PTR [rsp]
vpinsrb xmm1, xmm1, BYTE PTR [rsp+90], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+87]
vmovd   xmm6, DWORD PTR [rsp]
vpunpcklwd  xmm3, xmm7, xmm3
vpinsrb xmm6, xmm6, BYTE PTR [rsp+88], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+85]
vmovd   xmm5, DWORD PTR [rsp]
vpinsrb xmm5, xmm5, BYTE PTR [rsp+86], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+83]
vmovd   xmm2, DWORD PTR [rsp]
vpunpcklwd  xmm1, xmm1, xmm6
vpinsrb xmm2, xmm2, BYTE PTR [rsp+84], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+81]
vmovd   xmm0, DWORD PTR [rsp]
vpunpckldq  xmm1, xmm3, xmm1
vpinsrb xmm0, xmm0, BYTE PTR [rsp+82], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+79]
vmovd   xmm4, DWORD PTR [rsp]
vpunpcklwd  xmm2, xmm5, xmm2
vpinsrb xmm4, xmm4, BYTE PTR [rsp+80], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+77]
vmovd   xmm8, DWORD PTR [rsp]
vpinsrb xmm8, xmm8, BYTE PTR [rsp+78], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+75]
vpunpcklwd  xmm0, xmm0, xmm4
vmovd   xmm4, DWORD PTR [rsp]
vpinsrb xmm4, xmm4, BYTE PTR [rsp+76], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+73]
vpunpckldq  xmm0, xmm2, xmm0
vmovd   xmm2, DWORD PTR [rsp]
vpinsrb xmm2, xmm2, BYTE PTR [rsp+74], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+71]
vmovd   xmm7, DWORD PTR [rsp]
vpunpcklqdq xmm1, xmm1, xmm0
vpunpcklwd  xmm4, xmm8, xmm4
vpinsrb xmm7, xmm7, BYTE PTR [rsp+72], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+69]
vmovd   xmm6, DWORD PTR [rsp]
vpinsrb xmm6, xmm6, BYTE PTR [rsp+70], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+67]
vmovd   xmm0, DWORD PTR [rsp]
vpunpcklwd  xmm2, xmm2, xmm7
vpinsrb xmm0, xmm0, BYTE PTR [rsp+68], 1
mov BYTE PTR [rsp], al
movzx   eax, BYTE PTR [rsp+65]
vmovd   xmm5, DWORD PTR [rsp]
vpunpckldq  xmm2, xmm4, xmm2
vpinsrb xmm5, xmm5, BYTE PTR [rsp+66], 1
mov BYTE PTR [rsp], al
vmovd   xmm3, DWORD PTR [rsp]
vpunpcklwd  xmm0, xmm6, xmm0
vpinsrb xmm3, xmm3, BYTE PTR [rsp+64], 1
vpunpcklwd  xmm3, xmm5, xmm3
vpunpckldq  xmm0, xmm0, xmm3
vpunpcklqdq xmm0, xmm2, xmm0
vinserti128 ymm0, ymm1, xmm0, 0x1
vmovdqa YMMWORD PTR [rsp+32], ymm0

Wow. Ok, not so good. Indeed worse than if the same thing was done without intrinsics, but not all is lost. It would be better to load the data as little-endian uints, and then swap them around with a _mm256_shuffle_epi8, sort of like this (but check that shuffle mask, I didn't test it):

/* Byte-reverse within each 128-bit lane. NOTE: _mm256_shuffle_epi8 only
   shuffles WITHIN each 128-bit lane, so after this the low lane holds
   block[15..0] reversed and the high lane holds block[31..16] reversed. */
__m256i ymm9 = _mm256_shuffle_epi8(_mm256_load_si256((__m256i*)block), _mm256_set_epi8(
    0, 1, 2, 3,
    4, 5, 6, 7,
    8, 9, 10, 11,
    12, 13, 14, 15,
    0, 1, 2, 3,
    4, 5, 6, 7,
    8, 9, 10, 11,
    12, 13, 14, 15));
/* Swap the two 128-bit lanes (imm 0x01: result.lo = src.hi, result.hi =
   src.lo), completing a full 32-byte reversal. This matches the output of
   the question's _mm256_set_epi8 version, which likewise ends up with
   block[0..3] assembled into m[7] — verify against the intrinsics guide. */
ymm9 = _mm256_permute2x128_si256(ymm9, ymm9, 1);
_mm256_store_si256((__m256i*)m, ymm9);

In general, be very careful with the "set" family of intrinsics, they can compile to very bad instruction sequences.

Upvotes: 3

Related Questions