Reputation: 759
I have an array that consists of 32 bytes. I need to build eight 4-byte integers out of this array. E.g. the 8-bit values 0x00,0x11,0x22,0x33 need to become one 32-bit integer 0x00112233. I decided to use AVX instructions because I can load the whole array into a register with one instruction.
Code I wrote:
#include <stdio.h>
#include "immintrin.h"
typedef unsigned int uint32_t;
typedef unsigned char uint8_t;
main() {
const uint8_t block[32] __attribute((aligned(32))) = {
0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77,0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff
,0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77,0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff
};
uint32_t m[8] __attribute((aligned(32)));
__m256i ymm9 = _mm256_set_epi8(
block[ 0],block[ 1],block[ 2],block[ 3],block[ 4],block[ 5],block[ 6],block[ 7],
block[ 8],block[ 9],block[10],block[11],block[12],block[13],block[14],block[15],
block[16],block[17],block[18],block[19],block[20],block[21],block[22],block[23],
block[24],block[25],block[26],block[27],block[28],block[29],block[30],block[31]);
_mm256_store_si256((__m256i*)&m[0], ymm9);
int i;
for(i=0;i<32;++i) printf("i=%d, 0x%02x\n",i,block[i]);
for(i=0;i<8;++i) printf("i=%d, 0x%08x\n",i,m[i]);
}
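For comparison, a plain scalar version of the conversion (assuming each 4-byte group becomes one big-endian 32-bit value, kept in source order) would be something like this; it is the baseline I would like to beat:
/* combine every 4 bytes into one big-endian 32-bit value */
for(i=0;i<8;++i)
    m[i] = ((uint32_t)block[4*i]   << 24) |
           ((uint32_t)block[4*i+1] << 16) |
           ((uint32_t)block[4*i+2] <<  8) |
            (uint32_t)block[4*i+3];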
Do you think it is optimal in terms of performance? Can it be done better and made to run faster? I use Linux on x86_64 and gcc 4.8.2.
I am a beginner in the world of Intel intrinsics. Thanks for your help.
Upvotes: 3
Views: 454
Reputation: 759
Thanks all for the comments, especially harold's and Zboson's.
This is my second try:
const uint8_t block[32] __attribute((aligned(32))) = {
0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77,
0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff,
0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77,
0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff};
uint32_t m[8] __attribute((aligned(32)));
const uint8_t maska[16] __attribute((aligned(16))) = {
0x0F,0x0E,0x0D,0x0C,0x0B,0x0A,0x09,0x08,
0x07,0x06,0x05,0x04,0x03,0x02,0x01,0x00};
__m128i mask = _mm_load_si128((const __m128i*)maska);
__m128i xmm0 = _mm_load_si128((const __m128i*)block);
_mm_store_si128((__m128i*)&m[0], _mm_shuffle_epi8(xmm0, mask));
xmm0 = _mm_load_si128((const __m128i*)(block+16));
_mm_store_si128((__m128i*)&m[4], _mm_shuffle_epi8(xmm0, mask));
What do you think about that? I am pretty sure there is room for improvement. I do not know if _mm_load_si128 is the best way to copy data from memory to a register. Interleaved source/assembly for the first iteration:
/* Create a vector with element 0 as *P and the rest zero. */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_si128 (__m128i const *__P)
{
return *__P;
mov 0x8(%rsp),%rax
vmovdqa (%rax),%xmm0
0x0F,0x0E,0x0D,0x0C,0x0B,0x0A,0x09,0x08,
0x07,0x06,0x05,0x04,0x03,0x02,0x01,0x00};
__m128i mask = _mm_load_si128(maska);
vmovdqa %xmm0,0x30(%rsp)
lea 0xf0(%rsp),%rax
mov %rax,0x10(%rsp)
mov 0x10(%rsp),%rax
vmovdqa (%rax),%xmm0
__m128i xmm0 = _mm_load_si128(block);
vmovdqa %xmm0,0x40(%rsp)
vmovdqa 0x40(%rsp),%xmm0
vmovdqa %xmm0,0x50(%rsp)
vmovdqa 0x30(%rsp),%xmm0
vmovdqa %xmm0,0x60(%rsp)
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8 (__m128i __X, __m128i __Y)
{
return (__m128i) __builtin_ia32_pshufb128 ((__v16qi)__X, (__v16qi)__Y);
vmovdqa 0x60(%rsp),%xmm1
vmovdqa 0x50(%rsp),%xmm0
vpshufb %xmm1,%xmm0,%xmm0
lea 0xb0(%rsp),%rax
mov %rax,0x18(%rsp)
vmovdqa %xmm0,0x70(%rsp)
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_si128 (__m128i *__P, __m128i __B)
{
*__P = __B;
mov 0x18(%rsp),%rax
vmovdqa 0x70(%rsp),%xmm0
vmovdqa %xmm0,(%rax)
lea 0xf0(%rsp),%rax
add $0x10,%rax
mov %rax,0x20(%rsp)
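To see which ordering the shuffle actually gives me, I also print the vector result next to a byte-swapped scalar load of each 4-byte group, purely as a debugging aid (memcpy needs <string.h>; __builtin_bswap32 is a GCC built-in):
/* print the SIMD result next to a scalar big-endian interpretation of each group */
int i;
for(i=0;i<8;++i) {
    uint32_t be;
    memcpy(&be, block + 4*i, 4);   /* load 4 bytes exactly as stored in memory */
    be = __builtin_bswap32(be);    /* byte-swap, e.g. 0x33221100 -> 0x00112233 */
    printf("i=%d, simd=0x%08x, scalar=0x%08x\n", i, m[i], be);
}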
What do you think?
Upvotes: 1
Reputation: 64913
As usual, check the disassembly. As it turns out, with the compiler I used anyway, it relies on the data being a compile-time constant and rearranges it so that it can be loaded easily. If that is actually the case in your real code, this is fine (but then why not use an array of uints to begin with?). But if, as I suspect, this is just an example and the actual array will be variable, this is a disaster; just look at it:
movzx eax, BYTE PTR [rsp+95]
xor ebx, ebx
mov BYTE PTR [rsp], al
movzx eax, BYTE PTR [rsp+93]
vmovd xmm7, DWORD PTR [rsp]
vpinsrb xmm7, xmm7, BYTE PTR [rsp+94], 1
mov BYTE PTR [rsp], al
movzx eax, BYTE PTR [rsp+91]
vmovd xmm3, DWORD PTR [rsp]
vpinsrb xmm3, xmm3, BYTE PTR [rsp+92], 1
mov BYTE PTR [rsp], al
movzx eax, BYTE PTR [rsp+89]
vmovd xmm1, DWORD PTR [rsp]
vpinsrb xmm1, xmm1, BYTE PTR [rsp+90], 1
mov BYTE PTR [rsp], al
movzx eax, BYTE PTR [rsp+87]
vmovd xmm6, DWORD PTR [rsp]
vpunpcklwd xmm3, xmm7, xmm3
vpinsrb xmm6, xmm6, BYTE PTR [rsp+88], 1
mov BYTE PTR [rsp], al
movzx eax, BYTE PTR [rsp+85]
vmovd xmm5, DWORD PTR [rsp]
vpinsrb xmm5, xmm5, BYTE PTR [rsp+86], 1
mov BYTE PTR [rsp], al
movzx eax, BYTE PTR [rsp+83]
vmovd xmm2, DWORD PTR [rsp]
vpunpcklwd xmm1, xmm1, xmm6
vpinsrb xmm2, xmm2, BYTE PTR [rsp+84], 1
mov BYTE PTR [rsp], al
movzx eax, BYTE PTR [rsp+81]
vmovd xmm0, DWORD PTR [rsp]
vpunpckldq xmm1, xmm3, xmm1
vpinsrb xmm0, xmm0, BYTE PTR [rsp+82], 1
mov BYTE PTR [rsp], al
movzx eax, BYTE PTR [rsp+79]
vmovd xmm4, DWORD PTR [rsp]
vpunpcklwd xmm2, xmm5, xmm2
vpinsrb xmm4, xmm4, BYTE PTR [rsp+80], 1
mov BYTE PTR [rsp], al
movzx eax, BYTE PTR [rsp+77]
vmovd xmm8, DWORD PTR [rsp]
vpinsrb xmm8, xmm8, BYTE PTR [rsp+78], 1
mov BYTE PTR [rsp], al
movzx eax, BYTE PTR [rsp+75]
vpunpcklwd xmm0, xmm0, xmm4
vmovd xmm4, DWORD PTR [rsp]
vpinsrb xmm4, xmm4, BYTE PTR [rsp+76], 1
mov BYTE PTR [rsp], al
movzx eax, BYTE PTR [rsp+73]
vpunpckldq xmm0, xmm2, xmm0
vmovd xmm2, DWORD PTR [rsp]
vpinsrb xmm2, xmm2, BYTE PTR [rsp+74], 1
mov BYTE PTR [rsp], al
movzx eax, BYTE PTR [rsp+71]
vmovd xmm7, DWORD PTR [rsp]
vpunpcklqdq xmm1, xmm1, xmm0
vpunpcklwd xmm4, xmm8, xmm4
vpinsrb xmm7, xmm7, BYTE PTR [rsp+72], 1
mov BYTE PTR [rsp], al
movzx eax, BYTE PTR [rsp+69]
vmovd xmm6, DWORD PTR [rsp]
vpinsrb xmm6, xmm6, BYTE PTR [rsp+70], 1
mov BYTE PTR [rsp], al
movzx eax, BYTE PTR [rsp+67]
vmovd xmm0, DWORD PTR [rsp]
vpunpcklwd xmm2, xmm2, xmm7
vpinsrb xmm0, xmm0, BYTE PTR [rsp+68], 1
mov BYTE PTR [rsp], al
movzx eax, BYTE PTR [rsp+65]
vmovd xmm5, DWORD PTR [rsp]
vpunpckldq xmm2, xmm4, xmm2
vpinsrb xmm5, xmm5, BYTE PTR [rsp+66], 1
mov BYTE PTR [rsp], al
vmovd xmm3, DWORD PTR [rsp]
vpunpcklwd xmm0, xmm6, xmm0
vpinsrb xmm3, xmm3, BYTE PTR [rsp+64], 1
vpunpcklwd xmm3, xmm5, xmm3
vpunpckldq xmm0, xmm0, xmm3
vpunpcklqdq xmm0, xmm2, xmm0
vinserti128 ymm0, ymm1, xmm0, 0x1
vmovdqa YMMWORD PTR [rsp+32], ymm0
Wow. OK, not so good. Indeed worse than if the same thing were done without intrinsics, but not all is lost. It would be better to load the data as little-endian uints and then swap them around with a _mm256_shuffle_epi8, sort of like this (but check that shuffle mask, I didn't test it):
__m256i ymm9 = _mm256_shuffle_epi8(_mm256_load_si256((const __m256i*)block), _mm256_set_epi8(
0, 1, 2, 3,
4, 5, 6, 7,
8, 9, 10, 11,
12, 13, 14, 15,
0, 1, 2, 3,
4, 5, 6, 7,
8, 9, 10, 11,
12, 13, 14, 15));
ymm9 = _mm256_permute2x128_si256(ymm9, ymm9, 1);
_mm256_store_si256((__m256i*)m, ymm9);
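If the goal is actually to byte-swap each 4-byte group in place (so that m[0] == 0x00112233, m[1] == 0x44556677 and so on, in source order), the same load-and-shuffle idea works with a per-dword mask and no lane permute; the permute in the version above is there to complete a full 32-byte reversal, because vpshufb cannot move bytes across the 128-bit lanes. Again untested, so verify the mask:
/* mask reverses the bytes within each 32-bit element; both 128-bit lanes use the same pattern */
__m256i bswap_mask = _mm256_set_epi8(
    12, 13, 14, 15,  8,  9, 10, 11,
     4,  5,  6,  7,  0,  1,  2,  3,
    12, 13, 14, 15,  8,  9, 10, 11,
     4,  5,  6,  7,  0,  1,  2,  3);
__m256i ymm9bswap = _mm256_shuffle_epi8(_mm256_load_si256((const __m256i*)block), bswap_mask);
_mm256_store_si256((__m256i*)m, ymm9bswap);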
In general, be very careful with the "set" family of intrinsics; they can compile to very bad instruction sequences.
Upvotes: 3