Reputation: 794
I know there is already a question about a fast stride-3 gather with AVX2. I am wondering what the fastest stride-2 gather sequence is; say I want to load all odd-indexed elements of a length-16 vector into `ymm0`.
In particular, I am wondering about the relative benefits and costs of
If 2) is always better than 1), what is the best sequence of instructions to use?
Upvotes: 1
Views: 644
Reputation: 3968
Since `vshufps` and `vpermps` both execute on port 5 (Intel Skylake), I would prefer `vblendps` + `vpermps` over `vshufps` + `vpermps`, for a better instruction mix. On Intel Skylake, `vblendps` can execute on port 0, 1 or 5. The following solution uses 2 overlapping vector loads:
#include <stdio.h>
#include <immintrin.h>
/*
 * Gather the eight odd-indexed floats a[1], a[3], ..., a[15] into one
 * 256-bit vector, in ascending index order.
 *
 * Uses two overlapping unaligned loads that together read a[1] .. a[15],
 * so `a` must point to at least 16 readable floats.
 */
__m256 stide_2_load_odd(float * a){
    /* Destination lane i takes source lane lane_order[i]: the even lanes
       of the blended vector first, then its odd lanes. */
    const __m256i lane_order = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);

    __m256 head = _mm256_loadu_ps(a + 1);   /* a[1] .. a[8]  */
    __m256 tail = _mm256_loadu_ps(a + 8);   /* a[8] .. a[15] */

    /* Mask 0xAA keeps the even lanes of head and the odd lanes of tail:
       a[1], a[9], a[3], a[11], a[5], a[13], a[7], a[15]. */
    __m256 mixed = _mm256_blend_ps(head, tail, 0xAA);

    return _mm256_permutevar8x32_ps(mixed, lane_order);
}
/*
 * Gather the eight even-indexed floats a[0], a[2], ..., a[14] into one
 * 256-bit vector, in ascending index order.
 *
 * Uses two overlapping unaligned loads that together read a[0] .. a[14],
 * so `a` must point to at least 15 readable floats.
 */
__m256 stide_2_load_even(float * a){
    /* Destination lane i takes source lane lane_order[i]: the even lanes
       of the blended vector first, then its odd lanes. */
    const __m256i lane_order = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);

    __m256 head = _mm256_loadu_ps(a);       /* a[0] .. a[7]  */
    __m256 tail = _mm256_loadu_ps(a + 7);   /* a[7] .. a[14] */

    /* Mask 0xAA keeps the even lanes of head and the odd lanes of tail:
       a[0], a[8], a[2], a[10], a[4], a[12], a[6], a[14]. */
    __m256 mixed = _mm256_blend_ps(head, tail, 0xAA);

    return _mm256_permutevar8x32_ps(mixed, lane_order);
}
/*
 * Demo driver: runs both stride-2 loads on a 16-element sample array and
 * prints the eight gathered values of each, one per line.
 */
int main()
{
    float src[] = {0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 9.1, 10.1, 11.1, 12.1, 13.1, 14.1, 15.1};
    float out[8];
    __m256 gathered;
    int k;

    /* Odd-indexed elements: spill the vector to memory so it can be printed. */
    gathered = stide_2_load_odd(src);
    _mm256_storeu_ps(out, gathered);
    printf("odd indices 1, 3, 5, ...\n");
    for (k = 0; k != 8; ++k) {
        printf("y[%i] = %f \n", k, out[k]);
    }

    /* Even-indexed elements, reusing the same output buffer. */
    gathered = stide_2_load_even(src);
    _mm256_storeu_ps(out, gathered);
    printf("\neven indices 0, 2, 4, ...\n");
    for (k = 0; k != 8; ++k) {
        printf("y[%i] = %f \n", k, out[k]);
    }

    return 0;
}
The output is:
$gcc -Wall -O3 -march=skylake -o main *.c
$main
odd indices 1, 3, 5, ...
y[0] = 1.100000
y[1] = 3.100000
y[2] = 5.100000
y[3] = 7.100000
y[4] = 9.100000
y[5] = 11.100000
y[6] = 13.100000
y[7] = 15.100000
even indices 0, 2, 4, ...
y[0] = 0.100000
y[1] = 2.100000
y[2] = 4.100000
y[3] = 6.100000
y[4] = 8.100000
y[5] = 10.100000
y[6] = 12.100000
y[7] = 14.100000
Here unaligned loads are used. On modern CPUs these do not cause any performance penalty, as long as the read operation from memory does not cross a cache-line boundary. Therefore it is preferable to call these two functions with a 64-byte-aligned address `a`. See also Peter Cordes’ comment.
Upvotes: 1