Reputation: 2126
I'm trying to translate some scalar code (calc_offsets below) into the AVX2 equivalent. It takes a series of 'counts' and generates a table of offset positions starting from some provided base value: in effect an exclusive prefix sum, where offsets[i] = base + counts[0] + ... + counts[i-1].
My attempt at converting this to AVX2 (avx2_calc_offsets), which I think is correct, seems to be about half the speed of the simple array approach. This is part of an effort to translate a larger, bottlenecked hot section of code into AVX2 instructions, and I'm looking to process the offsets further as vectors. I'd like to avoid jumping between AVX2 and scalar code for operations like this.
Example and simple benchmarking code is provided below. I'm getting around 2.15 seconds of runtime for the array version and 4.41 seconds for the AVX2 version (on a first-generation Ryzen, Zen 1).
Is there a better approach using AVX2 to make this operation faster? I need to consider older AVX2 CPUs such as Haswell and the original Ryzen series.
#include <immintrin.h>
#include <inttypes.h>
#include <stdio.h>
typedef uint32_t u32;
typedef uint64_t u64;
void calc_offsets (const u32 base, const u32 *counts, u32 *offsets)
{
offsets[0] = base;
offsets[1] = offsets[0] + counts[0];
offsets[2] = offsets[1] + counts[1];
offsets[3] = offsets[2] + counts[2];
offsets[4] = offsets[3] + counts[3];
offsets[5] = offsets[4] + counts[4];
offsets[6] = offsets[5] + counts[5];
offsets[7] = offsets[6] + counts[6];
}
__m256i avx2_calc_offsets (const u32 base, const __m256i counts)
{
const __m256i shuff = _mm256_set_epi32 (6, 5, 4, 3, 2, 1, 0, 7);
__m256i v, t;
// shift whole vector `v` 4 bytes left and insert `base`
v = _mm256_permutevar8x32_epi32 (counts, shuff);
v = _mm256_insert_epi32 (v, base, 0);
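// v = { base, c0, c1, c2 | c3, c4, c5, c6 }  (element 0 listed first; cN = counts[N])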
// accumulate running total within 128-bit sub-lanes
v = _mm256_add_epi32 (v, _mm256_slli_si256 (v, 4));
v = _mm256_add_epi32 (v, _mm256_slli_si256 (v, 8));
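// v = { base, base+c0, base+c0+c1, base+c0+c1+c2 | c3, c3+c4, c3+c4+c5, c3+c4+c5+c6 }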
// add highest value in right-hand lane to each value in left
t = _mm256_set1_epi32 (_mm256_extract_epi32 (v, 3));
v = _mm256_blend_epi32 (_mm256_add_epi32 (v, t), v, 0x0F);
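// v = { base, base+c0, base+c0+c1, ..., base+c0+...+c6 }, i.e. the scalar offsets[0..7]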
return v;
}
int main (void)
{
u32 base = 900000000;
u32 counts[8] = { 5, 50, 500, 5000, 50000, 500000, 5000000, 50000000 };
u32 offsets[8];
calc_offsets (base, &counts[0], &offsets[0]);
printf ("calc_offsets: ");
for (int i = 0; i < 8; i++) printf (" %u", offsets[i]);
printf ("\n-----\n");
__m256i v, t;
v = _mm256_loadu_si256 ((__m256i *) &counts[0]);
t = avx2_calc_offsets (base, v);
_mm256_storeu_si256 ((__m256i *) &offsets[0], t);
printf ("avx2_calc_offsets: ");
for (int i = 0; i < 8; i++) printf (" %u", offsets[i]);
printf ("\n-----\n");
// --- benchmarking ---
#define ITERS 1000000000
// uncomment to benchmark AVX2 version
// #define AVX2_BENCH
#ifdef AVX2_BENCH
// benchmark AVX2 version
for (u64 i = 0; i < ITERS; i++) {
v = avx2_calc_offsets (base, v);
}
_mm256_storeu_si256 ((__m256i *) &offsets[0], v);
#else
// benchmark array version
u32 *c = &counts[0];
u32 *o = &offsets[0];
for (u64 i = 0; i < ITERS; i++) {
calc_offsets (base, c, o);
// feedback results to prevent optimizer 'cleverness'
u32 *tmp = c;
c = o;
o = tmp;
}
#endif
printf ("offsets after benchmark: ");
for (int i = 0; i < 8; i++) printf (" %u", offsets[i]);
printf ("\n-----\n");
}
I'm using gcc -O2 -mavx2 ... to build. Godbolt link.
Upvotes: 2
Views: 886
Reputation: 2126
Eliminating the upfront _mm256_permutevar8x32_epi32 (vpermd) seems to make a massive difference here. This is likely because of its large latency (8 cycles on Ryzen?) and the immediate dependency of all subsequent instructions upon it.
Instead of feeding in the base value upfront, I'm folding it in during the addition that carries the prefix sum between the 128-bit lanes.
__m256i avx2_calc_offsets_2 (const u32 base, const __m256i counts)
{
__m256i b, t, v;
v = counts;
// accumulate running totals within 128-bit sub-lanes
v = _mm256_add_epi32 (v, _mm256_slli_si256 (v, 4));
v = _mm256_add_epi32 (v, _mm256_slli_si256 (v, 8));
// extract highest value in right-hand lane and combine with base offset
t = _mm256_set1_epi32 (_mm256_extract_epi32 (v, 3));
b = _mm256_set1_epi32 (base);
t = _mm256_blend_epi32 (_mm256_add_epi32 (b, t), b, 0x0F);
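// t = { base, base, base, base | base+s3, base+s3, base+s3, base+s3 }  where s3 = counts[0]+counts[1]+counts[2]+counts[3]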
// combine with shifted running totals
v = _mm256_add_epi32 (_mm256_slli_si256 (v, 4), t);
return v;
}
Assembly comparison between the two versions:
avx2_calc_offsets:
vmovdqa ymm1, YMMWORD PTR .LC0[rip]
vpermd ymm0, ymm1, ymm0
vpinsrd xmm1, xmm0, edi, 0
vinserti128 ymm0, ymm0, xmm1, 0x0
vpslldq ymm1, ymm0, 4
vpaddd ymm0, ymm0, ymm1
vpslldq ymm1, ymm0, 8
vpaddd ymm0, ymm0, ymm1
vpsrldq xmm1, xmm0, 12
vpbroadcastd ymm1, xmm1
vpaddd ymm1, ymm1, ymm0
vpblendd ymm0, ymm1, ymm0, 15
ret
avx2_calc_offsets_2:
vpslldq ymm1, ymm0, 4
vmovd xmm2, edi
vpaddd ymm1, ymm1, ymm0
vpbroadcastd ymm2, xmm2
vpslldq ymm0, ymm1, 8
vpaddd ymm1, ymm1, ymm0
vpsrldq xmm0, xmm1, 12
vpslldq ymm1, ymm1, 4
vpbroadcastd ymm0, xmm0
vpaddd ymm0, ymm2, ymm0
vpblendd ymm0, ymm0, ymm2, 15
vpaddd ymm0, ymm0, ymm1
ret
Overall it's the same number of instructions, just less expensive in uops and latency, I suppose.
The benchmark using avx2_calc_offsets_2 now runs in 2.7 seconds, which is around 63% faster than the previous version.
Update 1: GCC's inlining of avx2_calc_offsets_2 into the benchmark loop further explains the increased performance. As Peter predicted, the vmovd / vpbroadcastd instructions corresponding to _mm256_set1_epi32 (base) are indeed hoisted out into a single load outside of the loop.
Loop assembly:
...
// loop setup
vmovdqa ymm2, YMMWORD PTR .LC5[rip] // hoisted load of broadcasted base
vmovdqa ymm0, YMMWORD PTR [rbp-176]
vmovdqa ymm1, YMMWORD PTR [rbp-144]
mov eax, 1000000000
jmp .L10
.L17: // loop body
vpslldq ymm1, ymm0, 4
vpaddd ymm0, ymm0, ymm1
vpslldq ymm1, ymm0, 8
vpaddd ymm0, ymm0, ymm1
vpsrldq xmm1, xmm0, 12
vpslldq ymm0, ymm0, 4
vpbroadcastd ymm1, xmm1
vpaddd ymm1, ymm1, ymm2
vpblendd ymm1, ymm1, ymm2, 15
.L10: // loop entry
vpaddd ymm0, ymm1, ymm0
sub rax, 1
jne .L17
...
.LC5: // broadcasted `base`
.long 900000000
.long 900000000
.long 900000000
.long 900000000
.long 900000000
.long 900000000
.long 900000000
.long 900000000
Update 2: Focusing on the inlined case, replacing the _mm256_blend_epi32 / vpblendd with an __m128i insertion into the high lane of a zeroed __m256i, then adding that to the final vector, yields further performance and code-size improvements (thanks Peter).
__m256i avx2_calc_offsets_3 (const u32 base, const __m256i counts)
{
const __m256i z = _mm256_setzero_si256 ();
const __m256i b = _mm256_set1_epi32 (base);
__m256i v, t;
__m128i lo;
v = counts;
// accumulate running totals within 128-bit sub-lanes
v = _mm256_add_epi32 (v, _mm256_slli_si256 (v, 4));
v = _mm256_add_epi32 (v, _mm256_slli_si256 (v, 8));
// capture the max total in low-lane and broadcast into high-lane
lo = _mm_shuffle_epi32 (_mm256_castsi256_si128 (v), _MM_SHUFFLE (3, 3, 3, 3));
t = _mm256_inserti128_si256 (z, lo, 1);
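// t = { 0, 0, 0, 0 | s3, s3, s3, s3 }  where s3 = counts[0]+counts[1]+counts[2]+counts[3]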
// shift totals, add base and low-lane max
v = _mm256_slli_si256 (v, 4);
v = _mm256_add_epi32 (v, b);
v = _mm256_add_epi32 (v, t);
return v;
}
The assembly for the inlined version in the loop now looks like:
// compiled with GCC version 10.3: gcc -O2 -mavx2 ...
// loop setup
vmovdqa ymm2, YMMWORD PTR .LC5[rip] // load broadcasted base
vmovdqa ymm0, YMMWORD PTR [rbp-176]
vmovdqa ymm1, YMMWORD PTR [rbp-144]
mov eax, 1000000000
vpxor xmm3, xmm3, xmm3
jmp .L12
.L20: // loop body
vpslldq ymm1, ymm0, 4
vpaddd ymm0, ymm0, ymm1
vpslldq ymm1, ymm0, 8
vpaddd ymm0, ymm0, ymm1
vpshufd xmm1, xmm0, 255
vpslldq ymm0, ymm0, 4
vinserti128 ymm1, ymm3, xmm1, 0x1
.L12: // loop entry
vpaddd ymm0, ymm0, ymm1
vpaddd ymm0, ymm0, ymm2
sub rax, 1
jne .L20
The loop body is down to only 9 vector instructions :).
There's an optimization bug in GCC when using -O3 where an extraneous vmovdqa ymm0, ymm1 is inserted at the end of the loop body, reducing the benchmark performance by a couple of percent (at least for GCC versions 11.x, 10.x, and 9.x).
Update 3: Another slight performance gain. If we add the base to the low lane's max total using an SSE/128-bit instruction before the 128-bit insertion, we shorten the critical path for v, allowing better use of the shuffle port.
__m256i avx2_calc_offsets_4 (const u32 base, const __m256i counts)
{
const __m256i b = _mm256_set1_epi32 (base);
__m256i v, t;
__m128i lo;
v = counts;
// accumulate running totals within 128-bit sub-lanes
v = _mm256_add_epi32 (v, _mm256_slli_si256 (v, 4));
v = _mm256_add_epi32 (v, _mm256_slli_si256 (v, 8));
// capture the max total in low-lane, broadcast into high-lane and add to base
lo = _mm_shuffle_epi32 (_mm256_castsi256_si128 (v), _MM_SHUFFLE (3, 3, 3, 3));
lo = _mm_add_epi32 (_mm256_castsi256_si128 (b), lo);
t = _mm256_inserti128_si256 (b, lo, 1);
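// t = { base, base, base, base | base+s3, base+s3, base+s3, base+s3 }  where s3 = counts[0]+counts[1]+counts[2]+counts[3]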
// shift totals, add base and low-lane max
v = _mm256_slli_si256 (v, 4);
v = _mm256_add_epi32 (v, t);
return v;
}
.L23: // loop body
vpslldq ymm1, ymm0, 4
vpaddd ymm0, ymm0, ymm1
vpslldq ymm1, ymm0, 8
vpaddd ymm0, ymm0, ymm1
vpshufd xmm2, xmm0, 255
vpslldq ymm1, ymm0, 4
.L14: // loop entry
vpaddd xmm0, xmm2, xmm3
vinserti128 ymm0, ymm4, xmm0, 0x1
vpaddd ymm0, ymm1, ymm0
sub rax, 1
jne .L23
This looks fairly optimal to my (non-expert) eye, at least for early AVX2 chips. Benchmark time is brought down to ~2.17 seconds.
Strangely, if I reduce the size of the source code by deleting one of the previous function definitions, GCC 10 and 11 go a bit haywire and insert 3 (!) additional vmovdqa instructions into the loop (Godbolt). The result is a slowdown of ~18% in my benchmark. GCC 9.x seems unaffected. I'm not sure what's going on here, but it seems like a pretty nasty bug in GCC's optimizer. I'll try to reduce it and file a bug.
The benchmark using avx2_calc_offsets_3 now runs at effectively the same speed as the scalar version, which is a win in my case since it removes the need to jump to scalar code for performance reasons.
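For reference, a minimal usage sketch of the final version as a drop-in for the scalar call (assuming the same u32 counts[8] / offsets[8] arrays as in the question's main):
__m256i vcounts = _mm256_loadu_si256 ((__m256i *) &counts[0]);
__m256i voffsets = avx2_calc_offsets_4 (base, vcounts);
// keep processing voffsets as a vector, or store it when the array form is needed
_mm256_storeu_si256 ((__m256i *) &offsets[0], voffsets);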
Upvotes: 3