I'm trying to optimize some code and I'm at a point where I have 4 __m256d vectors and I want to store the sum of each of them inside another __m256d. So basically result = [sum(a), sum(b), sum(c), sum(d)]. I know there is a way to do this using 2 hadds, a blend and a permute, but I realised that hadd is too expensive. So I was wondering if there is an intrinsic that allows doing this faster.
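For reference, this is a sketch of the 2-hadd + blend + permute version I mean (sum4 is just my name for it):
#include <immintrin.h>

// 2x hadd, 1 permute, 1 blend, 1 add
__m256d sum4(__m256d a, __m256d b, __m256d c, __m256d d)
{
    __m256d h01  = _mm256_hadd_pd(a, b);                   // [a0+a1, b0+b1, a2+a3, b2+b3]
    __m256d h23  = _mm256_hadd_pd(c, d);                   // [c0+c1, d0+d1, c2+c3, d2+d3]
    __m256d swap = _mm256_permute2f128_pd(h01, h23, 0x21); // [a2+a3, b2+b3, c0+c1, d0+d1]
    __m256d blnd = _mm256_blend_pd(h01, h23, 0xC);         // [a0+a1, b0+b1, c2+c3, d2+d3]
    return _mm256_add_pd(swap, blnd);                      // [sum(a), sum(b), sum(c), sum(d)]
}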
Upvotes: 2
Views: 1073
Three options:

1. full matrix transpose, then vertical adds
good: conceptually simple, using a generally useful algorithm (matrix transpose), portable code
bad: code size, latency, throughput

2. using vhaddpd efficiently
good: small code (good for Icache), good latency and throughput on Intel uArchs
bad: requires architecture-specific code, problematic on some uArchs

3. partial transpose, then vertical adds (per @PeterCordes' comment below)
good: good latency, good throughput
bad: not quite as small as the vhaddpd code, not as easy to understand as the full matrix transpose
Have your compiler optimize this for you. With gcc
vector extensions*, code to sum over a transposed matrix could look like this:
#include <stdint.h>
typedef uint64_t v4u64 __attribute__((vector_size(32)));
typedef double v4f64 __attribute__((vector_size(32)));
v4f64 dfoo(v4f64 sv0, v4f64 sv1, v4f64 sv2, v4f64 sv3)
{
v4f64 tv[4];
tv[0] = __builtin_shuffle(sv0, sv1, (v4u64){0,4,2,6}); //[00, 10, 02, 12]
tv[1] = __builtin_shuffle(sv0, sv1, (v4u64){1,5,3,7}); //[01, 11, 03, 13]
tv[2] = __builtin_shuffle(sv2, sv3, (v4u64){0,4,2,6}); //[20, 30, 22, 32]
tv[3] = __builtin_shuffle(sv2, sv3, (v4u64){1,5,3,7}); //[21, 31, 23, 33]
v4f64 fv[4];
fv[0] = __builtin_shuffle(tv[0], tv[2], (v4u64){0,1,4,5}); //[00, 10, 20, 30]
fv[1] = __builtin_shuffle(tv[0], tv[2], (v4u64){2,3,6,7}); //[02, 12, 22, 32]
fv[2] = __builtin_shuffle(tv[1], tv[3], (v4u64){0,1,4,5}); //[01, 11, 21, 31]
fv[3] = __builtin_shuffle(tv[1], tv[3], (v4u64){2,3,6,7}); //[03, 13, 23, 33]
return fv[0]+fv[1]+fv[2]+fv[3]; //[sum(sv0), sum(sv1), sum(sv2), sum(sv3)]
}
gcc-9.2.1 produces the following assembly:
dfoo:
vunpcklpd %ymm3, %ymm2, %ymm5
vunpcklpd %ymm1, %ymm0, %ymm4
vunpckhpd %ymm1, %ymm0, %ymm0
vinsertf128 $1, %xmm5, %ymm4, %ymm1
vperm2f128 $49, %ymm5, %ymm4, %ymm4
vunpckhpd %ymm3, %ymm2, %ymm2
vaddpd %ymm4, %ymm1, %ymm1
vinsertf128 $1, %xmm2, %ymm0, %ymm3
vperm2f128 $49, %ymm2, %ymm0, %ymm0
vaddpd %ymm3, %ymm1, %ymm1
vaddpd %ymm0, %ymm1, %ymm0
ret
Agner Fog's tables say:

vunpck[h/l]pd: 1 cycle latency, 1 per cycle throughput, 1 uOP port5.
vinsertf128: 3 cycles latency, 1 per cycle throughput, 1 uOP port5.
vperm2f128: 3 cycles latency, 1 per cycle throughput, 1 uOP port5.
vaddpd: 4 cycles latency, 2 per cycle throughput, 1 uOP port01.

In all, there are 4 unpacks, 2 inserts and 2 permutes (8 port5 uOPs) plus 3 adds (3 port01 uOPs). Throughput will bottleneck on port5. Latency is pretty bad, at ~18 cycles. Code size is about 60 bytes.
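A quick sanity check (a hypothetical harness appended to the snippet above; compile with e.g. gcc -O2 -mavx):
#include <stdio.h>

// Check dfoo() against the expected per-vector sums.
int main(void)
{
    v4f64 a = {1, 2, 3, 4}, b = {5, 6, 7, 8}, c = {9, 10, 11, 12}, d = {13, 14, 15, 16};
    v4f64 r = dfoo(a, b, c, d);
    printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]); // expected: 10 26 42 58
    return 0;
}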
Code (sensibly) using vhaddpd is not easy to get via gcc vector extensions, so it needs architecture-specific builtins:
v4f64 dfoo_hadd(v4f64 sv0, v4f64 sv1, v4f64 sv2, v4f64 sv3)
{
v4f64 hv[2];
hv[0] = __builtin_ia32_haddpd256(sv0, sv1); //[00+01, 10+11, 02+03, 12+13]
hv[1] = __builtin_ia32_haddpd256(sv2, sv3); //[20+21, 30+31, 22+23, 32+33]
v4f64 fv[2];
fv[0] = __builtin_shuffle(hv[0], hv[1], (v4u64){0, 1, 4, 5}); //[00+01, 10+11, 20+21, 30+31]
fv[1] = __builtin_shuffle(hv[0], hv[1], (v4u64){2, 3, 6, 7}); //[02+03, 12+13, 22+23, 32+33]
return fv[0] + fv[1]; //[00+01+02+03, 10+11+12+13, 20+21+22+23, 30+31+32+33]
}
This generates the following assembly:
dfoo_hadd:
vhaddpd %ymm3, %ymm2, %ymm2
vhaddpd %ymm1, %ymm0, %ymm0
vinsertf128 $1, %xmm2, %ymm0, %ymm1
vperm2f128 $49, %ymm2, %ymm0, %ymm0
vaddpd %ymm0, %ymm1, %ymm0
ret
According to Agner Fog's instruction tables,

vhaddpd: 6 cycles latency, 0.5 per cycle throughput, 3 uOPs (2 port5 + 1 port01).

In all, there are 2 vhaddpds, 1 insert and 1 permute (6 port5 uOPs) plus the remaining add uOPs (3 port01 uOPs). Throughput is also limited by port5, but with fewer port5 uOPs this code has better throughput than the transposing code. Latency should be ~16 cycles, also faster than the transposing code. Code size is about 25 bytes.
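If you cannot use the gcc builtin, an equivalent sketch with Intel intrinsics (the function name is mine) could look like this:
#include <immintrin.h>

__m256d dfoo_hadd_intrin(__m256d sv0, __m256d sv1, __m256d sv2, __m256d sv3)
{
    __m256d h0 = _mm256_hadd_pd(sv0, sv1); // [00+01, 10+11, 02+03, 12+13]
    __m256d h1 = _mm256_hadd_pd(sv2, sv3); // [20+21, 30+31, 22+23, 32+33]
    // merge the low 128-bit lanes of h0 and h1, and the high lanes
    __m256d lo = _mm256_insertf128_pd(h0, _mm256_castpd256_pd128(h1), 1);
    __m256d hi = _mm256_permute2f128_pd(h0, h1, 0x31);
    return _mm256_add_pd(lo, hi);
}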
Implementing @PeterCordes' comment:
v4f64 dfoo_PC(v4f64 sv0, v4f64 sv1, v4f64 sv2, v4f64 sv3)
{
v4f64 tv[4];
v4f64 av[2];
tv[0] = __builtin_shuffle(sv0, sv1, (v4u64){0,4,2,6});//[00, 10, 02, 12]
tv[1] = __builtin_shuffle(sv0, sv1, (v4u64){1,5,3,7});//[01, 11, 03, 13]
av[0] = tv[0] + tv[1];//[00+01, 10+11, 02+03, 12+13]
tv[2] = __builtin_shuffle(sv2, sv3, (v4u64){0,4,2,6});//[20, 30, 22, 32]
tv[3] = __builtin_shuffle(sv2, sv3, (v4u64){1,5,3,7});//[21, 31, 23, 33]
av[1] = tv[2] + tv[3];//[20+21, 30+31, 22+23, 32+33]
v4f64 fv[2];
fv[0] = __builtin_shuffle(av[0], av[1], (v4u64){0,1,4,5});//[00+01, 10+11, 20+21, 30+31]
fv[1] = __builtin_shuffle(av[0], av[1], (v4u64){2,3,6,7});//[02+03, 12+13, 22+23, 32+33]
return fv[0]+fv[1];//[00+01+02+03, 10+11+12+13, 20+21+22+23, 30+31+32+33]
}
This generates:
dfoo_PC:
vunpcklpd %ymm1, %ymm0, %ymm4
vunpckhpd %ymm1, %ymm0, %ymm1
vunpcklpd %ymm3, %ymm2, %ymm0
vunpckhpd %ymm3, %ymm2, %ymm2
vaddpd %ymm1, %ymm4, %ymm1
vaddpd %ymm2, %ymm0, %ymm2
vinsertf128 $1, %xmm2, %ymm1, %ymm0
vperm2f128 $49, %ymm2, %ymm1, %ymm1
vaddpd %ymm1, %ymm0, %ymm0
ret
In all, there are 4 unpacks, 1 insert and 1 permute (6 port5 uOPs) plus 3 adds (3 port01 uOPs). This gets to the same number of port5 uOPs as the hadd code. The code still bottlenecks on port5; latency is ~16 cycles. Code size is about 41 bytes.
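For reference, the same idea as a sketch with Intel intrinsics (again, the name is mine):
__m256d dfoo_PC_intrin(__m256d sv0, __m256d sv1, __m256d sv2, __m256d sv3)
{
    // partial transpose within 128-bit lanes, then add
    __m256d a01 = _mm256_add_pd(_mm256_unpacklo_pd(sv0, sv1),
                                _mm256_unpackhi_pd(sv0, sv1)); // [00+01, 10+11, 02+03, 12+13]
    __m256d a23 = _mm256_add_pd(_mm256_unpacklo_pd(sv2, sv3),
                                _mm256_unpackhi_pd(sv2, sv3)); // [20+21, 30+31, 22+23, 32+33]
    // merge low and high 128-bit lanes, then add
    __m256d lo = _mm256_insertf128_pd(a01, _mm256_castpd256_pd128(a23), 1);
    __m256d hi = _mm256_permute2f128_pd(a01, a23, 0x31);
    return _mm256_add_pd(lo, hi);
}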
If you wanted to increase throughput, you would have to shift work away from port5. Unfortunately, almost all permute/insert/shuffle instructions require port5, and lane-crossing instructions (which are required here) have a minimum of 3 cycles latency. One interesting instruction that almost helps is vblendpd, which has 3 per cycle throughput, 1 cycle latency, and can execute on port015, but using it to replace one of the permutes/inserts/shuffles would require a 64-bit shift of a 128-bit lane of a vector, which is implemented by vpsrldq/vpslldq, which - you guessed it - takes a port5 uOP (so this would help with vectors of 32-bit float, because vpsllq/vpsrlq do not require port5). There is no free lunch here.
* gcc vector extensions quick description:
The code uses gcc vector extensions, which allow using basic operators (+, -, *, /, ==, >, <, >>, <<, etc.) on vectors, operating element-wise. They also include a few __builtin_* functions, in particular __builtin_shuffle(), which has a 3-operand form: the first two operands are vectors of the same type T and length N, which are (logically) concatenated into a double-length (2N) vector of type T; the third operand is a vector of an integer type with the same element width and length N as the original vectors. The result is a vector of type T and length N, with elements selected from the concatenated vector by the indices in the integer vector.
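A minimal example of the 3-operand form:
#include <stdint.h>
typedef double   v4f64 __attribute__((vector_size(32)));
typedef uint64_t v4u64 __attribute__((vector_size(32)));

// Indices 0..3 select from a, indices 4..7 select from b.
v4f64 interleave_lo(v4f64 a, v4f64 b)
{
    return __builtin_shuffle(a, b, (v4u64){0, 4, 1, 5}); // [a0, b0, a1, b1]
}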
Originally, my answer was about uint64_t, kept here for context:
#include <stdint.h>
typedef uint64_t v4u64 __attribute__((vector_size(32)));
v4u64 foo(v4u64 sv0, v4u64 sv1, v4u64 sv2, v4u64 sv3)
{
v4u64 tv[4];
tv[0] = __builtin_shuffle(sv0, sv1, (v4u64){0,4,2,6});
tv[1] = __builtin_shuffle(sv0, sv1, (v4u64){1,5,3,7});
tv[2] = __builtin_shuffle(sv2, sv3, (v4u64){0,4,2,6});
tv[3] = __builtin_shuffle(sv2, sv3, (v4u64){1,5,3,7});
v4u64 fv[4];
fv[0] = __builtin_shuffle(tv[0], tv[2], (v4u64){0,1,4,5});
fv[1] = __builtin_shuffle(tv[0], tv[2], (v4u64){2,3,6,7});
fv[2] = __builtin_shuffle(tv[1], tv[3], (v4u64){0,1,4,5});
fv[3] = __builtin_shuffle(tv[1], tv[3], (v4u64){2,3,6,7});
return fv[0]+fv[1]+fv[2]+fv[3];
}
The translation generated by gcc-9.2.1
on skylake-avx2 could look like this:
foo:
vpunpcklqdq %ymm3, %ymm2, %ymm5
vpunpcklqdq %ymm1, %ymm0, %ymm4
vpunpckhqdq %ymm3, %ymm2, %ymm2
vpunpckhqdq %ymm1, %ymm0, %ymm0
vperm2i128 $32, %ymm2, %ymm0, %ymm3
vperm2i128 $32, %ymm5, %ymm4, %ymm1
vperm2i128 $49, %ymm2, %ymm0, %ymm0
vperm2i128 $49, %ymm5, %ymm4, %ymm4
vpaddq %ymm4, %ymm1, %ymm1
vpaddq %ymm0, %ymm3, %ymm0
vpaddq %ymm0, %ymm1, %ymm0
ret
Note that the assembly has an almost line-for-line correspondence to the gcc vector extensions source.
According to Agner Fog's instruction tables for Skylake,

vpunpck[h/l]qdq: 1 cycle latency, 1 per cycle throughput, port5.
vperm2i128: 3 cycles latency, 1 per cycle throughput, port5.
vpaddq: 1 cycle latency, 3 per cycle throughput, port015.

So the transpose takes 10 cycles (4 for the unpacks, plus 4 cycles throughput and 2 cycles latency for the permutes). Of the three adds, only two can execute in parallel, so the adds take 2 cycles, for 12 cycles total.
Upvotes: 2