Reputation: 99
I am trying to speed up the calculation of the average of 4D vectors stored in an array. Here is my code:
#include <sys/time.h>
#include <sys/param.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <xmmintrin.h>
typedef float dot[4];
#define N 1000000
double gettime ()
{
    struct timeval tv;
    gettimeofday (&tv, 0);
    return (double)tv.tv_sec + (0.000001 * (double)tv.tv_usec);
}
void calc_avg1 (dot res, const dot array[], int n)
{
    int i, j;
    memset (res, 0, sizeof (dot));
    for (i = 0; i < n; i++)
    {
        for (j = 0; j < 4; j++) res[j] += array[i][j];
    }
    for (j = 0; j < 4; j++) res[j] /= n;
}
void calc_avg2 (dot res, const dot array[], int n)
{
    int i;
    __v4sf r = _mm_set1_ps (0.0);
    for (i = 0; i < n; i++) r += _mm_load_ps (array[i]);
    r /= _mm_set1_ps ((float)n);
    _mm_store_ps (res, r);
}
int main ()
{
    void *space = malloc (N*sizeof(dot)+15);
    dot *array = (dot*)(((unsigned long)space+15) & ~(unsigned long)15);
    dot avg __attribute__((aligned(16)));
    int i;
    double time;

    for (i = 0; i < N; i++)
    {
        array[i][0] = 1.0*random();
        array[i][1] = 1.0*random();
        array[i][2] = 1.0*random();
    }

    time = gettime();
    calc_avg1 (avg, array, N);
    time = gettime() - time;
    printf ("%f\n%f %f %f\n", time, avg[0], avg[1], avg[2]);

    time = gettime();
    calc_avg2 (avg, array, N);
    time = gettime() - time;
    printf ("%f\n%f %f %f\n", time, avg[0], avg[1], avg[2]);
    return 0;
}
As you can see, calc_avg1 uses naive loops over the 4 components and calc_avg2 replaces them with SSE instructions. I compile this code with clang 3.4:
cc -O2 -o test test.c
Here is the disassembly of the calc_avgX functions:
0000000000400860 <calc_avg1>:
400860: 55 push %rbp
400861: 48 89 e5 mov %rsp,%rbp
400864: 85 d2 test %edx,%edx
400866: 0f 57 c0 xorps %xmm0,%xmm0
400869: 0f 11 07 movups %xmm0,(%rdi)
40086c: 7e 42 jle 4008b0 <calc_avg1+0x50>
40086e: 48 83 c6 0c add $0xc,%rsi
400872: 0f 57 c0 xorps %xmm0,%xmm0
400875: 89 d0 mov %edx,%eax
400877: 0f 57 c9 xorps %xmm1,%xmm1
40087a: 0f 57 d2 xorps %xmm2,%xmm2
40087d: 0f 57 db xorps %xmm3,%xmm3
400880: f3 0f 58 5e f4 addss -0xc(%rsi),%xmm3
400885: f3 0f 11 1f movss %xmm3,(%rdi)
400889: f3 0f 58 56 f8 addss -0x8(%rsi),%xmm2
40088e: f3 0f 11 57 04 movss %xmm2,0x4(%rdi)
400893: f3 0f 58 4e fc addss -0x4(%rsi),%xmm1
400898: f3 0f 11 4f 08 movss %xmm1,0x8(%rdi)
40089d: f3 0f 58 06 addss (%rsi),%xmm0
4008a1: f3 0f 11 47 0c movss %xmm0,0xc(%rdi)
4008a6: 48 83 c6 10 add $0x10,%rsi
4008aa: ff c8 dec %eax
4008ac: 75 d2 jne 400880 <calc_avg1+0x20>
4008ae: eb 0c jmp 4008bc <calc_avg1+0x5c>
4008b0: 0f 57 c0 xorps %xmm0,%xmm0
4008b3: 0f 57 c9 xorps %xmm1,%xmm1
4008b6: 0f 57 d2 xorps %xmm2,%xmm2
4008b9: 0f 57 db xorps %xmm3,%xmm3
4008bc: f3 0f 2a e2 cvtsi2ss %edx,%xmm4
4008c0: f3 0f 5e dc divss %xmm4,%xmm3
4008c4: f3 0f 11 1f movss %xmm3,(%rdi)
4008c8: f3 0f 5e d4 divss %xmm4,%xmm2
4008cc: f3 0f 11 57 04 movss %xmm2,0x4(%rdi)
4008d1: f3 0f 5e cc divss %xmm4,%xmm1
4008d5: f3 0f 11 4f 08 movss %xmm1,0x8(%rdi)
4008da: f3 0f 5e c4 divss %xmm4,%xmm0
4008de: f3 0f 11 47 0c movss %xmm0,0xc(%rdi)
4008e3: 5d pop %rbp
4008e4: c3 retq
4008e5: 66 66 2e 0f 1f 84 00 nopw %cs:0x0(%rax,%rax,1)
4008ec: 00 00 00 00
00000000004008f0 <calc_avg2>:
4008f0: 55 push %rbp
4008f1: 48 89 e5 mov %rsp,%rbp
4008f4: 85 d2 test %edx,%edx
4008f6: 0f 57 c0 xorps %xmm0,%xmm0
4008f9: 7e 10 jle 40090b <calc_avg2+0x1b>
4008fb: 89 d0 mov %edx,%eax
4008fd: 0f 1f 00 nopl (%rax)
400900: 0f 58 06 addps (%rsi),%xmm0
400903: 48 83 c6 10 add $0x10,%rsi
400907: ff c8 dec %eax
400909: 75 f5 jne 400900 <calc_avg2+0x10>
40090b: 66 0f 6e ca movd %edx,%xmm1
40090f: 66 0f 70 c9 00 pshufd $0x0,%xmm1,%xmm1
400914: 0f 5b c9 cvtdq2ps %xmm1,%xmm1
400917: 0f 5e c1 divps %xmm1,%xmm0
40091a: 0f 29 07 movaps %xmm0,(%rdi)
40091d: 5d pop %rbp
40091e: c3 retq
40091f: 90 nop
And here is the result:
> ./test
0.004287
1073864320.000000 1074018048.000000 1073044224.000000
0.003661
1073864320.000000 1074018048.000000 1073044224.000000
So the SSE version is 1.17 times faster. But when I try to do a seemingly identical job, namely calculating the average of single-precision scalars in an array (as described, for example, in SSE reduction of float vector), the SSE version runs 3.32 times faster. Here is the code of the calc_avgX functions:
float calc_avg1 (const float array[], int n)
{
    int i;
    float avg = 0;
    for (i = 0; i < n; i++) avg += array[i];
    return avg / n;
}
float calc_avg3 (const float array[], int n)
{
    int i;
    __v4sf r = _mm_set1_ps (0.0);
    for (i = 0; i < n; i += 4) r += _mm_load_ps (&(array[i]));
    r = _mm_hadd_ps (r, r);
    r = _mm_hadd_ps (r, r);
    return r[0] / n;
}
So my question is this: why do I benefit from SSE so much in the second example (calculating the average of single float scalars) but not in the first (calculating the average of 4D vectors)? To me, these jobs look almost the same. What is the right way to speed up the calculation in the first example, if I am doing it wrong?
UPD: In case it's relevant, here is the disassembly of calc_avg3 from the second example, where the average of scalars is calculated (also compiled with clang 3.4 -O2).
00000000004008d0 <calc_avg3>:
4008d0: 55 push %rbp
4008d1: 48 89 e5 mov %rsp,%rbp
4008d4: 31 c0 xor %eax,%eax
4008d6: 85 f6 test %esi,%esi
4008d8: 0f 57 c0 xorps %xmm0,%xmm0
4008db: 7e 0f jle 4008ec <calc_avg3+0x1c>
4008dd: 0f 1f 00 nopl (%rax)
4008e0: 0f 58 04 87 addps (%rdi,%rax,4),%xmm0
4008e4: 48 83 c0 04 add $0x4,%rax
4008e8: 39 f0 cmp %esi,%eax
4008ea: 7c f4 jl 4008e0 <calc_avg3+0x10>
4008ec: 66 0f 70 c8 01 pshufd $0x1,%xmm0,%xmm1
4008f1: f3 0f 58 c8 addss %xmm0,%xmm1
4008f5: 66 0f 70 d0 03 pshufd $0x3,%xmm0,%xmm2
4008fa: 0f 12 c0 movhlps %xmm0,%xmm0
4008fd: f3 0f 58 c1 addss %xmm1,%xmm0
400901: f3 0f 58 c2 addss %xmm2,%xmm0
400905: 0f 57 c9 xorps %xmm1,%xmm1
400908: f3 0f 2a ce cvtsi2ss %esi,%xmm1
40090c: f3 0f 5e c1 divss %xmm1,%xmm0
400910: 5d pop %rbp
400911: c3 retq
400912: 66 66 66 66 66 2e 0f nopw %cs:0x0(%rax,%rax,1)
400919: 1f 84 00 00 00 00 00
Upvotes: 3
Views: 617
Reputation: 365842
Sorry this answer got a little long and rambling. I ran some benchmarks, but I didn't go back and edit the earlier parts after thinking of something else to try.
Your working set is 15.25 MiB (16 MB). Normally, to benchmark a routine like this, you'd average a smaller buffer many times, so it fits in cache. You don't see much difference between the slow version and the fast version because the difference is hidden by the memory bottleneck.
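A minimal sketch of that kind of harness, reusing the dot typedef and gettime() from the question (SMALL_N and REPEAT are illustrative values I picked so the buffer fits in L2, not numbers from the original post):

/* Sketch: time many repetitions over a buffer small enough to stay in L2,
 * so the FP/SIMD work is measured instead of main-memory bandwidth.
 * Assumes the dot typedef and gettime() from the question. */
enum { SMALL_N = 10000, REPEAT = 4000 };   /* 10000 * 16 B = 160 KB */

double bench_avg (void (*calc)(dot, const dot[], int),
                  dot res, const dot array[])
{
    double t = gettime ();
    for (int r = 0; r < REPEAT; r++)
        calc (res, array, SMALL_N);        /* reuse the same hot buffer */
    return gettime () - t;
}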
calc_avg1 doesn't auto-vectorize at all (note the addss: ss means scalar, single-precision, as opposed to addps, packed single-precision). I think it can't auto-vectorize even when inlined into main because it can't be sure there aren't NaNs in the 4th vector position, which would cause an FP exception that the scalar code wouldn't have. I tried compiling it for Sandybridge with gcc 4.9.2 -O3 -march=native -ffast-math, and with clang-3.5, but no luck with either.

Even so, the version inlined into main only runs slightly slower, because memory is the bottleneck: 32-bit loads can nearly keep up with 128b loads when hitting main memory. (The non-inlined version will be bad, though: every += result is stored to the res array, because the loop accumulates directly into memory that might have other references to it, so it has to make every operation visible with a store. This is the version you posted the disassembly for, BTW. Sorting out which part of main was which was assisted by compiling with -S -fverbose-asm.)
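To see that aliasing effect in isolation, a variant that accumulates into a local temporary (my sketch, not code from the question) lets the compiler keep the running sums in registers and write res only once:

/* Sketch: a local accumulator whose address doesn't escape, so the compiler
 * isn't forced to store the running sums to res on every iteration. */
void calc_avg1_local (dot res, const dot array[], int n)
{
    float sum[4] = { 0, 0, 0, 0 };
    for (int i = 0; i < n; i++)
        for (int j = 0; j < 4; j++)
            sum[j] += array[i][j];
    for (int j = 0; j < 4; j++)
        res[j] = sum[j] / n;
}

Alternatively, declaring the pointers with restrict should let the compiler make the same no-aliasing assumption for the original version.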
Somewhat disappointingly, clang and gcc can't auto-vectorize the __v4sf version from 4-wide SSE to 8-wide AVX.

Scratch that: after wrapping a for (int i=0; i<4000; i++) around the calls to calc_avgX and reducing N to 10k, gcc -O3 turns the inner loop of avg1 into:
400690: c5 f8 10 08 vmovups (%rax),%xmm1
400694: 48 83 c0 20 add $0x20,%rax
400698: c4 e3 75 18 48 f0 01 vinsertf128 $0x1,-0x10(%rax),%ymm1,%ymm1
40069f: c5 fc 58 c1 vaddps %ymm1,%ymm0,%ymm0
4006a3: 48 39 d8 cmp %rbx,%rax
4006a6: 75 e8 jne 400690 <main+0xe0>
$ (get CPU to max-turbo frequency) && time ./a.out
0.016515
1071570752.000000 1066917696.000000 1073897344.000000
0.032875
1071570944.000000 1066916416.000000 1073895680.000000
This is bizarre; I have no idea why it doesn't just use 32B loads. It does use 32B vaddps, which is the bottleneck when working with a data set that fits in L2 cache. IDK why it managed to auto-vectorize the inner loop when it was inside another loop. Note that this only applies to the version inlined into main; the callable version is still scalar-only. Also note that only gcc managed this; clang 3.5 didn't. Maybe gcc knew that it was going to use malloc in a way that returned a zeroed buffer (so it didn't have to worry about NaNs in the 4th element)?
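One cheap way to rule that out (my tweak to the question's init loop, not something either compiler requires) is to give the unused 4th lane a defined value:

/* Sketch: initialize the 4th component so a vectorized sum never touches
 * uninitialized heap garbage (which could be a NaN/Inf bit pattern). */
for (i = 0; i < N; i++)
{
    array[i][0] = 1.0*random();
    array[i][1] = 1.0*random();
    array[i][2] = 1.0*random();
    array[i][3] = 0.0f;           /* was left uninitialized in the original */
}

Allocating the buffer with calloc instead of malloc would achieve the same thing up front.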
I'm also surprised that clang's non-vectorized avg1 is not slower when everything fits in cache. N=10000, repeat-count = 40k, on a 3.3GHz SNB i5 2500k (max turbo = 3.8GHz):
avg1: 0.350422s: clang -O3 -march=native (not vectorized. loop of 6 scalar addss with memory operands)
avg2: 0.320173s: clang -O3 -march=native
avg1: 0.497040s: clang -O3 -march=native -ffast-math (haven't looked at asm to see what happened)
avg1: 0.160374s: gcc -O3 -march=native (256b addps, with 2 128b loads)
avg2: 0.321028s: gcc -O3 -march=native (128b addps with a memory operand)
avg2: ~0.16: clang, unrolled with 2 dependency chains to hide latency (see below).
avg2: ~0.08: unrolled with 4 dep chains
avg2: ~0.04: in theory unrolled-by-4 with 256b AVX. I didn't try unrolling the one gcc auto-vectorized with 256b addps
So the big surprise is that the scalar-only clang avg1 code keeps up with avg2. Perhaps the loop-carried dependency chain is the bigger bottleneck?

perf shows 1.47 insns per cycle for clang's non-vectorized avg1, which is probably saturating the FP add unit on port 1. (Most of the loop instructions are adds.)

However, avg2, using 128b addps with a memory operand, is only getting 0.58 insns per cycle. Reducing the array size by another factor of 10, to N=1000, gets 0.60 insns per cycle, probably because of more time spent in prologue/epilogue. So there's a serious issue with the loop-carried dependency chain, I think. clang unrolls the loop by 4, but only uses a single accumulator. The loop has 7 instructions, which decode to 10 uops. (Each vaddps is 2, since it's used with a memory operand with a 2-register addressing mode, preventing micro-fusion; the cmp and jne macro-fuse.) http://www.brendangregg.com/perf.html says the perf event for UOPS_DISPATCHED.CORE is r2b1, so:
$ perf stat -d -e cycles,instructions,r2b1 ./a.out
0.031793
1053298112.000000 1052673664.000000 1116960256.000000
Performance counter stats for './a.out':
118,453,541 cycles
71,181,299 instructions # 0.60 insns per cycle
102,025,443 r2b1 # this is uops, but perf doesn't have a nice name for it
40,256,019 L1-dcache-loads
21,254 L1-dcache-load-misses # 0.05% of all L1-dcache hits
9,588 LLC-loads
0 LLC-load-misses:HG # 0.00% of all LL-cache hits
0.032276233 seconds time elapsed
This confirms my 7:10 instructions-to-uops analysis. That's actually not relevant to the performance problem here: the loop is running way slower than the 4-uops-per-cycle upper limit. Changing the inner loop to get two separate dependency chains going doubles the throughput (60M cycles instead of 117M, but 81M insns instead of 71M):
__v4sf r0 = _mm_set1_ps (0.0), r1 = _mm_set1_ps (0.0);
for (i = 0; i + 1 < n; i += 2)      /* pairs of vectors; n is even here */
{
    r0 += _mm_load_ps (array[i]);
    r1 += _mm_load_ps (array[i+1]);
}
r0 += r1;                           /* then divide and store as before */
Unrolling by 4 (with 4 accumulators that you merge at the end of the loop) doubles the performance again (down to 42M cycles, 81M insns, 112M uops). The inner loop has 4x vaddps -0x30(%rcx),%xmm4,%xmm4 (and similar), 2x add, cmp, jl. This form of vaddps should micro-fuse, but I'm still seeing a lot more uops than instructions, so I guess r2b1 counts unfused-domain uops. (Linux perf doesn't have good docs for platform-specific HW events.) Cranking up N again, to make sure it's the innermost loop that completely dominates all the counts, I see a uop:insn ratio of 1.39, which matches well with 8 insns, 11 uops (1.375) (counting vaddps as 2, but counting cmp + jl as one).

I found http://www.bnikolic.co.uk/blog/hpc-prof-events.html, which has a full list of supported perf events, including their codes for Sandybridge (and instructions on how to dump the table for any other CPU). Look for the Code: line in each block; you need a umask byte, and then the code, as an arg to perf.
# a.out does only avg2, as an unrolled-by-4 version.
$ perf stat -d -e cycles,instructions,r14a1,r2b1,r10e,r2c2,r1c2 ./a.out
0.011331
1053298752.000000 1052674496.000000 1116959488.000000
Performance counter stats for './a.out':
42,250,312 cycles [34.11%]
56,103,429 instructions # 1.33 insns per cycle
20,864,416 r14a1 # UOPS_DISPATCHED_PORT: 0x14=port2&3 loads
111,943,380 r2b1 # UOPS_DISPATCHED: (2->umask 00 -> this core, any thread).
72,208,772 r10e # UOPS_ISSUED: fused-domain
71,422,907 r2c2 # UOPS_RETIRED: retirement slots used (fused-domain)
111,597,049 r1c2 # UOPS_RETIRED: ALL (unfused-domain)
0 L1-dcache-loads
18,470 L1-dcache-load-misses # 0.00% of all L1-dcache hits
5,717 LLC-loads [66.05%]
0 LLC-load-misses:HG # 0.00% of all LL-cache hits
0.011920301 seconds time elapsed
So yeah, it looks like this can count fused-domain and unfused-domain uops!
Unrolling by 8 doesn't help at all: still 42M cycles (but down to 61M insns and 97M uops, thanks to less loop overhead). Neat: clang uses sub $-128, %rsi instead of add, because -128 fits in an imm8 but +128 doesn't. So I guess unrolling by 4 is enough to saturate the FP add port.
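For reference, an unrolled-by-4 calc_avg2 with 4 accumulators might look like the sketch below. This is my reconstruction of the idea, not the exact code behind the numbers above, and it assumes n is a multiple of 4 (true for the sizes used here):

/* Sketch: 4 independent accumulators hide the addps latency (3 cycles on SNB).
 * Assumes n is a multiple of 4. */
void calc_avg2_unroll4 (dot res, const dot array[], int n)
{
    __v4sf r0 = _mm_set1_ps (0.0), r1 = _mm_set1_ps (0.0);
    __v4sf r2 = _mm_set1_ps (0.0), r3 = _mm_set1_ps (0.0);
    int i;
    for (i = 0; i < n; i += 4)
    {
        r0 += _mm_load_ps (array[i]);
        r1 += _mm_load_ps (array[i+1]);
        r2 += _mm_load_ps (array[i+2]);
        r3 += _mm_load_ps (array[i+3]);
    }
    r0 = (r0 + r1) + (r2 + r3);     /* merge the accumulators */
    r0 /= _mm_set1_ps ((float)n);
    _mm_store_ps (res, r0);
}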
As for your second set of avg functions, the ones that return a single float rather than a vector: clang didn't auto-vectorize the first one, but gcc does. It emits a giant prologue and epilogue to do scalar sums until it gets to an aligned address, then does 32B AVX vaddps in a small loop. You say you found a much bigger speed difference with them, but were you maybe testing with a smaller buffer? That would account for seeing a big speedup for vector code vs. non-vector.
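For completeness, a hand-written 8-wide version of the scalar-array average could look like this sketch (mine, not the code gcc generated). It assumes AVX support (compile with -mavx or -march=native), a 32-byte-aligned array, and n being a multiple of 8; gcc's auto-vectorized version handles misalignment and leftovers with the scalar prologue/epilogue instead:

#include <immintrin.h>

/* Sketch: 256-bit AVX average of a float array.  Assumes 32-byte alignment
 * and n % 8 == 0. */
float calc_avg3_avx (const float array[], int n)
{
    __m256 r = _mm256_setzero_ps ();
    for (int i = 0; i < n; i += 8)
        r = _mm256_add_ps (r, _mm256_load_ps (&array[i]));
    __m128 lo = _mm256_castps256_ps128 (r);
    __m128 hi = _mm256_extractf128_ps (r, 1);
    __m128 s  = _mm_add_ps (lo, hi);          /* 8 -> 4 partial sums */
    s = _mm_hadd_ps (s, s);                   /* 4 -> 2 */
    s = _mm_hadd_ps (s, s);                   /* 2 -> 1 */
    return _mm_cvtss_f32 (s) / n;
}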
Upvotes: 1