Reputation: 30645
This question is a follow-up to Accessing three static arrays is quicker than one static array containing 3x data?, but I had too many updates, so it made sense to start a new one.
I have 700 "items" and each item has three attributes- a
, b
and e
. I loop through every item and use the three atrributes to calculate values for two "global" attributes called c
and d
.
I used two techniques to implement this:
1) Three 700-element arrays, one array for each of the three attributes:
item0.a = array1[0]
item0.b = array2[0]
item0.e = array3[0]
2) One 2100-element array, with the three attributes of each item stored consecutively:
item0.a = array[offset + 0]
item0.b = array[offset + 1]
item0.e = array[offset + 2]
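(Technique #2, incidentally, is the same memory layout you would get from an array of structs; this hypothetical Item struct is just another way of writing the same 2100 ints:)

struct Item { unsigned int a, b, e; }; //12 bytes per item, no padding
Item items[700]; //same 8400 bytes; items[i].a sits where array[3*i + 0] does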
Now the three item attributes a, b and e are used together within the loop, so it would make sense that storing them in one array should perform better than the three-array technique.
This is because in technique #1 the three attributes of a given item are located 700 x 4 bytes apart in memory. I would therefore have to fetch three different cache lines per item.
However, with technique #2 the three attributes are adjacent in memory, so one cache line gives me all three attributes of an item. In fact, because a cache line is 64 bytes and I am using 4-byte integers, each cache line load brings in 16 integers, which means I can process 16/3 ≈ 5 items per cache line.
In other words, technique #1 should require at least 3 cache line loads to process 5 items, whereas technique #2 should require only one. I was expecting technique #2 to be a lot faster.
Results (Win 7 64, MSVC 2012 with desktop Ivy Bridge CPU):
Technique #1 C++:
#include <intrin.h> //for __rdtscp (MSVC)
#include <iostream>

unsigned int x;
unsigned int y;
double c = 0;
double d = 0;
bool data_for_all_items = true;
unsigned long long start = 0;
unsigned long long finish = 0;
unsigned int array1[700];
unsigned int array2[700];
unsigned int array3[700];

//I have left out code for simplicity. You can assume by now the arrays are populated.

start = __rdtscp(&x);
for(int i = 0; i < 700; i++){
    unsigned int a = array1[i]; //Array 1
    unsigned int b = array2[i]; //Array 2
    data_for_all_items = data_for_all_items & (a != -1 & b != -1);
    unsigned int e = array3[i]; //Array 3
    c += (a * e);
    d += (b * e);
}
finish = __rdtscp(&y);

if(data_for_all_items){
    std::cout << finish - start << std::endl;
    //This line prevents the compiler from optimizing the whole loop away
    std::cout << c << d << std::endl;
}
Assembly for Technique #1:
unsigned int x;
unsigned int y;
start = __rdtscp(&x);
rdtscp
shl rdx,20h
lea r8,[this]
for(int i=0; i < 700; i++){
sub rdi,rsi
or rax,rdx
lea r10,[rsi+4]
mov ebx,46h
mov dword ptr [r8],ecx
mov r11,rax
sub r9,rsi
unsigned int a = array1[i];
unsigned int b = array2[i];
all_instr_found = all_instr_found & (a != -1 & b != -1);
cmp dword ptr [r10-4],0FFFFFFFFh
xorps xmm0,xmm0
setne cl
cmp dword ptr [r10+rdi-4],0FFFFFFFFh
setne al
and cl,al
and bpl,cl
unsigned int e = array3[i];
mov ecx,dword ptr [r10+r9-4]
c += (a * e);
mov eax,ecx
d += (b * e);
imul ecx,dword ptr [r10-4]
imul eax,dword ptr [r10+rdi-4]
cmp dword ptr [r10],0FFFFFFFFh
cvtsi2sd xmm0,rax
mov eax,ecx
setne cl
cmp dword ptr [rdi+r10],0FFFFFFFFh
addsd xmm6,xmm0
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
setne al
and cl,al
and bpl,cl
mov ecx,dword ptr [r9+r10]
mov eax,ecx
addsd xmm7,xmm0
imul ecx,dword ptr [r10]
xorps xmm0,xmm0
imul eax,dword ptr [rdi+r10]
cmp dword ptr [r10+4],0FFFFFFFFh
cvtsi2sd xmm0,rax
mov eax,ecx
setne cl
cmp dword ptr [rdi+r10+4],0FFFFFFFFh
addsd xmm6,xmm0
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
setne al
and cl,al
and bpl,cl
mov ecx,dword ptr [r9+r10+4]
mov eax,ecx
addsd xmm7,xmm0
imul ecx,dword ptr [r10+4]
xorps xmm0,xmm0
imul eax,dword ptr [rdi+r10+4]
cmp dword ptr [r10+8],0FFFFFFFFh
cvtsi2sd xmm0,rax
mov eax,ecx
setne cl
cmp dword ptr [rdi+r10+8],0FFFFFFFFh
addsd xmm6,xmm0
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
setne al
and cl,al
and bpl,cl
mov ecx,dword ptr [r10+r9+8]
mov eax,ecx
addsd xmm7,xmm0
imul ecx,dword ptr [r10+8]
xorps xmm0,xmm0
imul eax,dword ptr [rdi+r10+8]
cmp dword ptr [r10+0Ch],0FFFFFFFFh
cvtsi2sd xmm0,rax
mov eax,ecx
setne cl
cmp dword ptr [r10+rdi+0Ch],0FFFFFFFFh
addsd xmm6,xmm0
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
setne al
and cl,al
and bpl,cl
mov ecx,dword ptr [r10+r9+0Ch]
mov eax,ecx
d += (b * e);
addsd xmm7,xmm0
xorps xmm0,xmm0
imul eax,dword ptr [r10+rdi+0Ch]
imul ecx,dword ptr [r10+0Ch]
cvtsi2sd xmm0,rax
addsd xmm6,xmm0
cmp dword ptr [r10+10h],0FFFFFFFFh
mov eax,ecx
xorps xmm0,xmm0
setne cl
cmp dword ptr [r10+rdi+10h],0FFFFFFFFh
cvtsi2sd xmm0,rax
setne al
and cl,al
and bpl,cl
mov ecx,dword ptr [r10+r9+10h]
addsd xmm7,xmm0
mov eax,ecx
xorps xmm0,xmm0
imul ecx,dword ptr [r10+10h]
imul eax,dword ptr [r10+rdi+10h]
cmp dword ptr [r10+14h],0FFFFFFFFh
cvtsi2sd xmm0,rax
mov eax,ecx
setne cl
cmp dword ptr [r10+rdi+14h],0FFFFFFFFh
addsd xmm6,xmm0
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
setne al
and cl,al
and bpl,cl
mov ecx,dword ptr [r10+r9+14h]
mov eax,ecx
addsd xmm7,xmm0
imul ecx,dword ptr [r10+14h]
xorps xmm0,xmm0
imul eax,dword ptr [r10+rdi+14h]
cmp dword ptr [r10+18h],0FFFFFFFFh
cvtsi2sd xmm0,rax
mov eax,ecx
setne cl
cmp dword ptr [r10+rdi+18h],0FFFFFFFFh
addsd xmm6,xmm0
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
setne al
and cl,al
and bpl,cl
mov ecx,dword ptr [r10+r9+18h]
mov eax,ecx
addsd xmm7,xmm0
imul ecx,dword ptr [r10+18h]
xorps xmm0,xmm0
imul eax,dword ptr [r10+rdi+18h]
cmp dword ptr [r10+1Ch],0FFFFFFFFh
cvtsi2sd xmm0,rax
mov eax,ecx
setne cl
cmp dword ptr [r10+rdi+1Ch],0FFFFFFFFh
addsd xmm6,xmm0
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
setne al
and cl,al
and bpl,cl
mov ecx,dword ptr [r10+r9+1Ch]
mov eax,ecx
addsd xmm7,xmm0
imul ecx,dword ptr [r10+1Ch]
xorps xmm0,xmm0
imul eax,dword ptr [r10+rdi+1Ch]
cmp dword ptr [r10+20h],0FFFFFFFFh
cvtsi2sd xmm0,rax
mov eax,ecx
setne cl
cmp dword ptr [r10+rdi+20h],0FFFFFFFFh
addsd xmm6,xmm0
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
setne al
and cl,al
and bpl,cl
mov ecx,dword ptr [r10+r9+20h]
mov eax,ecx
addsd xmm7,xmm0
imul eax,dword ptr [r10+rdi+20h]
imul ecx,dword ptr [r10+20h]
xorps xmm0,xmm0
add r10,28h
cvtsi2sd xmm0,rax
mov eax,ecx
addsd xmm6,xmm0
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
addsd xmm7,xmm0
dec rbx
jne 013FC6DA71h
}
finish = __rdtscp(&y);
rdtscp
shl rdx,20h
lea r8,[y]
or rax,rdx
mov dword ptr [r8],ecx
(It looks like the compiler unrolled this loop ten times for Technique #1: ebx is initialized to 46h = 70 iterations, and each iteration advances r10 by 28h = 40 bytes, i.e. 10 ints.)
Technique #2 C++:
unsigned int x;
unsigned int y;
unsigned short pos = 0;
double c = 0;
double d = 0;
bool data_for_all_items = true;
unsigned long long start = 0;
unsigned long long finish = 0;
unsigned int array[2100];

//I have left out code for simplicity. You can assume by now the array is populated.

start = __rdtscp(&x);
while(pos < 2100){
    unsigned int a = array[pos + 0];
    unsigned int b = array[pos + 1];
    data_for_all_items = data_for_all_items & (a != -1 & b != -1);
    unsigned int e = array[pos + 2];
    c += (a * e);
    d += (b * e);
    pos = pos + 3;
}
finish = __rdtscp(&y);

if(data_for_all_items){
    std::cout << finish - start << std::endl;
    //This line prevents the compiler from optimizing the whole loop away
    std::cout << c << d << std::endl;
}
Assembly for Technique #2:
start = __rdtscp(&x);
rdtscp
shl rdx,20h
lea r9,[this]
mov r11d,2BCh
or rax,rdx
mov dword ptr [r9],ecx
add r10,8
mov rbx,rax
nop word ptr [rax+rax]
while(pos < 2100){
unsigned int a = array[pos];
unsigned int b = array[pos + 1];
all_instr_found = all_instr_found & (a != -1 & b != -1);
cmp dword ptr [r10-4],0FFFFFFFFh
xorps xmm0,xmm0
setne dl
cmp dword ptr [r10-8],0FFFFFFFFh
setne cl
unsigned int e = array[pos + 2];
c += (a * e);
d += (b * e);
pos = pos + 3;
add r10,0Ch
and dl,cl
mov ecx,dword ptr [r10-0Ch]
mov eax,ecx
and r15b,dl
imul ecx,dword ptr [r10-10h]
imul eax,dword ptr [r10-14h]
cvtsi2sd xmm0,rax
mov eax,ecx
addsd xmm6,xmm0
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
addsd xmm7,xmm0
dec r11
jne 013F21DA30h
}
finish = __rdtscp(&y);
rdtscp
shl rdx,20h
lea r8,[y]
or rax,rdx
mov dword ptr [r8],ecx
So no loop unrolling? That must be the reason, right? Well, I manually unrolled the loop using this C++:
while(pos < 2100){
    unsigned int a1 = array[pos + 0];
    unsigned int b1 = array[pos + 1];
    unsigned int e1 = array[pos + 2];
    unsigned int a2 = array[pos + 3];
    unsigned int b2 = array[pos + 4];
    unsigned int e2 = array[pos + 5];
    unsigned int a3 = array[pos + 6];
    unsigned int b3 = array[pos + 7];
    unsigned int e3 = array[pos + 8];
    unsigned int a4 = array[pos + 9];
    unsigned int b4 = array[pos + 10];
    unsigned int e4 = array[pos + 11];
    unsigned int a5 = array[pos + 12];
    unsigned int b5 = array[pos + 13];
    unsigned int e5 = array[pos + 14];
    c += (a1 * e1) + (a2 * e2) + (a3 * e3) + (a4 * e4) + (a5 * e5);
    d += (b1 * e1) + (b2 * e2) + (b3 * e3) + (b4 * e4) + (b5 * e5);
    data_for_all_items = data_for_all_items & (a1 != -1 & b1 != -1) & (a2 != -1 & b2 != -1) & (a3 != -1 & b3 != -1) & (a4 != -1 & b4 != -1) & (a5 != -1 & b5 != -1);
    pos += 15;
}
finish = __rdtscp(&y);

if(data_for_all_items){
    std::cout << finish - start << std::endl;
    //This line prevents the compiler from optimizing the whole loop away
    std::cout << c << d << std::endl;
}
generating this assembly:
start = __rdtscp(&x);
rdtscp
lea r9,[x]
shl rdx,20h
mov qword ptr [rsp+108h],8Ch
mov dword ptr [r9],ecx
lea r9,[r8+8]
or rax,rdx
mov qword ptr [y],r9
mov qword ptr [rsp+20h],rax
nop
unsigned int a1 = array[pos + 0];
unsigned int b1 = array[pos + 1];
unsigned int e1 = array[pos + 2];
mov edi,dword ptr [r9]
unsigned int a2 = array[pos + 3];
mov r13d,dword ptr [r9+4]
unsigned int b2 = array[pos + 4];
mov r12d,dword ptr [r9+8]
xorps xmm0,xmm0
unsigned int e2 = array[pos + 5];
mov r10d,dword ptr [r9+0Ch]
unsigned int a3 = array[pos + 6];
mov r15d,dword ptr [r9+10h]
unsigned int b3 = array[pos + 7];
mov r14d,dword ptr [r9+14h]
unsigned int e3 = array[pos + 8];
mov r8d,dword ptr [r9+18h]
unsigned int a4 = array[pos + 9];
mov ebp,dword ptr [r9+1Ch]
unsigned int b4 = array[pos + 10];
mov esi,dword ptr [r9+20h]
unsigned int e4 = array[pos + 11];
mov edx,dword ptr [r9+24h]
unsigned int a5 = array[pos + 12];
mov ebx,dword ptr [r9+28h]
unsigned int b5 = array[pos + 13];
mov r11d,dword ptr [r9+2Ch]
unsigned int e5 = array[pos + 14];
mov r9d,dword ptr [r9+30h]
c += (a1 * e1) + (a2 * e2) + (a3 * e3) + (a4 * e4) + (a5 * e5);
mov eax,edx
mov dword ptr [x],r13d
d += (b1 * e1) + (b2 * e2) + (b3 * e3) + (b4 * e4) + (b5 * e5);
imul edx,esi
imul eax,ebp
mov ecx,r9d
imul r9d,r11d
imul ecx,ebx
add r9d,edx
add ecx,eax
mov eax,r8d
imul r8d,r14d
imul eax,r15d
add r9d,r8d
all_instr_found = all_instr_found & (a1 != -1 & b1 != -1) & (a2 != -1 & b2 != -1) & (a3 != -1 & b3 != -1) & (a4 != -1 & b4 != -1) & (a5 != -1 & b5 != -1);
movzx r8d,byte ptr [this]
add ecx,eax
mov eax,r10d
imul r10d,r12d
add r9d,r10d
imul eax,r13d
mov r13,qword ptr [y]
add ecx,eax
mov eax,edi
imul eax,dword ptr [r13-8]
add eax,ecx
cvtsi2sd xmm0,rax
mov rax,r13
mov edx,dword ptr [rax-4]
imul edi,edx
cmp r11d,0FFFFFFFFh
addsd xmm6,xmm0
setne cl
all_instr_found = all_instr_found & (a1 != -1 & b1 != -1) & (a2 != -1 & b2 != -1) & (a3 != -1 & b3 != -1) & (a4 != -1 & b4 != -1) & (a5 != -1 & b5 != -1);
cmp ebx,0FFFFFFFFh
lea eax,[rdi+r9]
mov r9,r13
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
setne al
and cl,al
cmp esi,0FFFFFFFFh
setne al
and cl,al
cmp ebp,0FFFFFFFFh
setne al
and cl,al
cmp r14d,0FFFFFFFFh
addsd xmm7,xmm0
setne al
and cl,al
cmp r15d,0FFFFFFFFh
setne al
and cl,al
cmp r12d,0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [x],0FFFFFFFFh
setne al
and cl,al
cmp edx,0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [r13-8],0FFFFFFFFh
setne al
and cl,al
and r8b,cl
pos += 15;
add r9,3Ch
mov qword ptr [y],r9
mov byte ptr [this],r8b
while(pos < 2100){
dec qword ptr [rsp+108h]
jne 013F31DA50h
}
finish = __rdtscp(&y);
rdtscp
shl rdx,20h
lea r9,[y]
mov dword ptr [r9],ecx
mov r15,qword ptr [rsp+0A8h]
mov r13,qword ptr [rsp+0B0h]
mov r12,qword ptr [rsp+0B8h]
or rax,rdx
The new manually unrolled loop takes 3500 CPU cycles on average, slower than the original, non-unrolled version, which took 3400 CPU cycles.
Can somebody please explain why code that should need a third as many cache-line fetches is slower, and whether there is a way to help the compiler, or otherwise make this code faster than technique #1?
Upvotes: 2
Views: 264
Reputation: 3350
Intel Ivy Bridge CPUs can prefetch multiple cache lines ahead of use, and as long as the prediction guesses right, you won't notice any difference between a single stream of data and up to ~6 concurrent streams (the exact number of streams depends on the CPU design).
Most likely the differences you see have more to do with the change in assembly output than with cache behavior. The generated asm is clearly very different in each case, and the last version (your hand-unrolled one) has the most instructions per iteration, a pretty safe indication that it should be slower simply because it forces more "real work" through the CPU's instruction decoder and execution units.
Moreover, cvtsi2sd in particular is a slow instruction on Ivy Bridge, and it could skew your benchmark result depending on where it sits in the instruction stream relative to its input (a dependency) and its output (the result latency). If you want to isolate your test to measure memory performance alone, I would strongly recommend eliminating that instruction from the benchmark.
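One way to do that, as a sketch (assuming it is acceptable for your use case to sum the 32-bit products in a 64-bit integer), is to accumulate c and d as integers and convert to double once, after the loop:

unsigned long long c_acc = 0;
unsigned long long d_acc = 0;
for(int i = 0; i < 700; i++){
    unsigned int a = array1[i];
    unsigned int b = array2[i];
    unsigned int e = array3[i];
    data_for_all_items = data_for_all_items & (a != -1 & b != -1);
    c_acc += a * e; //same 32-bit products as before; 700 of them cannot overflow 64 bits
    d_acc += b * e;
}
double c = (double)c_acc; //one int-to-double conversion per accumulator instead of one per item
double d = (double)d_acc;

That keeps the timed loop to loads, integer multiplies and adds, so what you measure is much closer to the memory behaviour you are trying to compare.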
Upvotes: 0
Reputation: 31300
Some naive back-of-the-envelope arithmetic.
Fetching from an int array abc[3N] in a loop will require (3N*4+63)/64 cache-line fetches: after processing 5.33 triplets (5 triplets * 3 ints * 4 bytes = 60 bytes) you'll need the next cache line.
Fetching from three separate arrays requires 3 fetches on the first iteration, but those three cache lines then serve the next 15 iterations (64/4 = 16 ints per line). So you'll need 3*((N*4+63)/64) fetches in total.
Using 700 for N:
F1 = (8400+63)/64 = 132
F2 = 3*((700*4+63)/64) = 132
Given the cache sizes of these CPUs, there's no chance that lines get evicted while the iteration proceeds through the three arrays in parallel.
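A compilable version of the same arithmetic, if you want to double-check it (the 64-byte line size is the assumption used above):

#include <cstdio>

int main(){
    const int N = 700;   //number of triplets
    const int LINE = 64; //cache-line size in bytes
    int F1 = (3 * N * 4 + LINE - 1) / LINE;   //one interleaved int array abc[3N]
    int F2 = 3 * ((N * 4 + LINE - 1) / LINE); //three separate int arrays
    printf("F1 = %d, F2 = %d\n", F1, F2);     //both print 132
    return 0;
}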
Upvotes: 0
Reputation: 45684
Use some prefetching in #2, and you might just reach that lofty performance goal.
For gcc, use inline assembler and the 3DNow! prefetch instructions (just about all that's left of that extension nowadays).
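gcc can also emit prefetches without inline assembler, via __builtin_prefetch; here is a minimal sketch against the Technique #2 loop (the distance of 48 ints, i.e. 3 cache lines ahead, is a guess you would have to tune):

while(pos < 2100){
    if(pos + 48 < 2100) //stay inside the array when prefetching ahead
        __builtin_prefetch(&array[pos + 48], 0 /*read*/, 3 /*high temporal locality*/);
    unsigned int a = array[pos + 0];
    unsigned int b = array[pos + 1];
    data_for_all_items = data_for_all_items & (a != -1 & b != -1);
    unsigned int e = array[pos + 2];
    c += (a * e);
    d += (b * e);
    pos = pos + 3;
}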
Your example number one probably runs faster because its 3 independent memory requests (which out-of-order execution can overlap) mean better throughput on the memory bus (less idle time).
That's a nice example of micro-optimisation not only making things harder for you, but being self-defeating.
Processor designers and manufacturers, as well as compiler writers, spend truly amazing amounts of money, time and talent optimizing for the common case, so second-guessing them is hard.
Upvotes: 1