Reputation: 4474
I am having trouble understanding how the MSVC compiler is unrolling the following loop (sorry for my poor understanding of assembly language):
#define NUM_ITERATIONS (1000 * 1000 * 1000)
double dummySum = 0;
for (int x = 0; x < NUM_ITERATIONS; x++) {
if (x & 1)
dummySum += x;
}
This is the generated assembly:
00007FF7B4511070 xorps xmm1,xmm1
double dummySum = 0;
00007FF7B4511073 mov ecx,2
00007FF7B4511078 nop dword ptr [rax+rax]
if (x & 1)
00007FF7B4511080 lea eax,[rcx-2]
00007FF7B4511083 mov r8d,eax
00007FF7B4511086 and r8d,1
00007FF7B451108A je someTest+28h (07FF7B4511098h)
dummySum += x;
00007FF7B451108C movd xmm0,eax
00007FF7B4511090 cvtdq2pd xmm0,xmm0
00007FF7B4511094 addsd xmm1,xmm0
if (x & 1)
00007FF7B4511098 lea edx,[rcx-1]
00007FF7B451109B and edx,1
00007FF7B451109E je someTest+3Fh (07FF7B45110AFh)
dummySum += x;
00007FF7B45110A0 lea eax,[rcx-1]
00007FF7B45110A3 movd xmm0,eax
00007FF7B45110A7 cvtdq2pd xmm0,xmm0
00007FF7B45110AB addsd xmm1,xmm0
00007FF7B45110AF test r8d,r8d
if (x & 1)
00007FF7B45110B2 je someTest+50h (07FF7B45110C0h)
dummySum += x;
00007FF7B45110B4 movd xmm0,ecx
00007FF7B45110B8 cvtdq2pd xmm0,xmm0
00007FF7B45110BC addsd xmm1,xmm0
00007FF7B45110C0 test edx,edx
if (x & 1)
00007FF7B45110C2 je someTest+63h (07FF7B45110D3h)
dummySum += x;
00007FF7B45110C4 lea eax,[rcx+1]
00007FF7B45110C7 movd xmm0,eax
00007FF7B45110CB cvtdq2pd xmm0,xmm0
00007FF7B45110CF addsd xmm1,xmm0
00007FF7B45110D3 test r8d,r8d
if (x & 1)
00007FF7B45110D6 je someTest+77h (07FF7B45110E7h)
dummySum += x;
00007FF7B45110D8 lea eax,[rcx+2]
00007FF7B45110DB movd xmm0,eax
00007FF7B45110DF cvtdq2pd xmm0,xmm0
00007FF7B45110E3 addsd xmm1,xmm0
00007FF7B45110E7 test edx,edx
if (x & 1)
00007FF7B45110E9 je someTest+8Ah (07FF7B45110FAh)
dummySum += x;
00007FF7B45110EB lea eax,[rcx+3]
00007FF7B45110EE movd xmm0,eax
00007FF7B45110F2 cvtdq2pd xmm0,xmm0
00007FF7B45110F6 addsd xmm1,xmm0
00007FF7B45110FA test r8d,r8d
if (x & 1)
00007FF7B45110FD je someTest+9Eh (07FF7B451110Eh)
dummySum += x;
00007FF7B45110FF lea eax,[rcx+4]
00007FF7B4511102 movd xmm0,eax
00007FF7B4511106 cvtdq2pd xmm0,xmm0
00007FF7B451110A addsd xmm1,xmm0
00007FF7B451110E test edx,edx
if (x & 1)
00007FF7B4511110 je someTest+0B1h (07FF7B4511121h)
dummySum += x;
00007FF7B4511112 lea eax,[rcx+5]
00007FF7B4511115 movd xmm0,eax
00007FF7B4511119 cvtdq2pd xmm0,xmm0
00007FF7B451111D addsd xmm1,xmm0
00007FF7B4511121 test r8d,r8d
if (x & 1)
00007FF7B4511124 je someTest+0C5h (07FF7B4511135h)
dummySum += x;
00007FF7B4511126 lea eax,[rcx+6]
00007FF7B4511129 movd xmm0,eax
00007FF7B451112D cvtdq2pd xmm0,xmm0
00007FF7B4511131 addsd xmm1,xmm0
00007FF7B4511135 test edx,edx
if (x & 1)
00007FF7B4511137 je someTest+0D8h (07FF7B4511148h)
dummySum += x;
00007FF7B4511139 lea eax,[rcx+7]
00007FF7B451113C movd xmm0,eax
00007FF7B4511140 cvtdq2pd xmm0,xmm0
00007FF7B4511144 addsd xmm1,xmm0
for (int x = 0; x < NUM_ITERATIONS; x++) {
00007FF7B4511148 add ecx,0Ah
00007FF7B451114B lea eax,[rcx-2]
00007FF7B451114E cmp eax,3B9ACA00h
00007FF7B4511153 jl someTest+10h (07FF7B4511080h)
}
I understand this part (the beginning of the loop):
// if (x % 2 == 0) jump over the summation
00007FF7B4511073 mov ecx,2 // ecx/rcx = 2
00007FF7B4511080 lea eax,[rcx-2] // eax = rcx - 2
00007FF7B4511083 mov r8d,eax // r8d = eax
00007FF7B4511086 and r8d,1 // r8d & 1
00007FF7B451108A je someTest+28h (07FF7B4511098h) // jump if zero
// add double
00007FF7B451108C movd xmm0,eax
00007FF7B4511090 cvtdq2pd xmm0,xmm0
00007FF7B4511094 addsd xmm1,xmm0
But I don't understand how subsequent jump instructions seem to skip the next lea
instruction, if I look at the addresses (this is presuming that a jump takes place) - note that I omitted instructions between jumps, from the listing above:
00007FF7B45110C0 test edx,edx
00007FF7B45110C2 je someTest+63h (07FF7B45110D3h)
... addresses in between omitted ...
00007FF7B45110D3 test r8d,r8d
00007FF7B45110D6 je someTest+77h (07FF7B45110E7h)
... addresses in between omitted ...
00007FF7B45110E7 test edx,edx
00007FF7B45110E9 je someTest+8Ah (07FF7B45110FAh)
... addresses in between omitted ...
00007FF7B45110FA test r8d,r8d
00007FF7B45110FD je someTest+9Eh (07FF7B451110Eh)
... addresses in between omitted ...
00007FF7B451110E test edx,edx
00007FF7B4511110 je someTest+0B1h (07FF7B4511121h)
If each jump takes place, it seems it would just be alternating test r8d,r8d
and test edx,edx
instructions, without loading the next value.
What am I interpreting incorrectly here?
Upvotes: 1
Views: 105
Reputation: 4474
Ok, got it: I went through the disassembly step by step, and the compiler is rather smart. The loop is unrolled so that each pass through the assembly loop handles 10 iterations of the source loop (note the add ecx,0Ah at the bottom), and the instructions are arranged so that r8d
and edx
are computed only once per pass:
lea eax,[rcx-2]
mov r8d,eax
and r8d,1 // r8d is 0 here
...
lea edx,[rcx-1]
and edx,1 // edx is 1 here
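After that, these registers are not recomputed for the rest of that pass through the unrolled body, because the compiler realized that the parity of x simply alternates from one step to the next, so x & 1
has the same value on every even step (kept in r8d, always 0) and on every odd step (kept in edx, always 1). A taken jump therefore skips only the lea/movd/cvtdq2pd/addsd sequence that would compute x and add it to the sum; the parity itself never needs to be reloaded: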
00007FF7B45110C0 test edx,edx // always 1
00007FF7B45110C2 je someTest+63h (07FF7B45110D3h)
... addresses in between omitted ...
00007FF7B45110D3 test r8d,r8d // always 0
00007FF7B45110D6 je someTest+77h (07FF7B45110E7h)
... addresses in between omitted ...
00007FF7B45110E7 test edx,edx // always 1
00007FF7B45110E9 je someTest+8Ah (07FF7B45110FAh)
... addresses in between omitted ...
00007FF7B45110FA test r8d,r8d // always 0
00007FF7B45110FD je someTest+9Eh (07FF7B451110Eh)
... addresses in between omitted ...
00007FF7B451110E test edx,edx // always 1
00007FF7B4511110 je someTest+0B1h (07FF7B4511121h)
Upvotes: 1