Lou
Lou

Reputation: 4474

C compiler loop unrolling clarification

I am having trouble understanding how the MSVC compiler is unrolling the following loop (sorry for my poor understanding of assembly language):

#define NUM_ITERATIONS (1000 * 1000 * 1000)
double dummySum = 0;

for (int x = 0; x < NUM_ITERATIONS; x++) {
    if (x & 1) 
       dummySum += x;
}

This is the generated assembly:

00007FF7B4511070  xorps       xmm1,xmm1  
        double dummySum = 0;
00007FF7B4511073  mov         ecx,2  
00007FF7B4511078  nop         dword ptr [rax+rax]  
        if (x & 1) 
00007FF7B4511080  lea         eax,[rcx-2]  
00007FF7B4511083  mov         r8d,eax  
00007FF7B4511086  and         r8d,1  
00007FF7B451108A  je          someTest+28h (07FF7B4511098h)  
            dummySum += x;
00007FF7B451108C  movd        xmm0,eax  
00007FF7B4511090  cvtdq2pd    xmm0,xmm0  
00007FF7B4511094  addsd       xmm1,xmm0  
        if (x & 1) 
00007FF7B4511098  lea         edx,[rcx-1]  
00007FF7B451109B  and         edx,1  
00007FF7B451109E  je          someTest+3Fh (07FF7B45110AFh)  
            dummySum += x;
00007FF7B45110A0  lea         eax,[rcx-1]  
00007FF7B45110A3  movd        xmm0,eax  
00007FF7B45110A7  cvtdq2pd    xmm0,xmm0  
00007FF7B45110AB  addsd       xmm1,xmm0  
00007FF7B45110AF  test        r8d,r8d  
        if (x & 1) 
00007FF7B45110B2  je          someTest+50h (07FF7B45110C0h)  
            dummySum += x;
00007FF7B45110B4  movd        xmm0,ecx  
00007FF7B45110B8  cvtdq2pd    xmm0,xmm0  
00007FF7B45110BC  addsd       xmm1,xmm0  
00007FF7B45110C0  test        edx,edx  
        if (x & 1) 
00007FF7B45110C2  je          someTest+63h (07FF7B45110D3h)  
            dummySum += x;
00007FF7B45110C4  lea         eax,[rcx+1]  
00007FF7B45110C7  movd        xmm0,eax  
00007FF7B45110CB  cvtdq2pd    xmm0,xmm0  
00007FF7B45110CF  addsd       xmm1,xmm0  
00007FF7B45110D3  test        r8d,r8d  
        if (x & 1) 
00007FF7B45110D6  je          someTest+77h (07FF7B45110E7h)  
            dummySum += x;
00007FF7B45110D8  lea         eax,[rcx+2]  
00007FF7B45110DB  movd        xmm0,eax  
00007FF7B45110DF  cvtdq2pd    xmm0,xmm0  
00007FF7B45110E3  addsd       xmm1,xmm0  
00007FF7B45110E7  test        edx,edx  
        if (x & 1) 
00007FF7B45110E9  je          someTest+8Ah (07FF7B45110FAh)  
            dummySum += x;
00007FF7B45110EB  lea         eax,[rcx+3]  
00007FF7B45110EE  movd        xmm0,eax  
00007FF7B45110F2  cvtdq2pd    xmm0,xmm0  
00007FF7B45110F6  addsd       xmm1,xmm0  
00007FF7B45110FA  test        r8d,r8d  
        if (x & 1) 
00007FF7B45110FD  je          someTest+9Eh (07FF7B451110Eh)  
            dummySum += x;
00007FF7B45110FF  lea         eax,[rcx+4]  
00007FF7B4511102  movd        xmm0,eax  
00007FF7B4511106  cvtdq2pd    xmm0,xmm0  
00007FF7B451110A  addsd       xmm1,xmm0  
00007FF7B451110E  test        edx,edx  
        if (x & 1) 
00007FF7B4511110  je          someTest+0B1h (07FF7B4511121h)  
            dummySum += x;
00007FF7B4511112  lea         eax,[rcx+5]  
00007FF7B4511115  movd        xmm0,eax  
00007FF7B4511119  cvtdq2pd    xmm0,xmm0  
00007FF7B451111D  addsd       xmm1,xmm0  
00007FF7B4511121  test        r8d,r8d  
        if (x & 1) 
00007FF7B4511124  je          someTest+0C5h (07FF7B4511135h)  
            dummySum += x;
00007FF7B4511126  lea         eax,[rcx+6]  
00007FF7B4511129  movd        xmm0,eax  
00007FF7B451112D  cvtdq2pd    xmm0,xmm0  
00007FF7B4511131  addsd       xmm1,xmm0  
00007FF7B4511135  test        edx,edx  
        if (x & 1) 
00007FF7B4511137  je          someTest+0D8h (07FF7B4511148h)  
            dummySum += x;
00007FF7B4511139  lea         eax,[rcx+7]  
00007FF7B451113C  movd        xmm0,eax  
00007FF7B4511140  cvtdq2pd    xmm0,xmm0  
00007FF7B4511144  addsd       xmm1,xmm0  

    for (int x = 0; x < NUM_ITERATIONS; x++) {
00007FF7B4511148  add         ecx,0Ah  
00007FF7B451114B  lea         eax,[rcx-2]  
00007FF7B451114E  cmp         eax,3B9ACA00h  
00007FF7B4511153  jl          someTest+10h (07FF7B4511080h)  
    }

I understand this part (the beginning of the loop):

// if (x % 2 == 0) jump over the summation

00007FF7B4511073  mov         ecx,2                          // ecx/rcx = 2
00007FF7B4511080  lea         eax,[rcx-2]                    // eax = rcx - 2
00007FF7B4511083  mov         r8d,eax                        // r8d = eax
00007FF7B4511086  and         r8d,1                          // r8d = r8d & 1
00007FF7B451108A  je          someTest+28h (07FF7B4511098h)  // jump if zero

// add double 

00007FF7B451108C  movd        xmm0,eax  
00007FF7B4511090  cvtdq2pd    xmm0,xmm0  
00007FF7B4511094  addsd       xmm1,xmm0  

But I don't understand how subsequent jump instructions seem to skip the next lea instruction, if I look at the addresses (this is presuming that a jump takes place) - note that I omitted instructions between jumps, from the listing above:

00007FF7B45110C0  test        edx,edx  
00007FF7B45110C2  je          someTest+63h (07FF7B45110D3h) 

... addresses in between omitted ...

00007FF7B45110D3  test        r8d,r8d  
00007FF7B45110D6  je          someTest+77h (07FF7B45110E7h)  

... addresses in between omitted ...

00007FF7B45110E7  test        edx,edx  
00007FF7B45110E9  je          someTest+8Ah (07FF7B45110FAh)  

... addresses in between omitted ...

00007FF7B45110FA  test        r8d,r8d  
00007FF7B45110FD  je          someTest+9Eh (07FF7B451110Eh)  

... addresses in between omitted ...

00007FF7B451110E  test        edx,edx  
00007FF7B4511110  je          someTest+0B1h (07FF7B4511121h)  

If each jump takes place, it seems it would just be alternating test r8d,r8d and test edx,edx instructions, without loading the next value.

What am I interpreting incorrectly here?

Upvotes: 1

Views: 105

Answers (1)

Lou
Lou

Reputation: 4474

OK, got it: I went through the disassembly step by step, and the compiler is rather smart. The loop is unrolled so that each pass through the generated loop executes 10 iterations of the original loop body, and the instructions are arranged so that r8d and edx are computed only once per pass:

lea         eax,[rcx-2]  
mov         r8d,eax  
and         r8d,1        // r8d is 0 here
...
lea         edx,[rcx-1]  
and         edx,1        // edx is 1 here

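After that, these registers are not recomputed for the rest of the pass, because the compiler evidently realized that x & 1 simply alternates between consecutive values of x: r8d (the parity of rcx-2) is reused for every even offset, and edx (the parity of rcx-1) is reused for every odd offset. The lea instructions that the jumps skip over are only needed to compute the value to add, not to decide whether to add it: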

00007FF7B45110C0  test        edx,edx  // always 1
00007FF7B45110C2  je          someTest+63h (07FF7B45110D3h) 

... addresses in between omitted ...

00007FF7B45110D3  test        r8d,r8d  // always 0
00007FF7B45110D6  je          someTest+77h (07FF7B45110E7h)  

... addresses in between omitted ...

00007FF7B45110E7  test        edx,edx  // always 1
00007FF7B45110E9  je          someTest+8Ah (07FF7B45110FAh)  

... addresses in between omitted ...

00007FF7B45110FA  test        r8d,r8d  // always 0
00007FF7B45110FD  je          someTest+9Eh (07FF7B451110Eh)  

... addresses in between omitted ...

00007FF7B451110E  test        edx,edx  // always 1
00007FF7B4511110  je          someTest+0B1h (07FF7B4511121h)  

Upvotes: 1

Related Questions