Reputation: 30605

Intel compiler produces code 68% slower than MSVC (full example provided)

I have C++ code processing three consecutive values from one single 1800-element array. The code compiled by ICC 14.0 is approximately 68% slower (2700 vs 1600 CPU cycles) than the code produced by MSVC. I cannot understand why. Could somebody please help? Even when I set the Intel compiler's -O3 switch, the timing doesn't change. The CPU is Ivy Bridge.

#include <iostream>

// Benchmark driver: fills a 1200-element array, then runs 10000 timed passes.
// Each pass sums the products of adjacent pairs (data[p] * data[p + 1]) and
// prints the __rdtsc() delta together with the resulting sum.
// NOTE(review): no header declaring __rdtsc is included in this listing;
// presumably <intrin.h> (MSVC) or <x86intrin.h> (GCC-compatible) is required -- confirm.
int main(){
        int data[1200];

        //Dummy-populate data
        for(int y=0; y<1200; y++){
            data[y] = y/2 + 7;   // integer division: each even/odd pair gets the same value
        }

        int counter = 0;

        //Just to repeat the test
        while(counter < 10000){

            int Accum = 0;
            long long start = 0;
            long long end = 0;
            int p = 0;

            // First timestamp, taken immediately before the measured loop.
            start = __rdtsc();

            // Measured loop: consumes the array two elements per iteration.
            while(p < 1200){
                unsigned int level1 = data[p];  
                unsigned int factor = data[p + 1];
                Accum += (level1 * factor);   // the multiply is done in unsigned arithmetic
                p = p + 2;
            }

            // Second timestamp; the printing below is outside the measured span.
            end = __rdtsc();
            std::cout << (end - start) << "  " << Accum << std::endl;
            counter++;
        }
}

Upvotes: 6

Answers (2)

osgx

Reputation: 94175

user997112, I tested your new code (only one level and accum), and I get only a 5% difference between gcc and icc with the -O3 option (-march=native -mtune=native may help you). I have a Core 2 Q6600 fixed at 2.4 GHz; the best results are 1800 for gcc and 1900 for icc.

Here is my version of the test (rdtsc() is redefined with GNU asm, runtimes are saved in an array, and only the minimal (best) runtime is printed):

$ cat my.cc
#include <iostream>

#if 1
// my cpu has no rdtscp, so use asm
// Reads the time-stamp counter through GNU inline asm and returns it as one
// 64-bit value. The asm block executes CPUID (with EAX = 0) and then RDTSC.
// NOTE(review): CPUID is presumably placed here as a serializing instruction,
// so that earlier instructions retire before the counter is read -- confirm.
inline unsigned long long rdtsc() __attribute__((always_inline));
inline unsigned long long rdtsc() {
  unsigned int lo, hi;   // low and high 32-bit halves, taken from EAX and EDX
  asm volatile (
     "cpuid \n"
     "rdtsc" 
   : "=a"(lo), "=d"(hi) /* outputs */
   : "a"(0)             /* inputs */
   : "%ebx", "%ecx");     /* clobbers*/
  // Widen both halves before shifting so the high half lands in bits 32..63.
  return ((unsigned long long)lo) | (((unsigned long long)hi) << 32);
}
#else
#define rdtsc __rdtsc
#endif

// Benchmark driver: runs the pair-product loop 10000 times, storing each
// pass's rdtsc() delta in stats[] and its sum in dummy[] instead of printing
// inside the repeat loop. Afterwards it scans stats[] for the smallest delta,
// printing every new running minimum with its sum, and finally the minimum.
int main(){
        int data[1200];
        int dummy[10000];   // per-pass Accum values
        int stats[10000];   // per-pass cycle deltas

        //Dummy-populate data
        for(int y=0; y<1200; y++){
            data[y] = y/2 + 7;   // integer division: each even/odd pair gets the same value
        }
        for(int y=0; y<10000; y++){
            stats[y]=0;
        }

        int counter = 0;

        //Just to repeat the test
        while(counter < 10000){

            int Accum = 0;
            long long start = 0;
            long long end = 0;
            int p = 0;

            // First timestamp, taken immediately before the measured loop.
            start = rdtsc();

            // Measured loop: consumes the array two elements per iteration.
            while(p < 1200){
                unsigned int level1 = data[p];  
                unsigned int factor = data[p + 1];
                Accum += (level1 * factor);   // the multiply is done in unsigned arithmetic
                p = p + 2;
            }

            end = rdtsc();
            // The long long difference is narrowed to int when stored.
            stats[counter]=(end - start);
            // Storing the sum keeps the loop's result in use after the pass.
            dummy[counter]=Accum;
            counter++;
        }

        // 0xfffff (1048575) is the starting bound: a delta at or above it never replaces min.
        int min=0xfffff;
        for(int y=0; y<10000; y++) {
            if(stats[y] < min) {
                // New running minimum: report it along with the sum from the same pass.
                min = stats[y];
                std::cout << min << std::endl;
                std::cout << "accum " << dummy[y] << std::endl;
            }
        }
        // Final line of output is the overall best (smallest) delta.
        std::cout << min << std::endl;
}

Compiled with icc 14 and gcc 4.8 as:

$ g++ my.cc -o mygccO3t -O3 -march=native -mtune=native
$ icc my.cc -o myiccO3t -O3 -march=native -mtune=native

Results (CPU frequency changing is disabled at 2.4 GHz, core is fixed by taskset, measured by Linux PMU access tool perf):

$ taskset -c 3 perf stat -e cycles:u,instructions:u ./myiccO3t |tail -n 1 
 Performance counter stats for './myiccO3t':
    23 875 260 cycles:u                 
    28 866 440 instructions:u            #    1,21  insns per cycle        

   0,011297567 seconds time elapsed
1899

$ taskset -c 3 perf stat -e cycles:u,instructions:u ./mygccO3t |tail -n 1 
 Performance counter stats for './mygccO3t':
    22 389 238 cycles:u                 
    43 551 129 instructions:u            #    1,95  insns per cycle        

   0,010683920 seconds time elapsed
1800

So we can see that gcc needs many more instructions to handle the same amount of data, but it also achieves a better IPC (instructions per clock) rate.

Here is the simple assembly code for the inner loop from gcc:

  4009b9:       45 31 c0                xor    %r8d,%r8d
  4009bc:       45 31 c9                xor    %r9d,%r9d
  4009bf:       90                      nop
  4009c0:       44 89 c8                mov    %r9d,%eax
  4009c3:       0f a2                   cpuid  
  4009c5:       0f 31                   rdtsc  
  4009c7:       49 89 d2                mov    %rdx,%r10
  4009ca:       89 c0                   mov    %eax,%eax
  4009cc:       48 89 e2                mov    %rsp,%rdx
  4009cf:       49 c1 e2 20             shl    $0x20,%r10
  4009d3:       31 ff                   xor    %edi,%edi
  4009d5:       49 09 c2                or     %rax,%r10
  4009d8:       0f 1f 84 00 00 00 00    nopl   0x0(%rax,%rax,1)
  4009df:       00 

   vvvv
  4009e0:       8b 4a 04                mov    0x4(%rdx),%ecx
  4009e3:       48 83 c2 08             add    $0x8,%rdx
  4009e7:       0f af 4a f8             imul   -0x8(%rdx),%ecx
  4009eb:       48 39 d5                cmp    %rdx,%rbp
  4009ee:       8d 34 39                lea    (%rcx,%rdi,1),%esi
  4009f1:       89 f7                   mov    %esi,%edi
  4009f3:       75 eb                   jne    4009e0 <main+0x90>
   ^^^^

  4009f5:       44 89 c8                mov    %r9d,%eax
  4009f8:       0f a2                   cpuid  
  4009fa:       0f 31                   rdtsc

And here is the heavily unrolled SSE2 code from icc (part of the loop, 1184 iterations, was vectorized, and the tail is handled after the loop):

  400e4c:       33 c9                   xor    %ecx,%ecx
  400e4e:       49 89 cd                mov    %rcx,%r13
  400e51:       33 c0                   xor    %eax,%eax
  400e53:       0f a2                   cpuid  
  400e55:       0f 31                   rdtsc  
  400e57:       66 0f ef c9             pxor   %xmm1,%xmm1
  400e5b:       66 0f 6f 05 7d 2f 00    movdqa 0x2f7d(%rip),%xmm0         
  400e62:       00 
  400e63:       41 89 c4                mov    %eax,%r12d
  400e66:       33 c0                   xor    %eax,%eax

   vvvv
  400e68:       66 0f 6f 9c c4 80 38    movdqa 0x13880(%rsp,%rax,8),%xmm3
  400e6f:       01 00 
  400e71:       66 0f 6f 94 c4 90 38    movdqa 0x13890(%rsp,%rax,8),%xmm2
  400e78:       01 00 
  400e7a:       66 0f 6f f3             movdqa %xmm3,%xmm6
  400e7e:       66 0f 62 f2             punpckldq %xmm2,%xmm6
  400e82:       66 0f 6a da             punpckhdq %xmm2,%xmm3
  400e86:       66 0f 6f fe             movdqa %xmm6,%xmm7
  400e8a:       66 0f 62 fb             punpckldq %xmm3,%xmm7
  400e8e:       66 0f 6f ac c4 a0 38    movdqa 0x138a0(%rsp,%rax,8),%xmm5
  400e95:       01 00 
  400e97:       66 44 0f 6f d7          movdqa %xmm7,%xmm10
  400e9c:       66 0f 6a f3             punpckhdq %xmm3,%xmm6
  400ea0:       66 44 0f 6f c5          movdqa %xmm5,%xmm8
  400ea5:       66 0f 6f a4 c4 b0 38    movdqa 0x138b0(%rsp,%rax,8),%xmm4
  400eac:       01 00 
  400eae:       66 0f 73 d7 20          psrlq  $0x20,%xmm7
  400eb3:       66 44 0f f4 d6          pmuludq %xmm6,%xmm10
  400eb8:       66 0f 73 d6 20          psrlq  $0x20,%xmm6
  400ebd:       66 0f f4 fe             pmuludq %xmm6,%xmm7
  400ec1:       66 44 0f 6f ac c4 c0    movdqa 0x138c0(%rsp,%rax,8),%xmm13
  400ec8:       38 01 00 
  400ecb:       66 44 0f db d0          pand   %xmm0,%xmm10
  400ed0:       66 44 0f 62 c4          punpckldq %xmm4,%xmm8
  400ed5:       66 45 0f 6f f5          movdqa %xmm13,%xmm14
  400eda:       66 44 0f 6f a4 c4 d0    movdqa 0x138d0(%rsp,%rax,8),%xmm12
  400ee1:       38 01 00 
  400ee4:       66 45 0f 6f c8          movdqa %xmm8,%xmm9
  400ee9:       66 0f 6a ec             punpckhdq %xmm4,%xmm5
  400eed:       66 0f 73 f7 20          psllq  $0x20,%xmm7
  400ef2:       66 0f 6f a4 c4 e0 38    movdqa 0x138e0(%rsp,%rax,8),%xmm4
  400ef9:       01 00 
  400efb:       66 44 0f eb d7          por    %xmm7,%xmm10
  400f00:       66 0f 6f 9c c4 f0 38    movdqa 0x138f0(%rsp,%rax,8),%xmm3
  400f07:       01 00 
  400f09:       66 41 0f fe ca          paddd  %xmm10,%xmm1
  400f0e:       66 44 0f 62 cd          punpckldq %xmm5,%xmm9
  400f13:       48 83 c0 10             add    $0x10,%rax
  400f17:       66 44 0f 6a c5          punpckhdq %xmm5,%xmm8
  400f1c:       66 0f 6f ec             movdqa %xmm4,%xmm5
  400f20:       66 45 0f 62 f4          punpckldq %xmm12,%xmm14
  400f25:       66 45 0f 6f d9          movdqa %xmm9,%xmm11
  400f2a:       66 45 0f 6a ec          punpckhdq %xmm12,%xmm13
  400f2f:       66 45 0f 6f fe          movdqa %xmm14,%xmm15
  400f34:       66 0f 62 eb             punpckldq %xmm3,%xmm5
  400f38:       66 41 0f 73 d1 20       psrlq  $0x20,%xmm9
  400f3e:       66 45 0f 62 fd          punpckldq %xmm13,%xmm15
  400f43:       66 0f 6f f5             movdqa %xmm5,%xmm6
  400f47:       66 0f 6a e3             punpckhdq %xmm3,%xmm4
  400f4b:       66 41 0f 6f d7          movdqa %xmm15,%xmm2
  400f50:       66 45 0f f4 d8          pmuludq %xmm8,%xmm11
  400f55:       66 41 0f 73 d0 20       psrlq  $0x20,%xmm8
  400f5b:       66 45 0f f4 c8          pmuludq %xmm8,%xmm9
  400f60:       66 45 0f 6a f5          punpckhdq %xmm13,%xmm14
  400f65:       66 41 0f 73 d7 20       psrlq  $0x20,%xmm15
  400f6b:       66 0f 62 f4             punpckldq %xmm4,%xmm6
  400f6f:       66 44 0f db d8          pand   %xmm0,%xmm11
  400f74:       66 41 0f f4 d6          pmuludq %xmm14,%xmm2
  400f79:       66 41 0f 73 d6 20       psrlq  $0x20,%xmm14
  400f7f:       66 45 0f f4 fe          pmuludq %xmm14,%xmm15
  400f84:       66 0f 6a ec             punpckhdq %xmm4,%xmm5
  400f88:       66 0f 6f fe             movdqa %xmm6,%xmm7
  400f8c:       66 0f f4 fd             pmuludq %xmm5,%xmm7
  400f90:       66 0f 73 d6 20          psrlq  $0x20,%xmm6
  400f95:       66 0f 73 d5 20          psrlq  $0x20,%xmm5
  400f9a:       66 41 0f 73 f1 20       psllq  $0x20,%xmm9
  400fa0:       66 0f f4 f5             pmuludq %xmm5,%xmm6
  400fa4:       66 45 0f eb d9          por    %xmm9,%xmm11
  400fa9:       66 0f db d0             pand   %xmm0,%xmm2
  400fad:       66 41 0f 73 f7 20       psllq  $0x20,%xmm15
  400fb3:       66 41 0f fe cb          paddd  %xmm11,%xmm1
  400fb8:       66 41 0f eb d7          por    %xmm15,%xmm2
  400fbd:       66 0f db f8             pand   %xmm0,%xmm7
  400fc1:       66 0f 73 f6 20          psllq  $0x20,%xmm6
  400fc6:       66 0f fe ca             paddd  %xmm2,%xmm1
  400fca:       66 0f eb fe             por    %xmm6,%xmm7
  400fce:       66 0f fe cf             paddd  %xmm7,%xmm1
  400fd2:       48 3d 50 02 00 00       cmp    $0x250,%rax
  400fd8:       0f 82 8a fe ff ff       jb     400e68 <main+0xe8>
   ^^^^

  400fde:       66 0f 6f c1             movdqa %xmm1,%xmm0
  400fe2:       66 0f 73 d8 08          psrldq $0x8,%xmm0
  400fe7:       66 0f fe c8             paddd  %xmm0,%xmm1
  400feb:       66 0f 6f d1             movdqa %xmm1,%xmm2
  400fef:       8b 84 24 00 4b 01 00    mov    0x14b00(%rsp),%eax
  400ff6:       66 0f 73 d2 20          psrlq  $0x20,%xmm2
  400ffb:       0f af 84 24 04 4b 01    imul   0x14b04(%rsp),%eax
  401002:       00 
  401003:       66 0f fe ca             paddd  %xmm2,%xmm1
  401007:       66 0f 7e cb             movd   %xmm1,%ebx
  40100b:       8b 94 24 08 4b 01 00    mov    0x14b08(%rsp),%edx
  401012:       03 d8                   add    %eax,%ebx
  401014:       0f af 94 24 0c 4b 01    imul   0x14b0c(%rsp),%edx
  40101b:       00 
  40101c:       8b b4 24 10 4b 01 00    mov    0x14b10(%rsp),%esi
  401023:       03 da                   add    %edx,%ebx
  401025:       0f af b4 24 14 4b 01    imul   0x14b14(%rsp),%esi
  40102c:       00 
  40102d:       8b bc 24 18 4b 01 00    mov    0x14b18(%rsp),%edi
  401034:       03 de                   add    %esi,%ebx
  401036:       0f af bc 24 1c 4b 01    imul   0x14b1c(%rsp),%edi
  40103d:       00 
  40103e:       44 8b 84 24 20 4b 01    mov    0x14b20(%rsp),%r8d
  401045:       00 
  401046:       03 df                   add    %edi,%ebx
  401048:       44 0f af 84 24 24 4b    imul   0x14b24(%rsp),%r8d
  40104f:       01 00 
  401051:       44 8b 8c 24 28 4b 01    mov    0x14b28(%rsp),%r9d
  401058:       00 
  401059:       41 03 d8                add    %r8d,%ebx
  40105c:       44 0f af 8c 24 2c 4b    imul   0x14b2c(%rsp),%r9d
  401063:       01 00 
  401065:       44 8b 94 24 30 4b 01    mov    0x14b30(%rsp),%r10d
  40106c:       00 
  40106d:       41 03 d9                add    %r9d,%ebx
  401070:       44 0f af 94 24 34 4b    imul   0x14b34(%rsp),%r10d
  401077:       01 00 
  401079:       44 8b 9c 24 38 4b 01    mov    0x14b38(%rsp),%r11d
  401080:       00 
  401081:       41 03 da                add    %r10d,%ebx
  401084:       44 0f af 9c 24 3c 4b    imul   0x14b3c(%rsp),%r11d
  40108b:       01 00 
  40108d:       41 03 db                add    %r11d,%ebx
  401090:       e8 eb 00 00 00          callq  401180 <_Z5rdtscv>

Upvotes: 1

Tony Delroy

Reputation: 106066

ICC sucks here because it's working out the addresses for each data[n] access ala mov edi,dword ptr [rsp+rax*4+44h]... all that run-time multiplication is expensive. You should be able to avoid it by recoding so the indices are constants (could also use *p_data++ three times, but that introduces a sequencing issue that may adversely affect performance).

// Walks the array three elements at a time through a pointer, so that every
// access inside the body uses a constant offset (0, 1, 2) from p_data.
// data, Accum1 and Accum2 are declared outside this fragment.
// NOTE(review): if data is an int array, as in the question's listing, an int*
// does not convert implicitly to unsigned*; the declaration would then need a
// cast or a matching pointer type -- confirm against the real declaration.
for (unsigned* p_data = &data[0], *p_end = data + 1800; p_data < p_end; p_data += 3)
{
    unsigned level1 = p_data[0];
    unsigned level2 = p_data[1];
    unsigned factor = p_data[2];   // shared multiplier for both accumulators

    Accum1 += level1 * factor;
    Accum2 += level2 * factor;
}

Upvotes: 3

Intel compiler produces code 68% slower than MSVC (full example provided)

Answers (2)

Related Questions