Reputation: 30605
I have C++ code processing three consecutive values from one single 1800-element array. The code compiled by ICC 14.0 is approximately 68% slower (1600 vs 2700 CPU cycles) than the code produced by the MSVC. I cannot understand why. Could somebody please help? Even when I set the Intel compiler -O3 switch it doesn't change the timing. The CPU is Ivy Bridge.
#include <iostream>
int main(){
int data[1200];
//Dummy-populate data
for(int y=0; y<1200; y++){
data[y] = y/2 + 7;
}
int counter = 0;
//Just to repeat the test
while(counter < 10000){
int Accum = 0;
long long start = 0;
long long end = 0;
int p = 0;
start = __rdtsc();
while(p < 1200){
unsigned int level1 = data[p];
unsigned int factor = data[p + 1];
Accum += (level1 * factor);
p = p + 2;
}
end = __rdtsc();
std::cout << (end - start) << " " << Accum << std::endl;
counter++;
}
}
Upvotes: 6
Views: 479
Reputation: 94175
user997112, I tested your new code (only one level and accum), and I get only 5% difference between gcc and icc with -O3
option (-march=native -mtune=native
may help you). I have Core 2 Q6600 fixed on 2.4 GHz, best results are 1800 for gcc and 1900 for icc.
Here is my version of test (rdtsc()
redefined with gnu asm, runtimes are saved in array, and only minimal (best) runtime is printed:
$ cat my.cc
#include <iostream>
#if 1
// my cpu has no rdtscp, so use asm
inline unsigned long long rdtsc() __attribute__((always_inline));
inline unsigned long long rdtsc() {
unsigned int lo, hi;
asm volatile (
"cpuid \n"
"rdtsc"
: "=a"(lo), "=d"(hi) /* outputs */
: "a"(0) /* inputs */
: "%ebx", "%ecx"); /* clobbers*/
return ((unsigned long long)lo) | (((unsigned long long)hi) << 32);
}
#else
#define rdtsc __rdtsc
#endif
int main(){
int data[1200];
int dummy[10000];
int stats[10000];
//Dummy-populate data
for(int y=0; y<1200; y++){
data[y] = y/2 + 7;
}
for(int y=0; y<10000; y++){
stats[y]=0;
}
int counter = 0;
//Just to repeat the test
while(counter < 10000){
int Accum = 0;
long long start = 0;
long long end = 0;
int p = 0;
start = rdtsc();
while(p < 1200){
unsigned int level1 = data[p];
unsigned int factor = data[p + 1];
Accum += (level1 * factor);
p = p + 2;
}
end = rdtsc();
stats[counter]=(end - start);
dummy[counter]=Accum;
counter++;
}
int min=0xfffff;
for(int y=0; y<10000; y++) {
if(stats[y] < min) {
min = stats[y];
std::cout << min << std::endl;
std::cout << "accum " << dummy[y] << std::endl;
}
}
std::cout << min << std::endl;
}
Compiled with icc 14 and gcc 4.8 as:
$ g++ my.cc -o mygccO3t -O3 -march=native -mtune=native
$ icc my.cc -o myiccO3t -O3 -march=native -mtune=native
Results (CPU frequency changing is disabled at 2.4 GHz, core is fixed by taskset
, measured by Linux PMU access tool perf
):
$ taskset -c 3 perf stat -e cycles:u,instructions:u ./myiccO3t |tail -n 1
Performance counter stats for './myiccO3t':
23 875 260 cycles:u
28 866 440 instructions:u # 1,21 insns per cycle
0,011297567 seconds time elapsed
1899
$ taskset -c 3 perf stat -e cycles:u,instructions:u ./mygccO3t |tail -n 1
Performance counter stats for './mygccO3t':
22 389 238 cycles:u
43 551 129 instructions:u # 1,95 insns per cycle
0,010683920 seconds time elapsed
1800
So, we can see, that gcc needs much more instructions to handle same amount of data, but also it achieves better IPC (instruction per clock) rate.
There is simple assembler code for inner loop from gcc:
4009b9: 45 31 c0 xor %r8d,%r8d
4009bc: 45 31 c9 xor %r9d,%r9d
4009bf: 90 nop
4009c0: 44 89 c8 mov %r9d,%eax
4009c3: 0f a2 cpuid
4009c5: 0f 31 rdtsc
4009c7: 49 89 d2 mov %rdx,%r10
4009ca: 89 c0 mov %eax,%eax
4009cc: 48 89 e2 mov %rsp,%rdx
4009cf: 49 c1 e2 20 shl $0x20,%r10
4009d3: 31 ff xor %edi,%edi
4009d5: 49 09 c2 or %rax,%r10
4009d8: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
4009df: 00
vvvv
4009e0: 8b 4a 04 mov 0x4(%rdx),%ecx
4009e3: 48 83 c2 08 add $0x8,%rdx
4009e7: 0f af 4a f8 imul -0x8(%rdx),%ecx
4009eb: 48 39 d5 cmp %rdx,%rbp
4009ee: 8d 34 39 lea (%rcx,%rdi,1),%esi
4009f1: 89 f7 mov %esi,%edi
4009f3: 75 eb jne 4009e0 <main+0x90>
^^^^
4009f5: 44 89 c8 mov %r9d,%eax
4009f8: 0f a2 cpuid
4009fa: 0f 31 rdtsc
And heavy SSE2/unroll from icc (part of loop, 1184 iterations, was vectorized, and tail is handled after loop):
400e4c: 33 c9 xor %ecx,%ecx
400e4e: 49 89 cd mov %rcx,%r13
400e51: 33 c0 xor %eax,%eax
400e53: 0f a2 cpuid
400e55: 0f 31 rdtsc
400e57: 66 0f ef c9 pxor %xmm1,%xmm1
400e5b: 66 0f 6f 05 7d 2f 00 movdqa 0x2f7d(%rip),%xmm0
400e62: 00
400e63: 41 89 c4 mov %eax,%r12d
400e66: 33 c0 xor %eax,%eax
vvvv
400e68: 66 0f 6f 9c c4 80 38 movdqa 0x13880(%rsp,%rax,8),%xmm3
400e6f: 01 00
400e71: 66 0f 6f 94 c4 90 38 movdqa 0x13890(%rsp,%rax,8),%xmm2
400e78: 01 00
400e7a: 66 0f 6f f3 movdqa %xmm3,%xmm6
400e7e: 66 0f 62 f2 punpckldq %xmm2,%xmm6
400e82: 66 0f 6a da punpckhdq %xmm2,%xmm3
400e86: 66 0f 6f fe movdqa %xmm6,%xmm7
400e8a: 66 0f 62 fb punpckldq %xmm3,%xmm7
400e8e: 66 0f 6f ac c4 a0 38 movdqa 0x138a0(%rsp,%rax,8),%xmm5
400e95: 01 00
400e97: 66 44 0f 6f d7 movdqa %xmm7,%xmm10
400e9c: 66 0f 6a f3 punpckhdq %xmm3,%xmm6
400ea0: 66 44 0f 6f c5 movdqa %xmm5,%xmm8
400ea5: 66 0f 6f a4 c4 b0 38 movdqa 0x138b0(%rsp,%rax,8),%xmm4
400eac: 01 00
400eae: 66 0f 73 d7 20 psrlq $0x20,%xmm7
400eb3: 66 44 0f f4 d6 pmuludq %xmm6,%xmm10
400eb8: 66 0f 73 d6 20 psrlq $0x20,%xmm6
400ebd: 66 0f f4 fe pmuludq %xmm6,%xmm7
400ec1: 66 44 0f 6f ac c4 c0 movdqa 0x138c0(%rsp,%rax,8),%xmm13
400ec8: 38 01 00
400ecb: 66 44 0f db d0 pand %xmm0,%xmm10
400ed0: 66 44 0f 62 c4 punpckldq %xmm4,%xmm8
400ed5: 66 45 0f 6f f5 movdqa %xmm13,%xmm14
400eda: 66 44 0f 6f a4 c4 d0 movdqa 0x138d0(%rsp,%rax,8),%xmm12
400ee1: 38 01 00
400ee4: 66 45 0f 6f c8 movdqa %xmm8,%xmm9
400ee9: 66 0f 6a ec punpckhdq %xmm4,%xmm5
400eed: 66 0f 73 f7 20 psllq $0x20,%xmm7
400ef2: 66 0f 6f a4 c4 e0 38 movdqa 0x138e0(%rsp,%rax,8),%xmm4
400ef9: 01 00
400efb: 66 44 0f eb d7 por %xmm7,%xmm10
400f00: 66 0f 6f 9c c4 f0 38 movdqa 0x138f0(%rsp,%rax,8),%xmm3
400f07: 01 00
400f09: 66 41 0f fe ca paddd %xmm10,%xmm1
400f0e: 66 44 0f 62 cd punpckldq %xmm5,%xmm9
400f13: 48 83 c0 10 add $0x10,%rax
400f17: 66 44 0f 6a c5 punpckhdq %xmm5,%xmm8
400f1c: 66 0f 6f ec movdqa %xmm4,%xmm5
400f20: 66 45 0f 62 f4 punpckldq %xmm12,%xmm14
400f25: 66 45 0f 6f d9 movdqa %xmm9,%xmm11
400f2a: 66 45 0f 6a ec punpckhdq %xmm12,%xmm13
400f2f: 66 45 0f 6f fe movdqa %xmm14,%xmm15
400f34: 66 0f 62 eb punpckldq %xmm3,%xmm5
400f38: 66 41 0f 73 d1 20 psrlq $0x20,%xmm9
400f3e: 66 45 0f 62 fd punpckldq %xmm13,%xmm15
400f43: 66 0f 6f f5 movdqa %xmm5,%xmm6
400f47: 66 0f 6a e3 punpckhdq %xmm3,%xmm4
400f4b: 66 41 0f 6f d7 movdqa %xmm15,%xmm2
400f50: 66 45 0f f4 d8 pmuludq %xmm8,%xmm11
400f55: 66 41 0f 73 d0 20 psrlq $0x20,%xmm8
400f5b: 66 45 0f f4 c8 pmuludq %xmm8,%xmm9
400f60: 66 45 0f 6a f5 punpckhdq %xmm13,%xmm14
400f65: 66 41 0f 73 d7 20 psrlq $0x20,%xmm15
400f6b: 66 0f 62 f4 punpckldq %xmm4,%xmm6
400f6f: 66 44 0f db d8 pand %xmm0,%xmm11
400f74: 66 41 0f f4 d6 pmuludq %xmm14,%xmm2
400f79: 66 41 0f 73 d6 20 psrlq $0x20,%xmm14
400f7f: 66 45 0f f4 fe pmuludq %xmm14,%xmm15
400f84: 66 0f 6a ec punpckhdq %xmm4,%xmm5
400f88: 66 0f 6f fe movdqa %xmm6,%xmm7
400f8c: 66 0f f4 fd pmuludq %xmm5,%xmm7
400f90: 66 0f 73 d6 20 psrlq $0x20,%xmm6
400f95: 66 0f 73 d5 20 psrlq $0x20,%xmm5
400f9a: 66 41 0f 73 f1 20 psllq $0x20,%xmm9
400fa0: 66 0f f4 f5 pmuludq %xmm5,%xmm6
400fa4: 66 45 0f eb d9 por %xmm9,%xmm11
400fa9: 66 0f db d0 pand %xmm0,%xmm2
400fad: 66 41 0f 73 f7 20 psllq $0x20,%xmm15
400fb3: 66 41 0f fe cb paddd %xmm11,%xmm1
400fb8: 66 41 0f eb d7 por %xmm15,%xmm2
400fbd: 66 0f db f8 pand %xmm0,%xmm7
400fc1: 66 0f 73 f6 20 psllq $0x20,%xmm6
400fc6: 66 0f fe ca paddd %xmm2,%xmm1
400fca: 66 0f eb fe por %xmm6,%xmm7
400fce: 66 0f fe cf paddd %xmm7,%xmm1
400fd2: 48 3d 50 02 00 00 cmp $0x250,%rax
400fd8: 0f 82 8a fe ff ff jb 400e68 <main+0xe8>
^^^^
400fde: 66 0f 6f c1 movdqa %xmm1,%xmm0
400fe2: 66 0f 73 d8 08 psrldq $0x8,%xmm0
400fe7: 66 0f fe c8 paddd %xmm0,%xmm1
400feb: 66 0f 6f d1 movdqa %xmm1,%xmm2
400fef: 8b 84 24 00 4b 01 00 mov 0x14b00(%rsp),%eax
400ff6: 66 0f 73 d2 20 psrlq $0x20,%xmm2
400ffb: 0f af 84 24 04 4b 01 imul 0x14b04(%rsp),%eax
401002: 00
401003: 66 0f fe ca paddd %xmm2,%xmm1
401007: 66 0f 7e cb movd %xmm1,%ebx
40100b: 8b 94 24 08 4b 01 00 mov 0x14b08(%rsp),%edx
401012: 03 d8 add %eax,%ebx
401014: 0f af 94 24 0c 4b 01 imul 0x14b0c(%rsp),%edx
40101b: 00
40101c: 8b b4 24 10 4b 01 00 mov 0x14b10(%rsp),%esi
401023: 03 da add %edx,%ebx
401025: 0f af b4 24 14 4b 01 imul 0x14b14(%rsp),%esi
40102c: 00
40102d: 8b bc 24 18 4b 01 00 mov 0x14b18(%rsp),%edi
401034: 03 de add %esi,%ebx
401036: 0f af bc 24 1c 4b 01 imul 0x14b1c(%rsp),%edi
40103d: 00
40103e: 44 8b 84 24 20 4b 01 mov 0x14b20(%rsp),%r8d
401045: 00
401046: 03 df add %edi,%ebx
401048: 44 0f af 84 24 24 4b imul 0x14b24(%rsp),%r8d
40104f: 01 00
401051: 44 8b 8c 24 28 4b 01 mov 0x14b28(%rsp),%r9d
401058: 00
401059: 41 03 d8 add %r8d,%ebx
40105c: 44 0f af 8c 24 2c 4b imul 0x14b2c(%rsp),%r9d
401063: 01 00
401065: 44 8b 94 24 30 4b 01 mov 0x14b30(%rsp),%r10d
40106c: 00
40106d: 41 03 d9 add %r9d,%ebx
401070: 44 0f af 94 24 34 4b imul 0x14b34(%rsp),%r10d
401077: 01 00
401079: 44 8b 9c 24 38 4b 01 mov 0x14b38(%rsp),%r11d
401080: 00
401081: 41 03 da add %r10d,%ebx
401084: 44 0f af 9c 24 3c 4b imul 0x14b3c(%rsp),%r11d
40108b: 01 00
40108d: 41 03 db add %r11d,%ebx
401090: e8 eb 00 00 00 callq 401180 <_Z5rdtscv>
Upvotes: 1
Reputation: 106066
ICC sucks here because it's working out the addresses for each data[n]
access ala mov edi,dword ptr [rsp+rax*4+44h]
... all that run-time multiplication is expensive. You should be able to avoid it by recoding so the indices are constants (could also use *p_data++
three times, but that introduces a sequencing issue that may adversely affect performance).
for (unsigned* p_data = &data[0], *p_end = data + 1800; p_data < p_end; p_data += 3)
{
unsigned level1 = p_data[0];
unsigned level2 = p_data[1];
unsigned factor = p_data[2];
Accum1 += level1 * factor;
Accum2 += level2 * factor;
}
Upvotes: 3