Reputation: 21
I've been using ICC on my project, and ICC utilizes vector instructions very well. Recently I tried to use GCC (version 5.5) to compile the same code; however, on some modules, GCC's version is 10 times slower than ICC's. This happens when I do complex multiplication, etc.
A sample of the code looks like this:
Definitions:
float *ptr1 = _mm_malloc(1280 , 64);
float *ptr2 = _mm_malloc(1280 , 64);
float complex *realptr1 = (float complex *)&ptr1[storageOffset];
float complex *realptr2 = (float complex *)&ptr2[storageOffset];
Pragma and compiler options:
__assume_aligned(realptr1, 64);
__assume_aligned(realptr2, 64);
#pragma ivdep
#pragma vector aligned
for (j = 0; j < 512; j++) {
float complex derSlot0 = realptr1[j] * realptr2[j];
float complex derSlot1 = realptr1[j] + realptr2[j];
realptr1[j] = derSlot0;
realptr2[j] = derSlot1;
}
ICC's compiled output for the main loop looks like this:
..B1.6: # Preds ..B1.6 ..B1.5
# Execution count [5.12e+02]
vmovups 32(%r15,%rdx,8), %ymm9 #35.29
lea (%r15,%rdx,8), %rax #37.5
vmovups (%rax), %ymm3 #35.29
vaddps 32(%rbx,%rdx,8), %ymm9, %ymm11 #36.43
vaddps (%rbx,%rdx,8), %ymm3, %ymm5 #36.43
vmovshdup 32(%rbx,%rdx,8), %ymm6 #35.43
vshufps $177, %ymm9, %ymm9, %ymm7 #35.43
vmulps %ymm7, %ymm6, %ymm8 #35.43
vmovshdup (%rbx,%rdx,8), %ymm0 #35.43
vshufps $177, %ymm3, %ymm3, %ymm1 #35.43
vmulps %ymm1, %ymm0, %ymm2 #35.43
vmovsldup 32(%rbx,%rdx,8), %ymm10 #35.43
vfmaddsub213ps %ymm8, %ymm9, %ymm10 #35.43
vmovups %ymm11, 32(%rbx,%rdx,8) #38.5
vmovups %ymm10, 32(%rax) #37.5
vmovsldup (%rbx,%rdx,8), %ymm4 #35.43
vfmaddsub213ps %ymm2, %ymm3, %ymm4 #35.43
vmovups %ymm5, (%rbx,%rdx,8) #38.5
vmovups %ymm4, (%rax) #37.5
addq $8, %rdx #32.3
cmpq $512, %rdx #32.3
jb ..B1.6 # Prob 99% #32.3
The command line used for icc is: icc -march=core-avx2 -S -fsource-asm -c test.c
For GCC, what I've already done includes: replacing "#pragma ivdep" with "#pragma GCC ivdep", and replacing "__assume_aligned(realptr1, 64);" with "realptr1 = __builtin_assume_aligned(realptr1, 64);".
The command for GCC is: gcc -c -O2 -ftree-vectorize -mavx2 -g -Wa,-a,-ad gcctest.c
and the result for the same loop is something like this:
109 .L7:
110 00d8 C5FA103B vmovss (%rbx), %xmm7
111 00dc 4883C308 addq $8, %rbx
112 00e0 C5FA1073 vmovss -4(%rbx), %xmm6
112 FC
113 00e5 4983C408 addq $8, %r12
114 00e9 C4C17A10 vmovss -8(%r12), %xmm5
114 6C24F8
115 00f0 C4C17A10 vmovss -4(%r12), %xmm4
115 6424FC
116 .LBB2:
117 .loc 1 35 0 discriminator 3
118 00f7 C5F828C7 vmovaps %xmm7, %xmm0
119 00fb C5F828CE vmovaps %xmm6, %xmm1
120 00ff C5FA1165 vmovss %xmm4, -80(%rbp)
120 B0
121 0104 C5F828DC vmovaps %xmm4, %xmm3
122 0108 C5FA116D vmovss %xmm5, -76(%rbp)
122 B4
123 010d C5F828D5 vmovaps %xmm5, %xmm2
124 0111 C5FA1175 vmovss %xmm6, -72(%rbp)
124 B8
125 0116 C5FA117D vmovss %xmm7, -68(%rbp)
125 BC
126 011b E8000000 call __mulsc3
126 00
127 .LVL7:
128 .loc 1 38 0 discriminator 3
129 0120 C5FA107D vmovss -68(%rbp), %xmm7
129 BC
130 0125 C5FA106D vmovss -76(%rbp), %xmm5
130 B4
131 012a C5FA1075 vmovss -72(%rbp), %xmm6
131 B8
132 012f C5D258EF vaddss %xmm7, %xmm5, %xmm5
133 0133 C5FA1065 vmovss -80(%rbp), %xmm4
133 B0
134 .loc 1 35 0 discriminator 3
135 0138 C5F9D645 vmovq %xmm0, -56(%rbp)
135 C8
136 .loc 1 38 0 discriminator 3
137 013d C5DA58E6 vaddss %xmm6, %xmm4, %xmm4
138 .loc 1 35 0 discriminator 3
139 0141 C5FA1045 vmovss -52(%rbp), %xmm0
139 CC
140 .LVL8:
141 .loc 1 37 0 discriminator 3
142 0146 C5FA104D vmovss -56(%rbp), %xmm1
142 C8
143 014b C5FA114B vmovss %xmm1, -8(%rbx)
143 F8
144 .LVL9:
145 0150 C5FA1143 vmovss %xmm0, -4(%rbx)
145 FC
146 .loc 1 38 0 discriminator 3
147 0155 C4C17A11 vmovss %xmm5, -8(%r12)
147 6C24F8
148 015c C4C17A11 vmovss %xmm4, -4(%r12)
148 6424FC
149 .LBE2:
150 .loc 1 32 0 discriminator 3
151 0163 4C39EB cmpq %r13, %rbx
152 0166 0F856CFF jne .L7
152 FFFF
So, I can see that GCC uses some kind of vector instructions, but the result is still not as good as ICC's.
My question is: are there any more options I can use to make GCC perform better?
Thanks a lot.
Upvotes: 2
Views: 447
Reputation: 1516
You didn't post the full code to test, but you can start by adding
-ffast-math
and optionally
-mfma
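(-ffast-math implies -fcx-limited-range, which lets GCC expand the complex multiplication inline instead of calling __mulsc3 for every element, so the loop can be vectorized; -mfma enables the FMA instructions, which -mavx2 alone does not turn on.) With these options you will end up with, more or less: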
vmovaps ymm0, YMMWORD PTR [rbx+rax]
vmovaps ymm3, YMMWORD PTR [r12+rax]
vpermilps ymm2, ymm0, 177
vpermilps ymm4, ymm3, 245
vpermilps ymm1, ymm3, 160
vmulps ymm2, ymm2, ymm4
vmovaps ymm4, ymm0
vfmsub132ps ymm4, ymm2, ymm1
vfmadd132ps ymm1, ymm2, ymm0
vaddps ymm0, ymm0, ymm3
vmovaps YMMWORD PTR [rbx+rax], ymm0
vblendps ymm1, ymm4, ymm1, 170
vmovaps YMMWORD PTR [r12+rax], ymm1
add rax, 32
cmp rax, 4096
jne .L6
Upvotes: 1