How to make GCC generate vector instructions as ICC does?

Question

I've been using ICC on my project, and ICC makes very good use of vector instructions. Recently I tried to use GCC (version 5.5) to compile the same code; however, on some modules, GCC's version is 10 times slower than ICC's. This happens when I do complex multiplication, etc.

Sample code looks like this:

Definitions:

float *ptr1 = _mm_malloc(1280 , 64);
float *ptr2 = _mm_malloc(1280 , 64);
float complex *realptr1 = (float complex *)&ptr1[storageOffset];
float complex *realptr2 = (float complex *)&ptr2[storageOffset];

Pragma and compiler options:

__assume_aligned(realptr1, 64);
__assume_aligned(realptr2, 64);
#pragma ivdep
#pragma vector aligned

for (j = 0; j < 512; j++) {
  float complex derSlot0 = realptr1[j] * realptr2[j];
  float complex derSlot1 = realptr1[j] + realptr2[j];
  realptr1[j] = derSlot0;
  realptr2[j] = derSlot1;
}

ICC's compiled output for the main loop looks like this:


    ..B1.6:                         # Preds ..B1.6 ..B1.5
                                    # Execution count [5.12e+02]
            vmovups   32(%r15,%rdx,8), %ymm9                        #35.29
            lea       (%r15,%rdx,8), %rax                           #37.5
            vmovups   (%rax), %ymm3                                 #35.29
            vaddps    32(%rbx,%rdx,8), %ymm9, %ymm11                #36.43
            vaddps    (%rbx,%rdx,8), %ymm3, %ymm5                   #36.43
            vmovshdup 32(%rbx,%rdx,8), %ymm6                        #35.43
            vshufps   $177, %ymm9, %ymm9, %ymm7                     #35.43
            vmulps    %ymm7, %ymm6, %ymm8                           #35.43
            vmovshdup (%rbx,%rdx,8), %ymm0                          #35.43
            vshufps   $177, %ymm3, %ymm3, %ymm1                     #35.43
            vmulps    %ymm1, %ymm0, %ymm2                           #35.43
            vmovsldup 32(%rbx,%rdx,8), %ymm10                       #35.43
            vfmaddsub213ps %ymm8, %ymm9, %ymm10                     #35.43
            vmovups   %ymm11, 32(%rbx,%rdx,8)                       #38.5
            vmovups   %ymm10, 32(%rax)                              #37.5
            vmovsldup (%rbx,%rdx,8), %ymm4                          #35.43
            vfmaddsub213ps %ymm2, %ymm3, %ymm4                      #35.43
            vmovups   %ymm5, (%rbx,%rdx,8)                          #38.5
            vmovups   %ymm4, (%rax)                                 #37.5
            addq      $8, %rdx                                      #32.3
            cmpq      $512, %rdx                                    #32.3
            jb        ..B1.6        # Prob 99%                      #32.3

The command line used for icc is: icc -march=core-avx2 -S -fsource-asm -c test.c

For GCC, what I've already done includes: replacing "#pragma ivdep" with "#pragma GCC ivdep", and replacing "__assume_aligned(realptr1, 64);" with "realptr1 = __builtin_assume_aligned(realptr1, 64);".

The command for GCC is: gcc -c -O2 -ftree-vectorize -mavx2 -g -Wa,-a,-ad gcctest.c

and the result for the same loop is something like this:


     109                .L7:
     110 00d8 C5FA103B      vmovss  (%rbx), %xmm7
     111 00dc 4883C308      addq    $8, %rbx
     112 00e0 C5FA1073      vmovss  -4(%rbx), %xmm6
     112      FC
     113 00e5 4983C408      addq    $8, %r12
     114 00e9 C4C17A10      vmovss  -8(%r12), %xmm5
     114      6C24F8
     115 00f0 C4C17A10      vmovss  -4(%r12), %xmm4
     115      6424FC
     116                .LBB2:
     117                    .loc 1 35 0 discriminator 3
     118 00f7 C5F828C7      vmovaps %xmm7, %xmm0
     119 00fb C5F828CE      vmovaps %xmm6, %xmm1
     120 00ff C5FA1165      vmovss  %xmm4, -80(%rbp)
     120      B0
     121 0104 C5F828DC      vmovaps %xmm4, %xmm3
     122 0108 C5FA116D      vmovss  %xmm5, -76(%rbp)
     122      B4
     123 010d C5F828D5      vmovaps %xmm5, %xmm2
     124 0111 C5FA1175      vmovss  %xmm6, -72(%rbp)
     124      B8
     125 0116 C5FA117D      vmovss  %xmm7, -68(%rbp)
     125      BC
     126 011b E8000000      call    __mulsc3
     126      00
     127                .LVL7:
     128                    .loc 1 38 0 discriminator 3
     129 0120 C5FA107D      vmovss  -68(%rbp), %xmm7
     129      BC
     130 0125 C5FA106D      vmovss  -76(%rbp), %xmm5
     130      B4
     131 012a C5FA1075      vmovss  -72(%rbp), %xmm6
     131      B8
     132 012f C5D258EF      vaddss  %xmm7, %xmm5, %xmm5
     133 0133 C5FA1065      vmovss  -80(%rbp), %xmm4
     133      B0
     134                    .loc 1 35 0 discriminator 3
     135 0138 C5F9D645      vmovq   %xmm0, -56(%rbp)
     135      C8
     136                    .loc 1 38 0 discriminator 3
     137 013d C5DA58E6      vaddss  %xmm6, %xmm4, %xmm4
     138                    .loc 1 35 0 discriminator 3
     139 0141 C5FA1045      vmovss  -52(%rbp), %xmm0
     139      CC
     140                .LVL8:
     141                    .loc 1 37 0 discriminator 3
     142 0146 C5FA104D      vmovss  -56(%rbp), %xmm1
     142      C8
     143 014b C5FA114B      vmovss  %xmm1, -8(%rbx)
     143      F8
     144                .LVL9:
     145 0150 C5FA1143      vmovss  %xmm0, -4(%rbx)
     145      FC
     146                    .loc 1 38 0 discriminator 3
     147 0155 C4C17A11      vmovss  %xmm5, -8(%r12)
     147      6C24F8
     148 015c C4C17A11      vmovss  %xmm4, -4(%r12)
     148      6424FC
     149                .LBE2:
     150                    .loc 1 32 0 discriminator 3
     151 0163 4C39EB        cmpq    %r13, %rbx
     152 0166 0F856CFF      jne .L7
     152      FFFF

So, I can see that GCC uses some kind of vector instructions, but it is still not as good as ICC.

My question is: are there any further options I can use to make GCC perform better?

Thanks a lot.

How to make GCC generate vector instructions as ICC does?

Answers (1)

Related Questions