Weird optimization results for this multiply-add code

Question

I'm compiling this code:

#include <cstdint>

/// Plain aggregate holding four elements of type T.
template <typename T>
struct vec{ T v[4]; };

/// Element-wise multiply-add over four lanes.
///
/// Returns a vec whose i-th element is x.v[i] + y.v[i] * z.v[i], for i = 0..3.
/// All three operands are taken by value and the result is returned by value.
template <typename T>
vec<T> foo (vec<T> x, vec<T> y, vec<T> z) {
    return {
        x.v[0] + y.v[0] * z.v[0],
        x.v[1] + y.v[1] * z.v[1],
        x.v[2] + y.v[2] * z.v[2],
        x.v[3] + y.v[3] * z.v[3]
    };
}

// Explicit instantiations for the two element types under discussion
// (64-bit integer and float), so that a definition of each is emitted.
template vec<int64_t> foo ( vec<int64_t> x, vec<int64_t> y, vec<int64_t> z);
template vec<float> foo ( vec<float> x, vec<float> y, vec<float> z);

at maximum optimization, with clang 6.0 and gcc 7.3. But the results are weird:

No compiler uses fused multiply-adds - for integers or float, although these seem to be the obvious choice. Why?
gcc uses a bazillion instructions for the int64_t case (but not for the float case) - far more than clang, and far more than gcc itself emits at -O2. Is that really faster?

clang 6.0:

vec<long> foo<long>(vec<long>, vec<long>, vec<long>):             # @vec<long> foo<long>(vec<long>, vec<long>, vec<long>)
        mov     rax, qword ptr [rsp + 72]
        imul    rax, qword ptr [rsp + 40]
        add     rax, qword ptr [rsp + 8]
        mov     qword ptr [rdi], rax
        mov     rax, qword ptr [rsp + 80]
        imul    rax, qword ptr [rsp + 48]
        add     rax, qword ptr [rsp + 16]
        mov     qword ptr [rdi + 8], rax
        mov     rax, qword ptr [rsp + 88]
        imul    rax, qword ptr [rsp + 56]
        add     rax, qword ptr [rsp + 24]
        mov     qword ptr [rdi + 16], rax
        mov     rax, qword ptr [rsp + 96]
        imul    rax, qword ptr [rsp + 64]
        add     rax, qword ptr [rsp + 32]
        mov     qword ptr [rdi + 24], rax
        mov     rax, rdi
        ret
vec<float> foo<float>(vec<float>, vec<float>, vec<float>):             # @vec<float> foo<float>(vec<float>, vec<float>, vec<float>)
        mulps   xmm2, xmm4
        addps   xmm0, xmm2
        mulps   xmm3, xmm5
        addps   xmm1, xmm3
        ret

GCC 7.3:

vec<long> foo<long>(vec<long>, vec<long>, vec<long>):
        movdqu  xmm3, XMMWORD PTR [rsp+56]
        mov     rax, rdi
        movdqu  xmm4, XMMWORD PTR [rsp+88]
        movdqa  xmm1, xmm3
        movdqa  xmm0, xmm3
        movdqa  xmm2, xmm4
        movdqu  xmm5, XMMWORD PTR [rsp+72]
        pmuludq xmm1, xmm4
        psrlq   xmm0, 32
        psrlq   xmm2, 32
        pmuludq xmm0, xmm4
        pmuludq xmm2, xmm3
        movdqu  xmm4, XMMWORD PTR [rsp+40]
        paddq   xmm0, xmm2
        psllq   xmm0, 32
        paddq   xmm0, xmm1
        movdqa  xmm3, xmm5
        movdqu  xmm1, XMMWORD PTR [rsp+24]
        movdqa  xmm2, xmm4
        psrlq   xmm3, 32
        pmuludq xmm3, xmm4
        paddq   xmm1, xmm0
        movdqu  xmm6, XMMWORD PTR [rsp+8]
        pmuludq xmm2, xmm5
        movdqa  xmm0, xmm4
        movups  XMMWORD PTR [rdi+16], xmm1
        psrlq   xmm0, 32
        pmuludq xmm0, xmm5
        paddq   xmm0, xmm3
        psllq   xmm0, 32
        paddq   xmm0, xmm2
        paddq   xmm0, xmm6
        movups  XMMWORD PTR [rdi], xmm0
        ret
vec<float> foo<float>(vec<float>, vec<float>, vec<float>):
        movq    QWORD PTR [rsp-40], xmm2
        movq    QWORD PTR [rsp-32], xmm3
        movq    QWORD PTR [rsp-56], xmm0
        movq    QWORD PTR [rsp-24], xmm4
        movq    QWORD PTR [rsp-16], xmm5
        movq    QWORD PTR [rsp-48], xmm1
        movaps  xmm0, XMMWORD PTR [rsp-40]
        mulps   xmm0, XMMWORD PTR [rsp-24]
        addps   xmm0, XMMWORD PTR [rsp-56]
        movaps  XMMWORD PTR [rsp-56], xmm0
        mov     rax, QWORD PTR [rsp-48]
        movq    xmm0, QWORD PTR [rsp-56]
        mov     QWORD PTR [rsp-56], rax
        movq    xmm1, QWORD PTR [rsp-56]
        ret

Weird optimization results for this multiply-add code

Answers (1)

Related Questions