Fast fixed-size polynomial evaluation: MSVC vs GCC

I need to implement fast bivariate polynomial evaluation (for a polynomial whose size is fixed at compile time). I came up with the following example program:

#include <array>
#include <cmath>
#include <cstdlib>
#include <iostream>

// Builds a small NY x NX coefficient table and evaluates the bivariate
// polynomial  sum_{yi,xi} uu[yi][xi] * x^xi * y^yi  with nested Horner
// schemes built on std::fma. Both dimensions are compile-time constants,
// so an optimizer is free to unroll both loops completely.
int main()
{
    constexpr std::size_t NX = 5, NY = 4;
    using XA = std::array< double, NX >;   // coefficients of one row (powers of x)
    using YA = std::array< XA, NY >;       // one row per power of y
    YA uu{};

    // Fill the table with recognizable values and echo them to stderr.
    for(std::size_t yi = 0; yi < NY; yi++) {
        for(std::size_t xi = 0; xi < NX; xi++) {
            uu[yi][xi] = xi + yi;
            std::cerr << "uu["<< yi << ","<< xi << "] = " << uu[yi][xi] << '\n';
        }
    }

    // The evaluation point comes from rand() so that the compiler cannot
    // constant-fold the whole computation away.
    double sum{0}, x = std::rand(), y = std::rand();

    // Outer Horner step in y: walk the rows from the highest power down.
    for(auto iy = uu.rbegin(); iy != uu.rend(); iy++) {

        // Inner Horner step in x: start from the highest-order coefficient
        // of this row and fold in the remaining ones.
        auto ix = iy->rbegin();
        double res = *ix++;
        for(; ix != iy->rend(); ix++) {
            res = std::fma(res, x, *ix);
        }
        sum = std::fma(sum, y, res);
    }
    std::cerr << "XXXX: " << sum << '\n';

    return 0;
}

When I compile it with GCC 8.1 and -mfma -msse4.2 -O3 -DNDEBUG, I get pretty decent optimization (both evaluation loops are fully unrolled into a straight-line sequence of FMA instructions):

.text:0000000000402D2A                 call    rand
.text:0000000000402D2F                 vxorpd  xmm6, xmm6, xmm6
.text:0000000000402D33                 vcvtsi2sd xmm6, xmm6, eax
.text:0000000000402D37                 call    rand
.text:0000000000402D3C                 mov     rcx, cs:_refptr__ZSt4cerr
.text:0000000000402D43                 vxorpd  xmm1, xmm1, xmm1
.text:0000000000402D47                 mov     r8d, 6
.text:0000000000402D4D                 vcvtsi2sd xmm1, xmm1, eax
.text:0000000000402D51                 vmovsd  xmm2, [rsp+128h+var_D8]
.text:0000000000402D57                 vfmadd213sd xmm2, xmm6, [rsp+128h+var_E0]
.text:0000000000402D5E                 vfmadd213sd xmm2, xmm6, [rsp+128h+var_E8]
.text:0000000000402D65                 lea     rdx, aXxxx      ; "XXXX: "
.text:0000000000402D6C                 vfmadd213sd xmm2, xmm6, [rsp+128h+var_F0]
.text:0000000000402D73                 vmovsd  xmm0, [rsp+128h+var_B0]
.text:0000000000402D79                 vfmadd213sd xmm2, xmm6, [rsp+128h+var_F8]
.text:0000000000402D80                 vfmadd213sd xmm0, xmm6, [rsp+128h+var_B8]
.text:0000000000402D87                 vfmadd213sd xmm0, xmm6, [rsp+128h+var_C0]
.text:0000000000402D8E                 vfmadd213sd xmm0, xmm6, [rsp+128h+var_C8]
.text:0000000000402D95                 vmovapd xmm3, xmm0
.text:0000000000402D99                 vmovsd  xmm0, [rsp+128h+var_88]
.text:0000000000402DA2                 vfmadd213sd xmm3, xmm6, [rsp+128h+var_D0]
.text:0000000000402DA9                 vfmadd213sd xmm0, xmm6, [rsp+128h+var_90]
.text:0000000000402DB3                 vfmadd213sd xmm0, xmm6, [rsp+128h+var_98]
.text:0000000000402DBD                 vfmadd213sd xmm0, xmm6, [rsp+128h+var_A0]
.text:0000000000402DC7                 vmovapd xmm4, xmm0
.text:0000000000402DCB                 vmovsd  xmm0, [rsp+128h+var_60]
.text:0000000000402DD4                 vfmadd213sd xmm4, xmm6, [rsp+128h+var_A8]
.text:0000000000402DDE                 vfmadd213sd xmm0, xmm6, [rsp+128h+var_68]
.text:0000000000402DE8                 vfmadd213sd xmm0, xmm6, [rsp+128h+var_70]
.text:0000000000402DF2                 vfmadd213sd xmm0, xmm6, [rsp+128h+var_78]
.text:0000000000402DFC                 vfmadd213sd xmm0, xmm6, [rsp+128h+var_80]
.text:0000000000402E06                 vfmadd231sd xmm0, xmm1, cs:qword_404018
.text:0000000000402E0F                 vfmadd132sd xmm0, xmm4, xmm1
.text:0000000000402E14                 vfmadd132sd xmm0, xmm3, xmm1
.text:0000000000402E19                 vfmadd132sd xmm1, xmm2, xmm0
.text:0000000000402E1E                 vmovapd xmm6, xmm1
.text:0000000000402E22                 call    _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_x ; std::__ostream_insert<char,std::char_traits<char>>(std::basic_ostream<char,std::char_traits<char>> &,char const*,long long)

But when I compile with MSVC 2019 and flags

/W3 /GR /EHsc /Ox /MD /Ob2 /fp:fast /GL /arch:AVX2

I get the following:

.text:000000014000218B                 add     rsi, 8
.text:000000014000218F                 cmp     rdi, 5
.text:0000000140002193                 jb      loc_140002110
.text:0000000140002199                 inc     rbx
.text:000000014000219C                 cmp     rbx, 4
.text:00000001400021A0                 jb      loc_140002100
.text:00000001400021A6                 vxorpd  xmm6, xmm6, xmm6
.text:00000001400021AA                 call    cs:rand
.text:00000001400021B0                 vxorps  xmm7, xmm7, xmm7
.text:00000001400021B4                 vcvtsi2sd xmm7, xmm7, eax
.text:00000001400021B8                 call    cs:rand
.text:00000001400021BE                 vxorps  xmm2, xmm2, xmm2
.text:00000001400021C2                 vcvtsi2sd xmm2, xmm2, eax
.text:00000001400021C6                 lea     rcx, [rsp+0F8h+var_40]
.text:00000001400021CE                 xchg    ax, ax
.text:00000001400021D0 loc_1400021D0:                          ; CODE XREF: main+15B↓j
.text:00000001400021D0                 vmovsd  xmm1, qword ptr [rcx]
.text:00000001400021D4                 lea     rdx, [rcx-20h]
.text:00000001400021D8                 mov     rax, rcx
.text:00000001400021DB                 cmp     rcx, rdx
.text:00000001400021DE                 jz      short loc_1400021F6
.text:00000001400021E0 loc_1400021E0:                          ; CODE XREF: main+144↓j
.text:00000001400021E0                 add     rax, 0FFFFFFFFFFFFFFF8h
.text:00000001400021E4                 vmovaps xmm0, xmm7
.text:00000001400021E8                 vfmadd213sd xmm0, xmm1, qword ptr [rax]
.text:00000001400021ED                 vmovapd xmm1, xmm0
.text:00000001400021F1                 cmp     rax, rdx
.text:00000001400021F4                 jnz     short loc_1400021E0
.text:00000001400021F6 loc_1400021F6:                          ; CODE XREF: main+12E↑j
.text:00000001400021F6                 sub     rcx, 28h ; '('
.text:00000001400021FA                 lea     rdx, [rsp+0F8h+var_D8]
.text:00000001400021FF                 vfmadd213sd xmm6, xmm2, xmm1
.text:0000000140002204                 lea     rax, [rcx+8]
.text:0000000140002208                 cmp     rax, rdx
.text:000000014000220B                 jnz     short loc_1400021D0
.text:000000014000220D                 mov     rcx, cs:?cerr@std@@3V?$basic_ostream@DU?$char_traits@D@std@@@1@A ; std::basic_ostream<char,std::char_traits<char>> std::cerr
.text:0000000140002214                 lea     rdx, aXxxx      ; "XXXX: "
.text:000000014000221B                 call    sub_140001080

So, is MSVC not even able to unroll loops that iterate over the elements of a std::array? Or am I perhaps missing some optimization options?

