Reputation: 465
I need to implement fast bivariate polynomial evaluation (for a polynomial whose size is fixed at compile time). I came up with the following example program:
#include <cmath>
#include <array>
#include <iostream>
int main()
{
constexpr size_t NX = 5, NY = 4;
using XA = std::array< double, NX >;
using YA = std::array< XA, NY >;
YA uu{};
for(size_t yi = 0; yi < NY; yi++) {
for(size_t xi = 0; xi < NX; xi++) {
uu[yi][xi] = xi + yi;
std::cerr << "uu["<< yi << ","<< xi << "] = " << uu[yi][xi] << '\n';
}
}
double sum{0}, x = rand(), y = rand();
for(auto iy = uu.rbegin(); iy != uu.rend(); iy++) {
auto ix = iy->rbegin();
double res = *ix++;
for(; ix != iy->rend(); ix++) {
res = std::fma(res, x, *ix);
}
sum = std::fma(sum, y, res);
}
std::cerr << "XXXX: " << sum << '\n';
return 0;
}
When I compile it with GCC 8.1 and -mfma -msse4.2 -O3 -DNDEBUG, I get pretty decent optimization:
.text:0000000000402D2A call rand
.text:0000000000402D2F vxorpd xmm6, xmm6, xmm6
.text:0000000000402D33 vcvtsi2sd xmm6, xmm6, eax
.text:0000000000402D37 call rand
.text:0000000000402D3C mov rcx, cs:_refptr__ZSt4cerr
.text:0000000000402D43 vxorpd xmm1, xmm1, xmm1
.text:0000000000402D47 mov r8d, 6
.text:0000000000402D4D vcvtsi2sd xmm1, xmm1, eax
.text:0000000000402D51 vmovsd xmm2, [rsp+128h+var_D8]
.text:0000000000402D57 vfmadd213sd xmm2, xmm6, [rsp+128h+var_E0]
.text:0000000000402D5E vfmadd213sd xmm2, xmm6, [rsp+128h+var_E8]
.text:0000000000402D65 lea rdx, aXxxx ; "XXXX: "
.text:0000000000402D6C vfmadd213sd xmm2, xmm6, [rsp+128h+var_F0]
.text:0000000000402D73 vmovsd xmm0, [rsp+128h+var_B0]
.text:0000000000402D79 vfmadd213sd xmm2, xmm6, [rsp+128h+var_F8]
.text:0000000000402D80 vfmadd213sd xmm0, xmm6, [rsp+128h+var_B8]
.text:0000000000402D87 vfmadd213sd xmm0, xmm6, [rsp+128h+var_C0]
.text:0000000000402D8E vfmadd213sd xmm0, xmm6, [rsp+128h+var_C8]
.text:0000000000402D95 vmovapd xmm3, xmm0
.text:0000000000402D99 vmovsd xmm0, [rsp+128h+var_88]
.text:0000000000402DA2 vfmadd213sd xmm3, xmm6, [rsp+128h+var_D0]
.text:0000000000402DA9 vfmadd213sd xmm0, xmm6, [rsp+128h+var_90]
.text:0000000000402DB3 vfmadd213sd xmm0, xmm6, [rsp+128h+var_98]
.text:0000000000402DBD vfmadd213sd xmm0, xmm6, [rsp+128h+var_A0]
.text:0000000000402DC7 vmovapd xmm4, xmm0
.text:0000000000402DCB vmovsd xmm0, [rsp+128h+var_60]
.text:0000000000402DD4 vfmadd213sd xmm4, xmm6, [rsp+128h+var_A8]
.text:0000000000402DDE vfmadd213sd xmm0, xmm6, [rsp+128h+var_68]
.text:0000000000402DE8 vfmadd213sd xmm0, xmm6, [rsp+128h+var_70]
.text:0000000000402DF2 vfmadd213sd xmm0, xmm6, [rsp+128h+var_78]
.text:0000000000402DFC vfmadd213sd xmm0, xmm6, [rsp+128h+var_80]
.text:0000000000402E06 vfmadd231sd xmm0, xmm1, cs:qword_404018
.text:0000000000402E0F vfmadd132sd xmm0, xmm4, xmm1
.text:0000000000402E14 vfmadd132sd xmm0, xmm3, xmm1
.text:0000000000402E19 vfmadd132sd xmm1, xmm2, xmm0
.text:0000000000402E1E vmovapd xmm6, xmm1
.text:0000000000402E22 call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_x ; std::__ostream_insert<char,std::char_traits<char>>(std::basic_ostream<char,std::char_traits<char>> &,char const*,long long)
But when I compile with MSVC 2019 and flags
/W3 /GR /EHsc /Ox /MD /Ob2 /fp:fast /GL /arch:AVX2
I get the following:
.text:000000014000218B add rsi, 8
.text:000000014000218F cmp rdi, 5
.text:0000000140002193 jb loc_140002110
.text:0000000140002199 inc rbx
.text:000000014000219C cmp rbx, 4
.text:00000001400021A0 jb loc_140002100
.text:00000001400021A6 vxorpd xmm6, xmm6, xmm6
.text:00000001400021AA call cs:rand
.text:00000001400021B0 vxorps xmm7, xmm7, xmm7
.text:00000001400021B4 vcvtsi2sd xmm7, xmm7, eax
.text:00000001400021B8 call cs:rand
.text:00000001400021BE vxorps xmm2, xmm2, xmm2
.text:00000001400021C2 vcvtsi2sd xmm2, xmm2, eax
.text:00000001400021C6 lea rcx, [rsp+0F8h+var_40]
.text:00000001400021CE xchg ax, ax
.text:00000001400021D0
.text:00000001400021D0 loc_1400021D0: ; CODE XREF: main+15B↓j
.text:00000001400021D0 vmovsd xmm1, qword ptr [rcx]
.text:00000001400021D4 lea rdx, [rcx-20h]
.text:00000001400021D8 mov rax, rcx
.text:00000001400021DB cmp rcx, rdx
.text:00000001400021DE jz short loc_1400021F6
.text:00000001400021E0
.text:00000001400021E0 loc_1400021E0: ; CODE XREF: main+144↓j
.text:00000001400021E0 add rax, 0FFFFFFFFFFFFFFF8h
.text:00000001400021E4 vmovaps xmm0, xmm7
.text:00000001400021E8 vfmadd213sd xmm0, xmm1, qword ptr [rax]
.text:00000001400021ED vmovapd xmm1, xmm0
.text:00000001400021F1 cmp rax, rdx
.text:00000001400021F4 jnz short loc_1400021E0
.text:00000001400021F6
.text:00000001400021F6 loc_1400021F6: ; CODE XREF: main+12E↑j
.text:00000001400021F6 sub rcx, 28h ; '('
.text:00000001400021FA lea rdx, [rsp+0F8h+var_D8]
.text:00000001400021FF vfmadd213sd xmm6, xmm2, xmm1
.text:0000000140002204 lea rax, [rcx+8]
.text:0000000140002208 cmp rax, rdx
.text:000000014000220B jnz short loc_1400021D0
.text:000000014000220D mov rcx, cs:?cerr@std@@3V?$basic_ostream@DU?$char_traits@D@std@@@1@A ; std::basic_ostream<char,std::char_traits<char>> std::cerr
.text:0000000140002214 lea rdx, aXxxx ; "XXXX: "
.text:000000014000221B call sub_140001080
So, MSVC even is not able to unroll loops which go over the elements of std::array? Or perhaps am I missing some optimization options ?
Upvotes: 3
Views: 110