SSE inline assembly and possible g++ optimization bug

Question

Let's start with the code. I have two structures, one for vectors, and other for matrices.

struct AVector
    {
    explicit AVector(float x=0.0f, float y=0.0f, float z=0.0f, float w=0.0f):
        x(x), y(y), z(z), w(w) {}
    AVector(const AVector& a):
        x(a.x), y(a.y), z(a.z), w(a.w) {}

    AVector& operator=(const AVector& a) {x=a.x; y=a.y; z=a.z; w=a.w; return *this;}

    float x, y, z, w;
    };

struct AMatrix
    {
    // Row-major
    explicit AMatrix(const AVector& a=AVector(), const AVector& b=AVector(), const AVector& c=AVector(), const AVector& d=AVector())
        {row[0]=a; row[1]=b; row[2]=c; row[3]=d;}
    AMatrix(const AMatrix& m) {row[0]=m.row[0]; row[1]=m.row[1]; row[2]=m.row[2]; row[3]=m.row[3];}

    AMatrix& operator=(const AMatrix& m) {row[0]=m.row[0]; row[1]=m.row[1]; row[2]=m.row[2]; row[3]=m.row[3]; return *this;}

    AVector row[4];
    };

Next, code performing calculations on those structures. Dot product using inlined ASM and SSE instructions:

inline AVector AVectorDot(const AVector& a, const AVector& b)
    {
    // XXX
    /*const double v=a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w;

    return AVector(v, v, v, v);*/

    AVector c;

    asm volatile(
        "movups (%1), %%xmm0
	"
        "movups (%2), %%xmm1
	"
        "mulps %%xmm1, %%xmm0
	"          // xmm0 -> (a1+b1, , , )
        "movaps %%xmm0, %%xmm1
	"         // xmm1 = xmm0
        "shufps $0xB1, %%xmm1, %%xmm1
	"  // 0xB1 = 10110001
        "addps %%xmm1, %%xmm0
	"          // xmm1 -> (x, y, z, w)+(y, x, w, z)=(x+y, x+y, z+w, z+w)
        "movaps %%xmm0, %%xmm1
	"         // xmm1 = xmm0
        "shufps $0x0A, %%xmm1, %%xmm1
	"  // 0x0A = 00001010
        "addps %%xmm1, %%xmm0
	"          // xmm1 -> (x+y+z+w, , , )
        "movups %%xmm0, %0
	"
        : "=m"(c)
        : "r"(&a), "r"(&b)
        );

    return c;
    }

Matrix transposition:

inline AMatrix AMatrixTranspose(const AMatrix& m)
    {
    AMatrix c(
        AVector(m.row[0].x, m.row[1].x, m.row[2].x, m.row[3].x),
        AVector(m.row[0].y, m.row[1].y, m.row[2].y, m.row[3].y),
        AVector(m.row[0].z, m.row[1].z, m.row[2].z, m.row[3].z),
        AVector(m.row[0].w, m.row[1].w, m.row[2].w, m.row[3].w));

    // XXX
    /*printf("AMcrix c:
    [%5.2f %5.2f %5.2f %5.2f]
    [%5.2f %5.2f %5.2f %5.2f]
    [%5.2f %5.2f %5.2f %5.2f]
    [%5.2f %5.2f %5.2f %5.2f]
",
        c.row[0].x, c.row[0].y, c.row[0].z, c.row[0].w,
        c.row[1].x, c.row[1].y, c.row[1].z, c.row[1].w,
        c.row[2].x, c.row[2].y, c.row[2].z, c.row[2].w,
        c.row[3].x, c.row[3].y, c.row[3].z, c.row[3].w);*/

    return c;
    }

Matrix-matrix multiplication - transpose first matrix, because when I have it stored as column major, and second one as row major, then I can perform multiplication using dot-products.

inline AMatrix AMatrixMultiply(const AMatrix& a, const AMatrix& b)
    {
    AMatrix c;

    const AMatrix at=AMatrixTranspose(a);

    // XXX
    /*printf("AMatrix at:
    [%5.2f %5.2f %5.2f %5.2f]
    [%5.2f %5.2f %5.2f %5.2f]
    [%5.2f %5.2f %5.2f %5.2f]
    [%5.2f %5.2f %5.2f %5.2f]
",
        at.row[0].x, at.row[0].y, at.row[0].z, at.row[0].w,
        at.row[1].x, at.row[1].y, at.row[1].z, at.row[1].w,
        at.row[2].x, at.row[2].y, at.row[2].z, at.row[2].w,
        at.row[3].x, at.row[3].y, at.row[3].z, at.row[3].w);*/

    for(int i=0; i<4; ++i)
        {
        c.row[i].x=AVectorDot(at.row[0], b.row[i]).w;
        c.row[i].y=AVectorDot(at.row[1], b.row[i]).w;
        c.row[i].z=AVectorDot(at.row[2], b.row[i]).w;
        c.row[i].w=AVectorDot(at.row[3], b.row[i]).w;
        }

    return c;
    }

Now time for main (pun intended) part:

int main(int argc, char *argv[])
    {
    AMatrix a(
        AVector(0, 1, 0, 0),
        AVector(1, 0, 0, 0),
        AVector(0, 0, 0, 1),
        AVector(0, 0, 1, 0)
        );

    AMatrix b(
        AVector(1, 0, 0, 0),
        AVector(0, 2, 0, 0),
        AVector(0, 0, 3, 0),
        AVector(0, 0, 0, 4)
        );

    AMatrix c=AMatrixMultiply(a, b);

    printf("AMatrix c:
    [%5.2f %5.2f %5.2f %5.2f]
    [%5.2f %5.2f %5.2f %5.2f]
    [%5.2f %5.2f %5.2f %5.2f]
    [%5.2f %5.2f %5.2f %5.2f]
",
        c.row[0].x, c.row[0].y, c.row[0].z, c.row[0].w,
        c.row[1].x, c.row[1].y, c.row[1].z, c.row[1].w,
        c.row[2].x, c.row[2].y, c.row[2].z, c.row[2].w,
        c.row[3].x, c.row[3].y, c.row[3].z, c.row[3].w);

    AVector v(1, 2, 3, 4);
    AVector w(1, 1, 1, 1);

    printf("Dot product: %f (1+2+3+4 = 10)
", AVectorDot(v, w).w);

    return 0;
    }

In the above code I make two matrices, multiply them and print the resulting matrix. It works fine if I don't use any of the compiler optimizations (g++ main.cpp -O0 -msse). With optimizations enabled (g++ main.cpp -O1 -msse) resulting matrix is empty (all fields are zeroes). Uncommenting any block marked with XXX makes program write correct result.

It seems to me that GCC optimizes-out matrix at from AMatrixMultiply function, because it wrongly assumes it's not used in AVectorDot, which is written using SSE inlines.

Last few lines check if dot-product function really works, and yes, it does.

So, the question is: did I do or understand something wrong, or is this some kind of bug in GCC? My guess is 7:3 mix of above.

I'm using GCC version 5.1.0 (tdm-1).

Brett Hale · Accepted Answer

This is also a very inefficient way of multiplying matrices using SSE. I'd be surprised if it was much faster than a scalar implementation with so much floating-point throughput available on modern CPUs. A better method is outlined here, no explicit transpose needed:

AMatrix & operator *= (AMatrix & m0, const AMatrix & m1)
{
    __m128 r0 = _mm_load_ps(& m1[0][x]);
    __m128 r1 = _mm_load_ps(& m1[1][x]);
    __m128 r2 = _mm_load_ps(& m1[2][x]);
    __m128 r3 = _mm_load_ps(& m1[3][x]);

    for (int i = 0; i < 4; i++)
    {
        __m128 ti = _mm_load_ps(& m0[i][x]), t0, t1, t2, t3;

        t0 = _mm_shuffle_ps(ti, ti, _MM_SHUFFLE(0, 0, 0, 0));
        t1 = _mm_shuffle_ps(ti, ti, _MM_SHUFFLE(1, 1, 1, 1));
        t2 = _mm_shuffle_ps(ti, ti, _MM_SHUFFLE(2, 2, 2, 2));
        t3 = _mm_shuffle_ps(ti, ti, _MM_SHUFFLE(3, 3, 3, 3));

        ti = t0 * r0 + t1 * r1 + t2 * r2 + t3 * r3;
        _mm_store_ps(& m0[i][x], ti);
    }

    return m0;
}

On modern compilers, like gcc and clang, t0 * r0 + t1 * r1 + t2 * r2 + t3 * r3 is actually operating on __m128 types; though you can replace these with _mm_mul_ps and _mm_add_ps intrinsics if you want.

Return by value is then just a matter of adding a function like:

inline AMatrix operator * (const AMatrix & m0, const AMatrix & m1)
{
    AMatrix lhs (m0); return (lhs *= m1);
}

Personally, I'd just replace the float x, y, z, w; with alignas (16) float _s[4] = {}; or similar - so you get a 'zero-vector' by default, or a defaulted constructor:

constexpr AVector () = default;

as well as nice constructors, like:

constexpr Vector (float x, float y, float z, float w)
        : _s {x, y, z, w} {}

SSE inline assembly and possible g++ optimization bug

Answers (2)

Related Questions