Ecir Hana

Reputation: 11488

Transpose 8x8 64-bits matrix

Targeting AVX2, what is the fastest way to transpose an 8x8 matrix containing 64-bit integers (or doubles)?

I searched through this site and found several ways of doing an 8x8 transpose, but mostly for 32-bit floats. So I'm mainly asking because, first, I'm not sure whether the principles that made those algorithms fast translate readily to 64 bits, and second, AVX2 apparently has only 16 registers, so just loading all the values would take up all of them.

One way of doing it would be to call _MM_TRANSPOSE4_PD on each of the four 4x4 blocks (a 2x2 grid of blocks), but I was wondering whether this is optimal:

  #define _MM_TRANSPOSE4_PD(row0,row1,row2,row3)                        \
        {                                                               \
            __m256d tmp3, tmp2, tmp1, tmp0;                             \
                                                                        \
            /* Interleave 64-bit elements within each 128-bit lane */   \
            tmp0 = _mm256_shuffle_pd((row0),(row1), 0x0);               \
            tmp2 = _mm256_shuffle_pd((row0),(row1), 0xF);               \
            tmp1 = _mm256_shuffle_pd((row2),(row3), 0x0);               \
            tmp3 = _mm256_shuffle_pd((row2),(row3), 0xF);               \
                                                                        \
            /* Recombine 128-bit lanes to finish the 4x4 transpose */   \
            (row0) = _mm256_permute2f128_pd(tmp0, tmp1, 0x20);          \
            (row1) = _mm256_permute2f128_pd(tmp2, tmp3, 0x20);          \
            (row2) = _mm256_permute2f128_pd(tmp0, tmp1, 0x31);          \
            (row3) = _mm256_permute2f128_pd(tmp2, tmp3, 0x31);          \
        }
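
Concretely, I mean something like this (a rough sketch, out-of-place only, i.e. dst != src, with plain row-major double[8][8] arrays):

#include <immintrin.h>

// Out-of-place 8x8 transpose built from four 4x4 block transposes.
// Each transposed block lands at the mirrored block position (bj, bi).
void transpose8x8_blocks( double dst[8][8], const double src[8][8] )
{
    for ( int bi = 0; bi < 8; bi += 4 )
        for ( int bj = 0; bj < 8; bj += 4 )
        {
            __m256d r0 = _mm256_loadu_pd( &src[ bi + 0 ][ bj ] );
            __m256d r1 = _mm256_loadu_pd( &src[ bi + 1 ][ bj ] );
            __m256d r2 = _mm256_loadu_pd( &src[ bi + 2 ][ bj ] );
            __m256d r3 = _mm256_loadu_pd( &src[ bi + 3 ][ bj ] );
            _MM_TRANSPOSE4_PD( r0, r1, r2, r3 );
            _mm256_storeu_pd( &dst[ bj + 0 ][ bi ], r0 );
            _mm256_storeu_pd( &dst[ bj + 1 ][ bi ], r1 );
            _mm256_storeu_pd( &dst[ bj + 2 ][ bi ], r2 );
            _mm256_storeu_pd( &dst[ bj + 3 ][ bi ], r3 );
        }
}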

Still assuming AVX2, is transposing double[8][8] and int64_t[8][8] largely the same, in principle?

PS: And just out of curiosity, would having AVX-512 change things substantially?

Upvotes: 3

Views: 1104

Answers (2)

Peter Cordes

Reputation: 364593

For small matrices where more than 1 row can fit in a single SIMD vector, AVX-512 has very nice 2-input lane-crossing shuffles with 32-bit or 64-bit granularity and a vector control. (Unlike _mm512_unpacklo_pd, which is basically 4 separate 128-bit shuffles.)

A 4x4 double matrix is "only" 128 bytes, two ZMM __m512d vectors, so you only need two vpermt2pd (_mm512_permutex2var_pd) instructions to produce both output vectors: one shuffle per output vector, with both loads and stores being full width. You do need control vector constants, though.
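
For example, a minimal sketch (assuming AVX-512F and row-major storage; the two index vectors are exactly those control constants):

#include <immintrin.h>

// 4x4 transpose of doubles: rows 0-1 in one ZMM, rows 2-3 in the other.
// In the index vectors, values >= 8 select elements from the second source.
void transpose4x4_avx512( double* dst, const double* src )
{
    const __m512d lo = _mm512_loadu_pd( src );     // 00 01 02 03 10 11 12 13
    const __m512d hi = _mm512_loadu_pd( src + 8 ); // 20 21 22 23 30 31 32 33

    const __m512i idx0 = _mm512_setr_epi64( 0, 4, 8, 12, 1, 5, 9, 13 );
    const __m512i idx1 = _mm512_setr_epi64( 2, 6, 10, 14, 3, 7, 11, 15 );

    _mm512_storeu_pd( dst,     _mm512_permutex2var_pd( lo, idx0, hi ) ); // 00 10 20 30 01 11 21 31
    _mm512_storeu_pd( dst + 8, _mm512_permutex2var_pd( lo, idx1, hi ) ); // 02 12 22 32 03 13 23 33
}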

Using 512-bit vector instructions has some downsides (clock speed and execution port throughput), but if your program can spend a lot of time in code that uses 512-bit vectors, there's probably a significant throughput gain from throwing around more data with each instruction, and having more powerful shuffles.

With 256-bit vectors, vpermt2pd ymm would probably not be useful for a 4x4, because for each __m256d output row, each of the 4 elements you want comes from a different input row. So one 2-input shuffle can't produce the output you want.

I think lane-crossing shuffles with less than 128-bit granularity aren't useful unless your matrix is small enough to fit multiple rows in one SIMD vector. See How to transpose a 16x16 matrix using SIMD instructions? for some algorithmic complexity reasoning about 32-bit elements - an 8x8 transpose of 32-bit elements with AVX1 is about the same as an 8x8 of 64-bit elements with AVX-512, where each SIMD vector holds exactly one whole row.

So there's no need for vector constants, just immediate shuffles of 128-bit chunks plus unpacklo/hi.


Transposing an 8x8 with 512-bit vectors (8 doubles) would have the same problem: each output row of 8 doubles needs 1 double from each of 8 input vectors. So ultimately I think you want a similar strategy to Soonts' AVX answer, starting with _mm512_insertf64x4(v, load, 1) as the first step to get the first half of 2 input rows into one vector.
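
A minimal sketch of just that first step (pairing rows 0 and 4 here is an arbitrary choice for illustration; the rest of the 8x8 transpose still needs more shuffle work):

#include <immintrin.h>

// Pack the first halves (columns 0-3) of two rows of a row-major 8x8 double
// matrix into one __m512d.
__m512d load_two_half_rows( const double* src )
{
    __m512d v = _mm512_castpd256_pd512( _mm256_loadu_pd( src ) );       // row 0, cols 0-3
    return _mm512_insertf64x4( v, _mm256_loadu_pd( src + 4 * 8 ), 1 );  // row 4, cols 0-3
}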

(If you care about KNL / Xeon Phi, @ZBoson's other answer on How to transpose a 16x16 matrix using SIMD instructions? shows some interesting ideas using merge-masking with 1-input shuffles like vpermpd or vpermq, instead of 2-input shuffles like vunpcklpd or vpermt2pd.)
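
As an illustration of that idea (not the actual code from that answer), two rows can be combined with a 1-input vpermpd plus a merge-masked vpermpd instead of one 2-input vpermt2pd:

#include <immintrin.h>

// Build { a0, b0, a1, b1, a2, b2, a3, b3 } from two rows a and b using only
// 1-input lane-crossing shuffles, with merge-masking for the second row.
__m512d interleave_low_halves( __m512d a, __m512d b )
{
    const __m512i idx = _mm512_setr_epi64( 0, 0, 1, 1, 2, 2, 3, 3 );
    __m512d t = _mm512_permutexvar_pd( idx, a );           // a0 a0 a1 a1 a2 a2 a3 a3
    return _mm512_mask_permutexvar_pd( t, 0xAA, idx, b );  // a0 b0 a1 b1 a2 b2 a3 b3
}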

Using wider vectors means fewer loads and stores, and maybe even fewer total shuffles because each one combines more data. But you also have more shuffling work to do to get all 8 elements of a row into one vector, instead of just loading and storing to different places in chunks half the size of a row. It's not obvious which approach is better; I'll update this answer if I get around to actually writing the code.

Note that Ice Lake (first consumer CPU with AVX-512) can do 2 loads and 2 stores per clock. It has better shuffle throughput than Skylake-X for some shuffles, but not for any that are useful for this or Soonts' answer. (All of vperm2f128, vunpcklpd and vpermt2pd only run on port 5, for the ymm and zmm versions. https://uops.info/. vinsertf64x4 zmm, mem, 1 is 2 uops for the front-end, and needs a load port and a uop for p0/p5. (Not p1 because it's a 512-bit uop, and see also SIMD instructions lowering CPU frequency).)

Upvotes: 1

Soonts

Reputation: 21956

After some thought and discussion in the comments, I think this is the most efficient version, at least when the source and destination data are in RAM. It does not require AVX2; AVX1 is enough.

The main idea: modern CPUs can do twice as many load micro-ops as stores, and on many CPUs loading into the higher half of a vector with vinsertf128 has the same cost as a regular 16-byte load. Compared to your macro, this version no longer needs those relatively expensive (3 cycles of latency on most CPUs) vperm2f128 shuffles.

struct Matrix4x4
{
    __m256d r0, r1, r2, r3;
};

inline void loadTransposed( Matrix4x4& mat, const double* rsi, size_t stride = 8 )
{
    // Load top half of the matrix into low half of 4 registers
    __m256d t0 = _mm256_castpd128_pd256( _mm_loadu_pd( rsi ) );     // 00, 01
    __m256d t1 = _mm256_castpd128_pd256( _mm_loadu_pd( rsi + 2 ) ); // 02, 03
    rsi += stride;
    __m256d t2 = _mm256_castpd128_pd256( _mm_loadu_pd( rsi ) );     // 10, 11
    __m256d t3 = _mm256_castpd128_pd256( _mm_loadu_pd( rsi + 2 ) ); // 12, 13
    rsi += stride;
    // Load bottom half of the matrix into high half of these registers
    t0 = _mm256_insertf128_pd( t0, _mm_loadu_pd( rsi ), 1 );    // 00, 01, 20, 21
    t1 = _mm256_insertf128_pd( t1, _mm_loadu_pd( rsi + 2 ), 1 );// 02, 03, 22, 23
    rsi += stride;
    t2 = _mm256_insertf128_pd( t2, _mm_loadu_pd( rsi ), 1 );    // 10, 11, 30, 31
    t3 = _mm256_insertf128_pd( t3, _mm_loadu_pd( rsi + 2 ), 1 );// 12, 13, 32, 33

    // Transpose 2x2 blocks in registers.
    // Due to the tricky way we loaded stuff, that's enough to transpose the complete 4x4 matrix.
    mat.r0 = _mm256_unpacklo_pd( t0, t2 ); // 00, 10, 20, 30
    mat.r1 = _mm256_unpackhi_pd( t0, t2 ); // 01, 11, 21, 31
    mat.r2 = _mm256_unpacklo_pd( t1, t3 ); // 02, 12, 22, 32
    mat.r3 = _mm256_unpackhi_pd( t1, t3 ); // 03, 13, 23, 33
}

inline void store( const Matrix4x4& mat, double* rdi, size_t stride = 8 )
{
    _mm256_storeu_pd( rdi, mat.r0 );
    _mm256_storeu_pd( rdi + stride, mat.r1 );
    _mm256_storeu_pd( rdi + stride * 2, mat.r2 );
    _mm256_storeu_pd( rdi + stride * 3, mat.r3 );
}

// Transpose 8x8 matrix of double values
void transpose8x8( double* rdi, const double* rsi )
{
    Matrix4x4 block;
    // Top-left corner
    loadTransposed( block, rsi );
    store( block, rdi );

#if 1
    // Using another instance of the block to support in-place transpose
    Matrix4x4 block2;
    loadTransposed( block, rsi + 4 );       // top right block
    loadTransposed( block2, rsi + 8 * 4 ); // bottom left block

    store( block2, rdi + 4 );
    store( block, rdi + 8 * 4 );
#else
    // Flip the #if if you can guarantee ( rsi != rdi )
    // Performance is about the same, but this version uses 4 fewer vector registers,
    // slightly more efficient when some registers need to be backed up / restored.
    assert( rsi != rdi );
    loadTransposed( block, rsi + 4 );
    store( block, rdi + 8 * 4 );

    loadTransposed( block, rsi + 8 * 4 );
    store( block, rdi + 4 );
#endif
    // Bottom-right corner
    loadTransposed( block, rsi + 8 * 4 + 4 );
    store( block, rdi + 8 * 4 + 4 );
}
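
Hypothetical usage, assuming a row-major double[8][8] matrix (stride of 8 doubles between rows); this version also works in place:

double m[8][8] = { /* ... */ };
transpose8x8( &m[0][0], &m[0][0] ); // rdi == rsi is fine here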

For completeness, here's a version which uses code very similar to your macro: it does half as many loads, the same number of stores, and more shuffles. I haven't benchmarked it, but I would expect it to be slightly slower.

struct Matrix4x4
{
    __m256d r0, r1, r2, r3;
};

inline void load( Matrix4x4& mat, const double* rsi, size_t stride = 8 )
{
    mat.r0 = _mm256_loadu_pd( rsi );
    mat.r1 = _mm256_loadu_pd( rsi + stride );
    mat.r2 = _mm256_loadu_pd( rsi + stride * 2 );
    mat.r3 = _mm256_loadu_pd( rsi + stride * 3 );
}

inline void store( const Matrix4x4& mat, double* rdi, size_t stride = 8 )
{
    _mm256_storeu_pd( rdi, mat.r0 );
    _mm256_storeu_pd( rdi + stride, mat.r1 );
    _mm256_storeu_pd( rdi + stride * 2, mat.r2 );
    _mm256_storeu_pd( rdi + stride * 3, mat.r3 );
}

inline void transpose( Matrix4x4& m4 )
{
    // These unpack instructions transpose lanes within 2x2 blocks of the matrix
    const __m256d t0 = _mm256_unpacklo_pd( m4.r0, m4.r1 );
    const __m256d t1 = _mm256_unpacklo_pd( m4.r2, m4.r3 );
    const __m256d t2 = _mm256_unpackhi_pd( m4.r0, m4.r1 );
    const __m256d t3 = _mm256_unpackhi_pd( m4.r2, m4.r3 );
    // Produce the transposed matrix by combining these blocks
    m4.r0 = _mm256_permute2f128_pd( t0, t1, 0x20 );
    m4.r1 = _mm256_permute2f128_pd( t2, t3, 0x20 );
    m4.r2 = _mm256_permute2f128_pd( t0, t1, 0x31 );
    m4.r3 = _mm256_permute2f128_pd( t2, t3, 0x31 );
}

// Transpose 8x8 matrix with double values
void transpose8x8( double* rdi, const double* rsi )
{
    Matrix4x4 block;
    // Top-left corner
    load( block, rsi );
    transpose( block );
    store( block, rdi );

    // Using another instance of the block to support in-place transpose, with very small overhead
    Matrix4x4 block2;
    load( block, rsi + 4 );     // top right block
    load( block2, rsi + 8 * 4 ); // bottom left block

    transpose( block2 );
    store( block2, rdi + 4 );
    transpose( block );
    store( block, rdi + 8 * 4 );

    // Bottom-right corner
    load( block, rsi + 8 * 4 + 4 );
    transpose( block );
    store( block, rdi + 8 * 4 + 4 );
}

Upvotes: 3
