Reputation: 21615
I am learning to program with AVX, so I wrote a simple program that multiplies 4x4 matrices. With no compiler optimizations, the AVX version is slightly faster than the non-AVX version, but with -O3 the non-AVX version becomes almost twice as fast as the AVX version. Any tips on how I can improve the performance of the AVX version? The full code follows.
#include <immintrin.h>
#include <stdio.h>
#include <stdlib.h>
#define MAT_SIZE 4
#define USE_AVX
double A[MAT_SIZE][MAT_SIZE];
double B[MAT_SIZE][MAT_SIZE];
double C[MAT_SIZE][MAT_SIZE];
union {
    double m[4][4];
    __m256d row[4];
} matB;
void init_matrices()
{
    for(int i = 0; i < MAT_SIZE; i++)
        for(int j = 0; j < MAT_SIZE; j++)
        {
            A[i][j] = (double)(i+j);
            B[i][j] = (double)(i+j+1);
            matB.m[i][j] = B[i][j];
        }
}
void print_result()
{
    for(int i = 0; i < MAT_SIZE; i++)
    {
        for(int j = 0; j < MAT_SIZE; j++)
        {
            printf("%.1f\t", C[i][j]);
        }
        printf("\n");
    }
}
void withoutAVX()
{
    for(int row = 0; row < MAT_SIZE; row++)
        for(int col = 0; col < MAT_SIZE; col++)
        {
            double sum = 0;   // accumulate in double to match the element type
            for(int e = 0; e < MAT_SIZE; e++)
                sum += A[row][e] * B[e][col];
            C[row][col] = sum;
        }
}
void withAVX()
{
    for(int row = 0; row < 4; row++)
    {
        //calculate_resultant_row(row);
        const double* rowA = (const double*)&A[row];
        __m256d* pr = (__m256d*)(&C[row]);
        *pr = _mm256_mul_pd(_mm256_broadcast_sd(&rowA[0]), matB.row[0]);
        for(int i = 1; i < 4; i++)
            *pr = _mm256_add_pd(*pr, _mm256_mul_pd(_mm256_broadcast_sd(&rowA[i]),
                                                   matB.row[i]));
    }
}
// read the CPU time-stamp counter
static __inline__ unsigned long long rdtsc(void)
{
    unsigned hi, lo;
    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
    return ((unsigned long long)lo) | (((unsigned long long)hi) << 32);
}
int main()
{
    init_matrices();
    // start timer
    unsigned long long cycles = rdtsc();
#ifdef USE_AVX
    withAVX();
#else
    withoutAVX();
#endif
    // stop timer
    cycles = rdtsc() - cycles;
    printf("\nTotal time elapsed : %llu\n\n", cycles);
    print_result();
    return 0;
}
Upvotes: 0
Views: 781
Reputation: 2767
It's hard to be sure without knowing exactly which compiler and system you are using; you need to check the assembly of the generated code to be certain. Below are merely some possible reasons.
The compiler probably generated extra loads and stores. These cost cycles.
The innermost loop broadcasts elements from A, so you have extra loads. Optimal code should require only 8 loads (4 each for A and B) and 4 stores back to C. However, your code performs at least 16 extra loads because of the use of broadcastsd. These will cost you as much as the computation itself, and probably more.
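For illustration, one way to get close to those 8 loads and 4 stores is to load each row of A once and broadcast its lanes in registers instead of reloading them from memory. The sketch below is untested and the function name is made up; it reuses the question's globals A, B and C, and it assumes an AVX2-capable CPU (for _mm256_permute4x64_pd) and FMA support (for _mm256_fmadd_pd), compiled with something like -O3 -mavx2 -mfma:
/* Sketch only: assumes the question's #include <immintrin.h> and global arrays. */
static void withAVX2_sketch(void)
{
    /* Load each row of B once: 4 loads total. */
    const __m256d b0 = _mm256_loadu_pd(B[0]);
    const __m256d b1 = _mm256_loadu_pd(B[1]);
    const __m256d b2 = _mm256_loadu_pd(B[2]);
    const __m256d b3 = _mm256_loadu_pd(B[3]);
    for(int row = 0; row < 4; row++)
    {
        /* One load per row of A (4 total), then broadcast its lanes in registers. */
        const __m256d a  = _mm256_loadu_pd(A[row]);
        const __m256d a0 = _mm256_permute4x64_pd(a, _MM_SHUFFLE(0, 0, 0, 0));
        const __m256d a1 = _mm256_permute4x64_pd(a, _MM_SHUFFLE(1, 1, 1, 1));
        const __m256d a2 = _mm256_permute4x64_pd(a, _MM_SHUFFLE(2, 2, 2, 2));
        const __m256d a3 = _mm256_permute4x64_pd(a, _MM_SHUFFLE(3, 3, 3, 3));
        /* Keep the running sum in a register; C is written only once per row. */
        __m256d acc = _mm256_mul_pd(a0, b0);
        acc = _mm256_fmadd_pd(a1, b1, acc);
        acc = _mm256_fmadd_pd(a2, b2, acc);
        acc = _mm256_fmadd_pd(a3, b3, acc);
        _mm256_storeu_pd(C[row], acc);   /* 1 store per row, 4 total. */
    }
}
Keeping the accumulator in a local register also means the compiler is not forced to store and reload *pr on every iteration of the inner loop.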
Edit (too long for comments)
There are situations where the compiler won't be able to do smart optimization, or is sometimes "too clever" for its own good. Recently I even needed to use assembly to avoid a compiler optimization that actually led to bad code! That said, if what you need is performance and you don't really care how you get there, I would suggest you first look for good libraries. For example, Eigen for linear algebra would fit your needs in this example perfectly.
If you do want to learn SIMD programming, I suggest you start with simpler cases, such as adding two vectors (a minimal example is sketched below). Most likely, you will find that the compiler can generate better vectorized binaries than your first few attempts, but these cases are more straightforward, so you will see where you need to improve more easily. In the process of trying to produce code as good as or better than what a compiler can generate, you will learn all the things you need to write optimal code, and eventually you will be able to provide optimal implementations for code that the compiler cannot optimize.
One thing to keep in mind is that the lower level you go, the less the compiler can do for you. You will have more control over which instructions are generated, but it is also your responsibility to make them optimal. This advice is pretty vague; sorry I cannot be of more help.
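As a starting point for that, a vector add is about the smallest useful SIMD kernel. The sketch below is untested, the names are made up for illustration, and it assumes AVX is available and that n is a multiple of 4:
#include <immintrin.h>
/* out[i] = a[i] + b[i], processing 4 doubles per iteration (sketch only). */
void add_vectors(const double *a, const double *b, double *out, int n)
{
    for(int i = 0; i < n; i += 4)
    {
        __m256d va = _mm256_loadu_pd(a + i);              /* load 4 doubles from a */
        __m256d vb = _mm256_loadu_pd(b + i);              /* load 4 doubles from b */
        _mm256_storeu_pd(out + i, _mm256_add_pd(va, vb)); /* store the 4 sums */
    }
}
Comparing the assembly of this against what the compiler emits for the plain scalar loop at -O3 is a quick way to see whether your intrinsics are actually buying you anything.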
Upvotes: 3