AVX version isn't as fast as expected

Question

I'm trying to convert a function to an AVX version. The function basically just compares floats and returns true/false depending on the result of the comparisons.

Here is the original function:

// Scalar reference test.
//
// Both arrays hold COL_COUNT floats, treated as a lower half [0, COL_COUNT / 2)
// and an upper half [COL_COUNT / 2, COL_COUNT). Returns true as soon as some
// column k satisfies either
//     thisFloat[k]         < -otherFloat[upper k]   or
//     -thisFloat[upper k]  >  otherFloat[k],
// and false when no column does.
bool testSingle(float* thisFloat, float* otherFloat)
{
    const unsigned int half = COL_COUNT / 2;
    const float* thisUpper = thisFloat + half;
    const float* otherUpper = otherFloat + half;

    unsigned int col = 0;
    while (col < half)
    {
        // Lower half of this row against the negated upper half of the other.
        if (thisFloat[col] < -otherUpper[col])
            return true;

        // Negated upper half of this row against the lower half of the other.
        if (-thisUpper[col] > otherFloat[col])
            return true;

        ++col;
    }

    return false;
}

And this is the AVX version:

// Eight-lane variant of the scalar test.
//
// thisFloat       : COL_COUNT floats, broadcast one column at a time.
// otherFloatInAVX : COL_COUNT vectors. Entry k is compared directly against the
//                   broadcast value, so the upper-half entries
//                   [COL_COUNT / 2, COL_COUNT) are expected to arrive already
//                   negated (this function only negates thisFloat).
//
// Returns a per-lane mask: a lane is all-ones once any column satisfied either
// comparison for that lane, all-zeros otherwise.
__m256 testAVX(float* thisFloat, __m256* otherFloatInAVX)
{
    const unsigned int half = COL_COUNT / 2;
    __m256 hits = _mm256_setzero_ps();

    for (unsigned int col = 0; col < half; ++col)
    {
        const __m256 lowerValue = _mm256_set1_ps(thisFloat[col]);
        const __m256 negatedUpperValue = _mm256_set1_ps(-thisFloat[half + col]);

        const __m256 lowerHit = _mm256_cmp_ps(lowerValue, otherFloatInAVX[half + col], _CMP_LT_OQ);
        const __m256 upperHit = _mm256_cmp_ps(negatedUpperValue, otherFloatInAVX[col], _CMP_GT_OQ);

        hits = _mm256_or_ps(_mm256_or_ps(lowerHit, upperHit), hits);

        // movemask packs the eight lane sign bits into bits 0..7; 0xFF means
        // every lane has already hit, so further columns cannot change the result.
        if (_mm256_movemask_ps(hits) == 0xFF)
            return hits;
    }

    return hits;
}

And here is the complete code. I generate some random floats at the beginning and pack them into AVX vectors for the calculation in the AVX version. The values in thisFloat are compared with those in otherFloat1, otherFloat2, ..., otherFloat8.

// Number of rows generated and pushed through each benchmark loop.
#define ROW_COUNT 1000000
// Floats per row. The test functions split a row into a lower half
// [0, COL_COUNT / 2) and an upper half [COL_COUNT / 2, COL_COUNT).
#define COL_COUNT 46

// Returns a pseudo-random float in [Min, Max], drawn from rand().
float randomNumberFloat(float Min, float Max)
{
    // rand() scaled into [0, 1].
    const float unit = float(rand()) / float(RAND_MAX);
    const float width = Max - Min;
    return (unit * width) + Min;
}

// Allocates `rows` separate heap arrays of `cols` default-initialised elements.
// NOTE: for T = __m256 the elements need 32-byte alignment, which plain new[]
// only guarantees from C++17 on (aligned operator new).
template <typename T>
static T** allocateRows(unsigned int rows, unsigned int cols)
{
    T** table = new T*[rows];
    for (unsigned int r = 0; r < rows; ++r)
        table[r] = new T[cols];
    return table;
}

// Releases a table obtained from allocateRows().
template <typename T>
static void freeRows(T** table, unsigned int rows)
{
    for (unsigned int r = 0; r < rows; ++r)
        delete[] table[r];
    delete[] table;
}

// Benchmark driver: fills ROW_COUNT rows with random floats, times the scalar
// test against eight candidate rows and the AVX test against the same eight
// candidates packed one per lane, prints both durations in milliseconds and
// finally checks that both versions produced the same answers.
//
// Returns 0 on success, 1 if the scalar and AVX results disagree.
int main(int argc, char** argv)
{
    const unsigned int LANES = 8;

    float** thisFloat   = allocateRows<float>(ROW_COUNT, COL_COUNT);
    float** otherFloat1 = allocateRows<float>(ROW_COUNT, COL_COUNT);
    float** otherFloat2 = allocateRows<float>(ROW_COUNT, COL_COUNT);
    float** otherFloat3 = allocateRows<float>(ROW_COUNT, COL_COUNT);
    float** otherFloat4 = allocateRows<float>(ROW_COUNT, COL_COUNT);
    float** otherFloat5 = allocateRows<float>(ROW_COUNT, COL_COUNT);
    float** otherFloat6 = allocateRows<float>(ROW_COUNT, COL_COUNT);
    float** otherFloat7 = allocateRows<float>(ROW_COUNT, COL_COUNT);
    float** otherFloat8 = allocateRows<float>(ROW_COUNT, COL_COUNT);

    // Lane order used throughout: lane n (and result bit n) is otherFloat(n+1).
    float** const others[LANES] = {
        otherFloat1, otherFloat2, otherFloat3, otherFloat4,
        otherFloat5, otherFloat6, otherFloat7, otherFloat8
    };

    // save to AVX
    __m256** otherFloatInAVX = allocateRows<__m256>(ROW_COUNT, COL_COUNT);

    // variable for results
    unsigned int* resultsSingle = new unsigned int[ROW_COUNT];
    __m256* resultsAVX = new __m256[ROW_COUNT];

    // Generate Random Values
    for (unsigned int i = 0; i < ROW_COUNT; i++)
    {
        for (unsigned int j = 0; j < COL_COUNT; j++)
        {
            thisFloat[i][j] = randomNumberFloat(-1000.0f, 1000.0f);
            for (unsigned int lane = 0; lane < LANES; ++lane)
                others[lane][i][j] = randomNumberFloat(-1000.0f, 1000.0f);
        }

        for (unsigned int j = 0; j < COL_COUNT / 2; j++)
        {
            const unsigned int upper = COL_COUNT / 2 + j;

            // Lower half is stored as-is.
            otherFloatInAVX[i][j] = _mm256_setr_ps(
                otherFloat1[i][j], otherFloat2[i][j], otherFloat3[i][j], otherFloat4[i][j],
                otherFloat5[i][j], otherFloat6[i][j], otherFloat7[i][j], otherFloat8[i][j]);

            // Upper half is stored pre-negated. It must be read from the upper
            // column so that testAVX compares against the same values as
            // testSingle's -otherFloat[COL_COUNT / 2 + k].
            otherFloatInAVX[i][upper] = _mm256_setr_ps(
                -otherFloat1[i][upper], -otherFloat2[i][upper], -otherFloat3[i][upper], -otherFloat4[i][upper],
                -otherFloat5[i][upper], -otherFloat6[i][upper], -otherFloat7[i][upper], -otherFloat8[i][upper]);
        }
    }

    // do normal test
    auto start_normal = std::chrono::high_resolution_clock::now();
    for (unsigned int i = 0; i < ROW_COUNT; i++)
    {
        // Keep all eight answers (bit n = candidate n) instead of overwriting
        // one slot, so no call is dead and the result matches the AVX lane mask.
        unsigned int mask = 0;
        for (unsigned int lane = 0; lane < LANES; ++lane)
            mask |= static_cast<unsigned int>(testSingle(thisFloat[i], others[lane][i])) << lane;
        resultsSingle[i] = mask;
    }
    auto end_normal = std::chrono::high_resolution_clock::now();

    auto duration_normal = std::chrono::duration_cast<std::chrono::milliseconds>(end_normal - start_normal);
    std::cout << "Duration of normal test: " << duration_normal.count() << " ms \n";

    // do AVX test
    auto start_avx = std::chrono::high_resolution_clock::now();
    for (unsigned int i = 0; i < ROW_COUNT; i++)
    {
        resultsAVX[i] = testAVX(thisFloat[i], otherFloatInAVX[i]);
    }
    auto end_avx = std::chrono::high_resolution_clock::now();

    auto duration_avx = std::chrono::duration_cast<std::chrono::milliseconds>(end_avx - start_avx);
    std::cout << "Duration of AVX test: " << duration_avx.count() << " ms";

    // Consume both result arrays: this validates the AVX version against the
    // scalar one and stops the optimiser from treating the timed loops as dead.
    unsigned int mismatches = 0;
    for (unsigned int i = 0; i < ROW_COUNT; i++)
    {
        if (resultsSingle[i] != static_cast<unsigned int>(_mm256_movemask_ps(resultsAVX[i])))
            ++mismatches;
    }

    delete[] resultsAVX;
    delete[] resultsSingle;
    freeRows(otherFloatInAVX, ROW_COUNT);
    for (unsigned int lane = 0; lane < LANES; ++lane)
        freeRows(others[lane], ROW_COUNT);
    freeRows(thisFloat, ROW_COUNT);

    if (mismatches != 0)
    {
        std::cerr << "\nScalar and AVX results differ in " << mismatches << " rows\n";
        return 1;
    }

    return 0;
}

Then I measured the running time of both and got:

Duration of normal test: 290 ms
Duration of AVX test: 159 ms

The AVX version is 1.82x faster than the original one.

Is it still possible to improve the AVX version? Or am I using AVX the wrong way? I expected it to be roughly 5-6x faster, since I do eight calculations at the same time.

AVX version isn't as fast as expected

Answers (1)

Related Questions

AVX version isn't as fast as expected

Answers (1)

Related Questions

AVX version isn't as fast as expected