Jesper Bang
Jesper Bang

Reputation: 3

Intel Intrinsics code optimization

So i'm trying to multiply a constant with short int a[101] with intel intrinsics. I have done it with addition but i can't seem to figure why it wont work with multiplication. Also before we used ints of 32 bits and now we use 16 bit short so we can have double as many values in the intrinsics to fill the 128 bit as far as i understand?

naive example of what im trying to do:

int main(int argc, char **argv){
    short int a[101];
    int len = sizeof(a)/sizeof(short);

    /*Populating array a with values 1 to 101*/

    mult(len, a);

    return 0;
}

int mult(int len, short int *a){
    int result = 0;
    for(int i=0; i<len; i++){
        result += a[i]*20;  
    }
    return result;
}

And my code trying to do the same in intrinsics

/*Same main as before with a short int a[101] containing values 1 to 101*/

int SIMD(int len, short int *a){
    int res;
    int val[4];

    /*Setting constant value to mulitply with*/
    __m128i sum = _mm_set1_epi16(20);
    __m128i s = _mm_setzero_si128( );

    for(int i=0; i<len/4*4; i += 4){
        __m128i vec = _mm_loadu_si128((__m128i *)(a+i));
        s += _mm_mul_epu32(vec,sum);
    }

    _mm_storeu_si128((__m128i*) val, s);
    res += val[0] + val[1] + val[2] + val[3];

    /*Haldeling tail*/
    for(int i=len/4*4; i<len; i++){
        res += a[i];
    }
    return res;
}

So i do get a number out as result, but the number does not match the naive method, i have tried other intrinsics and changing numbers to see if it makes any noticable difference but nothing comes close to the output i expect. The computation time is almost the same as the naive at the moment aswell.

Upvotes: 0

Views: 264

Answers (1)

Eric Postpischil
Eric Postpischil

Reputation: 224511

There are 8 short in one __m128i. So:

for(int i=0; i<len/4*4; i += 4)

should be

for(int i=0; i<len/8*8; i += 8)`

and:

res += val[0] + val[1] + val[2] + val[3];

should be:

res += val[0] + val[1] + val[2] + val[3] + val[4] + val[5] + val[6] + val[7];

and:

for(int i=len/4*4; i<len; i++)

should be:

for(int i=len/8*8; i<len; i++)

In:

s += _mm_mul_epu32(vec,sum);

_mm_mul_epu32 operates on 32-bit elements. It should be:

s += _mm_mullo_epi16(vec, sum);

The object res is not initialized; it should be:

int res = 0;

Here is working code:

#include <stdio.h>
#include <stdlib.h>

#include <immintrin.h>

//  Number of elements in an array.
#define NumberOf(x) (sizeof (x) / sizeof *(x))


//  Compute the result with scalar arithmetic.
static int mult(int len, short int *a)
{
    int result = 0;
    for (size_t i=0; i<len; i++)
    {
        result += a[i]*20;  
    }
    return result;
}


//  Compute the result with SIMD arithmetic.
static int SIMD(int len, short int *a)
{
    //  Initialize the multiplier and the sum.
    __m128i multiplier = _mm_set1_epi16(20);
    __m128i s = _mm_setzero_si128( );

    //  Process blocks of 8 short.
    for (int i=0; i<len/8*8; i += 8)
    {
        __m128i vec = _mm_loadu_si128((__m128i *)(a+i));

        //  Multtiply by multiplier and add to sum.
        s = _mm_add_epi16(s, _mm_mullo_epi16(vec, multiplier));
    }

    //  Store the sum so far so its individual elements can be manipulated.
    short val[8];
    _mm_storeu_si128((__m128i*) val, s);

    //  Add the individual elements.
    int res = 0;
    for (size_t i = 0; i < 8; ++i)
        res += val[i];

    //  Add the elements in the tail.
    for (size_t i = len/8*8; i < len; ++i)
    {
        res += a[i];
    }

    return res;
}



int main(int argc, char **argv)
{
    short int a[96];
    int len = NumberOf(a);

    //  Initiailize a.
    for (size_t i = 0; i < len; ++i)
        a[i] = i+1;

    printf("sum by scalar arithmetic is %d.\n", mult(len, a));
    printf("sum by SIMD arithmetic is %d.\n", SIMD(len, a));

    return 0;
}

Upvotes: 1

Related Questions