user1264388
user1264388

Reputation: 11

Issue parallelizing this c code with openmp

How can I parallelize this code with OpenMP?. The result I get is not correct.

I try to use temporary variables p1aux, p2aux, and psumaux because the Reduction clause cannot be used with pointers or intrinsic functions. But as I said the result is incorrect.

When I say "result is not correct", I would say: The calculations of array1 + array2 are stored in the matrix sumsse. The calculations are correct until the component sumsse[50] [50] [50], more or less, but the other components of the matrix are always 0.

Do You have any idea what might happen?.

If anyone can help me, thank you very much.

#include <stdio.h>
#ifdef __SSE2__
#include <emmintrin.h>
#endif

#include <omp.h>
#include <stdlib.h>
#include <math.h>

double __attribute__((aligned(16))) array1[256][256][256],array2[256][256][256];
double __attribute__((aligned(16))) sumsse[256][256][256];



int main ()
{

    int k,j,i,dim;
    dim=256;


    __m128d *p1= (__m128d*) &array1[0][0][0];
    __m128d *p2= (__m128d*) &array2[0][0][0];
    __m128d *psum= (__m128d*) &sumsse[0][0][0];

    __m128d *p1aux= (__m128d*) &array1[0][0][0];
    __m128d *p2aux= (__m128d*) &array2[0][0][0];
    __m128d *psumaux= (__m128d*) &sumsse[0][0][0];

    int nthreads =(8);
    omp_set_num_threads(nthreads);


    //initializa array2

    for(k = 0; k < dim; ++k){
        for(j = 0; j < dim; ++j){
            for(i = 0; i < dim; ++i){
                array2[k][j][i] = 1.0;
            }
        }
    }

    //initialize array1

    for(k = 0; k < dim; ++k){
        for(j = 0; j < dim; ++j){
            for(i = 0; i < dim; ++i){
                array1[k][j][i] = i + j + k;
            }
        }
    }

    //initialize array sumsse

    for(k = 0; k < dim; ++k){
        for(j = 0; j < dim; ++j){
            for(i = 0; i < dim; ++i){
                sumsse[k][j][i] = 0.0;
            }
        }
    }

    //add array1 + array2 with sse 

#pragma omp parallel firstprivate(p1,p2,p1aux,p2aux)
    {

    for(k = 0; k < dim; ++k){

        for(j = 0; j < dim; ++j){

#pragma omp for private(i)

                for(i = 0; i < dim; i += 2){

                    *psum = _mm_add_pd(*p1,*p2);

                    psum = psumaux + 1;
                    p1 = p1aux+1;
                    p2 = p2aux+1;
                    psumaux = psum;
                    p1aux = p1;
                    p2aux = p2;
                }
            }
        }

    }// end parallel


    //show some datas


    printf("Elementosse=10 sumsse=%f",sumsse[10][10][10]);
    printf("\n");
    printf("Elementosse=50 sumsse=%f",sumsse[50][50][50]);   
    printf("Elementosse=100 sumsse=%f",sumsse[100][100][100]);
    printf("\n");
    printf("Elementosse=120 sumsse=%f",sumsse[120][120][120]);
    printf("\n");
    printf("Elementosse=200 sumsse=%f",sumsse[200][200][200]);
    printf("\n");

    return 0;
}

Upvotes: 1

Views: 202

Answers (1)

Jens Gustedt
Jens Gustedt

Reputation: 78923

instead of having incremented your auxiliary variable bizarrely you should thing of a closed form for the index of each of it in terms of k, j, i and max. Then you should assure that there is no aliasing between them.

Upvotes: 1

Related Questions