PooriR

Reputation: 43

OpenMP parallel for schedule construct giving different answers every few program runs

I am trying to use OpenMP work-sharing constructs. The code below is a simplified example of what is going wrong in my bigger OpenMP code. I assign values to an integer matrix, check the element values, reset them to 0, and repeat this in a 't' loop. The integer 'p' counts the number of times the value assignments (done by the parallel for) fail. If the code were correct, p would always be 0, but it gives different answers on different runs, so the work-sharing construct is failing somewhere. I had to run it around 12 times before I got the first wrong value of p as output (1, 2, 3, etc.).

The barrier directives in the code aren't really necessary; I was getting different values of p without them and thought an explicit barrier would help, but I was wrong. This is the code:

    #include <stdio.h>
    #include <stdlib.h>
    #include <omp.h>

    #define NRA 10                 /* number of rows in matrix A */
    #define NCA 10                 /* number of columns in matrix A */

    int main()
    {
        int i, j, ir, p = 0, t; 
        int *a; 
        a = (int*) malloc(sizeof(int)*NRA*NCA);

        omp_set_num_threads(5);

        for(t=0;t<100000;t++)
        {
            #pragma omp barrier
            #pragma omp parallel for schedule (static,2) collapse(2)
            for(i=0;i<NRA;i++)
            {
                for(j=0;j<NCA;j++)
                { 
                    ir=j*NRA+i; 
                    a[ir] = 1; 
                }
            }

            #pragma omp single
            {
                for(i=0;i<NRA;i++)
                {
                    for(j=0;j<NCA;j++)
                    { 
                        ir=j*NRA+i; 
                        if(a[ir] != 1)
                        { 
                            p += 1;
                        } 
                    }
                }
            }

            #pragma omp parallel for schedule (static,2) collapse(2)
            for(i=0;i<NRA;i++)
            {
                for(j=0;j<NCA;j++)
                { 
                    ir=j*NRA+i; 
                    a[ir] = 0; 
                }
            }

        #pragma omp barrier
        }//end t 

        printf("p is %d\n",p);
    }

This is the bigger code. I don't think a race condition is the issue, because I declared all variables used outside the parallel loop as shared and all other variables locally inside the parallel loop. Any suggestions would be helpful!

    #include <stdio.h>
    #include <stdlib.h>
    #include <omp.h>

    #define NRA 10                 /* number of rows in matrix A */
    #define NCA 10                 /* number of columns in matrix A */
    #define NCB 10                 /* number of columns in matrix B */

    void matrixcalc (double *ad, double *bd, double *cd, int chunkd);
    void printresults (double *cd, int chunkd);
    void printrep (double *cd, int chunkd);

    int main () 
    {
        int nthreads, chunk, p = 0;
        double *a,*b,*c;   
        a = (double*)malloc(NRA*NCA*sizeof(double)); 
        if(a==NULL) 
            printf("ho\n"); 
        b = (double*)malloc(NCA*NCB*sizeof(double));
        c = (double*)malloc(NRA*NCB*sizeof(double));

        omp_set_num_threads(5);

        chunk = 2;                    /* set loop iteration chunk size */
        int ir3, i1, j1;

        /*** Spawn a parallel region explicitly scoping all variables ***/
        int t, tmax = 100000;
        for(t=0;t<tmax;t++)
        {
            #pragma omp parallel shared(a,b,c,nthreads,chunk,t,tmax) 
            { 
                int tid = omp_get_thread_num(); 
                int i, j, ir;
                if (tid == 0)
                {
                    nthreads = omp_get_num_threads();
                    // printf("Starting matrix multiple example with %d threads\n",nthreads);
                    // printf("Initializing matrices...\n");
                }

                /*** Initialize matrices ***/
                #pragma omp for schedule (static, chunk) collapse(2)
                for (i=0; i<NRA; i++)
                {   
                    for (j=0; j<NCA; j++)
                    { 
                        ir =j*NRA+i; 
                        a[ir]= 1.0; 
                    }
                }
                #pragma omp for schedule (static, chunk) collapse(2)
                for (i=0; i<NCA; i++)
                {   
                    for (j=0; j<NCB; j++)
                    {  
                        ir = j*NCA+i; 
                        b[ir] = 1.0;
                    }
                }
                #pragma omp for schedule (static, chunk) collapse(2)
                for (i=0; i<NRA; i++)
                {    
                    for (j=0; j<NCB; j++)
                    { 
                        ir=j*NRA+i; 
                        c[ir]= 0.0;
                    }
                }
                /*** Do matrix multiply sharing iterations on outer loop ***/
                /*** Display who does which iterations for demonstration purposes ***/

                matrixcalc(a,b,c,chunk);
                if(t!=tmax-1)
                {
                    #pragma omp for schedule (static, chunk) collapse(2)
                    for(i=0;i<NRA;i++)
                    {    
                        for(j=0;j<NCB;j++)
                        {
                            ir=j*NRA+i;
                            c[ir]=0.0;
                        }
                    } 
                }
            }//end parallel region

            for(i1=0;i1<NRA;i1++)
            {
                for(j1=0;j1<NCB;j1++)
                {
                    ir3=j1*NRA+i1; 
                    if(c[ir3]!=12.20000&&c[ir3]!=0.0)
                    {
                        printf("%lf\n",c[ir3]);
                        p+=1;
                    } 
                }
            }

        }//end t
        printf("finalp\t%d\n",p);
        for(i1=0;i1<NRA;i1++)
        {
            for(j1=0;j1<NCB;j1++)
            {
                ir3=j1*NRA+i1;
                printf("%lf\t",c[ir3]);
            }
            printf("\n");
        }
    }

    void matrixcalc (double *a, double *b, double *c, int chunk)
    {
        int i,j,k,ir,ir1,ir2;

        //printf("Thread %d starting matrix multiply...%d\n",tid,chunk);
        double r = 1.0;
        #pragma omp for schedule (static, chunk) collapse(3)
        for (i=0; i<NRA; i++)
        {
            for(j=0; j<NCB; j++)
            {
                for (k=0; k<NCA; k++)
                {
                    ir=j*NRA+i;
                    ir1=k*NRA+i;
                    ir2=j*NCA+k;
                    c[ir] += a[ir1] * b[ir2];
                }
            }
        }
        #pragma omp for schedule (static, chunk) collapse(2)
        for(i=0;i<NRA;i++)
        {
            for(j=0;j<NCB;j++)
            {
                ir=j*NRA+i;
                c[ir]+=r*2.0;
            }
        }
        #pragma omp single
        {
            double h;
            h = 0.1;
            h = 2.0*h;
            for(i=0;i<NRA;i++)
            {
                for(j=0;j<NCB;j++)
                {
                    ir=j*NRA+i;
                    c[ir]+=h;
                }
            }
        }
    }

Upvotes: 0

Views: 202

Answers (1)

Zulan

Reputation: 22660

The issue is a race condition on ir. Since it is defined outside of the loop, it is implicitly shared, so all threads write to the same variable concurrently. You could force it to be private, but it is better to declare variables as locally as possible. That makes reasoning about OpenMP code much easier:

    #pragma omp parallel for schedule (static,2) collapse(2)
    for(int i=0;i<NRA;i++)
    {
        for(int j=0;j<NCA;j++)
        {
            int ir = j*NRA+i;
            a[ir] = 1;
        }
    }
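
If you prefer to keep ir defined in main, here is a minimal sketch of the other option mentioned above: add a data-sharing clause. The iteration variables i and j of the collapsed loops are made private automatically, so only ir needs it.

    /* Sketch of the alternative: ir stays declared in main, but each
       thread gets its own private copy inside the parallel loop. */
    #pragma omp parallel for schedule (static,2) collapse(2) private(ir)
    for(i=0;i<NRA;i++)
    {
        for(j=0;j<NCA;j++)
        {
            ir = j*NRA+i;
            a[ir] = 1;
        }
    }

Declaring the variable locally, as in the first snippet, is still the cleaner choice.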

As commented by Jorge Bellón, there are other issues in your code with respect to redundant barriers and efficiency.
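
One way to address the efficiency point, as a rough sketch of my own rather than the exact change suggested in that comment: fork the thread team once, outside the t loop, and rely on the implicit barriers at the end of the work-sharing constructs instead of explicit barrier directives and a fork/join on every iteration.

    /* Rough sketch (my own, based on the efficiency remark above):
       one parallel region for the whole t loop; the implicit barriers
       of "for" and "single" provide all the synchronization needed. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <omp.h>

    #define NRA 10
    #define NCA 10

    int main(void)
    {
        int p = 0;                               /* shared failure counter */
        int *a = malloc(sizeof(int) * NRA * NCA);

        omp_set_num_threads(5);

        #pragma omp parallel shared(a, p)
        for (int t = 0; t < 100000; t++)
        {
            /* set every element to 1; implicit barrier at the end */
            #pragma omp for schedule(static, 2) collapse(2)
            for (int i = 0; i < NRA; i++)
                for (int j = 0; j < NCA; j++)
                    a[j * NRA + i] = 1;

            /* one thread verifies; implicit barrier at the end */
            #pragma omp single
            {
                for (int i = 0; i < NRA; i++)
                    for (int j = 0; j < NCA; j++)
                        if (a[j * NRA + i] != 1)
                            p += 1;
            }

            /* reset for the next pass; implicit barrier again */
            #pragma omp for schedule(static, 2) collapse(2)
            for (int i = 0; i < NRA; i++)
                for (int j = 0; j < NCA; j++)
                    a[j * NRA + i] = 0;
        }

        printf("p is %d\n", p);
        free(a);
        return 0;
    }

The single construct keeps the check sequential, and its implicit barrier separates the verification from the reset, so p is only ever touched by one thread at a time.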

Upvotes: 2
