How to parallelise a while loop that has iterations on a matrix with OpenMP?

Question

I'm trying to parallelise the while loop of this code.

#include 
#include 
#include 

#define n 100

float max_dif(float w_new[n][n], float w[n][n]);

int main()
{
    int i,j;
    float w[n][n],w_new[n][n]={0},max=100;

#pragma omp parallel for private(j)
    for (i = 0; i < n; i++)
    {
        for (j = 0; j < n; j++)
        {
            w[i][j]=75;
        }
    }

#pragma omp parallel for
    for (i = 0; i < n; i++)
    {
        w[0][i]=0;
        w[n-1][i]=100;
        w[i][0]=100;
        w[i][n-1]=100;

        w_new[0][i]=0;
        w_new[n-1][i]=100;
        w_new[i][0]=100;
        w_new[i][n-1]=100;
    }

    w[0][n-1]=100;
    w_new[0][n-1]=100;

    int counter=0;

    while(max > 0.0001)
    {   
        for (i = 1; i < n-1; i++)
        {
            for (j = 1; j < n-1; j++)
            {
                w_new[i][j]=(float)((w[i+1][j]+w[i-1][j]+w[i][j+1]+w[i][j-1])/4);
            }
        }
        max = max_dif(w_new,w);
        for (i = 0; i < n; i++)
        {
            for (j = 0; j < n; j++)
            {
                w[i][j]=w_new[i][j];
            }
        }
        counter++;
    }

    printf("Counter is: %d
", counter);
    return 0;
}

float max_dif(float w_new[n][n], float w[n][n])
{
    int i,j;
    float max=0;
    for (i = 1; i < n-1; i++)
    {
        for (j = 1; j < n-1; j++)
        {
            if (max < fabs(w_new[i][j]-w[i][j]))
                max = fabs(w_new[i][j]-w[i][j]);
        }
    }
    return max;
}

I was thinking of the basic parallel for construct inside the while, but my counter variable is expected to be around 5000. So I would like that to be in parallel. But I need w_new to copy back to w so I can calculate the next iteration of w_new. So I don't think tasks or sections would work as I would need one w for the other.

dreamcrash · Accepted Answer

Parallelizing directly the while(max > 0.0001) will be complicated with OpenMP and probably not that performant as well. In OpenMP you have two main ways of parallelizing code either using loop-based or task-based parallelism. The latter is not suitable for the current code and the former cannot be applied to a loop that at compile-time is not possible to determined the number of iterations performed by that loop.

However, you can parallelize the computation performed inside the while instead, for instance:

  #pragma omp parallel
  {
      #pragma omp for
      for (int i = 1; i < n-1; i++){
        for (int j = 1; j < n-1; j++){
            w_new[i][j]=(float)((w[i+1][j]+w[i-1][j]+w[i][j+1]+w[i][j-1])/4);
        }
      }

the max_dif is definitely a good candidate for parallelization as well:

#pragma omp for reduction(max: max)
for (int i = 1; i < n-1; i++){
    for (int j = 1; j < n-1; j++){
        if (max < fabs(w_new[i][j]-w[i][j]))
            max = fabs(w_new[i][j]-w[i][j]);
    }
}

and finally the last loop inside the while:

    #pragma omp for
    for (int i = 0; i < n; i++){
        for (int j = 0; j < n; j++){
            w[i][j]=w_new[i][j];
        }
    }

Unfortunately, to apply this approach you will have to inline the function max_dif because the variable to be used in the reduction clause has to be shared, and since max is allocated inside the parallel region it will be private to each thread. Using the shared clause in #pragma omp for is not allowed.

Full example:

#include 
#include 
#include 

#define n 100
int main()
{
    float w[n][n],w_new[n][n]={0},max=100;

    #pragma omp parallel for
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            w[i][j]=75;

    #pragma omp parallel for
    for (int i = 0; i < n; i++){
        w[0][i]=0;
        w[n-1][i]=100;
        w[i][0]=100;
        w[i][n-1]=100;

        w_new[0][i]=0;
        w_new[n-1][i]=100;
        w_new[i][0]=100;
        w_new[i][n-1]=100;
    }

    w[0][n-1]=100;
    w_new[0][n-1]=100;

    int counter=0;

    while(max > 0.0001){   
      #pragma omp parallel
      {
          max = 0;
          #pragma omp for    
          for (int i = 1; i < n-1; i++)
             for (int j = 1; j < n-1; j++)
                w_new[i][j]=(float)((w[i+1][j]+w[i-1][j]+w[i][j+1]+w[i][j-1])/4);
        
          #pragma omp for reduction(max: max)
          for (int i = 1; i < n-1; i++){
             for (int j = 1; j < n-1; j++){
               if (max < fabs(w_new[i][j]-w[i][j]))
                  max = fabs(w_new[i][j]-w[i][j]);
             }
          }       
          #pragma omp for nowait
          for (int i = 0; i < n; i++)
              for (int j = 0; j < n; j++)
                 w[i][j]=w_new[i][j];
      }
      counter++;
    }

    printf("Counter is: %d
", counter);
    return 0;
}

Output:

Counter is: 4894

I was thinking of the basic parallel for construct inside the while, but my counter variable is expected to be around 5000. So I would like that to be in parallel.

That is possible but will require some changes. I explain them in the code. Actually we can use a single parallel region for the entire code:

#include 
#include 
#include 
#define n 100

int main()
{
   float w[n][n],w_new[n][n]={0},max=100;
   int counter = 0;
   #pragma omp parallel
   {
      #pragma omp for
      for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
             w[i][j]=75;

      #pragma omp for
      for (int i = 0; i < n; i++){
        w[0][i]=0;
        w[n-1][i]=100;
        w[i][0]=100;
        w[i][n-1]=100;

        w_new[0][i]=0;
        w_new[n-1][i]=100;
        w_new[i][0]=100;
        w_new[i][n-1]=100;
     }

     w[0][n-1]=100;
     w_new[0][n-1]=100;

     
    while(max > 0.0001){
       // We need this barrier to ensure that we don't have a
       // race condition between the master updating max to zero
       // and the other threads reading max > 0.0001 
       #pragma omp barrier
       #pragma omp master
       #pragma omp atomic write
        max = 0;
       #pragma omp barrier
       
        #pragma omp for reduction(max:max)
        for (int i = 1; i < n-1; i++){
           for (int j = 1; j < n-1; j++){
               w_new[i][j]=(float)((w[i+1][j]+w[i-1][j]+w[i][j+1]+w[i][j-1])/4);
               if (max < fabs(w_new[i][j]-w[i][j]))
                   max = fabs(w_new[i][j]-w[i][j]);
           }
        }      
          
        #pragma omp for nowait
        for (int i = 0; i < n; i++)
            for (int j = 0; j < n; j++)
               w[i][j]=w_new[i][j];
          
        #pragma omp master
        counter++;
     } 
   }
   printf("Counter is: %d
", counter);
   return 0;
}

I tested superficially, and for 4 cores this version had a speedup of 2.8x for an input of 1000 nothing too great. Further improvements can be done by trying to vectorize parts of it.

How to parallelise a while loop that has iterations on a matrix with OpenMP?

Answers (1)

Related Questions