glc78
glc78

Reputation: 439

OpenMP - How to parallelize loop with carried dependecies

What is the best way to parallelize this nested loop with carried dependences? Konwing that it's inside a function that I call n-times from main.

[edit]

    funct(unsigned char*** grid, int n) {

#   pragma omp parallel for num_threads(thread_count) default(none) \
    shared(grid, n, cur) private(i, j)

        for(i = 1; i <= n+1; i++) {
            for(j = 1; j <= n; j++) {
                if(grid[cur][i-1][j] == 2 && grid[cur][i][j] == 0) {
                    grid[1-cur][i-1][j] = 0;
                    grid[1-cur][i][j] = 2;
                }
                else {
                    grid[1-cur][i][j] = grid[cur][i][j];
                }
            }
        }
    }

Main:

cur = 0;

for(s = 0; s < steps; s++) {
    funct(grid, N);
    cur = 1-cur;
    funct_2(grid, N)
    cur = 1-cur
}

This code runs without errors but returns incorrect result (.ppm file).

Upvotes: 1

Views: 819

Answers (1)

Piotr Skotnicki
Piotr Skotnicki

Reputation: 48487

Loop nest optimization/parallelization is the subject of my PhD studies. The below code was generated automatically using my own optimizing compiler. It involves a tiling transformation along with parallel execution of extracted (tile-wise) synchronization-free slices.

#define min(x,y)    ((x) < (y) ? (x) : (y))
#define floord(n,d) (((n)<0) ? -((-(n)+(d)-1)/(d)) : (n)/(d))
#pragma scop
#pragma omp parallel for
for (register int ir0 = 0; ir0 <= floord(n, 32); ir0 += 1) {
  for (register int ir1 = 0; ir1 <= floord(n - 1, 32); ir1 += 1) {
    if (ir0 == 0) {
      for (register int i1 = 32 * ir1 + 1; i1 <= min(n, 32 * ir1 + 32); i1 += 1) {
        grid[-cur + 1][0][i1] = (((grid[cur][0][i1] == 2) && (grid[cur][1][i1] == 0)) ? 0 : grid[-cur + 1][0][i1]);
      }
    }
    if (n >= 32 * ir0 + 1) {
      for (register int ii0 = ir0; ii0 <= min(ir0 + 1, n / 32); ii0 += 1) {
        for (register int i0 = 32 * ii0 + 1; i0 <= min(n + 1, 31 * ir0 + ii0 + 32); i0 += 1) {
          for (register int i1 = 32 * ir1 + 1; i1 <= min(n, 32 * ir1 + 32); i1 += 1) {
            if (i0 >= 32 * ir0 + 2) {
              grid[-cur + 1][i0 - 1][i1] = (((grid[cur][i0 - 1][i1] == 2) && (grid[cur][i0][i1] == 0)) ? 0 : grid[-cur + 1][i0 - 1][i1]);
            }
            if (ii0 == ir0) {
              grid[-cur + 1][i0][i1] = (((grid[cur][i0 - 1][i1] == 2) && (grid[cur][i0][i1] == 0)) ? 2 : grid[cur][i0][i1]);
            }
          }
        }
      }
    } else {
      for (register int i1 = 32 * ir1 + 1; i1 <= 32 * ir1 + 32; i1 += 1) {
        grid[-cur + 1][n + 1][i1] = (((grid[cur][n][i1] == 2) && (grid[cur][n + 1][i1] == 0)) ? 2 : grid[cur][n + 1][i1]);
      }
    }
  }
}
#pragma endscop

I tested the code with 8 threads operating on grid[2][N][N], where N = { 2500, 5500 }. I obtained a 3x speedup compared to serial execution of your original code.

Upvotes: 1

Related Questions