I know the basics of OpenMP, and I know that in order to parallelize a for loop its iterations must not depend on previous iterations. One can also use reductions, but they support only basic operators such as +, -, /, *, &&, and ||.
How can I make this loop parallel?
for (i = 1; i < n; ++i) {
    for (j = 1; j < n; ++j) {
        // left
        if (res[i][j - 1] != res[i][j]) {
            cmin2[i][j][0] = min(cmin2_res[i][j - 1][0] + 1, cmin[i][j][0]);
            cmin2_res[i][j][0] = min(cmin2[i][j - 1][0] + 1, cmin_res[i][j][0]);
        } else {
            cmin2[i][j][0] = min(cmin2[i][j - 1][0] + 1, cmin[i][j][0]);
            cmin2_res[i][j][0] = min(cmin2_res[i][j - 1][0] + 1, cmin_res[i][j][0]);
        }
        // up
        if (res[i - 1][j] != res[i][j]) {
            cmin2[i][j][0] = min3(cmin2[i][j][0], cmin2_res[i - 1][j][0] + 1, cmin[i][j][1]);
            cmin2_res[i][j][0] = min3(cmin2_res[i][j][0], cmin2[i - 1][j][0] + 1, cmin_res[i][j][1]);
        } else {
            cmin2[i][j][0] = min3(cmin2[i][j][0], cmin2[i - 1][j][0] + 1, cmin[i][j][1]);
            cmin2_res[i][j][0] = min3(cmin2_res[i][j][0], cmin2_res[i - 1][j][0] + 1, cmin_res[i][j][1]);
        }
    }
}
My question is rather how I can decompose this loop so that it can run in parallel (and maybe use reductions, if possible; see the sketch at the end of this question). The problem is that at each iteration the operations must be done in this order, because I have three more groups of loops like this one.
P.S. min and min3 are macros.
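For context, this is the kind of reduction I mean; a minimal, generic sketch, unrelated to the loop above (note that OpenMP 3.1 and later also accept min and max as reduction operators):
#include <stdio.h>

int main(void) {
    int a[100];
    for (int i = 0; i < 100; i++)
        a[i] = 100 - i;

    int sum = 0, lo = a[0];
    /* each thread gets private copies of sum and lo, which are
       combined with + and min when the loop finishes */
    #pragma omp parallel for reduction(+:sum) reduction(min:lo)
    for (int i = 0; i < 100; i++) {
        sum += a[i];
        if (a[i] < lo)
            lo = a[i];
    }
    printf("sum = %d, min = %d\n", sum, lo);
    return 0;
}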
There's a brute-force way to do what you want, but a better parallelization will require a little more information about what you want in and out of the routines.
The data dependencies in your loop look like this, in i-j space:
    i →
  ..........
j .....1....
↓ ....12....
  ...123....
where the value at a point marked 3 depends on the values at the points marked 2, and those depend on the values at the points marked 1, etc. Because of this diagonal structure, you can re-order the loops to traverse the grid diagonally; e.g., the first iteration is over (0,1), (1,0), then over (0,2), (1,1), (2,0), and so on. A simplified version of your problem looks like the code below:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
int **int2darray(int n, int m);
void free2darray(int **array);
void init2darray(int **array, int n, int m);
void tick(struct timeval *timer);
double tock(struct timeval *timer);
int main(int argc, char **argv) {
    const int N=10000;
    int **serialarr, **omparr;
    struct timeval serialtimer, omptimer;
    double serialtime, omptime;

    serialarr = int2darray(N,N);
    omparr = int2darray(N,N);
    init2darray(serialarr, N, N);
    init2darray(omparr, N, N);

    /* serial calculation */
    tick(&serialtimer);
    for (int i=1; i<N; i++)
        for (int j=1; j<N; j++)
            serialarr[i][j] = serialarr[i-1][j] + serialarr[i][j-1];
    serialtime = tock(&serialtimer);

    /* omp */
    tick(&omptimer);
    #pragma omp parallel shared(omparr) default(none)
    {
        /* upper-left triangle: anti-diagonals i+j = 2 .. N */
        for (int ipj=1; ipj<=N; ipj++) {
            #pragma omp for
            for (int j=1; j<ipj; j++) {
                int i = ipj - j;
                omparr[i][j] = omparr[i-1][j] + omparr[i][j-1];
            }
        }
        /* lower-right triangle: anti-diagonals i+j = N+1 .. 2N-2 */
        for (int ipj=N+1; ipj<2*N-1; ipj++) {
            #pragma omp for
            for (int j=ipj-N+1; j<N; j++) {
                int i = ipj - j;
                omparr[i][j] = omparr[i-1][j] + omparr[i][j-1];
            }
        }
    }
    omptime = tock(&omptimer);

    /* compare results */
    int abserr = 0;
    for (int i=0; i<N; i++)
        for (int j=0; j<N; j++)
            abserr += abs(omparr[i][j] - serialarr[i][j]);

    printf("Difference between serial and OMP array: %d\n", abserr);
    printf("Serial time = %lf\n", serialtime);
    printf("OMP time = %lf\n", omptime);

    free2darray(omparr);
    free2darray(serialarr);
    return 0;
}

int **int2darray(int n, int m) {
    /* allocate an n x m array with contiguous storage */
    int *data = malloc(n*m*sizeof(int));
    int **array = malloc(n*sizeof(int*));
    for (int i=0; i<n; i++)
        array[i] = &(data[i*m]);
    return array;
}

void free2darray(int **array) {
    free(array[0]);
    free(array);
}

void init2darray(int **array, int n, int m) {
    for (int i=0; i<n; i++)
        for (int j=0; j<m; j++)
            array[i][j] = i*m+j;
}

void tick(struct timeval *timer) {
    gettimeofday(timer, NULL);
}

double tock(struct timeval *timer) {
    struct timeval now;
    gettimeofday(&now, NULL);
    return (now.tv_usec-timer->tv_usec)/1.0e6 + (now.tv_sec - timer->tv_sec);
}
Running gives:
$ gcc -fopenmp -Wall -O2 loops.c -o loops -std=c99
$ export OMP_NUM_THREADS=8
$ ./loops
Difference between serial and OMP array: 0
Serial time = 0.246649
OMP time = 0.174936
You'll notice the speedup is pretty poor, even with large N, because the amount of computation per iteration is small, it's the inner loop that's parallelized, and we're going through memory in a weird, cache-unfriendly order.
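One standard refinement, if this pattern matters for your performance, is to sweep over anti-diagonals of BxB tiles instead of individual cells: each thread then works on a contiguous block in row-major order, and the work per task grows from one cell to B*B cells. Here's a sketch, meant as a drop-in replacement for the /* omp */ section above; B is a tile size you'd tune, nt is a name I've introduced for the tile count, and I haven't benchmarked this:
const int B = 128;          /* tile size; tune for your cache */
int nt = (N - 2)/B + 1;     /* tiles per dimension, covering indices 1..N-1 */

#pragma omp parallel shared(omparr, nt)
{
    /* tiles on the same anti-diagonal d = ti + tj are independent;
       the implicit barrier at the end of each "omp for" finishes
       diagonal d before diagonal d+1 starts */
    for (int d = 0; d <= 2*(nt-1); d++) {
        #pragma omp for
        for (int ti = 0; ti <= d; ti++) {
            int tj = d - ti;
            if (ti >= nt || tj >= nt) continue;   /* off the grid of tiles */
            int ilo = 1 + ti*B, ihi = (ilo + B < N) ? ilo + B : N;
            int jlo = 1 + tj*B, jhi = (jlo + B < N) ? jlo + B : N;
            for (int i = ilo; i < ihi; i++)
                for (int j = jlo; j < jhi; j++)
                    omparr[i][j] = omparr[i-1][j] + omparr[i][j-1];
        }
    }
}
This keeps the same wavefront dependence structure, but the synchronization cost is paid once per tile diagonal rather than once per cell diagonal, and each tile is traversed in a cache-friendly order.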
Some of the above could probably be fixed along those lines, but it would help to know more about what you're trying to do; e.g., do you care about the cmin2_res arrays, or are they just intermediate products? In words, what are you trying to calculate?