Reputation: 1
I saw the piece of code below in a forum, but when I try to compile it I get some errors. I want to parallelize the region from #pragma scop
up to #pragma endscop
.
/* Main computational kernel. The whole function will be timed,
   including the call and return. */
static
void kernel_fdtd_2d(int tmax,
                    int nx,
                    int ny,
                    DATA_TYPE POLYBENCH_2D(ex,NX,NY,nx,ny),
                    DATA_TYPE POLYBENCH_2D(ey,NX,NY,nx,ny),
                    DATA_TYPE POLYBENCH_2D(hz,NX,NY,nx,ny),
                    DATA_TYPE POLYBENCH_1D(_fict_,TMAX,tmax))
{
  int t, i, j;

#pragma scop
  #pragma omp parallel private (t,i,j)
  {
    #pragma omp master
    {
      for (t = 0; t < _PB_TMAX; t++)
      {
        #pragma omp for
        for (j = 0; j < _PB_NY; j++)
          ey[0][j] = _fict_[t];
        #pragma omp barrier

        #pragma omp for collapse(2) schedule(static)
        for (i = 1; i < _PB_NX; i++)
          for (j = 0; j < _PB_NY; j++)
            ey[i][j] = ey[i][j] - 0.5*(hz[i][j]-hz[i-1][j]);
        #pragma omp barrier

        #pragma omp for collapse(2) schedule(static)
        for (i = 0; i < _PB_NX; i++)
          for (j = 1; j < _PB_NY; j++)
            ex[i][j] = ex[i][j] - 0.5*(hz[i][j]-hz[i][j-1]);
        #pragma omp barrier

        #pragma omp for collapse(2) schedule(static)
        for (i = 0; i < _PB_NX - 1; i++)
          for (j = 0; j < _PB_NY - 1; j++)
            hz[i][j] = hz[i][j] - 0.7* (ex[i][j+1] - ex[i][j] + ey[i+1][j] - ey[i][j]);
        #pragma omp barrier
      }
    }
  }
#pragma endscop
}
int main(int argc, char** argv)
{
  /* Retrieve problem size. */
  int tmax = TMAX;
  int nx = NX;
  int ny = NY;

  /* Variable declaration/allocation. */
  POLYBENCH_2D_ARRAY_DECL(ex,DATA_TYPE,NX,NY,nx,ny);
  POLYBENCH_2D_ARRAY_DECL(ey,DATA_TYPE,NX,NY,nx,ny);
  POLYBENCH_2D_ARRAY_DECL(hz,DATA_TYPE,NX,NY,nx,ny);
  POLYBENCH_1D_ARRAY_DECL(_fict_,DATA_TYPE,TMAX,tmax);

  /* Initialize array(s). */
  init_array (tmax, nx, ny,
              POLYBENCH_ARRAY(ex),
              POLYBENCH_ARRAY(ey),
              POLYBENCH_ARRAY(hz),
              POLYBENCH_ARRAY(_fict_));

  /* Start timer. */
  polybench_start_instruments;

  /* Run kernel. */
  kernel_fdtd_2d (tmax, nx, ny,
                  POLYBENCH_ARRAY(ex),
                  POLYBENCH_ARRAY(ey),
                  POLYBENCH_ARRAY(hz),
                  POLYBENCH_ARRAY(_fict_));

  /* Stop and print timer. */
  polybench_stop_instruments;
  polybench_print_instruments;

  /* Prevent dead-code elimination. All live-out data must be printed
     by the function call in argument. */
  polybench_prevent_dce(print_array(nx, ny, POLYBENCH_ARRAY(ex),
                                    POLYBENCH_ARRAY(ey),
                                    POLYBENCH_ARRAY(hz)));

  /* Be clean. */
  POLYBENCH_FREE_ARRAY(ex);
  POLYBENCH_FREE_ARRAY(ey);
  POLYBENCH_FREE_ARRAY(hz);
  POLYBENCH_FREE_ARRAY(_fict_);

  return 0;
}
The errors look like this:
stencils/fdtd-2d/fdtd-2dp.c:80:9: error: work-sharing region may not be closely nested inside of work-sharing, critical, ordered, master or explicit task region
#pragma omp for
^
stencils/fdtd-2d/fdtd-2dp.c:83:9: error: barrier region may not be closely nested inside of work-sharing, critical, ordered, master or explicit task region
#pragma omp barrier
^
stencils/fdtd-2d/fdtd-2dp.c:84:9: error: work-sharing region may not be closely nested inside of work-sharing, critical, ordered, master or explicit task region
#pragma omp for collapse(2) schedule(static)
^
stencils/fdtd-2d/fdtd-2dp.c:88:9: error: barrier region may not be closely nested inside of work-sharing, critical, ordered, master or explicit task region
#pragma omp barrier
^
stencils/fdtd-2d/fdtd-2dp.c:89:9: error: work-sharing region may not be closely nested inside of work-sharing, critical, ordered, master or explicit task region
#pragma omp for collapse(2) schedule(static)
^
stencils/fdtd-2d/fdtd-2dp.c:93:9: error: barrier region may not be closely nested inside of work-sharing, critical, ordered, master or explicit task region
#pragma omp barrier
^
stencils/fdtd-2d/fdtd-2dp.c:94:9: error: work-sharing region may not be closely nested inside of work-sharing, critical, ordered, master or explicit task region
#pragma omp for collapse(2) schedule(static)
^
stencils/fdtd-2d/fdtd-2dp.c:98:9: error: barrier region may not be closely nested inside of work-sharing, critical, ordered, master or explicit task region
#pragma omp barrier
^
Any help on how I can get this to compile would be appreciated.
Upvotes: 0
Views: 321
Reputation: 4623
Honestly, this is pretty poor OpenMP code. It does not consider data usage throughout the algorithm. What you probably want is:
int t, i, j;
#pragma omp parallel private (t,i,j)
{
  for (t = 0; t < _PB_TMAX; t++)
  {
    #pragma omp for nowait
    for (j = 0; j < _PB_NY; j++)
      ey[0][j] = _fict_[t];

    #pragma omp for collapse(2) nowait schedule(static)
    for (i = 1; i < _PB_NX; i++)
      for (j = 0; j < _PB_NY; j++)
        ey[i][j] = ey[i][j] - 0.5*(hz[i][j]-hz[i-1][j]);

    #pragma omp for collapse(2) schedule(static)
    for (i = 0; i < _PB_NX; i++)
      for (j = 1; j < _PB_NY; j++)
        ex[i][j] = ex[i][j] - 0.5*(hz[i][j]-hz[i][j-1]);
    // #pragma omp barrier  <- implicit if nowait is not specified

    #pragma omp for collapse(2) schedule(static)
    for (i = 0; i < _PB_NX - 1; i++)
      for (j = 0; j < _PB_NY - 1; j++)
        hz[i][j] = hz[i][j] - 0.7*(ex[i][j+1] - ex[i][j] + ey[i+1][j] - ey[i][j]);
    // #pragma omp barrier  <- implicit if nowait is not specified
  }
}
The explicit barriers should be removed because each omp for already ends with an implicit barrier when nowait is not specified. Furthermore, I believe the first two barriers can be eliminated entirely, because there is no thread dependence between the first three loops -- if a thread finishes its portion of one loop and immediately starts its portion of the next, there is no chance of a race condition. You can add the nowait clause to override the implicit barrier at the end of an omp for directive, which is what the code above does for the first two loops.
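To make the barrier semantics concrete, here is a minimal standalone sketch (hypothetical arrays a and b of size N -- not part of the benchmark) of how nowait and the implicit barrier interact:

#include <omp.h>

#define N 1000

/* Sketch: the first loop takes nowait because the second loop touches
   different data; the second loop keeps its implicit barrier because the
   third loop reads values it wrote, possibly from another thread's chunk. */
void barrier_demo(double a[N], double b[N])
{
    int i;
    #pragma omp parallel private(i)
    {
        #pragma omp for nowait    /* no dependence on the next loop, so skip the barrier */
        for (i = 0; i < N; i++)
            a[i] = 2.0 * i;

        #pragma omp for           /* implicit barrier at the end of this loop... */
        for (i = 0; i < N; i++)
            b[i] = (double)i;

        #pragma omp for           /* ...so every b[i] is ready before any thread reads it here */
        for (i = 1; i < N; i++)
            a[i] += b[i - 1];
    }
}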
Finally, if _PB_NX and _PB_NY are large-ish, then you are very unlikely to gain any benefit from collapsing the nested loops. I would imagine that removing the collapse(2) could slightly improve the performance of the overall function.
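For instance, dropping collapse(2) from the last loop nest (a sketch of the same code, not a measured result) leaves only the outer i loop work-shared, which is usually plenty of parallelism when _PB_NX is large compared to the thread count:

    #pragma omp for schedule(static)
    for (i = 0; i < _PB_NX - 1; i++)     /* outer loop split among threads */
      for (j = 0; j < _PB_NY - 1; j++)   /* inner loop runs sequentially within each thread */
        hz[i][j] = hz[i][j] - 0.7*(ex[i][j+1] - ex[i][j] + ey[i+1][j] - ey[i][j]);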
Hope this helps.
Upvotes: 1
Reputation: 11
Remove the #pragma omp master
directive from your code. That will fix the compilation issue. You probably don't want to run that block only in the master thread anyway, because then you would not get any performance benefit from using OpenMP.
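For reference, a minimal sketch of the region once the master directive and its braces are gone -- every thread then executes the time loop and encounters the worksharing constructs together, which is what the omp for directives require:

#pragma omp parallel private (t,i,j)
{
  /* All threads run the t loop redundantly; each omp for inside splits
     the corresponding spatial loop among the team. */
  for (t = 0; t < _PB_TMAX; t++)
  {
    #pragma omp for
    for (j = 0; j < _PB_NY; j++)
      ey[0][j] = _fict_[t];
    /* ...the remaining three loop nests stay exactly as posted; the explicit
       barriers become legal but redundant, since each omp for already ends
       with an implicit barrier. */
  }
}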
Upvotes: 0