Reputation: 61389
I have a Model class that holds data for a model and runs several functions on that data. The details are probably not too important except that it has the following design:
A MWE of the class appears as follows:
#include <cstdlib>
class Model {
private:
int width;
int height;
int size;
int nshift[8]; //Offset from a focal cell's index to its neighbours
double *restrict h; //Digital elevation model (height)
int *restrict rec; //Index of receiving cell
const int NO_FLOW = -1;
const double SQRT2 = 1.414213562373095048801688724209698078569671875376948;
const double dr[8] = {1,SQRT2,1,SQRT2,1,SQRT2,1,SQRT2};
private:
void GenerateRandomTerrain(){
//srand(std::random_device()());
for(int y=0;y<height;y++)
for(int x=0;x<width;x++){
const int c = y*width+x;
h[c] = rand()/(double)RAND_MAX;
}
}
public:
Model(const int width0, const int height0)
: nshift{-1,-width0-1,-width0,-width0+1,1,width0+1,width0,width0-1}
{
width = width0;
height = height0;
size = width*height;
h = new double[size];
GenerateRandomTerrain();
}
~Model(){
delete[] h;
}
private:
void FindDownstream(){
//! computing receiver array
#pragma acc parallel loop collapse(2) independent present(h,rec,width,height)
for(int y=2;y<height-2;y++)
for(int x=2;x<width-2;x++){
const int c = y*width+x;
//The slope must be greater than zero for there to be downhill flow;
//otherwise, the cell is marekd NO_FLOW
double max_slope = 0;
int max_n = NO_FLOW;
#pragma acc loop seq
for(int n=0;n<8;n++){
double slope = (h[c] - h[c+nshift[n]])/dr[n];
if(slope>max_slope){
max_slope = slope;
max_n = n;
}
}
rec[c] = max_n;
}
}
public:
void run(const int nstep){
rec = new int[size];
#pragma acc enter data copyin(h[0:size],nshift[0:8],height,width,this) create(rec[0:size])
for(int step=0;step<=nstep;step++)
FindDownstream();
#pragma acc exit data copyout(h[0:size]) delete(this,rec)
delete[] rec;
}
};
int main(int argc, char **argv){
Model model(300,300);
model.run(100);
return 0;
}
When I compile with:
pgc++ -acc -ta=tesla,pinned,cc60 -Minfo=accel -fast test.cpp -std=c++11
I get the following warning:
51, Loop without integer trip count will be executed in sequential mode
Complex loop carried dependence of rec->,nshift prevents parallelization
Loop carried dependence of rec-> prevents parallelization
Loop carried backward dependence of rec-> prevents vectorization
Some digging on the internet reveals that a typical cause of this is the potential for pointer aliasing to cause dependencies.
I have tried to use *restrict
and independent
(as shown) to tell the compiler everything is alright, but it ignores me and does not parallelize the loop.
Passing pointers as arguments to the function with appropriate use of restrict
eliminates the error, but I have an aesthetic preference against this design. Alternatively, all the methods, which are each, essentially, a kernel, could be strung together in the run()
function, but again, this is not desirable.
If I use independent
on the inner loop, I get:
PGCC-W-0155-inner loop of tiled/collapsed loop nest should not have another loop directive (actual_code.cpp: 227)
But the loop does appear to parallelize.
I am compiling with PGI 17.9.
Upvotes: 3
Views: 1071
Reputation: 5646
The problem here is that "height" and "width" are class data members. Hence the compiler must assume that they may have external references to them and therefore could change values during the execution of these loops.
The solution is to copy the values to local variables and then use the local variables as the loop bounds.
Note that since you have "collapse(2)" on the outer loop, the "independent" clause already applies to both loops. (Though, "independent" is default for "parallel" compute regions so is unneeded.) A second "loop" construct is not allowed when collapsing multiple loops.
% cat test.cpp
#include <cstdlib>
class Model {
private:
int width;
int height;
int size;
int nshift[8]; //Offset from a focal cell's index to its neighbours
double *restrict h; //Digital elevation model (height)
int *restrict rec; //Index of receiving cell
const int NO_FLOW = -1;
const double SQRT2 = 1.414213562373095048801688724209698078569671875376948;
const double dr[8] = {1,SQRT2,1,SQRT2,1,SQRT2,1,SQRT2};
private:
void GenerateRandomTerrain(){
//srand(std::random_device()());
for(int y=0;y<height;y++)
for(int x=0;x<width;x++){
const int c = y*width+x;
h[c] = rand()/(double)RAND_MAX;
}
}
public:
Model(const int width0, const int height0) : nshift{-1,-width0-1,-width0,-width0+1,1,width0+1,width0,width0-1}
{
width = width0;
height = height0;
size = width*height;
h = new double[size];
GenerateRandomTerrain();
}
~Model(){
delete[] h;
}
private:
void FindDownstream(){
//! computing receiver array
int hgt = height;
int wdt = width;
#pragma acc parallel loop collapse(2) present(h,rec)
for(int y=2;y<hgt-2;y++)
for(int x=2;x<wdt-2;x++){
const int c = y*wdt+x;
//The slope must be greater than zero for there to be downhill flow;
//otherwise, the cell is marekd NO_FLOW
double max_slope = 0;
int max_n = NO_FLOW;
#pragma acc loop seq
for(int n=0;n<8;n++){
double slope = (h[c] - h[c+nshift[n]])/dr[n];
if(slope>max_slope){
max_slope = slope;
max_n = n;
}
}
rec[c] = max_n;
}
}
public:
void run(const int nstep){
rec = new int[size];
#pragma acc enter data copyin(this,h[0:size],nshift[0:8]) create(rec[0:size])
for(int step=0;step<=nstep;step++)
FindDownstream();
#pragma acc exit data copyout(h[0:size]) delete(rec,nshift,this)
delete[] rec;
}
};
int main(int argc, char **argv){
Model model(300,300);
model.run(100);
return 0;
}
% pgc++ test.cpp -w --c++11 -Minfo=accel -ta=tesla:cc60 -V17.10; a.out
Model::FindDownstream():
49, Generating present(h[:])
Accelerator kernel generated
Generating Tesla code
51, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
52, /* blockIdx.x threadIdx.x collapsed */
61, #pragma acc loop seq
49, Generating implicit copy(this[:])
Generating present(rec[:])
61, Loop carried scalar dependence for max_slope at line 63
Model::run(int):
74, Generating enter data copyin(nshift[:],h[:size])
Generating enter data create(rec[:size])
Generating enter data copyin(this[:1])
83, Generating exit data delete(this[:1],rec[:1])
Generating exit data copyout(h[:size])
Generating exit data delete(nshift[:])
Upvotes: 1