Richard
Richard

Reputation: 61389

Loop carried dependence of `->` prevents parallelization

I have a Model class that holds data for a model and runs several functions on that data. The details are probably not too important except that it has the following design:

An MWE (minimal working example) of the class appears as follows:

#include <cstdlib>


//Model holds a width x height grid of elevations (h), stored row-major
//(cell index c = y*width + x), and computes for every interior cell which of
//its eight neighbours has the steepest downhill slope (rec).
//
//h is allocated in the constructor and freed in the destructor; rec is
//allocated and freed inside run().
//NOTE(review): no copy constructor or copy assignment is declared, so a copied
//Model would hold the same h pointer as the original.
class Model {
 private:
  int width;   //Number of columns in the grid
  int height;  //Number of rows in the grid
  int size;    //Total number of cells: width*height

  //Neighbour n of cell c is at index c+nshift[n]. With c = y*width+x the eight
  //entries are, in order: (x-1,y), (x-1,y-1), (x,y-1), (x+1,y-1), (x+1,y),
  //(x+1,y+1), (x,y+1), (x-1,y+1).
  int    nshift[8];      //Offset from a focal cell's index to its neighbours
  double *restrict h;    //Digital elevation model (height)
  //NOTE(review): FindDownstream() stores the neighbour number n (0-7) or
  //NO_FLOW in rec, not the receiving cell's array index.
  int    *restrict rec;  //Index of receiving cell

  //Value written to rec for a cell that has no strictly lower neighbour
  const int NO_FLOW  = -1;
  const double SQRT2 = 1.414213562373095048801688724209698078569671875376948;
  //Divisor applied to the height difference for neighbour n, in the same order
  //as nshift: 1 for the four edge-sharing neighbours, SQRT2 for the diagonals
  const double dr[8] = {1,SQRT2,1,SQRT2,1,SQRT2,1,SQRT2};  

 private:
  //Fills every cell of h with rand()/RAND_MAX. The srand() call is commented
  //out, so no seed is set here.
  void GenerateRandomTerrain(){
    //srand(std::random_device()());
    for(int y=0;y<height;y++)
    for(int x=0;x<width;x++){
      const int c = y*width+x;
      h[c]  = rand()/(double)RAND_MAX;
    }
  }  


 public:
  //Builds a width0 x height0 model: initialises the neighbour offsets for that
  //row width, allocates h and fills it with random terrain. rec is not
  //allocated here.
  Model(const int width0, const int height0)
    : nshift{-1,-width0-1,-width0,-width0+1,1,width0+1,width0,width0-1}
  {
    width  = width0;
    height = height0;
    size   = width*height;

    h      = new double[size];

    GenerateRandomTerrain();
  }

  //Releases h. rec is not touched here; run() allocates and frees it.
  ~Model(){
    delete[] h;
  }

 private:
  //For each cell with 2<=y<height-2 and 2<=x<width-2, writes to rec[c] the
  //number n of the neighbour with the largest positive slope, or NO_FLOW when
  //no neighbour is strictly lower. Cells in the outer two rows and columns are
  //skipped, so their rec entries are never written and every neighbour read
  //stays inside h.
  void FindDownstream(){
    //! computing receiver array
    //The loop bounds below read the data members height and width directly.
    #pragma acc parallel loop collapse(2) independent present(h,rec,width,height)
    for(int y=2;y<height-2;y++)
    for(int x=2;x<width-2;x++){
      const int c      = y*width+x;

      //The slope must be greater than zero for there to be downhill flow;
      //otherwise, the cell is marked NO_FLOW
      double max_slope = 0;
      int    max_n     = NO_FLOW;

      //Scan the eight neighbours in order. The comparison is strict, so when
      //two neighbours tie the one with the lower n is kept.
      #pragma acc loop seq
      for(int n=0;n<8;n++){
        //Positive when the neighbour is lower than the focal cell
        double slope = (h[c] - h[c+nshift[n]])/dr[n];
        if(slope>max_slope){
          max_slope = slope;
          max_n     = n;
        }
      }
      rec[c] = max_n;
    }    
  }

 public:
  //Allocates rec, issues the OpenACC "enter data" directive, calls
  //FindDownstream() repeatedly, issues the "exit data" directive and frees rec.
  //nstep: the loop condition is step<=nstep, so FindDownstream() is called
  //nstep+1 times when nstep>=0.
  void run(const int nstep){
    rec    = new int[size];

    //Data clauses: copyin for h, nshift, height, width and this; create for rec
    #pragma acc enter data copyin(h[0:size],nshift[0:8],height,width,this) create(rec[0:size])

    for(int step=0;step<=nstep;step++)
      FindDownstream();

    //Data clauses: copyout for h; delete for this and rec
    #pragma acc exit data copyout(h[0:size]) delete(this,rec)

    //rec is not reset after this, so it holds a freed pointer until the next
    //call to run()
    delete[] rec;
  }

};

int main(int argc, char **argv){
  Model model(300,300);
  model.run(100);

  return 0;
}

When I compile with:

pgc++ -acc -ta=tesla,pinned,cc60 -Minfo=accel  -fast test.cpp -std=c++11

I get the following warning:

 51, Loop without integer trip count will be executed in sequential mode
     Complex loop carried dependence of rec->,nshift prevents parallelization
     Loop carried dependence of rec-> prevents parallelization
     Loop carried backward dependence of rec-> prevents vectorization

Some digging on the internet reveals that a typical cause of this is the potential for pointer aliasing to cause dependencies.

I have tried to use *restrict and independent (as shown) to tell the compiler everything is alright, but it ignores me and does not parallelize the loop.

Passing the pointers as arguments to the function, with appropriate use of `restrict`, eliminates the warning, but I have an aesthetic preference against this design. Alternatively, all the methods, each of which is essentially a kernel, could be strung together in the `run()` function, but again, this is not desirable.

If I use independent on the inner loop, I get:

PGCC-W-0155-inner loop of tiled/collapsed loop nest should not have another loop directive (actual_code.cpp: 227)

But the loop does appear to parallelize.

I am compiling with PGI 17.9.

Upvotes: 3

Views: 1071

Answers (1)

Mat Colgrove
Mat Colgrove

Reputation: 5646

The problem here is that "height" and "width" are class data members. Hence the compiler must assume that they may have external references to them and therefore could change values during the execution of these loops.

The solution is to copy the values to local variables and then use the local variables as the loop bounds.

Note that since you have "collapse(2)" on the outer loop, the "independent" clause already applies to both loops. (Though, "independent" is default for "parallel" compute regions so is unneeded.) A second "loop" construct is not allowed when collapsing multiple loops.

% cat test.cpp
#include <cstdlib>


//Model holds a width x height grid of elevations (h), stored row-major
//(cell index c = y*width + x), and computes for every interior cell which of
//its eight neighbours has the steepest downhill slope (rec).
//
//h is allocated in the constructor and freed in the destructor; rec is
//allocated and freed inside run().
//NOTE(review): no copy constructor or copy assignment is declared, so a copied
//Model would hold the same h pointer as the original.
class Model {
 private:
  int width;   //Number of columns in the grid
  int height;  //Number of rows in the grid
  int size;    //Total number of cells: width*height

  //Neighbour n of cell c is at index c+nshift[n]. With c = y*width+x the eight
  //entries are, in order: (x-1,y), (x-1,y-1), (x,y-1), (x+1,y-1), (x+1,y),
  //(x+1,y+1), (x,y+1), (x-1,y+1).
  int    nshift[8];      //Offset from a focal cell's index to its neighbours
  double *restrict h;    //Digital elevation model (height)
  //NOTE(review): FindDownstream() stores the neighbour number n (0-7) or
  //NO_FLOW in rec, not the receiving cell's array index.
  int    *restrict rec;  //Index of receiving cell

  //Value written to rec for a cell that has no strictly lower neighbour
  const int NO_FLOW  = -1;
  const double SQRT2 = 1.414213562373095048801688724209698078569671875376948;
  //Divisor applied to the height difference for neighbour n, in the same order
  //as nshift: 1 for the four edge-sharing neighbours, SQRT2 for the diagonals
  const double dr[8] = {1,SQRT2,1,SQRT2,1,SQRT2,1,SQRT2};

 private:
  //Fills every cell of h with rand()/RAND_MAX. The srand() call is commented
  //out, so no seed is set here.
  void GenerateRandomTerrain(){
    //srand(std::random_device()());
    for(int y=0;y<height;y++)
    for(int x=0;x<width;x++){
      const int c = y*width+x;
      h[c]  = rand()/(double)RAND_MAX;
    }
  }


 public:
  //Builds a width0 x height0 model: initialises the neighbour offsets for that
  //row width, allocates h and fills it with random terrain. rec is not
  //allocated here.
  Model(const int width0, const int height0) : nshift{-1,-width0-1,-width0,-width0+1,1,width0+1,width0,width0-1}
  {
    width  = width0;
    height = height0;
    size   = width*height;

    h      = new double[size];

    GenerateRandomTerrain();
  }

  //Releases h. rec is not touched here; run() allocates and frees it.
  ~Model(){
    delete[] h;
  }

 private:
  //For each cell with 2<=y<height-2 and 2<=x<width-2, writes to rec[c] the
  //number n of the neighbour with the largest positive slope, or NO_FLOW when
  //no neighbour is strictly lower. Cells in the outer two rows and columns are
  //skipped, so their rec entries are never written and every neighbour read
  //stays inside h.
  void FindDownstream(){
    //! computing receiver array
    //Copy the member dimensions into locals so that the loop bounds and the
    //cell index below use plain local variables instead of data members.
    int hgt = height;
    int wdt = width;
    #pragma acc parallel loop collapse(2) present(h,rec)
    for(int y=2;y<hgt-2;y++)
    for(int x=2;x<wdt-2;x++){
      const int c      = y*wdt+x;

      //The slope must be greater than zero for there to be downhill flow;
      //otherwise, the cell is marked NO_FLOW
      double max_slope = 0;
      int    max_n     = NO_FLOW;

      //Scan the eight neighbours in order. The comparison is strict, so when
      //two neighbours tie the one with the lower n is kept.
      #pragma acc loop seq
      for(int n=0;n<8;n++){
        //Positive when the neighbour is lower than the focal cell
        double slope = (h[c] - h[c+nshift[n]])/dr[n];
        if(slope>max_slope){
          max_slope = slope;
          max_n     = n;
        }
      }
      rec[c] = max_n;
    }
  }

 public:
  //Allocates rec, issues the OpenACC "enter data" directive, calls
  //FindDownstream() repeatedly, issues the "exit data" directive and frees rec.
  //nstep: the loop condition is step<=nstep, so FindDownstream() is called
  //nstep+1 times when nstep>=0.
  void run(const int nstep){
    rec    = new int[size];

    //Data clauses: copyin for this, h and nshift; create for rec
    #pragma acc enter data copyin(this,h[0:size],nshift[0:8]) create(rec[0:size])

    for(int step=0;step<=nstep;step++)
      FindDownstream();

    //Data clauses: copyout for h; delete for rec, nshift and this
    #pragma acc exit data copyout(h[0:size]) delete(rec,nshift,this)

    //rec is not reset after this, so it holds a freed pointer until the next
    //call to run()
    delete[] rec;
  }

};

int main(int argc, char **argv){
  Model model(300,300);
  model.run(100);

  return 0;
}
% pgc++ test.cpp -w --c++11 -Minfo=accel -ta=tesla:cc60 -V17.10; a.out
Model::FindDownstream():
     49, Generating present(h[:])
         Accelerator kernel generated
         Generating Tesla code
         51, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
         52,   /* blockIdx.x threadIdx.x collapsed */
         61, #pragma acc loop seq
     49, Generating implicit copy(this[:])
         Generating present(rec[:])
     61, Loop carried scalar dependence for max_slope at line 63
Model::run(int):
     74, Generating enter data copyin(nshift[:],h[:size])
         Generating enter data create(rec[:size])
         Generating enter data copyin(this[:1])
     83, Generating exit data delete(this[:1],rec[:1])
         Generating exit data copyout(h[:size])
         Generating exit data delete(nshift[:])

Upvotes: 1

Related Questions