3D indices access in CUDA for nonlinear diffusion

Question

I'm writing my first CUDA code for 2D/3D nonlinear diffusion. The 2D case is working fine, however I'm struggling with the 3D one. Basically I'm getting zeros on the stage of finite differences calculation and surprisingly the 'deltaN' (pls see code below) is giving the right answer but the other ones are not working (zeros in answer). I'm trying to process 256x256x256 volume. Any suggestions please? Thanks!

 #define BLKXSIZE 8
 #define BLKYSIZE 8
 #define BLKZSIZE 8
 #define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )

void AnisotropDiff_GPU(double* A, double* B, int N, int M, int Z, double sigma, int iter, double tau, int type)
{ 
// Nonlinear Diffusion in 3D 
double *Ad;     

dim3 dimBlock(BLKXSIZE, BLKYSIZE, BLKZSIZE);           
dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE), idivup(Z,BLKYSIZE));    

cudaMalloc((void**)&Ad,N*M*Z*sizeof(double));  
cudaMemcpy(Ad,A,N*M*Z*sizeof(double),cudaMemcpyHostToDevice);  

int n = 1;
while (n <= iter) {    
anis_diff3D<<>>(Ad, N, M, Z, sigma, iter, tau, type);  
n++;}
cudaMemcpy(B,Ad,N*M*Z*sizeof(double),cudaMemcpyDeviceToHost);
cudaFree(Ad);
}

and here is a part for calculating finite differences

 __global__ void anis_diff3D(double* A, int N, int M, int Z, double sigma, int iter, double tau, int type)
{

 int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
 int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
 int zIndex = blockDim.z * blockIdx.z + threadIdx.z;

 if ( (xIndex < N) && (yIndex < M) && (zIndex < Z) )
 {
    int index_out = xIndex + M*yIndex + N*M*zIndex;

    double deltaN=0, deltaS=0, deltaW=0, deltaE=0, deltaU=0, deltaD=0;
    double cN, cS, cW, cE, cU, cD;

    int indexN = (xIndex-1) + M*yIndex + N*M*zIndex;
    int indexS = (xIndex+1) + M*yIndex + N*M*zIndex;
    int indexW = xIndex + M*(yIndex-1) + N*M*zIndex;
    int indexE = xIndex + M*(yIndex+1) + N*M*zIndex;
    int indexU = xIndex + M*yIndex + N*M*(zIndex-1);
    int indexD = xIndex + M*yIndex + N*M*(zIndex+1);


    if (xIndex>1)
        deltaN = A[indexN]-A[index_out];
    if (xIndex1)
        deltaW = A[indexW]-A[index_out];    
    if (yIndex1)
        deltaU = A[indexU]-A[index_out];    
    if (zIndex



Thanks a lot for help!

Robert Crovella · Accepted Answer

You have out-of-bounds indexing for some of the values you are trying to compute in the kernel.

If you compile your code with the last kernel line like this:

A[index_out] = deltaS ;

and then run it with cuda-memcheck, cuda-memcheck will report out of bounds accesses:

========= Invalid __global__ read of size 8
=========     at 0x000000b0 in anis_diff3D(double*, int, int, int, double, int, double, int)
=========     by thread (7,7,7) in block (31,31,31)
=========     Address 0x408100000 is out of bounds

So what is going on? Let's take a look at your index calculation:

int indexS = (xIndex+1) + M*yIndex + N*M*zIndex;

For the very last thread in the grid (thread (7,7,7) in block (31, 31, 31)), this index calculation indexes beyond the end of your memory array A in this line:

    deltaS = A[indexS]-A[index_out];

You'll have to handle these boundary conditions in order to make things run properly.

While we are at it, if you had done error checking, your kernel would have thrown an error. Note that depending on which value you select to store at the end of the kernel, the compiler may optimize out the other calculations, leading to a kernel which appears to run correctly, (for example if you store deltaN instead of deltaS). Here's an example of code with error checking in it:

#include 
#include 

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)
", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING
"); \
            exit(1); \
        } \
    } while (0)

 #define BLKXSIZE 8
 #define BLKYSIZE 8
 #define BLKZSIZE 8
 #define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
 #define sizeX 256
 #define sizeY 256
 #define sizeZ 256
 #define sizeT (sizeX*sizeY*sizeZ)


 __global__ void anis_diff3D(double* A, int N, int M, int Z, double sigma, int iter, double tau, int type)
{

 int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
 int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
 int zIndex = blockDim.z * blockIdx.z + threadIdx.z;

 if ( (xIndex < N) && (yIndex < M) && (zIndex < Z) )
 {
    int index_out = xIndex + M*yIndex + N*M*zIndex;

    double deltaN=0, deltaS=0, deltaW=0, deltaE=0, deltaU=0, deltaD=0;
    double cN, cS, cW, cE, cU, cD;

    int indexN = (xIndex-1) + M*yIndex + N*M*zIndex;
    if (indexN > ((N*M*Z)-1)) indexN = (N*M*Z) -1;
    if (indexN < 0) indexN = 0;
    int indexS = (xIndex+1) + M*yIndex + N*M*zIndex;
    if (indexS > ((N*M*Z)-1)) indexS = (N*M*Z) -1;
    if (indexS < 0) indexS = 0;
    int indexW = xIndex + M*(yIndex-1) + N*M*zIndex;
    if (indexW > ((N*M*Z)-1)) indexW = (N*M*Z) -1;
    if (indexW < 0) indexW = 0;
    int indexE = xIndex + M*(yIndex+1) + N*M*zIndex;
    if (indexE > ((N*M*Z)-1)) indexE = (N*M*Z) -1;
    if (indexE < 0) indexE = 0;
    int indexU = xIndex + M*yIndex + N*M*(zIndex-1);
    if (indexU > ((N*M*Z)-1)) indexU = (N*M*Z) -1;
    if (indexU < 0) indexU = 0;
    int indexD = xIndex + M*yIndex + N*M*(zIndex+1);
    if (indexD > ((N*M*Z)-1)) indexD = (N*M*Z) -1;
    if (indexD < 0) indexD = 0;    

    if (xIndex>1)
        deltaN = A[indexN]-A[index_out];
    if (xIndex1)
        deltaW = A[indexW]-A[index_out];
    if (yIndex1)
        deltaU = A[indexU]-A[index_out];
    if (zIndex>>(Ad, N, M, Z, sigma, iter, tau, type);
    n++;
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel");
  }
  cudaMemcpy(B,Ad,N*M*Z*sizeof(double),cudaMemcpyDeviceToHost);
  cudaCheckErrors("cc2");
  cudaFree(Ad);
}

int main(){

  double *A;
  A = (double *)malloc(sizeT * sizeof(double));
  if (A == 0) {printf("malloc fail
"); return 1;}

  for (int i=0; i< sizeT; i++)
    A[i] = (double)(rand()/(double)RAND_MAX);

  printf("data:
");
  for (int i = 0; i < 8; i++)
    printf("A[%d] = %f
", i, A[i]);

  AnisotropDiff_GPU(A, A, sizeX, sizeY, sizeZ, 0.5f, 3, 0.5, 3);
  printf("results:
");
  for (int i = 0; i < 8; i++)
    printf("A[%d] = %f
", i, A[i]);

  return 0;
}

EDIT I've edited the above code to clamp the index calculations to the boundaries of the defined array. This should prevent out-of-bounds access. Whether it is sensible from an algorithm standpoint I don't know.

3D indices access in CUDA for nonlinear diffusion

Answers (1)

Related Questions