CUDA: working with arrays of different sizes

Question

In this example, I am trying to create a 10x8 array using values from a 10x9 array. It looks like I am accessing memory incorrectly, but I am not sure where my error is.

The code in C++ would be something like

// CPU reference: dd is indexed with (width-2) entries per row and hi with
// (width-1) entries per row; each dd element is the sum of two horizontally
// adjacent hi elements (columns i and i+1 of the same row h).
for (int h = 0; h < height; h++){
    for (int i = 0; i < (width-2); i++)
        dd[h*(width-2)+i] = hi[h*(width-1)+i] + hi[h*(width-1)+i+1];
}

This is what I am trying in CUDA:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cstdio>
#include <cstdlib>
#include <iostream>

#define TILE_WIDTH 4

using namespace std;

// Kernel: each thread with x < width-2 and y < height writes one element of DD
// as the sum of two adjacent elements of HI (columns x and x+1 of row y).
//
// Parameters:
//   height, width - bounds used by the thread guard below.
//   HI            - input buffer (read).
//   DD            - output buffer (written).
//
// Launched on a 2D grid of 2D blocks: x is the column index, y is the row index.
//
// NOTE(review): the row stride used for both buffers is derived from grid_width
// (the number of launched threads in x), not from width. The host code in this
// file lays HI out with (width-1) floats per row and DD with (width-2) floats
// per row, so the indexing only matches when gridDim.x * blockDim.x == width.
__global__ void cudaOffsetArray(int height, int width, float *HI, float *DD){

    int             x                   =   blockIdx.x * blockDim.x + threadIdx.x; // Col // width
    int             y                   =   blockIdx.y * blockDim.y + threadIdx.y; // Row // height
    // Total number of threads launched along x (all blocks combined).
    int             grid_width          =   gridDim.x  * blockDim.x;
  //int             index               =   y * grid_width + x;

    // Guard: threads outside the (width-2) x height output region do nothing.
    if ((x < (width - 2)) && (y < (height)))
        DD[y * (grid_width - 2) + x] = (HI[y * (grid_width - 1) + x] + HI[y * (grid_width - 1) + x + 1]);
}

// Host driver: fills a height x (width-1) input array with ones, copies it to
// the device, runs cudaOffsetArray over it, copies the height x (width-2)
// result back and prints it row by row.
//
// Every CUDA runtime call and the kernel launch are checked; on failure the
// CUDA error string is printed to stderr and the program exits with
// EXIT_FAILURE.
int main(){

    int height  = 10;
    int width   = 10;

    // Abort with a readable message as soon as a CUDA call reports an error,
    // instead of silently continuing and printing garbage.
    auto check = [](cudaError_t status, const char *what){
        if (status != cudaSuccess){
            std::fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
            std::exit(EXIT_FAILURE);
        }
    };

    // Element counts / byte sizes of the input (width-1 columns) and the
    // output (width-2 columns), computed once and reused below.
    const size_t hiCount = static_cast<size_t>(height) * (width - 1);
    const size_t ddCount = static_cast<size_t>(height) * (width - 2);
    const size_t hiBytes = hiCount * sizeof(float);
    const size_t ddBytes = ddCount * sizeof(float);

    float *HI = new float [hiCount];
    for (size_t k = 0; k < hiCount; k++)
        HI[k] = 1.0f;

    float   *gpu_HI = nullptr;
    float   *gpu_DD = nullptr;
    check(cudaMalloc((void **)&gpu_HI, hiBytes), "cudaMalloc(gpu_HI)");
    check(cudaMalloc((void **)&gpu_DD, ddBytes), "cudaMalloc(gpu_DD)");

    check(cudaMemcpy(gpu_HI, HI, hiBytes, cudaMemcpyHostToDevice), "cudaMemcpy(host -> device)");

    // Ceil-divide so the grid covers every column/row even when the sizes are
    // not multiples of TILE_WIDTH; the kernel guards the surplus threads.
    dim3            dimGrid((width - 1) / TILE_WIDTH + 1, (height - 1)/TILE_WIDTH + 1, 1);
    dim3            dimBlock(TILE_WIDTH, TILE_WIDTH, 1);

    cudaOffsetArray<<<dimGrid, dimBlock>>>(height, width, gpu_HI, gpu_DD);
    check(cudaGetLastError(), "kernel launch");          // bad launch configuration
    check(cudaDeviceSynchronize(), "kernel execution");  // faults raised while the kernel ran

    float *result   = new float[ddCount];
    check(cudaMemcpy(result, gpu_DD, ddBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(device -> host)");

    for (int i = 0; i < height; i++){
        for (int j = 0; j < (width - 2); j++)
            cout << result[i * (width - 2) + j] << " ";
        cout << endl;
    }

    check(cudaFree(gpu_HI), "cudaFree(gpu_HI)");
    check(cudaFree(gpu_DD), "cudaFree(gpu_DD)");
    delete[] result;
    delete[] HI;

    system("pause");
}

I've also tried this in the global function:

// Variant of the guarded store: the column offset is rebuilt from
// blockIdx.x / threadIdx.x, with the block index shifted by 2 for DD and by 1
// for HI, while the row stride is still derived from grid_width.
// NOTE(review): presumably the shifted block index cannot give a valid column
// for the first blocks (blockIdx.x < 2 for DD, < 1 for HI) - TODO confirm.
if ((x < (width - 2)) && (y < (height)))
    DD[y * (grid_width - 2) + (blockIdx.x - 2) * blockDim.x + threadIdx.x] = 
        (HI[y * (grid_width - 1) + (blockIdx.x - 1) * blockDim.x + threadIdx.x] + 
         HI[y * (grid_width - 1) + (blockIdx.x - 1) * blockDim.x + threadIdx.x + 1]);

Robert Crovella · Accepted Answer

To "fix" your code, change each use of grid_width to width in this line in your kernel:

    DD[y * (grid_width - 2) + x] = (HI[y * (grid_width - 1) + x] + HI[y * (grid_width - 1) + x + 1]);

Like this:

    DD[y * (width - 2) + x] = (HI[y * (width - 1) + x] + HI[y * (width - 1) + x + 1]);

Explanation:

Your grid_width:

dim3            dimGrid((width * 2 - 1) / TILE_WIDTH + 1, (height - 1)/TILE_WIDTH + 1, 1);
dim3            dimBlock(TILE_WIDTH, TILE_WIDTH, 1);

doesn't actually correspond to your array size (10x10, or 10x9, or 10x8). I'm not sure why you're launching 2*width threads in the x dimension, but this means that your thread array is considerably larger than your data array.

So when you use grid_width in the kernel:

    DD[y * (grid_width - 2) + x] = (HI[y * (grid_width - 1) + x] + HI[y * (grid_width - 1) + x + 1]);

the indexing will be a problem. If you instead change each instance of grid_width above to just width (which corresponds to the actual width of your data array) you'll get better indexing, I think. Normally it's not an issue to launch "extra threads" because you have a thread check line in your kernel:

if ((x < (width - 2)) && (y < (height)))

but when you launch extra threads, it is making your grid larger, and so you can't use grid dimensions to index properly into your data array.

CUDA: working with arrays of different sizes

Answers (1)

Related Questions