2D array CUDA problems

Question

I'm currently struggling to work properly with 2D arrays in my CUDA kernel. The 1D case was fine, but so far I have had no luck moving on to 2D. Here are my host function and kernel:

// Element-wise sum z = x + y over an n-by-m array.
// Element (row, col) is read and written at offset row*m + col, i.e. the
// buffers are addressed as dense row-major storage; no row pitch is applied.
// Every thread starts at its global (x, y) thread index and then advances by
// the total number of launched threads in that dimension, so the whole array
// is covered regardless of the grid size.
__global__ void add_d2D(double *x, double *y,double *z, int n, int m){
    const int firstRow = blockIdx.x * blockDim.x + threadIdx.x;
    const int firstCol = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int rowStep = blockDim.x * gridDim.x;
    const unsigned int colStep = blockDim.y * gridDim.y;

    for (int row = firstRow; row < n; row += rowStep){
        const int rowBase = row * m;  // offset of the first element of this row
        for (int col = firstCol; col < m; col += colStep){
            const int idx = rowBase + col;
            z[idx] = x[idx] + y[idx];
        }
    }
}

// Adds two N-by-M host matrices on the GPU: allocates three pitched device
// buffers, copies a and b to the device, launches add_d2D over them, copies
// the sum back into result and releases the device buffers.
//   a, b   : host input matrices, N rows of M doubles each
//   result : host output matrix, N rows of M doubles
//   N, M   : number of rows / columns
__host__ void add2D(double *a, double *b, double *result, int N, int M){
    double *a_d, *b_d, *c_d;

    size_t pitcha;
    size_t pitchb;
    size_t pitchc;

    cudaErrchk(cudaMallocPitch(&a_d,&pitcha, M*sizeof(double),N));
    cudaErrchk(cudaMallocPitch(&b_d,&pitchb, M*sizeof(double),N));
    cudaErrchk(cudaMallocPitch(&c_d,&pitchc, M*sizeof(double),N));
    // NOTE(review): the pitch arguments of these two copies are left exactly as
    // posted, because they are the subject of the question. cudaMemcpy2D takes
    // (dst, dpitch, src, spitch, width, height, kind); here the packed host row
    // size is passed as dpitch and the device pitch as spitch.
    cudaErrchk(cudaMemcpy2D(a_d,M*sizeof(double), a,pitcha, M*sizeof(double),N, cudaMemcpyHostToDevice));
    cudaErrchk(cudaMemcpy2D(b_d,M*sizeof(double), b,pitchb, M*sizeof(double),N, cudaMemcpyHostToDevice));

    dim3 threadsPerBlock(2, 2);
    // Round the block counts up so that a dimension smaller than the block
    // size still gets one block instead of an invalid zero-sized grid.
    dim3 numBlocks((N + threadsPerBlock.x - 1)/threadsPerBlock.x,
                   (M + threadsPerBlock.y - 1)/threadsPerBlock.y);

    add_d2D<<<numBlocks, threadsPerBlock>>>(a_d, b_d, c_d , N, M);
    // A kernel launch returns nothing; a bad launch configuration is only
    // visible through cudaGetLastError().
    cudaErrchk(cudaGetLastError());

    // Errors raised while the kernel runs surface at the next synchronizing call.
    cudaErrchk(cudaDeviceSynchronize());

    cudaErrchk(cudaMemcpy2D(result,M*sizeof(double), c_d,pitchc, M*sizeof(double),N, cudaMemcpyDeviceToHost));

    cudaErrchk(cudaFree(a_d));
    cudaErrchk(cudaFree(b_d));
    cudaErrchk(cudaFree(c_d));
}

Below is my example to test it. It prints the first 10 values of C correctly, but all the others remain 0. I believe the problem is within the kernel, where it can't find the correct values because of the pitch, but I'm not sure how to solve it correctly.

const int kRows = 4;
const int kCols = 10;
double a[kRows][kCols];
double b[kRows][kCols];
double c[kRows][kCols];
// Fill both inputs with pseudo-random integers in [0, 9]; for every cell the
// value for a is drawn before the value for b.
for (int row = 0; row < kRows; ++row){
    for (int col = 0; col < kCols; ++col){
        a[row][col] = rand() % 10;
        b[row][col] = rand() % 10;
    }
}
// The 2D arrays are handed over as pointers to their first element.
ertiscuda::add2D(&a[0][0], &b[0][0], &c[0][0], kRows, kCols);
// One output line per element: the two inputs followed by the computed sum.
for (int row = 0; row < kRows; ++row){
    for (int col = 0; col < kCols; ++col){
        std::cout << a[row][col] << " " << b[row][col] << " " << c[row][col] << std::endl;
    }
}

Mikhail · Accepted Answer

You have two mistakes

Each thread in the kernel should perform one operation rather than all the operations. (For memory reasons you might want to do more, but we will keep this example simple.)
You had the destination and source pitches switched when loading the data onto the device.

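Here is a working version. Note that the kernel now also receives the row pitch (in elements, `m_pitch_elements`) and uses it, rather than `m`, to find the start of each row in the pitched device buffers: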

#include <cuda_runtime.h>

#include <cstdlib>
#include <iostream>
#include <sstream>
#include <stdexcept>

// Wraps a CUDA runtime call so that a failure is reported together with the
// file and line of the call site.
#define CUDASAFECALL( err ) cuda_safe_call(err, __FILE__, __LINE__ )
// Checks a CUDA runtime status code. On cudaSuccess it does nothing; on any
// other value it builds a message from the call site and the CUDA error
// string, prints it to std::cout and throws it as std::runtime_error.
void cuda_safe_call(const cudaError err, const char *file, const int line)
{
    if (err == cudaSuccess)
    {
        return;
    }

    std::ostringstream report;
    report << "cuda_safe_call() failed at " << file << ":" << line << ":" << cudaGetErrorString(err);
    const auto report_text = report.str();
    std::cout << report_text << std::endl;
    throw std::runtime_error(report_text);
}

// Element-wise sum z = x + y for an n-by-m matrix held in pitched device memory.
//   x, y             : input matrices (device pointers)
//   z                : output matrix (device pointer)
//   n, m             : number of rows / columns
//   m_pitch_elements : distance between the starts of consecutive rows,
//                      counted in doubles; the same value is applied to x, y and z
// Launch layout: 2D grid of 2D blocks, one element per thread, no shared
// memory. The x dimension indexes rows and the y dimension indexes columns;
// threads that fall outside the matrix do nothing.
__global__ void add_d2D(const double *x, const double *y, double *z, int n, int m, int m_pitch_elements) 
{
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int col = blockIdx.y * blockDim.y + threadIdx.y;
    if (row < n && col < m)
    {
        // Rows are m_pitch_elements apart, not m: each row is followed by padding.
        // size_t keeps the flat offset from overflowing int on large matrices.
        const size_t idx = static_cast<size_t>(row) * m_pitch_elements + col;
        z[idx] = x[idx] + y[idx];
    }
}

// Adds two N-by-M host matrices on the GPU and stores the sum in result.
//   a, b   : host input matrices, N rows of M contiguous doubles each
//   result : host output matrix, N rows of M contiguous doubles
//   N, M   : number of rows / columns; nothing is done if either is <= 0
// Throws std::runtime_error (through CUDASAFECALL) when a CUDA call fails, or
// when the three device allocations do not share one pitch.
__host__ void add2D(const double *a, const double *b, double *result, int N, int M)
{
    // A zero-sized grid is an invalid launch configuration, so bail out early.
    if (N <= 0 || M <= 0)
    {
        return;
    }

    double *a_d = nullptr;
    double *b_d = nullptr;
    double *c_d = nullptr;

    size_t pitcha = 0;
    size_t pitchb = 0;
    size_t pitchc = 0;

    // Size in bytes of one packed host row.
    const size_t row_bytes = static_cast<size_t>(M) * sizeof(double);

    try
    {
        CUDASAFECALL(cudaMallocPitch(&a_d, &pitcha, row_bytes, N));
        CUDASAFECALL(cudaMallocPitch(&b_d, &pitchb, row_bytes, N));
        CUDASAFECALL(cudaMallocPitch(&c_d, &pitchc, row_bytes, N));

        // cudaMemcpy2D takes (dst, dst pitch, src, src pitch, width, height, kind):
        // the device side uses the pitch returned by cudaMallocPitch, the host
        // side is packed, so its pitch is simply the row size.
        CUDASAFECALL(cudaMemcpy2D(a_d, pitcha, a, row_bytes, row_bytes, N, cudaMemcpyHostToDevice));
        CUDASAFECALL(cudaMemcpy2D(b_d, pitchb, b, row_bytes, row_bytes, N, cudaMemcpyHostToDevice));

        // The kernel applies a single pitch to all three buffers. The allocations
        // have identical dimensions, so the pitches should be the same; verify it
        // rather than silently indexing a buffer with the wrong pitch.
        if (pitcha != pitchb || pitcha != pitchc)
        {
            throw std::runtime_error("add2D: device allocations have different pitches");
        }

        dim3 threadsPerBlock(2, 2);
        // Integer ceiling division: enough blocks to cover a size that is not a
        // multiple of the block size.
        auto safediv = [](unsigned int count, unsigned int block) { return (count + block - 1) / block; };
        dim3 numBlocks(safediv(N, threadsPerBlock.x), safediv(M, threadsPerBlock.y));
        //all the pitches should be the same
        const int pitch_elements = static_cast<int>(pitcha / sizeof(double));
        add_d2D<<<numBlocks, threadsPerBlock>>>(a_d, b_d, c_d, N, M, pitch_elements);
        // A launch returns nothing; configuration errors show up here.
        CUDASAFECALL(cudaGetLastError());

        // Errors raised while the kernel runs surface at the synchronizing call.
        CUDASAFECALL(cudaDeviceSynchronize());

        CUDASAFECALL(cudaMemcpy2D(result, row_bytes, c_d, pitchc, row_bytes, N, cudaMemcpyDeviceToHost));
    }
    catch (...)
    {
        // Best-effort cleanup so a failed call does not leak device memory;
        // cudaFree accepts null pointers, and the original error is rethrown.
        cudaFree(a_d);
        cudaFree(b_d);
        cudaFree(c_d);
        throw;
    }

    CUDASAFECALL(cudaFree(a_d));
    CUDASAFECALL(cudaFree(b_d));
    CUDASAFECALL(cudaFree(c_d));
}

// Demo driver: fills two 4x10 matrices with pseudo-random integers in [0, 9],
// adds them with add2D and prints, for every element, both inputs, the sum
// returned by add2D and the sum computed on the host for comparison.
int main()
{
    const int kRows = 4;
    const int kCols = 10;
    double a[kRows][kCols];
    double b[kRows][kCols];
    double c[kRows][kCols];

    // For every cell the value for a is drawn before the value for b.
    for (int row = 0; row < kRows; ++row) {
        for (int col = 0; col < kCols; ++col) {
            a[row][col] = rand() % 10;
            b[row][col] = rand() % 10;
        }
    }

    // The 2D arrays are handed over as pointers to their first element.
    add2D(&a[0][0], &b[0][0], &c[0][0], kRows, kCols);

    for (int row = 0; row < kRows; ++row) {
        for (int col = 0; col < kCols; ++col) {
            const double expected = a[row][col] + b[row][col];
            std::cout << a[row][col] << " " << b[row][col] << " " << c[row][col] << "|" << expected << std::endl;
        }
    }
    return 0;
}

2D array CUDA problems

Answers (1)

Related Questions