Reputation: 407
I'm currently struggling to properly work with 2D arrays within my CUDA kernel. 1D was fine but so far had no luck with it moving on to 2D. Here is my host function and kernel:
__global__ void add_d2D(double *x, double *y,double *z, int n, int m){
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x){
for(int j = blockIdx.y * blockDim.y + threadIdx.y; j < m; j += blockDim.y * gridDim.y){
z[i*m + j] = x[i*m + j] + y[i*m + j];
}
}
}
__host__ void add2D(double *a, double *b, double *result, int N, int M){
double *a_d, *b_d, *c_d;
size_t pitcha;
size_t pitchb;
size_t pitchc;
cudaErrchk(cudaMallocPitch(&a_d,&pitcha, M*sizeof(double),N));
cudaErrchk(cudaMallocPitch(&b_d,&pitchb, M*sizeof(double),N));
cudaErrchk(cudaMallocPitch(&c_d,&pitchc, M*sizeof(double),N));
cudaErrchk(cudaMemcpy2D(a_d,M*sizeof(double), a,pitcha, M*sizeof(double),N, cudaMemcpyHostToDevice));
cudaErrchk(cudaMemcpy2D(b_d,M*sizeof(double), b,pitchb, M*sizeof(double),N, cudaMemcpyHostToDevice));
dim3 threadsPerBlock(2, 2);
dim3 numBlocks(N/threadsPerBlock.x, M/threadsPerBlock.y);
add_d2D<<<numBlocks, threadsPerBlock>>>(a_d, b_d, c_d , N, M);
cudaDeviceSynchronize();
cudaErrchk(cudaMemcpy2D(result,M*sizeof(double), c_d,pitchc, M*sizeof(double),N, cudaMemcpyDeviceToHost));
cudaFree(a_d);
cudaFree(b_d);
cudaFree(c_d);
}
And below my example to test it. It prints out the first 10 values of C correctly but all others remain 0. I believe the problem is within the kernel. Where it can't find the correct values due to the pitch, but not sure how to solve it correctly though.
double a[4][10];
double b[4][10];
double c[4][10];
for (int i = 0; i < 4; i ++){
for (int j = 0; j < 10; j ++){
a[i][j] = 0 + rand() % 10;
b[i][j] = 0 + rand() % 10;
}
}
ertiscuda::add2D((double *)a, (double *)b, (double *)c, 4, 10);
for (int i = 0; i < 4; i ++){
for (int j = 0; j < 10; j ++){
std::cout << a[i][j] << " " << b[i][j] << " " << c[i][j] << std::endl;
}
}
Upvotes: 1
Views: 254
Reputation: 8028
You have two mistakes
Each thread in the kernel should perform one operation rather than all the operations. (For memory reasons you might want to do more, be we will keep this example simple).
You had the destination and source pitches switched when loading the data onto the device.
Here is a working version
#include <cuda_runtime.h>
#include <stdlib.h>
#include <iostream>
#include <sstream>
#define CUDASAFECALL( err ) cuda_safe_call(err, __FILE__, __LINE__ )
void cuda_safe_call(const cudaError err, const char *file, const int line)
{
if (cudaSuccess != err)
{
std::stringstream error_msg;
error_msg << "cuda_safe_call() failed at " << file << ":" << line << ":" << cudaGetErrorString(err);
const auto error_msg_str = error_msg.str();
std::cout << error_msg_str << std::endl;
throw std::runtime_error(error_msg_str);
}
}
__global__ void add_d2D(const double *x, const double *y, double *z, int n, int m, int m_pitch_elements)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
int col = blockIdx.y * blockDim.y + threadIdx.y;
if (row< n && col <m )
{
auto idx = row*m_pitch_elements + col;
z[idx] = x[idx] + y[idx];
//z[idx] = idx;
}
}
__host__ void add2D(const double *a,const double *b, double *result, int N, int M) {
double *a_d, *b_d, *c_d;
size_t pitcha,pitchb,pitchc;
CUDASAFECALL(cudaMallocPitch(&a_d, &pitcha, M * sizeof(double), N));
CUDASAFECALL(cudaMallocPitch(&b_d, &pitchb, M * sizeof(double), N));
CUDASAFECALL(cudaMallocPitch(&c_d, &pitchc, M * sizeof(double), N));
CUDASAFECALL(cudaMemcpy2D(a_d, pitcha, a, M * sizeof(double), M * sizeof(double), N, cudaMemcpyHostToDevice));
CUDASAFECALL(cudaMemcpy2D(b_d, pitchb, b, M * sizeof(double), M * sizeof(double), N, cudaMemcpyHostToDevice));
dim3 threadsPerBlock(2, 2);
auto safediv = [](auto a, auto b) {return static_cast<unsigned int>(ceil(a / (b*1.0))); };
dim3 numBlocks(safediv(N, threadsPerBlock.x), safediv( M, threadsPerBlock.y));
//all the pitches should be the same
auto pitch_elements = pitcha / sizeof(double);
add_d2D << <numBlocks, threadsPerBlock >> >(a_d, b_d, c_d, N, M, pitch_elements);
CUDASAFECALL(cudaDeviceSynchronize());
CUDASAFECALL(cudaMemcpy2D(result, M * sizeof(double), c_d, pitchc, M * sizeof(double), N, cudaMemcpyDeviceToHost));
CUDASAFECALL(cudaFree(a_d));
CUDASAFECALL(cudaFree(b_d));
CUDASAFECALL(cudaFree(c_d));
}
int main()
{
double a[4][10];
double b[4][10];
double c[4][10];
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 10; j++) {
a[i][j] = 0 + rand() % 10;
b[i][j] = 0 + rand() % 10;
}
}
add2D((double *)a, (double *)b, (double *)c, 4, 10);
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 10; j++) {
std::cout << a[i][j] << " " << b[i][j] << " " << c[i][j]<< "|"<< a[i][j]+ b[i][j] << std::endl;
}
}
return 0;
}
Upvotes: 3