Reputation: 445
I'm trying to pass a 2d array to a kernel so that each thread can access index = threadIdx.x + (blockIdx.x * blockDim.x) but I'm having trouble figuring out just how to do this and how to copy the data back over.
size_t pitch;
cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks);
cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks * sizeof(int));
kernel<<<grid_size, block_size>>>(d_array, pitch);
cudaMemcpy2D(h_array, pitch, d_array, pitch, block_size, num_blocks, cudaMemcpyDeviceToHost);
for (num_blocks)
for(block_size)
h_array[block][thread] should be 1
__global__ void kernel(int *array, int pitch) {
int *row = (int*)((char*)array + blockIdx.x * pitch);
row[threadIdx.x] = 1;
return;
}
What am I doing wrong, here?
Upvotes: 0
Views: 463
Reputation: 152173
Here's a complete code that passes a basic test with the errors mentioned by @hidrargyro fixed:
$ cat t236.cu
#include <stdio.h>
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void kernel(int *array, int pitch) {
int *row = (int*)((char*)array + blockIdx.x * pitch);
row[threadIdx.x] = 1;
return;
}
int main(){
int *d_array, *h_array;
int block_size = 256;
int num_blocks = 256;
int grid_size = num_blocks;
h_array=(int *)malloc(block_size*num_blocks*sizeof(int));
if (h_array==0) {printf("malloc fail\n"); return 1;}
cudaMalloc((void **)&d_array, block_size*num_blocks*sizeof(int));
cudaCheckErrors("cudaMalloc fail");
size_t pitch;
cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks);
cudaCheckErrors("cudaMallocPitch fail");
cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks);
cudaCheckErrors("cudaMemset2D fail");
kernel<<<grid_size, block_size>>>(d_array, pitch);
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
cudaMemcpy2D(h_array, block_size*sizeof(int), d_array, pitch, block_size*sizeof(int), num_blocks, cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy 2D fail");
for (int i = 0; i<num_blocks; i++)
for(int j = 0; j<block_size; j++)
if (h_array[i*block_size+j] != 1) {printf("mismatch at i=%d, j=%d, should be 1, was %d\n", i,j,h_array[i*block_size+j]); return 1;}
printf("success\n");
return 0;
}
$ nvcc -arch=sm_20 -o t236 t236.cu
$ ./t236
success
$
Upvotes: 1
Reputation: 267
Your cudaMemset2D is accesing to a bigger memory space that you previously allocated with cudaMallocPitch Also your cudaMemcpy2D is copying a little portion of that memory.
You should use the function in the following way:
cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks);
cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks) // * sizeof(int)); <- This size is bigger than the previously declared
kernel<<<grid_size, block_size>>>(d_array, pitch);
cudaMemcpy2D(h_array, pitch, d_array, pitch, block_size * sizeof(int) /* you forgot this here */, num_blocks, cudaMemcpyDeviceToHost);
Upvotes: 1