Reputation: 622
I have a strange problem dealing with a 2D array on a CUDA device.
#define VR 100 // rows
#define ST 13 // columns
__global__ void test(float *arr, curandState *globalState, size_t pitch, unsigned long seed) {
    int id = (blockIdx.x * blockDim.x) + threadIdx.x;
    curand_init ( seed, id, 0, &globalState[id] );
    cuPrintf("Thread id: %d \n", id);
    float* row = (float*)(((char*)arr) + id * pitch);
    for (int j = 0; j < ST; ++j) {
        row[j] = generate(globalState, id);
    }
}
int main() {
    float *d_arr;
    float *h_arr = new float[VR*ST];
    size_t pitch;
    cudaMallocPitch(&d_arr, &pitch, ST*sizeof(float), VR);
    dim3 dimBlock(VR);
    dim3 dimGrid(1,1);
    curandState* devStates;
    cudaMalloc ( &devStates, VR*ST*sizeof( curandState ) );
    test <<< dimGrid, dimBlock >>> (d_arr, devStates, pitch, unsigned(time(NULL)));
    cudaMemcpy(h_arr, d_arr, VR*ST*sizeof(float), cudaMemcpyDeviceToHost);
    for (int i=0; i<VR; i++) {
        for (int j=0; j<ST; j++) {
            cout << "N["<<i<<"]["<<j<<"]=" << h_arr[(i*ST)+j] << endl;
        }
    }
}
I don't get evenly distributed numbers; instead, they appear in sequences of 13 with bunches of zeros in between. See: http://pastie.org/6106381
Upvotes: 2
Views: 488
Reputation: 152173
The problem is that the original data array is being allocated using cudaMallocPitch, whereas the copying is being done using ordinary cudaMemcpy. This will give unexpected results, because the cudaMallocPitch operation creates "padded" rows to satisfy alignment requirements, whereas cudaMemcpy assumes everything is stored contiguously. Below is code that I believe has the corrections needed to be functional:
#include <stdio.h>
#include <iostream>
#include <curand_kernel.h>
#define VR 100 // rows
#define ST 13 // columns
__device__ float generate(curandState* globalState, int id)
{
    //int id = (blockIdx.x * blockDim.x) + threadIdx.x;
    curandState localState = globalState[id];
    float rand;
    do {
        rand = curand_uniform( &localState );
    } while(rand == 0); // reject 0.0 so every stored value is nonzero
    globalState[id] = localState;
    return rand;
}
__global__ void test(float *arr, curandState *globalState, size_t pitch, unsigned long seed) {
    int id = (blockIdx.x * blockDim.x) + threadIdx.x;
    curand_init ( seed, id, 0, &globalState[id] );
    //cuPrintf("Thread id: %d \n", id);
    // step through the pitched allocation in bytes to find this thread's row
    float* row = (float*)(((char*)arr) + id * pitch);
    for (int j = 0; j < ST; ++j) {
        row[j] = generate(globalState, id);
    }
}
using namespace std;
int main() {
    float *d_arr;
    float *h_arr = new float[VR*ST];
    size_t pitch;
    cudaMallocPitch(&d_arr, &pitch, ST*sizeof(float), VR);
    dim3 dimBlock(VR);
    dim3 dimGrid(1,1);
    curandState* devStates;
    cudaMalloc ( &devStates, VR*ST*sizeof( curandState ) );
    test <<< dimGrid, dimBlock >>> (d_arr, devStates, pitch, unsigned(time(NULL)));
    cudaMemcpy2D(h_arr, ST*sizeof(float), d_arr, pitch, ST*sizeof(float), VR, cudaMemcpyDeviceToHost);
    for (int i=0; i<VR; i++) {
        for (int j=0; j<ST; j++) {
            cout << "N["<<i<<"]["<<j<<"]=" << h_arr[(i*ST)+j] << endl;
        }
    }
}
Compiling the above code using:
nvcc -arch=sm_20 -lcurand -o t70 t70.cu
and then running it, I get what appears to be "normal" output:
N[0][0]=0.876772
N[0][1]=0.550017
N[0][2]=0.49023
N[0][3]=0.530145
N[0][4]=0.501616
N[0][5]=0.326232
N[0][6]=0.438308
N[0][7]=0.857651
N[0][8]=0.462743
N[0][9]=0.38252
N[0][10]=0.258212
N[0][11]=0.194021
N[0][12]=0.895522
N[1][0]=0.559201
N[1][1]=0.257747
N[1][2]=0.430971
N[1][3]=0.707209
N[1][4]=0.599081
N[1][5]=0.0457626
N[1][6]=0.702412
N[1][7]=0.88791
N[1][8]=0.508877
N[1][9]=0.702734
N[1][10]=0.379898
N[1][11]=0.138841
N[1][12]=0.540869
(results truncated)
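As an aside (not from the original answer), the cudaMemcpy2D call above is what reconciles the two layouts: the source rows on the device are pitch bytes apart, while the destination rows in h_arr are packed ST*sizeof(float) bytes apart. A minimal row-by-row sketch of the same copy, assuming the d_arr, h_arr and pitch variables from the code above, would be:
// Illustrative alternative to the single cudaMemcpy2D call: copy each
// padded device row into the packed host array one row at a time.
for (int i = 0; i < VR; i++) {
    const char *srcRow = ((const char *)d_arr) + i * pitch; // device rows are 'pitch' bytes apart
    float *dstRow = h_arr + i * ST;                          // host rows are packed
    cudaMemcpy(dstRow, srcRow, ST * sizeof(float), cudaMemcpyDeviceToHost);
}
The single cudaMemcpy2D call is preferable in practice; the loop is only meant to show what the pitch parameters mean.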
Upvotes: 4
Reputation: 897
I think it's wrong; you should assign VR threads or blocks, because you already loop through the ST columns inside the kernel.
Maybe that will fix it.
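For illustration only (not part of this answer's original text), a launch configuration that always covers the VR rows, even if VR were larger than the maximum block size, might look like the sketch below; the kernel would then also need an "if (id >= VR) return;" guard at the top. The threadsPerBlock value is an arbitrary choice here.
// Hypothetical launch sketch: one thread per row, rounded up to whole blocks.
int threadsPerBlock = 128;
int blocks = (VR + threadsPerBlock - 1) / threadsPerBlock;
test<<<blocks, threadsPerBlock>>>(d_arr, devStates, pitch, unsigned(time(NULL)));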
Upvotes: 0