Reputation: 622
I have a strange problem dealing with a 2D array on a CUDA device.
#define VR 100 // rows
#define ST 13 // columns
__global__ void test(float *arr, curandState *globalState, size_t pitch, unsigned long seed) {
    int id = (blockIdx.x * blockDim.x) + threadIdx.x;
    curand_init ( seed, id, 0, &globalState[id] );
    cuPrintf("Thread id: %d \n", id);
    float* row = (float*)(((char*)arr) + id * pitch);
    for (int j = 0; j < ST; ++j) {
        row[j] = generate(globalState, id);
    }
}
int main() {
    float *d_arr;
    float *h_arr = new float[VR*ST];
    size_t pitch;
    cudaMallocPitch(&d_arr, &pitch, ST*sizeof(float), VR);
    dim3 dimBlock(VR);
    dim3 dimGrid(1,1);
    curandState* devStates;
    cudaMalloc ( &devStates, VR*ST*sizeof( curandState ) );
    test <<< dimGrid, dimBlock >>> (d_arr, devStates, pitch, unsigned(time(NULL)));
    cudaMemcpy(h_arr, d_arr, VR*ST*sizeof(float), cudaMemcpyDeviceToHost);
    for (int i=0; i<VR; i++) {
        for (int j=0; j<ST; j++) {
            cout << "N["<<i<<"]["<<j<<"]=" << h_arr[(i*ST)+j] << endl;
        }
    }
}
I don't get evenly distributed numbers; instead, they appear in sequences of 13 with bunches of zeros in between. See: http://pastie.org/6106381
Upvotes: 2
Views: 488
Reputation: 152173
The problem is that the original data array is being allocated using cudaMallocPitch, whereas the copying is being done using ordinary cudaMemcpy. This will give unexpected results, because the cudaMallocPitch operation creates "padded" rows to satisfy alignment requirements, whereas cudaMemcpy assumes everything is stored contiguously. Below is code that I believe has the corrections needed to be functional:
#include <stdio.h>
#include <iostream>
#include <curand_kernel.h>
#define VR 100 // rows
#define ST 13 // columns
__device__ float generate(curandState* globalState, int id)
{
    //int id = (blockIdx.x * blockDim.x) + threadIdx.x;
    curandState localState = globalState[id];
    float rand;
    do {
        rand = curand_uniform( &localState );
    } while(rand == 0); // reject 0.0 so every stored value is nonzero
    globalState[id] = localState;
    return rand;
}
__global__ void test(float *arr, curandState *globalState, size_t pitch, unsigned long seed) {
    int id = (blockIdx.x * blockDim.x) + threadIdx.x;
    curand_init ( seed, id, 0, &globalState[id] );
    //cuPrintf("Thread id: %d \n", id);
    // step through the pitched allocation in bytes to find this thread's row
    float* row = (float*)(((char*)arr) + id * pitch);
    for (int j = 0; j < ST; ++j) {
        row[j] = generate(globalState, id);
    }
}
using namespace std;
int main() {
    float *d_arr;
    float *h_arr = new float[VR*ST];
    size_t pitch;
    cudaMallocPitch(&d_arr, &pitch, ST*sizeof(float), VR);
    dim3 dimBlock(VR);
    dim3 dimGrid(1,1);
    curandState* devStates;
    cudaMalloc ( &devStates, VR*ST*sizeof( curandState ) );
    test <<< dimGrid, dimBlock >>> (d_arr, devStates, pitch, unsigned(time(NULL)));
    cudaMemcpy2D(h_arr, ST*sizeof(float), d_arr, pitch, ST*sizeof(float), VR, cudaMemcpyDeviceToHost);
    for (int i=0; i<VR; i++) {
        for (int j=0; j<ST; j++) {
            cout << "N["<<i<<"]["<<j<<"]=" << h_arr[(i*ST)+j] << endl;
        }
    }
}
Compiling the above code using:
nvcc -arch=sm_20 -lcurand -o t70 t70.cu
and then running it, I get what appears to be "normal" output:
N[0][0]=0.876772
N[0][1]=0.550017
N[0][2]=0.49023
N[0][3]=0.530145
N[0][4]=0.501616
N[0][5]=0.326232
N[0][6]=0.438308
N[0][7]=0.857651
N[0][8]=0.462743
N[0][9]=0.38252
N[0][10]=0.258212
N[0][11]=0.194021
N[0][12]=0.895522
N[1][0]=0.559201
N[1][1]=0.257747
N[1][2]=0.430971
N[1][3]=0.707209
N[1][4]=0.599081
N[1][5]=0.0457626
N[1][6]=0.702412
N[1][7]=0.88791
N[1][8]=0.508877
N[1][9]=0.702734
N[1][10]=0.379898
N[1][11]=0.138841
N[1][12]=0.540869
(results truncated)
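As an aside (not from the original answer), the cudaMemcpy2D call above is what reconciles the two layouts: the source rows on the device are pitch bytes apart, while the destination rows in h_arr are packed ST*sizeof(float) bytes apart. A minimal row-by-row sketch of the same copy, assuming the d_arr, h_arr and pitch variables from the code above, would be:
// Illustrative alternative to the single cudaMemcpy2D call: copy each
// padded device row into the packed host array one row at a time.
for (int i = 0; i < VR; i++) {
    const char *srcRow = ((const char *)d_arr) + i * pitch; // device rows are 'pitch' bytes apart
    float *dstRow = h_arr + i * ST;                          // host rows are packed
    cudaMemcpy(dstRow, srcRow, ST * sizeof(float), cudaMemcpyDeviceToHost);
}
The single cudaMemcpy2D call is preferable in practice; the loop is only meant to show what the pitch parameters mean.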
Upvotes: 4
Reputation: 897
I think it's wrong; you should assign VR threads or blocks, because you already loop through the ST columns inside the kernel.
Maybe that will fix it.
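For illustration only (not part of this answer's original text), a launch configuration that always covers the VR rows, even if VR were larger than the maximum block size, might look like the sketch below; the kernel would then also need an "if (id >= VR) return;" guard at the top. The threadsPerBlock value is an arbitrary choice here.
// Hypothetical launch sketch: one thread per row, rounded up to whole blocks.
int threadsPerBlock = 128;
int blocks = (VR + threadsPerBlock - 1) / threadsPerBlock;
test<<<blocks, threadsPerBlock>>>(d_arr, devStates, pitch, unsigned(time(NULL)));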
Upvotes: 0