c4rrt3r
c4rrt3r

Reputation: 622

CUDA 2D array assignment

I have a strange problem dealing with 2D array on CUDA device.

    #define VR 100 // rows
    #define ST 13 // columns
    // Kernel: each thread fills one row of a pitched 2D array with random
    // values. Expects a 1D launch with one thread per row (VR threads total).
    // `generate` and `cuPrintf` are defined elsewhere — presumably `generate`
    // draws one uniform sample per call; confirm against its definition.
    __global__ void test(float *arr, curandState *globalState, size_t pitch, unsigned long seed) {
    int id = (blockIdx.x * blockDim.x)  + threadIdx.x;
    // Seed one curand state per thread (same seed, per-thread subsequence).
    curand_init ( seed, id, 0, &globalState[id] );
    cuPrintf("Thread id: %d \n", id);

    // Correct pitched addressing: rows are `pitch` BYTES apart, so the row
    // base must be computed in char* arithmetic before casting back.
    float* row = (float*)(((char*)arr) + id * pitch);
    for (int j = 0; j < ST; ++j) {
        row[j] = generate(globalState, id);
    }

}

int main() {
    float *d_arr;
    float *h_arr = new float[VR*ST];
    size_t pitch;
    // Pitched allocation: each row is padded out to `pitch` bytes, so the
    // device layout is NOT a packed VR*ST array.
    cudaMallocPitch(&d_arr, &pitch, ST* sizeof(float), VR);

    dim3 dimBlock(VR); 
    dim3 dimGrid(1,1);

    curandState* devStates;
    // NOTE(review): only VR threads are launched, so VR states would suffice;
    // VR*ST over-allocates.
    cudaMalloc ( &devStates, VR*ST*sizeof( curandState ) );

    test <<< dimGrid, dimBlock >>> (d_arr, devStates, pitch, unsigned(time(NULL)));
    // BUG(review): d_arr was allocated with cudaMallocPitch (padded rows) but
    // is copied back here with a flat cudaMemcpy, which interleaves the row
    // padding into h_arr — this is the source of the zeros between each run
    // of 13 values. Use cudaMemcpy2D with the pitch instead.
    cudaMemcpy(h_arr, d_arr,VR*ST*sizeof(float),cudaMemcpyDeviceToHost);

    for (int i=0; i<VR; i++) {
        for (int j=0; j<ST; j++) {
            cout << "N["<<i<<"]["<<j<<"]=" << h_arr[(i*ST)+j]<<endl;
        }
    }

I don't get evenly distributed numbers; instead they appear in runs of 13 with bunches of zeros in between. See: http://pastie.org/6106381

Upvotes: 2

Views: 488

Answers (2)

Robert Crovella
Robert Crovella

Reputation: 152173

The problem is that the original data array is being allocated using cudaMallocPitch whereas the copying is being done using ordinary cudaMemcpy. This will give unexpected results because the cudaMallocPitch operation creates "padded" rows to satisfy alignment requirements, whereas cudaMemcpy assumes everything is stored contiguously. Below is code that I believe has corrections to be functional:

    #include <stdio.h>
    #include <cstdlib>
    #include <ctime>
    #include <iostream>
    #include <curand_kernel.h>

    #define VR 100 // rows
    #define ST 13 // columns


// Draws one non-zero uniform random float for thread `id`.
// Copies the thread's RNG state into a local (register) variable, samples
// until the value is non-zero, then stores the advanced state back so the
// next call continues the sequence.
__device__ float generate(curandState* globalState, int id)
{
    curandState state = globalState[id];
    float sample = curand_uniform(&state);
    while (sample == 0) {
        sample = curand_uniform(&state);
    }
    globalState[id] = state;
    return sample;
}


    // Kernel: one thread per row. Each thread seeds its own curand state and
    // fills its row of the pitched 2D array with ST random values via
    // `generate`. Expects a 1D launch of VR threads.
    __global__ void test(float *arr, curandState *globalState, size_t pitch, unsigned long seed) {
        const int id = blockIdx.x * blockDim.x + threadIdx.x;
        curand_init(seed, id, 0, &globalState[id]);

        // Rows of a pitched allocation are `pitch` BYTES apart, so the row
        // base address must be computed with byte (char*) arithmetic.
        float* rowPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(arr) + id * pitch);
        for (int col = 0; col < ST; ++col) {
            rowPtr[col] = generate(globalState, id);
        }
    }

    using namespace std;

// Checks a CUDA runtime call and aborts with a readable message on failure.
// Kernel launches return no status directly; use cudaGetLastError() after.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(1);                                                          \
        }                                                                     \
    } while (0)

int main() {
    float *d_arr;
    float *h_arr = new float[VR*ST];
    size_t pitch;
    // Pitched allocation: each of the VR rows is padded up to `pitch` bytes
    // so row starts satisfy the device's alignment requirements.
    CUDA_CHECK(cudaMallocPitch(&d_arr, &pitch, ST * sizeof(float), VR));

    dim3 dimBlock(VR);   // one thread per row
    dim3 dimGrid(1, 1);

    // One RNG state per thread: the launch uses VR threads, so only VR
    // states are needed (the original over-allocated VR*ST of them).
    curandState* devStates;
    CUDA_CHECK(cudaMalloc(&devStates, VR * sizeof(curandState)));

    test <<< dimGrid, dimBlock >>> (d_arr, devStates, pitch, unsigned(time(NULL)));
    CUDA_CHECK(cudaGetLastError());   // catch launch-configuration errors

    // Device rows are `pitch` bytes apart but the host buffer is packed, so
    // a strided 2D copy is required; a flat cudaMemcpy would interleave the
    // row-padding bytes into the host data. This call also synchronizes with
    // the kernel before the host reads h_arr.
    CUDA_CHECK(cudaMemcpy2D(h_arr, ST * sizeof(float), d_arr, pitch,
                            ST * sizeof(float), VR, cudaMemcpyDeviceToHost));

    for (int i = 0; i < VR; i++) {
        for (int j = 0; j < ST; j++) {
            cout << "N["<<i<<"]["<<j<<"]=" << h_arr[(i*ST)+j]<<endl;
        }
    }

    // Release device and host resources (the original leaked all three).
    CUDA_CHECK(cudaFree(devStates));
    CUDA_CHECK(cudaFree(d_arr));
    delete[] h_arr;
    return 0;
}

Compiling the above code using:

nvcc -arch=sm_20 -lcurand  -o t70 t70.cu

and then running I get what appears to be "normal" output:

N[0][0]=0.876772
N[0][1]=0.550017
N[0][2]=0.49023
N[0][3]=0.530145
N[0][4]=0.501616
N[0][5]=0.326232
N[0][6]=0.438308
N[0][7]=0.857651
N[0][8]=0.462743
N[0][9]=0.38252
N[0][10]=0.258212
N[0][11]=0.194021
N[0][12]=0.895522
N[1][0]=0.559201
N[1][1]=0.257747
N[1][2]=0.430971
N[1][3]=0.707209
N[1][4]=0.599081
N[1][5]=0.0457626
N[1][6]=0.702412
N[1][7]=0.88791
N[1][8]=0.508877
N[1][9]=0.702734
N[1][10]=0.379898
N[1][11]=0.138841
N[1][12]=0.540869

(results truncated)

Upvotes: 4

Soroosh Bateni
Soroosh Bateni

Reputation: 897

I think the launch configuration is wrong: you should launch VR threads (or blocks), because you already loop over the ST columns inside the kernel.

Maybe that will fix it.

Upvotes: 0

Related Questions