al3ndaleeb
al3ndaleeb

Reputation: 11

My CUDA kernel code is not working

I try to make a small code to generate numbers and return the result in array but once I run this code it's not working, I have tried to use Nsight debugger to understand where is my problem but it freezes and closes immediately.

Could you help me please to understand where is the problem in this code?

__global__ void mykernel( int* PF_tmp, int* PL_tmp, int* QF_tmp, int* QL_tmp, 
                              int m[2], int p[5], int q[5], int i, int* n, 
                              int out[10][5], int N)
    {
        int id = blockDim.x * blockIdx.x + threadIdx.x;

        int idx = blockIdx.x;
        int idy = blockIdx.y;

        int w = idx/100;
        int x = idx%100;
        int y = idy;

        int z = threadIdx.x;

        int len = ((i * 2) + 5);


        // Fill PF_tmp & QF_tmp
        if( i > 0){
            for(int k = 0; k < (i * 2); k++)
            {
                p[k]   = PF_tmp[k];
                q[k]   = QF_tmp[k];
            }
        }

        // Fill X
        if( x > 10)
        {
            p[(i*2)] = (x - (x % 10)) / 10;
            p[(i*2)+1] = x % 10;
        }else{
            p[(i*2)] = 0;
            p[(i*2)+1] = x;
        }

        // Fill Y
        if( y > 10)
        {
            q[(i*2)] = (y - (y % 10)) / 10;
            q[(i*2)+1] = y % 10;
        }else{
            q[(i*2)] = 0;
            q[(i*2)+1] = y;
        }

        // Fill m
        p[(i * 2)+2] = m[0];
        q[(i * 2)+2] = m[1];

        // Fill W 
        if( w > 10)
        {
            p[(i*2)+3] = (w - (w % 10)) / 10;
            p[(i*2)+4] = w % 10;
        }else{
            p[(i*2)+3] = 0;
            p[(i*2)+4] = w;
        }

        // Fill Z 
        if( z > 10)
        {
            q[(i*2)+3] = (z - (z % 10)) / 10;
            q[(i*2)+4] = z % 10;
        }else{
            q[(i*2)+3] = 0;
            q[(i*2)+4] = z;
        }

        // Fill PL_tmp & QL_tmp
        if( i > 0)
        {
            for(int k = 0; k < (i * 2); k++)
            {
                p[(len-(i * 2))+k]   = PL_tmp[k];
                q[(len-(i * 2))+k]   = QL_tmp[k];
            }
        }

        if(id<10)
        {
            for(int k =0; k<5; k++)
                out[id][k] = p[k];
        }

    }



    int main()
    {
        cudaError err;
        dim3 blocks(10000, 100);
        dim3 threads(100); 

        int m[2] = {4,5};
        int hst_out[10][5];
        int p[5];
        int q[5];
        err = cudaMalloc((void **)&p, 5);
        err = cudaMalloc((void **)&q, 5);
        err = cudaMalloc((void **)&hst_out, 50);

        mykernel<<<blocks, threads>>>(NULL, NULL, NULL, NULL, m, p, q, 0, NULL, hst_out, 100000000);

        return 0;
    }

Upvotes: 0

Views: 431

Answers (1)

Peter Teoh
Peter Teoh

Reputation: 6753

The error very obvious, it is all C programming.

when you declare

        int m[2] = {4,5};
        int hst_out[10][5];
        int p[5];
        int q[5];

now hst_out, p, q are not a pointer, but later it is used as a pointer:

        err = cudaMalloc((void **)&p, 5);
        err = cudaMalloc((void **)&q, 5);
        err = cudaMalloc((void **)&hst_out, 50);

so u should have declare it initially as a pointer instead, eg,

        int *p;

and used it as this way:

        err = cudaMalloc((void **)&p, 5*sizeof(int));

And notice too that the size you have declared is just 5 bytes....whereas I declared it as 5*sizeof(int).

For more example see:

http://cuda-programming.blogspot.sg/2013/03/how-to-avoid-uses-of-cudamalloc-in.html

Upvotes: 1

Related Questions