My CUDA kernel code is not working

Question

I try to make a small code to generate numbers and return the result in array but once I run this code it's not working, I have tried to use Nsight debugger to understand where is my problem but it freezes and closes immediately.

Could you help me please to understand where is the problem in this code?

__global__ void mykernel( int* PF_tmp, int* PL_tmp, int* QF_tmp, int* QL_tmp, 
                              int m[2], int p[5], int q[5], int i, int* n, 
                              int out[10][5], int N)
    {
        int id = blockDim.x * blockIdx.x + threadIdx.x;

        int idx = blockIdx.x;
        int idy = blockIdx.y;

        int w = idx/100;
        int x = idx%100;
        int y = idy;

        int z = threadIdx.x;

        int len = ((i * 2) + 5);


        // Fill PF_tmp & QF_tmp
        if( i > 0){
            for(int k = 0; k < (i * 2); k++)
            {
                p[k]   = PF_tmp[k];
                q[k]   = QF_tmp[k];
            }
        }

        // Fill X
        if( x > 10)
        {
            p[(i*2)] = (x - (x % 10)) / 10;
            p[(i*2)+1] = x % 10;
        }else{
            p[(i*2)] = 0;
            p[(i*2)+1] = x;
        }

        // Fill Y
        if( y > 10)
        {
            q[(i*2)] = (y - (y % 10)) / 10;
            q[(i*2)+1] = y % 10;
        }else{
            q[(i*2)] = 0;
            q[(i*2)+1] = y;
        }

        // Fill m
        p[(i * 2)+2] = m[0];
        q[(i * 2)+2] = m[1];

        // Fill W 
        if( w > 10)
        {
            p[(i*2)+3] = (w - (w % 10)) / 10;
            p[(i*2)+4] = w % 10;
        }else{
            p[(i*2)+3] = 0;
            p[(i*2)+4] = w;
        }

        // Fill Z 
        if( z > 10)
        {
            q[(i*2)+3] = (z - (z % 10)) / 10;
            q[(i*2)+4] = z % 10;
        }else{
            q[(i*2)+3] = 0;
            q[(i*2)+4] = z;
        }

        // Fill PL_tmp & QL_tmp
        if( i > 0)
        {
            for(int k = 0; k < (i * 2); k++)
            {
                p[(len-(i * 2))+k]   = PL_tmp[k];
                q[(len-(i * 2))+k]   = QL_tmp[k];
            }
        }

        if(id<10)
        {
            for(int k =0; k<5; k++)
                out[id][k] = p[k];
        }

    }



    int main()
    {
        cudaError err;
        dim3 blocks(10000, 100);
        dim3 threads(100); 

        int m[2] = {4,5};
        int hst_out[10][5];
        int p[5];
        int q[5];
        err = cudaMalloc((void **)&p, 5);
        err = cudaMalloc((void **)&q, 5);
        err = cudaMalloc((void **)&hst_out, 50);

        mykernel<<>>(NULL, NULL, NULL, NULL, m, p, q, 0, NULL, hst_out, 100000000);

        return 0;
    }

My CUDA kernel code is not working

Answers (1)

Related Questions