Why is OpenCL nested loop only working for some elements

Question

I am trying to implement the following loop in an OpenCL kernel.

for(i=0;i



This is my kernel. I am currently hardcoding M to be 4 and it is only working for the first 4 elements.

__kernel
void cwk3( __global float *gradients,  __global float *inputs,  __global float *weights)
{
    // The global id tells us the index of the vector for this thread.
    int gid1 = get_global_id(0);
    int gid2 = get_global_id(1);

    // Perform the addition.
    weights[(gid1 * 4) + gid2] += gradients[gid1] * inputs[gid2];
}


The relevant c++ code is

    float
        *gradients = (float*) malloc( N  *sizeof(float) ),
        *inputs    = (float*) malloc(   M*sizeof(float) ),
        *weights   = (float*) malloc( N*M*sizeof(float) );
    initialiseArrays( gradients, inputs, weights, N, M );

    cl_mem deviceGradients = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, N*sizeof(float), gradients
    , &status );
    cl_mem deviceInputs = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, M*sizeof(float), inputs
    , &status );
    cl_mem deviceWeights = clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, N*M*sizeof(float), weights
    , &status );

    cl_kernel kernel = compileKernelFromFile( "kernel.cl", "cwk3", context, device );

    status = clSetKernelArg( kernel, 0, sizeof(deviceGradients), &deviceGradients );
    status = clSetKernelArg( kernel, 1, sizeof(deviceInputs), &deviceInputs );
    status = clSetKernelArg( kernel, 2, sizeof(deviceWeights), &deviceWeights );

    size_t indexSpaceSize[2], workGroupSize[1];
    indexSpaceSize[0] = N;
    indexSpaceSize[1] = M;
    workGroupSize [0] = 4;

    status = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, indexSpaceSize, workGroupSize, 0, NULL, NULL );
    if( status != CL_SUCCESS )
    {
        printf( "Failure enqueuing kernel: Error %d.
", status );
        return EXIT_FAILURE;        
    }

    status = clEnqueueReadBuffer( queue, deviceWeights, CL_TRUE, 0, N*M*sizeof(float), weights, 0, NULL, NULL );
    if( status != CL_SUCCESS )
    {
        printf( "Could not copy device data to host: Error %d.
", status );
        return EXIT_FAILURE;
    }


This simply creates the buffers and copies them to the GPU, launches the kernel and then reads the answer back from the GPU to the CPU. N and M are read in as command line arguments. I am currently setting them both to 4 for testing

Why is OpenCL nested loop only working for some elements

Answers (1)

Related Questions