Callum Perks
Callum Perks

Reputation: 71

Why is OpenCL nested loop only working for some elements

I am trying to implement the following loop in an OpenCL kernel.

for(i=0;i<N;i++) for(j=0;j<M;j++) weights[i*M+j] += gradients[i] * input[j];

This is my kernel. I am currently hardcoding M to be 4 and it is only working for the first 4 elements.

__kernel
void cwk3( __global float *gradients,  __global float *inputs,  __global float *weights)
{
    // The global id tells us the index of the vector for this thread.
    int gid1 = get_global_id(0);
    int gid2 = get_global_id(1);

    // Perform the addition.
    weights[(gid1 * 4) + gid2] += gradients[gid1] * inputs[gid2];
}

The relevant c++ code is

    float
        *gradients = (float*) malloc( N  *sizeof(float) ),
        *inputs    = (float*) malloc(   M*sizeof(float) ),
        *weights   = (float*) malloc( N*M*sizeof(float) );
    initialiseArrays( gradients, inputs, weights, N, M );

    cl_mem deviceGradients = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, N*sizeof(float), gradients
    , &status );
    cl_mem deviceInputs = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, M*sizeof(float), inputs
    , &status );
    cl_mem deviceWeights = clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, N*M*sizeof(float), weights
    , &status );

    cl_kernel kernel = compileKernelFromFile( "kernel.cl", "cwk3", context, device );

    status = clSetKernelArg( kernel, 0, sizeof(deviceGradients), &deviceGradients );
    status = clSetKernelArg( kernel, 1, sizeof(deviceInputs), &deviceInputs );
    status = clSetKernelArg( kernel, 2, sizeof(deviceWeights), &deviceWeights );

    size_t indexSpaceSize[2], workGroupSize[1];
    indexSpaceSize[0] = N;
    indexSpaceSize[1] = M;
    workGroupSize [0] = 4;

    status = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, indexSpaceSize, workGroupSize, 0, NULL, NULL );
    if( status != CL_SUCCESS )
    {
        printf( "Failure enqueuing kernel: Error %d.\n", status );
        return EXIT_FAILURE;        
    }

    status = clEnqueueReadBuffer( queue, deviceWeights, CL_TRUE, 0, N*M*sizeof(float), weights, 0, NULL, NULL );
    if( status != CL_SUCCESS )
    {
        printf( "Could not copy device data to host: Error %d.\n", status );
        return EXIT_FAILURE;
    }

This simply creates the buffers and copies them to the GPU, launches the kernel and then reads the answer back from the GPU to the CPU. N and M are read in as command line arguments. I am currently setting them both to 4 for testing

Upvotes: 1

Views: 203

Answers (1)

Quimby
Quimby

Reputation: 19223

You seem to be confused about global and local work groups.

Global work size specifies total number of calls (work items) executed. global_work_size=[M,N] will call the kernel MxN times in total. One work item can determine its position by get_global_id. OpenCL could implement this as something like this :

for(i=0;i<N;i++)
   for(j=0;j<M;j++)
       call_kernel(set global_id=[i,j])

Local work groups describe how to group together individual launched work items( which are created according to global sizes) and make them aware of each other and share memory between themselves. None of those features you use/need, so ignore them. So to implement your for loop in OpenCL:

for(i=0;i<N;i++) 
    for(j=0;j<M;j++) 
        weights[i*M+j] += gradients[i] * input[j];

You would have this kernel:

 __kernel
void cwk3( __global float *gradients,  __global float *inputs,  __global float *weights)
{
    int gid1 = get_global_id(0);
    int gid2 = get_global_id(1);
    int M = get_global_size(0);

    weights[(gid1 * M) + gid2] += gradients[gid1] * inputs[gid2];
}

And call it like this:

size_t global_work[2];
global_work[0]=M;
global_work[1]=N;
// This is 2D kernel, not 1D
// Offsets are 0
// Global work size is M*N
// Ignore local work size 
status = clEnqueueNDRangeKernel( queue, kernel, 2, NULL, global_work);

Upvotes: 4

Related Questions