Opencl Reduction is not as expected

Question

I'm pretty a novice about opencl. I have tried about "get the summation of all cubes of every element in an array". Here's my kernel code:

kernel void cubeSum(global float *input,
                    local float *prods,
                    global float *output )
{
    int gid = get_global_id( 0 );
    int tnum = get_local_id( 0 ); // thread number
    int wgNum = get_group_id( 0 ); // work-group number
    int numItems = get_local_size( 0 );
    prods[ tnum ] = input[ gid ] * input[ gid ] * input[gid]; // cube

    for (int offset = 1; offset < numItems; offset *= 2) {
        int mask = 2 * offset - 1;
        barrier(CLK_LOCAL_MEM_FENCE);
        if ( (tnum & mask) == 0 ) {
            prods[tnum] += prods[tnum + offset];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if ( tnum == 0 )
        output[wgNum] = prods[0];
}

I can't figure out why my result is not the same with sequential result. When the array is from 0 to 511, my result is sequential result minus 2048; when the array is from 0 to 1023, my result is sequential result plus 16384.

I will try to figure it out myself while I'm waiting for you answers.

Another question is I found it is hard to debug kernel code since the dataset is quite big and it runs concurrently. Any advice for debugging?

All the advices are appreciated =).

By the way, here's my host code:

#include 
#include 
#include 
#include 
#include 
#include 

#define NUM_ELEMENTS (512)
#define LOCAL_SIZE (512)
#define MAX_SOURCE_SIZE (0x100000)

int main(int argc, const char * argv[])
{
    float data[NUM_ELEMENTS];           //hA
    float sum;
    float sumTest;

    size_t global;
    size_t local;
    size_t numWorkGroups;
    size_t dataSize;
    size_t resultsSize;

    cl_device_id device;
    cl_context context;
    cl_command_queue cmdQueue;
    cl_program program;
    cl_kernel kernel;

    cl_mem input;
    cl_mem output;

    FILE *fp;
    //failed to use relative path here. permission problem?
    char fileName[] = "/Users/sure/USC/590/cubeSum/cubeSum/cubeSum.cl";
    char *source_str;
    size_t source_size;

    /* カーネルを含むソースコードをロード */
    fp = fopen(fileName, "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.
");
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose( fp );

    //allocate the host memory buffers:
    int i = 0;
    unsigned int count = NUM_ELEMENTS;
    for (i = 0; i < count; i++) {
        data[i] = i;
    }

    //array size in bytes (will need this later):
    dataSize = NUM_ELEMENTS * sizeof(float);

    //opencl function status
    cl_int status;

    // Connect to a compute device
    //
    int gpu = 1;

    status = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device, NULL);
    if (status != CL_SUCCESS)
    {
        printf("Error: Failed to create a device group!
");
        return EXIT_FAILURE;
    }

    //create an Opencl context
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &status);

    //create a command queue
    cmdQueue = clCreateCommandQueue( context, device, 0, &status );

    //allocate memory buffers on the device
    input = clCreateBuffer( context, CL_MEM_READ_ONLY, dataSize, NULL, &status );   //dA

    //TODO: at this line, I don't have the value of local which is calculated by clGetKernelWorkGroupInfo
    //need to figure out a way to avoid hardcode it.
    output = clCreateBuffer( context, CL_MEM_WRITE_ONLY, sizeof(float) * NUM_ELEMENTS / LOCAL_SIZE, NULL, &status ); //dC

    // enqueue the 2 commands to write data into the device buffers:
    status = clEnqueueWriteBuffer( cmdQueue, input, CL_FALSE, 0, dataSize, data, 0, NULL, NULL );

    // create the kernel program on the device:
    program = clCreateProgramWithSource(context, 1, (const char **) & source_str, (const size_t *)&source_size, &status);
    if (!program)
    {
        printf("Error: Failed to create compute program!
");
        return EXIT_FAILURE;
    }


    // Build the program executable
    //
    status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (status != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!
");
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s
", buffer);
        exit(1);
    }

    //create compute kernel
    kernel = clCreateKernel( program, "cubeSum", &status );

    // Get the maximum work group size for executing the kernel on the device
    //
    status = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    if (status != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d
", status);
        exit(1);
    }

    global = count;

    numWorkGroups = global / local;
    float results[numWorkGroups];        //hC
    resultsSize = numWorkGroups * sizeof(float);

    //set kernel parameter
    status = clSetKernelArg( kernel, 0, sizeof(cl_mem), &input );
    status = clSetKernelArg( kernel, 1, sizeof(float), NULL );
    status = clSetKernelArg( kernel, 2, sizeof(cl_mem), &output );

    // Execute the kernel over the entire range of our 1d input data set
    // using the maximum number of work group items for this device
    //
    status = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
    if (status)
    {
        printf("Error: Failed to execute kernel!
");
        return EXIT_FAILURE;
    }

    clFinish(cmdQueue);
    status = clEnqueueReadBuffer( cmdQueue, output, CL_TRUE, 0, resultsSize, results, 0, NULL, NULL );

    // Validate our results
    //
    sum = 0;

    for (int i=0; i



EDIT: Just found another thing. My code is correct if I just sum all element without cube/square. Thus, I'm gonna figure out how cube affect to my program.

jprice · Accepted Answer

You appear to only be allocating 4-bytes of local memory:

status = clSetKernelArg( kernel, 1, sizeof(float), NULL );

This should be the total amount of local memory required for that argument by the entire work-group. In the case of your kernel, this is (work-group-size * sizeof(float)).

So, you should instead have something like this:

status = clSetKernelArg( kernel, 1, local*sizeof(float), NULL );

The discrepancies you are seeing are likely coming from the limitations of floating point, since you are summing some very large numbers. If you initialise your inputs with smaller numbers (e.g. data[i] = i*0.01;), you should get results equal to your sequential implementation (I've verified this on my own system). This is why you don't see the errors when you remove the cube.

Opencl Reduction is not as expected

Answers (1)

Related Questions