Reputation: 1
I'm new to OpenCL.
I wrote a program that should do a parallel reduction on an array of 1 million elements. In the last part of the code I compare the CPU sum and the GPU sum; they are not the same, and that is the problem. My local size is 64. From index 90 onward, the GPU sum starts to get bigger than the CPU sum.
EDIT: if I sum smaller numbers (right now I sum 0 through 1M), say all 1's, the final sum is correct.
KERNEL:
__kernel void gpuSumfunc(__global float *vec, __global float *sum, int n)
{
    __local float tempSum[64];
    int i;
    const int globalID = get_global_id(0);  //BLOCK_DIM*BLOCK_IND+THREAD_ID
    const int tid = get_local_id(0);        //THREAD_ID
    const int BlockDIM = get_local_size(0); //BLOCK_DIM=64

    if (globalID < n)
    {
        tempSum[tid] = vec[globalID]; //Copying global data to local memory
    }
    else
    {
        tempSum[tid] = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE); //Waiting for all the threads to copy their data

    for (i = BlockDIM / 2; i > 0; i /= 2)
    {
        if (tid < i)
        {
            tempSum[tid] += tempSum[tid + i];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (tid == 0)
    {
        sum[get_group_id(0)] = tempSum[0];
    }
}
MAIN:
//HOST-cpu
float *h_a;//input
float *h_b;//output
float *h_s;
//DEVICE-gpu
cl_mem d_a;//input buffer
cl_mem d_b;//Output
//Kernel File
FILE* fileKernel;
//Memory allocation - cpu input
vector = (float*)malloc(n * sizeof(float));
h_a = (float*)malloc(n * sizeof(float));
h_b = (float*)malloc(n * sizeof(float));
h_s = (float*)malloc(n * sizeof(float));
memset(vector, 0, n * sizeof(float)); //zero the buffers (requires <string.h>)
memset(h_a, 0, n * sizeof(float));
memset(h_b, 0, n * sizeof(float));
memset(h_s, 0, n * sizeof(float));
//Initializing data for the GPU
for (i = 0; i < n; i++) {
    h_a[i] = (float)i;
}
//Initializing data for the CPU
for (i = 0; i < n; i++) {
    vector[i] = (float)i;
}
fileKernel = fopen("KernelCode.cl", "r");
if (!fileKernel)
{
    printf("Cannot open kernel file!\n");
    exit(1);
}
// Read kernel code
kernelSource = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(kernelSource, 1, MAX_SOURCE_SIZE, fileKernel);
fclose(fileKernel);
error = clGetPlatformIDs(2, cp_Platform, NULL); //array with two devices
error = clGetDeviceIDs(cp_Platform[1], CL_DEVICE_TYPE_GPU, 1, &Device_ID, NULL); // cp_platform[1] = Nvidia GPU
context = clCreateContext(NULL, 1, &Device_ID, NULL, NULL, &error); // creating openCL context
queue = clCreateCommandQueue(context, Device_ID, 0, &error); // creating command queue, executing openCL context on device cp_Platform[1]
globalSize = ceil(n / (float)localSize)*localSize;
d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, n * sizeof(float), NULL, NULL);
d_b = clCreateBuffer(context, CL_MEM_READ_WRITE, n * sizeof(float), NULL, NULL); //the kernel writes the per-group sums here, so it cannot be read-only
error = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0, n * sizeof(float), h_a, 0, NULL, NULL); //Enqueue a command to write to a buffer object from host memory.
error |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0, n * sizeof(float), h_s, 0, NULL, NULL); //Enqueue a command to write to a buffer object from host memory.
program = clCreateProgramWithSource(context, 1, (const char **)& kernelSource, (const size_t *)&source_size, &error); //this function creates a program object for this specific openCL context
error = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); //compiles and links a program executable from the program source
kernel = clCreateKernel(program, "gpuSumfunc", &error); //creating kernel object
error = clGetKernelWorkGroupInfo(kernel, Device_ID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void*)&workGroupSize, NULL);
error = clGetKernelWorkGroupInfo(kernel, Device_ID, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), (void*)&pWorkGroupSize, NULL);
error = clGetDeviceInfo(Device_ID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(NumOfCU), &NumOfCU, NULL);
error |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a); //Used to set the argument value for a specific argument of a kernel.
error |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
error |= clSetKernelArg(kernel, 2, sizeof(int), &n);
error |= clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL); // Enqueues a command to execute a kernel on a device.
clFinish(queue);
clEnqueueReadBuffer(queue, d_b, CL_TRUE, 0, n * sizeof(float), h_b, 0, NULL, NULL); //Reading data from the device (d_b) back to the host (h_b)
clock_t end = clock();
for (i = 0; i < (n + localSize - 1) / localSize; i++)
{
    gpuSum += h_b[i];
    cpuSum = cpuSumfunc(vector, 64 * (i + 1));
    if (fabs(gpuSum - cpuSum) > Tolerance)
    {
        printf("\nFailed! for index:%d", i);
        printf("\nCPU sum = %f", cpuSum);
        printf("\nGPU sum = %f\n", gpuSum);
    }
    else
    {
        printf("\nPassed! for index:%d", i);
        printf("\nCPU sum: %.2f", cpuSum);
        printf("\nGPU sum: %.2f\n", gpuSum);
    }
}
// cpu
time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
//printf("\nTotal program's running time is: %.2f\n", time_spent);
free(h_a);
free(h_b);
free(h_s);
free(vector);
//free(kernelSource);
clReleaseProgram(program);
clReleaseContext(context);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
}
float cpuSumfunc(float *vec, int n)
{
    float sum = 0;
    int i;
    for (i = 0; i < n; i++)
    {
        sum += vec[i];
    }
    return sum;
}
Upvotes: 0
Views: 438
Reputation: 8410
Float32 values are not accurate enough for your summation, and the rounding errors differ between the CPU and the GPU device.
16956560 needs 25 bits to be represented exactly, but Float32 only has a 23-bit mantissa (24 bits of effective precision).
Which means: 16956560 + 1 = 16956560 if the operation is performed in Float32.
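A quick host-side check of this (a standalone sketch, independent of the question's code):

#include <stdio.h>

int main(void)
{
    float big = 16956560.0f;   /* needs 25 bits as an integer */
    float bumped = big + 1.0f; /* the +1 is lost to rounding  */
    printf("big   = %.1f\n", big);
    printf("big+1 = %.1f\n", bumped);
    printf("equal? %s\n", (big == bumped) ? "yes" : "no"); /* prints "yes" */
    return 0;
}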
The difference between the two devices is that they add the values in a different order, so they accumulate different rounding errors.
You can solve it by using Float64 (double) or by using integers (int64_t = long).
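For example, the host-side reference sum could keep a double accumulator (a sketch of one possible change, not the code from the question):

/* Same loop as cpuSumfunc, but the accumulator is double; its 53-bit
   significand represents these integer values exactly. */
double cpuSumfuncD(const float *vec, int n)
{
    double sum = 0.0;
    int i;
    for (i = 0; i < n; i++)
    {
        sum += vec[i];
    }
    return sum;
}

On the device side, using double in the kernel requires the cl_khr_fp64 extension (#pragma OPENCL EXTENSION cl_khr_fp64 : enable) and matching double buffers.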
Note: your GPU sum is actually more accurate than the CPU one, since it first adds the small values together within each work-group and only then accumulates those larger partial sums into the final result.
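A host-only illustration of that ordering effect (a sketch using assumed values, not measurements from the question):

#include <stdio.h>

/* Sum 0..n-1 two ways: one running float accumulator (like the CPU
   reference) and 64-wide partial sums added afterwards (roughly the
   order the GPU kernel uses). */
int main(void)
{
    const int n = 1000000;
    const int block = 64;
    double exact = 0.0;
    float naive = 0.0f;
    float blocked = 0.0f;

    for (int i = 0; i < n; i += block)
    {
        float partial = 0.0f;      /* one work-group's local sum */
        for (int j = i; j < i + block && j < n; j++)
        {
            float v = (float)j;
            exact += v;
            naive += v;            /* running sum drops low bits early */
            partial += v;
        }
        blocked += partial;        /* add the group sum, like h_b[i] */
    }

    printf("exact   = %.1f\n", exact);
    printf("naive   = %.1f (error %.1f)\n", naive, (double)naive - exact);
    printf("blocked = %.1f (error %.1f)\n", blocked, (double)blocked - exact);
    return 0;
}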
Upvotes: 2