Reputation: 1
I'm new to OpenCL.
I wrote a program that should do a parallel reduction on an array of 1 million elements. In the last part of the code I compare the CPU sum and the GPU sum; they are not the same, and that is the problem. My local size is 64. From index 90 onward, the GPU sum starts to get bigger than the CPU sum.
EDIT: if I sum smaller numbers (right now I sum 0 through 1M), say all 1's, the final sum is correct.
KERNEL:
__kernel void gpuSumfunc(__global float *vec, __global float *sum, int n)
{
    __local float tempSum[64];
    int i;
    const int globalID = get_global_id(0);  //BLOCK_DIM*BLOCK_IND+THREAD_ID
    const int tid = get_local_id(0);        //THREAD_ID
    const int BlockDIM = get_local_size(0); //BLOCK_DIM=64

    if (globalID < n)
    {
        tempSum[tid] = vec[globalID]; //Copying global data to local memory
    }
    else
    {
        tempSum[tid] = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE); //Waiting for all the threads to copy their data

    for (i = BlockDIM / 2; i > 0; i /= 2)
    {
        if (tid < i)
        {
            tempSum[tid] += tempSum[tid + i];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (tid == 0)
    {
        sum[get_group_id(0)] = tempSum[0];
    }
}
MAIN:
//HOST-cpu
float *h_a;//input
float *h_b;//output
float *h_s;
//DEVICE-gpu
cl_mem d_a;//input buffer
cl_mem d_b;//Output
//Kernel File
FILE* fileKernel;
//Memory allocation - cpu input
vector = (float*)malloc(n * sizeof(float));
h_a = (float*)malloc(n * sizeof(float));
h_b = (float*)malloc(n * sizeof(float));
h_s = (float*)malloc(n * sizeof(float));
memset(vector, 0, n * sizeof(float)); //zero the buffers (requires <string.h>)
memset(h_a, 0, n * sizeof(float));
memset(h_b, 0, n * sizeof(float));
memset(h_s, 0, n * sizeof(float));
//Initializing data for the GPU
for (i = 0; i < n; i++) {
    h_a[i] = (float)i;
}
//Initializing data for the CPU
for (i = 0; i < n; i++) {
    vector[i] = (float)i;
}
fileKernel = fopen("KernelCode.cl", "r");
if (!fileKernel)
{
    printf("Cannot open kernel file!\n");
    exit(1);
}
// Read kernel code
kernelSource = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(kernelSource, 1, MAX_SOURCE_SIZE, fileKernel);
fclose(fileKernel);
error = clGetPlatformIDs(2, cp_Platform, NULL); //array with two devices
error = clGetDeviceIDs(cp_Platform[1], CL_DEVICE_TYPE_GPU, 1, &Device_ID, NULL); // cp_platform[1] = Nvidia GPU
context = clCreateContext(NULL, 1, &Device_ID, NULL, NULL, &error); // creating openCL context
queue = clCreateCommandQueue(context, Device_ID, 0, &error); // creating command queue, executing openCL context on device cp_Platform[1]
globalSize = ceil(n / (float)localSize)*localSize;
d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, n * sizeof(float), NULL, NULL);
d_b = clCreateBuffer(context, CL_MEM_READ_WRITE, n * sizeof(float), NULL, NULL); //the kernel writes the per-group sums here, so it cannot be read-only
error = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0, n * sizeof(float), h_a, 0, NULL, NULL); //Enqueue a command to write to a buffer object from host memory.
error |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0, n * sizeof(float), h_s, 0, NULL, NULL); //Enqueue a command to write to a buffer object from host memory.
program = clCreateProgramWithSource(context, 1, (const char **)& kernelSource, (const size_t *)&source_size, &error); //this function creates a program object for this specific openCL context
error = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); //compiles and links a program executable from the program source
kernel = clCreateKernel(program, "gpuSumfunc", &error); //creating kernel object
error = clGetKernelWorkGroupInfo(kernel, Device_ID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void*)&workGroupSize, NULL);
error = clGetKernelWorkGroupInfo(kernel, Device_ID, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), (void*)&pWorkGroupSize, NULL);
error = clGetDeviceInfo(Device_ID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(NumOfCU), &NumOfCU, NULL);
error |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a); //Used to set the argument value for a specific argument of a kernel.
error |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
error |= clSetKernelArg(kernel, 2, sizeof(int), &n);
error |= clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL); // Enqueues a command to execute a kernel on a device.
clFinish(queue);
clEnqueueReadBuffer(queue, d_b, CL_TRUE, 0, n * sizeof(float), h_b, 0, NULL, NULL); //Reading data from the device (d_b) back to the host (h_b)
clock_t end = clock();
for (i = 0; i < (n + localSize - 1) / localSize; i++)
{
    gpuSum += h_b[i];
    cpuSum = cpuSumfunc(vector, 64 * (i + 1));
    if (fabs(gpuSum - cpuSum) > Tolerance)
    {
        printf("\nFailed! for index:%d", i);
        printf("\nCPU sum = %f", cpuSum);
        printf("\nGPU sum = %f\n", gpuSum);
    }
    else
    {
        printf("\nPassed! for index:%d", i);
        printf("\nCPU sum: %.2f", cpuSum);
        printf("\nGPU sum: %.2f\n", gpuSum);
    }
}
// cpu
time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
//printf("\nTotal program's running time is: %.2f\n", time_spent);
free(h_a);
free(h_b);
free(h_s);
free(vector);
//free(kernelSource);
clReleaseProgram(program);
clReleaseContext(context);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
}
float cpuSumfunc(float *vec, int n)
{
    float sum = 0;
    int i;
    for (i = 0; i < n; i++)
    {
        sum += vec[i];
    }
    return sum;
}
Upvotes: 0
Views: 438
Reputation: 8410
Float32 values are not accurate enough for your summation, and the rounding errors differ between the CPU and the GPU device.
16956560 needs 25 bits to be represented exactly, but Float32 only has a 23-bit mantissa (24 bits of effective precision).
Which means: 16956560 + 1 = 16956560 if the operation is performed in Float32.
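A quick host-side check of this (a standalone sketch, independent of the question's code):

#include <stdio.h>

int main(void)
{
    float big = 16956560.0f;   /* needs 25 bits as an integer */
    float bumped = big + 1.0f; /* the +1 is lost to rounding  */
    printf("big   = %.1f\n", big);
    printf("big+1 = %.1f\n", bumped);
    printf("equal? %s\n", (big == bumped) ? "yes" : "no"); /* prints "yes" */
    return 0;
}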
The difference between the two devices is that they add the values in a different order, so they accumulate different rounding errors.
You can solve it by using Float64 (double) or by using integers (int64_t = long).
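For example, the host-side reference sum could keep a double accumulator (a sketch of one possible change, not the code from the question):

/* Same loop as cpuSumfunc, but the accumulator is double; its 53-bit
   significand represents these integer values exactly. */
double cpuSumfuncD(const float *vec, int n)
{
    double sum = 0.0;
    int i;
    for (i = 0; i < n; i++)
    {
        sum += vec[i];
    }
    return sum;
}

On the device side, using double in the kernel requires the cl_khr_fp64 extension (#pragma OPENCL EXTENSION cl_khr_fp64 : enable) and matching double buffers.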
Note: your GPU sum is actually more accurate than the CPU one, since it first adds the small values together within each work-group and only then accumulates those larger partial sums into the final result.
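A host-only illustration of that ordering effect (a sketch using assumed values, not measurements from the question):

#include <stdio.h>

/* Sum 0..n-1 two ways: one running float accumulator (like the CPU
   reference) and 64-wide partial sums added afterwards (roughly the
   order the GPU kernel uses). */
int main(void)
{
    const int n = 1000000;
    const int block = 64;
    double exact = 0.0;
    float naive = 0.0f;
    float blocked = 0.0f;

    for (int i = 0; i < n; i += block)
    {
        float partial = 0.0f;      /* one work-group's local sum */
        for (int j = i; j < i + block && j < n; j++)
        {
            float v = (float)j;
            exact += v;
            naive += v;            /* running sum drops low bits early */
            partial += v;
        }
        blocked += partial;        /* add the group sum, like h_b[i] */
    }

    printf("exact   = %.1f\n", exact);
    printf("naive   = %.1f (error %.1f)\n", naive, (double)naive - exact);
    printf("blocked = %.1f (error %.1f)\n", blocked, (double)blocked - exact);
    return 0;
}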
Upvotes: 2