2D loop OpenCL program is not working

This is a simple parallel program that adds the elements of two arrays. It compiles successfully, but the results are not right.

The program reads the arrays from two files and then adds their elements.

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <iomanip>
#include <array>
#include <fstream>
#include <sstream>
#include <string>
#include <algorithm>
#include <iterator>

#ifdef __APPLE__
#include <OpenCL/opencl.h>
#include <CL/cl.h>
#include <time.h>

const int number_of_points = 12;  // number of points (rows) in both the A and B files
const int number_of_axis = 3;     // number of values per point (columns) in both the A and B files

using namespace std;

// Host program: fills two number_of_points x number_of_axis float arrays from
// two input streams, swaps columns 1 and 2 of the second array, runs the
// "calculate_bottom_SNM" OpenCL kernel on them and prints every input pair
// next to the corresponding element of the output array.
// NOTE(review): this listing appears to have lost some lines (closing braces,
// stream open/close calls, loop index increments); the comments below describe
// only what is shown.
int main(int argc, char *argv[]) {
    clock_t tStart = clock();   // reference point for the time report printed at the end
    // Create the two input vectors
    // working variables
    int i,j;
    ifstream input_fileA, input_fileB;  // input files
    string line;    // one line of text read from an input file
    float x;        // one value parsed from the current line
    int row = 0;    // current row index into the A/B arrays
    int col = 0;    // current column index into the A/B arrays

    // working arrays
    // NOTE(review): these sizes are element counts (rows * columns, no sizeof
    // factor). They are later passed as the size arguments of clCreateBuffer
    // and clEnqueueWriteBuffer - confirm whether byte counts were intended.
    int mem_size_InoutA = number_of_points * number_of_axis;
    int mem_size_InoutB = number_of_points * number_of_axis;
    int mem_size_Output = number_of_points * number_of_axis;

    float inputAArray[number_of_points][number_of_axis]={{0}};  // array contains file A data
    float inputBArray[number_of_points][number_of_axis]={{0}};  // array contains file B data
    float outputArray[number_of_points][number_of_axis]={{0}};  // array receives the kernel output

    // import input files

    // transfer input files data to array
    // input file A to arrayA
    row = 0;
    while (getline(input_fileA, line))

        istringstream streamA(line);    // split the line into whitespace-separated values
        col = 0;
        while(streamA >> x){
            inputAArray[row][col] = x;

    // input file B to arrayB
    row = 0;
    while (getline(input_fileB, line))

        istringstream streamB(line);    // split the line into whitespace-separated values
        col = 0;
        while(streamB >> x){
            inputBArray[row][col] = x;

    // switch columns of B array (swap column 1 and column 2 of every row)
    for(int row_of_arrayB = 0; row_of_arrayB < number_of_points; row_of_arrayB++ )
        float temporary = inputBArray[row_of_arrayB][2];
        inputBArray[row_of_arrayB][2] = inputBArray[row_of_arrayB][1];
        inputBArray[row_of_arrayB][1] = temporary;

    // close input files

    // Load the kernel source code into the array source_str
    FILE *fp;
    char *source_str;
    size_t source_size;

    fp = fopen("calculate_bottom_SNM_kernel.cl", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
    // The buffer holds, and fread requests, at most number_of_points bytes of
    // the kernel file; source_size is the number of bytes actually read.
    source_str = (char*)malloc(number_of_points);
    source_size = fread( source_str, 1, number_of_points, fp);
    fclose( fp );

    // Get platform and device information (first platform, first device of any type)
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    // NOTE(review): ret is assigned by every OpenCL call below but is never
    // inspected in the code shown.
    cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1,
            &device_id, &ret_num_devices);

    // Create an OpenCL context
    cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);

    // Create a command queue
    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

    // Create memory buffers on the device for each vector
    cl_mem inputa_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
            mem_size_InoutA , NULL, &ret);
    cl_mem inputb_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
            mem_size_InoutB, NULL, &ret);

    cl_mem output_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
            mem_size_Output, NULL, &ret);

    // Copy the lists A and B to their respective memory buffers
    ret = clEnqueueWriteBuffer(command_queue, inputa_mem_obj, CL_TRUE, 0,
            mem_size_InoutA, inputAArray, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, inputb_mem_obj, CL_TRUE, 0,
            mem_size_InoutB, inputBArray, 0, NULL, NULL);

    // Create a program from the kernel source
    cl_program program = clCreateProgramWithSource(context, 1,
            (const char **)&source_str, (const size_t *)&source_size, &ret);

    // Build the program
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

    // Create the OpenCL kernel
    cl_kernel kernel = clCreateKernel(program, "calculate_bottom_SNM", &ret);

    // Set the arguments of the kernel (0: input A, 1: input B, 2: output)
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputa_mem_obj);
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&inputb_mem_obj);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&output_mem_obj);

    // Execute the OpenCL kernel over a 2D index space
    size_t global_item_size[2], local_item_size[2];
    global_item_size[0] = number_of_points; // global size in dimension 0
    global_item_size[1] = number_of_points; // global size in dimension 1 (also number_of_points)
    local_item_size[0] = 3; // work-group size in dimension 0
    local_item_size[1] = 3; // work-group size in dimension 1

    ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL,
            global_item_size, local_item_size, 0, NULL, NULL);

    // Read the output buffer on the device back into outputArray
//    int *C = (int*)malloc(sizeof(int)*number_of_points);

//    float *C = (float*)malloc(sizeof(float)*number_of_points);
    // The read requests number_of_points * sizeof(float) bytes.
    ret = clEnqueueReadBuffer(command_queue, output_mem_obj, CL_TRUE, 0,
            number_of_points * sizeof(float), outputArray, 0, NULL, NULL);

    // Display the result to the screen
    float buttomSNM = 0;    // not used in the code shown
    for(i = 0; i < number_of_points; i++)
        for(j= 0; j < number_of_axis; j++)
            printf("%f + %f = %f\n", inputAArray[i][j], inputBArray[i][j], outputArray[i][j]);

    // Clean up
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(inputa_mem_obj);
    ret = clReleaseMemObject(inputb_mem_obj);
    ret = clReleaseMemObject(output_mem_obj);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);

// Report the clock() time elapsed since the start of main, in seconds.
printf("ALL Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
    return 0;

The kernel file is:

__kernel void calculate_bottom_SNM(__global float *inputAArray, __global float *inputBArray,
                         __global float *outputArray) {

    // Get the 2D global index of the current work-item
    int i = get_global_id(0);   // index in dimension 0
    int j = get_global_id(1);   // index in dimension 1

    outputArray[i][j] = inputAArray[i][j] + inputBArray[i][j];   // element-wise sum, written with two subscripts on float pointers

The First File (First Array)

0   0.000000e+00    9.998994e-01    
1   1.000000e-03    9.998981e-01    
2   2.000000e-03    9.998967e-01    
3   3.000000e-03    9.998953e-01    
4   4.000000e-03    9.998939e-01    
5   5.000000e-03    9.998925e-01    
6   6.000000e-03    9.998911e-01    
7   7.000000e-03    9.998896e-01    
8   8.000000e-03    9.998881e-01    
9   9.000000e-03    9.998865e-01    
10  1.000000e-02    9.998850e-01    
11  1.100000e-02    9.998834e-01

The Second File (Second Array)

0   0.000000e+00    9.998966e-01    
1   1.000000e-03    9.998953e-01    
2   2.000000e-03    9.998939e-01    
3   3.000000e-03    9.998925e-01    
4   4.000000e-03    9.998911e-01    
5   5.000000e-03    9.998896e-01    
6   6.000000e-03    9.998881e-01    
7   7.000000e-03    9.998866e-01    
8   8.000000e-03    9.998850e-01    
9   9.000000e-03    9.998834e-01    
10  1.000000e-02    9.998818e-01    

And The results:

0.000000 + 0.000000 = 0.000000
0.000000 + 0.999897 = 0.000000
0.999899 + 0.000000 = 0.000000
1.000000 + 1.000000 = 0.000000
0.001000 + 0.999895 = 0.000000
0.999898 + 0.001000 = 0.000000
2.000000 + 2.000000 = 0.000000
0.002000 + 0.999894 = 0.000000
0.999897 + 0.002000 = 0.000000
3.000000 + 3.000000 = 0.000000
0.003000 + 0.999892 = 0.000000
0.999895 + 0.003000 = 0.000000
4.000000 + 4.000000 = 0.000000
0.004000 + 0.999891 = 0.000000
0.999894 + 0.004000 = 0.000000
5.000000 + 5.000000 = 0.000000
0.005000 + 0.999890 = 0.000000
0.999892 + 0.005000 = 0.000000
6.000000 + 6.000000 = 0.000000
0.006000 + 0.999888 = 0.000000
0.999891 + 0.006000 = 0.000000
7.000000 + 7.000000 = 0.000000
0.007000 + 0.999887 = 0.000000
0.999890 + 0.007000 = 0.000000
8.000000 + 8.000000 = 0.000000
0.008000 + 0.999885 = 0.000000
0.999888 + 0.008000 = 0.000000
9.000000 + 9.000000 = 0.000000
0.009000 + 0.999883 = 0.000000
0.999887 + 0.009000 = 0.000000
10.000000 + 10.000000 = 0.000000
0.010000 + 0.999882 = 0.000000
0.999885 + 0.010000 = 0.000000
11.000000 + 0.000000 = 0.000000
0.011000 + 0.000000 = 0.000000
0.999883 + 0.000000 = 0.000000
ALL Time taken: 0.06s

Of course, these results are not right; the correct results would be the sums of the corresponding elements. Thanks,

Once again, you are failing to check the return codes from your OpenCL API calls. If you don't do this, you can't possibly know when problems occur. Every time you call an OpenCL function, you should do something like this:

// Capture the status code of the call and stop as soon as it reports failure.
ret = clDoSomething(...);
if (ret != CL_SUCCESS)
{
  // The braces matter: without them only printf is guarded and exit(1)
  // would run after every call, successful or not.
  printf("Failed on function clDoSomething: %d\n", ret);
  exit(1); // Or do whatever cleanup needs to be done before exiting
}

You can make this easier by defining a simple utility function:

/*
 * Report a failed OpenCL call and terminate the program.
 *
 * err       - status code returned by (or through) an OpenCL API call
 * operation - short description of the call, used in the error message
 *
 * Returns normally only when err is CL_SUCCESS. Otherwise it prints the
 * operation and the numeric error code to stderr and exits with status 1,
 * mirroring the inline print-and-exit check it is meant to replace.
 */
void checkError(cl_int err, const char *operation)
{
  if (err != CL_SUCCESS)
  {
    fprintf(stderr, "Error during operation '%s': %d\n", operation, err);
    exit(1);
  }
}


// Typical use: pass the status of the call just made, plus a short label
// that identifies the call in the error message.
ret = clDoSomething(...);
checkError(ret, "calling clDoSomething");

This time, the problem appears to come from the clBuildProgram call (it returns -54, which corresponds to CL_BUILD_PROGRAM_FAILURE). In this case, you also need to get the build log to see the full error:

  ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    // Get size of build log
    size_t logSize;
    ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
                                0, NULL, &logSize);
    checkError(ret, "getting build log size");

    // Get build log
    char log[logSize];
    ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
                                logSize, log, NULL);
    checkError(ret, "getting build log");

    printf("OpenCL program build log:\n%s\n", log);

If you add this to your code, you'll get a build log that looks something like this:

input.cl:1:10: error: unknown type name 'voi'; did you mean 'void'?
__kernel voi
input.cl:1:13: error: expected identifier or '('
__kernel voi

This looks strange, but indicates that your program may be getting cut off after just a few characters. If you look at the code that you've written to read the OpenCL program from file, you have this:

// Both the allocation and the read are limited to number_of_points bytes.
source_str = (char*)malloc(number_of_points);
source_size = fread( source_str, 1, number_of_points, fp);

So, you're only reading the first 12 characters of the program! You could use fseek and ftell to get the actual length of the file:

fseek(fp, 0, SEEK_END);
size_t programLength = ftell(fp);

source_str = (char*)malloc(programLength+1);
source_size = fread( source_str, 1, programLength, fp);
source_str[programLength] = '\0';

If you do this, you'll then get a different program build error:

input.cl:8:17: error: subscripted value is not an array, pointer, or vector
  outputArray[i][j] = inputAArray[i][j] + inputBArray[i][j];
input.cl:8:37: error: subscripted value is not an array, pointer, or vector
  outputArray[i][j] = inputAArray[i][j] + inputBArray[i][j];
input.cl:8:57: error: subscripted value is not an array, pointer, or vector
  outputArray[i][j] = inputAArray[i][j] + inputBArray[i][j];

This is because you are trying to index the arrays as if they were two-dimensional, whereas in fact they are only one-dimensional (as all OpenCL buffers are). You need to manually compute offsets into the 1D array to get around this, for example:

// Flatten the 2D work-item index (i, j) into a single offset; the same offset
// is used for both inputs and the output.
outputArray[i + j*number_of_points] = inputAArray[i + j*number_of_points] + inputBArray[i + j*number_of_points];

(this requires that you pass number_of_points to your kernel as an argument).

Finally, there are a couple of other errors:

  1. As pointed out in another answer, the size of the memory objects needs to be multiplied by sizeof(cl_float) (and the clEnqueueReadBuffer call needs to use this size too).

  2. Your global work size should probably be this:

    global_item_size[0] = number_of_points; // dimension 0: one work-item per point (row)

    global_item_size[1] = number_of_axis;   // dimension 1: one work-item per axis (column)

The main takeaway from this answer is that you really, really need to check the error codes returned by every OpenCL API function call; otherwise you will never be able to debug these problems.

// Buffer sizes in bytes: rows * columns * size of one element.
// size_t is the type of the size parameters these values are passed to
// (clCreateBuffer, clEnqueueWriteBuffer, clEnqueueReadBuffer) and avoids
// narrowing the result of sizeof into an int.
size_t mem_size_InoutA = (size_t)number_of_points * number_of_axis * sizeof(cl_float);
size_t mem_size_InoutB = (size_t)number_of_points * number_of_axis * sizeof(cl_float);
size_t mem_size_Output = (size_t)number_of_points * number_of_axis * sizeof(cl_float);


 clEnqueueReadBuffer(command_queue, output_mem_obj, CL_TRUE, 0,
        mem_size_Output, outputArray, 0, NULL, NULL);

