OpenCL 2-D array multiply

Question

I've just begun to experiment with OpenCL. I'm trying to write a kernel that multiplies two 2-D arrays. I've already done this with vectors; in 2-D, however, I only get results for the first row. I've tried implementing some of the solutions I found, but every single one of them only ever affects the first row. An image from the execution: https://i.sstatic.net/JmlAA.png

Here is my host file:

#include "stdafx.h"
#include 

#include 
#include 

#include "util.hpp" // utility library   

#define __CL_ENABLE_EXCEPTIONS
#define ROWS (5)
#define COLUMNS (5)

#include "metrics.h"

/*Start main()*/

int main(void)
{
    int A = 4;
    /*Define the vectors for operands and result*/

    float** h_x = new float*[ROWS];
    float** h_y = new float*[ROWS];
    float** h_s = new float*[ROWS];

    for (int i = 0; i < ROWS; ++i){
        h_x[i] = new float[COLUMNS];
    }

    for (int i = 0; i < ROWS; ++i){
        h_y[i] = new float[COLUMNS];
    }

    for (int i = 0; i < ROWS; ++i){
        h_s[i] = new float[COLUMNS];
    }

    // Fill vectors a and b with random float values

    for (int i = 0; i < ROWS; i++)
    {
        for (int j = 0; j < COLUMNS; j++){
            h_x[i][j] = rand() / (float)RAND_MAX;
            h_y[i][j] = rand() / (float)RAND_MAX;
            h_s[i][j] = 0.0;
        }   
    }

    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/

    // Get all platforms (drivers)

    std::vector all_platforms;
    cl::Platform::get(&all_platforms);


    if (all_platforms.size() == 0){ // Check for issues
        std::cout << " No platforms found. Check OpenCL installation!
";
        exit(1);
    }

    cl::Platform default_platform = all_platforms[0];
    std::cout << "Using platform: " << default_platform.getInfo() << "
";

    // Get default device of the default platform

    std::vector all_devices;
    default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);

    if (all_devices.size() == 0){ // Check for issues
        std::cout << " No devices found. Check OpenCL installation!
";
        exit(1);
    }

    cl::Device default_device = all_devices[0];
    std::cout << "Using device: " << default_device.getInfo() << "
";

    // Create an OpenCL context

    cl::Context context({ default_device });

    cl::Program program(context, util::loadProgram("saxy_kernel.cl"), true);

    if (program.build({ default_device }) != CL_SUCCESS){
        std::cout << " Error building: " << program.getBuildInfo(default_device) << "
";
        getchar();
        exit(1);
    }

    // create buffers on the device
    cl::Buffer buffer_X(context, CL_MEM_READ_WRITE, sizeof(float)* ROWS*COLUMNS);
    cl::Buffer buffer_Y(context, CL_MEM_READ_WRITE, sizeof(float)* ROWS*COLUMNS);
    cl::Buffer buffer_S(context, CL_MEM_READ_WRITE, sizeof(float)* ROWS*COLUMNS);
    cl::Buffer buffer_A(context, CL_MEM_READ_WRITE, sizeof(int));

    //create queue to which we will push commands for the device.
    cl::CommandQueue queue(context, default_device);


    StartCounter();
    //write arrays A and B to the device
    queue.enqueueWriteBuffer(buffer_X, CL_TRUE, 0, sizeof(float)* ROWS*COLUMNS, &h_x[0][0]);
    queue.enqueueWriteBuffer(buffer_Y, CL_TRUE, 0, sizeof(float)* ROWS*COLUMNS, &h_y[0][0]);
    queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(int), &A);

    //run the kernel
    cl::Kernel kernel_add = cl::Kernel(program, "simple_add");
    kernel_add.setArg(0, buffer_X);
    kernel_add.setArg(1, buffer_Y);
    kernel_add.setArg(2, buffer_S);
    kernel_add.setArg(3, buffer_A);

    queue.enqueueNDRangeKernel(kernel_add, cl::NullRange, cl::NDRange(5,5), cl::NullRange);
    queue.finish();

    //read result C from the device to array C
    queue.enqueueReadBuffer(buffer_S, CL_TRUE, 0, sizeof(float)* ROWS * COLUMNS, &h_s[0][0]);

    std::cout << "Kernel execution time: " << GetCounter() << "ms 
";

    /*Print vectors*/
    std::cout << "
Matrix #1: 
";
    for (int i = 0; i



And here the kernel:

/*
 * simple_add: element-wise product of two matrices stored as flat arrays.
 *
 * Each work-item computes S[idx] = X[idx] * Y[idx] for the one element it owns.
 *
 *   X, Y  input buffers, read at the work-item's linear index
 *   S     output buffer, written at the same linear index
 *   A     not used by the element-wise product; kept in the signature so the
 *         host-side setArg(3, ...) call keeps working
 *
 * The kernel may be enqueued with a 1-D or a 2-D global range. For a 2-D
 * range, dimension 0 is taken as the row and dimension 1 as the column, and
 * the buffers are assumed to be row-major with get_global_size(1) elements
 * per row. Indexing with get_global_id(0) alone would, for a 5x5 range, only
 * ever touch elements 0..4, i.e. the first row.
 */
__kernel void simple_add(
   __global float* X,
   __global float* Y,
   __global float* S,
   __global int *A){

   // For a 1-D launch get_global_id(1) is 0 and get_global_size(1) is 1,
   // so idx collapses to get_global_id(0).
   const size_t row = get_global_id(0);
   const size_t col = get_global_id(1);
   const size_t idx = row * get_global_size(1) + col;

   S[idx] = X[idx] * Y[idx];

/* Disabled alternative: true matrix product for 5x5 row-major matrices.
   int k;
   int i = get_global_id(0);
   int j = get_global_id(1);
   float tmp;

   if ( (i < 5) && (j < 5))
   {
       tmp = 0.0;
       for(k=0;k<5;k++)
           tmp += X[i*5+k] * Y[k*5+j];
       S[i*5+j] = tmp;
   }*/
}


I'm sure I'm doing something really wrong, but I can't figure out what it is. Any help will be greatly appreciated.

jprice · Accepted Answer

Your kernel code is fine, as is the way that you are creating your OpenCL buffers and launching the kernel. The issue is in the way that your data is represented on the host, and how you are copying it to the device.

Your OpenCL buffers are 1D arrays, which is necessary. Your host arrays are 2D, however, which means that adjacent rows are not contiguous (as allocated here, each 2D array is an array of pointers to separately allocated rows).

The (simplest) fix would be to linearise your storage on the host, to match the data-layout of the device:

float* h_x = new float[ROWS*COLUMNS];
for (int i = 0; i < ROWS; ++i){
    for (int j = 0; j < COLUMNS; ++j){
      h_x[j + i*COLUMNS] = rand() / (float)RAND_MAX;;
    }
}

OpenCL 2-D array multiply

Answers (1)

Related Questions