Reputation: 1127
I've just begun to experiment with OpenCL. I'm trying to write a kernel that multiplies two 2-D arrays. I've already done this with vectors; in 2-D, however, I only get results for the first row. I've tried implementing some of the solutions I found, but every single one of them only ever affects the first row. An image from the execution: https://i.sstatic.net/JmlAA.png
Here is my host file:
#include "stdafx.h"
#include <CL/cl.hpp>
#include <vector>
#include <iostream>
#include "util.hpp" // utility library
#define __CL_ENABLE_EXCEPTIONS
#define ROWS (5)
#define COLUMNS (5)
#include "metrics.h"
/*Start main()*/
int main(void)
{
int A = 4;
/*Define the vectors for operands and result*/
float** h_x = new float*[ROWS];
float** h_y = new float*[ROWS];
float** h_s = new float*[ROWS];
for (int i = 0; i < ROWS; ++i){
h_x[i] = new float[COLUMNS];
}
for (int i = 0; i < ROWS; ++i){
h_y[i] = new float[COLUMNS];
}
for (int i = 0; i < ROWS; ++i){
h_s[i] = new float[COLUMNS];
}
// Fill vectors a and b with random float values
for (int i = 0; i < ROWS; i++)
{
for (int j = 0; j < COLUMNS; j++){
h_x[i][j] = rand() / (float)RAND_MAX;
h_y[i][j] = rand() / (float)RAND_MAX;
h_s[i][j] = 0.0;
}
}
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
// Get all platforms (drivers)
std::vector<cl::Platform> all_platforms;
cl::Platform::get(&all_platforms);
if (all_platforms.size() == 0){ // Check for issues
std::cout << " No platforms found. Check OpenCL installation!\n";
exit(1);
}
cl::Platform default_platform = all_platforms[0];
std::cout << "Using platform: " << default_platform.getInfo<CL_PLATFORM_NAME>() << "\n";
// Get default device of the default platform
std::vector<cl::Device> all_devices;
default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
if (all_devices.size() == 0){ // Check for issues
std::cout << " No devices found. Check OpenCL installation!\n";
exit(1);
}
cl::Device default_device = all_devices[0];
std::cout << "Using device: " << default_device.getInfo<CL_DEVICE_NAME>() << "\n";
// Create an OpenCL context
cl::Context context({ default_device });
cl::Program program(context, util::loadProgram("saxy_kernel.cl"), true);
if (program.build({ default_device }) != CL_SUCCESS){
std::cout << " Error building: " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device) << "\n";
getchar();
exit(1);
}
// create buffers on the device
cl::Buffer buffer_X(context, CL_MEM_READ_WRITE, sizeof(float)* ROWS*COLUMNS);
cl::Buffer buffer_Y(context, CL_MEM_READ_WRITE, sizeof(float)* ROWS*COLUMNS);
cl::Buffer buffer_S(context, CL_MEM_READ_WRITE, sizeof(float)* ROWS*COLUMNS);
cl::Buffer buffer_A(context, CL_MEM_READ_WRITE, sizeof(int));
//create queue to which we will push commands for the device.
cl::CommandQueue queue(context, default_device);
StartCounter();
//write arrays A and B to the device
queue.enqueueWriteBuffer(buffer_X, CL_TRUE, 0, sizeof(float)* ROWS*COLUMNS, &h_x[0][0]);
queue.enqueueWriteBuffer(buffer_Y, CL_TRUE, 0, sizeof(float)* ROWS*COLUMNS, &h_y[0][0]);
queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(int), &A);
//run the kernel
cl::Kernel kernel_add = cl::Kernel(program, "simple_add");
kernel_add.setArg(0, buffer_X);
kernel_add.setArg(1, buffer_Y);
kernel_add.setArg(2, buffer_S);
kernel_add.setArg(3, buffer_A);
queue.enqueueNDRangeKernel(kernel_add, cl::NullRange, cl::NDRange(5,5), cl::NullRange);
queue.finish();
//read result C from the device to array C
queue.enqueueReadBuffer(buffer_S, CL_TRUE, 0, sizeof(float)* ROWS * COLUMNS, &h_s[0][0]);
std::cout << "Kernel execution time: " << GetCounter() << "ms \n";
/*Print vectors*/
std::cout << "\nMatrix #1: \n";
for (int i = 0; i<ROWS; i++){
std::cout << "\n";
for (int j = 0; j<COLUMNS; j++){
std::cout << "" << h_x[i][j] << "\t ";
}
}
std::cout << "\n\nMatrix #2: \n";
for (int i = 0; i<ROWS; i++){
std::cout << "\n";
for (int j = 0; j<COLUMNS; j++){
std::cout << "" << h_y[i][j] << "\t ";
}
}
std::cout << "\n\nResult: \n";
for (int i = 0; i<ROWS; i++){
std::cout << "\n";
for (int j = 0; j<COLUMNS; j++){
std::cout << "" << h_s[i][j] << "\t ";
}
}
getchar();
return 0;
}
And here the kernel:
// Element-wise product of two matrices stored as flat row-major buffers:
// S[idx] = X[idx] * Y[idx], one work-item per element.
//
//   X, Y : input buffers
//   S    : output buffer, same element count as X and Y
//   A    : scalar argument, currently unused by the kernel body
//
// The work-item id is flattened across both NDRange dimensions, so a 2-D
// launch of (rows, columns) covers every element. For a 1-D launch
// get_global_id(1) is 0 and get_global_size(1) is 1, so the index reduces to
// get_global_id(0).
__kernel void simple_add(
    __global float* X,
    __global float* Y,
    __global float* S,
    __global int *A){
    const size_t row = get_global_id(0);
    const size_t col = get_global_id(1);
    // Row-major linear index; indexing with get_global_id(0) alone would only
    // ever touch the first get_global_size(0) elements.
    const size_t idx = row * get_global_size(1) + col;
    S[idx] = X[idx] * Y[idx];
    /* Alternative (disabled): true matrix product for 5x5 matrices.
    int k;
    int i = get_global_id(0);
    int j = get_global_id(1);
    float tmp;
    if ( (i < 5) && (j < 5))
    {
    tmp = 0.0;
    for(k=0;k<5;k++)
    tmp += X[i*5+k] * Y[k*5+j];
    S[i*5+j] = tmp;
    }*/
}
I'm sure I'm doing something really wrong, but I can't figure out what it is. Any help will be greatly appreciated.
Upvotes: 0
Views: 2122
Reputation: 9925
Your kernel code is fine, as is the way that you are creating your OpenCL buffers and launching the kernel. The issue is in the way that your data is represented on the host, and how you are copying it to the device.
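Your OpenCL buffers are 1D arrays, which is necessary. Your host arrays, however, are allocated as arrays of row pointers (`float**`, with a separate `new float[COLUMNS]` for each row), which means that adjacent rows are not contiguous in memory: `&h_x[0][0]` points only at the first row, so copying `ROWS*COLUMNS` floats from it reads past the end of that row.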
The (simplest) fix would be to linearise your storage on the host, to match the data-layout of the device:
float* h_x = new float[ROWS*COLUMNS];
for (int i = 0; i < ROWS; ++i){
for (int j = 0; j < COLUMNS; ++j){
h_x[j + i*COLUMNS] = rand() / (float)RAND_MAX;;
}
}
Upvotes: 1