Reputation: 617
I am trying to learn/teach myself OpenCL and started with a program to do Matrix Multiply. No matter what I do, I end up with the answer of zero.
I know that a 1x3 and a 3x1 should yield a 1x1 answer, and it should be non-zero given the way I have set it up to create random floats. Here is my main body, and the kernel. Other than the warnings, what am I missing? I have been over this for hours and can't see the problem.
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
#include <iostream>
#include <fstream>
#include <sstream>
#include "./cl.hpp"
// Host program from the question: selects the first OpenCL platform and its
// first GPU device, builds the kernel read from mCrossProd.cl, fills the
// matrices with random floats, launches "mCrossProd" and prints C.
// A holds nX*nY floats, B holds nY*nZ floats and C holds nX*nZ floats.
int main()
{
int nX = 1;
int nY = 3;
int nZ = 1;
// Get all platforms
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
if(platforms.empty()){
throw std::runtime_error("No Platforms found, check OpenCL installation.");
}
cl::Platform platform = platforms[0];
std::cout << "Using Platform: " << platform.getInfo<CL_PLATFORM_NAME>() << std::endl;
std::vector<cl::Device> devices;
platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
if(devices.empty()){
throw std::runtime_error ("No Devices Found, check installation.");
}
cl::Device device = devices[0];
// Create an execution context
cl::Context context(device);
// create a command queue
cl::CommandQueue queue(context, device);
// Load the kernel sources, use global memory
std::ifstream fs("mCrossProd.cl");
if(!fs.is_open()) {
throw std::runtime_error("Can not open kernel source file.");
}
std::stringstream ss;
ss << fs.rdbuf();
std::string code = ss.str();
cl::Program::Sources sources;
sources.push_back({code.c_str(), code.length()});
// Build the kernel
cl::Program program(context, sources);
// NOTE(review): this catch can only fire if the cl.hpp wrapper is configured
// to throw (presumably via __CL_ENABLE_EXCEPTIONS, which the visible code does
// not define); otherwise the status returned by build() is dropped — confirm.
try{
program.build({device});
} catch(std::exception &err){
throw std::runtime_error(program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device));
}
//Create Matrix arrays and fill with random float values
// NOTE(review): these new[] allocations are never released with delete[] in
// the visible code.
float *A = new float[nX*nY];
float *B = new float[nY*nZ];
float *C = new float[nX*nZ];
// A is filled row by row with a row stride of nY; each value is in [1, 11].
for(int i =0; i < nX; i++){
for(int j = 0; j < nY; j++)
{
A[j + i*nY] = rand()/(float)RAND_MAX * 10 + 1;
std::cout << " A[" << std::to_string(j + i * nY) << "] = ";
std::cout << A[j + i*nY] << ' ';
}
std::cout << std::endl;
}
std::cout << std::endl;
// NOTE(review): B has nY*nZ elements but is indexed with a row stride of nY
// instead of nZ; with nY = 3 and nZ = 1 the indices are 0, 3 and 6, so the
// last two writes are outside the 3-element allocation.
for(int i =0; i < nY; i++){
for(int j = 0; j < nZ; j++)
{
B[j + i*nY] = rand()/(float)RAND_MAX * 10 + 1 ;
std::cout << " B[" + std::to_string(j + i * nY) + "] = " ;
std::cout << B[j + i * nY] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
//fill Matrix C with random values
// NOTE(review): C has nX rows of nZ elements but is indexed with a row stride
// of nX; this only coincides with the intended layout while nX == nZ.
for(int i =0; i < nX; i++){
for(int j = 0; j < nZ; j++)
{
C[j + i*nX] = rand()/(float)RAND_MAX * 10 + 1 ;
std::cout << " C[" + std::to_string(j + i * nX) + "] = " ;
// NOTE(review): the label says C but the value printed is read from B.
std::cout << B[j + i * nX] << " ";
}
std::cout << std::endl;
}
// Create data/memory buffers, and enqueue them
cl::Buffer bufA(context, CL_MEM_READ_ONLY, sizeof(float) * nX * nY);
cl::Buffer bufB(context, CL_MEM_READ_ONLY, sizeof(float) * nY * nZ);
cl::Buffer bufC(context, CL_MEM_READ_WRITE, sizeof(float) * nX * nZ);
queue.enqueueWriteBuffer(bufA, CL_TRUE, 0, sizeof(float) * nX * nY, A);
// NOTE(review): the second write also targets bufA, overwriting A with B;
// bufB and bufC are never written from the host in the visible code.
queue.enqueueWriteBuffer(bufA, CL_TRUE, 0, sizeof(float) * nY * nZ, B);
// Select kernel, pass arguments
cl::Kernel kernel = cl::Kernel(program, "mCrossProd");
kernel.setArg(0, nX);
kernel.setArg(1, nY);
kernel.setArg(2, nZ);
kernel.setArg(3, bufA);
kernel.setArg(4, bufB);
kernel.setArg(5, bufC);
// Execute the kernel
// NOTE(review): the global range is (nX, nY) = (1, 3) while the requested
// work-group size is (16, 16); the return code is not checked, so a rejected
// launch would go unnoticed. The output matrix is nX x nZ, so the global range
// presumably should be (nX, nZ) — confirm.
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(nX, nY), cl::NDRange(16,16));
// Retrieve results from global memory
queue.enqueueReadBuffer(bufC, CL_TRUE,0, sizeof(float) * nX * nZ, C);
queue.finish();
fs.close();
std::cout << "\nThe solution is" << std::endl;
// NOTE(review): C is read back with index j*nZ+i, which differs from the
// j + i*nX index used when C was filled above.
for(int i = 0; i < nX; i++){
for(int j = 0; j < nZ; j++)
{
std::cout << "C[" + std::to_string(j*nZ+i) + "] = " ;
std::cout << C[j*nZ+i] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
This is my Kernel function:
// Kernel from the question: each work-item takes its global ids in dimensions
// 0 and 1 as (i, j) and accumulates products of A and B into one element of C.
// NOTE(review): the loop runs k < nX, but the host passes nX as the row count
// of A; the dimension shared by A and B on the host side is nY — confirm.
// NOTE(review): C is updated with +=, so the result depends on whatever the
// buffer held before the launch; the host code shown never writes bufC.
// NOTE(review): the index strides used here (nY for C and B, nX for A) do not
// match the strides the host uses to fill the arrays — verify the layout.
__kernel void mCrossProd(const int nX, const int nY, const int nZ, __global float* A, __global float* B, __global float* C) {
int i = get_global_id(0);
int j = get_global_id(1);
for(int k = 0; k < nX; k++){
C[j*nY+i] += A[j*nX+k] * B[k*nY+i];
}
}
Upvotes: 0
Views: 484
Reputation: 442
The problem is in the following line of code:
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(nX, nY), cl::NDRange(16,16));
Try this instead:
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(M,N), cl::NDRange(1,1));
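I believe the arguments for enqueueNDRangeKernel refer to the number of work-items (threads) running this kernel: the third argument is the global range (total work-items per dimension) and the fourth is the local range (the work-group size). In OpenCL 1.x each global dimension must be divisible by the corresponding local dimension, which a 1x3 global range is not for a 16x16 work-group (I suggest reading the docs). Try playing around with the numbers and benchmark. Also, you can check for errors, which should make debugging easier.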
if( queue.enqueueNDRangeKernel() != CL_SUCCESS ) { throw error; }
By adding this, you will find out that your code currently does not even compute the matrix multiplication and fails to queue up the kernel.
There is also a bug with the way you are accessing your arrays. You only allocate 3 floats each for matrix A and matrix B, but you assign values to indices that are out of bounds (for B, the index j + i*nY reaches 6). This is equivalent to writing float *A = new float[3]; A[5] = 10.0f;. This is undefined behaviour, which is why it does not necessarily crash, but it is quite dangerous. You also do not free the memory. If you use the new keyword, remember to match it with delete, or delete[] in this case, when you are done using the memory. Otherwise you get memory leaks. Alternatively, you can look into using vectors, smart pointers, statically sized arrays etc.
Also,
try{
program.build({device});
} catch(std::exception &err){
throw std::runtime_error(program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device));
}
Should probably be replaced with
if(program.build({device}!=CL_SUCCESS))
{
std::cerr << "Failed to compile kernel code" << std::endl;
exit(1);
}
Or at the very least do catch(...) instead of catching a specific exception. This is because I do not think program.build throws an exception (cl.hpp only throws when __CL_ENABLE_EXCEPTIONS is defined before the header is included). The documentation states that it returns an error code, which is something you should check for.
One last thing, try to make the code simpler so it is easier to debug. For instance, you can try creating a simple kernel that just adds two arrays rather than multiplying.
Anyways, I have modified your code. Hopefully it makes sense:
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <CL/cl.hpp>
int main()
{
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
if(platforms.empty()){
throw std::runtime_error("No Platforms found, check OpenCL installation.");
}
cl::Platform platform = platforms[0];
std::cout << "Using Platform: " << platform.getInfo<CL_PLATFORM_NAME>() << std::endl;
std::vector<cl::Device> devices;
platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
if(devices.empty()){
throw std::runtime_error ("No Devices Found, check installation.");
}
cl::Device device = devices[0];
// Create an execusion context
cl::Context context(device);
// Load the kernel sources, use global memory
std::ifstream fs("mCrossProd.cl");
if(!fs.is_open()){
throw std::runtime_error("Cannot open kernel source file.");
}
// Extract kernel code
std::stringstream ss;
ss << fs.rdbuf();
auto code = ss.str();
cl::Program::Sources sources;
sources.push_back({code.c_str(), code.length()});
fs.close();
// Build the kernel
cl::Program program(context, sources);
if(program.build({device})!=CL_SUCCESS){
std::cout<<" Error building: "<<program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device)<<"\n";
exit(1);
}
// Output matrix dimensions
int M = 4, N = 3, K = 6;
int A_dims = M * K;
int B_dims = N * K;
int C_dims = M * N;
// Create buffers for device
cl::Buffer buffer_A(context,CL_MEM_READ_WRITE,sizeof(float)*A_dims);
cl::Buffer buffer_B(context,CL_MEM_READ_WRITE,sizeof(float)*B_dims);
cl::Buffer buffer_C(context,CL_MEM_READ_WRITE,sizeof(float)*C_dims);
float A[] = {2.0f, 1.0f, 2.0f, 2.0f, 4.0f, 1.0f,
4.0f, 2.0f, 1.0f, 1.0f, 0.0f, 0.0f,
3.0f, 2.0f, 5.0f, 1.0f, 1.0f, 1.0f,
0.0f, 0.0f, 0.0f, 2.0f, 1.0f, 1.0f};
float B[] = {3.0f, 2.0f, 4.0f,
1.0f, 1.0f, 2.0f,
4.0f, 2.0f, 1.0f,
0.0f, 0.0f, 1.0f,
9.0f, 2.0f, 1.0f,
2.0f, 1.0f, 0.0f};
float C[] = {0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f};
cl::CommandQueue queue(context,device);
//write arrays A and B to the device
queue.enqueueWriteBuffer(buffer_A,CL_TRUE,0,sizeof(float)*A_dims,A);
queue.enqueueWriteBuffer(buffer_B,CL_TRUE,0,sizeof(float)*B_dims,B);
// Select kernel, pass arguments
cl::Kernel kernel = cl::Kernel(program, "mCrossProd");
kernel.setArg(0, M);
kernel.setArg(1, N);
kernel.setArg(2, K);
kernel.setArg(3, buffer_A);
kernel.setArg(4, buffer_B);
kernel.setArg(5, buffer_C);
// Execute kernel
if( queue.enqueueNDRangeKernel(kernel,cl::NullRange,cl::NDRange(M,N),cl::NDRange(1,1)) != CL_SUCCESS )
{
std::cout << "Failed to launch kernel" << std::endl;
exit(1);
}
queue.finish();
// read result C from the device to array C
queue.enqueueReadBuffer(buffer_C,CL_TRUE,0,sizeof(float)*C_dims,C);
std::cout << sizeof(C) / sizeof(float) << std::endl;
std::cout << C_dims << std::endl;
std::cout << M << " " << N << std::endl;
std::cout << "\nThe solution is" << std::endl;
for(int i = 0; i < M; i++) {
for(int j = 0; j < N; j++) {
std::cout << "C[" + std::to_string(i*N+j) + "] = ";
std::cout << C[i*N+j] << " ";
}
std::cout << std::endl;
}
}
Kernel source:
// Computes C = A * B for row-major matrices: A is M x K, B is K x N and C is
// M x N. Work-item (i, j), taken from global ids 0 and 1, produces C[i*N + j].
//
// The dot product is accumulated in a private variable and stored once, so the
// result does not depend on the previous contents of C and global memory is
// written a single time per element.
__kernel void mCrossProd(const int M, const int N, const int K, __global const float* A, __global const float* B, __global float* C) {
    int const i = get_global_id(0);
    int const j = get_global_id(1);
    // Ignore work-items outside the matrix (e.g. when the global range is
    // rounded up to a multiple of the work-group size).
    if (i >= M || j >= N) {
        return;
    }
    int const debug_elem_id = 3; // purely for debug purposes.
    bool const is_debug_elem = ((i*N+j) == debug_elem_id);
    float acc = 0.0f;
    for (int k = 0; k < K; k++) {
        float const prod = A[i*K+k] * B[N*k+j];
        acc += prod;
        if (is_debug_elem)
        {
            printf("PROD, %.2f\n", prod);
        }
    }
    C[i*N+j] = acc;
    if (is_debug_elem)
    {
        printf("SUM: %.2f\n", acc);
    }
}
Edit: Made corrections to example code and explanation. Credit to @mogu
Upvotes: 2