Bob R
Bob R

Reputation: 617

OpenCL Matrix Multiply runs, but answer is always zero

I am trying to learn/teach myself OpenCL and started with a program to do Matrix Multiply. No matter what I do, I end up with the answer of zero.

I know that a 1x3 and a 3x1 should yield a 1x1 answer, and it should be non-zero given the way I have it set up to create random floats. Here is my main body and the kernel. Apart from the warnings, what am I missing? I have been over this for hours and can't see the problem.

#define CL_USE_DEPRECATED_OPENCL_2_0_APIS

#include <iostream>
#include <fstream>
#include <sstream>
#include "./cl.hpp"

// Host program: picks the first GPU of the first OpenCL platform, builds the
// kernel found in "mCrossProd.cl", fills an nX x nY matrix A and an nY x nZ
// matrix B with random floats, launches the "mCrossProd" kernel and prints the
// nX x nZ result C.
// NOTE(review): none of the cl_int return codes of the enqueue* calls below
// are checked, so a failed transfer or launch goes unnoticed.
int main() 
{

    // Matrix dimensions: A is nX x nY, B is nY x nZ, C is nX x nZ.
    int nX = 1;
    int nY = 3;
    int nZ = 1;

    // Get all platforms
    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    if(platforms.empty()){
        throw std::runtime_error("No Platforms found, check OpenCL installation.");
    }
    // Use the first platform and its first GPU device.
    cl::Platform platform = platforms[0];
    std::cout << "Using Platform: " << platform.getInfo<CL_PLATFORM_NAME>() << std::endl;
    std::vector<cl::Device> devices;
    platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
    if(devices.empty()){
        throw std::runtime_error ("No Devices Found, check installation.");
    }
    cl::Device device = devices[0];

    // Create an execution context
    cl::Context context(device);

    // create a command queue
    cl::CommandQueue queue(context, device);

    // Load the kernel sources, use global memory
    std::ifstream fs("mCrossProd.cl");
    if(!fs.is_open()) {
        throw  std::runtime_error("Can not open kernel source file.");
    }
    // Slurp the whole file into a string and hand it to the program object.
    std::stringstream ss;
    ss << fs.rdbuf();
    std::string code = ss.str();
    cl::Program::Sources sources;
    sources.push_back({code.c_str(), code.length()});

    // Build the kernel
    // NOTE(review): cl.hpp presumably only throws when __CL_ENABLE_EXCEPTIONS
    // is defined before the include; it is not defined in this listing, so a
    // build failure would be reported through the ignored return value of
    // build() instead of reaching this catch - confirm.
    cl::Program program(context, sources);
    try{
        program.build({device});
    } catch(std::exception &err){
        throw  std::runtime_error(program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device));
    }

    //Create Matrix arrays and fill with random float values
    float *A = new float[nX*nY];
    float *B = new float[nY*nZ];
    float *C = new float[nX*nZ];

    // Fill A row by row (row stride nY) and echo every element.
    for(int i =0; i < nX; i++){
        for(int j = 0; j < nY; j++)
        {
            A[j + i*nY] = rand()/(float)RAND_MAX * 10 + 1;
            std::cout << " A[" << std::to_string(j + i * nY) << "] = ";
            std::cout << A[j + i*nY] << ' '; 
        }

        std::cout << std::endl;
    }

            std::cout << std::endl;


     // Fill B and echo every element.
     // NOTE(review): B holds nY*nZ floats but is indexed with row stride nY
     // instead of nZ; with nY = 3 and nZ = 1 this touches B[0], B[3] and B[6],
     // the last two of which are outside the allocation.
     for(int i =0; i < nY; i++){
        for(int j = 0; j < nZ; j++)
        {
            B[j + i*nY] = rand()/(float)RAND_MAX * 10 + 1 ;
            std::cout << " B[" + std::to_string(j + i * nY) + "] = " ;
            std::cout << B[j + i * nY] << " "; 
        }

        std::cout << std::endl;

    }

            std::cout << std::endl;


    //fill Matrix C with random values
    // NOTE(review): the row stride used here is nX although C has nZ columns,
    // and the value printed is read from B, not from C.
    for(int i =0; i < nX; i++){
        for(int j = 0; j < nZ; j++)
        {
            C[j + i*nX] = rand()/(float)RAND_MAX * 10 + 1 ;
            std::cout << " C[" + std::to_string(j + i * nX) + "] = " ;
            std::cout << B[j + i * nX] << " "; 
        }

        std::cout << std::endl;

    }

    // Create data/memory buffers, and enqueue them
    // NOTE(review): the second write also targets bufA, so bufA ends up
    // holding B's data and bufB is never filled. bufC is never written either,
    // so the device-side C does not receive the host values set above.
    cl::Buffer bufA(context, CL_MEM_READ_ONLY, sizeof(float) * nX * nY);
    cl::Buffer bufB(context, CL_MEM_READ_ONLY, sizeof(float) * nY * nZ);
    cl::Buffer bufC(context, CL_MEM_READ_WRITE, sizeof(float) * nX * nZ);
    queue.enqueueWriteBuffer(bufA, CL_TRUE, 0, sizeof(float) * nX * nY, A);
    queue.enqueueWriteBuffer(bufA, CL_TRUE, 0, sizeof(float) * nY * nZ, B);

    // Select kernel, pass arguments
    cl::Kernel kernel = cl::Kernel(program, "mCrossProd");
    kernel.setArg(0, nX);
    kernel.setArg(1, nY);
    kernel.setArg(2, nZ);
    kernel.setArg(3, bufA);
    kernel.setArg(4, bufB);
    kernel.setArg(5, bufC);

    // Execute the kernel
    // NOTE(review): the global range is (nX, nY) = (1, 3) while the requested
    // work-group size is (16, 16). The local size has to evenly divide the
    // global size, so this launch presumably fails with
    // CL_INVALID_WORK_GROUP_SIZE - and the return code is discarded.
    queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(nX, nY), cl::NDRange(16,16));
    
    // Retrieve results from global memory
    // Blocking read (CL_TRUE) of the nX x nZ result back into the host array C.
    queue.enqueueReadBuffer(bufC, CL_TRUE,0, sizeof(float) * nX * nZ, C);
    queue.finish();

    fs.close();

    std::cout << "\nThe solution is" << std::endl;

    // Print C.
    // NOTE(review): elements are addressed as j*nZ+i here, which differs from
    // the j + i*nX used when C was filled; both give index 0 for a 1x1 result.
    for(int i = 0; i < nX; i++){
        for(int j = 0; j < nZ; j++)
        {
            std::cout << "C[" + std::to_string(j*nZ+i) + "] = " ;
            std::cout << C[j*nZ+i] << " "; 

        }

        std::cout << std::endl;

    }
        // NOTE(review): A, B and C are never released with delete[], and the
        // listing ends without the closing brace of main.
        std::cout << std::endl;

This is my Kernel function:

// Kernel run once per work-item; (i, j) are the work-item's global ids in
// dimensions 0 and 1. Each work-item adds nX products A[j*nX+k] * B[k*nY+i]
// onto the existing value of C[j*nY+i].
// NOTE(review): the result is accumulated with += so it depends on whatever C
// already contains when the kernel starts.
// NOTE(review): the loop runs k over nX and the strides are nX / nY; for an
// (nX x nY) * (nY x nZ) product one would expect the loop to run over the
// shared dimension nY - confirm the intended layout.
__kernel void mCrossProd(const int nX, const int nY, const int nZ, __global float* A, __global float* B, __global float* C) {
    int i = get_global_id(0);
    int j = get_global_id(1);

    for(int k = 0; k < nX; k++){
        C[j*nY+i] += A[j*nX+k] * B[k*nY+i];
    }
}

Upvotes: 0

Views: 484

Answers (1)

Tagger5926
Tagger5926

Reputation: 442

The problem is in the following line of code:

  queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(nX, nY), cl::NDRange(16,16));

Try this instead:

  queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(M,N), cl::NDRange(1,1));

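I believe the arguments for enqueueNDRangeKernel refer to the number of workers/threads running for this kernel (I suggest reading the docs): the third argument is the global size, i.e. the total number of work-items in each dimension, and the fourth is the local work-group size, which has to evenly divide the global size. A 16x16 work-group cannot fit into a 1x3 global range, so the original call is rejected. Try playing around with the numbers and benchmark. Also, you can check for errors, which should make debugging easier.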

if( queue.enqueueNDRangeKernel() != CL_SUCCESS ) { throw error; }

By adding this, you will find out that your code currently does not even compute the matrix multiplication and fails to queue up the kernel.

There is also a bug in the way you are accessing your arrays. You only allocate 3 floats each for matrix A and matrix B, but you assign values to indices that are out of bounds (B is indexed with j + i*nY, which reaches index 6). It is the same mistake as float *A = new float[3]; A[5] = 10.0f;. This is undefined behaviour, which is why it does not necessarily crash, but it is quite dangerous. You also do not free the memory. If you use the new keyword, remember to match it with delete (or delete[] in this case) when you are done using the memory. Otherwise you get memory leaks. Alternatively, you can look into using vectors, smart pointers, statically sized arrays etc.

Also,

 // Quoted from the question: rethrows the build log if build() throws.
 // NOTE(review): cl.hpp presumably reports errors by exception only when
 // __CL_ENABLE_EXCEPTIONS is defined before the include; otherwise build()
 // just returns an error code and this catch never runs - confirm.
 try{
    program.build({device});
} catch(std::exception &err){
    throw  std::runtime_error(program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device));
}

Should probably be replaced with

if(program.build({device}!=CL_SUCCESS))
{
     std::cerr << "Failed to compile kernel code" << std::endl;
     exit(1);
}

Or at the very least use catch(...) instead of catching a specific exception type. This is because I do not think program.build throws an exception. The documentation states that it returns an error code, which is something you should check for.

One last thing: try to make the code simpler so it is easier to debug. For instance, you can try to create a simple kernel that just adds two arrays rather than multiplying.

Anyways, I have modified your code. Hopefully it makes sense:

#define CL_USE_DEPRECATED_OPENCL_2_0_APIS

#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include <CL/cl.hpp>

int main()
{
    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    if(platforms.empty()){
        throw std::runtime_error("No Platforms found, check OpenCL installation.");
    }

    cl::Platform platform = platforms[0];
    std::cout << "Using Platform: " << platform.getInfo<CL_PLATFORM_NAME>() << std::endl;
    std::vector<cl::Device> devices;
    platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
    if(devices.empty()){
        throw std::runtime_error ("No Devices Found, check installation.");
    }
    cl::Device device = devices[0];

    // Create an execusion context
    cl::Context context(device);

    // Load the kernel sources, use global memory
    std::ifstream fs("mCrossProd.cl");
    if(!fs.is_open()){
        throw  std::runtime_error("Cannot open kernel source file.");
    }

    // Extract kernel code
    std::stringstream ss;
    ss << fs.rdbuf();
    auto code = ss.str();
    cl::Program::Sources sources;
    sources.push_back({code.c_str(), code.length()});
    fs.close();

    // Build the kernel
    cl::Program program(context, sources);
    if(program.build({device})!=CL_SUCCESS){
        std::cout<<" Error building: "<<program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device)<<"\n";
        exit(1);
    }

    // Output matrix dimensions
    int M = 4, N = 3, K = 6;
    int A_dims = M * K;
    int B_dims = N * K;
    int C_dims = M * N;

    // Create buffers for device
    cl::Buffer buffer_A(context,CL_MEM_READ_WRITE,sizeof(float)*A_dims);
    cl::Buffer buffer_B(context,CL_MEM_READ_WRITE,sizeof(float)*B_dims);
    cl::Buffer buffer_C(context,CL_MEM_READ_WRITE,sizeof(float)*C_dims);

    float A[] = {2.0f, 1.0f, 2.0f, 2.0f, 4.0f, 1.0f,
                 4.0f, 2.0f, 1.0f, 1.0f, 0.0f, 0.0f,
                 3.0f, 2.0f, 5.0f, 1.0f, 1.0f, 1.0f,
                 0.0f, 0.0f, 0.0f, 2.0f, 1.0f, 1.0f};
    float B[] = {3.0f, 2.0f, 4.0f,
                 1.0f, 1.0f, 2.0f,
                 4.0f, 2.0f, 1.0f,
                 0.0f, 0.0f, 1.0f,
                 9.0f, 2.0f, 1.0f,
                 2.0f, 1.0f, 0.0f};
    float C[] = {0.0f, 0.0f, 0.0f,
                 0.0f, 0.0f, 0.0f,
                 0.0f, 0.0f, 0.0f,
                 0.0f, 0.0f, 0.0f};

    cl::CommandQueue queue(context,device);

    //write arrays A and B to the device
    queue.enqueueWriteBuffer(buffer_A,CL_TRUE,0,sizeof(float)*A_dims,A);
    queue.enqueueWriteBuffer(buffer_B,CL_TRUE,0,sizeof(float)*B_dims,B);

    // Select kernel, pass arguments
    cl::Kernel kernel = cl::Kernel(program, "mCrossProd");
    kernel.setArg(0, M);
    kernel.setArg(1, N);
    kernel.setArg(2, K);
    kernel.setArg(3, buffer_A);
    kernel.setArg(4, buffer_B);
    kernel.setArg(5, buffer_C);

    // Execute kernel
    if( queue.enqueueNDRangeKernel(kernel,cl::NullRange,cl::NDRange(M,N),cl::NDRange(1,1)) != CL_SUCCESS )
    {
        std::cout << "Failed to launch kernel" << std::endl;
        exit(1);
    }
    queue.finish();

    // read result C from the device to array C
    queue.enqueueReadBuffer(buffer_C,CL_TRUE,0,sizeof(float)*C_dims,C);
    std::cout << sizeof(C) / sizeof(float) << std::endl;
    std::cout << C_dims << std::endl;
    std::cout << M << " " << N << std::endl;
    std::cout << "\nThe solution is" << std::endl;
     for(int i = 0; i < M; i++) {
        for(int j = 0; j < N; j++) {
            std::cout << "C[" + std::to_string(i*N+j) + "] = ";
            std::cout << C[i*N+j] << " ";
        }
        std::cout << std::endl;
    }
}

Kernel source:

// Computes C = A * B for row-major matrices: A is M x K, B is K x N and C is
// M x N. Work-item (i, j) = (get_global_id(0), get_global_id(1)) produces the
// single element C[i*N+j].
//
// The dot product is summed in a private variable and stored once, so the
// result does not depend on the previous contents of C (a device buffer that
// was never written holds undefined data) and global memory is written only
// once per work-item.
__kernel void mCrossProd(const int M, const int N, const int K, __global float* A, __global float* B, __global float* C) {
    int const i = get_global_id(0);
    int const j = get_global_id(1);
    int const debug_elem_id = 3; // purely for debug purposes.

    // Ignore work-items outside the matrix, e.g. when the global size has
    // been rounded up to a multiple of the work-group size.
    if(i >= M || j >= N)
    {
        return;
    }

    float sum = 0.0f;
    for(int k = 0; k < K; k++){
        float const prod = A[i*K+k] * B[N*k+j];
        sum += prod;
        if((i*N+j)==debug_elem_id)
        {
            printf("PROD, %.2f\n", prod);
        }
    }
    C[i*N+j] = sum;
    if((i*N+j)==debug_elem_id)
    {
        printf("SUM: %.2f\n", sum);
    }
}

Edit: Made corrections to example code and explanation. Credit to @mogu

Upvotes: 2

Related Questions