OpenCL Matrix Multiplication fails

Question

I am implementing a matrix multiplication using OpenCL but the problem is that i am always getting wrong results. Maybe someone here can give me a hint where the mistake might be.

Data Types and constants:

 /*
     *  Data types
     */

    // Matrix data type
    typedef float TMatrix;

    // This struct hold data for one Matrix
    struct Dimension {
        size_t x;
        size_t y;
        size_t z;
    };

    struct TMat {
        Dimension dims;
        TMatrix* pData;
    };

    /*
     *  Globals
     */

    // name of graphic card
    std::string const kPlatformName = "NVIDIA";
    std::string const kDeviceName = "GTX";

    // used block size (16 for below Fermi (2.x) and 32 for above)
    /*
     *  WARNING: This constant is also defined in the Kernel Source
     *           Do not forget to apply changes there too!
     */
    int const kBlockSize = 32;

    TMatrix const kFailure = 1e-5f;

    // Matrix dimensions - scale factor! (i [= kStart, kStart+kStep .. kStop] * block_size, ...)
    size_t const kStart = 1;
    size_t const kStep = 1;
    size_t const kStop = 30 + 1;        // check for < kStop


    // number of iterations to calculate avg exec time on GPU
    size_t const kNrIter = 1;

Host Code:

// enable OpenCL exceptions
    #define __CL_ENABLE_EXCEPTIONS

    #include 
    #include 
    #include 
    #include 
    #include 
    #include 
    #include 
    #include 
    #include "types.h"

    #include "StopWatch.h"

    // include OpenCL C++ Wrapper classes instead of pure C API
    #include "CL/cl.hpp"

    using namespace std;

    /*
     *  Forward declaration
     */
    cl::Program LoadProgram(cl::Context& context, std::string const& fname, std::string const& params = "");
    int oclMatrixMult(TMat const& matA, TMat const& matB, TMat& matC);
    int CPUMatrixMultiplyNaive(TMat const& matA, TMat const& matB, TMat& matC, size_t const nrIt);

    int main(int argc, char** argv) {
        srand(2013);
        int retCPU = 1, retGPU = 1;
        volatile TMatrix GPUOptPrevent = 0;

        for (size_t i = kStart; i < kStop; i += kStep) {

                // Allocate host memory
            TMat matA;
            matA.dims.x = i*kBlockSize;
            matA.dims.y = i*kBlockSize;
            matA.dims.z = 1;
            matA.pData = new TMatrix[sizeof(TMatrix) * matA.dims.x * matA.dims.y];

            TMat matB;
            matB.dims.x = i*2*kBlockSize;
            matB.dims.y = i*kBlockSize;
            matB.dims.z = 1;
            matB.pData = new TMatrix[sizeof(TMatrix) * matB.dims.x * matB.dims.y];

            TMat matC;
            matC.dims.x = matB.dims.x;
            matC.dims.y = matA.dims.y;
            matC.dims.z = 1;
            matC.pData = new TMatrix[sizeof(TMatrix) * matC.dims.x * matC.dims.y];

            TMat matCPU;
            matCPU.dims.x = matB.dims.x;
            matCPU.dims.y = matA.dims.y;
            matCPU.dims.z = 1;
            matCPU.pData = new TMatrix[sizeof(TMatrix) * matCPU.dims.x * matCPU.dims.y];

            cout << "Matrix dimensions: A(" << matA.dims.x << "," << matA.dims.y << "), B(" << matB.dims.x << "," << matB.dims.y << ")" << endl;

                // Initialize host memory
    #if 0
            generate(matA.pData, matA.pData + matA.dims.x * matA.dims.y, rand);
            generate(matB.pData, matB.pData + matB.dims.x * matB.dims.y, rand);
    #else
            for (size_t a=0;a kFailure) {
                    correct = false;
                    break;
                }
            }
            cout << ((correct)?"OK":"Failed!") << endl << endl;

            delete [] matA.pData; matA.pData = 0;
            delete [] matB.pData; matB.pData = 0;
            delete [] matC.pData; matC.pData = 0;
            delete [] matCPU.pData; matCPU.pData = 0;
        }

        return retCPU + retGPU;
    }

    int oclMatrixMult(TMat const& matA, TMat const& matB, TMat& matC) {
        try {
                // Initialize OpenCL
            std::vector platforms;
            std::vector devices;
            int pidx = -1;  // platform index
            int didx = -1;  // device index

            size_t memsize_A = sizeof(TMatrix) * matA.dims.x * matA.dims.y;
            size_t memsize_B = sizeof(TMatrix) * matB.dims.x * matB.dims.y;
            size_t memsize_C = sizeof(TMatrix) * matC.dims.x * matC.dims.y;

                // checkout available GPU devices
            cl::Platform::get(&platforms);
            if (platforms.size() < 1) {
                return 1;
            }

            for (size_t i = 0; i < platforms.size(); ++i) {
                platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &devices);
                if (platforms[i].getInfo().find(kPlatformName) == string::npos) {
                    continue;
                }
                if (devices.size() > 0) {
                    for (size_t j = 0; j < devices.size(); ++j) {
                        if (devices[j].getInfo().find(kDeviceName) != string::npos) {
                            pidx = i;
                            didx = j;
                        }
                    }
                }
                if (didx >= 0) {
                    break;
                }
            }

                // create context for found GPU devices.
            std::vector device;
            device.push_back(devices[didx]);
            cl_context_properties cop[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[pidx])(), 0};

    #if 1
            cout << "Platform: " << platforms[pidx].getInfo() << endl
                << "Device: " << devices[didx].getInfo() << endl;
    #endif

            cl::Context context;
            context = cl::Context(device, cop, 0, 0, 0);

                // create a cmd queue
            cl::CommandQueue CmdQueue(context, context.getInfo() [0]);

            // create buffers
            // two read only (ad Nvidia Programmers Guide for OpenCL)
            // and one write only buffer!
            cl_int err = CL_SUCCESS;
            cl::Buffer dA(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, memsize_A, matA.pData, &err);
            if (err != CL_SUCCESS) {
                cerr << "cl::Buffer() failed (dA)" << endl;
            }
            cl::Buffer dB(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, memsize_B, matB.pData, &err);
            if (err != CL_SUCCESS) {
                cerr << "cl::Buffer() failed (dA)" << endl;
            }
            cl::Buffer dC(context, CL_MEM_WRITE_ONLY, memsize_C, matC.pData, &err);
            if (err != CL_SUCCESS) {
                cerr << "cl::Buffer() failed (dA)" << endl;
            }

                // Load and build OpenCL kernel
            cl::Program program = LoadProgram(context, "MatrixMulKernel.cl");

                // Launch OpenCL kernel
            cl::Kernel MatMulKernel;
            MatMulKernel = cl::Kernel(program, "MatrixMulKernel");

            MatMulKernel.setArg(0, dC);
            MatMulKernel.setArg(1, dA);
            MatMulKernel.setArg(2, dB);
            MatMulKernel.setArg(3, matA.dims.x);
            MatMulKernel.setArg(4, matB.dims.x);

                // execute the kernel
            cl::NDRange LocalWorksize(kBlockSize,kBlockSize);
            cl::NDRange GlobalWorksize(matB.dims.x/(kBlockSize), matA.dims.y/(kBlockSize));

            // necessary?
            //CmdQueue.enqueueWriteBuffer(dA, CL_TRUE, 0, memsize_A, matA.pData);
            //CmdQueue.enqueueWriteBuffer(dB, CL_TRUE, 0, memsize_B, matB.pData);

            CmdQueue.enqueueNDRangeKernel(MatMulKernel, cl::NullRange, GlobalWorksize, cl::NullRange);

                // Retrieve result from device
            CmdQueue.enqueueReadBuffer(dC, CL_TRUE, 0, memsize_C, matC.pData);
            CmdQueue.finish();
        }
        catch (cl::Error err) {
            std::cerr << "ERROR: " << err.what() << " ("    << err.err() << ")" << std::endl;
            return 1;
        }
        catch (std::string err) {
            std::cerr << "ERROR: " << err << std::endl;
            return 1;
        }
        catch (...) {
            std::cerr << "Unknown error occurred!" << std::endl;
            return 1;
        }

        return 0;
    }

    int CPUMatrixMultiplyNaive(TMat const& matA, TMat const& matB, TMat& matC, size_t const nrIt) {

        if (nrIt == 0) return 1;

        TMatrix tmp = 0;
        for (size_t i = 0; i < nrIt; ++i) {

            for (size_t i = 0; i < matA.dims.y; ++i) {          // height of C
                for (size_t j = 0; j < matB.dims.x; ++j) {      // width of C
                    for (size_t k = 0; k < matA.dims.x; ++k) {  // width of A and height of B
                        tmp += matA.pData[i*matA.dims.x + k] * matB.pData[k*matB.dims.x + j];
                    }
                    matC.pData[i*matC.dims.x + j] = tmp;
                    tmp = 0;
                }
            }

        }
        return 0;
    }

    cl::Program LoadProgram(cl::Context& context, std::string const& fname, std::string const& params /*= ""*/) {
        cl::Program::Sources sources;
        cl::Program program;
        std::vector device = context.getInfo();

        std::ifstream src_file(fname.c_str());
        if (!src_file) { throw std::string("Failed to open Kernel-Source file!"); }
        std::string src_code(std::istreambuf_iterator(src_file), (std::istreambuf_iterator()));

        sources.insert(sources.end(), std::make_pair(src_code.c_str(), src_code.length()));
        program = cl::Program(context, sources);
        try {
            // build kernel source
            program.build(device, params.c_str());
        }
        catch (cl::Error e) {
            std::cerr << "Compilation build error log: " << std::endl <<
                program.getBuildInfo(device[0]) << std::endl;

            throw e;
        }
        return program;
    }

Kernel Code:

 #define BLOCK_SIZE 32

__kernel void MatrixMulKernel(__global float* C, __global float* A, __global float* B, unsigned int wA, unsigned int wB) {

    // Block index
    int bx = get_group_id(0);
    int by = get_group_id(1);

    // Thread index
    int tx = get_local_id(0);
    int ty = get_local_id(1);

    // Index of the first sub-matrix of A processed 
    // by the block
    int aBegin = wA * BLOCK_SIZE * by;

    // Index of the last sub-matrix of A processed 
    // by the block
    int aEnd   = aBegin + wA - 1;

    // Step size used to iterate through the 
    // sub-matrices of A
    int aStep  = BLOCK_SIZE;

    // Index of the first sub-matrix of B processed 
    // by the block
    int bBegin = BLOCK_SIZE * bx;

    // Step size used to iterate through the 
    // sub-matrices of B
    int bStep  = BLOCK_SIZE * wB;

    float Csub = 0;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {

        // Declaration of the local memory array As 
        // used to store the sub-matrix of A
        __local float As[BLOCK_SIZE][BLOCK_SIZE];

        // Declaration of the local memory array Bs 
        // used to store the sub-matrix of B
        __local float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Load the matrices from global memory
        // to local memory; each thread loads
        // one element of each matrix
        As[ty][tx] = A[a + wA * ty + tx];
        Bs[ty][tx] = B[b + wB * ty + tx];

        // Synchronize to make sure the matrices 
        // are loaded
        barrier(CLK_LOCAL_MEM_FENCE);

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            Csub += As[ty][k] * Bs[k][tx];
        }

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        barrier(CLK_LOCAL_MEM_FENCE);

    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}

A Visual Studio 10 project can be found here:
VS2010oclMatrixMultiplication

When executing this with matrices filles up from 0 .. matX.dims.x*matX.dims.y-1 I receive the following results (for A(32,32) B(64,32)):

  6.20951,6.02305,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,2.35099e-038,2.35099e-038,2.35099e-038,2.35099e-038,0,0,0,2.35099e-038,2.35099e-038,2.35099e-038,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0;
    2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.35099e-038,2.35099e-038,2.35099e-038,2.35099e-038,0,0,0,0,2.35099e-038,2.35099e-038,2.35099e-038,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0;
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.35099e-038,2.35099e-038,2.35099e-038,2.35099e-038,0,0,0,0,2.35099e-038,2.35099e-038,2.35099e-038,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0;
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.35099e-038,2.35099e-038,2.35099e-038,2.35099e-038,0,0,0,0,2.35099e-038,2.35099e-038,2.35099e-038,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0;
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;

Is someone here able to locate my fault? I played aroud to solve it for a couple of days and did some research but the kernel code has to be right. I also implemented the same kernel in CUDA and there it works perfectely and calculates propper results!

Thanks in advance
baron

Edit: I forgot to list my HW. I am using a NVIDIA GTX 460 to calculate the problem. This card only supports OpenCL 1.1.

Eric Bainville · Accepted Answer

Check you global and local size.

Apparently your kernel assumes the local size to be BLOCK_SIZE x BLOCK_SIZE, so you should pass these in enqueueNDRangeKernel instead of null.

Since each work item writes one single element to the output, the global size should be at least the output size (you divide it by BLOCK_SIZE in your code, which may be wrong).

To debug, write some trivial values, like (tx+1000*ty), in the output, so you will see immediately if something is wrong.

OpenCL Matrix Multiplication fails

Answers (2)

Related Questions