Reputation: 33
I am implementing a matrix multiplication using OpenCL but the problem is that i am always getting wrong results. Maybe someone here can give me a hint where the mistake might be.
Data Types and constants:
/*
* Data types
*/
// Matrix data type
typedef float TMatrix;
// This struct hold data for one Matrix
struct Dimension {
size_t x;
size_t y;
size_t z;
};
struct TMat {
Dimension dims;
TMatrix* pData;
};
/*
* Globals
*/
// name of graphic card
std::string const kPlatformName = "NVIDIA";
std::string const kDeviceName = "GTX";
// used block size (16 for below Fermi (2.x) and 32 for above)
/*
* WARNING: This constant is also defined in the Kernel Source
* Do not forget to apply changes there too!
*/
int const kBlockSize = 32;
TMatrix const kFailure = 1e-5f;
// Matrix dimensions - scale factor! (i [= kStart, kStart+kStep .. kStop] * block_size, ...)
size_t const kStart = 1;
size_t const kStep = 1;
size_t const kStop = 30 + 1; // check for < kStop
// number of iterations to calculate avg exec time on GPU
size_t const kNrIter = 1;
Host Code:
// enable OpenCL exceptions
#define __CL_ENABLE_EXCEPTIONS
#include <iostream>
#include <algorithm>
#include <string>
#include <cstdio>
#include <cstdlib>
#include <stdlib.h>
#include <iterator>
#include <fstream>
#include "types.h"
#include "StopWatch.h"
// include OpenCL C++ Wrapper classes instead of pure C API
#include "CL/cl.hpp"
using namespace std;
/*
* Forward declaration
*/
cl::Program LoadProgram(cl::Context& context, std::string const& fname, std::string const& params = "");
int oclMatrixMult(TMat const& matA, TMat const& matB, TMat& matC);
int CPUMatrixMultiplyNaive(TMat const& matA, TMat const& matB, TMat& matC, size_t const nrIt);
int main(int argc, char** argv) {
srand(2013);
int retCPU = 1, retGPU = 1;
volatile TMatrix GPUOptPrevent = 0;
for (size_t i = kStart; i < kStop; i += kStep) {
// Allocate host memory
TMat matA;
matA.dims.x = i*kBlockSize;
matA.dims.y = i*kBlockSize;
matA.dims.z = 1;
matA.pData = new TMatrix[sizeof(TMatrix) * matA.dims.x * matA.dims.y];
TMat matB;
matB.dims.x = i*2*kBlockSize;
matB.dims.y = i*kBlockSize;
matB.dims.z = 1;
matB.pData = new TMatrix[sizeof(TMatrix) * matB.dims.x * matB.dims.y];
TMat matC;
matC.dims.x = matB.dims.x;
matC.dims.y = matA.dims.y;
matC.dims.z = 1;
matC.pData = new TMatrix[sizeof(TMatrix) * matC.dims.x * matC.dims.y];
TMat matCPU;
matCPU.dims.x = matB.dims.x;
matCPU.dims.y = matA.dims.y;
matCPU.dims.z = 1;
matCPU.pData = new TMatrix[sizeof(TMatrix) * matCPU.dims.x * matCPU.dims.y];
cout << "Matrix dimensions: A(" << matA.dims.x << "," << matA.dims.y << "), B(" << matB.dims.x << "," << matB.dims.y << ")" << endl;
// Initialize host memory
#if 0
generate(matA.pData, matA.pData + matA.dims.x * matA.dims.y, rand);
generate(matB.pData, matB.pData + matB.dims.x * matB.dims.y, rand);
#else
for (size_t a=0;a<matA.dims.x * matA.dims.y;++a) {
matA.pData[a] = (float)a;
}
for (size_t b=0;b<matB.dims.x * matB.dims.y;++b) {
matB.pData[b] = (float)b;
}
#endif
memset(matC.pData, 0, matC.dims.x*matC.dims.y*sizeof(TMatrix));
memset(matCPU.pData, 0, matCPU.dims.x*matCPU.dims.y*sizeof(TMatrix));
#if 1
ofstream fA("matA.txt");
ofstream fB("matB.txt");
for (size_t r=0; r < matA.dims.y; ++r) {
for (size_t c=0; c < matA.dims.x; ++c) {
fA << matA.pData[r*matA.dims.x + c];
fA << ((c<matA.dims.x-1)?",":";\n");
}
}
for (size_t r=0; r < matB.dims.y; ++r) {
for (size_t c=0; c < matB.dims.x; ++c) {
fB << matB.pData[r*matB.dims.x + c];
fB << ((c<matB.dims.x-1)?",":";\n");
}
}
fA.close();
fB.close();
#endif
// Performing kernel execution
double GPUtime = 0;
for (size_t it = 0; it < kNrIter; ++it) {
stw::Start();
retGPU = oclMatrixMult(matA, matB, matC);
GPUtime += stw::Stop();
// prevent optimization
volatile int r = rand() % (matC.dims.x*matC.dims.y);
GPUOptPrevent += matC.pData[r];
}
cout << "Average Time for " << kNrIter << " matrix multiplication on GPU: " << GPUtime << " seconds." << endl;
cout << "Average Time for one matrix multiplication on GPU: " << GPUtime/kNrIter << " seconds." << endl;
// CPU calculation only once to verify result
retCPU = CPUMatrixMultiplyNaive(matA, matB, matCPU, 1);
// prevent optimization
volatile int r = rand() % (matA.dims.x*matA.dims.y);
volatile int s = rand() % (matB.dims.x*matB.dims.y);
volatile int t = rand() % (matC.dims.x*matC.dims.y);
volatile int u = rand() % (matCPU.dims.x*matCPU.dims.y);
cout << "Optimization prevention: " << GPUOptPrevent << matA.pData[r] << matB.pData[s] << matC.pData[t] << matCPU.pData[u] << endl;
#if 1
ofstream fC("matC.txt");
ofstream fCPU("matCPU.txt");
for (size_t r = 0; r < matC.dims.y; r++) {
for (size_t e = 0; e < matC.dims.x; ++e) {
fC << matC.pData[r*matC.dims.x + e] << ((e<(matC.dims.x-1)) ? ",":";");
fCPU << matCPU.pData[r*matCPU.dims.x + e] << ((e<(matCPU.dims.x-1)) ? ",":";");
}
fC << endl;
fCPU << endl;
}
fC.close();
fCPU.close();
#endif
// Verify result
bool correct = true;
cout << "Checking results for correctness: ";
for (size_t v = 0; v < (matC.dims.x*matC.dims.y); ++v) {
if (fabs(matC.pData[v] - matCPU.pData[v]) > kFailure) {
correct = false;
break;
}
}
cout << ((correct)?"OK":"Failed!") << endl << endl;
delete [] matA.pData; matA.pData = 0;
delete [] matB.pData; matB.pData = 0;
delete [] matC.pData; matC.pData = 0;
delete [] matCPU.pData; matCPU.pData = 0;
}
return retCPU + retGPU;
}
int oclMatrixMult(TMat const& matA, TMat const& matB, TMat& matC) {
try {
// Initialize OpenCL
std::vector<cl::Platform> platforms;
std::vector<cl::Device> devices;
int pidx = -1; // platform index
int didx = -1; // device index
size_t memsize_A = sizeof(TMatrix) * matA.dims.x * matA.dims.y;
size_t memsize_B = sizeof(TMatrix) * matB.dims.x * matB.dims.y;
size_t memsize_C = sizeof(TMatrix) * matC.dims.x * matC.dims.y;
// checkout available GPU devices
cl::Platform::get(&platforms);
if (platforms.size() < 1) {
return 1;
}
for (size_t i = 0; i < platforms.size(); ++i) {
platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &devices);
if (platforms[i].getInfo<CL_PLATFORM_NAME>().find(kPlatformName) == string::npos) {
continue;
}
if (devices.size() > 0) {
for (size_t j = 0; j < devices.size(); ++j) {
if (devices[j].getInfo<CL_DEVICE_NAME>().find(kDeviceName) != string::npos) {
pidx = i;
didx = j;
}
}
}
if (didx >= 0) {
break;
}
}
// create context for found GPU devices.
std::vector<cl::Device> device;
device.push_back(devices[didx]);
cl_context_properties cop[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[pidx])(), 0};
#if 1
cout << "Platform: " << platforms[pidx].getInfo<CL_PLATFORM_NAME>() << endl
<< "Device: " << devices[didx].getInfo<CL_DEVICE_NAME>() << endl;
#endif
cl::Context context;
context = cl::Context(device, cop, 0, 0, 0);
// create a cmd queue
cl::CommandQueue CmdQueue(context, context.getInfo<CL_CONTEXT_DEVICES>() [0]);
// create buffers
// two read only (ad Nvidia Programmers Guide for OpenCL)
// and one write only buffer!
cl_int err = CL_SUCCESS;
cl::Buffer dA(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, memsize_A, matA.pData, &err);
if (err != CL_SUCCESS) {
cerr << "cl::Buffer() failed (dA)" << endl;
}
cl::Buffer dB(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, memsize_B, matB.pData, &err);
if (err != CL_SUCCESS) {
cerr << "cl::Buffer() failed (dA)" << endl;
}
cl::Buffer dC(context, CL_MEM_WRITE_ONLY, memsize_C, matC.pData, &err);
if (err != CL_SUCCESS) {
cerr << "cl::Buffer() failed (dA)" << endl;
}
// Load and build OpenCL kernel
cl::Program program = LoadProgram(context, "MatrixMulKernel.cl");
// Launch OpenCL kernel
cl::Kernel MatMulKernel;
MatMulKernel = cl::Kernel(program, "MatrixMulKernel");
MatMulKernel.setArg(0, dC);
MatMulKernel.setArg(1, dA);
MatMulKernel.setArg(2, dB);
MatMulKernel.setArg(3, matA.dims.x);
MatMulKernel.setArg(4, matB.dims.x);
// execute the kernel
cl::NDRange LocalWorksize(kBlockSize,kBlockSize);
cl::NDRange GlobalWorksize(matB.dims.x/(kBlockSize), matA.dims.y/(kBlockSize));
// necessary?
//CmdQueue.enqueueWriteBuffer(dA, CL_TRUE, 0, memsize_A, matA.pData);
//CmdQueue.enqueueWriteBuffer(dB, CL_TRUE, 0, memsize_B, matB.pData);
CmdQueue.enqueueNDRangeKernel(MatMulKernel, cl::NullRange, GlobalWorksize, cl::NullRange);
// Retrieve result from device
CmdQueue.enqueueReadBuffer(dC, CL_TRUE, 0, memsize_C, matC.pData);
CmdQueue.finish();
}
catch (cl::Error err) {
std::cerr << "ERROR: " << err.what() << " (" << err.err() << ")" << std::endl;
return 1;
}
catch (std::string err) {
std::cerr << "ERROR: " << err << std::endl;
return 1;
}
catch (...) {
std::cerr << "Unknown error occurred!" << std::endl;
return 1;
}
return 0;
}
int CPUMatrixMultiplyNaive(TMat const& matA, TMat const& matB, TMat& matC, size_t const nrIt) {
if (nrIt == 0) return 1;
TMatrix tmp = 0;
for (size_t i = 0; i < nrIt; ++i) {
for (size_t i = 0; i < matA.dims.y; ++i) { // height of C
for (size_t j = 0; j < matB.dims.x; ++j) { // width of C
for (size_t k = 0; k < matA.dims.x; ++k) { // width of A and height of B
tmp += matA.pData[i*matA.dims.x + k] * matB.pData[k*matB.dims.x + j];
}
matC.pData[i*matC.dims.x + j] = tmp;
tmp = 0;
}
}
}
return 0;
}
cl::Program LoadProgram(cl::Context& context, std::string const& fname, std::string const& params /*= ""*/) {
cl::Program::Sources sources;
cl::Program program;
std::vector<cl::Device> device = context.getInfo<CL_CONTEXT_DEVICES>();
std::ifstream src_file(fname.c_str());
if (!src_file) { throw std::string("Failed to open Kernel-Source file!"); }
std::string src_code(std::istreambuf_iterator<char>(src_file), (std::istreambuf_iterator<char>()));
sources.insert(sources.end(), std::make_pair(src_code.c_str(), src_code.length()));
program = cl::Program(context, sources);
try {
// build kernel source
program.build(device, params.c_str());
}
catch (cl::Error e) {
std::cerr << "Compilation build error log: " << std::endl <<
program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device[0]) << std::endl;
throw e;
}
return program;
}
Kernel Code:
#define BLOCK_SIZE 32
__kernel void MatrixMulKernel(__global float* C, __global float* A, __global float* B, unsigned int wA, unsigned int wB) {
// Block index
int bx = get_group_id(0);
int by = get_group_id(1);
// Thread index
int tx = get_local_id(0);
int ty = get_local_id(1);
// Index of the first sub-matrix of A processed
// by the block
int aBegin = wA * BLOCK_SIZE * by;
// Index of the last sub-matrix of A processed
// by the block
int aEnd = aBegin + wA - 1;
// Step size used to iterate through the
// sub-matrices of A
int aStep = BLOCK_SIZE;
// Index of the first sub-matrix of B processed
// by the block
int bBegin = BLOCK_SIZE * bx;
// Step size used to iterate through the
// sub-matrices of B
int bStep = BLOCK_SIZE * wB;
float Csub = 0;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
// Declaration of the local memory array As
// used to store the sub-matrix of A
__local float As[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the local memory array Bs
// used to store the sub-matrix of B
__local float Bs[BLOCK_SIZE][BLOCK_SIZE];
// Load the matrices from global memory
// to local memory; each thread loads
// one element of each matrix
As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];
// Synchronize to make sure the matrices
// are loaded
barrier(CLK_LOCAL_MEM_FENCE);
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
for (int k = 0; k < BLOCK_SIZE; ++k) {
Csub += As[ty][k] * Bs[k][tx];
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
barrier(CLK_LOCAL_MEM_FENCE);
}
// Write the block sub-matrix to device memory;
// each thread writes one element
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
}
A Visual Studio 10 project can be found here:
VS2010oclMatrixMultiplication
When executing this with matrices filles up from 0 .. matX.dims.x*matX.dims.y-1 I receive the following results (for A(32,32) B(64,32)):
6.20951,6.02305,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,2.35099e-038,2.35099e-038,2.35099e-038,2.35099e-038,0,0,0,2.35099e-038,2.35099e-038,2.35099e-038,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0;
2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.35099e-038,2.35099e-038,2.35099e-038,2.35099e-038,0,0,0,0,2.35099e-038,2.35099e-038,2.35099e-038,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0;
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.35099e-038,2.35099e-038,2.35099e-038,2.35099e-038,0,0,0,0,2.35099e-038,2.35099e-038,2.35099e-038,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0;
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.35099e-038,2.35099e-038,2.35099e-038,2.35099e-038,0,0,0,0,2.35099e-038,2.35099e-038,2.35099e-038,2.35099e-038,0,0,0,0,0,0,0,0,0,0,0,0;
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
Is someone here able to locate my fault? I played aroud to solve it for a couple of days and did some research but the kernel code has to be right. I also implemented the same kernel in CUDA and there it works perfectely and calculates propper results!
Thanks in advance
baron
Edit: I forgot to list my HW. I am using a NVIDIA GTX 460 to calculate the problem. This card only supports OpenCL 1.1.
Upvotes: 1
Views: 809
Reputation: 33
I fond the mistake:
The wrong result were caused, as Eric mentioned, by the local and global size variables.
The OpenCL 1.1 standard specifies the local and global worksize as follows:
The explicitly specified local_work_size will be used to determine how to break the global work-items specified by Last Revision Date: 6/1/11 Page 134 global_work_size into appropriate work-group instances. If local_work_size is specified, the values specified in global_work_size[0], … global_work_size[work_dim - 1] must be evenly divisible by the corresponding values specified in local_work_size[0], … local_work_size[work_dim – 1].
opencl-1.1.pdf Defined at page 133/134
When I change the following lines, specifying the local and global worksize from
cl::NDRange GlobalWorksize(kBlockSize,kBlockSize);
cl::NDRange LocalWorksize(matB.dims.x/(kBlockSize), matA.dims.y/(kBlockSize));
to
cl::NDRange GlobalWorksize(matB.dims.x, matA.dims.y);
cl::NDRange LocalWorksize(range,range);
all results are computed correctly.
Thank you Eric for your hint!!
Upvotes: 2
Reputation: 9916
Check you global and local size.
Apparently your kernel assumes the local size to be BLOCK_SIZE x BLOCK_SIZE, so you should pass these in enqueueNDRangeKernel instead of null.
Since each work item writes one single element to the output, the global size should be at least the output size (you divide it by BLOCK_SIZE in your code, which may be wrong).
To debug, write some trivial values, like (tx+1000*ty), in the output, so you will see immediately if something is wrong.
Upvotes: 1