Shared memory in OpenCL

Question

I intend to perform vector manipulations and was trying a small dummy program with vector addition and multiplication. However, the code does not run due to limitations on my knowledge on shared memory. All the sources in the internet show 2D matrix operations which I cannot translate to my vector problems. Please try to explain where am I going wrong considering the fact I am a novice in OpenCL. The code is given below:

Host Code:

std::vector platforms;
std::vector devices;
cl::Context context;
cl::CommandQueue queue;
cl::Program program;
cl::Kernel kernel;

cl::Platform::get(&platforms);

deviceUsed = 0;

cl_context_properties properties[] =
{ CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(),0 };

context = cl::Context(CL_DEVICE_TYPE_ALL, properties);
devices = context.getInfo();

queue = cl::CommandQueue(context, devices[deviceUsed]);
cl::Program::Sources source( 1, std::make_pair(kernel_source.c_str(),  kernel_source.size()));
program = cl::Program(context, source);
program.build(devices);

std::vector < float > a;
std::vector < float > b;
std::vector < float > sum;
std::vector < float > prod;

int globalSize = 128;
int localSize = 16;

a.resize(globalSize);
b.resize(globalSize);
sum.resize(globalSize);
prod.resize(globalSize);

for (int i = 0; i < globalSize ; i++)
{
    a[i] = 1.0f * i;
    b[i] = 5.0f * i;
}
cl::Buffer buffer_A;
cl::Buffer buffer_B;
cl::Buffer buffer_sum;
cl::Buffer buffer_prod;

buffer_A = cl::Buffer (context, CL_MEM_READ_WRITE, sizeof(float) * globalSize);
buffer_B = cl::Buffer (context, CL_MEM_READ_WRITE, sizeof(float) * globalSize);

queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(float) * globalSize , &a[0]);
queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(float) * globalSize , &b[0]);

buffer_sum = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * globalSize);
buffer_prod = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * globalSize);

kernel.setArg(0, buffer_A);
kernel.setArg(1, buffer_B);
kernel.setArg(2, buffer_sum);
kernel.setArg(3, buffer_prod);

queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(globalSize/localSize), cl::NDRange(N), NULL);
queue.finish();
queue.enqueueReadBuffer(buffer_sum, CL_TRUE, 0, sizeof(float) * globalSize, &sum[0]);
queue.enqueueReadBuffer(buffer_prod, CL_TRUE, 0, sizeof(float) * globalSize, &prod[0]);

Kernel:

#define STRINGI(ker) #ker
std::string kernel_source = STRINGI(

__kernel void KernelAddMul(__global float* a, __global float* b, __global float* sum, __global float* prod)
{
    unsigned int j = get_local_id(0);
    int N = get_local_size(0);
    unsigned int i = N * get_global_id(0) + j;

    float locSum[N];
    float locProd[N];

    __local float Asub[N];
    __local float Bsub[N];

    for(int k = 0; k < N; k++){

        Asub[k] = a[i];
        Bsub[k] = b[i];
        barrier(CLK_LOCAL_MEM_FENCE);

        locSum[k] = Asub[k] + Bsub[k];
        locProd[k] = Asub[k] * Bsub[k];
        barrier(CLK_LOCAL_MEM_FENCE);

        sum[i] = locSum[k];
        prod[i] = locProd[k];
     }

} 

);

Shared memory in OpenCL

Answers (1)

Related Questions