OpenCL kernel sets an improper value to a matrix after convolution of a 2d cnn

Question

I am writing a software for convolution of 2D images. I am adopting OpenCL due to the high performance on big images (30 MB and above). I have the following kernel code:

static const char* src3x3= R"(
    __kernel void conv3x3(__global unsigned int *pixels, unsigned int gwidth, unsigned int gheight,
    __global float* kern)
    {
        unsigned int MyMat[9];
        int x = get_global_id(0);
        int y = get_global_id(1);

        int res = 0;
        for(int i=0; i <3; i++) {
            int r,g,b;
            for(int j=0; j < 3; j++) {
                r = (pixels[(x+j) + (gwidth * (y+i))] >> 16) & 0xFFu;
                g = (pixels[(x+j) + (gwidth * (y+i))] >> 8) & 0xFFu;
                b = (pixels[(x+j) + (gwidth * (y+i))]) & 0xFFu;
                r *= kern[i * 3 + j];
                g *= kern[i * 3 + j];
                b *= kern[i * 3 + j];
                MyMat[i * 3 + j]= (0xFF000000) | ( ( r & 0xFFu) << 16) | ( (g & 0xFFu) << 8)  | (b & 0xFFu);
           }
        }
        pixels[y * gwidth + x] = MyMat[4];

    }
)";

and the following [host]:

void gpu_kernel::conv3x3(unsigned int *pixels, unsigned int x, unsigned int y, unsigned int w, unsigned int h,
                         float (&kern)[9])
{
    size_t dim = 2;
    size_t global_offset[] = {x, y};
    size_t global_size[] = {w, h};
    size_t wsize = 0 ;
    size_t globwsize = w * h * sizeof(unsigned int);
    cl_mem bufferx,  bufferkern;
    size_t log_size;
    char* program_log = 0;
    printf("x:%lu y:%lu w:%lu h:%lu size is (%lu)
", x, y, w, h,globwsize);
    clGetPlatformIDs(1, &m_kernctx.platform, &m_kernctx.platforms);
    clGetDeviceIDs(m_kernctx.platform, CL_DEVICE_TYPE_ALL, 1, &m_kernctx.device, &m_kernctx.numdevs);
    m_kernctx.context = clCreateContext(NULL, 1, &m_kernctx.device, NULL, NULL, NULL);
    m_kernctx.command_queue =     clCreateCommandQueue(m_kernctx.context, m_kernctx.device, 0, NULL);
    bufferx =   clCreateBuffer(m_kernctx.context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                             globwsize , pixels, NULL);

    bufferkern = clCreateBuffer(m_kernctx.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                sizeof(float) * 9 , kern , NULL);

    m_kernctx.program = clCreateProgramWithSource(m_kernctx.context, 1, &src3x3, NULL, &m_kernctx.compilerr);
    if (m_kernctx.compilerr) {
        return;
    }

    cl_int res = clBuildProgram(m_kernctx.program, 1, &m_kernctx.device, "", NULL, NULL);
    if (res < 0) {
        clGetProgramBuildInfo(m_kernctx.program, m_kernctx.device, CL_PROGRAM_BUILD_LOG,
                              0, NULL, &log_size);
        program_log = (char*) calloc(log_size+1, sizeof(char));
        clGetProgramBuildInfo(m_kernctx.program, m_kernctx.device, CL_PROGRAM_BUILD_LOG,
                              log_size+1, program_log, NULL);
        printf("%s
", program_log);
        free(program_log);
        return;
    }
    cl_int callres = 0;
    m_kernctx.kernel = clCreateKernel(m_kernctx.program, "conv3x3", NULL);
    callres |= clSetKernelArg(m_kernctx.kernel, 0, sizeof(cl_mem), &bufferx);
    callres |= clSetKernelArg(m_kernctx.kernel, 1, sizeof(unsigned int), &w);
    callres |= clSetKernelArg(m_kernctx.kernel, 2, sizeof(unsigned int), &h);
    callres |= clSetKernelArg(m_kernctx.kernel, 3, sizeof(cl_mem), &bufferkern);

    callres |= clGetKernelWorkGroupInfo(m_kernctx.kernel, m_kernctx.device,  CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wsize, NULL);
    callres |= clEnqueueNDRangeKernel(m_kernctx.command_queue,
                                      m_kernctx.kernel,
                                      dim,
                                      global_offset,
                                      global_size,
                                      0,
                                      0,
                                      NULL,
                                      NULL);

    clFlush(m_kernctx.command_queue);
    clFinish(m_kernctx.command_queue);

    callres|= clEnqueueReadBuffer(m_kernctx.command_queue, bufferx, CL_TRUE, 0,  globwsize, pixels, 0, NULL, NULL);
    clReleaseMemObject(bufferx);
    clReleaseMemObject(bufferkern);
}

The problem is in the kernel assigning the new middle element to the original data :

        pixels[y * gwidth + x] = MyMat[4];

if however changed to :

        pixels[y * gwidth + x] ^= MyMat[4];

it does change the image across however not with the bulr filter that is :

[0.111,0.111,0.111,
0.111,0.111,0.111,
0.111,0.111,0.111];

Below are some examples of the application:

The original image:

Without opencl blurring:

Assigning with pixels[y * gwidth + x] = MyMat[4];

Substracting (to test) the original by mid pixels[y * gwidth + x] -= MyMat[4];

I am using AMD RYZEN PRO7 Series with GPU (however can't recall the model) on Ubuntu 22.04 with installed AMD driver from the official site https://support.zivid.com/en/latest/getting-started/software-installation/gpu/install-opencl-drivers-ubuntu.html I've tested different modifications and the only one that seems not to work is the one that I need to set the middle element to the output data. I've searched for similar problems but I couldn't find any.

Any advice would be helpful. I am new to OpenCL so I may also have misused it or did something incorrect.

[EDIT] I've changed my convolutional kernel to:

    __kernel void conv3x3(__global unsigned int *pixels, unsigned int w, unsigned int h,
    __global float* kern)
    {
        unsigned int MyMat[9];
        int x = get_global_id(0);
        int y = get_global_id(1);
        for(int i=0; i <3; i++) {
            int resr = 0, resg=0, resb=0;
            int r,g,b;
            for(int j=0; j < 3; j++) {
                r = ((pixels[(x+j) + w * (y+i)] >> 16) & 0xFFu) * kern[i * 3 +j];
                g = ((pixels[(x+j) + w * (y+i)] >> 8) & 0xFFu) * kern[i * 3 +j];
                b = ((pixels[(x+j) + w * (y+i)] >> 0) & 0xFFu) * kern[i * 3 +j];
                resr += r;
                resg += g;
                resb += b;
                MyMat[i * 3 + j] = (0xFF000000) | (resr & 0xFFu) << 16 | (resg & 0xFFu) << 8 | (resb & 0xFFu);
            }
        }
        pixels[y * w + x] = MyMat[4];
    }

then I've applied the identity kernel filter:

[0,0,0,
0,1,0,
0,0,0]

And I got a non modified image assuming the opencl works as expected, however I got some error on cumulative convolutions, and still blur does not work makes the image too dark, but assuming the identity does not change the color it means the pipeline is at least ok.

Here is how it looks on cumulative convolutions with identity kernel and the error. Guess I will investigate for bugs and for now I think there is no problem in opencl. Will close it. Thanks to everyone who gave some suggestions.

OpenCL kernel sets an improper value to a matrix after convolution of a 2d cnn

Answers (1)

Related Questions