Ilian Zapryanov
Ilian Zapryanov

Reputation: 1184

OpenCL kernel sets an improper value to a matrix after convolution of a 2d cnn

I am writing a software for convolution of 2D images. I am adopting OpenCL due to the high performance on big images (30 MB and above). I have the following kernel code:

static const char* src3x3= R"(
    __kernel void conv3x3(__global unsigned int *pixels, unsigned int gwidth, unsigned int gheight,
    __global float* kern)
    {
        unsigned int MyMat[9];
        int x = get_global_id(0);
        int y = get_global_id(1);

        int res = 0;
        for(int i=0; i <3; i++) {
            int r,g,b;
            for(int j=0; j < 3; j++) {
                r = (pixels[(x+j) + (gwidth * (y+i))] >> 16) & 0xFFu;
                g = (pixels[(x+j) + (gwidth * (y+i))] >> 8) & 0xFFu;
                b = (pixels[(x+j) + (gwidth * (y+i))]) & 0xFFu;
                r *= kern[i * 3 + j];
                g *= kern[i * 3 + j];
                b *= kern[i * 3 + j];
                MyMat[i * 3 + j]= (0xFF000000) | ( ( r & 0xFFu) << 16) | ( (g & 0xFFu) << 8)  | (b & 0xFFu);
           }
        }
        pixels[y * gwidth + x] = MyMat[4];

    }
)";

and the following [host]:

void gpu_kernel::conv3x3(unsigned int *pixels, unsigned int x, unsigned int y, unsigned int w, unsigned int h,
                         float (&kern)[9])
{
    size_t dim = 2;
    size_t global_offset[] = {x, y};
    size_t global_size[] = {w, h};
    size_t wsize = 0 ;
    size_t globwsize = w * h * sizeof(unsigned int);
    cl_mem bufferx,  bufferkern;
    size_t log_size;
    char* program_log = 0;
    printf("x:%lu y:%lu w:%lu h:%lu size is (%lu)\r\n", x, y, w, h,globwsize);
    clGetPlatformIDs(1, &m_kernctx.platform, &m_kernctx.platforms);
    clGetDeviceIDs(m_kernctx.platform, CL_DEVICE_TYPE_ALL, 1, &m_kernctx.device, &m_kernctx.numdevs);
    m_kernctx.context = clCreateContext(NULL, 1, &m_kernctx.device, NULL, NULL, NULL);
    m_kernctx.command_queue =     clCreateCommandQueue(m_kernctx.context, m_kernctx.device, 0, NULL);
    bufferx =   clCreateBuffer(m_kernctx.context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                             globwsize , pixels, NULL);

    bufferkern = clCreateBuffer(m_kernctx.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                sizeof(float) * 9 , kern , NULL);

    m_kernctx.program = clCreateProgramWithSource(m_kernctx.context, 1, &src3x3, NULL, &m_kernctx.compilerr);
    if (m_kernctx.compilerr) {
        return;
    }

    cl_int res = clBuildProgram(m_kernctx.program, 1, &m_kernctx.device, "", NULL, NULL);
    if (res < 0) {
        clGetProgramBuildInfo(m_kernctx.program, m_kernctx.device, CL_PROGRAM_BUILD_LOG,
                              0, NULL, &log_size);
        program_log = (char*) calloc(log_size+1, sizeof(char));
        clGetProgramBuildInfo(m_kernctx.program, m_kernctx.device, CL_PROGRAM_BUILD_LOG,
                              log_size+1, program_log, NULL);
        printf("%s\n", program_log);
        free(program_log);
        return;
    }
    cl_int callres = 0;
    m_kernctx.kernel = clCreateKernel(m_kernctx.program, "conv3x3", NULL);
    callres |= clSetKernelArg(m_kernctx.kernel, 0, sizeof(cl_mem), &bufferx);
    callres |= clSetKernelArg(m_kernctx.kernel, 1, sizeof(unsigned int), &w);
    callres |= clSetKernelArg(m_kernctx.kernel, 2, sizeof(unsigned int), &h);
    callres |= clSetKernelArg(m_kernctx.kernel, 3, sizeof(cl_mem), &bufferkern);

    callres |= clGetKernelWorkGroupInfo(m_kernctx.kernel, m_kernctx.device,  CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wsize, NULL);
    callres |= clEnqueueNDRangeKernel(m_kernctx.command_queue,
                                      m_kernctx.kernel,
                                      dim,
                                      global_offset,
                                      global_size,
                                      0,
                                      0,
                                      NULL,
                                      NULL);

    clFlush(m_kernctx.command_queue);
    clFinish(m_kernctx.command_queue);

    callres|= clEnqueueReadBuffer(m_kernctx.command_queue, bufferx, CL_TRUE, 0,  globwsize, pixels, 0, NULL, NULL);
    clReleaseMemObject(bufferx);
    clReleaseMemObject(bufferkern);
}

The problem is in the kernel assigning the new middle element to the original data :

        pixels[y * gwidth + x] = MyMat[4];

if however changed to :

        pixels[y * gwidth + x] ^= MyMat[4];

it does change the image across however not with the bulr filter that is :

[0.111,0.111,0.111,
0.111,0.111,0.111,
0.111,0.111,0.111];

Below are some examples of the application:

The original image: Original image

Without opencl blurring: enter image description here

Assigning with pixels[y * gwidth + x] = MyMat[4]; enter image description here

Substracting (to test) the original by mid pixels[y * gwidth + x] -= MyMat[4]; enter image description here

I am using AMD RYZEN PRO7 Series with GPU (however can't recall the model) on Ubuntu 22.04 with installed AMD driver from the official site https://support.zivid.com/en/latest/getting-started/software-installation/gpu/install-opencl-drivers-ubuntu.html I've tested different modifications and the only one that seems not to work is the one that I need to set the middle element to the output data. I've searched for similar problems but I couldn't find any.

Any advice would be helpful. I am new to OpenCL so I may also have misused it or did something incorrect.

[EDIT] I've changed my convolutional kernel to:

    __kernel void conv3x3(__global unsigned int *pixels, unsigned int w, unsigned int h,
    __global float* kern)
    {
        unsigned int MyMat[9];
        int x = get_global_id(0);
        int y = get_global_id(1);
        for(int i=0; i <3; i++) {
            int resr = 0, resg=0, resb=0;
            int r,g,b;
            for(int j=0; j < 3; j++) {
                r = ((pixels[(x+j) + w * (y+i)] >> 16) & 0xFFu) * kern[i * 3 +j];
                g = ((pixels[(x+j) + w * (y+i)] >> 8) & 0xFFu) * kern[i * 3 +j];
                b = ((pixels[(x+j) + w * (y+i)] >> 0) & 0xFFu) * kern[i * 3 +j];
                resr += r;
                resg += g;
                resb += b;
                MyMat[i * 3 + j] = (0xFF000000) | (resr & 0xFFu) << 16 | (resg & 0xFFu) << 8 | (resb & 0xFFu);
            }
        }
        pixels[y * w + x] = MyMat[4];
    }

then I've applied the identity kernel filter:

[0,0,0,
0,1,0,
0,0,0]

And I got a non modified image assuming the opencl works as expected, however I got some error on cumulative convolutions, and still blur does not work makes the image too dark, but assuming the identity does not change the color it means the pipeline is at least ok.

Here is how it looks on cumulative convolutions with identity kernel and the error. Guess I will investigate for bugs and for now I think there is no problem in opencl. Will close it. Thanks to everyone who gave some suggestions.

enter image description here

Upvotes: 1

Views: 88

Answers (1)

Ilian Zapryanov
Ilian Zapryanov

Reputation: 1184

For anyone interested in the solution, apparently I was able to find how to fix the problem.

  1. It was a logic miss in the kernel. I've refactored it from 2d to 3d (see below):
    __kernel void conv3x3(__global unsigned int *pixels, unsigned int w, unsigned int h,
    __global float* kern)
    {
        unsigned int MyMat[9];
        int x = get_global_id(0);
        int y = get_global_id(1);
        int k = get_global_id(2);
        int resr = 0, resg=0, resb=0;
        int r,g,b;
        unsigned int it=0;
        for(int i=0; i <3; i++) {
            for(int j=0; j < 3; j++) {
                r = ((pixels[(x+j) + w * (y+i)] >> 16) & 0xFFu) * kern[i * 3 +j];
                g = ((pixels[(x+j) + w * (y+i)] >> 8) & 0xFFu) * kern[i * 3 +j];
                b = ((pixels[(x+j) + w * (y+i)] >> 0) & 0xFFu) * kern[i * 3 +j];
                resr += r;
                resg += g;
                resb += b;
            }
        }
        MyMat[k] = (0xFF000000) | (resr & 0xFFu) << 16 | (resg & 0xFFu) << 8 | (resb & 0xFFu);
        resr=resg=resb=r=g=b=0;
        pixels[y * w + x] = MyMat[4];
    }

And the host now uses 3 dimensions (first 2 are for the image x/y) 3rd is used for iterating the 3x3 kernel (only the difference from the OP). host:

    size_t dim = 3;
    size_t global_offset[] = {x, y,0};
    size_t global_size[] = {w, h,9};

The result is blurry (not perfect but pretty close) and using CL_DEVICE_TYPE_GPU.

It's much more blurred but I assume that mixing float and ints in OpenCL may result in some artefacts and also I need to revisit or to examine this again as a standalone host program to verify same outputs. enter image description here

Upvotes: 0

Related Questions