Reputation: 1184
I am writing a software for convolution of 2D images. I am adopting OpenCL due to the high performance on big images (30 MB and above). I have the following kernel code:
static const char* src3x3= R"(
__kernel void conv3x3(__global unsigned int *pixels, unsigned int gwidth, unsigned int gheight,
__global float* kern)
{
unsigned int MyMat[9];
int x = get_global_id(0);
int y = get_global_id(1);
int res = 0;
for(int i=0; i <3; i++) {
int r,g,b;
for(int j=0; j < 3; j++) {
r = (pixels[(x+j) + (gwidth * (y+i))] >> 16) & 0xFFu;
g = (pixels[(x+j) + (gwidth * (y+i))] >> 8) & 0xFFu;
b = (pixels[(x+j) + (gwidth * (y+i))]) & 0xFFu;
r *= kern[i * 3 + j];
g *= kern[i * 3 + j];
b *= kern[i * 3 + j];
MyMat[i * 3 + j]= (0xFF000000) | ( ( r & 0xFFu) << 16) | ( (g & 0xFFu) << 8) | (b & 0xFFu);
}
}
pixels[y * gwidth + x] = MyMat[4];
}
)";
and the following [host]:
void gpu_kernel::conv3x3(unsigned int *pixels, unsigned int x, unsigned int y, unsigned int w, unsigned int h,
float (&kern)[9])
{
size_t dim = 2;
size_t global_offset[] = {x, y};
size_t global_size[] = {w, h};
size_t wsize = 0 ;
size_t globwsize = w * h * sizeof(unsigned int);
cl_mem bufferx, bufferkern;
size_t log_size;
char* program_log = 0;
printf("x:%lu y:%lu w:%lu h:%lu size is (%lu)\r\n", x, y, w, h,globwsize);
clGetPlatformIDs(1, &m_kernctx.platform, &m_kernctx.platforms);
clGetDeviceIDs(m_kernctx.platform, CL_DEVICE_TYPE_ALL, 1, &m_kernctx.device, &m_kernctx.numdevs);
m_kernctx.context = clCreateContext(NULL, 1, &m_kernctx.device, NULL, NULL, NULL);
m_kernctx.command_queue = clCreateCommandQueue(m_kernctx.context, m_kernctx.device, 0, NULL);
bufferx = clCreateBuffer(m_kernctx.context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
globwsize , pixels, NULL);
bufferkern = clCreateBuffer(m_kernctx.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(float) * 9 , kern , NULL);
m_kernctx.program = clCreateProgramWithSource(m_kernctx.context, 1, &src3x3, NULL, &m_kernctx.compilerr);
if (m_kernctx.compilerr) {
return;
}
cl_int res = clBuildProgram(m_kernctx.program, 1, &m_kernctx.device, "", NULL, NULL);
if (res < 0) {
clGetProgramBuildInfo(m_kernctx.program, m_kernctx.device, CL_PROGRAM_BUILD_LOG,
0, NULL, &log_size);
program_log = (char*) calloc(log_size+1, sizeof(char));
clGetProgramBuildInfo(m_kernctx.program, m_kernctx.device, CL_PROGRAM_BUILD_LOG,
log_size+1, program_log, NULL);
printf("%s\n", program_log);
free(program_log);
return;
}
cl_int callres = 0;
m_kernctx.kernel = clCreateKernel(m_kernctx.program, "conv3x3", NULL);
callres |= clSetKernelArg(m_kernctx.kernel, 0, sizeof(cl_mem), &bufferx);
callres |= clSetKernelArg(m_kernctx.kernel, 1, sizeof(unsigned int), &w);
callres |= clSetKernelArg(m_kernctx.kernel, 2, sizeof(unsigned int), &h);
callres |= clSetKernelArg(m_kernctx.kernel, 3, sizeof(cl_mem), &bufferkern);
callres |= clGetKernelWorkGroupInfo(m_kernctx.kernel, m_kernctx.device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wsize, NULL);
callres |= clEnqueueNDRangeKernel(m_kernctx.command_queue,
m_kernctx.kernel,
dim,
global_offset,
global_size,
0,
0,
NULL,
NULL);
clFlush(m_kernctx.command_queue);
clFinish(m_kernctx.command_queue);
callres|= clEnqueueReadBuffer(m_kernctx.command_queue, bufferx, CL_TRUE, 0, globwsize, pixels, 0, NULL, NULL);
clReleaseMemObject(bufferx);
clReleaseMemObject(bufferkern);
}
The problem is in the kernel assigning the new middle element to the original data :
pixels[y * gwidth + x] = MyMat[4];
if however changed to :
pixels[y * gwidth + x] ^= MyMat[4];
it does change the image across however not with the bulr filter that is :
[0.111,0.111,0.111,
0.111,0.111,0.111,
0.111,0.111,0.111];
Below are some examples of the application:
Assigning with pixels[y * gwidth + x] = MyMat[4];
Substracting (to test) the original by mid pixels[y * gwidth + x] -= MyMat[4];
I am using AMD RYZEN PRO7 Series with GPU (however can't recall the model) on Ubuntu 22.04 with installed AMD driver from the official site https://support.zivid.com/en/latest/getting-started/software-installation/gpu/install-opencl-drivers-ubuntu.html I've tested different modifications and the only one that seems not to work is the one that I need to set the middle element to the output data. I've searched for similar problems but I couldn't find any.
Any advice would be helpful. I am new to OpenCL so I may also have misused it or did something incorrect.
[EDIT] I've changed my convolutional kernel to:
__kernel void conv3x3(__global unsigned int *pixels, unsigned int w, unsigned int h,
__global float* kern)
{
unsigned int MyMat[9];
int x = get_global_id(0);
int y = get_global_id(1);
for(int i=0; i <3; i++) {
int resr = 0, resg=0, resb=0;
int r,g,b;
for(int j=0; j < 3; j++) {
r = ((pixels[(x+j) + w * (y+i)] >> 16) & 0xFFu) * kern[i * 3 +j];
g = ((pixels[(x+j) + w * (y+i)] >> 8) & 0xFFu) * kern[i * 3 +j];
b = ((pixels[(x+j) + w * (y+i)] >> 0) & 0xFFu) * kern[i * 3 +j];
resr += r;
resg += g;
resb += b;
MyMat[i * 3 + j] = (0xFF000000) | (resr & 0xFFu) << 16 | (resg & 0xFFu) << 8 | (resb & 0xFFu);
}
}
pixels[y * w + x] = MyMat[4];
}
then I've applied the identity kernel filter:
[0,0,0,
0,1,0,
0,0,0]
And I got a non modified image assuming the opencl works as expected, however I got some error on cumulative convolutions, and still blur does not work makes the image too dark, but assuming the identity does not change the color it means the pipeline is at least ok.
Here is how it looks on cumulative convolutions with identity kernel and the error. Guess I will investigate for bugs and for now I think there is no problem in opencl. Will close it. Thanks to everyone who gave some suggestions.
Upvotes: 1
Views: 88
Reputation: 1184
For anyone interested in the solution, apparently I was able to find how to fix the problem.
__kernel void conv3x3(__global unsigned int *pixels, unsigned int w, unsigned int h,
__global float* kern)
{
unsigned int MyMat[9];
int x = get_global_id(0);
int y = get_global_id(1);
int k = get_global_id(2);
int resr = 0, resg=0, resb=0;
int r,g,b;
unsigned int it=0;
for(int i=0; i <3; i++) {
for(int j=0; j < 3; j++) {
r = ((pixels[(x+j) + w * (y+i)] >> 16) & 0xFFu) * kern[i * 3 +j];
g = ((pixels[(x+j) + w * (y+i)] >> 8) & 0xFFu) * kern[i * 3 +j];
b = ((pixels[(x+j) + w * (y+i)] >> 0) & 0xFFu) * kern[i * 3 +j];
resr += r;
resg += g;
resb += b;
}
}
MyMat[k] = (0xFF000000) | (resr & 0xFFu) << 16 | (resg & 0xFFu) << 8 | (resb & 0xFFu);
resr=resg=resb=r=g=b=0;
pixels[y * w + x] = MyMat[4];
}
And the host now uses 3 dimensions (first 2 are for the image x/y) 3rd is used for iterating the 3x3 kernel (only the difference from the OP). host:
size_t dim = 3;
size_t global_offset[] = {x, y,0};
size_t global_size[] = {w, h,9};
The result is blurry (not perfect but pretty close) and using CL_DEVICE_TYPE_GPU
.
It's much more blurred but I assume that mixing float and ints in OpenCL may result in some artefacts and also I need to revisit or to examine this again as a standalone host program to verify same outputs.
Upvotes: 0