Reputation: 587
I am trying to run portions of this code on the GPU using OpenCL. I am now trying to run the function which deals with YCbCr to RGB conversion.
Please note that as of now I am not trying to optimize the GPU code. I simply want an output identical to that on the CPU.
The function is originally written like this:
void YCbCr_to_ARGB(uint8_t *YCbCr_MCU[3], uint32_t *RGB_MCU, uint32_t nb_MCU_H, uint32_t nb_MCU_V)
{
uint8_t *MCU_Y, *MCU_Cb, *MCU_Cr;
int R, G, B;
uint32_t ARGB;
uint8_t index, i, j;
MCU_Y = YCbCr_MCU[0];
MCU_Cb = YCbCr_MCU[1];
MCU_Cr = YCbCr_MCU[2];
for (i = 0; i < 8 * nb_MCU_V; i++) {
for (j = 0; j < 8 * nb_MCU_H; j++) {
index = i * (8 * nb_MCU_H) + j;
R = (MCU_Cr[index] - 128) * 1.402f + MCU_Y[index];
B = (MCU_Cb[index] - 128) * 1.7772f + MCU_Y[index];
G = MCU_Y[index] - (MCU_Cb[index] - 128) * 0.34414f -
(MCU_Cr[index] - 128) * 0.71414f;
/* Saturate */
if (R > 255)
R = 255;
if (R < 0)
R = 0;
if (G > 255)
G = 255;
if (G < 0)
G = 0;
if (B > 255)
B = 255;
if (B < 0)
B = 0;
ARGB = ((R & 0xFF) << 16) | ((G & 0xFF) << 8) | (B & 0xFF);
// ARGB = 0xFF << 8;
RGB_MCU[(i * (8 * nb_MCU_H) + j)] = ARGB;
}
}
}
The variables of this function are declared in main.c
in the following way:
cl_uchar* YCbCr_MCU[3] = { NULL, NULL, NULL};
cl_uint* RGB_MCU = NULL;
Memory for these variables is being allocated in this way:
if (screen_init_needed == 1)
{
screen_init_needed = 0;
.....
.....
//Some code
for (index = 0 ; index < SOF_section.n ; index++) {
YCbCr_MCU[index] = malloc(MCU_sx * MCU_sy * max_ss_h * max_ss_v);
YCbCr_MCU_ds[index] = malloc(MCU_sx * MCU_sy * max_ss_h * max_ss_v);
}
RGB_MCU = malloc (MCU_sx * MCU_sy * max_ss_h * max_ss_v * sizeof(cl_int));
}
break;
}
I directly copied and pasted it in my .cl
file and made a few minor changes to make it agree with OpenCL standards. My modified OpenCL code looked like this:
__kernel void YCbCr_to_ARGB(__global uchar* YCbCr_MCU[3], __global uint* RGB_MCU, uint nb_MCU_H, uint nb_MCU_V)
{
__global uchar *MCU_Y, *MCU_Cb, *MCU_Cr;
int R, G, B;
uint ARGB;
uchar index, i, j;
MCU_Y = YCbCr_MCU[0];
MCU_Cb = YCbCr_MCU[1];
MCU_Cr = YCbCr_MCU[2];
//Same code as the first code snippet
......
......
......
}
When I built and ran my application with the above kernel code in my .cl
file, I got errors. One of the errors stated that OpenCL doesn't allow pointer to pointer arguments.
In order to get around these errors, I modified my code again to look like this:
__kernel void YCbCr_to_ARGB(__global uchar YCbCr_MCU[3], __global uint* RGB_MCU, uint nb_MCU_H, uint nb_MCU_V)
{
__global uchar *MCU_Y, *MCU_Cb, *MCU_Cr;
int R, G, B;
uint ARGB;
uchar index, i, j;
MCU_Y = &YCbCr_MCU[0];
MCU_Cb = &YCbCr_MCU[1];
MCU_Cr = &YCbCr_MCU[2];
//Same code as the first code snippet
......
......
......
}
When I built and ran the application again, I did not get any errors. This prompted me to write the host code for this kernel.
It looks like this:
color_kernel= clCreateKernel(program, "YCbCr_to_ARGB", &ret);
//YCbCr_MCU for YCbCrtoARGB
cl_mem colorMCU_GPU= clCreateBuffer(context, CL_MEM_READ_WRITE, 3 * sizeof(cl_uchar), NULL, &ret);
//rgb_MCU for YCbCrtoARGB
cl_mem RGB_GPU= clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint), NULL, &ret);
I called the kernel arguments exactly where the original function was called in main.c
. I performed the remaining steps for this kernel in the following way:
if(color&&(SOF_section.n>1)
{
ret = clEnqueueWriteBuffer(command_queue, colorMCU_GPU, CL_TRUE, 0, 3 * sizeof(cl_uchar), YCbCr_MCU, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, RGB_GPU, CL_TRUE, 0, sizeof(cl_uint), RGB_MCU, 0, NULL, NULL);
ret = clSetKernelArg(color_kernel, 0, sizeof(cl_mem), (void *)&colorMCU_GPU);
ret |= clSetKernelArg(color_kernel, 1, sizeof(cl_mem), (void *)&RGB_GPU);
ret = clSetKernelArg(color_kernel, 2, sizeof(cl_uint), (void *)&max_ss_h);
ret |= clSetKernelArg(color_kernel, 3, sizeof(cl_uint), (void *)&max_ss_v);
ret = clEnqueueTask(command_queue, color_kernel, 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue, RGB_GPU, CL_TRUE, 0, sizeof(cl_uint), RGB_MCU, 0, NULL, NULL);
//YCbCr_to_ARGB(YCbCr_MCU, RGB_MCU, max_ss_h, max_ss_v);
After I run and build the code with these arguments, the code keeps running indefinitely (The output of this is supposed to be a movie clip running on a screen. With this code, I only get a black screen). I have to close Eclipse and reopen it to make additional changes to the code after this.
What is causing the program to behave like this? Is there anyway to safely run this function on the GPU?
Update:
I followed Anders Cedronius' advice and changed my kernel code in the following way:
__kernel void YCbCr_to_ARGB(__global uchar YCbCr_MCU[3], __global uint* RGB_MCU, uint nb_MCU_H, uint nb_MCU_V)
{
printf("Doing color conversion\n");
__global uchar *MCU_Y, *MCU_Cb, *MCU_Cr;
int R, G, B;
uint ARGB;
uchar index, i, j;
i= get_global_id(0);
j= get_global_id(1);
MCU_Y = &YCbCr_MCU[0];
MCU_Cb = &YCbCr_MCU[1];
MCU_Cr = &YCbCr_MCU[2];
if (i < 8 * nb_MCU_V && j < 8 * nb_MCU_H)
{
index = i * (8 * nb_MCU_H) + j;
R = (MCU_Cr[index] - 128) * 1.402f + MCU_Y[index];
B = (MCU_Cb[index] - 128) * 1.7772f + MCU_Y[index];
G = MCU_Y[index] - (MCU_Cb[index] - 128) * 0.34414f -
(MCU_Cr[index] - 128) * 0.71414f;
/* Saturate */
if (R > 255)
R = 255;
if (R < 0)
R = 0;
if (G > 255)
G = 255;
if (G < 0)
G = 0;
if (B > 255)
B = 255;
if (B < 0)
B = 0;
ARGB = ((R & 0xFF) << 16) | ((G & 0xFF) << 8) | (B & 0xFF);
// ARGB = 0xFF << 8;
RGB_MCU[(i * (8 * nb_MCU_H) + j)] = ARGB;
}
printf("Finished color conversion\n");
}
My host code for calling the kernel now looks like this:
color_kernel= clCreateKernel(program, "YCbCr_to_ARGB", &ret);
I am setting the work size and the kernel arguments in the following way:
ret = clEnqueueWriteBuffer(command_queue, colorMCU_GPU, CL_TRUE, 0, 3*sizeof(cl_uchar), YCbCr_MCU, 0, NULL, NULL);
chk(ret, "clEnqueueWriteBuffer");
ret = clEnqueueWriteBuffer(command_queue, RGB_GPU, CL_TRUE, 0, sizeof(cl_uint), RGB_MCU, 0, NULL, NULL);
chk(ret, "clEnqueueWriteBuffer");
ret = clSetKernelArg(color_kernel, 0, sizeof(cl_mem), (void *)&colorMCU_GPU);
ret |= clSetKernelArg(color_kernel, 1, sizeof(cl_mem), (void *)&RGB_GPU);
ret = clSetKernelArg(color_kernel, 2, sizeof(cl_uint), (void *)&max_ss_h);
ret |= clSetKernelArg(color_kernel, 3, sizeof(cl_uint), (void *)&max_ss_v);
size_t itemColor[2] = {1, 1};
ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, itemColor, NULL, 0, NULL, NULL);
chk(ret, "clEnqueueNDRange");
ret = clEnqueueReadBuffer(command_queue, RGB_GPU, CL_TRUE, 0, sizeof(cl_uint), RGB_MCU, 0, NULL, NULL);
clFinish(command_queue);
I ran this code and I NO LONGER get a black screen. However, the kernel for "YCbCr to RGB" is not being recognized now. Even my printf comments are not being displayed on the output console. It is like my code does not have the color conversion function.
Update:
I hadn't changed the name of my kernel in the command EnqueueNDRangeKernel
. I changed the name and now the printf statements are appearing on the console. However, I am not still not getting the correct output.
size_t itemColor[2] = {1, 1};
ret = clEnqueueNDRangeKernel(command_queue, color_kernel, 2, NULL, itemColor, NULL, 0, NULL, NULL);
chk(ret, "clEnqueueNDRange");
clFinish(command_queue);
Update:
I followed pmdj's advice and made changes to my kernel code. It now looks like this:
__kernel void YCbCr_to_ARGB(__global uchar* Y_GPU, __global uchar* Cb_GPU, __global uchar* Cr_GPU, __global uint* RGB_MCU, uint nb_MCU_H, uint nb_MCU_V)
{
__global uchar *MCU_Y, *MCU_Cb, *MCU_Cr;
int R, G, B;
uint ARGB;
uchar index, i, j;
unsigned char iid= get_global_id(0);
unsigned char jid= get_global_id(1);
// MCU_Y = &YCbCr_MCU[0];
// MCU_Cb = &YCbCr_MCU[1];
// MCU_Cr = &YCbCr_MCU[2];
MCU_Y= Y_GPU;
MCU_Cb= Cb_GPU;
MCU_Cr= Cr_GPU;
if (iid <= (8 * nb_MCU_V) && jid <= (8 * nb_MCU_H))
{
index = iid * (8 * nb_MCU_H) + jid;
R = (MCU_Cr[index] - 128) * 1.402f + MCU_Y[index];
B = (MCU_Cb[index] - 128) * 1.7772f + MCU_Y[index];
G = MCU_Y[index] - (MCU_Cb[index] - 128) * 0.34414f -
(MCU_Cr[index] - 128) * 0.71414f;
/* Saturate */
if (R > 255)
R = 255;
if (R < 0)
R = 0;
if (G > 255)
G = 255;
if (G < 0)
G = 0;
if (B > 255)
B = 255;
if (B < 0)
B = 0;
ARGB = ((R & 0xFF) << 16) | ((G & 0xFF) << 8) | (B & 0xFF);
RGB_MCU[(iid * (8 * nb_MCU_H) + jid)] = ARGB;
}
}
In the host code, I created and allocated memory for 4 new variables:
Y_ForGPU= (cl_uchar *)malloc(MCU_sx * MCU_sy * max_ss_h * max_ss_v);
Cb_ForGPU= (cl_uchar *)malloc(MCU_sx * MCU_sy * max_ss_h * max_ss_v);
Cr_ForGPU= (cl_uchar *)malloc(MCU_sx * MCU_sy * max_ss_h * max_ss_v);
//Now will do it for RGB
RGB_testing= (cl_uint *)malloc (MCU_sx * MCU_sy * max_ss_h * max_ss_v * sizeof(cl_int));
I created buffers in the following way:
cl_mem for_Y= clCreateBuffer(context, CL_MEM_READ_WRITE| CL_MEM_COPY_HOST_PTR, (MCU_sx * MCU_sy * max_ss_h * max_ss_v), Y_ForGPU, &ret);
cl_mem for_Cb= clCreateBuffer(context, CL_MEM_READ_WRITE| CL_MEM_COPY_HOST_PTR, (MCU_sx * MCU_sy * max_ss_h * max_ss_v), Cb_ForGPU , &ret);
cl_mem for_Cr= clCreateBuffer(context, CL_MEM_READ_WRITE| CL_MEM_COPY_HOST_PTR, (MCU_sx * MCU_sy * max_ss_h * max_ss_v), Cr_ForGPU, &ret);
//rgb_MCU for YCbCrtoARGB
cl_mem RGB_GPU= clCreateBuffer(context, CL_MEM_READ_WRITE, (MCU_sx * MCU_sy * max_ss_h * max_ss_v * sizeof(cl_int)), NULL, &ret);
I then set the kernel arguments, executed the kernel and sent the computed data back on the host:
ret = clSetKernelArg(color_kernel, 0, sizeof(cl_mem), &for_Y);
ret |= clSetKernelArg(color_kernel, 1, sizeof(cl_mem), &for_Cb);
ret |= clSetKernelArg(color_kernel, 2, sizeof(cl_mem), &for_Cr);
ret |= clSetKernelArg(color_kernel, 3, sizeof(cl_mem), &RGB_GPU);
ret |= clSetKernelArg(color_kernel, 4, sizeof(cl_uint), &max_ss_h);
ret |= clSetKernelArg(color_kernel, 5, sizeof(cl_uint), &max_ss_v);
const size_t itemColor[2] = {100, 100};
ret = clEnqueueNDRangeKernel(command_queue, color_kernel, 2, NULL, itemColor, NULL, 0, NULL, NULL);
clFinish(command_queue);
//Copy result to the host
ret = clEnqueueReadBuffer(command_queue, RGB_GPU, CL_TRUE, 0, (MCU_sx * MCU_sy * max_ss_h * max_ss_v * sizeof(cl_int)), RGB_testing, 0, NULL, NULL);
However, now my code simply terminates abruptly. Why might this be happening?
Update:
My code is now working. The problems were probably occurring due to differences in the pointers. I set the Y, Cb, Cr and RGB variables (which I created) equal to the original variables in the host code.
//---Setting color variables equal to array elements----//
Y_ForGPU= YCbCr_MCU[0];
Cb_ForGPU= YCbCr_MCU[1];
Cr_ForGPU= YCbCr_MCU[2];
//----RGB is being assigned value-----//
RGB_testing= RGB_MCU;
Upvotes: 1
Views: 308
Reputation: 23438
I don't know if this is the only cause for your problems (there may be more I haven't yet spotted), but you have a type mismatch in your YCbCr_MCU
kernel argument. You can't have pointer-to-pointer arguments, this is true. Simply removing the *
won't fix it though.
In particular, the line
MCU_Cb = &YCbCr_MCU[1];
in the kernel gets 1 byte past the start of whatever YCbCr_MCU points to, which, looking at the host code, is actually the start of the array of pointers, not the array of pixels.
ret = clSetKernelArg(color_kernel, 0, sizeof(cl_mem), (void *)&colorMCU_GPU);
It looks like YCbCr_MCU
is supposed to be an array of 3 pointers to the Y, Cb, Cr planes containing your source pixels. You will need to pass these to your kernel as 3 direct pointers to the 3 arrays instead of a pointer to the 3 pointers. In other words, turn it into Y, Cb, and, Cr arguments, and set them to colorMCU_GPU[0]
through colorMCU_GPU[2]
on the host.
Upvotes: 1