Reputation: 1066
I am trying to add two matrices of 100 cells each. I need to do it in a task-parallel fashion, not data-parallel. I have the following code, which adds, subtracts, multiplies, and divides within the same matrix, but when I run it, it returns only 0, or sometimes 2, -0, -2, etc.
I need to do this with OpenCL on a Mac. Any ideas how to do this?
#include <stdio.h>
#include <stdlib.h>
#include <OpenCL/opencl.h>
#define MAX_SOURCE_SIZE (0x100000)
/*
 * OpenCL C source for the four task kernels, handed to
 * clCreateProgramWithSource() as a single string.
 *
 * Every kernel reads A and B and writes C at elements base, base+4, base+8
 * and base+12, where `base` is fixed per kernel (Add: 0, Sub: 1, Mul: 2,
 * Div: 3).  With the data laid out as a row-major 4x4 matrix this is column
 * `base`, so the four kernels write disjoint parts of C.
 *
 * Adjacent string literals are concatenated by the compiler, so no
 * backslash line continuations are required.
 */
const char *_kernel =
    "\n"
    "__kernel void taskParallelAdd(__global float* A, __global float* B, __global float* C) \n"
    "{ \n"
    " int base = 0; \n"
    " \n"
    " C[base+0] = A[base+0] + B[base+0]; \n"
    " C[base+4] = A[base+4] + B[base+4]; \n"
    " C[base+8] = A[base+8] + B[base+8]; \n"
    " C[base+12] = A[base+12] + B[base+12]; \n"
    "} \n"
    " \n"
    "__kernel void taskParallelSub(__global float* A, __global float* B, __global float* C) \n"
    "{ \n"
    " int base = 1; \n"
    " \n"
    " C[base+0] = A[base+0] - B[base+0]; \n"
    " C[base+4] = A[base+4] - B[base+4]; \n"
    " C[base+8] = A[base+8] - B[base+8]; \n"
    " C[base+12] = A[base+12] - B[base+12]; \n"
    "} \n"
    " \n"
    "__kernel void taskParallelMul(__global float* A, __global float* B, __global float* C) \n"
    "{ \n"
    " int base = 2; \n"
    " \n"
    " C[base+0] = A[base+0] * B[base+0]; \n"
    " C[base+4] = A[base+4] * B[base+4]; \n"
    " C[base+8] = A[base+8] * B[base+8]; \n"
    " C[base+12] = A[base+12] * B[base+12]; \n"
    "} \n"
    " \n"
    "__kernel void taskParallelDiv(__global float* A, __global float* B, __global float* C) \n"
    "{ \n"
    " int base = 3; \n"
    " \n"
    " C[base+0] = A[base+0] / B[base+0]; \n"
    " C[base+4] = A[base+4] / B[base+4]; \n"
    " C[base+8] = A[base+8] / B[base+8]; \n"
    " C[base+12] = A[base+12] / B[base+12]; \n"
    "} \n"
    " \n";
/*
 * Host program: fills two 4x4 float matrices, uploads them to OpenCL
 * buffers, builds the four kernels in `_kernel`, enqueues each one with
 * clEnqueueTask() on a single command queue, reads buffer C back and prints
 * it as a 4x4 grid.
 *
 * Every OpenCL status is stored in `ret` but `ret` is never inspected, so a
 * failure in any step goes unnoticed and later calls run on invalid handles.
 * Always returns 0.
 */
int main()
{
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_mem Amobj = NULL;
cl_mem Bmobj = NULL;
cl_mem Cmobj = NULL;
cl_program program = NULL;
cl_kernel kernel[4] = {NULL, NULL, NULL, NULL};
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
int i, j;
float* A;
float* B;
float* C;
/* Host-side storage for the 4x4 matrices; the malloc results are not checked. */
A = (float*)malloc(4*4*sizeof(float));
B = (float*)malloc(4*4*sizeof(float));
/* C is left uninitialised: if the read-back below fails, the values printed
   at the end are indeterminate. */
C = (float*)malloc(4*4*sizeof(float));
/* Initialize input data */
/* A holds 1..16 in row-major order; B holds the same values with the row and
   column indices swapped. */
for (i=0; i<4; i++) {
for (j=0; j<4; j++) {
A[i*4+j] = i*4+j+1;
B[i*4+j] = j*4+i+1;
}
}
/* Get platform/device information */
/* Takes the first platform and the first default-type device on it. */
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
/* Create OpenCL Context */
context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
/* Create command queue */
/* NOTE(review): requests an out-of-order queue.  A device that does not
   support this property makes clCreateCommandQueue fail with
   CL_INVALID_QUEUE_PROPERTIES and return NULL; since `ret` is not checked,
   every later call would then operate on a NULL queue -- confirm by
   inspecting `ret` here. */
command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &ret);
/* Create buffer object */
/* Three device buffers of 16 floats each, created without initial contents. */
Amobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);
Bmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);
Cmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);
/* Copy input data to memory buffer */
/* CL_TRUE makes both writes blocking. */
ret = clEnqueueWriteBuffer(command_queue, Amobj, CL_TRUE, 0, 4*4*sizeof(float), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, Bmobj, CL_TRUE, 0, 4*4*sizeof(float), B, 0, NULL, NULL);
/* Create kernel from source */
/* A NULL lengths argument means the source string is NUL-terminated. */
program = clCreateProgramWithSource(context, 1, (const char **)&_kernel, NULL, &ret);
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
/* Create task parallel OpenCL kernel */
/* One kernel object per operation; the names must match those in `_kernel`. */
kernel[0] = clCreateKernel(program, "taskParallelAdd", &ret);
kernel[1] = clCreateKernel(program, "taskParallelSub", &ret);
kernel[2] = clCreateKernel(program, "taskParallelMul", &ret);
kernel[3] = clCreateKernel(program, "taskParallelDiv", &ret);
/* Set OpenCL kernel arguments */
/* All four kernels share the same A, B and C buffers. */
for (i=0; i<4; i++) {
ret = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), (void *)&Amobj);
ret = clSetKernelArg(kernel[i], 1, sizeof(cl_mem), (void *)&Bmobj);
ret = clSetKernelArg(kernel[i], 2, sizeof(cl_mem), (void *)&Cmobj);
}
/* Execute OpenCL kernel as task parallel */
/* All four tasks go onto the same queue, with no event dependencies. */
for (i=0; i<4; i++) {
ret = clEnqueueTask(command_queue, kernel[i], 0, NULL, NULL);
}
/* Copy result to host */
/* Blocking read with an empty wait list.
   NOTE(review): on an out-of-order queue nothing here orders this read after
   the four tasks -- confirm whether an event wait list or clFinish is needed
   before it. */
ret = clEnqueueReadBuffer(command_queue, Cmobj, CL_TRUE, 0, 4*4*sizeof(float), C, 0, NULL, NULL);
/* Display result */
/* Prints C as four rows of four values. */
for (i=0; i<4; i++) {
for (j=0; j<4; j++) {
printf("%7.2f ", C[i*4+j]);
}
printf("\n");
}
/* Finalization */
/* Drain the queue, then release every OpenCL object and the host arrays. */
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel[0]);
ret = clReleaseKernel(kernel[1]);
ret = clReleaseKernel(kernel[2]);
ret = clReleaseKernel(kernel[3]);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(Amobj);
ret = clReleaseMemObject(Bmobj);
ret = clReleaseMemObject(Cmobj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(A);
free(B);
free(C);
return 0;
}
Upvotes: 1
Views: 1894
Reputation: 20018
The command_queue is not being created as clCreateCommandQueue
is returning -35: CL_INVALID_QUEUE_PROPERTIES
, so basically nothing is working beyond that (no kernels even get to run). You're just printing out whatever random memory values the C
matrix memory gets mapped to (as it is uninitialised). You really need to check the return values of all the API calls for errors, which would have highlighted this right away.
The error relates to your use of the CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
flag. This isn't supported apparently, and in any case it doesn't really do what you want. This flag tells the OpenCL runtime that the kernels do not need to be executed in the same order in which they are enqueued, for a given queue. But the nature of the operation is still sequential kernel execution with data parallelism. This is different to running the kernels concurrently, which is what you want with task parallel execution.
What you need to do is create four command queues, one for each kernel. Then you can wait on events for all the queues to finish. You'll need to be careful if you're sharing the same output matrix though, to ensure you don't accidentally introduce a race condition.
The Task Parallel model is described in section 3.4.2 of the OpenCL 1.2 Reference Manual. When running multiple queues like this, you probably want to use events to track the execution and completion status of each queue. See section 5.9 of the reference for details.
Here's your test code, updated with multiple queues and running the tasks in parallel. I did a quick verification that the results are correct.
#include <stdio.h>
#include <stdlib.h>
#include <OpenCL/opencl.h>
#define MAX_SOURCE_SIZE (0x100000)
/*
 * OpenCL C source for the four task kernels, handed to
 * clCreateProgramWithSource() as a single string.
 *
 * Every kernel reads A and B and writes C at elements base, base+4, base+8
 * and base+12, where `base` is fixed per kernel (Add: 0, Sub: 1, Mul: 2,
 * Div: 3).  With the data laid out as a row-major 4x4 matrix this is column
 * `base`, so the four kernels write disjoint parts of C.
 *
 * Adjacent string literals are concatenated by the compiler, so no
 * backslash line continuations are required.
 */
const char *_kernel =
    "\n"
    "__kernel void taskParallelAdd(__global float* A, __global float* B, __global float* C) \n"
    "{ \n"
    " int base = 0; \n"
    " \n"
    " C[base+0] = A[base+0] + B[base+0]; \n"
    " C[base+4] = A[base+4] + B[base+4]; \n"
    " C[base+8] = A[base+8] + B[base+8]; \n"
    " C[base+12] = A[base+12] + B[base+12]; \n"
    "} \n"
    " \n"
    "__kernel void taskParallelSub(__global float* A, __global float* B, __global float* C) \n"
    "{ \n"
    " int base = 1; \n"
    " \n"
    " C[base+0] = A[base+0] - B[base+0]; \n"
    " C[base+4] = A[base+4] - B[base+4]; \n"
    " C[base+8] = A[base+8] - B[base+8]; \n"
    " C[base+12] = A[base+12] - B[base+12]; \n"
    "} \n"
    " \n"
    "__kernel void taskParallelMul(__global float* A, __global float* B, __global float* C) \n"
    "{ \n"
    " int base = 2; \n"
    " \n"
    " C[base+0] = A[base+0] * B[base+0]; \n"
    " C[base+4] = A[base+4] * B[base+4]; \n"
    " C[base+8] = A[base+8] * B[base+8]; \n"
    " C[base+12] = A[base+12] * B[base+12]; \n"
    "} \n"
    " \n"
    "__kernel void taskParallelDiv(__global float* A, __global float* B, __global float* C) \n"
    "{ \n"
    " int base = 3; \n"
    " \n"
    " C[base+0] = A[base+0] / B[base+0]; \n"
    " C[base+4] = A[base+4] / B[base+4]; \n"
    " C[base+8] = A[base+8] / B[base+8]; \n"
    " C[base+12] = A[base+12] / B[base+12]; \n"
    "} \n"
    " \n";
/*
 * Report a failed OpenCL call on stderr.
 *
 * status: value returned (or written through errcode_ret) by the call.
 * what:   short description of the call, used in the message.
 *
 * Returns 0 when status is CL_SUCCESS and 1 otherwise, so callers can write
 * `if (cl_failed(ret, "...")) goto cleanup;`.
 */
static int cl_failed(cl_int status, const char *what)
{
    if (status == CL_SUCCESS) {
        return 0;
    }
    fprintf(stderr, "OpenCL error %d in %s\n", (int)status, what);
    return 1;
}

/*
 * Print the compiler output for `program` on `device` to stderr.
 * Best effort: silently does nothing if the log cannot be obtained.
 */
static void print_build_log(cl_program program, cl_device_id device)
{
    size_t log_size = 0;
    char *log;

    /* First call only asks how large the log is. */
    if (clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                              0, NULL, &log_size) != CL_SUCCESS ||
        log_size == 0) {
        return;
    }

    log = malloc(log_size + 1);
    if (log == NULL) {
        return;
    }

    if (clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                              log_size, log, NULL) == CL_SUCCESS) {
        log[log_size] = '\0';
        fprintf(stderr, "Build log:\n%s\n", log);
    }
    free(log);
}

/*
 * Host program: runs the four kernels from `_kernel` as independent tasks,
 * one in-order command queue per kernel, then reads back and prints the
 * 4x4 result matrix.
 *
 * Every OpenCL call is checked; the first failure is reported on stderr and
 * the program releases whatever it has created so far.
 *
 * Returns EXIT_SUCCESS (0) when the result was computed and printed,
 * EXIT_FAILURE otherwise.
 */
int main(void)
{
    enum { DIM = 4, CELLS = DIM * DIM, NUM_TASKS = 4 };
    static const char *const kernel_names[NUM_TASKS] = {
        "taskParallelAdd",
        "taskParallelSub",
        "taskParallelMul",
        "taskParallelDiv"
    };
    const size_t bytes = CELLS * sizeof(float);

    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_context context = NULL;
    cl_command_queue command_queue[NUM_TASKS] = {NULL, NULL, NULL, NULL};
    cl_mem Amobj = NULL;
    cl_mem Bmobj = NULL;
    cl_mem Cmobj = NULL;
    cl_program program = NULL;
    cl_kernel kernel[NUM_TASKS] = {NULL, NULL, NULL, NULL};
    cl_uint ret_num_devices = 0;
    cl_uint ret_num_platforms = 0;
    cl_int ret;
    int exit_code = EXIT_FAILURE;
    int i, j;
    float *A = malloc(bytes);
    float *B = malloc(bytes);
    float *C = malloc(bytes);

    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "Out of memory\n");
        goto cleanup;
    }

    /* Initialize input data: A is 1..16 row-major, B is its transpose. */
    for (i = 0; i < DIM; i++) {
        for (j = 0; j < DIM; j++) {
            const int idx = i * DIM + j;
            const int a_val = i * DIM + j + 1;
            const int b_val = j * DIM + i + 1;

            A[idx] = (float)a_val;
            printf("A[%d] = %d\n", idx, a_val);
            B[idx] = (float)b_val;
            printf("B[%d] = %d\n", idx, b_val);
        }
    }

    /* Get platform/device information */
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    if (cl_failed(ret, "clGetPlatformIDs")) {
        goto cleanup;
    }
    if (ret_num_platforms == 0) {
        fprintf(stderr, "No OpenCL platform found\n");
        goto cleanup;
    }
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id,
                         &ret_num_devices);
    if (cl_failed(ret, "clGetDeviceIDs")) {
        goto cleanup;
    }

    /* Create OpenCL Context */
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
    if (cl_failed(ret, "clCreateContext")) {
        goto cleanup;
    }

    /* Create buffer objects */
    Amobj = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, &ret);
    if (cl_failed(ret, "clCreateBuffer(A)")) {
        goto cleanup;
    }
    Bmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, &ret);
    if (cl_failed(ret, "clCreateBuffer(B)")) {
        goto cleanup;
    }
    Cmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, &ret);
    if (cl_failed(ret, "clCreateBuffer(C)")) {
        goto cleanup;
    }

    /* One plain in-order queue per task, so the tasks can run concurrently. */
    for (i = 0; i < NUM_TASKS; i++) {
        command_queue[i] = clCreateCommandQueue(context, device_id, 0, &ret);
        if (cl_failed(ret, "clCreateCommandQueue")) {
            goto cleanup;
        }
    }

    /*
     * Upload the inputs once.  The buffers belong to the context, not to a
     * queue, so one upload serves all four kernels.  clFinish makes sure the
     * writes have completed before any other queue starts reading A and B.
     */
    ret = clEnqueueWriteBuffer(command_queue[0], Amobj, CL_TRUE, 0, bytes, A,
                               0, NULL, NULL);
    if (cl_failed(ret, "clEnqueueWriteBuffer(A)")) {
        goto cleanup;
    }
    ret = clEnqueueWriteBuffer(command_queue[0], Bmobj, CL_TRUE, 0, bytes, B,
                               0, NULL, NULL);
    if (cl_failed(ret, "clEnqueueWriteBuffer(B)")) {
        goto cleanup;
    }
    ret = clFinish(command_queue[0]);
    if (cl_failed(ret, "clFinish(upload)")) {
        goto cleanup;
    }

    /* Create kernel from source */
    program = clCreateProgramWithSource(context, 1, (const char **)&_kernel,
                                        NULL, &ret);
    if (cl_failed(ret, "clCreateProgramWithSource")) {
        goto cleanup;
    }
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    if (cl_failed(ret, "clBuildProgram")) {
        print_build_log(program, device_id);
        goto cleanup;
    }

    /* Create the task kernels and bind the shared A, B, C buffers. */
    for (i = 0; i < NUM_TASKS; i++) {
        kernel[i] = clCreateKernel(program, kernel_names[i], &ret);
        if (cl_failed(ret, kernel_names[i])) {
            goto cleanup;
        }
        ret = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), (void *)&Amobj);
        if (cl_failed(ret, "clSetKernelArg(A)")) {
            goto cleanup;
        }
        ret = clSetKernelArg(kernel[i], 1, sizeof(cl_mem), (void *)&Bmobj);
        if (cl_failed(ret, "clSetKernelArg(B)")) {
            goto cleanup;
        }
        ret = clSetKernelArg(kernel[i], 2, sizeof(cl_mem), (void *)&Cmobj);
        if (cl_failed(ret, "clSetKernelArg(C)")) {
            goto cleanup;
        }
    }

    /*
     * Execute the kernels as parallel tasks.  Each kernel writes a different
     * set of C elements (base, base+4, base+8, base+12 with a distinct base),
     * so sharing the output buffer does not race.  clFlush submits each queue
     * right away instead of waiting for the clFinish loop below to reach it.
     */
    for (i = 0; i < NUM_TASKS; i++) {
        ret = clEnqueueTask(command_queue[i], kernel[i], 0, NULL, NULL);
        if (cl_failed(ret, "clEnqueueTask")) {
            goto cleanup;
        }
        ret = clFlush(command_queue[i]);
        if (cl_failed(ret, "clFlush")) {
            goto cleanup;
        }
    }

    /* Wait for each queue to finish */
    for (i = 0; i < NUM_TASKS; i++) {
        printf("Waiting for %d to finish...\n", i);
        ret = clFinish(command_queue[i]);
        if (cl_failed(ret, "clFinish")) {
            goto cleanup;
        }
    }

    /* All tasks are done; any queue can be used for the blocking read-back. */
    ret = clEnqueueReadBuffer(command_queue[0], Cmobj, CL_TRUE, 0, bytes, C,
                              0, NULL, NULL);
    if (cl_failed(ret, "clEnqueueReadBuffer(C)")) {
        goto cleanup;
    }

    /* Display result */
    for (i = 0; i < DIM; i++) {
        for (j = 0; j < DIM; j++) {
            printf("%7.2f ", C[i * DIM + j]);
        }
        printf("\n");
    }

    exit_code = EXIT_SUCCESS;

cleanup:
    /* Release only what was actually created; handles start out NULL. */
    for (i = 0; i < NUM_TASKS; i++) {
        if (kernel[i] != NULL) {
            clReleaseKernel(kernel[i]);
        }
    }
    if (program != NULL) {
        clReleaseProgram(program);
    }
    if (Amobj != NULL) {
        clReleaseMemObject(Amobj);
    }
    if (Bmobj != NULL) {
        clReleaseMemObject(Bmobj);
    }
    if (Cmobj != NULL) {
        clReleaseMemObject(Cmobj);
    }
    for (i = 0; i < NUM_TASKS; i++) {
        if (command_queue[i] != NULL) {
            clReleaseCommandQueue(command_queue[i]);
        }
    }
    if (context != NULL) {
        clReleaseContext(context);
    }
    free(A);
    free(B);
    free(C);
    return exit_code;
}
Upvotes: 1