Reputation: 477
Right now I programmed made several algorithms running in parallel on one GPU, but all of them have the same problem, when I try to execute them on several GPUs (for example, 3). The problem is that the code, executed on one GPU executes exactly the same amount of time on 3 GPUs (not faster). I tried to execute with more data, tried different tasks to be executed, nothing helped. Finally, I ended up trying to run the easiest task like elements sum and still got this awful mistake. That is why I don't believe it is a problem of a particular algorithm and I feel there is a mistake in my code (or even in my approach to parallelizing code on several GPUs).
Here is the header file for my Parallel.cpp class:
#ifndef PARALLEL_H
#define PARALLEL_H
#define __NO_STD_VECTOR // Use cl::vector and cl::string and
#define __NO_STD_STRING // not STL versions, more on this later
#include <CL/cl.h>
class Parallel
{
public:
Parallel();
int executeAttachVectorsKernel(int*, int*, int*, int);
static void getMaxWorkGroupSize(int*, int*, int*);
virtual ~Parallel();
protected:
private:
char* file_contents(const char*, int*);
void getShortInfo(cl_device_id);
int init(void);
cl_platform_id platform;
cl_device_id* devices;
cl_uint num_devices;
cl_command_queue* queues;
int* WGSizes;
int* WGNumbers;
cl_context context;
cl_program program;
cl_kernel kernel;
cl_mem input1;
cl_mem input2;
cl_mem output;
};
#endif // PARALLEL_H
Here is the initialization method init:
int Parallel::init() {
cl_int err;
//Connect to the first platfrom
err = clGetPlatformIDs(1, &platform, NULL);
if (err != CL_SUCCESS) {
cerr << "Error occured while executing clGetPlatformIDs" << endl;
return EXIT_FAILURE;
}
//Get devices number
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
if (err != CL_SUCCESS) {
cerr << "Error: Failed to create a device group:" << endl;
return EXIT_FAILURE;
}
cout << "NUM DEVICES =" << num_devices << endl;
devices = new cl_device_id[num_devices];
//Get all the GPU devices
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices, devices, NULL);
//Create one context for all the devices
context = clCreateContext(NULL, num_devices, devices, NULL, NULL, &err);
if (!context) {
cerr << "Error: Failed to create a compute context!" << endl;
return EXIT_FAILURE;
}
queues = new cl_command_queue[num_devices];
WGNumbers = new int[num_devices];
WGSizes = new int[num_devices];
for(int i = 0; i < num_devices; i++) {
//Create a command queue for every device
queues[i] = clCreateCommandQueue(context, devices[i], 0, &err);
if (!queues[i]) {
cerr << "Error: Failed to create a command commands!" << endl;
return EXIT_FAILURE;
}
cl_ulong temp;
clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(temp), &temp, NULL);
WGSizes[i] = (int)temp;
clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(temp), &temp, NULL);
WGNumbers[i] = (int)temp;
}
//Translate kernel code into chars
int pl;
size_t program_length;
string path = "./kernel/kernel_av.cl";
char* cSourceCL = file_contents(path.c_str(), &pl);
program_length = (size_t)pl;
//Create a program
program = clCreateProgramWithSource(context, 1,
(const char **) &cSourceCL, &program_length, &err);
if (!program) {
cerr << "Error: Failed to create compute program!" << endl;
return EXIT_FAILURE;
}
//Create an executable
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2048];
cerr << "Error: Failed to build program executable!" << endl;
exit(1);
}
// Create the compute kernel in the program
kernel = clCreateKernel(program, "calculate2dim", &err);
if (err != CL_SUCCESS)
{
cerr << "Error: Failed to create compute kernel!" << endl;
exit(1);
}
}
The method which executes kernel is here:
int Parallel::executeAttachVectorsKernel(int* data1, int* data2, int* results, int vectors_num) {
cl_int err;
size_t global; // global domain size for our calculation
size_t local; // local domain size for our calculation
int partition = vectors_num/num_devices;
unsigned int count = partition;
input1 = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int) * count, NULL, NULL);
input2 = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int) * count, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) * count, NULL, NULL);
if (!input1 || !input2 || !output) {
cerr << "Error: Failed to allocate device memory!" << endl;
exit(1);
}
int** data1_apart = new int*[num_devices];
int** data2_apart = new int*[num_devices];
int** results_apart = new int*[num_devices];
for(int i = 0; i < num_devices; i++) {
cout << "Executing parallel part on GPU " << i + 1 << endl;
cout << "Partition size = " << partition << endl;
data1_apart[i] = new int[partition];
data2_apart[i] = new int[partition];
results_apart[i] = new int[partition];
for(int j = i*partition, k = 0; k < partition; j++, k++) {
data1_apart[i][k] = data1[j];
data2_apart[i][k] = data2[j];
}
//Transfer the input vector into device memory
err = clEnqueueWriteBuffer(queues[i], input1,
CL_TRUE, 0, sizeof(int) * count,
data1_apart[i], 0, NULL, NULL);
err = clEnqueueWriteBuffer(queues[i], input2,
CL_TRUE, 0, sizeof(int) * count,
data2_apart[i], 0, NULL, NULL);
if (err != CL_SUCCESS)
{
cerr << "Error: Failed to write to source array!" << endl;
exit(1);
}
int parameter4 = count/WGNumbers[i];
//Set the arguments to the compute kernel
err = 0;
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input1);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &input2);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);
err |= clSetKernelArg(kernel, 3, sizeof(int), ¶meter4);
if (err != CL_SUCCESS)
{
cerr << "Error: Failed to set kernel arguments! " << err << endl;
exit(1);
}
global = WGNumbers[i];
local = WGSizes[i];
if(local > global) {
local = global;
}
cout << "global = " << global << " local = " << local << endl;
err = clEnqueueNDRangeKernel(queues[i], kernel,
1, NULL, &global, &local,
0, NULL, NULL);
if (err)
{
cerr << "Error: Failed to execute kernel!" << endl;
return EXIT_FAILURE;
}
}
for(int i = 0; i < num_devices; i++) {
//Wait for all commands to complete
clFinish(queues[i]);
//Read back the results from the device to verify the output
err = clEnqueueReadBuffer(queues[i], output,
CL_TRUE, 0, sizeof(int) * count,
results_apart[i], 0, NULL, NULL );
if (err != CL_SUCCESS)
{
cerr << "Error: Failed to read output array! " << err << endl;
exit(1);
}
for(int j = 0; j < partition; j++) {
results[i*partition + j] = results_apart[i][j];
}
delete [] data1_apart[i];
delete [] data2_apart[i];
delete [] results_apart[i];
}
clReleaseMemObject(input1);
clReleaseMemObject(input2);
clReleaseMemObject(output);
delete [] data1_apart;
delete [] data2_apart;
}
Before posting this question to stackoverflow I was fighting this problem for 2-3 weeks and now I really need someone's help, so I will highly appreciate any thoughts and answers!
Upvotes: 3
Views: 5793
Reputation:
Which GPUs are you using? I have a GTX590 which shows up at two GPU devices. When I tried to run on both devices with it seemed to wait for each device to finish before it moved to the next one (even though it was not suppose to). I don't know if Nvidia fixed this.
Reading some messages I think on the Nvidia site at the time I read something about Nvidia suggesting to create separate contexts for each device and run them in different threads. That's what I did and it works great. I used pthreads (or SDL_threads) for this. It's quite easy to setup.
Upvotes: 0
Reputation:
Here is what I think is happening. You call clEnqueueNDRangeKernel once for each participating opencl device. At this point, none of the kernels have started execution because clFlush has not been called. Next, you make a clFinish for each queue. The first clFinish call causes the first queued work group to run. It also waits for it to finish. Once the first work group completes, clFinish returns control to your app. Your app then calls clFinish for the next queue. This triggers the second work grout to run, and also waits for it to finish. So the work runs sequentially. The solution may be as simple as calling clFush immediately after each call to clEnqueueNDRangeKernel. This is how my AMD system behaves. I will post a working example shortly.
Upvotes: 2
Reputation: 9886
All your devices operate on the same buffers. The data will be moved between the devices when the kernels are executed. Without proper synchronization, the results will be undefined.
If possible, consider allocating a distinct set of buffers for each device.
Upvotes: 1