Reputation: 53
I’ve been trying to get a parallel implementation of the Gaussian elimination process done for some time now. It seems as if the kernels are ignoring the barriers set out, executing all the operations it can, AND THEN lets the next kernel do its work. But I need them to do work together, repeatedly. My input A is the modified matrix with the last column being the figurative 'output'. In words: Each kernel performs row-reduction on a separate jth row, to make the element in the ith column zero. Barrier. Rinse and repeat. At the end there will be an identity matrix. The same kernels that did the jth row also assign the x[j] to the value in the last column.
The main program creates a specified matrix organized in 2D, then flattens it to 1D for the kernel. It also solves the system serially on its own and compares the serial results to the OpenCL results that are passed back.
Main:
// Entry point: initializes OpenCL, reads the desired work-item count,
// builds the problem data, runs the kernel, and releases all resources.
// Returns 0 on success, -1 if OpenCL setup or user input fails.
int main(){
// Initialize OpenCL.
if(!init_opencl()){return -1;}
// Request user for size of parallel operation.
kr=k_input();
// BUG FIX: k_input() returns 0 on EOF or unparseable input; launching an
// NDRange with a zero global work size would fail, so release what
// init_opencl() allocated and bail out instead.
if(kr <= 0){
cleanup();
return -1;
}
// Initialize the problem data. Requires number of devices to be known.
init_problem();
// Run the kernel.
run();
// Free the resources allocated.
cleanup();
return 0;
}
// Initializes the OpenCL objects.
// Locates the platform and devices, builds the program from the AOCX
// binary, reads the augmented input matrix, flattens it into parallelA,
// and creates one command queue / kernel / buffer set per device.
// Returns false only when the working directory or platform cannot be
// found; every other failure aborts via checkError().
bool init_opencl(){
cl_int status;
printf("Initializing OpenCL\n");
if(!setCwdToExeDir()) {
return false; }
// Get the OpenCL platform.
platform = findPlatform("Intel"); // CHANGE TO Altera FOR FINAL
if(platform == NULL) {
printf("ERROR: Unable to find Intel OpenCL platform.\n");
return false; }
// Query the available OpenCL device.
device.reset(getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices));
printf("Platform: %s\n", getPlatformName(platform).c_str());
printf("Using %d device(s)\n", num_devices);
for(unsigned i = 0; i < num_devices; ++i) {
printf(" %s\n\n", getDeviceName(device[i]).c_str()); // ANOTHER LINE FOR PRINTF DEBUGGING
}
// Create the context.
context = clCreateContext(NULL, num_devices, device, NULL, NULL, &status);
checkError(status, "Failed to create context");
// Create the program for all device. Use the first device as the
// representative device (assuming all device are of the same type).
std::string binary_file = getBoardBinaryFile("GaussianEl", device[0]);
printf("Using AOCX: %s\n", binary_file.c_str());
program = createProgramFromBinary(context, binary_file.c_str(), device, num_devices);
// Build the program that was just created.
status = clBuildProgram(program, 0, NULL, "", NULL, NULL);
checkError(status, "Failed to build program");
// Create per-device objects.
queue.reset(num_devices);
kernel.reset(num_devices);
n_per_device.reset(num_devices);
input_a_buf.reset(num_devices);
input_b_buf.reset(num_devices);
output_buf.reset(num_devices);
// Read the augmented matrix A (n rows, n+1 columns; last column is the
// right-hand side) and flatten it row-major into parallelA.
n = Make_Matrix(A);N=n;
for (int j = 0; j < n ; j++)
{
for (int k = 0 ; k < n+1 ; k++)
{
parallelA[k+j*(1+n)]=A[j][k];
}
}
for(unsigned i = 0; i < num_devices; ++i) {
// Command queue (profiling enabled for the kernel-time report in run()).
queue[i] = clCreateCommandQueue(context, device[i], CL_QUEUE_PROFILING_ENABLE, &status);
checkError(status, "Failed to create command queue");
// Kernel.
const char *kernel_name = "GaussianEl";
kernel[i] = clCreateKernel(program, kernel_name, &status);
checkError(status, "Failed to create kernel");
// Determine the number of elements processed by this device.
n_per_device[i] = N / num_devices; // Maybe change to n ?
// Spread out the remainder of the elements over the first
// N % num_devices.
if(i < (N % num_devices)) {
n_per_device[i]++; }
// Input buffers. // A, x, n
// NOTE(review): this buffer holds n_per_device[i]*(n_per_device[i]+1)
// floats, but the host-side parallelA written into it holds n*(n+1)
// floats. The sizes match only when num_devices == 1 — TODO confirm
// the intended per-device matrix split before running multi-device.
input_a_buf[i] = clCreateBuffer(context, CL_MEM_READ_WRITE,
n_per_device[i]*(n_per_device[i]+1)*sizeof(float), NULL, &status);
checkError(status, "Failed to create buffer for input A");
// Single int carrying the matrix dimension n to the kernel.
input_b_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY,
sizeof(int), NULL, &status);
checkError(status, "Failed to create buffer for input B");
// Output buffer: the solution vector slice for this device.
output_buf[i] = clCreateBuffer(context, CL_MEM_READ_WRITE,
n_per_device[i] * sizeof(float), NULL, &status);
checkError(status, "Failed to create buffer for output");
}
return true;
}
// Request value for k.
// Prompts for the number of parallel work-items to launch and accepts
// only exact powers of two in [1, 1024], re-prompting on out-of-range
// or non-power-of-two values.
// Returns the accepted value, or 0 on read error / EOF / non-numeric
// input (callers must treat 0 as failure).
long int k_input(){
int ch;
char *p;
char buffer[16];
// Drain anything left on stdin from earlier interaction.
while ( ( ch=getchar() ) != '\n' && (ch != EOF) ){}
printf ( "In base 2, ranging from 1 to 1024, enter the number of parallel threads to execute: " );
for(;;){
if( fgets( buffer,sizeof(buffer),stdin ) == NULL ){
// fgets NULL means read error or EOF; nothing more to parse.
printf ( "Error reading input.\n" );
return 0;
}
long int kr = strtol(buffer,&p,10);
if( buffer[0] == '\n' || (*p != '\n' && *p != '\0') ){
// Not a clean integer: strip the newline and report the
// unparsed tail, matching the original fatal-error behavior.
buffer[strcspn(buffer, "\n")] = '\0';
printf("Received %ld. Invalid input: \"%s\". Fatal Error.\n", kr, p);
return 0;
}
// BUG FIX: the original only checked kr <= 1024, letting 0 and
// negative values fall into a float log2() path; enforce the full
// documented range here.
if( kr < 1 || kr > 1024 ){
printf("Too many kernals, must be from 1 to 1024.\n");
continue;
}
// Exact integer power-of-two test; avoids the float log2/fmodf
// rounding the original relied on.
if( (kr & (kr - 1)) != 0 ){
printf("%ld is not a power of 2, try again..\n",kr);
continue;
}
printf("%ld kernals will be launched, good job!\n\n",kr);
return kr;
}
}
// Initialize the data for the problem. Requires num_devices to be known.
// Allocates the per-device host arrays, copies the flattened matrix and
// dimension into them, and computes the timed serial reference solution
// used later for verification.
void init_problem(){
if(num_devices == 0) {
checkError(-1, "No devices"); }
input_a.reset(num_devices);
input_b.reset(num_devices);
output.reset(num_devices);
ref_output.reset(num_devices);
// BUG FIX: the serial solve and its timing were previously inside the
// per-device loop. Solve_Matrix reduces A in place, so with more than
// one device it would re-reduce an already-solved matrix, and
// start_time/end_time were overwritten every iteration. Solve once,
// up front.
start_time = getCurrentTimestamp();
Solve_Matrix(n,A,x);
end_time = getCurrentTimestamp();
// Generate input vectors A and B and the reference output consisting
// of a total of N elements.
// We create separate arrays for each device so that each device has an
// aligned buffer.
for(unsigned i = 0; i < num_devices; ++i) {
input_a[i].reset(n_per_device[i]*(n_per_device[i]+1));
input_b[i].reset(1);
output[i].reset(n_per_device[i]);
ref_output[i].reset(n_per_device[i]);
input_a[i]=parallelA;
input_b[i][0]=n;
// Every device verifies against the same serial solution x.
ref_output[i]=x;
}
Print_Solution(n,x);
}
// Run kernel ///////////////////////////////////
// Transfers the flattened matrix and its dimension to every device,
// launches GaussianEl with kr work-items, reads back the solution
// vector, reports serial vs kernel timing, and verifies the device
// output against the serial reference.
void run(){
cl_int status;
// Launch the problem for each device.
scoped_array<cl_event> kernel_event(num_devices);
scoped_array<cl_event> finish_event(num_devices);
for(unsigned i = 0; i < num_devices; ++i) {
// Transfer inputs to each device. Each of the host buffers supplied to
// clEnqueueWriteBuffer here is already aligned to ensure that DMA is used
// for the host-to-device transfer.
cl_event write_event[2];
// Matrix write is blocking (CL_TRUE); the event is still recorded so
// the kernel launch can list it in its wait list.
status = clEnqueueWriteBuffer(queue[i], input_a_buf[i], CL_TRUE,
0, n_per_device[i]*(n_per_device[i]+1)*sizeof(float), input_a[i], 0, NULL, &write_event[0]);
checkError(status, "Failed to transfer input A");
// Dimension n, written asynchronously.
status = clEnqueueWriteBuffer(queue[i], input_b_buf[i], CL_FALSE,
0, sizeof(int), input_b[i], 0, NULL, &write_event[1]);
checkError(status, "Failed to transfer input B");
// Set kernel arguments: A (matrix), y (dimension), z (solution out).
unsigned argi = 0;
status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_a_buf[i]);
checkError(status, "Failed to set argument %d", argi - 1);
status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_b_buf[i]);
checkError(status, "Failed to set argument %d", argi - 1);
status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &output_buf[i]);
checkError(status, "Failed to set argument %d", argi - 1);
// Enqueue kernel. The global work size is the user-requested kr; the
// local work size is left to the runtime. The two write events gate
// the launch so both inputs reach the device first.
const size_t global_work_size = kr; // kr = (1024, 512, 256, etc)
printf("\nLaunching for device %d (%lu elements)\n", i, global_work_size);
status = clEnqueueNDRangeKernel(queue[i], kernel[i], 1, NULL,
&global_work_size, NULL, 2, write_event, &kernel_event[i]);
checkError(status, "Failed to launch kernel");
// Read the result. This is the final operation.
status = clEnqueueReadBuffer(queue[i], output_buf[i], CL_FALSE,
0, n_per_device[i]*sizeof(float), output[i], 1, &kernel_event[i], &finish_event[i]);
// BUG FIX: this status was previously never checked, so a failed
// read-back would silently leave output[i] stale.
checkError(status, "Failed to read output");
// Release local events.
clReleaseEvent(write_event[0]);
clReleaseEvent(write_event[1]);
}
// Wait for all devices to finish.
clWaitForEvents(num_devices, finish_event);
// Wall-clock time taken by the serial reference solve (ms).
Serial_Time = (end_time - start_time) * 1e3;
printf("\nSerial Time: %0.3f ms\n", Serial_Time);
// Get kernel times using the OpenCL event profiling API.
for(unsigned i = 0; i < num_devices; ++i) {
cl_ulong time_ns = getStartEndTime(kernel_event[i]);
printf("Kernel time (device %d): %0.3f ms\n", i, double(time_ns) * 1e-6);
// BUG FIX: trailing newline added so a following verification report
// no longer runs into the speed-up figure on the same output line.
printf("Speed-up is: %0.3f\n", Serial_Time/(double(time_ns)*1e-6));
}
// Release all events.
for(unsigned i = 0; i < num_devices; ++i) {
clReleaseEvent(kernel_event[i]);
clReleaseEvent(finish_event[i]);
}
// Verify results against the serial reference, element by element.
bool pass = true;
for(unsigned i = 0; i < num_devices && pass; ++i) {
for(unsigned j = 0; j < n_per_device[i] && pass; ++j) {
if(fabsf(output[i][j] - ref_output[i][j]) > 1.0e-5f) {
printf("Failed verification @ device %d, index %d\nOutput: %f\nReference: %f\n",
i, j, output[i][j], ref_output[i][j]);
pass = false;
}
}
}
printf("\nVerification: %s\n", pass ? "PASS" : "FAIL");
}
// Free the resources allocated during initialization.
// Safe to call after a partial init_opencl(): each scoped array and each
// individual handle is guarded before release.
void cleanup(){
// Per-device objects first.
for(unsigned dev = 0; dev < num_devices; ++dev) {
if(kernel && kernel[dev])
clReleaseKernel(kernel[dev]);
if(queue && queue[dev])
clReleaseCommandQueue(queue[dev]);
if(input_a_buf && input_a_buf[dev])
clReleaseMemObject(input_a_buf[dev]);
if(input_b_buf && input_b_buf[dev])
clReleaseMemObject(input_b_buf[dev]);
if(output_buf && output_buf[dev])
clReleaseMemObject(output_buf[dev]);
}
// Program and context are shared across all devices; release them last.
if(program)
clReleaseProgram(program);
if(context)
clReleaseContext(context);
}
Kernel:
// Forward-elimination stage of Gaussian elimination on the augmented
// matrix A (n rows x (n+1) columns, row-major; last column is the RHS).
// Rows are dealt out to work-items round-robin: work-item `index` owns
// every row j with j % kr == index. For each pivot column i, the owner
// of the pivot row normalizes it and owners of the rows below eliminate
// their column-i entry; then all items hit the barrier.
// NOTE(review): barrier() only synchronizes work-items within ONE
// work-group — it cannot order work across work-groups or devices. If
// the runtime splits the kr global items into several groups, rows
// updated by other groups are read with no ordering guarantee, which
// matches the interleaved output described in the question. TODO
// confirm the launch always fits in a single work-group.
__kernel void GaussianEl(__global float *A,
__global int *y,
__global float *restrict z)
{
unsigned index = get_global_id(0); // get index of the work item
unsigned kr = get_global_size(0); // number of global work items
unsigned n = *y; // dimension of matrix
float scale; // row scale factor for the current operation
int a,b,i,j,k,h; // loop counters / scratch (a, b unused in this excerpt)
if(index==0) { // Show received info is good
printf("----------------------\nThe dimension size is: %d\nThe kr is: %d\nThe original matrix is:\n",n,kr); //
print_matrix(n, index, A);
}
for(i=0; i<n; i++) //* LOOP FOR UPPER TRIANGULAR MATRIX
{
// NOTE(review): a zero pivot is only reported, not repaired (no row
// swap / partial pivoting), so the divisions below still execute.
if( A[i+i*(n+1)] == 0 ){printf("Error, divide by zero encountered.\n");} // do something for if divide by zero for re-calculating matrix?
for(j=0; j<n; j++) // THIS FOR REPRESENTS THE jTH ROW OF THE iTH COLUMN PARALLELIZED
{ // BARRIER MUST BE PLACED AFTER THE iTH COLUMN ELEMENT IN A ROW IS ZEROED
h=j/kr; // round number when rows are dealt out kr at a time (j - h*kr == j % kr)
if(j==i && index==(j-h*kr)) // this work-item owns the pivot row: normalize it to make A[i][i] == 1
{
scale=A[i+i*(n+1)];
printf("kernel=%d, i=%d, j=%d, h=%d, scale=%.1f/%.1f, Value=%d\n", index, i, j, h, A[i+j*(n+1)], A[i+i*(n+1)],(j-h*kr) );
for(k=i; k<=n; k++) // divide the pivot row (incl. RHS column) by the pivot
{
A[k+j*(n+1)]=A[k+j*(n+1)]/scale;
}
print_matrix(n, index, A);
}
if( (j>i) && (index==(j-h*kr)) ) // this work-item owns a lower row: zero its column-i entry
{
scale=A[i+j*(n+1)]/A[i+i*(n+1)];
printf("kernel=%d, i=%d, j=%d, h=%d, scale=%.1f/%.1f, Value=%d\n", index, i, j, h, A[i+j*(n+1)], A[i+i*(n+1)],(j-h*kr) );
for(k=0; k<=n; k++) // row_j -= scale * row_i across the full augmented row
{
A[k+j*(n+1)]=A[k+j*(n+1)]-scale*A[k+i*(n+1)];
}
print_matrix(n, index, A);
}
}
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); // sync before the next pivot column (work-group scope only)
}
... // 1 more loop after this to finish upper half
Results:
The serial/parallel solution is:
x0=3.0
x1=4.0
x2=-2.0
Launching for device 0 (2 elements)
----------------------
The dimension size is: 3
The kr is: 2
The original matrix is:
1.0 1.0 1.0 5.0
2.0 3.0 5.0 8.0
4.0 0.0 5.0 2.0
kernel=0, i=0, j=0, h=0, scale=1.0/1.0, Value=0
1.0 1.0 1.0 5.0
2.0 3.0 5.0 8.0
4.0 0.0 5.0 2.0
kernel=0, i=0, j=2, h=1, scale=4.0/1.0, Value=0
1.0 1.0 1.0 5.0
2.0 3.0 5.0 8.0
0.0 -4.0 1.0 -18.0
kernel=0, i=1, j=2, h=1, scale=-4.0/3.0, Value=0
1.0 1.0 1.0 5.0
2.0 3.0 5.0 8.0
2.7 0.0 7.7 -7.3
kernel=0, i=2, j=2, h=1, scale=7.7/7.7, Value=0
1.0 1.0 1.0 5.0
2.0 3.0 5.0 8.0
2.7 0.0 1.0 -1.0
..and now the rest...
test 1
test 2: i=0,j=0, h=0
test 2: i=0,j=1, h=0
test 2: i=0,j=2, h=1
test 1
test 2: i=1,j=0, h=0
kernel=0, i=1, j=0, h=0, scale=1.0/3.0, Value=0
0.3 0.0 -0.7 2.3
2.0 3.0 5.0 8.0
2.7 0.0 1.0 -1.0
test 2: i=1,j=1, h=0
test 2: i=1,j=2, h=1
test 1
test 2: i=2,j=0, h=0
kernel=0, i=2, j=0, h=0, scale=-0.7/1.0, Value=0
2.1 0.0 0.0 1.7
2.0 3.0 5.0 8.0
2.7 0.0 1.0 -1.0
test 2: i=2,j=1, h=0
test 2: i=2,j=2, h=1
The value of x0 is: 1.7
The value of x1 is: 8.0
The value of x2 is: -1.0
kernel=1, i=0, j=1, h=0, scale=2.0/2.1, Value=1
kernel=1, i=1, j=1, h=0, scale=3.0/3.0, Value=1
test 1
test 2: i=0,j=0, h=0
test 2: i=0,j=1, h=0
test 2: i=0,j=2, h=1
test 1
test 2: i=1,j=0, h=0
test 2: i=1,j=1, h=0
test 2: i=1,j=2, h=1
test 1
test 2: i=2,j=0, h=0
test 2: i=2,j=1, h=0
test 2: i=2,j=2, h=1
The value of x0 is: 1.7
The value of x1 is: 2.1
The value of x2 is: -1.0
Serial Time: 0.570 ms
Kernel time (device 0): 1.442 ms
Speed-up is: 0.395Failed verification @ device 0, index 0
Output: 1.695652
Reference: 3.000000
Verification: FAIL
The problem seems to occur because I am using APIs and am scheduling the kernels in a way that I did in previous labs. The problem is that in previous labs, the kernels executed their respective sections, and returned. When I had debugging print statements within the kernel code, the command window would show them one after the other. And that was fine, and it worked, and I could see how it worked because I could see the speedup (when I uploaded the code to the Altera board, NOT during emulation of course). However, now I need the results returned in a global sense before the kernel is finished, this needs to happen multiple times, and all kernels need to be able to read this change to the matrix that the other kernels perform. I've been fooling around with clEnqueueMapBuffer but I'm not even certain if this is the right API to use, or the right replacement for clEnqueueWriteBuffer. I'll be removing this question tonight as requested by my professor.
Upvotes: 0
Views: 381
Reputation: 8484
First of all you should have mentioned from the beginning that it's about Altera FPGA as OpenCL works there a bit differently than on GPU.
From my experience, especially when working with FPGA, you need to do it step by step. So:
That's more or less all I remember from the time when I was developing kernels for Altera FPGA.
Upvotes: 1