Reputation: 4585
I am trying to print the execution time for some functions on the GPU, but the timing on the GPU always comes out as 0. When I choose CL_DEVICE_TYPE_CPU in the following, it works fine.
errcode = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &ret_num_devices);
This works fine and shows a non-zero execution time, but if I choose CL_DEVICE_TYPE_GPU, it always shows 0, irrespective of the total number of data points and threads. Please note that in both cases (CL_DEVICE_TYPE_CPU and CL_DEVICE_TYPE_GPU), I am printing the execution time in the same way; my host code and kernel code are the same in both cases (that's the point of OpenCL!). The following are some of the code sections:
// openCL code to get platform and device ids
errcode = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
errcode = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
// to create context
clGPUContext = clCreateContext( NULL, 1, &device_id, NULL, NULL, &errcode);
//Create a command-queue
clCommandQue = clCreateCommandQueue(clGPUContext,
device_id, CL_QUEUE_PROFILING_ENABLE, &errcode);
// Setup device memory
d_instances= clCreateBuffer(clGPUContext,CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR,mem_size_i,instances->data, &errcode);
d_centroids = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size_c, NULL, &errcode);
d_distance = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size_d,NULL, &errcode);
// d_dist_X = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size4,NULL, &errcode);
//d_dist_Y = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size4,NULL, &errcode);
//to build program
clProgram = clCreateProgramWithSource(clGPUContext,1, (const char **)&source_str,(const
size_t*)&source_size, &errcode);
errcode = clBuildProgram(clProgram, 0,NULL, NULL, NULL, NULL);
if (errcode == CL_BUILD_PROGRAM_FAILURE)
{
// Determine the size of the log
size_t log_size;
clGetProgramBuildInfo(clProgram, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL,
&log_size);
// Allocate memory for the log
char *log = (char *) malloc(log_size);
// Get the log
clGetProgramBuildInfo(clProgram, device_id, CL_PROGRAM_BUILD_LOG, log_size, log,
NULL);
// Print the log
printf("%s\n", log);
}
clKernel = clCreateKernel(clProgram,"distance_finding", &errcode);
// Launch OpenCL kernel
size_t localWorkSize[1], globalWorkSize[1];
if(num_instances >= 500)
{
localWorkSize[0] = 500;
float block1=num_instances/localWorkSize[0];
int block= (int)(ceil(block1));
globalWorkSize[0] = block*localWorkSize[0];
}
else
{
localWorkSize[0]=num_instances;
globalWorkSize[0]=num_instances;
}
int iteration=1;
while(iteration < MAX_ITERATIONS)
{
errcode = clEnqueueWriteBuffer(clCommandQue,d_centroids , CL_TRUE, 0,
mem_size_c, (void*)centroids->data, 0, NULL, NULL);
errcode = clEnqueueWriteBuffer(clCommandQue,d_distance , CL_TRUE, 0, mem_size_d,
(void*)distance->data, 0, NULL, NULL);
//set kernel arguments
errcode = clSetKernelArg(clKernel, 0,sizeof(cl_mem), (void *)&d_instances);
errcode = clSetKernelArg(clKernel, 1,sizeof(cl_mem), (void *)&d_centroids);
errcode = clSetKernelArg(clKernel, 2,sizeof(cl_mem), (void *)&d_distance);
errcode = clSetKernelArg(clKernel, 3,sizeof(unsigned int), (void *)
&num_instances);
errcode = clSetKernelArg(clKernel,4,sizeof(unsigned int),(void *)&clusters);
errcode = clSetKernelArg(clKernel,5,sizeof(unsigned int),(void *)&dimensions);
errcode = clEnqueueNDRangeKernel(clCommandQue,clKernel, 1, NULL,
globalWorkSize,localWorkSize, 0, NULL, &myEvent);
clFinish(clCommandQue); // wait for all events to finish
clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_START,sizeof(cl_ulong),
&startTime, NULL);
clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END,sizeof(cl_ulong),
&endTime, NULL);
kernelExecTimeNs = endTime-startTime;
gpu_time+= kernelExecTimeNs;
// Retrieve result from device
errcode = clEnqueueReadBuffer(clCommandQue,d_distance, CL_TRUE, 0,
mem_size_d,distance->data, 0, NULL, NULL);
// Printing the time in ms
printf("\n\n Time taken by GPU is %llu ms",gpu_time/1000000);
If the way I am calculating the GPU timing is wrong, why would it work on a CPU (by changing to CL_DEVICE_TYPE_CPU)? What is wrong here?
Edited:
System Information
AMD APP SDK 2.4 AMD ATI FirePro GL 3D, having 800 cores
Kernel
#pragma OPENCL EXTENSION cl_khr_fp64:enable
// Euclidean distance between one data point and one centroid.
// Accumulates the squared component differences in single precision,
// then takes the square root (result widened to double on return).
double distance_cal(__local float* cent,float* data,int dimensions)
{
    float sum_sq = 0.00;
    for (int d = 0; d < dimensions; d++)
    {
        float diff = data[d] - cent[d];
        sum_sq += diff * diff;
    }
    double root = sqrt(sum_sq);
    return root;
}
// Gather one column of the column-major matrix x into the contiguous
// private buffer data. Element r of column `col` lives at x[col + r*len],
// where len is the column stride (number of columns per row of storage).
void fetch_col(float* data,__constant float* x,int col,int dimension,int len)
{
    int src = col;
    for (int r = 0; r < dimension; r++)
    {
        data[r] = x[src];
        src += len;
    }
}
// Same strided column gather as fetch_col, but reading from __global
// memory into a __local buffer. OpenCL requires a separate function
// because the address-space qualifiers differ.
void fetch_col_cen(__local float* data,__global float* x,int col,int dimension,int len)
{
    int src = col;
    for (int r = 0; r < dimension; r++)
    {
        data[r] = x[src];
        src += len;
    }
}
// Kernel: one work-item per data point. For each of the `clus` centroids,
// compute the distance from this work-item's point and store it in `dist`,
// where cluster i's distances occupy dist[i*inst .. i*inst + inst-1].
//
// NOTE(review): data_col[4] and cent[4] cap `dimensions` at 4 with no bounds
// check — confirm callers never pass dimensions > 4.
// NOTE(review): every work-item in a group writes the same values into the
// __local array `cent`, and the barrier executes BEFORE the write with no
// barrier after it, so work-items may read `cent` (inside distance_cal)
// while others are still writing it. The written values are identical, but
// this is not a well-defined use of local memory; consider having a single
// work-item fill `cent` and placing the barrier after the fill.
__kernel void distance_finding(__constant float* data,__global float* cen,__global float*
dist,int inst,int clus,const int dimensions)
{
int idx=get_global_id(0); // global work-item id == data point index
float data_col[4]; // this work-item's point, gathered into private memory
fetch_col( data_col,data,idx,dimensions,inst);
for(int i=0;i<clus;i++)
{
int k=i*inst; // offset of cluster i's column in dist
__local float cent[4]; // centroid i, staged in local memory
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
fetch_col_cen(cent,cen,i,dimensions,clus);
dist[idx+k]=distance_cal(cent,data_col,dimensions); // distance of point idx to centroid i
}
}
Upvotes: 0
Views: 2154
Reputation: 43
change to
clFinish(clCommandQue); // wait for all events to finish
// add this after clFinish()
// Ensure kernel execution is finished
clWaitForEvents(1 , &myEvent);
..
double gpu_time = endTime-startTime;
..
printf("\n\n Time taken by GPU is %0.3f ms", gpu_time/1000000.0);
Upvotes: 0
Reputation: 991
clEnqueueNDRangeKernel()
is asynchronous when it targets a GPU, and therefore you only see the time it took to enqueue the request, not the time it took to execute it.
That said, I could be wrong, but I usually write c++ code to do the timing and put the start_time
before the instruction and end_time
after the
clFinish(cmd_queue);
just like you did with the C++ timing code. That would be a good test, if you're sure your GPU shouldn't be finishing in 0 seconds.
Upvotes: 1
Reputation: 10906
An easy way to check would be to introduce an abnormally long operation inside the kernel. If THAT shows up as zero while there is a perceptible lag in the actual execution — then you have your answer.
That said, I believe (even though the indicated thread is for Linux, it probably holds water on Windows too) you might need to install the instrumented drivers to even have the system write to the performance counters. You can also use the CUDA profiler on nVidia's OpenCL implementation because it sits on top of CUDA.
Upvotes: 0