gpuguy
gpuguy

Reputation: 4585

Opencl: GPU Execution Time is always Zero

I am trying to print the execution time for some functions on the GPU. But the timing on the GPU always comes out to be 0. Also, when I choose CL_DEVICE_TYPE_CPU in the following, it works fine.

 errcode = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &ret_num_devices);

This works fine and shows a non-zero value of execution time, but if I choose CL_DEVICE_TYPE_GPU, then it always shows 0, irrespective of the total no. of data points and threads. Please note that in both cases (CL_DEVICE_TYPE_CPU and CL_DEVICE_TYPE_GPU), I am printing the execution time the same way. That is, my host code and my kernel code are the same in both cases (that's what OpenCL is for!). Following are some of the code sections:

  // openCL code to get platform and device ids
errcode = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
  errcode = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);

// to create context

   clGPUContext = clCreateContext( NULL, 1, &device_id, NULL, NULL, &errcode);
   //Create a command-queue
   clCommandQue = clCreateCommandQueue(clGPUContext, 
              device_id, CL_QUEUE_PROFILING_ENABLE, &errcode);

// Setup device memory
   d_instances= clCreateBuffer(clGPUContext,CL_MEM_READ_ONLY |    
  CL_MEM_COPY_HOST_PTR,mem_size_i,instances->data, &errcode);
  d_centroids = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size_c, NULL, &errcode);
  d_distance = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size_d,NULL, &errcode);
// d_dist_X = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size4,NULL, &errcode);
//d_dist_Y = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size4,NULL, &errcode);

//to build program
clProgram = clCreateProgramWithSource(clGPUContext,1, (const char **)&source_str,(const 
  size_t*)&source_size, &errcode);

  errcode = clBuildProgram(clProgram, 0,NULL, NULL, NULL, NULL);

  if (errcode == CL_BUILD_PROGRAM_FAILURE) 
{
    // Determine the size of the log
    size_t log_size;
    clGetProgramBuildInfo(clProgram, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, 
  &log_size);

    // Allocate memory for the log
    char *log = (char *) malloc(log_size);

    // Get the log
    clGetProgramBuildInfo(clProgram, device_id, CL_PROGRAM_BUILD_LOG, log_size, log, 

 NULL);

    // Print the log
    printf("%s\n", log);
}
clKernel = clCreateKernel(clProgram,"distance_finding", &errcode);

// Launch OpenCL kernel
size_t localWorkSize[1], globalWorkSize[1];
if(num_instances >= 500)
{
    localWorkSize[0] = 500;
    float block1=num_instances/localWorkSize[0];
    int block= (int)(ceil(block1));
    globalWorkSize[0] = block*localWorkSize[0];
}
else
{
    localWorkSize[0]=num_instances;
    globalWorkSize[0]=num_instances;
}

int iteration=1;
while(iteration < MAX_ITERATIONS)
{
    errcode = clEnqueueWriteBuffer(clCommandQue,d_centroids , CL_TRUE, 0, 
 mem_size_c, (void*)centroids->data, 0, NULL, NULL);
    errcode = clEnqueueWriteBuffer(clCommandQue,d_distance , CL_TRUE, 0, mem_size_d, 

 (void*)distance->data, 0, NULL, NULL);

    //set kernel arguments
    errcode = clSetKernelArg(clKernel, 0,sizeof(cl_mem), (void *)&d_instances);
    errcode = clSetKernelArg(clKernel, 1,sizeof(cl_mem), (void *)&d_centroids);
    errcode = clSetKernelArg(clKernel, 2,sizeof(cl_mem), (void *)&d_distance);
    errcode = clSetKernelArg(clKernel, 3,sizeof(unsigned int), (void *)

  &num_instances);
    errcode = clSetKernelArg(clKernel,4,sizeof(unsigned int),(void *)&clusters);
    errcode = clSetKernelArg(clKernel,5,sizeof(unsigned int),(void *)&dimensions);

    errcode = clEnqueueNDRangeKernel(clCommandQue,clKernel, 1, NULL, 
  globalWorkSize,localWorkSize, 0, NULL, &myEvent);

    clFinish(clCommandQue); // wait for all events to finish
    clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_START,sizeof(cl_ulong), 

 &startTime, NULL);
    clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END,sizeof(cl_ulong), 

 &endTime, NULL);
    kernelExecTimeNs = endTime-startTime;
    gpu_time+= kernelExecTimeNs;

    // Retrieve result from device
    errcode = clEnqueueReadBuffer(clCommandQue,d_distance, CL_TRUE, 0, 
 mem_size_d,distance->data, 0, NULL, NULL);

Printing the time in ms

printf("\n\n Time taken by GPU is %llu ms",gpu_time/1000000);

If the way I am calculating the GPU timing is wrong, why would it work on a CPU (by changing to CL_DEVICE_TYPE_CPU)? What is wrong here?

Edited:

System Information

AMD APP SDK 2.4 AMD ATI FirePro GL 3D, having 800 cores

Kernel

 #pragma OPENCL EXTENSION cl_khr_fp64:enable
// Euclidean distance between one data point and one centroid.
// cent:       centroid coordinates staged in local memory
// data:       the point's coordinates (private array)
// dimensions: number of coordinates to compare
// The sum of squares is accumulated in float; the result is widened to
// double on return, exactly as in the original.
double distance_cal(__local float* cent,float* data,int dimensions)
{
    float sum_sq = 0.00;
    for (int d = 0; d < dimensions; d++)
    {
        float diff = data[d] - cent[d];
        sum_sq += diff * diff;
    }
    double result = sqrt(sum_sq);
    return result;
}
// Gather `dimension` elements from x into the contiguous array data,
// starting at index `col` and striding by `len` between reads.
// (Presumably this extracts one column of a matrix stored with `len`
// entries per dimension — confirm against the host-side layout.)
void fetch_col(float* data,__constant float* x,int col,int dimension,int len)
{
    int src = col;
    for (int d = 0; d < dimension; d++)
    {
        data[d] = x[src];
        src += len;
    }
}
// Same strided gather as fetch_col, but reads from global memory and
// writes into a local-memory buffer: copies `dimension` elements from x,
// starting at index `col`, striding by `len` between reads.
void fetch_col_cen(__local float* data,__global float* x,int col,int dimension,int len)
{
    int src = col;
    for (int d = 0; d < dimension; d++)
    {
        data[d] = x[src];
        src += len;
    }
}


 /*
  * For each data point (one work-item per point) compute its Euclidean
  * distance to every one of `clus` centroids and store it in dist, where
  * cluster i's distances occupy dist[i*inst .. i*inst + inst - 1].
  *
  * Fixes relative to the original:
  *  - The original placed barrier() BEFORE writing cent, so work-items
  *    could read a partially written centroid; the barrier must follow
  *    the write. A second barrier now also guards against overwriting
  *    cent while the previous iteration is still being read.
  *  - Every work-item used to write cent concurrently (a race on local
  *    memory); now a single work-item per group fills it.
  *  - The host pads the global size up to a multiple of the work-group
  *    size, so work-items with idx >= inst used to read and write global
  *    memory out of bounds; they are now masked off (but still reach the
  *    barriers, as all work-items in a group must).
  *  - __local variables belong at kernel scope in OpenCL 1.x, so cent is
  *    hoisted out of the loop.
  */
 __kernel void distance_finding(__constant float* data,__global float* cen,__global float* dist,int inst,int clus,const int dimensions)
 {
    int idx = get_global_id(0);

    // NOTE(review): both staging arrays hold at most 4 values — this
    // assumes dimensions <= 4; confirm against the host code.
    float data_col[4];
    __local float cent[4];

    // Padded work-items must not touch global memory.
    int active = (idx < inst);
    if (active)
        fetch_col(data_col, data, idx, dimensions, inst);

    for (int i = 0; i < clus; i++)
    {
        int k = i * inst; // row offset of cluster i within dist

        // Wait until nobody is still reading cent from the previous
        // iteration before it is overwritten.
        barrier(CLK_LOCAL_MEM_FENCE);

        // One writer per work-group fills the shared centroid buffer.
        if (get_local_id(0) == 0)
            fetch_col_cen(cent, cen, i, dimensions, clus);

        // Publish cent to the whole work-group before anyone reads it.
        barrier(CLK_LOCAL_MEM_FENCE);

        if (active)
            dist[idx + k] = distance_cal(cent, data_col, dimensions);
    }
 }

Upvotes: 0

Views: 2154

Answers (3)

javacom
javacom

Reputation: 43

change to

clFinish(clCommandQue); // wait for all events to finish

// add this after clFinish()
// Ensure kernel execution is finished
clWaitForEvents(1 , &myEvent);

..

double gpu_time = endTime-startTime;

..

printf("\n\n Time taken by GPU is %0.3f ms", gpu_time/1000000.0);

Upvotes: 0

Eyad Ebrahim
Eyad Ebrahim

Reputation: 991

clEnqueueNDRangeKernel() is asynchronous when it targets the GPU, and therefore you only see the time it took to enqueue the request, not the time it took to execute it.

That said, I could be wrong, but I usually write C++ code to do the timing, putting the start_time before the instruction and the end_time after the

clFinish(cmd_queue); 

just like you would with C++ timing code. That would be a good sanity check, if you're sure your GPU shouldn't be finishing in 0 seconds.

Upvotes: 1

Ani
Ani

Reputation: 10906

An easy way to check would be to introduce an abnormally long operation inside the kernel. If THAT shows up as zero when there is a perceptible lag in actual execution — then you have your answer.

That said, I believe (even though the indicated thread is for Linux, it probably holds water on Windows too) you might need to install the instrumented drivers to even have the system write to the performance counters. You can also use the CUDA profiler on nVidia's OpenCL implementation because it sits on top of CUDA.

Upvotes: 0

Related Questions