Reputation: 33669
I created a real-time ray tracer in OpenCL. It was developed on a GTX 580. I stopped working on it for a few years and recently resurrected it. I expected that with newer and "better" Nvidia GPUs it would run even faster. However, it still runs fastest on the GTX 580.
Here is a table of kernel times for the benchmark scene I use, on three different computers and graphics cards:
GPU         Kernel time   CPU                       OS           System memory
GTX 580     11 ms         E5-1670                   Windows 7    32 GB
GTX Titan   15 ms         W5580 (two processors)    Windows 7    48 GB
GTX 980M    15 ms         i7-4710HQ (laptop)        Windows 10   16 GB
Each computer had Nvidia driver 361.43 (installed Jan. 10, 2016), and the host code was compiled with Visual Studio 2013 in 64-bit release mode.
I also observe a faster frame rate on the GTX 580.
I used
time_end = clevent.getProfilingInfo<CL_PROFILING_COMMAND_END>();
time_start = clevent.getProfilingInfo<CL_PROFILING_COMMAND_START>();
to get the kernel time. I don't use the double-precision floating-point extension (the line #pragma OPENCL EXTENSION cl_khr_fp64 : enable stays commented out).
The kernel code is broken up into several files, which I assemble into a single source of several thousand lines.
Why is my kernel slower on newer and "better" hardware?
Here is the code where I create the context. It won't all make sense in isolation, but it's probably better than nothing:
void Contexts::init(string sourceCode) {
    run_time = -1;
    context = createCLContext(type, vendor);
    cl_uint uiNumSupportedFormats = 0;
    devices = context.getInfo<CL_CONTEXT_DEVICES>();
    int err = 0;
    try {
        //queues.push_back(cl::CommandQueue(context, devices[i], 0, &err));
        //queue = cl::CommandQueue(context, devices[device], CL_QUEUE_PROFILING_ENABLE, &err);
        queue = cl::CommandQueue(context, devices[device], CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
        //printf("\t\tDevice: %s\n", devices[device].getInfo<CL_DEVICE_NAME>().c_str());
    }
    catch (cl::Error er) {
        printf("ERROR: %s(%d)\n", er.what(), er.err());
    }
    //ndevices = devices.size();
    //if(ndevices>max_devices) ndevices = max_devices;
    program = buildProgramFromSource(context, sourceCode);
    try {
        kernel1 = cl::Kernel(program, "trace", &err);
        kernel2 = cl::Kernel(program, "transform_primitives", &err);
        kernel_postprocess = cl::Kernel(program, "post_process", &err);
    }
    catch (cl::Error er) {
        printf("ERROR: %s(%d)\n", er.what(), er.err());
    }
}
cl::Buffer Contexts::copy_buffer(int size, const void* ptr, int flags = CL_MEM_READ_ONLY) {
    cl::Buffer out;
    if (size > 0) {
        out = cl::Buffer(context, flags | CL_MEM_COPY_HOST_PTR, size, (void*)ptr);
    }
    else {
        //NULL pointers to kernel do not seem to work on INTEL so use this hack
        out = cl::Buffer(context, flags, 1, NULL);
    }
    return out;
}
void Contexts::copy_buffers() {
    //int cubemap_size = para->cubemap->sizeX * para->cubemap->sizeY * 6 * para->cubemap->ncubemap;
    //if(para->cubemap->sizeX == -1) cubemap_size = 0;
    int nobj = para->kernel1_parameters.nobj;
    int nprim = para->kernel1_parameters.nprim;
    int nmat = para->kernel1_parameters.nmat;
    int nlight = para->kernel1_parameters.nlight;
    int nnode = para->kernel1_parameters.nnode;
    int nmap = para->nmaps;
    int err = 0;
    int npixels = para->kernel1_parameters.height*para->kernel1_parameters.width;
    int exposure_samples = para->kernel1_parameters.exposure_samples;
    int mask_size = para->kernel1_parameters.mask_size;
    int nmask = (2*mask_size+1)*(2*mask_size+1);
    cl_objects_mem = copy_buffer(sizeof(CSG_object)*nobj, para->objects);
    cl_node_mem = copy_buffer(sizeof(Node)*nnode, para->nodes);
    cl_prim_mem = copy_buffer(sizeof(Primitive)*nprim, para->prims, CL_MEM_READ_WRITE);
    cl_light_mem = copy_buffer(sizeof(Light)*nlight, para->lights);
    cl_mat_mem = copy_buffer(sizeof(Material)*nmat, para->mats);
    cubemap_info = copy_buffer(sizeof(Cubemap_info)*nmap, para->maps);
    cubemap_images = copy_buffer(sizeof(cl_uchar4)*para->envmap_npixels, para->envmap_images);
    cl_mask_mem = copy_buffer(sizeof(cl_float)*nmask, para->mask);
    cl_image_mem = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uchar4)*npixels, NULL, &err);
    cl_results_mem = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(cl_float4)*npixels, NULL, &err);
    cl_luminance = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float)*exposure_samples, NULL, &err);
    if (para->surfacecpy_sw) {
        cmPinnedBufOut1 = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_uchar4)*npixels, NULL, NULL);
        image = (int*)queue.enqueueMapBuffer(cmPinnedBufOut1, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_uchar4)*npixels, 0, NULL, NULL);
        //queue.enqueueUnmapMemObject(cmPinnedBufOut1, image);
        //int pageSize = 4096;
        //image = (int*) _aligned_malloc(sizeof(cl_uchar4)*npixels, pageSize);
        //CL_MEM_USE_PERSISTENT_MEM_AMD
    }
    cmPinnedBufOut2 = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_float)*exposure_samples, NULL, NULL);
    luminance = (float*)queue.enqueueMapBuffer(cmPinnedBufOut2, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_float)*exposure_samples, 0, NULL, NULL);
    queue.finish();
    //int kindex = 0;
    kernel1.setArg(0, cl_objects_mem);
    kernel1.setArg(1, cl_node_mem);
    kernel1.setArg(2, cl_prim_mem);
    kernel1.setArg(3, cl_mat_mem);
    kernel1.setArg(4, cl_light_mem);
    kernel1.setArg(5, cubemap_info);
    kernel1.setArg(6, cubemap_images);
    kernel1.setArg(7, cl_results_mem);
    kernel_postprocess.setArg(0, cl_results_mem);
    kernel_postprocess.setArg(1, cl_luminance);
    kernel_postprocess.setArg(2, cl_image_mem);
    kernel_postprocess.setArg(3, cl_mask_mem);
    kernel2.setArg(0, cl_prim_mem);
}
void Contexts::run() {
    int nprim = para->kernel2_parameters.nprim;
    cl_float speed = para->kernel2_parameters.speed;
    cl_float4 speed_obj = para->kernel2_parameters.speed_obj;
    cl_float16 cl_viewTransform;
    for (int i = 0; i < 16; i++)
        cl_viewTransform.s[i] = para->viewTransform[i];
    //para->kernel1_parameters.offset = offset;
    //para->kernel1_parameters.offset2 = offset2;
    kernel1.setArg(8, cl_viewTransform);
    kernel1.setArg(9, para->kernel1_parameters);
    kernel1.setArg(10, offset);
    kernel_postprocess.setArg(4, para->kernel1_parameters);
    kernel_postprocess.setArg(5, offset);
    kernel_postprocess.setArg(6, offset2);
    //kernel1.setArg(11, offset2);
    cl::NDRange local_size = cl::NDRange(local_work_size);
    if (local_work_size == 0) {
        local_size = cl::NullRange;
    }
    queue.enqueueNDRangeKernel(kernel1, cl::NullRange, cl::NDRange(size), local_size, NULL, &clevent);
    queue.finish();
    cl_ulong time_start, time_end;
    time_end = clevent.getProfilingInfo<CL_PROFILING_COMMAND_END>();
    time_start = clevent.getProfilingInfo<CL_PROFILING_COMMAND_START>();
    run_time = (float)(time_end - time_start);
    //post_process
    queue.enqueueNDRangeKernel(kernel_postprocess, cl::NullRange, cl::NDRange(size), local_size, NULL, &clevent);
    queue.finish();
    time_end = clevent.getProfilingInfo<CL_PROFILING_COMMAND_END>();
    time_start = clevent.getProfilingInfo<CL_PROFILING_COMMAND_START>();
    run_time += (float)(time_end - time_start);
    //printf("run time %f, run time2 %f\n", run_time, run_time2);
    //kernel2
    kernel2.setArg(1, speed);
    kernel2.setArg(2, speed_obj);
    queue.enqueueNDRangeKernel(kernel2, cl::NullRange, cl::NDRange(nprim), cl::NullRange, NULL, &clevent);
    queue.finish();
    time_end = clevent.getProfilingInfo<CL_PROFILING_COMMAND_END>();
    time_start = clevent.getProfilingInfo<CL_PROFILING_COMMAND_START>();
    run_time += (float)(time_end - time_start);
    if (para->getoutput_sw) {
        if (!para->surfacecpy_sw) {
            if (SDL_MUSTLOCK(para->surface)) {
                if (SDL_LockSurface(para->surface) < 0) return;
            }
            queue.enqueueReadBuffer(cl_image_mem, CL_TRUE, 0, sizeof(cl_uchar4)*size, (int*)para->surface->pixels + offset, NULL, &clevent);
            queue.finish();
            if (SDL_MUSTLOCK(para->surface))
                SDL_UnlockSurface(para->surface);
        }
        else {
            queue.enqueueReadBuffer(cl_image_mem, CL_TRUE, 0, sizeof(cl_uchar4)*size, (int*)image, NULL, &clevent);
            queue.finish();
        }
        queue.enqueueReadBuffer(cl_luminance, CL_TRUE, 0, sizeof(cl_float)*size2, luminance, NULL, &clevent);
        queue.finish();
    }
}
Upvotes: 0
Views: 658
Reputation: 1615
I can't provide a specific answer - there have been major changes to the streaming multiprocessor design between the GTX 580 (Fermi) and the GTX 980 (Maxwell). At the least, you'll probably need to find new optimal local and global work-group sizes.
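A quick way to re-tune is an empirical sweep. A minimal sketch, reusing queue, kernel1, and size from the code in the question (its queue already has profiling enabled, and kernel1's arguments are assumed to be set):

// Sketch: time kernel1 at several candidate local work-group sizes using
// event profiling and keep the fastest. Candidates that don't divide the
// global size evenly are skipped.
cl::Event ev;
int best_local = 0;
double best_ms = 1e30;
for (int local = 32; local <= 1024; local *= 2) {
    if (size % local != 0) continue;  // global size must be a multiple of local
    queue.enqueueNDRangeKernel(kernel1, cl::NullRange, cl::NDRange(size), cl::NDRange(local), NULL, &ev);
    ev.wait();
    double ms = (ev.getProfilingInfo<CL_PROFILING_COMMAND_END>() -
                 ev.getProfilingInfo<CL_PROFILING_COMMAND_START>()) * 1e-6;
    if (ms < best_ms) { best_ms = ms; best_local = local; }
}
printf("best local size: %d (%.3f ms)\n", best_local, best_ms);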
What I would suggest is using NVIDIA's profiling tools, as they do still work for OpenCL. Take a look at this post by @jrprice for detailed instructions. Once you have the profiling data logged, you can import it into the Visual Profiler and inspect your kernels' register and local memory usage and occupancy.
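For what it's worth, the legacy command-line profiler described there is driven by environment variables that the driver reads when the OpenCL context is created. A sketch of setting them from the host, to be called at the very top of main() (variable names per the linked post; setting them in the launching shell works just as well):

#include <cstdlib>

// Sketch: enable Nvidia's legacy command-line profiler for OpenCL. The
// variables are read at context creation, so call this before any OpenCL call.
void enable_nv_opencl_profiler() {
    _putenv("COMPUTE_PROFILE=1");                       // turn the profiler on
    _putenv("COMPUTE_PROFILE_CSV=1");                   // CSV output for Visual Profiler import
    _putenv("COMPUTE_PROFILE_LOG=opencl_profile.log");  // log file name
}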
Upvotes: 1
Reputation: 8484
(off the top of my head)
CUDA/PTX can be generated as 32-bit or 64-bit.
The OpenCL compiler generates, by default:
- 32-bit PTX for a 32-bit host application
- 64-bit PTX for a 64-bit host application
Your GPUs are:
- GTX 580: compute capability 2.0 (Fermi)
- GTX Titan: compute capability 3.5 (Kepler)
- GTX 980M: compute capability 5.2 (Maxwell)
You can output the generated PTX to double-check; even without PTX knowledge it should be apparent whether the code is 32-bit or 64-bit and which compute capability it targets.
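On Nvidia's OpenCL implementation the "binary" returned for a built program is the PTX text itself, so a sketch like this can dump it (assumes a single device and the question's cl::Program; error checking omitted). Look for the .address_size and .target lines near the top:

#include <cstdio>
#include <vector>

// Sketch: write the first device's program "binary" (PTX text on Nvidia) to a
// file. Grep it for ".address_size 32"/".address_size 64" and ".target sm_xx".
void dump_ptx(cl::Program& program) {
    size_t sz = 0;
    clGetProgramInfo(program(), CL_PROGRAM_BINARY_SIZES, sizeof(sz), &sz, NULL);
    std::vector<char> ptx(sz + 1, 0);
    char* bufs[1] = { ptx.data() };
    clGetProgramInfo(program(), CL_PROGRAM_BINARIES, sizeof(bufs), bufs, NULL);
    FILE* f = fopen("kernel.ptx", "wb");
    fwrite(ptx.data(), 1, sz, f);
    fclose(f);
}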
You are probably experiencing higher register usage because of the switch to 64-bit PTX - have a look at the CUDA occupancy calculator to check whether a slowdown is expected. If that confirms it, you will need to fine-tune your kernel.
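One way to see the register counts is the build log. A sketch, assuming the driver exposes the cl_nv_compiler_options extension (program and devices as in the question's code):

// Sketch: rebuild with Nvidia's verbose flag and print the build log, which
// then contains lines like "ptxas info : Used NN registers" for each kernel.
void print_register_usage(cl::Program& program, std::vector<cl::Device>& devices) {
    try {
        program.build(devices, "-cl-nv-verbose");
    }
    catch (cl::Error er) {
        // fall through; the log below also shows any compile errors
    }
    std::string log = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]);
    printf("%s\n", log.c_str());
}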
Upvotes: 2
Reputation: 209
Have you enabled out-of-order execution? I have faced a similar issue on Nvidia GPUs when doing basic image processing.
The code runs slower when you create your command queue as in-order, but if the command queue is created as out-of-order, which Nvidia GPUs support, execution is dramatically faster.
Refer to the API
cl_command_queue clCreateCommandQueue( cl_context context, cl_device_id device, cl_command_queue_properties properties, cl_int *errcode_ret)
https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clCreateCommandQueue.html
The cl_command_queue_properties argument should be set to CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE.
But make sure that you have no implicit data dependencies between kernels, because with an out-of-order queue you can no longer rely on enqueue order alone.
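Where dependencies do exist, they can be expressed explicitly with events instead of queue order. A minimal sketch with the cl.hpp bindings the question uses (context, device, kernelA, kernelB, and n are placeholders):

// Sketch: out-of-order queue; ordering between dependent kernels is expressed
// with events rather than by queue order.
cl::CommandQueue q(context, device,
    CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE);
cl::Event produced;
q.enqueueNDRangeKernel(kernelA, cl::NullRange, cl::NDRange(n), cl::NullRange, NULL, &produced);
std::vector<cl::Event> deps(1, produced);  // kernelB consumes kernelA's output
q.enqueueNDRangeKernel(kernelB, cl::NullRange, cl::NDRange(n), cl::NullRange, &deps, &produced);
q.finish();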
Also, make sure you query the number of compute units and choose your global and local work sizes accordingly.
For example, my Nvidia GPU has 4 compute units, so for best performance my global work size should be divisible by 4 and my local work size should be an integral multiple of 4. A sketch of those queries follows.
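With the cl.hpp bindings (devices[device] and kernel1 as in the question's code); note that on Nvidia the preferred work-group size multiple reported is typically the warp size, 32:

// Sketch: query sizing hints for a kernel/device pair.
cl_uint cus = devices[device].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
size_t preferred = kernel1.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(devices[device]);
size_t max_wg = kernel1.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(devices[device]);
printf("compute units: %u, preferred local multiple: %u, max work-group: %u\n",
       cus, (unsigned)preferred, (unsigned)max_wg);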
Upvotes: 2