Reputation: 20396
I've written a program designed to test the performance of every single compute device registered with the OS. The Graphics Card is an AMD Radeon HD 5450. On my computer in particular, these are the four devices found:
On Devices 0, 1, and 3, when executing the kernels listed below, they all vary significantly in terms of their overall speed, but nominally all finish within expected magnitudes of differences in terms of direct comparisons. I've put the results for the Intel CPU on the Intel Platform on Pastebin.
But with Device 2, not only is execution rather dramatically slower than the other devices (for only the integral types, I might add!) it's slower by absurd orders of magnitude, despite the fact that it's supposedly using the same device (the Intel CPU) as the Intel Platform is using, which has no such issues. See this pastebin.
The notable outlying times (the ones responsible for the massive slowdown) all involve the vectorized versions of my kernels, and only the binary and ternary variants — the unary versions are unaffected. Something about the AMD platform driving an Intel CPU seems deeply incompatible.
Does anyone have any idea what's going on? I've included my full complete code below, in case it has anything to do with the underlying problem.
Executor.hpp
#pragma once
#define CL_HPP_ENABLE_EXCEPTIONS
#pragma warning(disable : 4996)
#include<CL\cl2.hpp>
class buffers {
	cl::Buffer a, b, c;
	cl::Buffer output;
	size_t size;
	// Builds the four device buffers shared by every kernel of one element type:
	//   a = [0, 1, 2, ...], b = a * 3, c = b / 10, output = write-only scratch.
	// Each buffer holds size * 16 scalars so the widest (16-lane) vector kernels
	// can still run `size` work-items.
	template<typename T>
	buffers(cl::Context const& context, size_t size, T) :
		size(size) {
		std::vector<T> values(size * 16);
		// BUG FIX: this loop previously stopped at `size`, leaving the remaining
		// 15/16ths of every buffer value-initialized to zero. The vectorized
		// kernels then divided by those zeros, which is undefined behavior for
		// integer types in OpenCL C. Initialize every element instead.
		for (size_t i = 0; i < size * 16; i++)
			values[i] = T(i);
		a = cl::Buffer(context, values.begin(), values.end(), true);
		for (auto & val : values)
			val *= 3;
		b = cl::Buffer(context, values.begin(), values.end(), true);
		for (auto & val : values)
			val /= 10;
		c = cl::Buffer(context, values.begin(), values.end(), true);
		output = cl::Buffer(context, CL_MEM_WRITE_ONLY, size * 16 * sizeof(T));
	}
public:
	// Factory; the dummy T(0) argument only selects the element type.
	template<typename T>
	static buffers make_buffer(cl::Context const& context, size_t size) {
		return buffers(context, size, T(0));
	}
	cl::Buffer get_a() const {
		return a;
	}
	cl::Buffer get_b() const {
		return b;
	}
	cl::Buffer get_c() const {
		return c;
	}
	cl::Buffer get_output() const {
		return output;
	}
	// Number of work-items per kernel launch (NOT the scalar element count,
	// which is size * 16).
	size_t get_size() const {
		return size;
	}
};
class task {
	cl::CommandQueue queue;
	cl::Kernel kernel;
	buffers * b;           // non-owning; the buffers object must outlive the task
	std::string type_name; // element type label ("char", "float", ...) for reports
public:
	// Binds the kernel's arguments by parameter *name* ("a", "b", "c", "output")
	// so the same code handles unary, binary, and ternary kernels.
	// NOTE(review): CL_KERNEL_ARG_NAME is only guaranteed when the program was
	// built with "-cl-kernel-arg-info"; some runtimes throw
	// CL_KERNEL_ARG_INFO_NOT_AVAILABLE without it — confirm on every platform.
	task(cl::CommandQueue queue, cl::Kernel kernel, buffers * b, std::string const& type_name) :
		queue(queue),
		kernel(kernel),
		b(b),
		type_name(type_name) {
		// Set arguments through the member, not the constructor parameter.
		// (cl::Kernel copies share one cl_kernel handle, so the old code worked
		// by accident; being explicit removes the reliance on that detail.)
		int argc = this->kernel.getInfo<CL_KERNEL_NUM_ARGS>();
		for (int i = 0; i < argc; i++) {
			std::string arg_name = this->kernel.getArgInfo<CL_KERNEL_ARG_NAME>(i);
			if (arg_name == "a")
				this->kernel.setArg(i, b->get_a());
			else if (arg_name == "b")
				this->kernel.setArg(i, b->get_b());
			else if (arg_name == "c")
				this->kernel.setArg(i, b->get_c());
			else if (arg_name == "output")
				this->kernel.setArg(i, b->get_output());
		}
	}
	// Launches the kernel over get_size() work-items and returns its event
	// (the queue must have profiling enabled for time_event to work).
	cl::Event enqueue() {
		cl::Event event;
		queue.enqueueNDRangeKernel(kernel, {}, cl::NDRange{ b->get_size() }, {}, nullptr, &event);
		return event;
	}
	cl::Kernel get_kernel() const {
		return kernel;
	}
	std::string get_type_name() const {
		return type_name;
	}
	// Blocks until the event completes, then returns the device-side duration
	// (COMMAND_END - COMMAND_START, already in nanoseconds per the OpenCL spec).
	static std::chrono::nanoseconds time_event(cl::Event event) {
		event.wait();
		return std::chrono::nanoseconds{
			event.getProfilingInfo<CL_PROFILING_COMMAND_END>() -
			event.getProfilingInfo<CL_PROFILING_COMMAND_START>() };
	}
};
TaskGenerator.hpp
#pragma once
#include "Executor.hpp"
#include<iostream>
#include<fstream>
//Assembles OpenCL source at C++ compile time: every .cl file is a raw string
//literal, so the #include directives below splice those literals into one
//concatenated source string (adjacent string literals merge).
class kernel_generator {
public:
//Arithmetic (*, /, +) transform source: process macros + wrappers + kernels.
static std::string get_primary_kernels() {
return ""
#include "Primary Transform.cl"
#include "Transformations.cl"
#include "Transform Kernels.cl"
;
}
//Trigonometric (sin/cos/tan) transform source.
static std::string get_trigonometric_kernels() {
return ""
#include "Trigonometry Transform.cl"
#include "Transformations.cl"
#include "Transform Kernels.cl"
;
}
//log/hypot/tgamma transform source.
static std::string get_utility_kernels() {
return ""
#include "Utility Transform.cl"
#include "Transformations.cl"
#include "Transform Kernels.cl"
;
}
private:
//Compiles `src` in `context` and returns one cl::Kernel per kernel function.
//On build failure, prints to stderr, writes the build log to err_log, and
//returns an empty vector so callers can continue with the other types.
//NOTE(review): task later queries CL_KERNEL_ARG_NAME; some runtimes expose
//arg info only when built with "-cl-kernel-arg-info" — confirm this plain
//build() works on every tested platform.
static std::vector<cl::Kernel> get_kernels(std::string src, cl::Context context, cl::Device device, std::ostream & err_log) {
try {
cl::Program program{ context, src, false };
program.build();
std::vector<cl::Kernel> kernels;
program.createKernels(&kernels);
return kernels;
}
catch (cl::BuildError const& e) {
std::cerr << "Unable to build kernels for " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
err_log << "Build Log:\n";
auto log = e.getBuildLog();
for (auto const& log_p : log) {
err_log << log_p.second << "\n";
}
return{};
}
}
public:
//Integer types get only the arithmetic transforms (sin/log/etc. are not
//defined for integer types in OpenCL C).
static std::vector<cl::Kernel> get_char_kernels(cl::Context context, cl::Device device, std::ostream & err_log) {
std::string src = ""
#include "char_defines.cl"
+ get_primary_kernels();
return get_kernels(src, context, device, err_log);
}
static std::vector<cl::Kernel> get_short_kernels(cl::Context context, cl::Device device, std::ostream & err_log) {
std::string src = ""
#include "short_defines.cl"
+ get_primary_kernels();
return get_kernels(src, context, device, err_log);
}
static std::vector<cl::Kernel> get_int_kernels(cl::Context context, cl::Device device, std::ostream & err_log) {
std::string src = ""
#include "int_defines.cl"
+ get_primary_kernels();
return get_kernels(src, context, device, err_log);
}
static std::vector<cl::Kernel> get_long_kernels(cl::Context context, cl::Device device, std::ostream & err_log) {
std::string src = ""
#include "long_defines.cl"
+ get_primary_kernels();
return get_kernels(src, context, device, err_log);
}
//Floating-point types get all three transform families, compiled as three
//separate programs whose kernels are concatenated into one list.
static std::vector<cl::Kernel> get_float_kernels(cl::Context context, cl::Device device, std::ostream & err_log) {
std::string src = ""
#include "float_defines.cl"
+ get_primary_kernels();
std::vector<cl::Kernel> primary_kernels = get_kernels(src, context, device, err_log);
src = ""
#include "float_defines.cl"
+ get_utility_kernels();
std::vector<cl::Kernel> utility_kernels = get_kernels(src, context, device, err_log);
src = ""
#include "float_defines.cl"
+ get_trigonometric_kernels();
std::vector<cl::Kernel> trig_kernels = get_kernels(src, context, device, err_log);
std::vector<cl::Kernel> final_kernels;
final_kernels.insert(final_kernels.end(), primary_kernels.begin(), primary_kernels.end());
final_kernels.insert(final_kernels.end(), utility_kernels.begin(), utility_kernels.end());
final_kernels.insert(final_kernels.end(), trig_kernels.begin(), trig_kernels.end());
return final_kernels;
}
//Same as get_float_kernels; double_defines.cl additionally enables cl_khr_fp64.
static std::vector<cl::Kernel> get_double_kernels(cl::Context context, cl::Device device, std::ostream & err_log) {
std::string src = ""
#include "double_defines.cl"
+ get_primary_kernels();
std::vector<cl::Kernel> primary_kernels = get_kernels(src, context, device, err_log);
src = ""
#include "double_defines.cl"
+ get_utility_kernels();
std::vector<cl::Kernel> utility_kernels = get_kernels(src, context, device, err_log);
src = ""
#include "double_defines.cl"
+ get_trigonometric_kernels();
std::vector<cl::Kernel> trig_kernels = get_kernels(src, context, device, err_log);
std::vector<cl::Kernel> final_kernels;
final_kernels.insert(final_kernels.end(), primary_kernels.begin(), primary_kernels.end());
final_kernels.insert(final_kernels.end(), utility_kernels.begin(), utility_kernels.end());
final_kernels.insert(final_kernels.end(), trig_kernels.begin(), trig_kernels.end());
return final_kernels;
}
};
Kernel Testing.cpp (main)
#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#pragma warning(disable : 4996)
#include<CL\cl2.hpp>
#include<iostream>
#include<iomanip>
#include<chrono>
#include<fstream>
#include<sstream>
#include<filesystem>
#include "TaskGenerator.hpp"
namespace filesystem = std::experimental::filesystem;
// Writes a short human-readable summary of one platform/device pair to `out`.
void print_device_info(std::ostream & out, cl::Platform platform, cl::Device device) {
	// Left-pad each label into a fixed 20-column field so values line up.
	auto field = [&out](char const* label) -> std::ostream& {
		return out << std::setw(20) << std::left << label;
	};
	field("Platform: ") << platform.getInfo<CL_PLATFORM_NAME>() << "\n";
	field("Name: ") << device.getInfo<CL_DEVICE_NAME>() << "\n";
	field("Memory Size: ") << device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() << "\n";
	bool const is_gpu = device.getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU;
	field("Device Type: ") << (is_gpu ? "GPU" : "CPU") << "\n";
}
// Runs every generated kernel once on `device`, profiling each execution on
// the device's own timer and writing the results (plus any kernel build logs)
// to `output_file_path`.
void test_device(cl::Platform platform, cl::Device device, filesystem::path output_file_path) {
	std::ofstream out(output_file_path);
	print_device_info(out, platform, device);
	cl::Context context(device);
	// Profiling must be enabled here or task::time_event cannot read timestamps.
	cl::CommandQueue queue{ context, CL_QUEUE_PROFILING_ENABLE };
	size_t size = 100'000;
	buffers char_buffers = buffers::make_buffer<cl_char>(context, size);
	buffers short_buffers = buffers::make_buffer<cl_short>(context, size);
	buffers int_buffers = buffers::make_buffer<cl_int>(context, size);
	buffers long_buffers = buffers::make_buffer<cl_long>(context, size);
	buffers float_buffers = buffers::make_buffer<cl_float>(context, size);
	buffers double_buffers = buffers::make_buffer<cl_double>(context, size);
	auto char_kernels = kernel_generator::get_char_kernels(context, device, out);
	auto short_kernels = kernel_generator::get_short_kernels(context, device, out);
	auto int_kernels = kernel_generator::get_int_kernels(context, device, out);
	auto long_kernels = kernel_generator::get_long_kernels(context, device, out);
	auto float_kernels = kernel_generator::get_float_kernels(context, device, out);
	std::vector<cl::Kernel> double_kernels = kernel_generator::get_double_kernels(context, device, out);
	std::vector<task> tasks;
	for (auto & kernel : char_kernels) {
		tasks.emplace_back(queue, kernel, &char_buffers, "char");
	}
	for (auto & kernel : short_kernels) {
		tasks.emplace_back(queue, kernel, &short_buffers, "short");
	}
	for (auto & kernel : int_kernels) {
		tasks.emplace_back(queue, kernel, &int_buffers, "int");
	}
	for (auto & kernel : long_kernels) {
		tasks.emplace_back(queue, kernel, &long_buffers, "long");
	}
	for (auto & kernel : float_kernels) {
		tasks.emplace_back(queue, kernel, &float_buffers, "float");
	}
	for (auto & kernel : double_kernels) {
		tasks.emplace_back(queue, kernel, &double_buffers, "double");
	}
	std::vector<cl::Event> events;
	for (auto & task : tasks) {
		// BUG FIX: each task used to be enqueued TWICE here (the second event
		// was discarded), doubling the device work and skewing the timings.
		std::cout << "Enqueueing " << task.get_kernel().getInfo<CL_KERNEL_FUNCTION_NAME>()
			<< "(" << task.get_type_name() << ")" << "\n";
		events.emplace_back(task.enqueue());
	}
	out << "==========================================\n\nProfiling Results:\n\n";
	for (size_t i = 0; i < events.size(); i++) {
		// time_event waits on the event itself; no separate wait() is needed.
		auto duration = task::time_event(events[i]);
		std::cout << "Task " << (i + 1) << " of " << events.size() << " complete.\r";
		std::string task_name = tasks[i].get_kernel().getInfo<CL_KERNEL_FUNCTION_NAME>();
		task_name.append("(" + tasks[i].get_type_name() + ")");
		out << " " << std::setw(40) << std::right << task_name;
		out << ": " << std::setw(12) << std::right << duration.count() << "ns\n";
	}
}
// Enumerates every OpenCL platform/device pair on the machine and benchmarks
// each one, writing a per-device report to "Device N.txt".
int main() {
	std::vector<cl::Platform> platforms;
	cl::Platform::get(&platforms);
	int device_index = 0;
	for (auto & platform : platforms) {
		std::vector<cl::Device> devices;
		platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
		for (auto & device : devices) {
			auto report_name = "Device " + std::to_string(device_index++) + ".txt";
			test_device(platform, device, report_name);
		}
	}
	system("pause"); // Windows-only: keeps the console window open.
	return 0;
}
char_defines.cl, short_defines.cl, int_defines.cl, long_defines.cl, float_defines.cl, double_defines.cl
R"D(
//Type aliases consumed by Transformations.cl / Transform Kernels.cl.
//Each *_defines.cl file maps Scalar/VectorN onto one OpenCL element type.
typedef char Scalar;
typedef char2 Vector2;
typedef char4 Vector4;
typedef char8 Vector8;
typedef char16 Vector16;
)D"
//All the other defines files are identical, but with their respective types swapped in.
//Only exception is double_defines.cl, which also has '#pragma OPENCL EXTENSION cl_khr_fp64 : enable' added.
Primary Transform.cl
R"D(
//Arithmetic-only transform bodies: one multiply, one add, one divide.
//Macro parameters and expansions are fully parenthesized (macro hygiene):
//the previous unparenthesized form would expand incorrectly if a call site
//ever passed a compound expression. Current call sites pass plain variables,
//so generated code is unchanged.
#define UNARY_TRANSFORM_PROCESS(a) ((a) * (a) + (a) / (a))
#define BINARY_TRANSFORM_PROCESS(a, b) ((a) * (b) + (a) / (b))
#define TERNARY_TRANSFORM_PROCESS(a, b, c) ((a) * (b) + (c) / (a))
)D"
Trigonometry Transform.cl
R"D(
//Trigonometric transform bodies (floating-point types only).
//Parameters and expansions are parenthesized for macro hygiene; current call
//sites pass plain variables, so generated code is unchanged.
#define UNARY_TRANSFORM_PROCESS(a) (sin((a)) + cos((a)) + tan((a)))
#define BINARY_TRANSFORM_PROCESS(a, b) (sin((a)) + cos((b)) + tan((a)))
#define TERNARY_TRANSFORM_PROCESS(a, b, c) (sin((a)) + cos((b)) + tan((c)))
)D"
Utility Transform.cl
R"D(
//log/hypot/tgamma transform bodies (floating-point types only).
//Parameters and expansions are parenthesized for macro hygiene; current call
//sites pass plain variables, so generated code is unchanged.
#define UNARY_TRANSFORM_PROCESS(a) (log((a)) + hypot((a), (a)) + tgamma((a)))
#define BINARY_TRANSFORM_PROCESS(a, b) (log((a)) + hypot((b), (a)) + tgamma((b)))
#define TERNARY_TRANSFORM_PROCESS(a, b, c) (log((a)) + hypot((b), (c)) + tgamma((a)))
)D"
Transformations.cl
R"D(
//Thin wrappers that instantiate whichever *_TRANSFORM_PROCESS macros were
//included, once per arity (unary/binary/ternary) and per lane count
//(scalar, 2, 4, 8, 16), giving each benchmark kernel a concrete function.
Scalar Unary_Transform1(Scalar a) {
return UNARY_TRANSFORM_PROCESS(a);
}
Vector2 Unary_Transform2(Vector2 a) {
return UNARY_TRANSFORM_PROCESS(a);
}
Vector4 Unary_Transform4(Vector4 a) {
return UNARY_TRANSFORM_PROCESS(a);
}
Vector8 Unary_Transform8(Vector8 a) {
return UNARY_TRANSFORM_PROCESS(a);
}
Vector16 Unary_Transform16(Vector16 a) {
return UNARY_TRANSFORM_PROCESS(a);
}
Scalar Binary_Transform1(Scalar a, Scalar b) {
return BINARY_TRANSFORM_PROCESS(a, b);
}
Vector2 Binary_Transform2(Vector2 a, Vector2 b) {
return BINARY_TRANSFORM_PROCESS(a, b);
}
Vector4 Binary_Transform4(Vector4 a, Vector4 b) {
return BINARY_TRANSFORM_PROCESS(a, b);
}
Vector8 Binary_Transform8(Vector8 a, Vector8 b) {
return BINARY_TRANSFORM_PROCESS(a, b);
}
Vector16 Binary_Transform16(Vector16 a, Vector16 b) {
return BINARY_TRANSFORM_PROCESS(a, b);
}
Scalar Ternary_Transform1(Scalar a, Scalar b, Scalar c) {
return TERNARY_TRANSFORM_PROCESS(a, b, c);
}
Vector2 Ternary_Transform2(Vector2 a, Vector2 b, Vector2 c) {
return TERNARY_TRANSFORM_PROCESS(a, b, c);
}
Vector4 Ternary_Transform4(Vector4 a, Vector4 b, Vector4 c) {
return TERNARY_TRANSFORM_PROCESS(a, b, c);
}
Vector8 Ternary_Transform8(Vector8 a, Vector8 b, Vector8 c) {
return TERNARY_TRANSFORM_PROCESS(a, b, c);
}
Vector16 Ternary_Transform16(Vector16 a, Vector16 b, Vector16 c) {
return TERNARY_TRANSFORM_PROCESS(a, b, c);
}
)D"
Transform Kernels.cl
R"D(
//Benchmark entry points: each kernel applies one Transform wrapper per
//work-item. The host enqueues `size` work-items, so a vector kernel touches
//size * lanes scalars of each (size * 16)-scalar buffer — the widest
//16-lane kernels cover the whole buffer, narrower ones only a prefix.
kernel void unary_transform_scalar(global Scalar * a, global Scalar * output) {
size_t id = get_global_id(0);
output[id] = Unary_Transform1(a[id]);
}
kernel void binary_transform_scalar(global Scalar * a, global Scalar * b, global Scalar * output) {
size_t id = get_global_id(0);
output[id] = Binary_Transform1(a[id], b[id]);
}
kernel void ternary_transform_scalar(global Scalar * a, global Scalar * b, global Scalar * c, global Scalar * output) {
size_t id = get_global_id(0);
output[id] = Ternary_Transform1(a[id], b[id], c[id]);
}
kernel void unary_transform_vector2(global Vector2 * a, global Vector2 * output) {
size_t id = get_global_id(0);
output[id] = Unary_Transform2(a[id]);
}
kernel void binary_transform_vector2(global Vector2 * a, global Vector2 * b, global Vector2 * output) {
size_t id = get_global_id(0);
output[id] = Binary_Transform2(a[id], b[id]);
}
kernel void ternary_transform_vector2(global Vector2 * a, global Vector2 * b, global Vector2 * c, global Vector2 * output) {
size_t id = get_global_id(0);
output[id] = Ternary_Transform2(a[id], b[id], c[id]);
}
kernel void unary_transform_vector4(global Vector4 * a, global Vector4 * output) {
size_t id = get_global_id(0);
output[id] = Unary_Transform4(a[id]);
}
/* For the sake of brevity, I've cut the rest. It should be pretty clear what the
rest look like.*/
)D"
Upvotes: 2
Views: 132
Reputation: 8484
As far as I'm aware, the AMD Radeon HD 5450 isn't a dual GPU; it is probably listed twice because two different versions of the AMD OpenCL platform are installed. I remember such a case when both an OpenCL 1.2 platform and an experimental OpenCL 2.0 platform were present. Check the platform versions.
When it comes to CPU I think Experimental OpenCL 2.0 CPU Only Platform
is Intel implementation which is well optimized for Intel CPU. AMD OpenCL SDK is just made to work on Intel CPU and performance is poor - I was experiencing similar issue in the past.
To sum up - you don't have to use all available OpenCL platforms on all devices. Usually the latest OpenCL platform version for GPU gives decent performance and always use Intel OpenCL on Intel CPU.
Upvotes: 3