OpenCL implemented algorithms slower than normal loop

Question

I am new to parallel computing and OpenCL. I am following the book OpenCL Programming Guide and am working through its convolution implementation.

My main.cpp:

// NOTE(review): the header names on these lines were lost when the listing was
// extracted (each line was a bare `#include`). They are restored here from the
// names the code below actually uses -- confirm against the original source.
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

using namespace std;

// Dimensions of the input signal (8 x 8 elements).
const unsigned int inputSignalWidth = 8;
const unsigned int inputSignalHeight = 8;

// Input signal to be convolved.
// NOTE(review): the first dimension is sized by Width and the second by
// Height, while main() indexes this array as [row][column]. The signal is
// square, so the swapped order has no effect here.
cl_uint inputSignal[inputSignalWidth][inputSignalHeight] =
{
    {3, 1, 1, 4, 8, 2, 1, 3},
    {4, 2, 1, 1, 2, 1, 2, 3},
    {4, 4, 4, 4, 3, 2, 2, 2},
    {9, 8, 3, 8, 9, 0, 0, 0},
    {9, 3, 3, 9, 0, 0, 0, 0},
    {0, 9, 0, 8, 0, 0, 0, 0},
    {3, 0, 8, 8, 9, 4, 4, 4},
    {5, 9, 8 ,1 ,8, 1, 1, 1}
};

// Dimensions of the result: input size minus (mask size - 1) in each
// dimension, i.e. 8 - 3 + 1 = 6.
const unsigned int outputSignalWidth = 6;
const unsigned int outputSignalHeight = 6;

// Destination for the convolution result (written by main()).
cl_uint outputSignal[outputSignalWidth][outputSignalHeight];

// Dimensions of the convolution mask (3 x 3 elements).
const unsigned int maskWidth = 3;
const unsigned int maskHeight = 3;

// Mask weights: 1 for each of the eight neighbours, 0 for the centre element.
cl_uint mask[maskWidth][maskHeight] =
{
    {1, 1, 1}, {1, 0, 1}, {1, 1, 1}
};

/**
 * Abort the program if an OpenCL call did not succeed.
 *
 * @param err   Status code returned by (or derived from) an OpenCL call.
 * @param name  Label identifying the call; printed in the diagnostic.
 *
 * Returns normally when err is CL_SUCCESS. Otherwise writes the label and the
 * numeric status to stderr and terminates the process with EXIT_FAILURE.
 */
inline void checkErr(cl_int err, const char* name)
{
    if (err == CL_SUCCESS)
    {
        return;
    }

    // Print the numeric status as well: the label alone cannot tell two
    // different failures of the same call apart.
    cerr << "Error: " << name << " (" << err << ")" << endl;
    exit(EXIT_FAILURE);
}

/**
 * Error callback registered with clCreateContext.
 *
 * @param errInfo       Error description supplied by the OpenCL runtime.
 * @param private_info  Unused.
 * @param cb            Unused.
 * @param user_data     Unused.
 *
 * Reports the error on stderr (the same stream checkErr uses) and terminates
 * the process with EXIT_FAILURE; it never returns to the runtime.
 */
void CL_CALLBACK contextCallback(const char * errInfo,
                                 const void * private_info,
                                 size_t cb,
                                 void * user_data)
{
    // The signature is dictated by OpenCL; only errInfo is needed here.
    (void) private_info;
    (void) cb;
    (void) user_data;

    cerr << "Error occurred during context use: " << errInfo << endl;
    exit(EXIT_FAILURE);
}

int main(int argc, const char * argv[])
{
    cl_int errNum;
    cl_uint numPlatforms;
    cl_uint numDevices;
    cl_platform_id * platformIDs;
    cl_device_id * deviceIDs;
    cl_context context = NULL;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel;
    cl_mem inputSignalBuffer;
    cl_mem outputSignalBuffer;
    cl_mem maskBuffer;

    errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
    checkErr((errNum != CL_SUCCESS)? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs");

    platformIDs = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numPlatforms);
    errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
    checkErr((errNum != CL_SUCCESS)? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatFormIDs");

    deviceIDs = NULL;
    cl_uint i;
    for (i = 0; i < numPlatforms; i++)
    {
        errNum = clGetDeviceIDs(platformIDs[i], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
        if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND)
        {
            checkErr(errNum, "clGetDeviceIDs");
        } else if (numDevices > 0)
        {
            deviceIDs = (cl_device_id *) alloca(sizeof(cl_device_id) * numDevices);
            errNum = clGetDeviceIDs(platformIDs[i], CL_DEVICE_TYPE_GPU, numDevices, &deviceIDs[0], NULL);
            checkErr(errNum, "clGetDeviceIDs");
            break;
        }
    }

    if (deviceIDs == NULL)
    {
        cout << "No CPU devices found." << endl;
        exit(-1);
    }

    cl_context_properties contextProperties[] =
    {
        CL_CONTEXT_PLATFORM, (cl_context_properties) platformIDs[i], 0
    };

    context = clCreateContext(contextProperties, numDevices, deviceIDs, &contextCallback, NULL, &errNum);
    checkErr(errNum, "clCreateContext");


    ifstream srcFile("Convolution.cl");
    checkErr(srcFile.is_open()?CL_SUCCESS:-1, "reading Convolution.cl");

    string srcProg(istreambuf_iterator(srcFile),
                   (istreambuf_iterator()));

    const char* src = srcProg.c_str();
    size_t length = srcProg.length();

    program = clCreateProgramWithSource(context, 1, &src, &length, &errNum);
    checkErr(errNum, "clCreateProgramWithSource");

    cout << "Device count: " << sizeof(deviceIDs)/sizeof(cl_device_id) << endl;

    errNum = clBuildProgram(program, numDevices, deviceIDs, NULL, NULL, NULL);
    checkErr(errNum, "clBuildProgram");

    kernel = clCreateKernel(program, "convolve", &errNum);
    checkErr(errNum, "clCreateKernel");

    inputSignalBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * inputSignalHeight*inputSignalWidth, static_cast(inputSignal), &errNum);
    checkErr(errNum, "clCreateBuffer(inputSignal)");

    maskBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * maskHeight * maskWidth, static_cast(mask), &errNum);
    checkErr(errNum, "clCreateBuffer(mask)");

    outputSignalBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * outputSignalHeight * outputSignalWidth, NULL, &errNum);
    checkErr(errNum, "clCreateBuffer(outputSignal)");

    queue = clCreateCommandQueue(context, deviceIDs[0], 0, &errNum);
    checkErr(errNum, "clCreateCommandQueue");

    errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputSignalBuffer);
    errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &maskBuffer);
    errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &outputSignalBuffer);
    errNum |= clSetKernelArg(kernel, 3, sizeof(cl_uint), &inputSignalWidth);
    errNum |= clSetKernelArg(kernel, 4, sizeof(cl_uint), &maskWidth);
    checkErr(errNum, "clSetKernelArg");

    const size_t globalWorkSize[1] =
    {
        outputSignalWidth * outputSignalWidth
    };

    const size_t localWorkSize[1] =
    {
        1
    };

    clock_t start, end;

    clFinish(queue);
    start = clock();
    errNum = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
    clFinish(queue);
    end = clock();
    cout << "time for calculation: " << (float)(end - start) << endl;
    checkErr(errNum, "clEnequeueNDRangeKernel");

    errNum = clEnqueueReadBuffer(queue, outputSignalBuffer, CL_TRUE, 0, sizeof(cl_uint) * outputSignalHeight * outputSignalWidth, outputSignal, 0, NULL, NULL);
    checkErr(errNum, "clEnqueueReadBuffer");

    clFinish(queue);
    start = clock();
    for (int y = 0; y < outputSignalHeight; y++)
    {
        for (int x = 0; x < outputSignalHeight; x++)
        {
            uint sum = 0;
            for (int r = 0; r < maskWidth; r++)
            {
                for (int c =0; c < maskWidth; c++)
                {
                    sum += inputSignal[y+r][x+c]*mask[r][c];
                }
            }
            outputSignal[y][x] = sum;

        }
    }
    end = clock();
    cout << "Loop version time: " << (float)(end - start) << endl;


    return 0;
}

and Convolution.cl:

/*
 * 2-D convolution: each work-item computes one element of the output.
 *
 * The work-item's id in dimension 0 is used as the output column (x) and its
 * id in dimension 1 as the output row (y), so the kernel must be enqueued
 * with a two-dimensional global range of (output width, output height).
 *
 * input       Input signal, stored row by row with inputWidth elements per row.
 * mask        Square mask of maskWidth x maskWidth elements, stored row by row.
 * output      Result, stored row by row; the row length equals the global
 *             size in dimension 0.
 * inputWidth  Number of elements per input row.
 * maskWidth   Side length of the mask.
 */
__kernel void convolve(const __global uint * const input,
                       __constant uint * const mask,
                       __global uint * const output,
                       const int inputWidth,
                       const int maskWidth)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    uint sum = 0;
    for (int r = 0; r < maskWidth; r++)
    {
        // Index of the first input element covered by mask row r.
        const int idxIntmp = (y + r) * inputWidth + x;
        for (int c = 0; c < maskWidth; c++)
        {
            sum += mask[r * maskWidth + c] * input[idxIntmp + c];
        }
    }

    // The row stride of the output is the number of work-items in dimension 0
    // (get_global_size), not this work-item's own id in that dimension.
    output[y * get_global_size(0) + x] = sum;
}

The platform is Mac OS X 10.9 with an AMD 6750M GPU. It doesn't make sense to me that the OpenCL version is much slower (around 10x) than the plain for-loop version. Could you help me find out what's wrong with the code?

user703016 · Accepted Answer

There are two main problems:

const size_t globalWorkSize[1] = { outputSignalWidth * outputSignalWidth };

First, as pointed out in the comments by Basile Starynkevitch, your data set is extremely small. Way too small to benefit from any GPU acceleration. Here, you are running only 36 work items: this is so ridiculously small that it could fit in barely half of a wavefront on one compute unit.

You should be running thousands of work items to correctly harness the power of your GPU. The overhead of OpenCL therefore makes the GPU version slower than the CPU one. Try with a much larger dataset, and you should notice a significant gain in performance.

Additionally:

const size_t localWorkSize[1] = { 1 };

You are running outputSignalWidth * outputSignalWidth work groups of 1 work item each. This is hugely problematic.

On AMD GPUs, the wavefront size is 64. This means that you should be scheduling work groups of at least 64 work items (ideally a multiple of 64) if you want to fully utilize your hardware. You are currently wasting 63 of your 64 hardware threads, that is 98.4% of the GPU doing nothing!

Either adapt your code to use bigger work groups (and change the global work size accordingly), or let the OpenCL driver choose the best size for you by passing NULL instead of localWorkSize.

In short, you are using a massive backhoe just to move one tiny piece of gravel.

OpenCL implemented algorithms slower than normal loop

Answers (1)

Related Questions