Vega4
Vega4

Reputation: 979

OpenCL CL_INVALID_COMMAND_QUEUE when launching Kernel at Both NVIDIA and Intel GPUs

This might not be the most narrowed-down problem, but...

The program implements a wrapper around all of the OpenCL functionality. The wrapper detects all of the OpenCL devices and then wraps each of them in yet another wrapper. Each device wrapper contains all the objects related to its device, such as the allocated cl_mem buffers, the associated context, etc.

I have checked multiple times that there are no mistakes, for example that no pointers are reused (as would happen if, due to some error, device wrappers from different platforms shared the same platform pointer). They are not.

The problem: When I divide work between all the computational devices on my laptop (CPU + Intel GPU + NVIDIA GPU), kernel execution issued to the NVIDIA GPU crashes with CL_INVALID_COMMAND_QUEUE.

I've checked everything.

I've tried the following scenarios:

Most of the initialization code is below.

std::cout << "Initializing the OpenCL engine..\n";
cl_int ret;
unsigned int nrOfActiveContexts = 0;
ret = clGetPlatformIDs(0, NULL, &mRetNumPlatforms);
if (mRetNumPlatforms > 0)
{

    this->mPlatforms.resize(mRetNumPlatforms);
}
else
{
    fprintf(stderr, "No OpenCL platform available.\n");
    exit(1);
}

ret = clGetPlatformIDs(mRetNumPlatforms, mPlatforms.data(), NULL);

std::vector<cl_device_id> devices;
cl_context context;
cl_uint numberOfDevices;
//query for available compute platforms
for (int i = 0; i < mPlatforms.size() ; i++)
{
    bool error = false;
    numberOfDevices = 0;
    devices.clear();
    context = NULL;
    cl_device_type deviceTypes = CL_DEVICE_TYPE_ALL;
    if (useCPU &&useGPU)
        deviceTypes = CL_DEVICE_TYPE_ALL;
    else if (useCPU)
        deviceTypes = CL_DEVICE_TYPE_CPU;
    else if (useGPU)
        deviceTypes = CL_DEVICE_TYPE_GPU;

    ret = clGetDeviceIDs(mPlatforms[i], deviceTypes, 0, NULL, &numberOfDevices);
    if (numberOfDevices > 0)
    {
        devices.resize( numberOfDevices);
        ret = clGetDeviceIDs(mPlatforms[i], deviceTypes,
            numberOfDevices, devices.data(), NULL);
    }
    else continue;

    context = clCreateContext(NULL, numberOfDevices, devices.data(), NULL, NULL, &ret);
    if (ret != CL_SUCCESS)
        throw(std::abort);
    mContexts.push_back(context);
    if (ret != CL_SUCCESS)
    {
        error = true;
    }
    //query device properties create Workers
    size_t ret_size;
    cl_uint compute_units;
    cl_ulong max_alloc;
    size_t max_work_size;
    std::string name;
    std::vector<char> c_name;
    for (int y = 0; y < devices.size(); y++)
    {
        ret_size = compute_units = max_alloc = max_work_size = 0;
        c_name.clear();

        ret = clGetDeviceInfo(devices[y], CL_DEVICE_NAME, NULL, NULL, &ret_size);
        if (ret != CL_SUCCESS)
        {
            error = true; goto errored;
        }
        c_name.resize(ret_size);
        ret = clGetDeviceInfo(devices[y], CL_DEVICE_NAME, c_name.size(), c_name.data(), &ret_size);
        if (ret != CL_SUCCESS)
        {
            error = true; goto errored;
        }
        name = std::string(c_name.begin(), c_name.end());
        name = std::regex_replace(name, std::regex("[' ']{2,}"), " ");

        cl_device_type   devType;
        ret = clGetDeviceInfo(devices[y], CL_DEVICE_TYPE, sizeof(cl_device_type), (void *)&devType, NULL);
        if (ret != CL_SUCCESS)
        {
            error = true; goto errored;
        }



        ret = clGetDeviceInfo(devices[y], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), (void *)&compute_units, NULL);
        if (ret != CL_SUCCESS)
        {
            error = true; goto errored;
        }
        ret = clGetDeviceInfo(devices[y], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&max_work_size, NULL);
        if (ret != CL_SUCCESS)
        {
            error = true; goto errored;
        }
        CWorker::eWorkerType type;
        if (devType & CL_DEVICE_TYPE_GPU)
            type = CWorker::eWorkerType::GPU;
        else
            if (devType & CL_DEVICE_TYPE_CPU)
                type = CWorker::eWorkerType::CPU;


        if (type == CWorker::eWorkerType::CPU)
        {
            if (compute_units > 8)
                max_work_size = compute_units / 4;
            else if (compute_units == 8)
                max_work_size = 2;
            else
                max_work_size = 1;

        }
        ret = clGetDeviceInfo(devices[y], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), (void *)&max_alloc, NULL);
        if (ret != CL_SUCCESS)
        {
            error = true;
            goto errored;
        }
    errored:
        if (error != true)
        {
            CWorker  * w = new CWorker();

            w->setDevice(devices[y]);
            w->setMaxComputeUnits(compute_units);
            w->setMaxMemAlloc(max_alloc);
            w->setMaxWorkGroupSize(max_work_size);
            w->setName(name);
            std::cmatch cm;
            if (std::regex_search(name.data(), cm, std::regex("\\w\+")))
                w->setShortName(std::string(cm[0]) +"-"+ std::to_string(mWorkers.size()+1));
            w->setContext(context);
            w->setType(type);

            mWorkers.push_back(w);
        }
    }

    nrOfActiveContexts++;
}

if (mWorkers.size() > 0)
    mInitialised = true;
if (mWorkers.size() > 0)
    return true;
else return false;

Upvotes: 0

Views: 680

Answers (1)

doqtor
doqtor

Reputation: 8484

Quite likely the problem is with how the context is being created:

context = clCreateContext(NULL, numberOfDevices, devices.data(), NULL, NULL, &ret);

The first parameter being passed is NULL which according to the OpenCL manual means that the platform being selected is implementation defined:

Specifies a list of context property names and their corresponding values. Each property name is immediately followed by the corresponding desired value. The list is terminated with 0. properties can be NULL in which case the platform that is selected is implementation-defined.

Try passing something like this:

// mPlatforms holds raw cl_platform_id handles, so the handle itself is cast
// (the call-operator form only applies to the C++ wrapper's cl::Platform).
cl_context_properties properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)mPlatforms[i], 0};
context = clCreateContext(properties, numberOfDevices, devices.data(), NULL, NULL, &ret);

If that doesn't help, then maybe try to initialize Nvidia first (if that is not the case already). It may be that Intel is initialized first and its OpenCL driver version is newer than Nvidia's (for example Intel OpenCL 2.0 vs Nvidia 1.2), and some of that is being used for Nvidia, hence the error.

Upvotes: 0

Related Questions