TheSorcerer
TheSorcerer

Reputation: 125

OpenCL: CL_OUT_OF_RESOURCES in a for loop

I'm trying to execute an OpenCL but it gives me a CL_OUT_OF_RESOURCES. The situation is the following:

I'm testing with 100 workitems, so I set global_sizes and local_sizes to 100. I create a write only buffer of 100 * 128, for dealing with 128 values for workitem. I execute the kernel and when I'm going to read the resultant buffer I get the error.

The kernel code is below:

__kernel void k2(__global int* debug) {
    uint idx = 128 * get_global_id(0);
    uint i, k;
    for (i = 0; i < 128000; ++i) {
        for (k = 0; k < 128; ++k) {
            debug[idx+k] = 23;
        }
    }
}

I take the index of every workitem in the variable idx. Then, I do a looping 128000 times a subloop (I know it's a stupid thing but it's just for testing purposes!), and give the value 23 to every value of the buffer.

The launching code is below:

    cl_int status;
cl_uint num_platforms;
cl_platform_id* platforms;
cl_uint* num_devices;
cl_device_id** devices;
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_command_queue queue;
cl_kernel kernel;
cl_program program;

cl_ulong max_mem_size;
cl_ulong max_work_group_size;
size_t max_work_item_size[3];

// Discover and populate the platforms
status = clGetPlatformIDs(0, NULL, &num_platforms);
chk_err(status, "Getting platform IDs", true);
if (num_platforms <= 0) {
    // If no platforms are available, we shouldn't continue
    fprintf(stderr, "No OpenCL platforms found!\n");
    exit(-1);
}

// Get all the platforms
platforms = new cl_platform_id[num_platforms];

status = clGetPlatformIDs(num_platforms, platforms, NULL);
chk_err(status, "Getting platform IDs", true);

// Allocate space for the device lists and lengths
num_devices = new cl_uint[num_platforms];
devices = new cl_device_id*[num_platforms];

// Traverse the platforms array printing information and
// populating devices
for (cl_uint i = 0; i < num_platforms; ++i) {
    // Print some platform info
    char* name = get_platform_info(platforms[i], CL_PLATFORM_NAME,
            "Getting platform name");
    char* vendor = get_platform_info(platforms[i], CL_PLATFORM_VENDOR,
            "Getting platform vendor");
    //printf("Platform: %s\nVendor: %s\n", name, vendor);
    delete[] name;
    delete[] vendor;

    // Retrieve the devices
    status = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices[i]);
    if (chk_err(status, "Getting device IDs")) {
        printf("This is a known NVIDIA bug (if platform == AMD then die)\n");
        printf("Setting number of devices to 0 and continuing\n");
        num_devices[i] = 0;
    }

    //printf("Devices: %d\n", num_devices[i]);

    // Populate OpenCL devices if any exist
    if (num_devices[i] != 0) {
        // Allocate an array of devices of size "numDevices"
        devices[i] = new cl_device_id[num_devices[i]];

        // Populate Array with devices
        status = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, num_devices[i],
            devices[i], NULL);
        chk_err(status, "Getting device IDs", true);
    }
}

cl_uint chosen_platform = 0;
cl_uint chosen_device = 0;

// Do a sanity check of platform/device selection
if (chosen_platform >= num_platforms ||
    chosen_device >= num_devices[chosen_platform]) {
    fprintf(stderr, "Invalid platform/device combination\n");
    exit(-1);
}

// Set the selected platform and device
platform = platforms[chosen_platform];
device = devices[chosen_platform][chosen_device];

// Get some device info
char* name = get_device_name(device);
char* vendor = get_device_vendor(device);
max_mem_size = get_device_max_mem_size(device);
max_work_group_size = get_device_max_work_group_size(device);
get_device_max_work_item_size(device, max_work_item_size);

printf("Device: %s\n", name);
printf("Vendor: %s\n", vendor);
printf("Max mem size: %llu Mb\n", max_mem_size / 1024);
printf("Max work group size: %llu\n", max_work_group_size);
printf("Max work item size: %llu, %llu, %llu\n",
        max_work_item_size[0], max_work_item_size[1], max_work_item_size[2]);

delete[] name;
delete[] vendor;

// Create the context
cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM,
    (cl_context_properties)(platform), 0};
context = clCreateContext(cps, 1, &device, NULL, NULL, &status);
chk_err(status, "Creating context", true);

// Create the command queue
queue = clCreateCommandQueue(context, device, 0, &status);
chk_err(status, "creating command queue", true);

// Load kernel source
char* source = load_kernel_source("vpm2.cl");
size_t source_size[] = { strlen(source) };

// Create the program object
program = clCreateProgramWithSource(context, 1, (const char**)&source,
        source_size, &status);
chk_err(status, "Creating program", true);
delete[] source;

// Try to compile the program
const char options[] = "-D ENABLE_DOUBLE -Werror -cl-nv-verbose";
status = clBuildProgram(program, 1, &device, options, NULL, NULL);

if (chk_err(status, "Building program")) {
    cl_build_status build_status;

    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS,
        sizeof(cl_build_status), &build_status, NULL);

    size_t size;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0,
        NULL, &size);

    char* build_log = new char[size+1];
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
        size+1, build_log, NULL);
    build_log[size] = '\0';

    printf("Build log:\n%s\nEnd log\n", build_log);
    chk_err(build_status, "Getting build info", true);
}

// Create the kernel
kernel = clCreateKernel(program, "k2", &status);
chk_err(status, "Creating kernel", true);

// Create the buffer
uint num_workitems = 100;
uint buf_size = num_workitems * 128;

cl_mem mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, buf_size * sizeof(int), NULL, &status);
chk_err(status, "Error creating const mem buffer", true);

// Add arguments
status = clSetKernelArg(kernel, 0, sizeof(mem), &mem);
chk_err(status, "Setting kernel arg", true);

// Execute kernel
size_t global_sizes[1] = {num_workitems};
size_t local_sizes[1] = {num_workitems};
status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
    global_sizes, local_sizes, 0, NULL, NULL);
chk_err(status, "Executing kernel", true);

// Read the results
int* res = new int[buf_size];
status = clEnqueueReadBuffer(queue, mem, CL_TRUE, 0,
        buf_size * sizeof(int), (void*)res, 0, NULL, NULL);
chk_err(status, "Reading buffer", true);

// Release objects
status = clReleaseProgram(program);
chk_err(status, "Releasing program");
status = clReleaseKernel(kernel);
chk_err(status, "Releasing kernel");
status = clReleaseMemObject(mem);
chk_err(status, "Releasing mem object");
clReleaseCommandQueue(queue);
clReleaseContext(context);
for (cl_uint i = 0; i < num_platforms; ++i) {
    delete[] devices[i];
}
delete[] devices;
delete[] num_devices;
delete[] platforms;
delete res;

At first I though I was running out of scope in idx+k index but it's not the case.

That error is really curious because if I change idx+k for idx+127, for instance, it works. If I also change the number 128000 for a smaller one, 56000 for instance, it also works(!), so that fact discards something wrong in kernel's creation/execution. Amazing, isn't it? I've started thinking there is an issue in the local memory management or something like that. Any idea??

By the way ... I'm running the code in a NVIDIA Quadro 2000.

Thank you very much!

Upvotes: 3

Views: 9012

Answers (2)

Gaslight Deceive Subvert
Gaslight Deceive Subvert

Reputation: 20418

What you're seeing is the result of contention between your OpenCL kernels and your OS. Your OS wants to use your GPU to render windows and OpenCL wants to use it for your computation. Hence the screen freezes. Eventually your OS terminates your OpenCL program's GPU access which results in the above message. On my computer with an Intel GPU the following lines are logged in dmesg:

kernel: Asynchronous wait on fence 0000:00:02.0:kwin_x11[495]:1be343e timed out (hint:intel_atomic_commit_ready [i915])
kernel: i915 0000:00:02.0: [drm] GPU HANG: ecode 8:1:85ddfffb, in csim [541902]
kernel: i915 0000:00:02.0: [drm] Resetting rcs0 for stopped heartbeat on rcs0
kernel: i915 0000:00:02.0: [drm] csim[541902] context reset due to GPU hang

Upvotes: 0

DarkZeros
DarkZeros

Reputation: 8410

The most probable scenario is that you are SEG_FAULTing in the kernel and is giving you CL_OUT_OF_RESOURCES, which is the generic error when a kernel SEG_FAULTS in a nVIDIA platform. However, due to clEnqueueNDRangeKernel not being able to detect the error when queuing the kernel, it is being returned at read time of the buffer that SEG_FAULTED.

The cause can be:

  1. You are running more items than you think (can we see how you run the kernel?)
  2. You are creating less memory than needed for debug variable.
  3. The memory flags are not OK, they are read only, or any other problem.

PD: My original assumption is wrong if you only run 100 work items.


Alternative option for your error is that you are writing 6GB of data to a 120kB zone and in just 1 workgroup, and this is causing a huge bottleneck that makes the kernel take so much time to run that is killed by the driver. Returning a CL_OUT_OF_RESOURCES.

Reducing the amount of loops solves it, and setting k to a fixed value will eliminate the loop in the compiler optimization phase (thus, solving the issue as well). You can try if using more workgroups solve it as well.

Do you experience a 2 sec screen freeze? Then that is the problem for sure.

Upvotes: 4

Related Questions