Delfin
Delfin

Reputation: 343

OpenCL hangs forever when trying to read kernel output

This is a follow-up to "OpenCL Host ran out of Memory in trivial Kernel". After applying the corrections from that question and working through some further errors, my program now hangs at clEnqueueReadBuffer for an unknown reason.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#define CL_TARGET_OPENCL_VERSION 210
#include "CL/cl.h"

/*
 * OpenCL C source of the SAXPY kernel: for each work-item i it performs
 * y[i] += a * x[i].
 *
 * Declared as pointer-to-const because it points at a string literal and
 * because clCreateProgramWithSource() takes a `const char **`.
 */
const char *program_src =
    "__kernel void SAXPY (__global float* x, __global float* y, float a)\n"
    "{\n"
    "const int i = get_global_id (0);\n"
    "y [i] += a * x [i];\n"
    "}\n";

int main() {
    cl_platform_id platform_ids[16];
    cl_uint platform_count;

    if (clGetPlatformIDs(16, &platform_ids, &platform_count) != CL_SUCCESS) {
        return EXIT_FAILURE;
    }
    printf("%i cl platform(s) found\n", platform_count);

    if (platform_count == 0) {
        return EXIT_FAILURE;
    }

    printf("choosing platform 0...\n");

    cl_device_id device_ids[16];
    cl_int device_count;
    if (clGetDeviceIDs(platform_ids[0], CL_DEVICE_TYPE_ALL, 16, &device_ids, &device_count) != CL_SUCCESS) {
        return EXIT_FAILURE;
    }
    printf("%i cl device(s) found on platform 0\n", device_count);

    if (device_count == 0) {
        return EXIT_FAILURE;
    }

    cl_device_id device = device_ids[0];

    printf("** running test **\n");

    cl_int cl_fehler;
    cl_context ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &cl_fehler);
    if (ctx == NULL) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }

    printf("Am clCommandQueue\n");
    cl_fehler = CL_SUCCESS;
    cl_command_queue queue = clCreateCommandQueue(ctx, device, 0, &cl_fehler);
    if (cl_fehler != CL_SUCCESS) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }

    // Replace 1 mit Zahlen von Gärate IDs
    printf("Am clCreateProgram\n");
    cl_fehler = CL_SUCCESS;
    cl_program program = clCreateProgramWithSource(ctx, 1, &program_src, NULL, &cl_fehler);
    if (cl_fehler != CL_SUCCESS) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }

    printf("Am clBuildProgram\n");
    cl_fehler = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    if (cl_fehler != CL_SUCCESS) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }

    printf("Am clCreateKernel\n");
    cl_fehler = CL_SUCCESS;
    cl_kernel kernel = clCreateKernel(program, "SAXPY", &cl_fehler);
    if (cl_fehler != CL_SUCCESS) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }

    printf("Am clCreateBuffer\n");
    cl_fehler = CL_SUCCESS;
    cl_mem eingabe_buffer = clCreateBuffer(ctx, CL_MEM_READ_ONLY, sizeof(cl_float) * 10, NULL, &cl_fehler);
    if (cl_fehler != CL_SUCCESS) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }
    
    printf("Am clCreateBuffer\n");
    cl_fehler = CL_SUCCESS;
    cl_mem ausgabe_buffer = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, sizeof(cl_float) * 10, NULL, &cl_fehler);
    if (cl_fehler != CL_SUCCESS) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }

    cl_float eingabe_data[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};

    printf("Am clEnqueueWriteBuffer\n");
    cl_fehler = clEnqueueWriteBuffer(queue, eingabe_buffer, CL_TRUE, 0, sizeof(cl_float) * 10, &eingabe_data, 0, NULL, NULL); 
    if (cl_fehler != CL_SUCCESS) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }

    printf("Am clSetKernelArg\n");
    cl_fehler = clSetKernelArg(kernel, 0, sizeof(cl_mem), &eingabe_buffer);
    if (cl_fehler != CL_SUCCESS) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }

    printf("Am clSetKernelArg\n");
    cl_fehler = clSetKernelArg(kernel, 1, sizeof(cl_mem), &ausgabe_buffer);
    if (cl_fehler != CL_SUCCESS) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }

    printf("Am clSetKernelArg\n");
    cl_float f = 2.0;
    cl_fehler = clSetKernelArg(kernel, 2, sizeof(cl_float), &f);
    if (cl_fehler != CL_SUCCESS) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }

    printf("Am clEnqueueNDRangeKernel\n");
    const size_t globalWorkSize[3] = { 10, 0, 0 };
    cl_fehler = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalWorkSize, NULL, 0, NULL, NULL);
    if (cl_fehler != CL_SUCCESS) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }

    printf("Am clEnqueueReadBuffer\n");
    cl_float ausgabe_data[3];
    cl_fehler = clEnqueueReadBuffer(queue, ausgabe_buffer, CL_TRUE, 0, sizeof(cl_float) * 10, &ausgabe_data, 0, NULL, NULL); 
    if (cl_fehler != CL_SUCCESS) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }

    // No point in share the rest of the code because the problem is being happening here
}

Update

After applying the suggestions from @pmdj, the problem persists, and I can confirm that the "Am clEnqueueReadBuffer" log line is still being printed.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#define CL_TARGET_OPENCL_VERSION 210
#include "CL/cl.h"

/*
 * OpenCL C source of the SAXPY kernel: for each work-item i it performs
 * y[i] = a * x[i].
 *
 * Declared as pointer-to-const because it points at a string literal and
 * because clCreateProgramWithSource() takes a `const char **`.
 */
const char *program_src =
    "__kernel void SAXPY (__global float* x, __global float* y, float a)\n"
    "{\n"
    "const int i = get_global_id (0);\n"
    "y [i] = a * x [i];\n"
    "}\n";

int main() {
    cl_platform_id platform_ids[16];
    cl_uint platform_count;

    if (clGetPlatformIDs(16, &platform_ids, &platform_count) != CL_SUCCESS) {
        return EXIT_FAILURE;
    }
    printf("%i cl platform(s) found\n", platform_count);

    if (platform_count == 0) {
        return EXIT_FAILURE;
    }

    printf("choosing platform 0...\n");

    cl_device_id device_ids[16];
    cl_int device_count;
    if (clGetDeviceIDs(platform_ids[0], CL_DEVICE_TYPE_ALL, 16, &device_ids, &device_count) != CL_SUCCESS) {
        return EXIT_FAILURE;
    }
    printf("%i cl device(s) found on platform 0\n", device_count);

    if (device_count == 0) {
        return EXIT_FAILURE;
    }

    cl_device_id device = device_ids[0];

    printf("** running test **\n");

    cl_int cl_fehler;
    cl_context ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &cl_fehler);
    if (ctx == NULL) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }

    printf("Am clCommandQueue\n");
    cl_fehler = CL_SUCCESS;
    cl_command_queue queue = clCreateCommandQueue(ctx, device, 0, &cl_fehler);
    if (cl_fehler != CL_SUCCESS) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }

    // Replace 1 mit Zahlen von Gärate IDs
    printf("Am clCreateProgram\n");
    cl_fehler = CL_SUCCESS;
    cl_program program = clCreateProgramWithSource(ctx, 1, &program_src, NULL, &cl_fehler);
    if (cl_fehler != CL_SUCCESS) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }

    printf("Am clBuildProgram\n");
    cl_fehler = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    if (cl_fehler != CL_SUCCESS) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }

    printf("Am clCreateKernel\n");
    cl_fehler = CL_SUCCESS;
    cl_kernel kernel = clCreateKernel(program, "SAXPY", &cl_fehler);
    if (cl_fehler != CL_SUCCESS) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }

    printf("Am clCreateBuffer\n");
    cl_fehler = CL_SUCCESS;
    cl_mem eingabe_buffer = clCreateBuffer(ctx, CL_MEM_READ_ONLY, sizeof(cl_float) * 10, NULL, &cl_fehler);
    if (cl_fehler != CL_SUCCESS) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }

    printf("Am clCreateBuffer\n");
    cl_fehler = CL_SUCCESS;
    cl_mem ausgabe_buffer = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, sizeof(cl_float) * 10, NULL, &cl_fehler);
    if (cl_fehler != CL_SUCCESS) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }

    cl_float eingabe_data[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};

    printf("Am clEnqueueWriteBuffer\n");
    cl_fehler = clEnqueueWriteBuffer(queue, eingabe_buffer, CL_TRUE, 0, sizeof(cl_float) * 10, &eingabe_data, 0, NULL, NULL);
    if (cl_fehler != CL_SUCCESS) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }

    printf("Am clSetKernelArg\n");
    cl_fehler = clSetKernelArg(kernel, 0, sizeof(cl_mem), &eingabe_buffer);
    if (cl_fehler != CL_SUCCESS) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }

    printf("Am clSetKernelArg\n");
    cl_fehler = clSetKernelArg(kernel, 1, sizeof(cl_mem), &ausgabe_buffer);
    if (cl_fehler != CL_SUCCESS) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }

    printf("Am clSetKernelArg\n");
    cl_float f = 2.0;
    cl_fehler = clSetKernelArg(kernel, 2, sizeof(cl_float), &f);
    if (cl_fehler != CL_SUCCESS) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }

    printf("Am clEnqueueNDRangeKernel\n");
    const size_t globalWorkSize[3] = { 10, 0, 0 };
    cl_fehler = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalWorkSize, NULL, 0, NULL, NULL);
    if (cl_fehler != CL_SUCCESS) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }

    clFinish(queue);

    printf("Am clEnqueueReadBuffer\n");
    cl_float ausgabe_data[10] = {1, 1, 1, 1, 1, 1, 1 ,1 ,1 ,1 };
    cl_fehler = clEnqueueReadBuffer(queue, ausgabe_buffer, CL_TRUE, 0, sizeof(cl_float) * 10, &ausgabe_data[0], 0, NULL, NULL);
    if (cl_fehler != CL_SUCCESS) {
        printf("Fehler: %i\n", cl_fehler);
        return EXIT_FAILURE;
    }

    // No point in share the rest of the code because the problem is being happening here
}

System data update:

The host is openSUSE Leap 15.5 x86_64 on an AMD ATI 04:00.0 Lucienne and AMD Ryzen 7 5700U with Radeon Graphics (16) @ 1.800GHz with AMD rocm drivers installed systemwide.

I am compiling with cc src/main.c -lOpenCL -I/opt/rocm-6.0.0/include/ -L/opt/rocm-6.0.0/lib/ -o test

And I have the rocm-opencl-sdk package installed from here, which is the vendor's (AMD's) official page.

Upvotes: 0

Views: 137

Answers (1)

pmdj
pmdj

Reputation: 23428

I can spot at least 3 issues:

1. Buffer size

cl_float ausgabe_data[3];
---------------------^^^

versus

clEnqueueReadBuffer(queue, ausgabe_buffer, CL_TRUE, 0, sizeof(cl_float) * 10, &ausgabe_data, 0, NULL, NULL);
-------------------------------------------------------^^^^^^^^^^^^^^^^^^^^^

This is trying to read 10 floats into a buffer that can hold 3.

2. Pointer syntax

This may not cause issues depending on your C compiler, but as far as I'm aware it's not strictly correct. When passing the pointer to the copy destination, you have:

…, &ausgabe_data, …

This takes the address of the array itself, not of the data items within it. The two addresses are numerically the same, but the pointer types differ (pointer-to-array versus pointer-to-element), so it is better to stick to the safe approach. Either remove the ampersand (&) to use the fact that an array automatically decays into a pointer to its 0th element, or explicitly specify the array item at which to start writing (&ausgabe_data[0]).

2a. Ignoring build warnings.

Note that the call to clEnqueueWriteBuffer() with &eingabe_data has the same issue. The clGetPlatformIDs(16, &platform_ids, and clGetDeviceIDs(…, &device_ids, calls have the same problem, and your compiler should be warning you about these latter 2. (clEnqueueWriteBuffer and clEnqueueReadBuffer unfortunately are not type safe and accept void* so your compiler cannot spot mistakes there. All the more reason to double check them manually.)

Don't ignore warnings! Please go and fix any other warnings your compiler is telling you about before posting questions here. If you're not seeing warnings about this, add something like -Wall -Wextra to your C compiler flags. Then go fix those issues as well as the ones I identified.

3. Buffer use.

The kernel code uses:

y [i] += …

so it reads from y, adds another value to it, and then writes the result back. Yet the buffer for y is created as write-only, and its contents are never initialised:

cl_mem ausgabe_buffer = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY,
---------------------------------------------------^^^^^^^^^^

Either make the buffer read-write and initialise it with sensible initial data (all zeroes?), or change the kernel code to pure assignment. (y [i] = …)

Upvotes: 1

Related Questions