Ivy The Great
Ivy The Great

Reputation: 1

OpenCL memory leaks and errors even after releasing everything with clRelease*()

I've written a really simple OpenCL program that just adds up two arrays. It works correctly, however if I run it with Valgrind I'm getting lots of memory leaks even after I release everything using clRelease*. I am also getting a Mismatched free() / delete / delete[] Valgrind error when have clReleaseProgram(program) in my source code. To be clear I'm getting memory leaks regardless of if I have that line or not, but that line also gives me the error.

I've included the main.c and kernel.cl source code files as well as the Valgrind output I get with the clReleaseProgram(program) line in main.c

I don't know if this is relevant but I am using AMD's ROCM OpenCL implementation and am using version 1.2, and this is running on a Radeon RX 5700XT graphics card. OS is Ubuntu 20.04.

I'd really appreciate any advice!

// Simple test OpenCL program that adds two arrays

// Define OpenCL version
#define CL_TARGET_OPENCL_VERSION 120

#include <CL/cl.h>
#include <stdlib.h>
#include <stdio.h>


// Main program
int main(int argc, char **argv) {
    // Create all three arrays needed
    float arr1[] = {1.0, 2.0, 3.0, 4.0};
    float arr2[] = {10.0, 11.0, 12.0, 13.0};
    float arr3[4]; 
    
    // Initialize platform id structure
    cl_int err;
    cl_platform_id platform;
    err = clGetPlatformIDs(1, &platform, NULL);

    // Initialize device id structure
    cl_device_id device;
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

    // Initialize context
    cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);

    // Determine file size of kernel source file
    FILE *fp;
    fp = fopen("kernel.cl", "r");
    fseek(fp, 0, SEEK_END);
    long int src_size = ftell(fp);
    rewind(fp);

    // Read kernel source file into buffer
    char *buf = malloc(sizeof(char) * (src_size + 1));
    fread(buf, sizeof(char), src_size, fp);
    buf[src_size] = '\0';

    // Create the program
    cl_program program = clCreateProgramWithSource(context, 1, (const char **) &buf, NULL, &err);
    free(buf);

    // Build the program
    const char options[] = "-cl-std=CL1.2 -D MULT=2";
    err = clBuildProgram(program, 1, &device, options, NULL, NULL);

    // Create command queue
    cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);

    // Create kernel
    const char kernel_name[] = "test_kernel";
    cl_kernel kernel = clCreateKernel(program, kernel_name, &err);

    // Create the memory objects
    cl_mem arr1_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, 4 * sizeof(float), NULL, &err);
    cl_mem arr2_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, 4 * sizeof(float), NULL, &err);
    cl_mem arr3_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 4 * sizeof(float), NULL, &err);

    // Write the input arrays into the input mem objects
    err = clEnqueueWriteBuffer(queue, arr1_mem, CL_TRUE, 0, 4 * sizeof(float), arr1, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, arr2_mem, CL_TRUE, 0, 4 * sizeof(float), arr2, 0, NULL, NULL);

    // Set the kernel arguments
    err = clSetKernelArg(kernel, 0, sizeof(arr1_mem), &arr1_mem);
    err = clSetKernelArg(kernel, 1, sizeof(arr2_mem), &arr2_mem);
    err = clSetKernelArg(kernel, 2, sizeof(arr3_mem), &arr3_mem);

    // Execute kernel
    size_t gws[] = {4};
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, gws, NULL, 0, NULL, NULL);

    // Read the output array into the output mem object
    err = clEnqueueReadBuffer(queue, arr3_mem, CL_TRUE, 0, 4 * sizeof(float), arr3, 0, NULL, NULL);

    // Print the output
    for (int i = 0; i < 4; ++i) {
        printf("%f\n", arr3[i]);
    }

    // Release all OpenCL objects
    clReleaseMemObject(arr1_mem);
    clReleaseMemObject(arr2_mem);
    clReleaseMemObject(arr3_mem);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);
    clReleaseProgram(program); // This line causes Mismatched free / delete[] / delete valgrind error
    clReleaseContext(context);

    return 0;
}
// Test OpenCL kernel
__kernel void test_kernel(__constant float *arr1, __constant float *arr2, __global float *arr3) {
    // Get the index into the arrays
    int index = get_global_id(0);

    // Sum arr1 + arr2 and store in arr3
    arr3[index] = arr1[index] + arr2[index];
}

ivy@ubuntu-main:~/cl_test$ valgrind ./a.out
==10340== Memcheck, a memory error detector
==10340== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al.
==10340== Using Valgrind-3.15.0 and LibVEX; rerun with -h for copyright info
==10340== Command: ./a.out
==10340== 
==10340== Warning: set address range perms: large range [0x59c9d000, 0x159e9e000) (noaccess)
11.000000
13.000000
15.000000
17.000000
==10340== Mismatched free() / delete / delete []
==10340==    at 0x483CFBF: operator delete(void*) (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==10340==    by 0x542B8C0: rocr::amd::hsa::loader::AmdHsaCodeLoader::DestroyExecutable(rocr::amd::hsa::loader::Executable*) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==10340==    by 0x5407ECF: rocr::HSA::hsa_executable_destroy(hsa_executable_s) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==10340==    by 0x5159C13: roc::Program::~Program() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340==    by 0x515A606: roc::LightningProgram::~LightningProgram() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340==    by 0x5106B22: amd::Program::~Program() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340==    by 0x5106D28: amd::Program::~Program() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340==    by 0x51006C5: amd::ReferenceCountedObject::release() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340==    by 0x50DACB3: clReleaseProgram (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340==    by 0x1099A3: main (main.c:92)
==10340==  Address 0xe3634c0 is 0 bytes inside a block of size 42 alloc'd
==10340==    at 0x483B7F3: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==10340==    by 0x4B0750E: strdup (strdup.c:42)
==10340==    by 0x54348A3: rocr::amd::hsa::loader::ExecutableImpl::LoadCodeObject(hsa_agent_s, hsa_code_object_s, unsigned long, char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, hsa_loaded_code_object_s*) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==10340==    by 0x5434F52: rocr::amd::hsa::loader::ExecutableImpl::LoadCodeObject(hsa_agent_s, hsa_code_object_s, char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, hsa_loaded_code_object_s*) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==10340==    by 0x5408460: rocr::HSA::hsa_executable_load_agent_code_object(hsa_executable_s, hsa_agent_s, hsa_code_object_reader_s, char const*, hsa_loaded_code_object_s*) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==10340==    by 0x5159FBF: roc::LightningProgram::setKernels(amd::option::Options*, void*, unsigned long) (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340==    by 0x5153358: device::Program::linkImplLC(amd::option::Options*) (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340==    by 0x5153ABC: device::Program::build(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, char const*, amd::option::Options*) (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340==    by 0x5108F31: amd::Program::build(std::vector<amd::Device*, std::allocator<amd::Device*> > const&, char const*, void (*)(_cl_program*, void*), void*, bool, bool) (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340==    by 0x50DAF42: clBuildProgram (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340==    by 0x1096B5: main (main.c:51)
==10340== 
==10340== 
==10340== HEAP SUMMARY:
==10340==     in use at exit: 991,949 bytes in 4,653 blocks
==10340==   total heap usage: 576,804 allocs, 572,151 frees, 246,308,729 bytes allocated
==10340== 
==10340== LEAK SUMMARY:
==10340==    definitely lost: 67,584 bytes in 1 blocks
==10340==    indirectly lost: 0 bytes in 0 blocks
==10340==      possibly lost: 7,692 bytes in 66 blocks
==10340==    still reachable: 916,673 bytes in 4,586 blocks
==10340==         suppressed: 0 bytes in 0 blocks
==10340== Rerun with --leak-check=full to see details of leaked memory
==10340== 
==10340== For lists of detected and suppressed errors, rerun with: -s
==10340== ERROR SUMMARY: 1 errors from 1 contexts (suppressed: 0 from 0)

Edit: Output when running: Valgrind --leak-check=full ./a.out I omitted the beginning of the Valgrind output because its exactly the same as above. I also omitted most of the middle output because all the errors look essentially the same, and there are too many of them to fit in this post.

... Same output here as above ...


==4996== HEAP SUMMARY:
==4996==     in use at exit: 991,947 bytes in 4,653 blocks
==4996==   total heap usage: 575,705 allocs, 571,052 frees, 246,015,102 bytes allocated
==4996== 
==4996== 4 bytes in 1 blocks are possibly lost in loss record 7 of 2,212
==4996==    at 0x483B723: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996==    by 0x483E017: realloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996==    by 0x5B27783: _fmm_map_to_gpu.isra.8 (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5B28B57: fmm_map_to_gpu (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5B2950F: fmm_init_process_apertures (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5B2CD2B: hsaKmtOpenKFD (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5401368: rocr::AMD::Load() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x542440D: rocr::core::Runtime::Acquire() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x54027C9: rocr::HSA::hsa_init() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x5135C84: roc::Device::init() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==4996==    by 0x50F832E: amd::Device::init() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==4996==    by 0x5102525: amd::Runtime::init() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==4996== 
==4996== 4 bytes in 1 blocks are possibly lost in loss record 8 of 2,212
==4996==    at 0x483B723: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996==    by 0x483E017: realloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996==    by 0x5B27783: _fmm_map_to_gpu.isra.8 (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5B28B57: fmm_map_to_gpu (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5B2C520: hsaKmtMapMemoryToGPU (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5B2F29B: allocate_exec_aligned_memory_gpu (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5B25679: hsaKmtCreateEvent (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5414E62: rocr::core::InterruptSignal::CreateEvent(_HSA_EVENTTYPE, bool) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x541FD00: rocr::core::Runtime::BindVmFaultHandler() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x5424427: rocr::core::Runtime::Acquire() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x54027C9: rocr::HSA::hsa_init() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x5135C84: roc::Device::init() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)


... Lots of similar repeating errors here ...


==4996== 720 bytes in 3 blocks are possibly lost in loss record 2,165 of 2,212
==4996==    at 0x483B7F3: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996==    by 0x5B262DF: vm_create_and_init_object (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5B265EC: fmm_allocate_memory_object (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5B28219: fmm_allocate_host (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5B2B9FA: hsaKmtAllocMemory (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x53F0E31: rocr::AMD::MemoryRegion::Allocate(unsigned long&, unsigned int, void**) const (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x541C4C2: std::_Function_handler<void* (unsigned long, unsigned long, unsigned int), rocr::core::Runtime::RegisterAgent(rocr::core::Agent*)::{lambda(unsigned long, unsigned long, unsigned int)#1}>::_M_invoke(std::_Any_data const&, unsigned long&&, std::_Any_data const&, unsigned int&&) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x53DF22E: rocr::AMD::GpuAgent::AssembleShader(char const*, rocr::AMD::GpuAgent::AssembleTarget, void*&, unsigned long&) const (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x53D39B2: rocr::AMD::BlitKernel::Initialize(rocr::core::Agent const&) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x53DD58A: std::_Function_handler<rocr::core::Blit* (), rocr::AMD::GpuAgent::InitDma()::{lambda()#3}>::_M_invoke(std::_Any_data const&) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x53DD2F9: rocr::AMD::GpuAgent::DmaCopy(void*, void const*, unsigned long) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x53ED80A: rocr::(anonymous namespace)::RegionMemory::Freeze() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== 
==4996== 1,200 bytes in 5 blocks are possibly lost in loss record 2,173 of 2,212
==4996==    at 0x483B7F3: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996==    by 0x5B262DF: vm_create_and_init_object (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5B265EC: fmm_allocate_memory_object (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5B28219: fmm_allocate_host (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5B2B9FA: hsaKmtAllocMemory (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x53F0E31: rocr::AMD::MemoryRegion::Allocate(unsigned long&, unsigned int, void**) const (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x541C4C2: std::_Function_handler<void* (unsigned long, unsigned long, unsigned int), rocr::core::Runtime::RegisterAgent(rocr::core::Agent*)::{lambda(unsigned long, unsigned long, unsigned int)#1}>::_M_invoke(std::_Any_data const&, unsigned long&&, std::_Any_data const&, unsigned int&&) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x542B8D5: rocr::core::Shared<rocr::core::SharedSignal, rocr::core::SharedSignalPool_t>::Shared(rocr::core::SharedSignalPool_t*, int) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x542921E: rocr::core::LocalSignal::LocalSignal(long, bool) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x5414EE8: rocr::core::InterruptSignal::InterruptSignal(long, _HsaEvent*) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x5410E85: rocr::AMD::hsa_amd_signal_create(long, unsigned int, hsa_agent_s const*, unsigned long, hsa_signal_s*) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x513B727: roc::VirtualGPU::initPool(unsigned long, unsigned int) (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==4996== 
==4996== 67,584 bytes in 1 blocks are definitely lost in loss record 2,209 of 2,212
==4996==    at 0x483B7F3: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996==    by 0xA977E83: RegisterHandlers() (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996==    by 0xA9787EB: llvm::sys::RemoveFileOnSignal(llvm::StringRef, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >*) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996==    by 0x8F2BC0E: clang::CompilerInstance::createOutputFile(llvm::StringRef, std::error_code&, bool, bool, llvm::StringRef, llvm::StringRef, bool, bool, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >*) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996==    by 0x8F2BFD2: clang::CompilerInstance::createOutputFile(llvm::StringRef, bool, bool, llvm::StringRef, llvm::StringRef, bool, bool) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996==    by 0x8F2C39E: clang::CompilerInstance::createDefaultOutputFile(bool, llvm::StringRef, llvm::StringRef) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996==    by 0x77F3913: GetOutputStream(clang::CompilerInstance&, llvm::StringRef, clang::BackendAction) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996==    by 0x77F959A: clang::CodeGenAction::CreateASTConsumer(clang::CompilerInstance&, llvm::StringRef) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996==    by 0x8F6D175: clang::FrontendAction::CreateWrappedASTConsumer(clang::CompilerInstance&, llvm::StringRef) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996==    by 0x8F70865: clang::FrontendAction::BeginSourceFile(clang::CompilerInstance&, clang::FrontendInputFile const&) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996==    by 0x8F2F079: clang::CompilerInstance::ExecuteAction(clang::FrontendAction&) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996==    by 0x743641A: clang::ExecuteCompilerInvocation(clang::CompilerInstance*) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996== 
==4996== LEAK SUMMARY:
==4996==    definitely lost: 67,584 bytes in 1 blocks
==4996==    indirectly lost: 0 bytes in 0 blocks
==4996==      possibly lost: 7,692 bytes in 66 blocks
==4996==    still reachable: 916,671 bytes in 4,586 blocks
==4996==         suppressed: 0 bytes in 0 blocks
==4996== Reachable blocks (those to which a pointer was found) are not shown.
==4996== To see them, rerun with: --leak-check=full --show-leak-kinds=all
==4996== 
==4996== For lists of detected and suppressed errors, rerun with: -s
==4996== ERROR SUMMARY: 56 errors from 56 contexts (suppressed: 0 from 0)

Upvotes: 0

Views: 927

Answers (1)

Paul Floyd
Paul Floyd

Reputation: 6906

If you are using 'standard' libstdc++ and libc, then in your case, a mismatch strdup / delete (rather than strdup / free) should not pose a problem. This is because libstdc++ new simply calls malloc, and delete calls free. On the other hand, if ever you use a replacement delete that is not based on free, then your application will very likely crash horribly.

For the memory leaks, you need to run memcheck with --leak-check=full, as it says near the end of the Valgrind output.

Upvotes: 1

Related Questions