Reputation: 1
I've written a really simple OpenCL program that just adds up two arrays. It works correctly, however if I run it with Valgrind I'm getting lots of memory leaks even after I release everything using clRelease*. I am also getting a Mismatched free() / delete / delete[]
Valgrind error when have clReleaseProgram(program)
in my source code. To be clear I'm getting memory leaks regardless of if I have that line or not, but that line also gives me the error.
I've included the main.c
and kernel.cl
source code files as well as the Valgrind output I get with the clReleaseProgram(program)
line in main.c
I don't know if this is relevant but I am using AMD's ROCM OpenCL implementation and am using version 1.2, and this is running on a Radeon RX 5700XT graphics card. OS is Ubuntu 20.04.
I'd really appreciate any advice!
// Simple test OpenCL program that adds two arrays
// Define OpenCL version
#define CL_TARGET_OPENCL_VERSION 120
#include <CL/cl.h>
#include <stdlib.h>
#include <stdio.h>
// Main program
int main(int argc, char **argv) {
// Create all three arrays needed
float arr1[] = {1.0, 2.0, 3.0, 4.0};
float arr2[] = {10.0, 11.0, 12.0, 13.0};
float arr3[4];
// Initialize platform id structure
cl_int err;
cl_platform_id platform;
err = clGetPlatformIDs(1, &platform, NULL);
// Initialize device id structure
cl_device_id device;
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
// Initialize context
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
// Determine file size of kernel source file
FILE *fp;
fp = fopen("kernel.cl", "r");
fseek(fp, 0, SEEK_END);
long int src_size = ftell(fp);
rewind(fp);
// Read kernel source file into buffer
char *buf = malloc(sizeof(char) * (src_size + 1));
fread(buf, sizeof(char), src_size, fp);
buf[src_size] = '\0';
// Create the program
cl_program program = clCreateProgramWithSource(context, 1, (const char **) &buf, NULL, &err);
free(buf);
// Build the program
const char options[] = "-cl-std=CL1.2 -D MULT=2";
err = clBuildProgram(program, 1, &device, options, NULL, NULL);
// Create command queue
cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
// Create kernel
const char kernel_name[] = "test_kernel";
cl_kernel kernel = clCreateKernel(program, kernel_name, &err);
// Create the memory objects
cl_mem arr1_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, 4 * sizeof(float), NULL, &err);
cl_mem arr2_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, 4 * sizeof(float), NULL, &err);
cl_mem arr3_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 4 * sizeof(float), NULL, &err);
// Write the input arrays into the input mem objects
err = clEnqueueWriteBuffer(queue, arr1_mem, CL_TRUE, 0, 4 * sizeof(float), arr1, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, arr2_mem, CL_TRUE, 0, 4 * sizeof(float), arr2, 0, NULL, NULL);
// Set the kernel arguments
err = clSetKernelArg(kernel, 0, sizeof(arr1_mem), &arr1_mem);
err = clSetKernelArg(kernel, 1, sizeof(arr2_mem), &arr2_mem);
err = clSetKernelArg(kernel, 2, sizeof(arr3_mem), &arr3_mem);
// Execute kernel
size_t gws[] = {4};
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, gws, NULL, 0, NULL, NULL);
// Read the output array into the output mem object
err = clEnqueueReadBuffer(queue, arr3_mem, CL_TRUE, 0, 4 * sizeof(float), arr3, 0, NULL, NULL);
// Print the output
for (int i = 0; i < 4; ++i) {
printf("%f\n", arr3[i]);
}
// Release all OpenCL objects
clReleaseMemObject(arr1_mem);
clReleaseMemObject(arr2_mem);
clReleaseMemObject(arr3_mem);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseProgram(program); // This line causes Mismatched free / delete[] / delete valgrind error
clReleaseContext(context);
return 0;
}
// Test OpenCL kernel
__kernel void test_kernel(__constant float *arr1, __constant float *arr2, __global float *arr3) {
// Get the index into the arrays
int index = get_global_id(0);
// Sum arr1 + arr2 and store in arr3
arr3[index] = arr1[index] + arr2[index];
}
ivy@ubuntu-main:~/cl_test$ valgrind ./a.out
==10340== Memcheck, a memory error detector
==10340== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al.
==10340== Using Valgrind-3.15.0 and LibVEX; rerun with -h for copyright info
==10340== Command: ./a.out
==10340==
==10340== Warning: set address range perms: large range [0x59c9d000, 0x159e9e000) (noaccess)
11.000000
13.000000
15.000000
17.000000
==10340== Mismatched free() / delete / delete []
==10340== at 0x483CFBF: operator delete(void*) (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==10340== by 0x542B8C0: rocr::amd::hsa::loader::AmdHsaCodeLoader::DestroyExecutable(rocr::amd::hsa::loader::Executable*) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==10340== by 0x5407ECF: rocr::HSA::hsa_executable_destroy(hsa_executable_s) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==10340== by 0x5159C13: roc::Program::~Program() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340== by 0x515A606: roc::LightningProgram::~LightningProgram() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340== by 0x5106B22: amd::Program::~Program() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340== by 0x5106D28: amd::Program::~Program() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340== by 0x51006C5: amd::ReferenceCountedObject::release() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340== by 0x50DACB3: clReleaseProgram (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340== by 0x1099A3: main (main.c:92)
==10340== Address 0xe3634c0 is 0 bytes inside a block of size 42 alloc'd
==10340== at 0x483B7F3: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==10340== by 0x4B0750E: strdup (strdup.c:42)
==10340== by 0x54348A3: rocr::amd::hsa::loader::ExecutableImpl::LoadCodeObject(hsa_agent_s, hsa_code_object_s, unsigned long, char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, hsa_loaded_code_object_s*) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==10340== by 0x5434F52: rocr::amd::hsa::loader::ExecutableImpl::LoadCodeObject(hsa_agent_s, hsa_code_object_s, char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, hsa_loaded_code_object_s*) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==10340== by 0x5408460: rocr::HSA::hsa_executable_load_agent_code_object(hsa_executable_s, hsa_agent_s, hsa_code_object_reader_s, char const*, hsa_loaded_code_object_s*) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==10340== by 0x5159FBF: roc::LightningProgram::setKernels(amd::option::Options*, void*, unsigned long) (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340== by 0x5153358: device::Program::linkImplLC(amd::option::Options*) (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340== by 0x5153ABC: device::Program::build(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, char const*, amd::option::Options*) (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340== by 0x5108F31: amd::Program::build(std::vector<amd::Device*, std::allocator<amd::Device*> > const&, char const*, void (*)(_cl_program*, void*), void*, bool, bool) (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340== by 0x50DAF42: clBuildProgram (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340== by 0x1096B5: main (main.c:51)
==10340==
==10340==
==10340== HEAP SUMMARY:
==10340== in use at exit: 991,949 bytes in 4,653 blocks
==10340== total heap usage: 576,804 allocs, 572,151 frees, 246,308,729 bytes allocated
==10340==
==10340== LEAK SUMMARY:
==10340== definitely lost: 67,584 bytes in 1 blocks
==10340== indirectly lost: 0 bytes in 0 blocks
==10340== possibly lost: 7,692 bytes in 66 blocks
==10340== still reachable: 916,673 bytes in 4,586 blocks
==10340== suppressed: 0 bytes in 0 blocks
==10340== Rerun with --leak-check=full to see details of leaked memory
==10340==
==10340== For lists of detected and suppressed errors, rerun with: -s
==10340== ERROR SUMMARY: 1 errors from 1 contexts (suppressed: 0 from 0)
Edit: Output when running: Valgrind --leak-check=full ./a.out
I omitted the beginning of the Valgrind
output because its exactly the same as above. I also omitted most of the middle output because all the errors look essentially the same, and there are too many of them to fit in this post.
... Same output here as above ...
==4996== HEAP SUMMARY:
==4996== in use at exit: 991,947 bytes in 4,653 blocks
==4996== total heap usage: 575,705 allocs, 571,052 frees, 246,015,102 bytes allocated
==4996==
==4996== 4 bytes in 1 blocks are possibly lost in loss record 7 of 2,212
==4996== at 0x483B723: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996== by 0x483E017: realloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996== by 0x5B27783: _fmm_map_to_gpu.isra.8 (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5B28B57: fmm_map_to_gpu (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5B2950F: fmm_init_process_apertures (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5B2CD2B: hsaKmtOpenKFD (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5401368: rocr::AMD::Load() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x542440D: rocr::core::Runtime::Acquire() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x54027C9: rocr::HSA::hsa_init() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x5135C84: roc::Device::init() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==4996== by 0x50F832E: amd::Device::init() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==4996== by 0x5102525: amd::Runtime::init() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==4996==
==4996== 4 bytes in 1 blocks are possibly lost in loss record 8 of 2,212
==4996== at 0x483B723: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996== by 0x483E017: realloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996== by 0x5B27783: _fmm_map_to_gpu.isra.8 (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5B28B57: fmm_map_to_gpu (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5B2C520: hsaKmtMapMemoryToGPU (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5B2F29B: allocate_exec_aligned_memory_gpu (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5B25679: hsaKmtCreateEvent (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5414E62: rocr::core::InterruptSignal::CreateEvent(_HSA_EVENTTYPE, bool) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x541FD00: rocr::core::Runtime::BindVmFaultHandler() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x5424427: rocr::core::Runtime::Acquire() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x54027C9: rocr::HSA::hsa_init() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x5135C84: roc::Device::init() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
... Lots of similar repeating errors here ...
==4996== 720 bytes in 3 blocks are possibly lost in loss record 2,165 of 2,212
==4996== at 0x483B7F3: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996== by 0x5B262DF: vm_create_and_init_object (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5B265EC: fmm_allocate_memory_object (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5B28219: fmm_allocate_host (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5B2B9FA: hsaKmtAllocMemory (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x53F0E31: rocr::AMD::MemoryRegion::Allocate(unsigned long&, unsigned int, void**) const (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x541C4C2: std::_Function_handler<void* (unsigned long, unsigned long, unsigned int), rocr::core::Runtime::RegisterAgent(rocr::core::Agent*)::{lambda(unsigned long, unsigned long, unsigned int)#1}>::_M_invoke(std::_Any_data const&, unsigned long&&, std::_Any_data const&, unsigned int&&) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x53DF22E: rocr::AMD::GpuAgent::AssembleShader(char const*, rocr::AMD::GpuAgent::AssembleTarget, void*&, unsigned long&) const (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x53D39B2: rocr::AMD::BlitKernel::Initialize(rocr::core::Agent const&) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x53DD58A: std::_Function_handler<rocr::core::Blit* (), rocr::AMD::GpuAgent::InitDma()::{lambda()#3}>::_M_invoke(std::_Any_data const&) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x53DD2F9: rocr::AMD::GpuAgent::DmaCopy(void*, void const*, unsigned long) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x53ED80A: rocr::(anonymous namespace)::RegionMemory::Freeze() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==
==4996== 1,200 bytes in 5 blocks are possibly lost in loss record 2,173 of 2,212
==4996== at 0x483B7F3: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996== by 0x5B262DF: vm_create_and_init_object (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5B265EC: fmm_allocate_memory_object (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5B28219: fmm_allocate_host (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5B2B9FA: hsaKmtAllocMemory (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x53F0E31: rocr::AMD::MemoryRegion::Allocate(unsigned long&, unsigned int, void**) const (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x541C4C2: std::_Function_handler<void* (unsigned long, unsigned long, unsigned int), rocr::core::Runtime::RegisterAgent(rocr::core::Agent*)::{lambda(unsigned long, unsigned long, unsigned int)#1}>::_M_invoke(std::_Any_data const&, unsigned long&&, std::_Any_data const&, unsigned int&&) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x542B8D5: rocr::core::Shared<rocr::core::SharedSignal, rocr::core::SharedSignalPool_t>::Shared(rocr::core::SharedSignalPool_t*, int) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x542921E: rocr::core::LocalSignal::LocalSignal(long, bool) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x5414EE8: rocr::core::InterruptSignal::InterruptSignal(long, _HsaEvent*) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x5410E85: rocr::AMD::hsa_amd_signal_create(long, unsigned int, hsa_agent_s const*, unsigned long, hsa_signal_s*) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x513B727: roc::VirtualGPU::initPool(unsigned long, unsigned int) (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==4996==
==4996== 67,584 bytes in 1 blocks are definitely lost in loss record 2,209 of 2,212
==4996== at 0x483B7F3: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996== by 0xA977E83: RegisterHandlers() (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996== by 0xA9787EB: llvm::sys::RemoveFileOnSignal(llvm::StringRef, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >*) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996== by 0x8F2BC0E: clang::CompilerInstance::createOutputFile(llvm::StringRef, std::error_code&, bool, bool, llvm::StringRef, llvm::StringRef, bool, bool, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >*) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996== by 0x8F2BFD2: clang::CompilerInstance::createOutputFile(llvm::StringRef, bool, bool, llvm::StringRef, llvm::StringRef, bool, bool) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996== by 0x8F2C39E: clang::CompilerInstance::createDefaultOutputFile(bool, llvm::StringRef, llvm::StringRef) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996== by 0x77F3913: GetOutputStream(clang::CompilerInstance&, llvm::StringRef, clang::BackendAction) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996== by 0x77F959A: clang::CodeGenAction::CreateASTConsumer(clang::CompilerInstance&, llvm::StringRef) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996== by 0x8F6D175: clang::FrontendAction::CreateWrappedASTConsumer(clang::CompilerInstance&, llvm::StringRef) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996== by 0x8F70865: clang::FrontendAction::BeginSourceFile(clang::CompilerInstance&, clang::FrontendInputFile const&) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996== by 0x8F2F079: clang::CompilerInstance::ExecuteAction(clang::FrontendAction&) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996== by 0x743641A: clang::ExecuteCompilerInvocation(clang::CompilerInstance*) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996==
==4996== LEAK SUMMARY:
==4996== definitely lost: 67,584 bytes in 1 blocks
==4996== indirectly lost: 0 bytes in 0 blocks
==4996== possibly lost: 7,692 bytes in 66 blocks
==4996== still reachable: 916,671 bytes in 4,586 blocks
==4996== suppressed: 0 bytes in 0 blocks
==4996== Reachable blocks (those to which a pointer was found) are not shown.
==4996== To see them, rerun with: --leak-check=full --show-leak-kinds=all
==4996==
==4996== For lists of detected and suppressed errors, rerun with: -s
==4996== ERROR SUMMARY: 56 errors from 56 contexts (suppressed: 0 from 0)
Upvotes: 0
Views: 927
Reputation: 6906
If you are using 'standard' libstdc++ and libc, then in your case, a mismatch strdup / delete (rather than strdup / free) should not pose a problem. This is because libstdc++ new
simply calls malloc
, and delete
calls free
. On the other hand, if ever you use a replacement delete
that is not based on free
, then your application will very likely crash horribly.
For the memory leaks, you need to run memcheck
with --leak-check=full
, as it says near the end of the Valgrind output.
Upvotes: 1