Reputation: 5679
I am using raw CUDA kernels in Python scripts.
In the MWE below, I have a very simple raw kernel that does nothing: I simply create a large array (around 2 GB) and pass it to the CUDA kernel.
MWE (Python - CuPy, not working):
import numpy as np
import cupy as cp
# custom raw kernel
custom_kernel = cp.RawKernel(r'''
extern "C" __global__
void custom_kernel(double* large_array)
{
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int frame = blockIdx.z * blockDim.z + threadIdx.z;
}
''', 'custom_kernel')
# launch kernel
large_array_gpu = cp.zeros((101*101*9*9*301), dtype=cp.float64) # around 2 GB
block_dim_2 = (32, 32, 1)
bx2 = int((101 * 101 + block_dim_2[0] - 1) / block_dim_2[0])
by2 = int((9 * 9 + block_dim_2[1] - 1) / block_dim_2[1])
bz2 = int((301 + block_dim_2[2] - 1) / block_dim_2[2])
grid_dim_2 = (bx2, by2, bz2)
custom_kernel(grid_dim_2, block_dim_2, large_array_gpu) # gets stuck at this statement, and RAM usage keeps increasing
large_array_cpu = cp.asnumpy(large_array_gpu)
print('done')
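As an aside, the grid-size computation in the MWE is the standard ceiling-division idiom. A small GPU-free sketch (the helper name grid_size is mine, not part of the MWE) showing that the integer-only form // gives the same result without going through floats:

```python
import math

def grid_size(n, block):
    """Number of blocks of size `block` needed to cover `n` elements (ceiling division)."""
    return (n + block - 1) // block  # integer-only, avoids float rounding

# Same dimensions as the MWE above: (101*101, 9*9, 301) with block (32, 32, 1)
assert grid_size(101 * 101, 32) == math.ceil(101 * 101 / 32)  # 319
assert grid_size(9 * 9, 32) == 3
assert grid_size(301, 1) == 301
```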
Problem: As soon as the kernel gets called at the line custom_kernel(grid_dim_2, block_dim_2, large_array_gpu), my RAM usage starts climbing toward the full 32 GB capacity (almost exponentially), and the kernel never finishes. As shown in the screenshot below, the GPU memory usage is around 2 GB (which was expected), but the CPU RAM usage keeps increasing. Just as a test, I wrote a C++ version of the program; it works fine and is quite fast (given below).
C++ Version (Working Fine):
#include <stdio.h>
// gpu
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
__global__ void test_kernel(double* large_array)
{
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int frame = blockIdx.z * blockDim.z + threadIdx.z;
    if (y < (9 * 9) && x < (101 * 101) && frame < 301)
    {
        int resultIdx = (frame * (101 * 101) * (9 * 9)) + (y * (101 * 101) + x);
        large_array[resultIdx] = 1.1;
    }
}
int main()
{
    printf("start...");
    cudaError_t cudaStatus;
    // device
    double* dev_largeArray = 0;
    // Memory allocations
    cudaStatus = cudaMalloc((void**)&dev_largeArray, 101 * 101 * 9 * 9 * 301 * sizeof(double));
    cudaMemset(dev_largeArray, 0, 101 * 101 * 9 * 9 * 301 * sizeof(double)); // initialize the result with zeros
    dim3 blockSize(32, 32, 1);
    int bx2 = ((101 * 101) + blockSize.x - 1) / blockSize.x;
    int by2 = ((9 * 9) + blockSize.y - 1) / blockSize.y;
    int bz2 = (301 + blockSize.z - 1) / blockSize.z;
    dim3 gridSize = dim3(bx2, by2, bz2);
    test_kernel<<<gridSize, blockSize>>>(dev_largeArray);
    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    }
    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel!\n", cudaStatus);
    }
    // Copy the results back to the host
    double* h_largeArray = new double[101 * 101 * 9 * 9 * 301];
    cudaStatus = cudaMemcpy(h_largeArray, dev_largeArray, 101 * 101 * 9 * 9 * 301 * sizeof(double), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
    }
    delete[] h_largeArray;
    cudaFree(dev_largeArray);
    return 0;
}
Upvotes: 3
Views: 374
There was a minor (but not so obvious) problem in the kernel launch syntax: the arguments for the kernel must be passed as a tuple. A one-element tuple in Python requires a trailing comma, so I had to add a comma after my first (and only) argument. The kernel launch parameter should therefore look like this: (large_array_gpu,) (mind the comma). Without the comma, (large_array_gpu) is just the array itself, not a tuple containing it.
Wrong Syntax:
# Launch kernel
custom_kernel(grid_dim_2, block_dim_2, (large_array_gpu))
Right Syntax:
# Launch kernel
custom_kernel(grid_dim_2, block_dim_2, (large_array_gpu, ))
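The root cause is plain Python semantics, not anything CuPy-specific: parentheses alone only group an expression, and it is the comma that creates a tuple. A GPU-free sketch (a plain list stands in for the CuPy array) makes the difference visible:

```python
x = [0.0] * 4          # stands in for large_array_gpu

not_a_tuple = (x)      # parentheses only group; this is still the list itself
one_tuple = (x,)       # the trailing comma makes a one-element tuple

print(type(not_a_tuple).__name__)  # list
print(type(one_tuple).__name__)    # tuple
assert not_a_tuple is x            # same object, no tuple was created
assert len(one_tuple) == 1 and one_tuple[0] is x
```

So with the "wrong syntax", the kernel receives the array where an argument tuple is expected, which is why the launch misbehaves.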