Mojtaba Valizadeh

Reputation: 766

Can we use dynamic allocation for an array of arrays in CUDA C++?

I have a very big array in device memory, and I need to partition it into smaller parts. I wondered whether I could use an array of arrays to access the parts by index.

I tried the following code. It compiles and runs without reporting any error, but it returns rubbish, which I think is due to undefined behavior. I don't know whether this approach is possible at all.

#include <stdio.h>
#include <assert.h>
#include <iostream>
#include <cuda_runtime.h>

// Convenience wrapper to check CUDA runtime API results (active in debug builds only).
inline
cudaError_t checkCuda(cudaError_t result) {
#if defined(DEBUG) || defined(_DEBUG)
  if (result != cudaSuccess) {
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
  }
#endif
  return result;
}

// Fill each of the three sub-arrays of length N with a distinct value (1, 2, 3).
__global__ void cudaVectorFill(int **array, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        array[0][i] = 1;
    else if (i < 2 * N)
        array[1][i - N] = 2;
    else if (i < 3 * N)
        array[2][i - 2 * N] = 3;
}

int main() {

    int N = 100000000;

    // Host-side array holding three device pointers.
    int **array = new int*[3];
 
    checkCuda( cudaMalloc(&array[0], N * sizeof(int)) );
    checkCuda( cudaMalloc(&array[1], N * sizeof(int)) );
    checkCuda( cudaMalloc(&array[2], N * sizeof(int)) );
 
    cudaVectorFill<<<(3 * N + 1023) / 1024, 1024>>>(array, N);

    checkCuda( cudaPeekAtLastError() );
 
    auto *host_array0 = new int[1];
    auto *host_array1 = new int[1];
    auto *host_array2 = new int[1];
 
    checkCuda( cudaMemcpy(host_array0, array[0], 1 * sizeof(int), cudaMemcpyDeviceToHost) );
    checkCuda( cudaMemcpy(host_array1, array[1], 1 * sizeof(int), cudaMemcpyDeviceToHost) );
    checkCuda( cudaMemcpy(host_array2, array[2], 1 * sizeof(int), cudaMemcpyDeviceToHost) );
 
    std::cout << *host_array0 << std::endl << *host_array1 << std::endl << *host_array2 << std::endl;

    return 0;
}

The output is something like:

707093096
707093104
707093112

The correct output should be:

1
2
3

Upvotes: 1

Views: 676

Answers (2)

Mojtaba Valizadeh

Reputation: 766

FYI, I just found another approach for 2D allocation in device memory: see method 3 in this example for more information. We can use something like:

int N = 100000000;

// Allocate the pointer array itself with cudaMallocManaged so that it is
// accessible from both host and device, then allocate each sub-array.
int **array;
checkCuda( cudaMallocManaged(&array, 3 * sizeof(int *)) );
checkCuda( cudaMallocManaged(&array[0], N * sizeof(int)) );
checkCuda( cudaMallocManaged(&array[1], N * sizeof(int)) );
checkCuda( cudaMallocManaged(&array[2], N * sizeof(int)) );

cudaVectorFill<<<(3 * N + 1023) / 1024, 1024>>>(array, N);

This also worked fine.
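One caveat worth noting with managed memory: kernel launches are asynchronous, so the host should synchronize before reading the results. A minimal verification-and-cleanup sketch, reusing the checkCuda helper and the kernel from the question (the spot check and cleanup are my addition, not part of the original answer):

// Kernel launches are asynchronous: synchronize before touching managed
// memory from the host (mandatory on pre-Pascal GPUs, good practice everywhere).
checkCuda( cudaDeviceSynchronize() );

// Managed memory is directly readable from the host, so a spot check is easy.
std::cout << array[0][0] << std::endl
          << array[1][0] << std::endl
          << array[2][0] << std::endl;

// Free each sub-array, then the pointer array itself.
for (int i = 0; i < 3; i++)
    checkCuda( cudaFree(array[i]) );
checkCuda( cudaFree(array) );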

Upvotes: 1

talonmies

Reputation: 72349

As noted in the comments, if you are passing pointers to a GPU kernel, they have to be accessible to the GPU. That means you must either explicitly allocate a copy of the host array of device pointers and populate it on the device, or rely on managed or otherwise GPU-accessible host memory.

One approach that will probably work in this case is:

int N = 100000000;

int **array = new int*[3];
 
checkCuda( cudaMalloc(&array[0], N * sizeof(int)) );
checkCuda( cudaMalloc(&array[1], N * sizeof(int)) );
checkCuda( cudaMalloc(&array[2], N * sizeof(int)) );

// Device-side copy of the pointer array: allocate it on the GPU and
// copy the three device pointers into it.
int **array_d;
checkCuda( cudaMalloc(&array_d, 3 * sizeof(int*)) );
checkCuda( cudaMemcpy(array_d, array, 3 * sizeof(int*), cudaMemcpyHostToDevice) );
 
cudaVectorFill<<<(3 * N + 1023) / 1024, 1024>>>(array_d, N);

[Standard disclaimer, code written in browser, no guarantees implied or given, use at own risk]

i.e. after building array in host memory, make a copy of it in GPU memory and pass that GPU copy to your kernel. There might be other problems in your code; I haven't analyzed it further than the first six lines.
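For completeness, a sketch of reading back and releasing everything afterwards, under the same no-guarantees disclaimer. The entries of the host-side array are ordinary device pointers, so they can still be used directly as cudaMemcpy source arguments (the loop and cleanup are my addition):

int host_value;

// Each array[i] is a device pointer, usable as a cudaMemcpy source.
for (int i = 0; i < 3; i++) {
    checkCuda( cudaMemcpy(&host_value, array[i], sizeof(int),
                          cudaMemcpyDeviceToHost) );
    std::cout << host_value << std::endl;   // expected: 1, 2, 3
}

// Cleanup: the sub-arrays, the device pointer array, and the host array.
for (int i = 0; i < 3; i++)
    checkCuda( cudaFree(array[i]) );
checkCuda( cudaFree(array_d) );
delete[] array;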

Upvotes: 3
