Galilean
Galilean

Reputation: 268

cudaMallocManaged for 2D and 3D array

If one wants to copy arrays from host to device, one normally uses cudaMalloc and cudaMemcpy. But to lessen the hassle, one can replace those two steps with a single cudaMallocManaged call, which makes life much simpler. The code looks like this (more or less):

// Converts each Celsius value to Kelvin: kelvin[i] = celsius[i] + 273.15.
// Expects a 1D launch covering at least N threads; surplus threads are guarded out.
// NOTE(review): N is a file-scope constant defined elsewhere — confirm it is in scope.
__global__ void convert(float kelvin[], float celsius[])  // can pass arrays in kernel
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        kelvin[i] = celsius[i] + 273.15f;  // 'f' suffix avoids a silent double-precision round trip
}

int main()
{
    // cudaMallocManaged REPLACES host malloc: the original code malloc'd first
    // and then overwrote those pointers with the managed allocations, leaking
    // the host blocks. One managed allocation per array is all that is needed.
    float *celsius = nullptr;
    float *kelvin  = nullptr;
    cudaMallocManaged(&celsius, N*sizeof(float));
    cudaMallocManaged(&kelvin,  N*sizeof(float));

// init celsius here

    // NOTE(review): a single block caps N at the per-block thread limit
    // (1024 on current hardware) — use a multi-block grid for larger N.
    dim3 blocksPerGrid(1, 1, 1);     // use only one block
    dim3 threadsPerBlock(N, 1, 1);   // use N threads in the block
    convert<<<blocksPerGrid, threadsPerBlock>>>(kelvin, celsius);
    cudaDeviceSynchronize();  // host must not touch managed memory until the kernel finishes

// Doing stuff with the output here

    // Managed allocations are released with cudaFree, not free().
    cudaFree(celsius);
    cudaFree(kelvin);
    return 0;
}

The previous example seems clear to me. But how does one do cudaMallocManaged for a 2D or 3D array? I've been trying

// Element-wise matrix addition, C = A + B, for N x N matrices.
// The array-of-array parameters decay to float (*)[N] pointers; expects a
// 2D launch whose grid covers at least N x N threads (extras are guarded out).
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
    const int row = blockDim.x * blockIdx.x + threadIdx.x;
    const int col = blockDim.y * blockIdx.y + threadIdx.y;
    const bool inBounds = (row < N) && (col < N);
    if (inBounds)
        C[row][col] = A[row][col] + B[row][col];
}

int main()
{   // I think 2D arrays can be passed as pointer to pointers
    // BUG(review): each malloc result is immediately overwritten by the
    // cudaMallocManaged call below, leaking the host allocation. A float**
    // is also not the same layout as a contiguous N*N block of floats.
    float **A = (float **)malloc(N*N*sizeof(float));
    float **B = (float **)malloc(N*N*sizeof(float));
    float **C = (float **)malloc(N*N*sizeof(float));
    cudaMallocManaged(&A, N*N*sizeof(float));
    cudaMallocManaged(&B, N*N*sizeof(float));
    cudaMallocManaged(&C, N*N*sizeof(float));


// ERROR (matrix_add.cu:22, "too many initializer values"): brace-initializer
// syntax is only valid in a declaration, not in an assignment to an existing
// pointer; A[N][N] here is a single (out-of-bounds) element access.
A[N][N]={{1,0,0},{0,1,0},{0,0,1}};
B[N][N]={{1,0,0},{0,1,0},{0,0,1}};
dim3 threadsPerBlock(16, 16);
// NOTE(review): integer division yields 0 blocks when N < 16, so the kernel
// would never launch even if the rest compiled.
dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
// ERROR (matrix_add.cu:25): float** does not convert to float (*)[3], which
// is what the float A[N][N] parameters decay to.
MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
//outputs and all

}

But it shows the following errors:

matrix_add.cu(22): error: too many initializer values

matrix_add.cu(25): error: argument of type "float **" is incompatible with parameter of type "float (*)[3]"

Your help is highly appreciated.

Upvotes: 1

Views: 3407

Answers (1)

talonmies
talonmies

Reputation: 72349

You got a lot wrong in your attempt, so much that it was faster to write a working version than list out all the individual problems in the code in your question. So here is a working version of what it appears you were trying to do:

#include <algorithm>
#include <iostream>

const int N = 3;

// C = A + B, element-wise, for N x N matrices.
// The array-of-array parameters decay to float (*)[N]; expects a 2D launch
// whose grid covers at least N x N threads.
__global__ void MatAdd(float A[][N], float B[][N], float C[][N])
{
    const int row = blockDim.x * blockIdx.x + threadIdx.x;
    const int col = blockDim.y * blockIdx.y + threadIdx.y;
    if (row >= N || col >= N)
        return;                 // guard: the launch may overshoot the matrix
    C[row][col] = A[row][col] + B[row][col];
}

int main()
{
    // Managed allocations are directly accessible from both host and device;
    // no explicit cudaMemcpy is needed. Each buffer holds N*N floats.
    float* A; cudaMallocManaged(&A, N*N*sizeof(float));
    float* B; cudaMallocManaged(&B, N*N*sizeof(float));
    float* C; cudaMallocManaged(&C, N*N*sizeof(float));

    const float A_vals[N][N]={{1,0,0},{0,1,0},{0,0,1}};
    const float B_vals[N][N]={{1,0,0},{0,1,0},{0,0,1}};
    // Casting the linear buffer to float (*)[N] enables [][] indexing.
    float (*C_vals)[N] = reinterpret_cast<float (*)[N]>(C);

    std::copy(&A_vals[0][0], &A_vals[0][0] + N*N, A);
    std::copy(&B_vals[0][0], &B_vals[0][0] + N*N, B);

    dim3 threadsPerBlock(16, 16);   // a single 16x16 block covers the 3x3 matrix
    dim3 numBlocks(1, 1);
    MatAdd<<<numBlocks, threadsPerBlock>>>( reinterpret_cast<float (*)[N]>(A),
                                            reinterpret_cast<float (*)[N]>(B),
                                            C_vals );

    // Launch-configuration errors surface via cudaGetLastError; asynchronous
    // execution errors surface at the synchronizing call. Check both, and
    // synchronize before the host reads C.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    for(int i=0; i<N; i++) {
        for(int j=0; j<N; j++) {
            std::cout << C_vals[i][j] << "  ";
        }
        std::cout << std::endl;
    }

    // Managed memory is released with cudaFree (not free/delete).
    cudaFree(A);
    cudaFree(B);
    cudaFree(C);
    return 0;
}

Some important points:

  1. Managed memory allocation replaces standard host memory allocation and produces memory which is directly accessible on both the host and the device.
  2. All arrays decay to a pointer when passed as arguments to a function by value. That decay is not recursive. See here for more details.
  3. You can (and will need to) cast in order to use the [][] access syntax on linear memory allocated dynamically at runtime (this applies to malloc, new, or any of the CUDA host memory allocation APIs. See here for more details).
  4. Initialization syntax and assignment syntax for arrays are not interchangeable.

All I can suggest is that you study it thoroughly until you understand how it works.

Upvotes: 2

Related Questions