Copying a 2D malloc pitched device memory to a 3D array in the device

Question

After executing a kernel (reorder_raw) that writes an updated device buffer (d_ordered in the code below), I want to do some layered interpolation in another kernel. I understand that I must create a 3D array for this purpose and then transfer my device memory into that array (d_ordered_array). However, when I call cudaMemcpy2DToArray, I get an "invalid memory" error reported by the mexPrintf(cudaGetErrorString(cudaGetLastError())); call at the end of the code.

If I comment out the cudaMemcpy2DToArray call, I get no errors.

// Question code: reorders the raw channel data on the GPU and then tries to
// stage the reordered, pitched buffer into a layered 3D cudaArray.
//
//   h_raw      host buffer holding samples*channels*scanlines shorts
//   d_ordered  device output buffer of reorder_raw; allocated by the caller
//              with cudaMallocPitch (row pitch in_pitch, in bytes)
//   samples, channels, scanlines, elements
//              data dimensions, forwarded to reorder_raw
//   in_pitch   row pitch of d_ordered in bytes
//
// d_delay, pitch, speed_sound, sample_freq, delay_offset and out_pitch are
// not used in this function.
//
// NOTE: the cudaMemcpy2DToArray call below is the one that reports the error
// described in the question; it is kept as posted.
void delay_US_linear(
    short *h_raw, short *d_ordered, float *d_delay,
    int samples, int channels, int scanlines, int elements,
    float pitch, float speed_sound, float sample_freq, float delay_offset,
    size_t in_pitch, size_t out_pitch
){
    // Allocate the GPU raw data buffer and upload the host data
    short *d_raw;
    cudaMalloc((void**)& d_raw, sizeof(short)*samples*channels*scanlines);
    cudaMemcpy(d_raw, h_raw, sizeof(short)*samples*channels*scanlines, cudaMemcpyHostToDevice);

    // Grid dimensions: ceil(samples / order_X) blocks in x, one block row per scanline
    int griddim_x = (samples + order_X - 1) / order_X;
    int griddim_y = (scanlines);
    int griddim_z = 1;

    dim3 dimGrid(griddim_x, griddim_y, griddim_z);
    dim3 dimBlock(order_X, order_Y, order_Z);

    // Dynamic shared memory: one short per thread in the block
    int shared_size = order_X * order_Y * order_Z * sizeof(short);

    // Only need to change the channel order, independency in axial and scanline dimension.
    // The pitch is handed to the kernel in units of short elements, not bytes.
    reorder_raw<<<dimGrid, dimBlock, shared_size>>>(
        d_raw, d_ordered, samples, channels, scanlines, elements, in_pitch/sizeof(short));

    cudaDeviceSynchronize();

    // Create a layered 3D array of 16-bit signed elements
    cudaArray *d_ordered_array;
    cudaChannelFormatDesc  desc = cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindSigned);
    cudaMalloc3DArray(&d_ordered_array, &desc, make_cudaExtent(samples, channels, scanlines),
    cudaArrayLayered);

    // Copy device memory to the 3D array.
    // This 2D copy is the call that triggers the reported error; the accepted
    // answer replaces it with cudaMemcpy3D, which matches a 3D cudaArray.
    cudaMemcpy2DToArray(d_ordered_array, 0, 0, d_ordered, in_pitch, sizeof(short)*samples,
    channels*scanlines,cudaMemcpyDeviceToDevice);

    cudaFreeArray(d_ordered_array);
    cudaFree(d_raw);

    // Print the last error recorded by the CUDA runtime
    mexPrintf(cudaGetErrorString(cudaGetLastError()));
}

For reference, the d_ordered device pointer refers to 2D pitched memory that was previously allocated as follows:

// Pitched 2D allocation for d_ordered: every row holds `samples` shorts and
// there is one row per (channel, scanline) pair. The runtime reports the
// actual row pitch, in bytes, through in_pitch.
size_t in_pitch;
cudaMallocPitch(
    (void**)&d_ordered,
    &in_pitch,
    sizeof(short) * samples,   // row width in bytes
    channels * scanlines);     // number of rows

Robert Crovella · Accepted Answer

If you use cudaMalloc3D instead of cudaMallocPitch for your allocation of d_ordered, and a cudaMemcpy3D operation instead of cudaMemcpy2DToArray, you will be able to get this to work. These are the calls that match your 3D cudaArray. Here is an example:

$ cat t1733.cu
#include <iostream>

// Copies a pitched 3D device allocation into a newly created layered 3D
// cudaArray, releases the array again, and prints the resulting CUDA error
// string ("no error" on success).
//
//   d_ordered                     pitched device memory (cudaPitchedPtr) used
//                                 as the copy source
//   samples, channels, scanlines  array extent in elements
//                                 (width, height, depth)
//
// All other parameters are kept so the signature mirrors the question's
// function; they are not used here.
void delay_US_linear(
    short *h_raw, cudaPitchedPtr d_ordered, float *d_delay,
    int samples, int channels, int scanlines, int elements,
    float pitch, float speed_sound, float sample_freq, float delay_offset,
    size_t in_pitch, size_t out_pitch
){
    // Create a layered 3D array of short elements.
    // Initialise the handle so a failed allocation never leaves it dangling.
    cudaArray *d_ordered_array = NULL;
    cudaChannelFormatDesc  desc = cudaCreateChannelDesc<short>();
    // For a cudaArray the extent is expressed in elements, not bytes.
    cudaExtent my_ext = make_cudaExtent(samples, channels, scanlines);
    cudaError_t err = cudaMalloc3DArray(&d_ordered_array, &desc, my_ext, cudaArrayLayered);

    // Copy device memory to the 3D array (only if the array exists)
    if (err == cudaSuccess) {
        cudaMemcpy3DParms p = {0};
        p.srcPtr = d_ordered;
        p.dstArray = d_ordered_array;
        p.extent = my_ext;
        p.kind = cudaMemcpyDeviceToDevice;
        err = cudaMemcpy3D(&p);
    }

    // Always release the array; keep the first error that occurred.
    if (d_ordered_array != NULL) {
        cudaError_t free_err = cudaFreeArray(d_ordered_array);
        if (err == cudaSuccess) err = free_err;
    }

    // cudaGetLastError() also clears the runtime's error state, as the
    // original code did; fall back to it if no call above reported a failure.
    cudaError_t last_err = cudaGetLastError();
    if (err == cudaSuccess) err = last_err;

    std::cout << cudaGetErrorString(err) << std::endl;
}


// Test driver: allocates pitched 3D device memory with the dimensions below
// and runs delay_US_linear on it. Returns 1 if the allocation fails.
int main(){
  const int samples = 4864;
  const int channels = 64;
  const int scanlines = 128;
  cudaPitchedPtr d_ordered;
  // The remaining arguments are placeholders; delay_US_linear does not use them.
  size_t in_pitch=0, out_pitch = 0;
  short *h_raw = NULL;
  float *d_delay = NULL;
  const int elements = 0;
  float pitch = 0;
  float speed_sound = 0;
  float sample_freq = 0;
  float delay_offset = 0;

  // For cudaMalloc3D the extent width is given in bytes, while height and
  // depth are given in elements.
  cudaExtent my_ext = make_cudaExtent(samples*sizeof(short), channels, scanlines);
  cudaError_t err = cudaMalloc3D(&d_ordered, my_ext);
  if (err != cudaSuccess) {
    std::cout << cudaGetErrorString(err) << std::endl;
    return 1;
  }
  // Replaced allocation from the question, kept for comparison:
//  cudaMallocPitch((void**) &d_ordered,&in_pitch,sizeof(short)*samples,channels*scanlines);
  delay_US_linear(h_raw, d_ordered, d_delay, samples, channels, scanlines, elements,
    pitch, speed_sound, sample_freq, delay_offset, in_pitch, out_pitch);

  // Release the pitched allocation before exiting.
  cudaFree(d_ordered.ptr);
  return 0;
}
$ nvcc -o t1733 t1733.cu
$ cuda-memcheck ./t1733
========= CUDA-MEMCHECK
no error
========= ERROR SUMMARY: 0 errors
$

Copying a 2D malloc pitched device memory to a 3D array in the device

Answers (1)

Related Questions