Jakub Mitura
Jakub Mitura

Reputation: 83

How to copy the subsection of the 3 dimensional array in CUDA C++

I followed the example in "Using cudaMemcpy3D to transfer *** pointer". However, my task is to copy a 3D subsection of one device global-memory array into another device global-memory array, for example:

Nx =10;
Ny=10;
Nz = 10;
struct cudaPitchedPtr sourceTensor;
cudaMalloc3D(&sourceTensor, make_cudaExtent(Nx * sizeof(int), Ny, Nz))
... // here I am populating sourceTensor with some Data
NxTarget = 5;
NyTarget = 5;
NzTarget = 5;
struct cudaPitchedPtr targetTensor;
cudaMalloc3D(&targetTensor, make_cudaExtent(NxTarget* sizeof(int), NyTarget, NzTarget))

// here I get lost ...
cudaMemcpy3DParms cpy = { 0 };
cpy.srcPtr = make_cudaPitchedPtr(sourceTensor[0][0], Nx * sizeof(int), Nx, Ny); // How to make it start in chosen location like for example 1,2,3
cpy.dstPtr = targetTensor;
cpy.extent = make_cudaExtent(NxTarget * sizeof(int), NyTarget , NzTarget );
cpy.kind = cudaMemcpyDeviceToDevice;
cudaMemcpy3D(&cpy);

In the code above, I am looking for a way to copy from sourceTensor to targetTensor all the data where

x indices are in range (1,6)

y indices are in range (2,7)

z indices are in range (3,8)

That is, only a subsection of the source array should be copied, but I do not know how to define make_cudaPitchedPtr and make_cudaExtent properly in order to achieve this.

Upvotes: 0

Views: 401

Answers (1)

Robert Crovella
Robert Crovella

Reputation: 152249

The srcPos parameter in your cudaMemcpy3DParms should make this pretty easy. Here is an example:

$ cat t1957.cu
#include <cstdio>

// Index type: used for tensor dimensions, offsets and loop counters.
typedef int it;
// Data type: the element type stored in the pitched tensors.
typedef int dt;

// Fills a pitched 3D tensor so that element (x, y, z) holds z*100 + y*10 + x.
//
// Parameters:
//   sourceTensor - pitched device allocation to fill; rows are `pitch` bytes
//                  apart and slices are Ny rows apart.
//   Nx, Ny, Nz   - tensor dimensions in elements (x is the fastest dimension).
//
// Launch layout: any 1D/2D/3D grid and block shape is valid. Each dimension is
// covered by a grid-stride loop, so a <<<1,1>>> launch visits every element
// sequentially in the same order as a plain triple loop, while larger launches
// split the work between threads instead of repeating it.
__global__ void populate_kernel(struct cudaPitchedPtr sourceTensor, it Nx, it Ny, it Nz) {

  const it strideX = (it)(gridDim.x * blockDim.x);
  const it strideY = (it)(gridDim.y * blockDim.y);
  const it strideZ = (it)(gridDim.z * blockDim.z);

  for (it z = (it)(blockIdx.z * blockDim.z + threadIdx.z); z < Nz; z += strideZ)
    for (it y = (it)(blockIdx.y * blockDim.y + threadIdx.y); y < Ny; y += strideY) {
      // Row start is invariant over x. The row index is widened to size_t
      // before scaling by the pitch so large tensors cannot overflow int.
      char *row = (char *)sourceTensor.ptr + sourceTensor.pitch * ((size_t)z * Ny + y);
      for (it x = (it)(blockIdx.x * blockDim.x + threadIdx.x); x < Nx; x += strideX)
        ((dt *)row)[x] = z*100+y*10+x;
      }
}

// Checks that a pitched 3D tensor holds the sub-volume of a source tensor whose
// element (x, y, z) was set to z*100 + y*10 + x, taken at the given offsets.
//
// Parameters:
//   targetTensor                     - pitched device allocation to verify;
//                                      rows are `pitch` bytes apart and slices
//                                      are NyTarget rows apart.
//   NxTarget, NyTarget, NzTarget     - target dimensions in elements.
//   NxOffset, NyOffset, NzOffset     - position of the target's element
//                                      (0, 0, 0) inside the source, in elements.
//
// Prints one line per mismatching element and nothing when all data matches.
// With offsets (1, 2, 3) the expected value of element (0, 0, 0) is 321.
//
// Launch layout: any grid/block shape is valid (grid-stride loops in each
// dimension); <<<1,1>>> checks every element sequentially.
__global__ void verify_kernel(struct cudaPitchedPtr targetTensor, it NxTarget, it NyTarget, it NzTarget, it NxOffset, it NyOffset, it NzOffset) {

  const it strideX = (it)(gridDim.x * blockDim.x);
  const it strideY = (it)(gridDim.y * blockDim.y);
  const it strideZ = (it)(gridDim.z * blockDim.z);

  for (it z = (it)(blockIdx.z * blockDim.z + threadIdx.z); z < NzTarget; z += strideZ)
    for (it y = (it)(blockIdx.y * blockDim.y + threadIdx.y); y < NyTarget; y += strideY) {
      // Rows must be addressed through the pitch: they are padded, not packed.
      const char *row = (const char *)targetTensor.ptr + targetTensor.pitch * ((size_t)z * NyTarget + y);
      for (it x = (it)(blockIdx.x * blockDim.x + threadIdx.x); x < NxTarget; x += strideX) {
        // Value the source element at (x+NxOffset, y+NyOffset, z+NzOffset) holds.
        const dt expected = (z + NzOffset)*100 + (y + NyOffset)*10 + (x + NxOffset);
        const dt actual = ((const dt *)row)[x];
        if (actual != expected) {
          printf("mismatch at (x=%d, y=%d, z=%d): got %d, expected %d\n",
                 (int)x, (int)y, (int)z, (int)actual, (int)expected);
          }
        }
      }
}


// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is an error, false when it is cudaSuccess.
static bool reportCudaError(cudaError_t err, const char *what) {
  if (err == cudaSuccess) return false;
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
  return true;
}

// Demonstrates copying a 5x5x5 sub-volume, starting at element (1, 2, 3), out
// of a 10x10x10 pitched device tensor into a second pitched device tensor
// using cudaMemcpy3D with srcPos, then verifies the result on the device.
// Returns 0 on success and 1 if any CUDA call fails.
int main(){

  it Nx =10;
  it Ny=10;
  it Nz = 10;
  it NxTarget = 5;
  it NyTarget = 5;
  it NzTarget = 5;
  it NxOffset = 1;
  it NyOffset = 2;
  it NzOffset = 3;

  // Zero-initialised so the cudaFree calls at the end are safe no-ops when an
  // allocation never happened.
  struct cudaPitchedPtr sourceTensor = { 0 };
  struct cudaPitchedPtr targetTensor = { 0 };

  // The x extent of a non-cudaArray allocation/copy is expressed in bytes.
  bool ok = !reportCudaError(
      cudaMalloc3D(&sourceTensor, make_cudaExtent(Nx * sizeof(dt), Ny, Nz)),
      "cudaMalloc3D(sourceTensor)");

  if (ok) {
    populate_kernel<<<1,1>>>(sourceTensor, Nx, Ny, Nz);
    // Kernel launches return nothing; launch errors surface via cudaGetLastError.
    ok = !reportCudaError(cudaGetLastError(), "populate_kernel launch");
  }

  if (ok) {
    ok = !reportCudaError(
        cudaMalloc3D(&targetTensor, make_cudaExtent(NxTarget * sizeof(dt), NyTarget, NzTarget)),
        "cudaMalloc3D(targetTensor)");
  }

  if (ok) {
    cudaMemcpy3DParms cpy = { 0 };
    // srcPos selects where the copied sub-volume starts inside the source;
    // x is in bytes, y and z are in rows and slices. dstPos stays at (0,0,0).
    cpy.srcPos = make_cudaPos(NxOffset*sizeof(dt), NyOffset, NzOffset);
    cpy.srcPtr = sourceTensor;
    cpy.dstPtr = targetTensor;
    cpy.extent = make_cudaExtent(NxTarget * sizeof(dt), NyTarget , NzTarget );
    cpy.kind = cudaMemcpyDeviceToDevice;
    ok = !reportCudaError(cudaMemcpy3D(&cpy), "cudaMemcpy3D");
  }

  if (ok) {
    verify_kernel<<<1,1>>>(targetTensor, NxTarget, NyTarget, NzTarget, NxOffset, NyOffset, NzOffset);
    ok = !reportCudaError(cudaGetLastError(), "verify_kernel launch");
  }

  if (ok) {
    // Waits for the kernels, flushes device printf output and surfaces any
    // asynchronous execution error.
    ok = !reportCudaError(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
  }

  // Release both tensors on every path.
  if (reportCudaError(cudaFree(targetTensor.ptr), "cudaFree(targetTensor)")) ok = false;
  if (reportCudaError(cudaFree(sourceTensor.ptr), "cudaFree(sourceTensor)")) ok = false;

  return ok ? 0 : 1;
}
$ nvcc -o t1957 t1957.cu
$ cuda-memcheck ./t1957
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$

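Note that when neither the source nor the destination is specified as a cudaArray type, the element size is always assumed to be unsigned char (i.e. 1 byte). This is why the x components of srcPos and extent are given in bytes (multiplied by sizeof(dt)), while the y and z components are plain row and slice counts.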

Upvotes: 2

Related Questions