Reputation: 83
I followed the example in "Using cudaMemcpy3D to transfer *** pointer". However, my task is to copy a 3D subsection of one array in device global memory into another array in device global memory, for example:
// Source volume dimensions, counted in int elements.
Nx =10;
Ny=10;
Nz = 10;
struct cudaPitchedPtr sourceTensor;
// Allocate the source volume; cudaMalloc3D fills in sourceTensor.
// The extent width is passed in bytes (Nx * sizeof(int)), height and depth in elements.
cudaMalloc3D(&sourceTensor, make_cudaExtent(Nx * sizeof(int), Ny, Nz))
... // here I am populating sourceTensor with some Data
// Target (sub-volume) dimensions, counted in int elements.
NxTarget = 5;
NyTarget = 5;
NzTarget = 5;
struct cudaPitchedPtr targetTensor;
// Allocate the destination volume, again with the width expressed in bytes.
cudaMalloc3D(&targetTensor, make_cudaExtent(NxTarget* sizeof(int), NyTarget, NzTarget))
// here I get lost ...
// Zero-initialise the copy descriptor so every field not set below is 0.
cudaMemcpy3DParms cpy = { 0 };
// NOTE(review): sourceTensor is declared above as a cudaPitchedPtr struct, so
// sourceTensor[0][0] is not a valid expression here. A start offset is normally
// expressed through cpy.srcPos (make_cudaPos) while srcPtr stays the pitched
// pointer returned by cudaMalloc3D — confirm against the cudaMemcpy3D documentation.
cpy.srcPtr = make_cudaPitchedPtr(sourceTensor[0][0], Nx * sizeof(int), Nx, Ny); // How to make it start in chosen location like for example 1,2,3
cpy.dstPtr = targetTensor;
// Size of the region to copy: width in bytes, height and depth in elements.
cpy.extent = make_cudaExtent(NxTarget * sizeof(int), NyTarget , NzTarget );
cpy.kind = cudaMemcpyDeviceToDevice;
cudaMemcpy3D(&cpy);
So, in the code above, I am looking for a way to copy from sourceTensor to targetTensor all the data where
x indices are in range (1,6)
y indices are in range (2,7)
z indices are in range (3,8)
So I want only a subsection of the source array, but I do not know how to define make_cudaPitchedPtr and make_cudaExtent properly in order to achieve my goal.
Upvotes: 0
Views: 401
Reputation: 152249
The srcPos member of your cudaMemcpy3DParms struct should make this pretty easy. Here is an example:
$ cat t1957.cu
#include <cstdio>
typedef int it; // index type
typedef int dt; // data type
// Fills a pitched Nx x Ny x Nz volume of dt so that element (x, y, z) holds
// z*100 + y*10 + x, which makes every element's coordinates readable from its value.
//
// sourceTensor : pitched allocation to write; rows are addressed as
//                ptr + pitch * (z*Ny + y), i.e. Ny rows per z-slice.
// Nx, Ny, Nz   : volume dimensions in elements.
//
// Every thread that runs this kernel walks the whole volume serially; the
// caller in this file launches it as <<<1,1>>>.
__global__ void populate_kernel(struct cudaPitchedPtr sourceTensor, it Nx, it Ny, it Nz) {
  char *const base = (char *)sourceTensor.ptr;
  for (it slice = 0; slice < Nz; ++slice) {
    for (it row = 0; row < Ny; ++row) {
      // Start of this row in bytes: rows are pitch bytes apart, not Nx*sizeof(dt).
      dt *rowData = (dt *)(base + sourceTensor.pitch * (slice * Ny + row));
      for (it col = 0; col < Nx; ++col) {
        rowData[col] = slice * 100 + row * 10 + col;
      }
    }
  }
}
// Verifies that targetTensor holds the sub-volume of the source that starts at
// (NxOffset, NyOffset, NzOffset).
//
// The source is expected to have been filled with the encoding
// value(x, y, z) = z*100 + y*10 + x, so target element (x, y, z) must equal
// (z + NzOffset)*100 + (y + NyOffset)*10 + (x + NxOffset). With offsets (1, 2, 3)
// the first element is therefore 321.
//
// targetTensor                  : pitched allocation to check; rows are addressed
//                                 as ptr + pitch * (z*NyTarget + y).
// NxTarget, NyTarget, NzTarget  : target dimensions in elements.
// NxOffset, NyOffset, NzOffset  : element offsets of the sub-volume in the source.
//
// Prints one line per mismatching element and nothing when the copy is correct.
// Every thread walks the whole target serially, so launch it as <<<1,1>>> to
// avoid duplicated output.
__global__ void verify_kernel(struct cudaPitchedPtr targetTensor, it NxTarget, it NyTarget, it NzTarget, it NxOffset, it NyOffset, it NzOffset) {
  const char *base = (const char *)targetTensor.ptr;
  for (it z = 0; z < NzTarget; z++) {
    for (it y = 0; y < NyTarget; y++) {
      // Rows are pitch bytes apart, which may exceed NxTarget*sizeof(dt).
      const dt *row = (const dt *)(base + targetTensor.pitch * (z * NyTarget + y));
      for (it x = 0; x < NxTarget; x++) {
        const dt expected = (z + NzOffset) * 100 + (y + NyOffset) * 10 + (x + NxOffset);
        if (row[x] != expected) {
          printf("mismatch at (%d,%d,%d): got %d, expected %d\n", x, y, z, row[x], expected);
        }
      }
    }
  }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
  if (status == cudaSuccess) return true;
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Demonstrates copying a 3D sub-volume between two pitched device allocations
// with cudaMemcpy3D, using srcPos to select where the sub-volume starts.
//
// Returns 0 on success and 1 as soon as any CUDA call or kernel launch fails.
// On an early failure return the device allocations are left for process
// teardown to release.
int main(){
  // Source volume dimensions, in elements.
  it Nx = 10;
  it Ny = 10;
  it Nz = 10;
  struct cudaPitchedPtr sourceTensor;
  // Extent width is in bytes; height and depth are in elements.
  if (!cudaOk(cudaMalloc3D(&sourceTensor, make_cudaExtent(Nx * sizeof(dt), Ny, Nz)),
              "cudaMalloc3D(source)")) return 1;
  populate_kernel<<<1,1>>>(sourceTensor, Nx, Ny, Nz);
  // Kernel launches return nothing; launch errors surface via cudaGetLastError.
  if (!cudaOk(cudaGetLastError(), "populate_kernel launch")) return 1;

  // Target (sub-volume) dimensions, in elements.
  it NxTarget = 5;
  it NyTarget = 5;
  it NzTarget = 5;
  struct cudaPitchedPtr targetTensor;
  if (!cudaOk(cudaMalloc3D(&targetTensor, make_cudaExtent(NxTarget * sizeof(dt), NyTarget, NzTarget)),
              "cudaMalloc3D(target)")) return 1;

  cudaMemcpy3DParms cpy = { 0 };
  // Element offsets of the sub-volume inside the source.
  it NxOffset = 1;
  it NyOffset = 2;
  it NzOffset = 3;
  // For pitched (non-cudaArray) memory the x position and the extent width are
  // byte counts, hence the sizeof(dt) scaling; y and z are row/slice indices.
  cpy.srcPos = make_cudaPos(NxOffset * sizeof(dt), NyOffset, NzOffset);
  cpy.srcPtr = sourceTensor;
  cpy.dstPtr = targetTensor;
  cpy.extent = make_cudaExtent(NxTarget * sizeof(dt), NyTarget, NzTarget);
  cpy.kind = cudaMemcpyDeviceToDevice;
  if (!cudaOk(cudaMemcpy3D(&cpy), "cudaMemcpy3D")) return 1;

  verify_kernel<<<1,1>>>(targetTensor, NxTarget, NyTarget, NzTarget, NxOffset, NyOffset, NzOffset);
  if (!cudaOk(cudaGetLastError(), "verify_kernel launch")) return 1;
  // Synchronize so in-kernel faults are reported and device printf output is flushed.
  if (!cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) return 1;

  // Release both allocations; report a failure from either one.
  bool freed = cudaOk(cudaFree(targetTensor.ptr), "cudaFree(target)");
  freed = cudaOk(cudaFree(sourceTensor.ptr), "cudaFree(source)") && freed;
  return freed ? 0 : 1;
}
$ nvcc -o t1957 t1957.cu
$ cuda-memcheck ./t1957
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
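Note that when neither the source nor the destination is a cudaArray, the element size is always assumed to be unsigned char (i.e. 1 byte). That is why the x component of srcPos and the width of the extent are multiplied by sizeof(dt) in the example above.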
Upvotes: 2