Overlapping transfers and kernel executions in CUDA with two loops

Question

I want to overlap data transfers and kernel executions in a form like this:

int numStreams = 3;
int size = 10;

for(int i = 0; i < size; i++) {
    cuMemcpyHtoDAsync( _bufferIn1,
                           _host_memoryIn1 ),
                           _size * sizeof(T),
                           cuda_stream[i % numStreams]);

    cuMemcpyHtoDAsync( _bufferIn2,
                           _host_memoryIn2,
                           _size * sizeof(T),
                           cuda_stream[i % numStreams]);

        cuLaunchKernel( _kernel,
                        gs.x(), gs.y(), gs.z(),
                        bs.x(), bs.y(), bs.z(),
                        _memory_size,
                        cuda_stream[i % numStreams],
                        _kernel_arguments,
                        0
                      );
      cuEventRecord(event[i], cuda_stream);
}

for(int i = 0; i < size; i++) {
    cuEventSynchronize(events[i]);

    cuMemcpyDtoHAsync( _host_memoryOut,
                           _bufferOut,
                           _size * sizeof(T),
                           cuda_stream[i % numStreams]);
}

Is overlapping possible in this case? Currently only the HtoD-transfers overlap with the kernel executions. The first DtoH-transfer is executed after the last kernel execution.

Overlapping transfers and kernel executions in CUDA with two loops

Answers (1)

Related Questions