Farzad

Reputation: 3438

Does accessing mapped pinned host (or a peer device) memory require the GPU copy engine?

Assume the GPU has one execution engine and one copy engine.

  1. When threads inside a CUDA kernel access host memory, does that make the copy engine busy? Does it consequently block all asynchronous memory copy operations to/from the device in other streams?
  2. When threads inside a CUDA kernel access peer device memory, does that make the copy engines on both devices busy? (A minimal sketch of both access patterns follows the list.)
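
For concreteness, here is roughly the kind of access I have in mind. This is only a minimal sketch, not the code I am profiling; the kernel and helper names are made up for illustration, and it assumes a device reporting canMapHostMemory and, for the peer case, two devices for which cudaDeviceCanAccessPeer returns 1.

#include <cuda_runtime.h>

__global__ void touch(int *p, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) p[i] += 1;                                     // dereferences whatever memory p points to
}

// Question 1: kernel threads access mapped pinned host memory (zero-copy)
void question1_sketch()
{
    const int n = 1024;
    int *h_buf, *d_alias;
    cudaSetDeviceFlags(cudaDeviceMapHost);                    // must be set before the context is created
    cudaHostAlloc((void**)&h_buf, n * sizeof(int), cudaHostAllocMapped);
    cudaHostGetDevicePointer((void**)&d_alias, h_buf, 0);     // device-side alias of the host buffer
    touch<<<(n + 255) / 256, 256>>>(d_alias, n);              // each access travels over PCIe to host DRAM
    cudaDeviceSynchronize();
    cudaFreeHost(h_buf);
}

// Question 2: a kernel running on device 0 accesses memory residing on device 1
void question2_sketch()
{
    const int n = 1024;
    int *d1_buf;
    cudaSetDevice(1);
    cudaMalloc((void**)&d1_buf, n * sizeof(int));             // buffer physically on device 1
    cudaSetDevice(0);
    cudaDeviceEnablePeerAccess(1, 0);                         // let device 0 dereference device 1 pointers
    touch<<<(n + 255) / 256, 256>>>(d1_buf, n);               // runs on device 0, accesses go to device 1
    cudaDeviceSynchronize();
    cudaSetDevice(1);
    cudaFree(d1_buf);
}

int main()
{
    question1_sketch();
    int devCount = 0;
    cudaGetDeviceCount(&devCount);
    if (devCount > 1) question2_sketch();                     // the peer case needs at least two devices
    return 0;
}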

Upvotes: 2

Views: 736

Answers (1)

Vitality

Reputation: 21475

I'm trying to provide an answer to the first question only.

When threads inside a CUDA kernel access host memory, does that make the copy engine busy? Does it consequently block all asynchronous memory copy operations to/from the device in other streams?

I have written the simple code below. It contains two kernels, one explicitly using mapped pinned host memory, namely kernel2, and one not using it, namely kernel1. The code uses three streams to check whether the use of mapped pinned host memory disrupts concurrency.

Here is the code:

#include <iostream>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <cmath>

using namespace std;

#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

/*******************************/
/* KERNEL FUNCTION - VERSION 1 */
/*******************************/
__global__ void kernel1(const int *in, int *out, int dataSize)
{
    int start = blockIdx.x * blockDim.x + threadIdx.x;
    int end =  dataSize;
    for (int i = start; i < end; i += blockDim.x * gridDim.x)
    {
        out[i] = in[i] * in[i];
    }
}

/*******************************/
/* KERNEL FUNCTION - VERSION 2 */
/*******************************/
__global__ void kernel2(const int *in, int *out, int* cnt, int dataSize)
{
    int start = blockIdx.x * blockDim.x + threadIdx.x;
    int end =  dataSize;
    for (int i = start; i < end; i += blockDim.x * gridDim.x)
    {
        out[i] = cnt[i] * in[i] * in[i];
    }
}

/********/
/* MAIN */
/********/
int main()
{
    const int dataSize = 6000000;

    // --- Host side memory allocations
    int *h_in = new int[dataSize];
    int *h_out = new int[dataSize];

    // --- Host side memory initialization
    for(int i = 0; i < dataSize; i++) h_in[i] = 5;
    for(int i = 0; i < dataSize; i++) h_out[i] = 0;

    // --- Register host memory as page-locked, as required for asynchronous cudaMemcpyAsync
    gpuErrchk(cudaHostRegister(h_in, dataSize * sizeof(int), cudaHostRegisterPortable));
    gpuErrchk(cudaHostRegister(h_out, dataSize * sizeof(int), cudaHostRegisterPortable));

    // --- Device side memory allocations
    int *d_in = 0;  gpuErrchk(cudaMalloc((void**)&d_in, dataSize * sizeof(int)));
    int *d_out = 0; gpuErrchk(cudaMalloc((void**)&d_out, dataSize * sizeof(int)));

    // --- Testing mapped pinned memory
    int *cnt; gpuErrchk(cudaMallocHost((void**)&cnt, dataSize * sizeof(int)));
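    // --- Note (UVA): in a 64-bit process on a compute capability >= 2.0 device, memory from
    //     cudaMallocHost is directly accessible from the device, so cnt can be passed to kernel2
    //     as-is; without UVA, cudaHostAlloc(..., cudaHostAllocMapped) plus
    //     cudaHostGetDevicePointer would be required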
    for(int i = 0; i < dataSize; i++) cnt[i] = 2;

    int streamSize = dataSize / NUM_STREAMS;
    size_t streamMemSize = dataSize * sizeof(int) / NUM_STREAMS;

    // --- Setting kernel launch config
    dim3 nThreads = dim3(NUM_THREADS,1,1);
    dim3 nBlocks = dim3(NUM_BLOCKS,1,1);

    // --- Create CUDA streams
    cudaStream_t streams[NUM_STREAMS];
    for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamCreate(&streams[i]));

    /**********/
    /* CASE 1 */
    /**********/
    for(int i = 0; i < NUM_STREAMS; i++) {
        int offset = i * streamSize;
        cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]); }

    for(int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;

        dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));

        kernel1<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize/2);
        kernel1<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], streamSize/2);
    }

    for(int i = 0; i < NUM_STREAMS; i++) {
        int offset = i * streamSize;
        cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]); }


    for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamSynchronize(streams[i]));

    /**********/
    /* CASE 2 */
    /**********/
    for(int i = 0; i < NUM_STREAMS; i++) {
        int offset = i * streamSize;
        cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]); }

    for(int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;

        dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));

        kernel2<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], cnt, streamSize/2);
        kernel2<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], cnt, streamSize/2);
    }

    for(int i = 0; i < NUM_STREAMS; i++) {
        int offset = i * streamSize;
        cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]); }


    for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamSynchronize(streams[i]));

    // --- Release resources
    gpuErrchk(cudaHostUnregister(h_in));
    gpuErrchk(cudaHostUnregister(h_out));
    gpuErrchk(cudaFreeHost(cnt));
    gpuErrchk(cudaFree(d_in));
    gpuErrchk(cudaFree(d_out));

    for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamDestroy(streams[i]));

    delete[] h_in;
    delete[] h_out;

    gpuErrchk(cudaDeviceReset());

    return 0;
}

From the timeline below, it appears that the use of mapped pinned host memory in kernel2 does not disrupt concurrency. The code has been tested on a GT540M card, which has a single copy engine.

[profiler timeline screenshot for the two cases]

Upvotes: 1
