Vitality

Reputation: 21475

CUDA timing for multi-GPU applications

This is the standard way timing in CUDA is performed:

cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);

// Something to be timed

cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);

cudaEventElapsedTime(&time, start, stop);
printf ("Time: %f ms\n", time);

In the CUDA simpleP2P (peer-to-peer) example, timing is performed in this way:

cudaEvent_t start, stop;
float time;
int eventflags = cudaEventBlockingSync;
cudaEventCreateWithFlags(&start,eventflags);
cudaEventCreateWithFlags(&stop,eventflags);

cudaEventRecord(start,0);

// Something to be timed

cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time,start,stop);

My questions are:

  1. Why, in the P2P example, is timing performed with events created by cudaEventCreateWithFlags using the cudaEventBlockingSync flag?
  2. Is this, generally speaking, needed in all multi-GPU applications (including peer-to-peer memcopy timings)?

Thanks.

Upvotes: 2

Views: 905

Answers (1)

Vitality

Reputation: 21475

After almost three years, I'm answering my own question.

To this end, I'll refer to my examples in Concurrency in CUDA multi-GPU executions, where it was underlined that using asynchronous copies is what enables true multi-GPU concurrency. In particular, I will consider Test case #8 of that post.

The full code as well as the profiler timeline for Test case #8 are reported here for the sake of clarity.

#include "Utilities.cuh"
#include "InputOutput.cuh"

#define BLOCKSIZE 128

/*******************/
/* KERNEL FUNCTION */
/*******************/
template<class T>
__global__ void kernelFunction(T * __restrict__ d_data, const unsigned int NperGPU) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < NperGPU) for (int k = 0; k < 1000; k++) d_data[tid] = d_data[tid] * d_data[tid];

}

/******************/
/* PLAN STRUCTURE */
/******************/
// --- Async
template<class T>
struct plan {
    T               *d_data;
};

/*****************/
/* PLAN CREATION */
/*****************/
template<class T>
void createPlan(plan<T>& plan, unsigned int NperGPU, unsigned int gpuID) {

    // --- Device allocation
    gpuErrchk(cudaSetDevice(gpuID));
    gpuErrchk(cudaMalloc(&(plan.d_data), NperGPU * sizeof(T)));
}

/********/
/* MAIN */
/********/
int main() {

    const int numGPUs   = 4;
    const int NperGPU   = 500000;
    const int N         = NperGPU * numGPUs;

    plan<double> plan[numGPUs];
    for (int k = 0; k < numGPUs; k++) createPlan(plan[k], NperGPU, k);

    // --- "Breadth-first" approach - async
    double *inputMatrices;   gpuErrchk(cudaMallocHost(&inputMatrices, N * sizeof(double)));
    for (int k = 0; k < numGPUs; k++) {
        gpuErrchk(cudaSetDevice(k));
        gpuErrchk(cudaMemcpyAsync(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
    }

    for (int k = 0; k < numGPUs; k++) {
        gpuErrchk(cudaSetDevice(k));
        kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
    }

    for (int k = 0; k < numGPUs; k++) {
        gpuErrchk(cudaSetDevice(k));
        gpuErrchk(cudaMemcpyAsync(inputMatrices + k * NperGPU, plan[k].d_data, NperGPU * sizeof(double), cudaMemcpyDeviceToHost));
    }

    gpuErrchk(cudaDeviceReset());
}
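
The code above uses the gpuErrchk and iDivUp helpers from Utilities.cuh, which is not reproduced here. A minimal sketch of what those two helpers might look like (names and behavior assumed from their usage) is:

#include <cstdio>
#include <cstdlib>

// --- Abort on the first CUDA runtime error, printing file and line
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// --- Smallest number of blocks covering n elements with blockSize threads per block
inline int iDivUp(int n, int blockSize) { return (n + blockSize - 1) / blockSize; }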

[Profiler timeline for Test case #8]

Timing the asynchronous copies - concurrency is destroyed

Now, let us begin by timing the asynchronous copies. A possible way to do so is to use the following snippet:

float time[numGPUs];
cudaEvent_t start[numGPUs], stop[numGPUs];

// --- "Breadth-first" approach - async
for (int k = 0; k < numGPUs; k++) {
    gpuErrchk(cudaSetDevice(k));
    cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
    cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
    cudaEventRecord(start[k], 0);
    gpuErrchk(cudaMemcpyAsync(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
    cudaEventRecord(stop[k], 0);
    cudaEventSynchronize(stop[k]);
    cudaEventElapsedTime(&time[k], start[k], stop[k]);
}
for (int k = 0; k < numGPUs; k++) printf("Elapsed time:  %3.1f ms \n", time[k]);

Unfortunately, this way of timing destroys concurrency, as can be appreciated from the profiler timeline below: since cudaEventSynchronize(stop[k]) is called inside the loop, the host blocks until the copy issued to GPU k has completed before it issues the copy to GPU k + 1.

[Profiler timeline: asynchronous copies timed in the loop above; the copies on the different GPUs no longer overlap]

Timing the asynchronous copies - concurrency is preserved

To avoid this problem, a possibility is to issue the GPU tasks from separate OpenMP threads, as follows:

int maxNumProcessors = omp_get_max_threads();
std::cout << "Maximum number of CPU threads = " << maxNumProcessors << std::endl;

// --- "Breadth-first" approach - async
omp_set_num_threads(numGPUs);
#pragma omp parallel
{
    unsigned int k = omp_get_thread_num();
    gpuErrchk(cudaSetDevice(k));
    cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
    cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
    cudaEventRecord(start[k], 0);
    gpuErrchk(cudaMemcpyAsync(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
    cudaEventRecord(stop[k], 0);
    cudaEventSynchronize(stop[k]);
    cudaEventElapsedTime(&time[k], start[k], stop[k]);
    printf("Thread nr. %i; Elapsed time:  %3.1f ms \n", k, time[k]);
}
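
As a side note, the OpenMP-based snippets require #include <omp.h> and OpenMP support enabled in the host compiler; with g++ as the host compiler, a compile line along the following lines (file name assumed) should work:

nvcc -Xcompiler -fopenmp -o timingMultiGPU timingMultiGPU.cu -lgomp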

As can be seen from the profiler timeline, concurrency is preserved.

[Profiler timeline: asynchronous copies timed from OpenMP threads; the copies on the different GPUs overlap]

Timing the kernel launches - concurrency is destroyed

The same happens when timing the kernel launches. With the following snippet, concurrency is destroyed.

for (int k = 0; k < numGPUs; k++) {
    gpuErrchk(cudaSetDevice(k));
    cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
    cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
    cudaEventRecord(start[k], 0);
    kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
    cudaEventRecord(stop[k], 0);
    cudaEventSynchronize(stop[k]);
    cudaEventElapsedTime(&time[k], start[k], stop[k]);
}
for (int k = 0; k < numGPUs; k++) printf("Elapsed time:  %3.1f ms \n", time[k]);

[Profiler timeline: kernel launches timed in the loop above; the kernels on the different GPUs no longer overlap]

Timing the kernel launches - concurrency is preserved

In contrast to the above, using OpenMP preserves concurrency.

int maxNumProcessors = omp_get_max_threads();
std::cout << "Maximum number of CPU threads = " << maxNumProcessors << std::endl;

omp_set_num_threads(numGPUs);
#pragma omp parallel
{
    unsigned int k = omp_get_thread_num();
    gpuErrchk(cudaSetDevice(k));
    cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
    cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
    cudaEventRecord(start[k], 0);
    kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
    cudaEventRecord(stop[k], 0);
    cudaEventSynchronize(stop[k]);
    cudaEventElapsedTime(&time[k], start[k], stop[k]);
    printf("Thread nr. %i; Elapsed time:  %3.1f ms \n", k, time[k]);
}

[Profiler timeline: kernel launches timed from OpenMP threads; the kernels on the different GPUs overlap]
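
As a final remark, a variant that keeps the single-threaded breadth-first structure is to only record the events inside the issuing loop and move all synchronization to a second loop, so that the host never blocks while work is still being dispatched to the other GPUs. This is only a sketch (it assumes the time, start and stop arrays created as in the snippets above) and is not what the profiler timelines above refer to:

for (int k = 0; k < numGPUs; k++) {
    gpuErrchk(cudaSetDevice(k));
    cudaEventRecord(start[k], 0);
    kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
    cudaEventRecord(stop[k], 0);
}
// --- All the work has been issued; only now wait for the stop events
for (int k = 0; k < numGPUs; k++) {
    gpuErrchk(cudaSetDevice(k));
    cudaEventSynchronize(stop[k]);
    cudaEventElapsedTime(&time[k], start[k], stop[k]);
    printf("Elapsed time:  %3.1f ms \n", time[k]);
}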

Upvotes: 3
