Reputation: 387
Suppose we have four float arrays to be used on the host side, as well as their four counterparts to be used on the device side:
float *x, *x2, *y, *y2;
float *d_x, *d_x2, *d_y, *d_y2;
x = new float[ARRAYS_SIZE];
x2 = new float[ARRAYS_SIZE];
y = new float[ARRAYS_SIZE];
y2 = new float[ARRAYS_SIZE];
Now assume that we have a very simple kernel, taken from one of the examples on NVIDIA's blog:
__global__
void saxpy(int n, float a, float *x, float *y)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n)
    {
        y[i] = a*x[i] + y[i];
    }
}
This kernel is called from the host inside a for-loop, like the following:
for (int r = 0; r < LOOP_N; r++)
{
    saxpy<<<(ARRAYS_SIZE + 255) / 256, 256>>>(ARRAYS_SIZE, 2.0f, d_x, d_y);
    saxpy<<<(ARRAYS_SIZE + 255) / 256, 256>>>(ARRAYS_SIZE, 2.0f, d_x2, d_y2);
}
I then compare the execution time of this loop against a pure-CPU version:
for (int r = 0; r < LOOP_N; r++)
{
    for (int i = 0; i < ARRAYS_SIZE; i++) {
        y[i] = 2.0f*x[i] + y[i];
        y2[i] = 2.0f*x2[i] + y2[i];
    }
}
Now, here is what I don't understand. With ARRAYS_SIZE = 1000000 and LOOP_N = 1000, when I run both loops in the versions shown above, the ratio between the execution time of the CPU version and the CUDA version is around 6. That is, the CUDA version is approximately 6 times faster.
However, if I comment out one of the calls to saxpy inside the CUDA version of the loop and one of the calculations inside the CPU version of the loop, the ratio between CPU and CUDA becomes around 210. That is, the CUDA version is approximately 210 times faster.
What is the technical reason for such a performance loss when merely repeating the call to a kernel, if no memory is being transferred to or from the device? Are there any workarounds?
A (hopefully) fully reproducible code example follows:
#include <algorithm>
#include <chrono>
#include <iostream>
#include <stdio.h>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// Typedef and constant variables
typedef std::chrono::high_resolution_clock::time_point timers;
const int LOOP_N = 1000;
const int ARRAYS_SIZE = 1000000;

// Pretty simple kernel, from the example on NVIDIA's blog
__global__
void saxpy(int n, float a, float *x, float *y)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n)
    {
        y[i] = a*x[i] + y[i];
    }
}

int main(void)
{
    timers t0, t1, t2;
    timers tfinal0, tfinal1, tfinal2;
    float *x, *x2, *y, *y2;
    float *d_x, *d_x2, *d_y, *d_y2;
    x = new float[ARRAYS_SIZE];
    x2 = new float[ARRAYS_SIZE];
    y = new float[ARRAYS_SIZE];
    y2 = new float[ARRAYS_SIZE];

    // Initializing the arrays on the host side:
    for (int i = 0; i < ARRAYS_SIZE; i++) {
        x[i] = 1.0f;
        x2[i] = 1.0f;
        y[i] = 2.0f;
        y2[i] = 2.0f;
    }

    // GPU memory allocation:
    cudaMalloc(&d_x, ARRAYS_SIZE * sizeof(float));
    cudaMalloc(&d_x2, ARRAYS_SIZE * sizeof(float));
    cudaMalloc(&d_y, ARRAYS_SIZE * sizeof(float));
    cudaMalloc(&d_y2, ARRAYS_SIZE * sizeof(float));

    // Transferring the arrays from host to device:
    cudaMemcpy(d_x, x, ARRAYS_SIZE * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x2, x2, ARRAYS_SIZE * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, ARRAYS_SIZE * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y2, y2, ARRAYS_SIZE * sizeof(float), cudaMemcpyHostToDevice);

    //////////////////
    //   CPU run    //
    //////////////////
    t0 = std::chrono::high_resolution_clock::now();
    for (int r = 0; r < LOOP_N; r++)
    {
        for (int i = 0; i < ARRAYS_SIZE; i++) {
            // comment one of the following out to see the point of my question:
            y[i] = 2.0f*x[i] + y[i];
            y2[i] = 2.0f*x2[i] + y2[i];
        }
    }
    tfinal0 = std::chrono::high_resolution_clock::now();
    auto time0 = std::chrono::duration_cast<std::chrono::microseconds>(tfinal0 - t0).count();
    std::cout << "CPU: " << (float)time0 << " microseconds" << std::endl;

    //////////////////
    // GPU-CUDA run //
    //////////////////
    // Perform the SAXPY kernel on ARRAYS_SIZE elements, LOOP_N times
    t1 = std::chrono::high_resolution_clock::now();
    for (int r = 0; r < LOOP_N; r++)
    {
        // comment one of the following out to see the point of my question:
        saxpy<<<(ARRAYS_SIZE + 255) / 256, 256>>>(ARRAYS_SIZE, 2.0f, d_x, d_y);
        saxpy<<<(ARRAYS_SIZE + 255) / 256, 256>>>(ARRAYS_SIZE, 2.0f, d_x2, d_y2);
    }
    tfinal1 = std::chrono::high_resolution_clock::now();
    auto time1 = std::chrono::duration_cast<std::chrono::microseconds>(tfinal1 - t1).count();
    std::cout << "CUDA: " << (float)time1 << " microseconds" << std::endl;

    // Display the performance ratio CPU / GPU-CUDA
    std::cout << "Ratio CPU/CUDA: " << (float)time0 / (float)time1 << std::endl;

    // Freeing the memory used by the arrays:
    cudaFree(d_x);
    cudaFree(d_x2);
    cudaFree(d_y);
    cudaFree(d_y2);
    delete[] x;    // delete[] matches the new[] allocations above
    delete[] x2;
    delete[] y;
    delete[] y2;

    return 0;
}
Upvotes: 2
Views: 4575
Reputation: 7265
You are not waiting for the kernels to finish. Since all kernel launches are asynchronous, you need to explicitly call cudaDeviceSynchronize()
before stopping your timer.
The differences you are observing between variants of your current code likely stem from the fact that the queue of kernels waiting to be launched is finite, so at some point your code will start waiting for part of your kernels anyway. On Windows, kernel batching also plays a role: up to some number of launches (or a timeout), the driver will not even start to launch kernels.
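As a minimal sketch, reusing the variable names from your code and adding only the cudaDeviceSynchronize() call, the timed GPU section would become:
t1 = std::chrono::high_resolution_clock::now();
for (int r = 0; r < LOOP_N; r++)
{
    saxpy<<<(ARRAYS_SIZE + 255) / 256, 256>>>(ARRAYS_SIZE, 2.0f, d_x, d_y);
    saxpy<<<(ARRAYS_SIZE + 255) / 256, 256>>>(ARRAYS_SIZE, 2.0f, d_x2, d_y2);
}
// Block the host until every queued kernel has actually finished,
// so the timer measures execution time rather than just launch time.
cudaDeviceSynchronize();
tfinal1 = std::chrono::high_resolution_clock::now();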
Upvotes: 3
Reputation: 387
A simple change solves the problem, but I would still very much appreciate learning the technical reasons for all this.
The solution is simply to change the kernel in my toy example above to:
__global__
void saxpy(int n, float a, float *x, float *y, float *x2, float *y2)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n)
    {
        y[i] = a*x[i] + y[i];
        y2[i] = a*x2[i] + y2[i];
    }
}
And then call it only once per loop iteration, like the following:
for (int r = 0; r < LOOP_N; r++)
{
    saxpy<<<(ARRAYS_SIZE + 255) / 256, 256>>>(ARRAYS_SIZE, 2.0f, d_x, d_y, d_x2, d_y2);
}
Now the performance difference against the CPU implementation is just the same, which is to be expected.
If someone can jump in with an answer as to why this makes a difference, please post it and I will accept it over mine.
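For completeness, here is a minimal sketch of a sanity check that could be appended after the timed loop (it assumes <cmath> is also included for std::abs). Since a cudaMemcpy back to the host blocks until all preceding kernels are done, it also guarantees the device work has really completed; the expected value 2002.0f follows from y starting at 2.0f and gaining 2.0f*1.0f in each of the 1000 iterations:
// Copy the result back to the host; this call blocks until all preceding kernels have finished.
cudaMemcpy(y, d_y, ARRAYS_SIZE * sizeof(float), cudaMemcpyDeviceToHost);

// After LOOP_N = 1000 iterations, every y[i] should be 2.0f + 1000 * 2.0f * 1.0f = 2002.0f.
float maxError = 0.0f;
for (int i = 0; i < ARRAYS_SIZE; i++) {
    maxError = std::max(maxError, std::abs(y[i] - 2002.0f));
}
std::cout << "Max error: " << maxError << std::endl;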
Upvotes: 0