Reputation: 3066
I created streams in this way:
cudaStream_t stream0;
cudaStream_t stream1;
cudaStreamCreate(&stream0);
cudaStreamCreate(&stream1);
and I launch the kernels like this:
singlecore<<<1,1>>>(devL2,1000);
singlecore<<<1,1,0,stream0>>>(devL2,1000);
The two kernels do not execute concurrently. But if I launch the first kernel in stream1 instead:
singlecore<<<1,1,0,stream1>>>(devL2,1000);
singlecore<<<1,1,0,stream0>>>(devL2,1000);
they do execute concurrently.
I wonder whether a kernel launched in the default stream can ever execute concurrently with kernels in other streams.
Upvotes: 3
Views: 2825
Reputation: 21455
I want to update Robert Crovella's answer in light of CUDA 7.0 which, as of March 2015, is available as a Release Candidate.
With CUDA 7.0, default streams are regular streams in the sense that commands in the default stream may run concurrently with commands in non-default streams. A more detailed explanation of this new feature can be found at
CUDA 7 Streams Simplify Concurrency
This feature can be enabled simply by the additional --default-stream per-thread
compilation option.
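For example, assuming the source file is named streams.cu (the file name here is just a placeholder), the compile command becomes:
nvcc --default-stream per-thread -o streams streams.cu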
At the page linked above, an example worked out by Mark Harris can be found. Here, I want to revisit the example I posted at False dependency issue for the Fermi architecture. In particular, in the new example below, although I create 3 streams, I no longer use the first one, adopting the default stream in its place.
This is the timeline produced without the --default-stream per-thread
compilation option:
As you can see, the execution in the default stream does not exploit concurrency.
On the other hand, this is the timeline produced with the --default-stream per-thread
compilation option:
As you can see, the default stream execution now overlaps with the execution of the other two streams.
#include <iostream>
#include <stdio.h>
#include <math.h>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include "Utilities.cuh"

using namespace std;

#define NUM_THREADS 32
#define NUM_BLOCKS  16
#define NUM_STREAMS 3

// --- Grid-stride loop: each thread squares the input elements it is responsible for
__global__ void kernel(const int *in, int *out, int N)
{
    int start = blockIdx.x * blockDim.x + threadIdx.x;
    int end   = N;

    for (int i = start; i < end; i += blockDim.x * gridDim.x)
    {
        out[i] = in[i] * in[i];
    }
}
int main()
{
    const int N = 6000000;

    // --- Host side input data allocation and initialization. Registering host memory as page-locked (required for asynchronous cudaMemcpyAsync).
    int *h_in = new int[N]; for (int i = 0; i < N; i++) h_in[i] = 5;
    gpuErrchk(cudaHostRegister(h_in, N * sizeof(int), cudaHostRegisterPortable));

    // --- Host side output data allocation and initialization. Registering host memory as page-locked (required for asynchronous cudaMemcpyAsync).
    int *h_out = new int[N]; for (int i = 0; i < N; i++) h_out[i] = 0;
    gpuErrchk(cudaHostRegister(h_out, N * sizeof(int), cudaHostRegisterPortable));

    // --- Host side check results vector allocation and initialization
    int *h_checkResults = new int[N]; for (int i = 0; i < N; i++) h_checkResults[i] = h_in[i] * h_in[i];

    // --- Device side input data allocation
    int *d_in = 0; gpuErrchk(cudaMalloc((void **)&d_in, N * sizeof(int)));

    // --- Device side output data allocation
    int *d_out = 0; gpuErrchk(cudaMalloc((void **)&d_out, N * sizeof(int)));

    int    streamSize    = N / NUM_STREAMS;
    size_t streamMemSize = N * sizeof(int) / NUM_STREAMS;

    // --- Set kernel launch configuration
    dim3 nThreads       = dim3(NUM_THREADS, 1, 1);
    dim3 nBlocks        = dim3(NUM_BLOCKS, 1, 1);
    dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));

    // --- Create CUDA streams
    cudaStream_t streams[NUM_STREAMS];
    for (int i = 0; i < NUM_STREAMS; i++)
        gpuErrchk(cudaStreamCreate(&streams[i]));

    /**************************/
    /* BREADTH-FIRST APPROACH */
    /**************************/

    // --- Host-to-device copies: first chunk on the default stream, the others on the non-default streams
    int offset = 0;
    cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, 0);
    for (int i = 1; i < NUM_STREAMS; i++) {
        int offset = i * streamSize;
        cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]);
    }

    // --- Kernel launches: first chunk on the default stream, the others on the non-default streams
    kernel<<<subKernelBlock, nThreads>>>(&d_in[offset], &d_out[offset], streamSize / 2);
    kernel<<<subKernelBlock, nThreads>>>(&d_in[offset + streamSize / 2], &d_out[offset + streamSize / 2], streamSize / 2);
    for (int i = 1; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize / 2);
        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize / 2], &d_out[offset + streamSize / 2], streamSize / 2);
    }

    // --- Device-to-host copies: chunks on the non-default streams, then the first chunk on the default stream
    for (int i = 1; i < NUM_STREAMS; i++) {
        int offset = i * streamSize;
        cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]);
    }
    cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, 0);
    for (int i = 0; i < NUM_STREAMS; i++)
        gpuErrchk(cudaStreamSynchronize(streams[i]));
    gpuErrchk(cudaDeviceSynchronize());

    // --- Release resources
    gpuErrchk(cudaHostUnregister(h_in));
    gpuErrchk(cudaHostUnregister(h_out));
    gpuErrchk(cudaFree(d_in));
    gpuErrchk(cudaFree(d_out));

    for (int i = 0; i < NUM_STREAMS; i++)
        gpuErrchk(cudaStreamDestroy(streams[i]));

    cudaDeviceReset();

    // --- GPU output check
    int sum = 0;
    for (int i = 0; i < N; i++)
        sum += h_checkResults[i] - h_out[i];
    cout << "Error between CPU and GPU: " << sum << endl;

    delete[] h_in;
    delete[] h_out;
    delete[] h_checkResults;

    return 0;
}
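One way to reproduce the two timelines above is to compile the code twice, without and with the flag, and inspect the per-stream trace, for example with nvprof (the file and executable names here are just placeholders):
nvcc -o streams_legacy streams.cu
nvprof --print-gpu-trace ./streams_legacy
nvcc --default-stream per-thread -o streams_pt streams.cu
nvprof --print-gpu-trace ./streams_pt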
Upvotes: 4
Reputation: 151799
Yes, there is a limitation on CUDA commands issued to the default stream. Referring to the C programming guide section on implicit synchronization:
"Two commands from different streams cannot run concurrently if any one of the following operations is issued in-between them by the host thread: ... •any CUDA command to the default stream, "
So as a general rule of thumb, for overlapped copy and compute operations, it's easiest to program all such operations in a set of non-default streams. There's a bit of a loophole (which you've discovered) where it's possible to get overlap with commands issued in the default stream (and other streams), but it requires careful understanding of the restrictions between the default stream and other streams, as well as careful attention to the order in which you issue commands. A good example is explained in the C programming guide. Read all the way through the section on "overlapping behavior".
In your first example, the kernel issued to the default stream blocks execution of the kernel subsequently issued to the other stream. In your second example, you can have concurrency because both kernels are issued to non-default streams, so neither blocks the execution of the other.
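As a minimal sketch of these two cases under the legacy default-stream semantics (the dummy busy-wait kernel and its iteration count are placeholders, chosen only so the launches are long enough to see on a profiler timeline):
#include "cuda_runtime.h"

// --- Placeholder busy-wait kernel, long enough to show up on a profiler timeline
__global__ void dummy(int n)
{
    for (volatile int i = 0; i < n; i++);
}

int main()
{
    cudaStream_t stream0, stream1;
    cudaStreamCreate(&stream0);
    cudaStreamCreate(&stream1);

    // --- Case 1: the default-stream launch implicitly synchronizes with all
    //     other streams, so the kernel in stream0 waits for it to finish.
    dummy<<<1, 1>>>(1 << 20);
    dummy<<<1, 1, 0, stream0>>>(1 << 20);
    cudaDeviceSynchronize();

    // --- Case 2: both kernels are in non-default streams and may overlap.
    dummy<<<1, 1, 0, stream1>>>(1 << 20);
    dummy<<<1, 1, 0, stream0>>>(1 << 20);
    cudaDeviceSynchronize();

    cudaStreamDestroy(stream0);
    cudaStreamDestroy(stream1);
    return 0;
}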
Upvotes: 7