Nicolas Perrault

Reputation: 1

Can CUDA Thrust Kernels operate in parallel on multiple streams?

I am attempting to launch thrust::fill on two different device vectors in parallel, on two different CUDA streams. However, when I look at the kernel launches in Nsight Systems, they appear to be serialized. Here is the basic example I am working with.

#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/sort.h>
#include <thrust/transform.h>
#include <thrust/execution_policy.h>

#define gpuErrchk(ans)                        \
    {                                         \
        gpuAssert((ans), __FILE__, __LINE__); \
    }
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if(code != cudaSuccess)
        {
            fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            if(abort) exit(code);
        }
}

int main(void)
{
    cudaStream_t stream1, stream2;
    gpuErrchk(cudaStreamCreate(&stream1));
    gpuErrchk(cudaStreamCreate(&stream2));

    const size_t size = 10000000;

    int* d_test1_ptr;
    int* d_test2_ptr;
    gpuErrchk(cudaMalloc((void**)&d_test1_ptr, size * sizeof(int)));
    gpuErrchk(cudaMalloc((void**)&d_test2_ptr, size * sizeof(int)));

    thrust::device_ptr<int> d_test1(d_test1_ptr);
    thrust::device_ptr<int> d_test2(d_test2_ptr);

    for(int i = 0; i < 100; i++)
        {
            thrust::fill(thrust::cuda::par.on(stream1), d_test1, d_test1 + size, 2);
            thrust::fill(thrust::cuda::par.on(stream2), d_test2, d_test2 + size, 2);
        }

    gpuErrchk(cudaStreamSynchronize(stream1));
    gpuErrchk(cudaStreamSynchronize(stream2));

    gpuErrchk(cudaFree(d_test1_ptr));
    gpuErrchk(cudaFree(d_test2_ptr));

    gpuErrchk(cudaStreamDestroy(stream1));
    gpuErrchk(cudaStreamDestroy(stream2));

    std::cout << "Completed execution of dummy functions on different streams." << std::endl;

    return 0;
}

Here is the result from Nsight Systems. It looks like cudaStreamSynchronize() is being called constantly, but I am not sure why.

(Nsight Systems timeline screenshot)

I have looked at Getting CUDA Thrust to use a CUDA stream of your choice, where their launches appear to run in parallel. I even tried using their exact code, but the kernels were still serialized.

Please let me know if you need more information.

Upvotes: 0

Views: 173

Answers (1)

Robert Crovella

Reputation: 151799

Thrust has gone through some significant changes in the last 5 years. A number of them were documented here.

The proximal problem is that each call to thrust::fill is also issuing a cudaStreamSynchronize(), which can be seen in your profiler timeline screenshot. You can also run your code with nsys profile --stats=true ... and the CLI output will indicate 202 calls to cudaStreamSynchronize(): two of these are the explicit calls in your code, and the other 200 correspond to your 200 thrust algorithm launches, one per launch.
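If you want to see this effect from the host side, one possible sketch is to time how long each thrust::fill call takes to return (this is just an ordinary host-side timer, not profiler output, and it assumes a Thrust version recent enough to provide par_nosync): the par variant does not return until the stream's work is complete, while the par_nosync variant returns as soon as the work has been enqueued.

#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/execution_policy.h>
#include <chrono>
#include <cstdio>

int main(void)
{
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    thrust::device_vector<int> v(10000000);

    auto time_call = [&](const char* label, auto policy) {
        auto t0 = std::chrono::steady_clock::now();
        thrust::fill(policy, v.begin(), v.end(), 2);   // may or may not block, depending on the policy
        auto t1 = std::chrono::steady_clock::now();
        cudaStreamSynchronize(stream);                 // make sure the work is finished either way
        std::printf("%-10s returned after %lld us\n", label,
                    (long long)std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count());
    };

    time_call("par",        thrust::cuda::par.on(stream));        // blocks: includes an implicit stream sync
    time_call("par_nosync", thrust::cuda::par_nosync.on(stream)); // returns right after the launch

    cudaStreamDestroy(stream);
    return 0;
}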

If we "fix" that issue as suggested in the comments by using the nosync variant of the execution policy, we can see a small amount of overlap in the profiler:

# cat t234.cu
#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/sort.h>
#include <thrust/transform.h>
#include <thrust/execution_policy.h>

#define gpuErrchk(ans)                        \
    {                                         \
        gpuAssert((ans), __FILE__, __LINE__); \
    }
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if(code != cudaSuccess)
        {
            fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            if(abort) exit(code);
        }
}

int main(void)
{
    cudaStream_t stream1, stream2;
    gpuErrchk(cudaStreamCreate(&stream1));
    gpuErrchk(cudaStreamCreate(&stream2));

    const size_t size = 10000000;

    int* d_test1_ptr;
    int* d_test2_ptr;
    gpuErrchk(cudaMalloc((void**)&d_test1_ptr, size * sizeof(int)));
    gpuErrchk(cudaMalloc((void**)&d_test2_ptr, size * sizeof(int)));

    thrust::device_ptr<int> d_test1(d_test1_ptr);
    thrust::device_ptr<int> d_test2(d_test2_ptr);

    for(int i = 0; i < 100; i++)
        {
            thrust::fill(thrust::cuda::par_nosync.on(stream1), d_test1, d_test1 + size, 2);
            thrust::fill(thrust::cuda::par_nosync.on(stream2), d_test2, d_test2 + size, 2);
        }

    gpuErrchk(cudaStreamSynchronize(stream1));
    gpuErrchk(cudaStreamSynchronize(stream2));

    gpuErrchk(cudaFree(d_test1_ptr));
    gpuErrchk(cudaFree(d_test2_ptr));

    gpuErrchk(cudaStreamDestroy(stream1));
    gpuErrchk(cudaStreamDestroy(stream2));

    std::cout << "Completed execution of dummy functions on different streams." << std::endl;

    return 0;
}
# nvcc -o t234 t234.cu
# nsys nvprof --print-gpu-trace ./t234
WARNING: t234 and any of its children processes will be profiled.

Completed execution of dummy functions on different streams.
Generating '/tmp/nsys-report-7d14.qdstrm'
[1/3] [========================100%] report59.nsys-rep
[2/3] [========================100%] report59.sqlite
[3/3] Executing 'cuda_gpu_trace' stats report

 Start (ns)   Duration (ns)  CorrId   GrdX   GrdY  GrdZ  BlkX  BlkY  BlkZ  Reg/Trd  StcSMem (MB)  DymSMem (MB)  Bytes (MB)  Throughput (MBps)  SrcMemKd  DstMemKd     Device      Ctx  Strm                                                  Name
 -----------  -------------  ------  ------  ----  ----  ----  ----  ----  -------  ------------  ------------  ----------  -----------------  --------  --------  -------------  ---  ----  ----------------------------------------------------------------------------------------------------
 677,159,755         44,577     135  19,532     1     1   256     1     1       16         0.000         0.000                                                     NVIDIA L4 (0)    1    13  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrus…

//////SNIP///////////////////


 707,769,835        167,232   2,879  19,532     1     1   256     1     1       16         0.000         0.000                                                     NVIDIA L4 (0)    1    13  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrus…
 707,927,755        166,272   2,893  19,532     1     1   256     1     1       16         0.000         0.000                                                     NVIDIA L4 (0)    1    14  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrus…
 708,085,387        167,488   2,907  19,532     1     1   256     1     1       16         0.000         0.000                                                     NVIDIA L4 (0)    1    13  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrus…
 708,242,027        164,672   2,921  19,532     1     1   256     1     1       16         0.000         0.000                                                     NVIDIA L4 (0)    1    14  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrus…

Generated:
    /root/bobc/report59.nsys-rep
    /root/bobc/report59.sqlite
#

I've trimmed some of the profiler output, but looking at the last few kernel calls, the second-to-last kernel started at 708,085,387 ns on the timeline with a duration of 167,488 ns, so it ended at 708,252,875 ns. That is after the start of the last kernel at 708,242,027 ns, so there is about 10 us of overlap.

One of the reasons you don't see more overlap is that each kernel fills the GPU for most of its execution duration, because the grid for the kernel launch is large, which in turn is because the input vector is large (10,000,000 elements). Thrust tends to parallelize with a for_each strategy that assigns roughly one element per thread, and 10,000,000 threads is enough to fill any current GPU, leaving no "room" for another kernel to execute. (The profiler output shows each thrust algorithm kernel using almost 20,000 blocks of 256 threads each, which suggests each thread is processing about 2 elements.) This is a common obstacle when people are trying to witness kernel concurrency.
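As a rough sanity check of that reasoning on your own GPU, you could compare the thread count of one of these launches against what the device can keep resident at once. A possible sketch (the 19,532 x 256 figure is simply copied from the profile above):

#include <cstdio>
#include <cuda_runtime.h>

int main(void)
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    // One thrust::fill over 10,000,000 ints launched ~19,532 blocks of 256 threads
    // in the profile above (i.e. roughly 2 elements per thread).
    const long long launched = 19532LL * 256;

    // Upper bound on the number of threads that can be resident on the whole device at once.
    const long long resident = (long long)prop.multiProcessorCount * prop.maxThreadsPerMultiProcessor;

    std::printf("threads launched by one fill kernel: %lld\n", launched);
    std::printf("max resident threads on this device: %lld\n", resident);
    // When launched greatly exceeds resident, a single kernel occupies the GPU for most
    // of its duration, leaving little room for a kernel from the other stream to overlap.
    return 0;
}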

You could make the kernel launches smaller by reducing the number of elements per vector (and therefore the number of threads), to see if you can witness more overlap. You will then run into the opposite problem: eventually the kernel duration becomes so short that the kernel launch latency (roughly 10 us) eliminates most of the opportunity to witness kernel overlap.
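To get a feel for where that tradeoff kicks in, one possible sketch is to time a single fill at several sizes with CUDA events; once the measured time approaches the ~10 us launch latency, there is little execution time left in which two kernels could overlap:

#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/execution_policy.h>
#include <cstdio>

int main(void)
{
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    for (size_t n = 10000000; n >= 10000; n /= 10)
    {
        thrust::device_vector<int> v(n);
        // warm-up launch so one-time overheads don't skew the measurement
        thrust::fill(thrust::cuda::par_nosync.on(stream), v.begin(), v.end(), 2);

        cudaEventRecord(start, stream);
        thrust::fill(thrust::cuda::par_nosync.on(stream), v.begin(), v.end(), 2);
        cudaEventRecord(stop, stream);
        cudaEventSynchronize(stop);

        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start, stop);
        std::printf("n = %10zu  fill time = %.3f ms\n", n, ms);
    }

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaStreamDestroy(stream);
    return 0;
}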

The takeaway is that it is very hard to witness much kernel overlap between two such kernels that are doing almost no work per element.

If we increase the work per thread (a nonsense calculation here), and reduce the vector size, we can see considerably more overlap/concurrency:

# cat t234.cu
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/transform.h>
#include <thrust/execution_policy.h>
#include <math.h>

#define gpuErrchk(ans)                        \
    {                                         \
        gpuAssert((ans), __FILE__, __LINE__); \
    }
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if(code != cudaSuccess)
        {
            fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            if(abort) exit(code);
        }
}
struct my_func
{
  double _x;
  my_func(double x) : _x(x) {};
  __host__ __device__
  int operator()(){
    return (int)normcdf((double)_x);}
};

int main(void)
{
    cudaStream_t stream1, stream2;
    gpuErrchk(cudaStreamCreate(&stream1));
    gpuErrchk(cudaStreamCreate(&stream2));

    const size_t size = 100000;

    int* d_test1_ptr;
    int* d_test2_ptr;
    gpuErrchk(cudaMalloc((void**)&d_test1_ptr, size * sizeof(int)));
    gpuErrchk(cudaMalloc((void**)&d_test2_ptr, size * sizeof(int)));

    thrust::device_ptr<int> d_test1(d_test1_ptr);
    thrust::device_ptr<int> d_test2(d_test2_ptr);

    for(int i = 0; i < 10; i++)
        {
            thrust::generate(thrust::cuda::par_nosync.on(stream1), d_test1, d_test1 + size, my_func(1.0));
            thrust::generate(thrust::cuda::par_nosync.on(stream2), d_test2, d_test2 + size, my_func(2.0));
        }

    gpuErrchk(cudaStreamSynchronize(stream1));
    gpuErrchk(cudaStreamSynchronize(stream2));

    gpuErrchk(cudaFree(d_test1_ptr));
    gpuErrchk(cudaFree(d_test2_ptr));

    gpuErrchk(cudaStreamDestroy(stream1));
    gpuErrchk(cudaStreamDestroy(stream2));

    std::cout << "Completed execution of dummy functions on different streams." << std::endl;

    return 0;
}
# nvcc -o t234 t234.cu
# nsys nvprof --print-gpu-trace ./t234
WARNING: t234 and any of its children processes will be profiled.

Completed execution of dummy functions on different streams.
Generating '/tmp/nsys-report-5866.qdstrm'
[1/3] [========================100%] report63.nsys-rep
[2/3] [========================100%] report63.sqlite
[3/3] Executing 'cuda_gpu_trace' stats report

 Start (ns)   Duration (ns)  CorrId  GrdX  GrdY  GrdZ  BlkX  BlkY  BlkZ  Reg/Trd  StcSMem (MB)  DymSMem (MB)  Bytes (MB)  Throughput (MBps)  SrcMemKd  DstMemKd     Device      Ctx  Strm                                                  Name
 -----------  -------------  ------  ----  ----  ----  ----  ----  ----  -------  ------------  ------------  ----------  -----------------  --------  --------  -------------  ---  ----  ----------------------------------------------------------------------------------------------------
 720,913,028         48,576     135   196     1     1   256     1     1       36         0.000         0.000                                                     NVIDIA L4 (0)    1    13  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrus…
 720,937,764         55,296     149   196     1     1   256     1     1       36         0.000         0.000                                                     NVIDIA L4 (0)    1    14  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrus…
 720,962,244         72,384     163   196     1     1   256     1     1       36         0.000         0.000                                                     NVIDIA L4 (0)    1    13  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrus…
 720,993,700         72,512     177   196     1     1   256     1     1       36         0.000         0.000                                                     NVIDIA L4 (0)    1    14  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrus…
 721,035,236         62,560     191   196     1     1   256     1     1       36         0.000         0.000                                                     NVIDIA L4 (0)    1    13  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrus…
 721,066,884         72,608     205   196     1     1   256     1     1       36         0.000         0.000                                                     NVIDIA L4 (0)    1    14  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrus…
 721,098,436         72,288     219   196     1     1   256     1     1       36         0.000         0.000                                                     NVIDIA L4 (0)    1    13  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrus…
 721,140,100         66,784     233   196     1     1   256     1     1       36         0.000         0.000                                                     NVIDIA L4 (0)    1    14  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrus…
 721,171,428         72,416     247   196     1     1   256     1     1       36         0.000         0.000                                                     NVIDIA L4 (0)    1    13  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrus…
 721,207,524         67,840     261   196     1     1   256     1     1       36         0.000         0.000                                                     NVIDIA L4 (0)    1    14  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrus…
 721,244,548         70,016     275   196     1     1   256     1     1       36         0.000         0.000                                                     NVIDIA L4 (0)    1    13  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrus…
 721,276,004         72,384     289   196     1     1   256     1     1       36         0.000         0.000                                                     NVIDIA L4 (0)    1    14  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrus…
 721,315,236         64,864     303   196     1     1   256     1     1       36         0.000         0.000                                                     NVIDIA L4 (0)    1    13  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrus…
 721,349,028         72,512     317   196     1     1   256     1     1       36         0.000         0.000                                                     NVIDIA L4 (0)    1    14  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrus…
 721,380,804         72,160     331   196     1     1   256     1     1       36         0.000         0.000                                                     NVIDIA L4 (0)    1    13  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrus…
 721,422,212         66,816     345   196     1     1   256     1     1       36         0.000         0.000                                                     NVIDIA L4 (0)    1    14  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrus…
 721,453,540         72,448     359   196     1     1   256     1     1       36         0.000         0.000                                                     NVIDIA L4 (0)    1    13  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrus…
 721,489,636         67,936     373   196     1     1   256     1     1       36         0.000         0.000                                                     NVIDIA L4 (0)    1    14  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrus…
 721,526,596         67,585     387   196     1     1   256     1     1       36         0.000         0.000                                                     NVIDIA L4 (0)    1    13  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrus…
 721,558,212         67,169     401   196     1     1   256     1     1       36         0.000         0.000                                                     NVIDIA L4 (0)    1    14  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrus…

Generated:
    /root/bobc/report63.nsys-rep
    /root/bobc/report63.sqlite
#

Upvotes: 4
