cudaStreamSynchronize behavior under multiple threads

Question

What is the behavior of cudaStreamSynchronize in the following case

ThreadA pseudo code :
    while(true):
         submit new cuda Kernel to cudaStreamX

ThreadB pseudo code:
     call cudaStreamSynchronize(cudaStreamX)

My question is when will ThreadB return? Since ThreadA will always push new cuda kernels, and the cudaStreamX will never finish.

Robert Crovella · Accepted Answer

The API documentation isn't directly explicit about this, however the CUDA C programming guide is basically explicit:

cudaStreamSynchronize() takes a stream as a parameter and waits until all preceding commands in the given stream have completed

Furthermore, I think it should be sensible that:

cudaStreamSynchronize() cannot reasonably take into account work issued to a stream after that cudaStreamSynchronize() call. this would more or less require it to know the future.
cudaStreamSynchronize() should reasonably be expected to return after all previously issued work to that stream is complete.

Putting together an experimental test app, the above description is what I observe:

$ cat t396.cu
#include 
#include 
#include 
#include 
#include 

const int PTHREADS=2;
const int TRIGGER1=5;

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)
", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING
"); \
            exit(1); \
        } \
    } while (0)

#include 
#include 
#define USECPSEC 1000000ULL

long long dtime_usec(unsigned long long start){

  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

#define DELAY_T 1000000000ULL
template 
__global__ void delay_kern(int i){

  unsigned long long time = clock64();
#ifdef DEBUG
  printf("hello %d
", type);
#endif
  while (clock64() < time+(i*DELAY_T));
}

volatile static int flag, flag0, loop_cnt;

// The thread configuration structure.
typedef struct
{
    int my_thread_ordinal;
    pthread_t thread;
    cudaError_t status;
    cudaStream_t stream;
    int delay_usec;
}
config_t;


// The function executed by each thread assigned with CUDA device.
void *thread_func(void *arg)
{
    // Unpack the config structure.
    config_t *config = (config_t *)arg;
    int my_thread=config->my_thread_ordinal;
    cudaError_t cuda_status = cudaSuccess;
    cuda_status = cudaSetDevice(0);
    if (cuda_status != cudaSuccess) {
        fprintf(stderr, "Cannot set focus to device %d, status = %d
",
            0, cuda_status);
        config->status = cuda_status;
        pthread_exit(NULL);
    }


    printf("thread %d initialized
", my_thread);

    switch(config->my_thread_ordinal){
      case 0:
        //master thread
        while (flag0) {
          delay_kern<0><<<1,1,0,config->stream>>>(1);
          if (loop_cnt++ > TRIGGER1)  flag = 1;
          printf("master thread loop: %d
", loop_cnt);
          usleep(config->delay_usec);
          }
        break;
      default:
        //slave thread
        while (!flag);
        printf("slave thread issuing stream sync at loop count: %d
", loop_cnt);
        cudaStreamSynchronize(config->stream);
        flag0 = 0;
        printf("slave thread set trigger and exit
");
        break;
    }
    cudaCheckErrors("thread CUDA error");
    printf("thread %d complete
", my_thread);
    config->status = cudaSuccess;
    return NULL;
}

int main(int argc, char* argv[])
{
    int mydelay_usec = 1;
    if (argc > 1) mydelay_usec = atoi(argv[1]);
    if ((mydelay_usec < 1) || (mydelay_usec > 10000000)) {printf("invalid delay time specified
"); return -1;}
    flag = 0; flag0 = 1; loop_cnt = 0;
    const int nthreads = PTHREADS;

    // Create workers configs. Its data will be passed as
    // argument to thread_func.
    config_t* configs = (config_t*)malloc(sizeof(config_t) * nthreads);
    cudaSetDevice(0);
    cudaStream_t str;
    cudaStreamCreate(&str);
    // create a separate thread
    // and execute the thread_func.
    for (int i = 0; i < nthreads; i++) {
        config_t *config = configs + i;
        config->my_thread_ordinal = i;
        config->stream = str;
        config->delay_usec = mydelay_usec;
        int status = pthread_create(&config->thread, NULL, thread_func, config);
        if (status) {
            fprintf(stderr, "Cannot create thread for device %d, status = %d
",
                i, status);
        }
    }
    // Wait for device threads completion.
    // Check error status.
    int status = 0;
    for (int i = 0; i < nthreads; i++) {
        pthread_join(configs[i].thread, NULL);
        status += configs[i].status;
    }
    if (status)
        return status;

    free(configs);

    return 0;
}
$ nvcc -arch=sm_61 -o t396 t396.cu -lpthread
$ time ./t396 100000
thread 0 initialized
thread 1 initialized
master thread loop: 1
master thread loop: 2
master thread loop: 3
master thread loop: 4
master thread loop: 5
master thread loop: 6
slave thread issuing stream sync at loop count: 7
master thread loop: 7
master thread loop: 8
master thread loop: 9
master thread loop: 10
master thread loop: 11
master thread loop: 12
master thread loop: 13
master thread loop: 14
master thread loop: 15
master thread loop: 16
master thread loop: 17
master thread loop: 18
master thread loop: 19
master thread loop: 20
master thread loop: 21
master thread loop: 22
master thread loop: 23
master thread loop: 24
master thread loop: 25
master thread loop: 26
master thread loop: 27
master thread loop: 28
master thread loop: 29
master thread loop: 30
master thread loop: 31
master thread loop: 32
master thread loop: 33
master thread loop: 34
master thread loop: 35
master thread loop: 36
master thread loop: 37
master thread loop: 38
master thread loop: 39
slave thread set trigger and exit
thread 1 complete
thread 0 complete

real    0m5.416s
user    0m2.990s
sys     0m1.623s
$

This will require some careful thought to understand. However, in a nutshell, the app will issue kernels that simply execute about a 0.7s delay before returning from one thread, and from the other thread will wait for a small number of kernels to be issued, then will issue a cudaStreamSynchronize() call. The overall time measurement for the application defines when that call returned. As long as you keep the command line parameter (host delay) between kernel launches to a value less than about 0.5s, then the app will reliably exit in about 5.4s (this will vary depending on which GPU you are running on, but the overall app execution time should be constant up to a fairly large value of the host delay parameter).

If you specify a command line parameter that is larger than the kernel duration on your machine, then the overall app execution time will be approximately 5 times your command line parameter (microseconds), since the trigger point for the cudaStreamSynchronize() call is 5.

In my case, I compiled and ran this on CUDA 8.0.61, Ubuntu 14.04, Pascal Titan X.

cudaStreamSynchronize behavior under multiple threads

Answers (1)

Related Questions