WorkinChina

Reputation: 25

If I launch a CUDA stream from each host thread, will the streams run concurrently?

From searching I know that CUDA supports launching a CUDA stream from each host thread. My problem is that with only one thread the test takes 180 seconds to finish, but with three threads it takes 430 seconds. Why don't they run concurrently?

My GPU is a Tesla K20c.

Below is my simplified code; it omits some variable definitions, the code that saves the output data, etc.

int main()
{
    cudaSetDevice(0);
    cudaSetDeviceFlags(cudaDeviceBlockingSync);
    cudaStream_t stream1;
    cudaStream_t stream2;
    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);
    int ret;
    pthread_t id_1,id_2;
    ret = pthread_create(&id_1,NULL,thread_1,&stream1);
    ret = pthread_create(&id_2,NULL,thread_1,&stream2);
    pthread_join(id_1,NULL);
    pthread_join(id_2,NULL);
    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);
    return 0;
}

void* thread_1(void *streamno)
{ 
    char speechInFileName[1024] = "data/ori_in.bin";
    char bitOutFileName[1024] = "data/enc_out.bin";
    //make sure the bitOutFileName is exclusive
    char buf[1024];
    snprintf(buf, 1024, "%p", (void *)streamno);
    strcat(bitOutFileName,buf);

    //change the stack size limit
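    // note: cudaDeviceSetLimit applies to the whole device, not to an individual stream,
    // so both threads are setting the same device-wide limit here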
    size_t pvalue = 60 * 1024;
    if (cudaDeviceSetLimit(cudaLimitStackSize, pvalue) == cudaErrorInvalidValue)
        cout << "cudaErrorInvalidValue " << endl;

    Encoder_main(3, speechInFileName, bitOutFileName,(cudaStream_t*)streamno);

    pthread_exit(0);
}

int Encoder_main(int argc, char speechInFileName[], char bitOutFileName[], cudaStream_t *stream)
{
    void      *d_psEnc;
    cudaMalloc(&d_psEnc, encSizeBytes);
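    // note: cudaMemcpyAsync can only overlap with work in other streams when the host
    // buffer is pinned (cudaMallocHost/cudaHostAlloc); with pageable host memory the
    // copy may effectively serialize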
    cudaMemcpyAsync(d_psEnc, psEnc, encSizeBytes, cudaMemcpyHostToDevice, *stream);
    SKP_SILK_SDK_EncControlStruct *d_encControl; // Struct for input to encoder
    cudaMalloc(&d_encControl, sizeof(SKP_SILK_SDK_EncControlStruct));
    cudaMemcpyAsync(d_encControl, &encControl, sizeof(SKP_SILK_SDK_EncControlStruct), cudaMemcpyHostToDevice, *stream);
    SKP_int16 *d_in;
    cudaMalloc(&d_in, FRAME_LENGTH_MS * MAX_API_FS_KHZ * MAX_INPUT_FRAMES * sizeof(SKP_int16));
    SKP_int16 *d_nBytes;
    cudaMalloc(&d_nBytes, sizeof(SKP_int16));
    SKP_int32 *d_ret;
    cudaMalloc(&d_ret, sizeof(SKP_int32));
    SKP_uint8 *d_payload;
    cudaMalloc(&d_payload, MAX_BYTES_PER_FRAME * MAX_INPUT_FRAMES);


    while (1) {
        /* Read input from file */
        counter = fread(in, sizeof(SKP_int16), (frameSizeReadFromFile_ms * API_fs_Hz) / 1000, speechInFile);

        if ((SKP_int)counter < ((frameSizeReadFromFile_ms * API_fs_Hz) / 1000)) {
            break;
        }
        /* max payload size */
        nBytes = MAX_BYTES_PER_FRAME * MAX_INPUT_FRAMES;

        cudaMemcpyAsync(d_nBytes, &nBytes, sizeof(SKP_int16), cudaMemcpyHostToDevice, *stream);
        cudaMemcpyAsync(d_in, in, FRAME_LENGTH_MS * MAX_API_FS_KHZ * MAX_INPUT_FRAMES * sizeof(SKP_int16), cudaMemcpyHostToDevice, *stream);
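        // launch the encoder kernel as a single block with a single thread,
        // queued on this host thread's stream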
        encoder_kernel <<<1, 1, 0, *stream>>>(d_psEnc, d_encControl, d_in, (SKP_int16)counter, d_payload, d_nBytes, d_ret);
        cudaMemcpyAsync(&nBytes, d_nBytes, sizeof(SKP_int16), cudaMemcpyDeviceToHost,*stream);
        cudaMemcpyAsync(&ret, d_ret, sizeof(ret), cudaMemcpyDeviceToHost,*stream);
        cudaMemcpyAsync(payload, d_payload, MAX_BYTES_PER_FRAME * MAX_INPUT_FRAMES, cudaMemcpyDeviceToHost,*stream);

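        // block this host thread until all copies and the kernel queued on this stream have completed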
        cudaStreamSynchronize(*stream);
    }

    cudaFree(d_psEnc);
    cudaFree(d_encControl);
    cudaFree(d_in);
    cudaFree(d_nBytes);
    cudaFree(d_ret);
    cudaFree(d_payload);

    return 0;
}

encoder_kernel is a speech encoder function.

Thanks for Robert's and Jez's advice! I changed my code to open just two streams, and used the Visual Profiler to show the timeline. From the image I can see that the two streams sometimes run concurrently, but most of the time they do not. Can you tell me why? Thank you!

[Visual Profiler timeline screenshot]

Upvotes: 0

Views: 566

Answers (1)

Jez

Reputation: 1781

One thread takes 180 s and three threads take 430 s; 430/180 ≈ 2.4, which is less than three times longer, so you do have some concurrency. Whether you can do better than this depends on the details of the work each thread does.

Often the best way to figure out what is going on is to run your application through the NVIDIA Visual Profiler. You can either run the application from the Visual Profiler interface or import output collected with the command-line nvprof profiler. This will show every CUDA API call as well as the copies and kernels, split by stream and by thread, so it is quite clear what is happening.
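For reference, a minimal sketch of the nvprof route, assuming your binary is called ./encoder_test (a placeholder name): the first command collects a timeline file that can be imported into the Visual Profiler via File > Import, and the second prints a per-kernel/per-copy GPU trace directly in the terminal.

nvprof -o timeline.nvprof ./encoder_test
nvprof --print-gpu-trace ./encoder_test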

Upvotes: 2
