Some questions about cuda streams

Question

Code:

// Busy-work kernel: runs a nested 100000 x 100000 multiply loop and then
// prints "K1". It takes no arguments and does no thread indexing, so every
// launched thread performs the same work and prints once.
__global__ void K1() {
    // Unsigned arithmetic on purpose: the products (i + 100) * j exceed the
    // range of int, and signed overflow is undefined behaviour, whereas
    // unsigned arithmetic wraps.
    unsigned int p = 1;
    for (unsigned int i = 0; i < 100000; ++i)
        for (unsigned int j = 0; j < 100000; ++j)
            p *= (i + 100) * j;
    // NOTE(review): p is never read after the loop, so an optimizing compiler
    // is free to discard the loop entirely; do not rely on it as a delay
    // without confirming in the generated code.
    printf("K1\n");
}
// Trivial kernel: prints "K2" and returns. It takes no arguments and does no
// thread indexing, so every launched thread prints once.
__global__ void K2() {
    printf("K2\n");
}

// Reports a failed CUDA runtime call on stdout.
// Returns true when err is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
        printf("%s failed: %s\n", what, cudaGetErrorString(err));
        return false;
    }
    return true;
}

// Host driver for the stream-ordering experiment.
//
// Sequence (order is the point of the experiment and must be preserved):
//   1. launch K1 on stream s1
//   2. allocate page-locked host memory
//   3. launch K2 on stream s2
//   4. wait for the whole device
//
// Returns 0 on success, 1 if any CUDA call or kernel launch fails.
int main() {
    int *ptr = 0;
    cudaStream_t s1, s2;

    if (!checkCuda(cudaStreamCreate(&s1), "cudaStreamCreate(s1)")) return 1;
    if (!checkCuda(cudaStreamCreate(&s2), "cudaStreamCreate(s2)")) return 1;

    K1<<<1, 1, 0, s1>>>();
    // A launch does not return a status; launch errors are read back here.
    if (!checkCuda(cudaGetLastError(), "K1 launch")) return 1;

    // Page-locked host allocation issued between the two launches.
    if (!checkCuda(cudaHostAlloc(&ptr, 1, 0), "cudaHostAlloc")) return 1;

    K2<<<1, 1, 0, s2>>>();
    if (!checkCuda(cudaGetLastError(), "K2 launch")) return 1;

    // Blocks until both kernels have finished; errors raised while the
    // kernels were executing are reported by this call.
    if (!checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) return 1;

    // Release what was acquired above.
    if (!checkCuda(cudaFreeHost(ptr), "cudaFreeHost")) return 1;
    if (!checkCuda(cudaStreamDestroy(s1), "cudaStreamDestroy(s1)")) return 1;
    if (!checkCuda(cudaStreamDestroy(s2), "cudaStreamDestroy(s2)")) return 1;
    return 0;
}

Output:

K2
K1

Questions:

Is s1 the same as the default stream?
As per this documentation,

Two commands from different streams cannot run concurrently if any one of the following operations is issued in-between them by the host thread:

a page-locked host memory allocation,

Shouldn't K2 start only after K1 finishes?

Some questions about cuda streams

Answers (1)

Related Questions