Dynamic parallelism cudaDeviceSynchronize() crashes

Question

I have a kernel which launches another, empty kernel. However, when the calling kernel calls cudaDeviceSynchronize(), the kernel crashes and execution returns straight to the host. The memory checker does not report any memory access issues. Does anyone know what could be the reason for such uncivilized behavior?

The crash seems to happen only if I run the code from the debugger (Visual Studio -> Nsight -> Start CUDA Debugging). The crash does not happen every time I run the code - sometimes it crashes, and sometimes it finishes ok.

Here is the complete code to reproduce the problem:

#include 
#include 
#include "device_launch_parameters.h"
#include 

// Evaluates the CUDA runtime call x_ and stores its result in the caller's
// `cudaStatus` variable. On failure it prints err_ (which must be a string
// literal, because it is concatenated with the format string), followed by the
// numeric code and cudaGetErrorString() text, waits for an integer on stdin so
// the console stays open, and then jumps to the caller's `Error:` label.
//
// Requirements on the call site: a `cudaError_t cudaStatus` variable and an
// `Error:` label must be in scope.
//
// The body is wrapped in do { ... } while (0) so that `CUDA_RUN(...);` behaves
// as a single statement, including inside an unbraced if/else.
#define CUDA_RUN(x_, err_)                                          \
    do {                                                            \
        cudaStatus = (x_);                                          \
        if (cudaStatus != cudaSuccess) {                            \
            fprintf(stderr, err_ "  %d - %s\n", cudaStatus,         \
                    cudaGetErrorString(cudaStatus));                \
            int k;                                                  \
            scanf("%d", &k);                                        \
            goto Error;                                             \
        }                                                           \
    } while (0)

// Per-thread scratch record. chainKernel allocates one instance per thread
// with device-side malloc and frees it before returning.
struct computationalStorage {
    float rotMat;  // written (1.0f) by chainKernel; never read in the code shown
};

// Empty child kernel. chainKernel launches it from device code as <<<1, 1>>>;
// it takes no arguments and performs no work.
__global__ void drawThetaFromDistribution() {}

// Parent kernel of the repro. Every thread:
//   1. allocates one computationalStorage from the device heap,
//   2. launches the empty child kernel drawThetaFromDistribution 999 times,
//      checking the launch with cudaGetLastError() and then waiting for the
//      child with a device-side cudaDeviceSynchronize(),
//   3. frees its allocation.
// It prints "0" after each launch and "1\n" after each synchronize, so the
// progress of both steps is visible in the output.
//
// Launching kernels from device code requires dynamic parallelism (compute
// capability 3.5+ and relocatable device code).
// NOTE(review): device-side cudaDeviceSynchronize() is deprecated in recent
// CUDA toolkits; confirm availability against the toolkit actually in use.
__global__ void chainKernel() {
    computationalStorage* c =
        (computationalStorage*)malloc(sizeof(computationalStorage));
    if (!c) {
        // Device malloc returns NULL when the heap is exhausted; bail out
        // instead of dereferencing the null pointer below.
        printf("malloc error\n");
        return;
    }
    c->rotMat = 1.0f;

    for (int n = 1; n < 1000; ++n) {
        cudaError_t err;

        drawThetaFromDistribution<<<1, 1>>>();
        // Launch-time errors are reported through cudaGetLastError().
        if ((err = cudaGetLastError()) != cudaSuccess)
            printf("drawThetaFromDistribution Sync kernel error: %s\n",
                   cudaGetErrorString(err));
        printf("0");
        // Errors raised while the child runs surface at the synchronize.
        if ((err = cudaDeviceSynchronize()) != cudaSuccess)
            printf("drawThetaFromDistribution Async kernel error: %s\n",
                   cudaGetErrorString(err));
        printf("1\n");
    }

    free(c);
}

// Host entry point of the repro: selects device 0, configures the cache split
// and the device malloc heap, launches chainKernel on 10 blocks of 192
// threads, waits for it, and resets the device.
//
// Returns 0 when every CUDA call succeeded and 1 otherwise. In both cases it
// waits for an integer on stdin before returning so the console stays open.
int main() {
    cudaError_t cudaStatus;
    int exitCode = 0;  // declared before any CUDA_RUN so `goto Error` never skips its initialization

    // Choose which GPU to run on, change this on a multi-GPU system.
    CUDA_RUN(cudaSetDevice(0), "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");

    // Set to use on chip memory 16KB for shared, 48KB for L1
    CUDA_RUN(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1), "Can't set CUDA to use on chip memory for L1");
    // Set a large heap: 1 KiB for each of the 10 * 192 threads launched below.
    CUDA_RUN(cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1024 * 10 * 192), "Can't set the Heap size");

    chainKernel<<<10, 192>>>();
    // A bad launch configuration is only reported through cudaGetLastError().
    CUDA_RUN(cudaGetLastError(), "chainKernel launch failed!");

    // Errors raised while the kernel runs surface at this synchronize. Report
    // them but keep going so the device is still reset.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        printf("Something was wrong! Error code: %d", cudaStatus);
        exitCode = 1;
    }

    CUDA_RUN(cudaDeviceReset(), "cudaDeviceReset failed!");

Error:
    // Reached on success and from every failing CUDA_RUN; in the latter case
    // cudaStatus still holds the failing call's error code.
    if (cudaStatus != cudaSuccess) {
        exitCode = 1;
    }
    int k;
    scanf("%d", &k);
    return exitCode;
}

If all goes well I expect to see:

00000000000000000000000....0000000000000001
1
1
1
1
....

This is what I get when everything works ok. When it crashes however:

000000000000....0000000000000Something was wrong! Error code: 30

As you can see, the statement err = cudaDeviceSynchronize(); does not finish, and execution returns straight to the host, where the host-side cudaDeviceSynchronize(); fails with an unknown error (code 30 = cudaErrorUnknown).

System: CUDA 5.5, NVIDIA Titan (headless), Windows 7 x64, Win32 application. UPDATE: an additional NVIDIA card drives the display; Nsight 3.2.0.13289.

Dynamic parallelism cudaDeviceSynchronize() crashes

Answers (1)

Related Questions