
Reputation: 2449

CUDA program crashed driver

i am trying to make a pice of code work in parallel on a cuda device, but i am having a problem, my gfx driver keep crashing while the program runs, but only for this program, other cuda code runs fine. And it gives a wrong anwser, but i will think that is because of the crash!

Note: it is being runed on a Quadro 2000M GFX card!

Here is my parallel version of the code.

#include <stdio.h>
#include <time.h>
#include <math.h>

#define N 16
#define threads 512
#define MaxBlocks 6500

__global__ void cudaCalculateBestPath(int *M, int *bestToDiagPathSum, int *bestFromDiagPathSum, 
        unsigned int *bestToDiagPathCode, unsigned int *bestFromDiagPathCode)
    int x = ((unsigned int)1 << (N-1));
    unsigned int currentPathCode = blockIdx.x * threads + threadIdx.x;
    // This while is for if we are over the max amount of blocks
    while(currentPathCode < x)
        int test = ((unsigned int)1 << (N-1));
        if(currentPathCode >= test)

        unsigned int ui = currentPathCode;

        int toDiagPathSum = M[0];
        int toDiagRow = 0; 
        int toDiagCol = 0;

        int fromDiagPathSum = M[(N-1)*N+N-1];
        int fromDiagRow = N-1;
        int fromDiagCol = N-1;

        for (int i = 0; i < N-1; i++) 
            if (ui % 2 == 0) 
                toDiagCol++;                 // horizontal move
                toDiagRow++;                 // vertical move
            toDiagPathSum += M[toDiagRow*N+toDiagCol];
            fromDiagPathSum += M[fromDiagRow*N+fromDiagCol];
            ui = ui >> 1;

        if (toDiagPathSum < bestToDiagPathSum[toDiagRow]) 
            bestToDiagPathSum[toDiagRow] = toDiagPathSum;
            bestToDiagPathCode[toDiagRow] = currentPathCode; 

        if (fromDiagPathSum < bestFromDiagPathSum[fromDiagRow]) 
            bestFromDiagPathSum[fromDiagRow] = fromDiagPathSum;
            bestFromDiagPathCode[fromDiagRow] = currentPathCode; 

        // Next run
        currentPathCode = blockDim.x + gridDim.x;

int main()
    clock_t start = clock();

    //--- create and initialize M matrix (including best path) 

    int M[N*N];
    for (int row = 0; row < N; row++)
        for (int col = 0; col < N; col++)
            M[row*N+col] = 2;

#pragma region Test Path
    M[ 0*N+0] = 1;
    M[ 0*N+1] = 1;
    M[ 0*N+2] = 1;
    M[ 1*N+2] = 1;
    M[ 1*N+3] = 1;
    M[ 2*N+3] = 1;
    M[ 3*N+3] = 1;
    M[ 3*N+4] = 1;
    M[ 3*N+5] = 1;
    M[ 3*N+6] = 1;
    M[ 4*N+6] = 1;
    M[ 5*N+6] = 1;
    M[ 6*N+6] = 1;
    M[ 7*N+6] = 1;
    M[ 8*N+6] = 1;
    M[ 9*N+6] = 1;
    M[10*N+6] = 1;
    M[10*N+7] = 1;
    M[10*N+8] = 1;
    M[10*N+9] = 1;
    M[11*N+9] = 1;
    M[11*N+0] = 1;
    M[12*N+0] = 1;
    M[13*N+10] = 1;
    M[13*N+11] = 1;
    M[13*N+12] = 1;
    M[14*N+12] = 1;
    M[15*N+12] = 1;
    M[15*N+13] = 1;
    M[15*N+14] = 1;
    M[15*N+15] = 1;
#pragma endregion Test Path

    //--- create and initialize bestToDiag and bestFromDiag arrays

    int bestToDiagPathSum[N]; 
    int bestFromDiagPathSum[N];

    unsigned int bestToDiagPathCode[N];
    unsigned int bestFromDiagPathCode[N];

    int biggerThanMaxPathSum = 256*N + 1;
    for (int i = 0; i < N; i++)
        bestToDiagPathSum[i]   = biggerThanMaxPathSum; 
        bestFromDiagPathSum[i] = biggerThanMaxPathSum;

    //--- iterate through path codes, updating bestToDiag and bestFromDiag arrays

    int x = ((unsigned int)1 << (N-1));

    // Trick for celin the total blocks
    int TotalBlocks = (x+threads)/threads;
    if(TotalBlocks > MaxBlocks)
        TotalBlocks = MaxBlocks;

    int *dev_M, *dev_bestToDiagPathSum, *dev_bestFromDiagPathSum;
    unsigned int *dev_bestToDiagPathCode, *dev_bestFromDiagPathCode;

    // allocate memory
    cudaMalloc((void**)&dev_M, N*N*sizeof(int));
    cudaMalloc((void**)&dev_bestToDiagPathSum, N*sizeof(int));
    cudaMalloc((void**)&dev_bestFromDiagPathSum, N*sizeof(int));
    cudaMalloc((void**)&dev_bestToDiagPathCode, N*sizeof(int));
    cudaMalloc((void**)&dev_bestFromDiagPathCode, N*sizeof(int));

    // Copy memory to device
    cudaMemcpy( dev_M, M, N*N*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy( dev_bestToDiagPathSum, bestToDiagPathSum, N*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy( dev_bestFromDiagPathSum, bestFromDiagPathSum, N*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy( dev_bestToDiagPathCode, bestToDiagPathCode, N*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy( dev_bestFromDiagPathCode, bestFromDiagPathCode, N*sizeof(int), cudaMemcpyHostToDevice);

    // Run code on device
    printf("Blocks: %d\n", TotalBlocks);
    printf("Threads: %d\n\n", threads);
    cudaCalculateBestPath<<<TotalBlocks,threads>>>(dev_M, dev_bestToDiagPathSum, dev_bestFromDiagPathSum, 
        dev_bestToDiagPathCode, dev_bestFromDiagPathCode);

    // Insert code here to run while the GPU is running.

    // Copy the mem back
    cudaMemcpy( M, dev_M, N*N*sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy( bestToDiagPathSum, dev_bestToDiagPathSum, N*sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy( bestFromDiagPathSum, dev_bestFromDiagPathSum, N*sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy( bestToDiagPathCode, dev_bestToDiagPathCode, N*sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy( bestFromDiagPathCode, dev_bestFromDiagPathCode, N*sizeof(int), cudaMemcpyDeviceToHost);

    int bestPathSum = biggerThanMaxPathSum;
    unsigned int bestPathCodePrefix;
    unsigned int bestPathCodeSuffix;

    int tempSum;

    for (int i = 0; i < N; i++)
        tempSum = bestToDiagPathSum[i] + bestFromDiagPathSum[i] - M[i*N+(N-1-i)];
        if (tempSum < bestPathSum)
            bestPathSum = tempSum;
            bestPathCodePrefix = bestToDiagPathCode[i];
            bestPathCodeSuffix = bestFromDiagPathCode[i];

    //--- output best path sum and best path diagram

    printf("Best Path Sum = %d\n\n",bestPathSum);

    M[0] = -M[0];
    int toDiagRow = 0;
    int toDiagCol = 0;
    unsigned int ui = bestPathCodePrefix;
    for (int i = 0; i < N-1; i++) 
        if (ui % 2 == 0) 
            toDiagCol++;                 // horizontal move
            toDiagRow++;                 // vertical move
        M[toDiagRow*N+toDiagCol] = -M[toDiagRow*N+toDiagCol];
        ui = ui >> 1;

    M[(N-1)*N+N-1] = -M[(N-1)*N+N-1];
    int fromDiagRow = N-1;
    int fromDiagCol = N-1;
    ui = bestPathCodeSuffix;
    for (int i = 0; i < N-2; i++) 
        if (ui % 2 == 0) 
            fromDiagCol--;                 // horizontal move
            fromDiagRow--;                 // vertical move
        M[fromDiagRow*N+fromDiagCol] = -M[fromDiagRow*N+fromDiagCol];
        ui = ui >> 1;

    for (int row = N-1; row >= 0; row--)
        for (int col = 0; col <= N-1; col++)
            if (M[row*N+col] < 0)
                M[row*N+col] = -M[row*N+col];

    printf("\nTime elapsed: %f", ((double)clock() - start) / CLOCKS_PER_SEC);

    int dummyReadForPause;

    return 0;

Sequential code (Working perfect)

#include <stdio.h>
#include <time.h>
#include <math.h>

#define N 16
#define threads 512
#define MaxBlocks 6500

int main()
    clock_t start = clock();

    //--- create and initialize M matrix (including best path) 

    int M[N*N];
    for (int row = 0; row < N; row++)
        for (int col = 0; col < N; col++)
            M[row*N+col] = 2;

#pragma region Test Path
    M[ 0*N+0] = 1;
    M[ 0*N+1] = 1;
    M[ 0*N+2] = 1;
    M[ 1*N+2] = 1;
    M[ 1*N+3] = 1;
    M[ 2*N+3] = 1;
    M[ 3*N+3] = 1;
    M[ 3*N+4] = 1;
    M[ 3*N+5] = 1;
    M[ 3*N+6] = 1;
    M[ 4*N+6] = 1;
    M[ 5*N+6] = 1;
    M[ 6*N+6] = 1;
    M[ 7*N+6] = 1;
    M[ 8*N+6] = 1;
    M[ 9*N+6] = 1;
    M[10*N+6] = 1;
    M[10*N+7] = 1;
    M[10*N+8] = 1;
    M[10*N+9] = 1;
    M[11*N+9] = 1;
    M[11*N+0] = 1;
    M[12*N+0] = 1;
    M[13*N+10] = 1;
    M[13*N+11] = 1;
    M[13*N+12] = 1;
    M[14*N+12] = 1;
    M[15*N+12] = 1;
    M[15*N+13] = 1;
    M[15*N+14] = 1;
    M[15*N+15] = 1;
#pragma endregion Test Path

    //--- create and initialize bestToDiag and bestFromDiag arrays

    int bestToDiagPathSum[N]; 
    int bestFromDiagPathSum[N];

    unsigned int bestToDiagPathCode[N];
    unsigned int bestFromDiagPathCode[N];

    int biggerThanMaxPathSum = 256*N + 1;
    for (int i = 0; i < N; i++)
        bestToDiagPathSum[i]   = biggerThanMaxPathSum; 
        bestFromDiagPathSum[i] = biggerThanMaxPathSum;

    //--- iterate through path codes, updating bestToDiag and bestFromDiag arrays
    int toDiagPathSum, toDiagRow, toDiagCol;
    int fromDiagPathSum, fromDiagRow, fromDiagCol;
    unsigned int ui;
    int x = ((unsigned int)1 << (N-1));

    for(unsigned int currentPathCode = 0; currentPathCode < x; currentPathCode++)
        ui = currentPathCode;

        toDiagPathSum = M[0];
        toDiagRow = 0; 
        toDiagCol = 0;

        fromDiagPathSum = M[(N-1)*N+N-1];
        fromDiagRow = N-1;
        fromDiagCol = N-1;

        for (int i = 0; i < N-1; i++) 
            if (ui % 2 == 0) 
                toDiagCol++;                 // horizontal move
                toDiagRow++;                 // vertical move
            toDiagPathSum += M[toDiagRow*N+toDiagCol];
            fromDiagPathSum += M[fromDiagRow*N+fromDiagCol];
            ui = ui >> 1;

        if (toDiagPathSum < bestToDiagPathSum[toDiagRow]) 
            bestToDiagPathSum[toDiagRow] = toDiagPathSum;
            bestToDiagPathCode[toDiagRow] = currentPathCode; 

        if (fromDiagPathSum < bestFromDiagPathSum[fromDiagRow]) 
            bestFromDiagPathSum[fromDiagRow] = fromDiagPathSum;
            bestFromDiagPathCode[fromDiagRow] = currentPathCode; 

    int bestPathSum = biggerThanMaxPathSum;
    unsigned int bestPathCodePrefix;
    unsigned int bestPathCodeSuffix;

    int tempSum;

    for (int i = 0; i < N; i++)
        tempSum = bestToDiagPathSum[i] + bestFromDiagPathSum[i] - M[i*N+(N-1-i)];
        if (tempSum < bestPathSum)
            bestPathSum = tempSum;
            bestPathCodePrefix = bestToDiagPathCode[i];
            bestPathCodeSuffix = bestFromDiagPathCode[i];

    //--- output best path sum and best path diagram

    printf("Best Path Sum = %d\n\n",bestPathSum);

    M[0] = -M[0];
    toDiagRow = 0;
    toDiagCol = 0;
    ui = bestPathCodePrefix;
    for (int i = 0; i < N-1; i++) 
        if (ui % 2 == 0) 
            toDiagCol++;                 // horizontal move
            toDiagRow++;                 // vertical move
        M[toDiagRow*N+toDiagCol] = -M[toDiagRow*N+toDiagCol];
        ui = ui >> 1;

    M[(N-1)*N+N-1] = -M[(N-1)*N+N-1];
    fromDiagRow = N-1;
    fromDiagCol = N-1;
    ui = bestPathCodeSuffix;
    for (int i = 0; i < N-2; i++) 
        if (ui % 2 == 0) 
            fromDiagCol--;                 // horizontal move
            fromDiagRow--;                 // vertical move
        M[fromDiagRow*N+fromDiagCol] = -M[fromDiagRow*N+fromDiagCol];
        ui = ui >> 1;

    for (int row = N-1; row >= 0; row--)
        for (int col = 0; col <= N-1; col++)
            if (M[row*N+col] < 0)
                M[row*N+col] = -M[row*N+col];

    printf("\nTime elapsed: %f", ((double)clock() - start) / CLOCKS_PER_SEC);

    int dummyReadForPause;

    return 0;

Upvotes: 0

Views: 377

Answers (1)


Reputation: 62459

If you have a long running kernel Windows will stop your driver to prevent the system from becoming unresponsive. You will need to do some regedit to remove this feature. Have a look here:

Upvotes: 2

Related Questions