Jens
Jens

Reputation: 2702

Varying results from cuBlas

I have implemented the following CUDA code but i am a little bit confused about the behavior.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <ctime>
#include <chrono>
#include <string>

#define IDX2F(i,j,ld) ((((j)-1)*(ld))+((i)-1)) 

void PrintMatrix(float* a, int n)
{
    int j, i;
    for (j = 1; j <= n; j++)
    {
        for (i = 1; i <= n; i++)
        {
            printf("%7.0f", a[IDX2F(i, j, n)]);
        }
        printf("\n");
    }
}

float* CreateMatrix(int n)
{
    float* matrix = static_cast<float *>(malloc(n * n * sizeof(float)));
    if (!matrix)
    {
        printf("host memory allocation failed");
        return nullptr;
    }

    for (int j = 1; j <= n; j++)
    {
        for (int i = 1; i <= n; i++)
        {
            matrix[IDX2F(i, j, n)] = 2;
        }
    }

    return matrix;
}

long CudaMatrixMultiply(float* matrix, int n)
{
    cudaError_t cudaStat;
    cublasStatus_t status;
    cublasHandle_t handle;
    float* deviceMatrix;

    cudaStat = cudaMalloc(reinterpret_cast<void**>(&deviceMatrix), n * n * sizeof(float));
    if (cudaStat != cudaSuccess)
    {
        printf("device memory allocation failed");
        return EXIT_FAILURE;
    }

    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("CUBLAS initialization failed\n");
        return EXIT_FAILURE;
    }

    status = cublasSetMatrix(n, n, sizeof(float), matrix, n, deviceMatrix, n);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("data download failed");
        cudaFree(deviceMatrix);
        cublasDestroy(handle);
        return EXIT_FAILURE;
    }

    float alpha = 1;
    float beta = 0;
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &alpha, deviceMatrix, n, deviceMatrix, n, &beta, deviceMatrix, n);

    status = cublasGetMatrix(n, n, sizeof(float), deviceMatrix, n, matrix, n);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("data upload failed");
        cudaFree(deviceMatrix);
        cublasDestroy(handle);
        return EXIT_FAILURE;
    }

    cudaFree(deviceMatrix);
    cublasDestroy(handle);
    return EXIT_SUCCESS;
}

float* CpuMatrixMultiply(float* matrix, int size)
{
    float* result = new float[size * size]();

    // Copied from https://msdn.microsoft.com/en-us/library/hh873134.aspx
    for (int row = 1; row <= size; row++) 
    {
        for (int col = 1; col <= size; col++) 
        {
            // Multiply the row of A by the column of B to get the row, column of product.
            for (int inner = 1; inner <= size; inner++) 
            {
                // result[row][col] += matrix[row][inner] * matrix[inner][col];
                result[IDX2F(col, row, size)] += matrix[IDX2F(inner, row, size)] * matrix[IDX2F(col, inner, size)];
            }
        }
    }

    free(matrix);
    return result;
}

int main(void)
{
    // printf("Matrix * Matrix Test\n");
    int size = 1000;
    int runs = 10;

    for (int run = 0; run != runs; run++)
    {
        printf("=== Test %d (Matrix * Matrix, Size = %d) ===\n\n", run + 1, size);
        printf("RAM usage is: %f GB\n", size * size * sizeof(float) / 1000000000.0);

        float* cpuMatrix = CreateMatrix(size);
        cpuMatrix = CpuMatrixMultiply(cpuMatrix, size);

        PrintMatrix(cpuMatrix, 5);

        float* gpuMatrix = CreateMatrix(size);
        CudaMatrixMultiply(gpuMatrix, size);
        PrintMatrix(gpuMatrix, 5);

        free(cpuMatrix);
        free(gpuMatrix);
    }
    getchar();
    return EXIT_SUCCESS;
}

The ouput of the CPU version of the MatrixMultiplication is the following as expected:

4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000

but the result of the GPU computed is sometimes the right one (see above) or a wrong random(?) one. When the loop is executed the first time then the result was always the right one.

I am not able to find a mistake in my code and it would be great if you could help me.


Additionally if i set size (int the main method) to e.g. 16000 then my driver is crashing and i get an error message. For this i have written a bug report to NVidea because my pc crashed twice. But maybe it is a programming fault by me?

Driver: 364.72 (newest one)
SDK: CUDA Toolkit 7.5
Graphics Card: NVidia GeForce GTX 960 (4GB)
Windows 10 64Bit

Driver Error

Display driver NVIDIA Windows kernel Mode Driver, Version 362.72 stopped responding and has successfully recovered.

Edit: With the help of the community i found out that this is a problem with the watchdog timer. See answer below.

Upvotes: 1

Views: 257

Answers (1)

Florent DUGUET
Florent DUGUET

Reputation: 2916

Regarding the second part of the question, following njuffa's remark, you may change the settings for driver behavior to avoid the error when increasing size. Open NSIGHT Monitor and in Options, General, Microsoft Display Driver, change to False the WDDM TDR enabled field.

settings illustration of NSIGHT Monitor Options

From spec, the 32bits FPU flops should be around 2.4 TFLOPS in single precision, hence your operation for a 16000 sized matrix should take at the minimum 3.5 seconds. Hence the Driver Recovery after 2 seconds.

Upvotes: 1

Related Questions