Ixanezis
Ixanezis

Reputation: 1681

CUBLAS works unpredictably

Wrote my first program using CUDA+CUBLAS. It just uses a 'cublasDgemm' function and computes a product of 2 N*N matrices.

However, all the time I was launching my program, it keeped producing the same wrong answer (e.g. when multiplying 1*1 matrix containing 5 as a single element by 1*1 matrix containing element 6, it always said the result is 36, not 30). I checked the program several times with no success. But, when I came back to it the nexy day (i.e. after reboot), it worked just fine. I don't remember whether I recompiled it or not, but the truth is that it is the same VS project, same code, same computer with its GPU.

So, can anyone explain me why could that have happened? And do I have to expect same strange behaviour further?

Here is the code I was launching:

#include <iostream>
#include <string>
#include <iomanip>
#include <cuda_runtime.h>
#include <cublas_v2.h>

const int N = 5;
#define IDX2F(i,j) ((i) * N + j)

void fail(const cudaError_t& cudaStatus, const std::string& errorMessage) {
    if (cudaStatus != cudaSuccess) {
        std::cerr << errorMessage << std::endl;
        exit(EXIT_FAILURE);
    }
}

void fail(const cublasStatus_t& status, const std::string& errorMessage) {
    if (status != CUBLAS_STATUS_SUCCESS) {
        std::cerr << errorMessage << std::endl;
        exit(EXIT_FAILURE);
    }
}

void printMatrix(const double *C) {
    for (int i=0; i<N; i++) {
        for (int j=0; j<N; j++) {
            std::cout << std::fixed << std::setprecision(2) << C[IDX2F(i,j)] << ' ';
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}

int main(int argc, char **argv) {
    cudaError_t cudaStatus;
    cublasStatus_t status;
    cublasHandle_t handle;

    double *A = new double[N*N];
    double *devPtrA;

    double *B = new double[N*N];
    double *devPtrB;

    double *C = new double[N*N];
    double *devPtrC;

    for (int i=0; i<N; i++)
        for (int j=0; j<N; j++)
            A[IDX2F(i,j)] = i + j;

    for (int i=0; i<N; i++)
        for (int j=0; j<N; j++)
            B[IDX2F(i,j)] = i + j * 0.5;

    // do not have to set anything into matrix C, because beta = 0

    // allocate mamory on GPU
    cudaStatus = cudaMalloc((void**)&devPtrC, N*N*sizeof(*C));
    fail(cudaStatus, "device memory allocation failed");

    cudaStatus = cudaMalloc((void**)&devPtrA, N*N*sizeof(*A));
    fail(cudaStatus, "device memory allocation failed");

    cudaStatus = cudaMalloc((void**)&devPtrB, N*N*sizeof(*B));
    fail(cudaStatus, "device memory allocation failed");

    // create GPU handle
    status = cublasCreate(&handle);
    fail(status, "CUBLAS initialization failed");

    // copying matrices from host to GPU
    status = cublasSetMatrix(N, N, sizeof (*B), B, N, devPtrB, N);
    fail(status, "failed to load data from host to GPU");

    status = cublasSetMatrix(N, N, sizeof (*A), A, N, devPtrA, N);
    fail(status, "failed to load data from host to GPU");

    const double ONE = 1;
    const double ZERO = 0;

    printMatrix(A);
    printMatrix(B);

    status = cublasDgemm(   handle,
                            CUBLAS_OP_N, CUBLAS_OP_N,
                            N, N, N,
                            &ONE,
                            devPtrA, N,
                            devPtrB, N,
                            &ZERO,
                            devPtrC, N);

    fail(status, "error cublasDgemm");

    status = cublasGetMatrix(N, N, sizeof (*C), devPtrC, N, C, N);
    fail(status, "could not load result back from GPU to host");

    printMatrix(C);

    status = cublasDestroy(handle);
    fail(status, "could not destroy CUBLAS handle");

    cudaStatus = cudaFree(devPtrC);
    fail(cudaStatus, "device memory freeing failed");

    cudaStatus = cudaFree(devPtrB);
    fail(cudaStatus, "device memory freeing failed");

    cudaStatus = cudaFree(devPtrA);
    fail(cudaStatus, "device memory freeing failed");

    delete[] C;
    delete[] B;
    delete[] A;

    return EXIT_SUCCESS;
}

Upvotes: 0

Views: 2157

Answers (1)

borratatu
borratatu

Reputation: 21

op(B) must be CUBLAS_OP_T . .

status = cublasDgemm( handle, CUBLAS_OP_N, CUBLAS_OP_T, N, N, N, &ONE, devPtrA, N, devPtrB, N, &ZERO, devPtrC, N); . . . . definition is : C = α op ( A ) op ( B ) + β C http://docs.nvidia.com/cuda/cublas/index.html#topic_8_1

Upvotes: 2

Related Questions