Caenorst

Reputation: 97

cuBLAS is unusually slow compared to cuSPARSE

I'm trying to run some tests comparing cuSPARSE and cuBLAS performance at different sparsity levels (on a Titan X). Here is the main code, named "testcusparsevector.cpp":

#include <stdio.h>
#include <iostream>
#include <vector>
#include <cstdlib>
#include <fstream>
#include <time.h>
#include <cuda_runtime.h>
#include <cublas.h>
#include <cusparse_v2.h>
#include <cublas_v2.h>
#include <assert.h>
#define M 6
#define N 5
#define IDX2C(i,j,ld) (((j)*(ld))+(i))


// /home/gpu1/Install/OpenBLAS-0.2.14


#define CHECK_EQ(a,b) do { \
    if ((a) != (b)) { \
        cout <<__FILE__<<" : "<< __LINE__<<" : check failed because "<<a<<"!="<<b<<endl;\
        exit(1);\
    }\
} while(0)

#define CUBLAS_CHECK(condition) \
do {\
    cublasStatus_t status = condition; \
    CHECK_EQ(status, CUBLAS_STATUS_SUCCESS); \
} while(0)

#define CUSPARSE_CHECK(condition)\
do {\
    cusparseStatus_t status = condition; \
    switch(status)\
    {\
        case CUSPARSE_STATUS_NOT_INITIALIZED:\
            cout << "CUSPARSE_STATUS_NOT_INITIALIZED" << endl;\
            break;\
        case CUSPARSE_STATUS_ALLOC_FAILED:\
            cout << "CUSPARSE_STATUS_ALLOC_FAILED" << endl;\
            break;\
        case CUSPARSE_STATUS_INVALID_VALUE:\
            cout << "CUSPARSE_STATUS_INVALID_VALUE" << endl;\
            break;\
        case CUSPARSE_STATUS_ARCH_MISMATCH:\
            cout << "CUSPARSE_STATUS_ARCH_MISMATCH" << endl;\
            break;\
        case CUSPARSE_STATUS_MAPPING_ERROR:\
            cout << "CUSPARSE_STATUS_MAPPING_ERROR" << endl;\
            break;\
        case CUSPARSE_STATUS_EXECUTION_FAILED:\
            cout << "CUSPARSE_STATUS_EXECUTION_FAILED" << endl;\
            break;\
        case CUSPARSE_STATUS_INTERNAL_ERROR:\
            cout << "CUSPARSE_STATUS_INTERNAL_ERROR" << endl;\
            break;\
        case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:\
            cout << "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED" << endl;\
            break;\
        case CUSPARSE_STATUS_ZERO_PIVOT:\
            cout << "CUSPARSE_STATUS_ZERO_PIVOT" << endl;\
    }\
    CHECK_EQ(status, CUSPARSE_STATUS_SUCCESS); \
} while(0)

#define CUDA_CHECK(condition)\
do {\
    cudaError_t error = condition;\
    CHECK_EQ(error, cudaSuccess);\
} while(0)

//check after kernel function
#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError())



#define __TIMING__ 1

#if __TIMING__


#define INIT_TIMER  cudaEvent_t start, stop; \
    float milliseconds = 0; \
    float sum = 0;\
    cudaEventCreate( &start );\
    cudaEventCreate( &stop );

#define TIC {  cudaEventRecord( start ); }

#if __CUDNN__
    #define PREDEFNAME "CUDNN"
#else
    #define PREDEFNAME "CUDA"
#endif

#define TOC(a) { cudaEventRecord( stop ); \
        cudaEventSynchronize( stop ); \
        cudaEventElapsedTime( &milliseconds, start, stop );  \
        printf( "GPU Execution time of %s_%s: %f ms\n",PREDEFNAME, a, milliseconds ); \
        sum += milliseconds;\
        fflush(stdout); }

#define CLOSE_TIMER {cudaEventDestroy(start); cudaEventDestroy(stop); }
#endif

using namespace std;

void dispArray(double* array, size_t width, size_t height) {
    for (int i=0; i < height;i++ ) {
        for (int j=0;j < width;j++) {
            cout << array[j*height+i] << ' ';
        }
        cout << endl;
    }
    cout << endl;
}

int main()
{
    srand(time(NULL));
    const int num_loop = 1;
    const int inside_loop = 1000;
    // const int WIDTH = 512*3*3;
    // const int HEIGHT = 512;
    // const int WIDTHOUT = 36;
    const int WIDTH = 4608;
    const int HEIGHT = 512;
    const int WIDTHOUT = 144;
    // const int WIDTH = 18500;
    // const int HEIGHT = 512;
    // const int WIDTHOUT = 1;
    // const int WIDTH = 3;
    // const int HEIGHT = 5;
    // const int WIDTHOUT = 2;
    INIT_TIMER
    ofstream myfile;
    myfile.open("test_sparsity.log");

    cudaError_t cudaStat;    
    cusparseStatus_t stat;
    cusparseHandle_t handle;
    cublasHandle_t handleblas;

    double *devPtrOutput;
    double *devPtrOutput2;
    double *devPtrRand;
    double *devPtrSec;
    CUDA_CHECK(cudaMalloc((void **)&(devPtrOutput), sizeof(double)*HEIGHT*WIDTHOUT));
    CUDA_CHECK(cudaMalloc((void **)&(devPtrOutput2), sizeof(double)*HEIGHT*WIDTHOUT));

    CUDA_CHECK(cudaMalloc((void **)&(devPtrRand), sizeof(double)*WIDTH*WIDTHOUT));
    CUDA_CHECK(cudaMalloc((void **)&(devPtrSec), sizeof(double)*WIDTH*HEIGHT));
    const double alpha=1.0;
    const double beta=0.0;
    double *csrVal;
    int *csrRowPtr;
    int *csrColInd;

    const bool SPARSE = true;
    long a = clock();
    long temp = clock();
    cusparseMatDescr_t descr;
    CUSPARSE_CHECK(cusparseCreateMatDescr(&descr));
    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
    int nnz;
    CUSPARSE_CHECK(cusparseCreate(&handle));
    CUBLAS_CHECK(cublasCreate(&handleblas));
    int *nnzPerRow_gpu;
    CUDA_CHECK(cudaMalloc((void **)&(nnzPerRow_gpu), sizeof(int)*HEIGHT));
    CUDA_CHECK(cudaMalloc((void **)&(csrRowPtr), sizeof(int)*(HEIGHT+1)));
    double density_array[1] = {0.9999};//, 0.8, 0.7, 0.6, 0.5,      0.4, 0.3, 0.2, 0.1 ,0.09,     0.08, 0.07, 0.06, 0.05 ,0.04,     0.03, 0.02, 0.01};
    for (int inddense=0;inddense < 1;inddense++) {
        double DENSITY = density_array[inddense];
        int num_non_zeros = DENSITY * (WIDTH * HEIGHT);

        CUDA_CHECK(cudaMalloc((void **)&(csrColInd), sizeof(int)*num_non_zeros));
        CUDA_CHECK(cudaMalloc((void **)&(csrVal), sizeof(double)*num_non_zeros));
        INIT_TIMER
        for (int iter=0; iter < num_loop;iter++) {
            vector<double> randVec(WIDTH*WIDTHOUT, 0);
            vector<double> secArray(WIDTH*HEIGHT, 0);
            vector<int> temp(WIDTH*HEIGHT, 1);

            for (int j = 0; j < WIDTH*WIDTHOUT; j++) {
                randVec[j]=(double)(rand()%100000)/100;
            }

            for (int x, i = 0; i < num_non_zeros;i++) {
                do
                {
                    x = rand() % (WIDTH*HEIGHT);
                } while(temp[x] == 0);
                temp[x]=0;
                secArray[x]=(double)(rand()%100000)/100;
            }
            int count = 0;
            for(int i=0;i < WIDTH*HEIGHT;i++) {
                if (secArray[i] != 0) {
                    count++;
                }
            }

            // randVec = {2,2,2,3,3,3};
            // secArray = {0,5,0,2,5,8,7,0,0,0,0,2,0,4,4};
            CUDA_CHECK(cudaMemcpy(devPtrRand, &randVec[0], sizeof(double)*WIDTH*WIDTHOUT, cudaMemcpyHostToDevice));
            CUDA_CHECK(cudaMemcpy(devPtrSec, &secArray[0], sizeof(double)*WIDTH*HEIGHT, cudaMemcpyHostToDevice));


            if (SPARSE) {
                CUSPARSE_CHECK(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, HEIGHT, WIDTH, descr, devPtrSec, HEIGHT, nnzPerRow_gpu, &nnz));
                CUSPARSE_CHECK(cusparseDdense2csr(handle, HEIGHT, WIDTH, descr,devPtrSec,HEIGHT,nnzPerRow_gpu,csrVal,csrRowPtr,csrColInd));
            }       
            // vector<double> tempcsrVal(nnz,0);
            // vector<int> tempcsrRowPtr(HEIGHT+1);
            // vector<int> tempcsrColInd(nnz,0);
            // CUDA_CHECK(cudaMemcpy(&tempcsrVal[0], csrVal, sizeof(double)*nnz, cudaMemcpyDeviceToHost));
            // CUDA_CHECK(cudaMemcpy(&tempcsrRowPtr[0], csrRowPtr, sizeof(int)*(HEIGHT+1), cudaMemcpyDeviceToHost));
            // CUDA_CHECK(cudaMemcpy(&tempcsrColInd[0], csrColInd, sizeof(int)*nnz, cudaMemcpyDeviceToHost));
            // for (int i =0; i < nnz;i++) {
                // cout << tempcsrVal[i] << " ";
            // }
            // cout << endl;
            // for (int i =0; i < HEIGHT+1;i++) {
                // cout << tempcsrRowPtr[i] << " ";
            // }
            // cout << endl;
            // for (int i =0; i < nnz;i++) {
                // cout << tempcsrColInd[i] << " ";
            // }
            // cout << endl;
            cudaDeviceSynchronize();
            TIC
            for (int i=0 ; i < inside_loop;i++) {
                if (WIDTHOUT == 1) {
                    // TIC
                    CUSPARSE_CHECK(cusparseDcsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                    HEIGHT, WIDTH, nnz, &alpha, descr, csrVal, csrRowPtr, csrColInd, 
                    devPtrRand, &beta, devPtrOutput));
                    // TOC("csrmv")
                } else {
                    // TIC
                    CUSPARSE_CHECK(cusparseDcsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
                        HEIGHT, WIDTHOUT, WIDTH, nnz, &alpha, descr, csrVal, csrRowPtr, 
                        csrColInd, devPtrRand, WIDTH, &beta, devPtrOutput, HEIGHT));
                    // TOC("csrmm")
                }
            }
            TOC("csr")
            TIC
            for (int i=0 ; i < inside_loop;i++) {
                if (WIDTHOUT == 1) {
                    // TIC
                    CUBLAS_CHECK(cublasDgemv(handleblas, CUBLAS_OP_N, HEIGHT, WIDTH, &alpha, devPtrSec, HEIGHT , devPtrRand, 1, &beta, devPtrOutput2, 1));
                    // TOC("dgemv")
                } else {
                    // TIC
                    CUBLAS_CHECK(cublasDgemm(handleblas, CUBLAS_OP_N, CUBLAS_OP_N, HEIGHT, WIDTHOUT, WIDTH, &alpha, devPtrSec, HEIGHT, devPtrRand, WIDTH, &beta, devPtrOutput2, HEIGHT));
                    // TOC("dgemm")
                }
            }
            TOC("blas")


            #if 0
            vector<double> output(HEIGHT*WIDTHOUT, 0);
            vector<double> output2(HEIGHT*WIDTHOUT, 0);
            CUDA_CHECK(cudaMemcpy(&output[0], devPtrOutput, sizeof(double)*HEIGHT*WIDTHOUT, cudaMemcpyDeviceToHost));
            CUDA_CHECK(cudaMemcpy(&output2[0], devPtrOutput2, sizeof(double)*HEIGHT*WIDTHOUT, cudaMemcpyDeviceToHost));
            dispArray(&output[0], WIDTHOUT, HEIGHT);
            cout << endl;
            for (int i=0;i < WIDTHOUT * HEIGHT;i++) {
                if (output[i] != output2[i]) {
                    cout << "error: " << i << " " << (output[i] - output2[i]) << " " << output[i] << endl;
                }
            }
            #endif

        }

        cout << DENSITY << " " << sum/num_loop << endl;
        myfile << DENSITY << " " << sum/num_loop << endl;
        cudaFree(csrColInd);
        cudaFree(csrVal);
    }
    myfile.close();
    cudaFree(csrRowPtr);
    cudaFree(nnzPerRow_gpu);
    cudaFree(devPtrOutput);
    cudaFree(devPtrOutput2);
    cudaFree(devPtrRand);
    cudaFree(devPtrSec);

}

However, after compiling the code with

g++ -std=c++1y -O3 -I/usr/local/cuda/include -o testcusparsevector testcusparsevector.cpp -L/usr/local/cuda/lib64 -lcudart -lcublas -lcusparse

here is the output:

GPU Execution time of CUDA_csr: 4818.447266 ms
GPU Execution time of CUDA_blas: 5024.459961 ms

which means that even with a density of 0.999, cusparseDcsrmm is still faster than cublasDgemm. I already checked the results, which are correct, and compared to other examples it seems the problem comes from cuBLAS being far too slow.

Do you have any idea where this comes from?

EDIT: I tried changing the values to float, and the result is more like what I was expecting; apparently, cuBLAS is not made for double computation...
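
Roughly, the substitution just swaps the D-variants of the calls for the S-variants. Here is a condensed, standalone sketch of what I mean (not the benchmark above: the sizes are the same, the matrix is simply left fully dense, and error checking is trimmed to asserts to keep it short):

// Sketch: single-precision version of the same comparison (see assumptions above).
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusparse_v2.h>

int main() {
    const int HEIGHT = 512, WIDTH = 4608, WIDTHOUT = 144;
    const float alpha = 1.0f, beta = 0.0f;

    cublasHandle_t hb;   assert(cublasCreate(&hb) == CUBLAS_STATUS_SUCCESS);
    cusparseHandle_t hs; assert(cusparseCreate(&hs) == CUSPARSE_STATUS_SUCCESS);
    cusparseMatDescr_t descr;
    assert(cusparseCreateMatDescr(&descr) == CUSPARSE_STATUS_SUCCESS);

    // Dense A (HEIGHT x WIDTH, column-major), B (WIDTH x WIDTHOUT), C (HEIGHT x WIDTHOUT)
    std::vector<float> hA((size_t)HEIGHT * WIDTH), hB((size_t)WIDTH * WIDTHOUT);
    for (size_t i = 0; i < hA.size(); i++) hA[i] = (float)(rand() % 100000) / 100.0f;
    for (size_t i = 0; i < hB.size(); i++) hB[i] = (float)(rand() % 100000) / 100.0f;

    float *dA, *dB, *dC, *dC2, *csrVal;
    int *nnzPerRow, *csrRowPtr, *csrColInd, nnz;
    cudaMalloc((void **)&dA, sizeof(float) * hA.size());
    cudaMalloc((void **)&dB, sizeof(float) * hB.size());
    cudaMalloc((void **)&dC,  sizeof(float) * HEIGHT * WIDTHOUT);
    cudaMalloc((void **)&dC2, sizeof(float) * HEIGHT * WIDTHOUT);
    cudaMalloc((void **)&nnzPerRow, sizeof(int) * HEIGHT);
    cudaMalloc((void **)&csrRowPtr, sizeof(int) * (HEIGHT + 1));
    cudaMemcpy(dA, hA.data(), sizeof(float) * hA.size(), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB.data(), sizeof(float) * hB.size(), cudaMemcpyHostToDevice);

    // Dense -> CSR, single precision (the S-variants of the same cuSPARSE calls)
    cusparseSnnz(hs, CUSPARSE_DIRECTION_ROW, HEIGHT, WIDTH, descr, dA, HEIGHT, nnzPerRow, &nnz);
    cudaMalloc((void **)&csrVal, sizeof(float) * nnz);
    cudaMalloc((void **)&csrColInd, sizeof(int) * nnz);
    cusparseSdense2csr(hs, HEIGHT, WIDTH, descr, dA, HEIGHT, nnzPerRow, csrVal, csrRowPtr, csrColInd);

    cudaEvent_t t0, t1; float ms;
    cudaEventCreate(&t0); cudaEventCreate(&t1);

    cudaEventRecord(t0);
    for (int i = 0; i < 1000; i++)
        cusparseScsrmm(hs, CUSPARSE_OPERATION_NON_TRANSPOSE, HEIGHT, WIDTHOUT, WIDTH, nnz,
                       &alpha, descr, csrVal, csrRowPtr, csrColInd, dB, WIDTH, &beta, dC, HEIGHT);
    cudaEventRecord(t1); cudaEventSynchronize(t1); cudaEventElapsedTime(&ms, t0, t1);
    printf("Scsrmm: %f ms\n", ms);

    cudaEventRecord(t0);
    for (int i = 0; i < 1000; i++)
        cublasSgemm(hb, CUBLAS_OP_N, CUBLAS_OP_N, HEIGHT, WIDTHOUT, WIDTH,
                    &alpha, dA, HEIGHT, dB, WIDTH, &beta, dC2, HEIGHT);
    cudaEventRecord(t1); cudaEventSynchronize(t1); cudaEventElapsedTime(&ms, t0, t1);
    printf("Sgemm:  %f ms\n", ms);

    // (cleanup of the device allocations and handles omitted in this sketch)
    return 0;
}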

Thanks in advance.

Upvotes: 3

Views: 488

Answers (1)

Robert Crovella

Reputation: 151799

The Titan X (like all current members of the Maxwell GPU family) has a 1:32 ratio of double-precision to single-precision floating-point throughput.
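
As a quick check, newer CUDA toolkits expose this ratio as a device attribute. A small sketch (assuming a toolkit recent enough to define cudaDevAttrSingleToDoublePrecisionPerfRatio); it prints the runtime's reported single-to-double precision performance ratio for each visible GPU, e.g. 32 on a Maxwell GeForce part and 3 on a Kepler Tesla:

// Sketch: print the SP:DP throughput ratio the CUDA runtime reports per device.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int count = 0;
    cudaGetDeviceCount(&count);
    for (int dev = 0; dev < count; dev++) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, dev);
        int ratio = 0;
        cudaDeviceGetAttribute(&ratio, cudaDevAttrSingleToDoublePrecisionPerfRatio, dev);
        printf("%s: SP:DP throughput ratio = %d:1\n", prop.name, ratio);
    }
    return 0;
}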

Normally, sparse matrix operations are memory-bandwidth bound, whereas dense matrix-matrix multiply is a classic example of a compute-bound problem.

So in your example, you're taking a problem that is typically compute-bound and running it as a sparse matrix multiply on a processor that has a relatively large amount of memory bandwidth and a relatively small amount of double-precision compute throughput.
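
To put rough numbers on that, here is a back-of-envelope roofline sketch for one DGEMM at your sizes. The FP64 peak and bandwidth figures are approximate published specs used as assumptions for illustration, not measurements, and a real GEMM won't reach 100% of peak:

// Sketch: roofline-style estimate for one DGEMM (m=512, n=144, k=4608).
// Peak FP64 throughput and memory bandwidth below are approximate spec-sheet values.
#include <cstdio>

struct Gpu { const char *name; double dp_tflops; double gbps; };

int main() {
    const double m = 512, n = 144, k = 4608;              // HEIGHT, WIDTHOUT, WIDTH
    const double flops = 2.0 * m * n * k;                 // one DGEMM
    const double bytes = 8.0 * (m * k + k * n + m * n);   // A, B, C each touched once
    const double intensity = flops / bytes;               // ~27 flop/byte

    const Gpu gpus[] = { {"GTX Titan X (Maxwell)", 0.19, 336.0},
                         {"Tesla K40 (base clock)", 1.43, 288.0} };
    for (int i = 0; i < 2; i++) {
        const Gpu &g = gpus[i];
        double ridge = g.dp_tflops * 1e12 / (g.gbps * 1e9);    // flop/byte where compute == bandwidth
        double t_compute = flops / (g.dp_tflops * 1e12) * 1e3; // ms per call if compute-limited
        double t_memory  = bytes / (g.gbps * 1e9) * 1e3;       // ms per call if bandwidth-limited
        printf("%-24s ridge %.2f flop/B, DGEMM %.1f flop/B -> >= %.2f ms compute vs %.3f ms memory per call\n",
               g.name, ridge, intensity, t_compute, t_memory);
    }
    return 0;
}

At roughly 27 flop/byte this DGEMM sits well above the FP64 ridge point of either card, i.e. it stays compute-bound; the difference is that the Titan X has only about 0.19 TFLOP/s of FP64 to spend on it, which is why your 1000-iteration cuBLAS loop lands in the multi-second range there and shrinks dramatically on the K40 shown below.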

This situation can blur the line between the two APIs, whereas normally the CUBLAS API would be much quicker in this comparison.

If you switch your code to use float instead of double, as I think you've already tried, you'll see CUBLAS win again. Likewise, if you ran the code as-is on a GPU with a different ratio between single- and double-precision throughput, you'd see CUBLAS win there as well.

"apparently, cuBLAS is not made for double computation..."

Rather than saying that, I would say that the GTX Titan X is not made (primarily) for double-precision computation. Try a Tesla K80, K40, or another GPU that has a closer ratio of double-precision to single-precision throughput.

Here's the output of your program running on an "unboosted" Tesla K40:

$ ./testcusparsevector
GPU Execution time of CUDA_csr: 8870.386719 ms
GPU Execution time of CUDA_blas: 1045.211792 ms

Disclaimer: I haven't attempted to study your code. I looked it over, and no obvious issues jump out at me. But there could be issues that I haven't spotted.

Upvotes: 3
