Reputation: 43
I'm trying to do a simple test of the Csrmv_mp function. I have a working program, but I get a wrong result for a specific matrix/vector pair. If I run the exact same program with Csrmv instead, I get the correct result.
Here is my code (it is a reduced version of the example in appendix C of the cuSPARSE documentation: http://docs.nvidia.com/cuda/cusparse/index.html#csrmv_examples):
/*
 * How to compile (assuming CUDA is installed at /usr/local/cuda/):
 * nvcc -c -I/usr/local/cuda/include csrmvmp_example.cpp
 * g++ -fopenmp -o csrmvmp_example csrmvmp_example.o -L/usr/local/cuda/lib64 -lcublas -lcusparse -lcudart
 */
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusparse.h>
void printMatrix(int m, int n, const double *A, int lda, const char *name)
{
    for (int row = 0; row < m; row++) {
        for (int col = 0; col < n; col++) {
            double Areg = A[row + col * lda];
            printf("%s(%d,%d) = %f\n", name, row + 1, col + 1, Areg);
        }
    }
}
int main(int argc, char *argv[])
{
    cublasHandle_t cublasH = NULL;
    cusparseHandle_t cusparseH = NULL;
    cudaStream_t stream = NULL;
    cusparseMatDescr_t descrA = NULL;

    cublasStatus_t cublasStat = CUBLAS_STATUS_SUCCESS;
    cusparseStatus_t cusparseStat = CUSPARSE_STATUS_SUCCESS;
    cudaError_t cudaStat1 = cudaSuccess;
    cudaError_t cudaStat2 = cudaSuccess;
    cudaError_t cudaStat3 = cudaSuccess;
    cudaError_t cudaStat4 = cudaSuccess;
    cudaError_t cudaStat5 = cudaSuccess;

    const int n = 3;
    const int nnzA = 6;
    /*
     *     | 0 1 2 |
     * A = | 1 0 3 |
     *     | 2 3 0 |
     *
     * Initial vector
     *
     *     | 1/3 |
     * v = | 1/3 |
     *     | 1/3 |
     */
    const int csrRowPtrA[n + 1] = { 0, 2, 4, 6 };
    const int csrColIndA[nnzA] = { 1, 2, 0, 2, 0, 1 };
    const double csrValA[nnzA] = { 1.0, 2.0, 1.0, 3.0, 2.0, 3.0 };
    const double x0[n] = { 1.0/3.0, 1.0/3.0, 1.0/3.0 }; /* initial guess */
    double x[n]; /* result vector on the host */

    int *d_csrRowPtrA = NULL;
    int *d_csrColIndA = NULL;
    double *d_csrValA = NULL;
    double *d_x = NULL; /* input vector */
    double *d_y = NULL; /* output vector */

    const double tol = 1.e-6; /* unused in this reduced version */
    const int max_ites = 30;  /* unused in this reduced version */
    const double h_one = 1.0;
    const double h_zero = 0.0;
    /* step 1: create cublas/cusparse handles, bind a stream */
    cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    assert(cudaSuccess == cudaStat1);

    cublasStat = cublasCreate(&cublasH);
    assert(CUBLAS_STATUS_SUCCESS == cublasStat);
    cublasStat = cublasSetStream(cublasH, stream);
    assert(CUBLAS_STATUS_SUCCESS == cublasStat);

    cusparseStat = cusparseCreate(&cusparseH);
    assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);
    cusparseStat = cusparseSetStream(cusparseH, stream);
    assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);

    /* step 2: configuration of matrix A */
    cusparseStat = cusparseCreateMatDescr(&descrA);
    assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);
    cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);

    /* step 3: copy A and x0 to device */
    cudaStat1 = cudaMalloc((void**)&d_csrRowPtrA, sizeof(int) * (n + 1));
    cudaStat2 = cudaMalloc((void**)&d_csrColIndA, sizeof(int) * nnzA);
    cudaStat3 = cudaMalloc((void**)&d_csrValA, sizeof(double) * nnzA);
    cudaStat4 = cudaMalloc((void**)&d_x, sizeof(double) * n);
    cudaStat5 = cudaMalloc((void**)&d_y, sizeof(double) * n);
    assert(cudaSuccess == cudaStat1);
    assert(cudaSuccess == cudaStat2);
    assert(cudaSuccess == cudaStat3);
    assert(cudaSuccess == cudaStat4);
    assert(cudaSuccess == cudaStat5);

    cudaStat1 = cudaMemcpy(d_csrRowPtrA, csrRowPtrA, sizeof(int) * (n + 1),
                           cudaMemcpyHostToDevice);
    cudaStat2 = cudaMemcpy(d_csrColIndA, csrColIndA, sizeof(int) * nnzA,
                           cudaMemcpyHostToDevice);
    cudaStat3 = cudaMemcpy(d_csrValA, csrValA, sizeof(double) * nnzA,
                           cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat1);
    assert(cudaSuccess == cudaStat2);
    assert(cudaSuccess == cudaStat3);

    /* step 4.1: initial guess x0 */
    cudaStat1 = cudaMemcpy(d_x, x0, sizeof(double) * n, cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat1);

    /* step 4.3: y = A*x */
    cusparseStat = cusparseDcsrmv_mp(cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                     n, n, nnzA, &h_one, descrA,
                                     d_csrValA, d_csrRowPtrA, d_csrColIndA,
                                     d_x, &h_zero, d_y);
    assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);

    /* step 5: report result */
    cudaStat1 = cudaMemcpy(x, d_y, sizeof(double) * n, cudaMemcpyDeviceToHost);
    assert(cudaSuccess == cudaStat1);

    printf("vector = \n");
    printMatrix(n, 1, x, n, "V0");
    printf("=====\n");

    /* free resources */
    if (d_csrRowPtrA) cudaFree(d_csrRowPtrA);
    if (d_csrColIndA) cudaFree(d_csrColIndA);
    if (d_csrValA)    cudaFree(d_csrValA);
    if (d_x)          cudaFree(d_x);
    if (d_y)          cudaFree(d_y);
    if (cublasH)      cublasDestroy(cublasH);
    if (cusparseH)    cusparseDestroy(cusparseH);
    if (stream)       cudaStreamDestroy(stream);
    if (descrA)       cusparseDestroyMatDescr(descrA);
    cudaDeviceReset();
    return 0;
}
The resulting vector is {1, 1, 1}, but doing the calculation by hand or with the Csrmv function I get {1, 4/3, 5/3}. I really don't understand why this happens. The only thing I can think of is that I wrote the matrix incorrectly in CSR format. Also, I don't use CUSPARSE_MATRIX_TYPE_SYMMETRIC because the function doesn't accept this matrix type (the documentation is wrong about that).
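To rule out a bad CSR encoding, here is a minimal host-side version of y = A*x (plain C, no cuSPARSE involved; my own sanity check, not part of the NVIDIA example). For exactly the arrays above it prints {1, 4/3, 5/3}, so the CSR layout itself looks correct:
#include <stdio.h>

/* Host-side CSR y = A*x for the 3x3 matrix above. */
int main(void)
{
    const int n = 3;
    const int rowPtr[4] = { 0, 2, 4, 6 };
    const int colInd[6] = { 1, 2, 0, 2, 0, 1 };
    const double val[6] = { 1.0, 2.0, 1.0, 3.0, 2.0, 3.0 };
    const double x[3] = { 1.0/3.0, 1.0/3.0, 1.0/3.0 };
    double y[3];
    for (int row = 0; row < n; row++) {
        double sum = 0.0;
        for (int j = rowPtr[row]; j < rowPtr[row + 1]; j++)
            sum += val[j] * x[colInd[j]];
        y[row] = sum;
    }
    /* prints 1.000000 1.333333 1.666667, i.e. {1, 4/3, 5/3} */
    printf("%f %f %f\n", y[0], y[1], y[2]);
    return 0;
}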
If someone could help me I would appreciate it very much.
Edit: I'm using CUDA 9.0, my OS is Windows 10 Home, and my GPU is a GTX 960M.
Upvotes: 0
Views: 193
Reputation: 43
I just updated to CUDA 9.1 and, as Robert Crovella said, the bug has been solved.
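In case it helps anyone else hitting this: you can confirm which runtime and driver a binary actually picks up with the standard runtime API calls cudaRuntimeGetVersion and cudaDriverGetVersion (the value 9010 corresponds to CUDA 9.1):
#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    int runtimeVersion = 0, driverVersion = 0;
    /* encoded as 1000*major + 10*minor, e.g. 9010 for CUDA 9.1 */
    cudaRuntimeGetVersion(&runtimeVersion);
    cudaDriverGetVersion(&driverVersion);
    printf("runtime: %d, driver: %d\n", runtimeVersion, driverVersion);
    return 0;
}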
Upvotes: 1