Reputation: 43
I'm trying to do a simple test of the Csrmv_mp function. I have a working program, but I get a wrong result for a specific matrix/vector pair. If I run the exact same program with Csrmv instead, I get the correct result.
Here is my code (it is a reduced version of the example in appendix C of the cuSPARSE documentation: http://docs.nvidia.com/cuda/cusparse/index.html#csrmv_examples):
/*
 * How to compile (assuming CUDA is installed at /usr/local/cuda/):
 * nvcc -c -I/usr/local/cuda/include csrmvmp_example.cpp
 * g++ -fopenmp -o csrmvmp_example csrmvmp_example.o -L/usr/local/cuda/lib64 -lcublas -lcusparse -lcudart
 */
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusparse.h>
void printMatrix(int m, int n, const double *A, int lda, const char *name)
{
    for (int row = 0; row < m; row++) {
        for (int col = 0; col < n; col++) {
            double Areg = A[row + col * lda];
            printf("%s(%d,%d) = %f\n", name, row + 1, col + 1, Areg);
        }
    }
}
int main(int argc, char *argv[])
{
    cublasHandle_t cublasH = NULL;
    cusparseHandle_t cusparseH = NULL;
    cudaStream_t stream = NULL;
    cusparseMatDescr_t descrA = NULL;

    cublasStatus_t cublasStat = CUBLAS_STATUS_SUCCESS;
    cusparseStatus_t cusparseStat = CUSPARSE_STATUS_SUCCESS;
    cudaError_t cudaStat1 = cudaSuccess;
    cudaError_t cudaStat2 = cudaSuccess;
    cudaError_t cudaStat3 = cudaSuccess;
    cudaError_t cudaStat4 = cudaSuccess;
    cudaError_t cudaStat5 = cudaSuccess;

    const int n = 3;
    const int nnzA = 6;
    /*
     *     | 0 1 2 |
     * A = | 1 0 3 |
     *     | 2 3 0 |
     *
     * Initial vector
     *
     *     | 1/3 |
     * v = | 1/3 |
     *     | 1/3 |
     */
    const int csrRowPtrA[n + 1] = { 0, 2, 4, 6 };
    const int csrColIndA[nnzA] = { 1, 2, 0, 2, 0, 1 };
    const double csrValA[nnzA] = { 1.0, 2.0, 1.0, 3.0, 2.0, 3.0 };
    const double x0[n] = { 1.0/3.0, 1.0/3.0, 1.0/3.0 }; /* initial guess */
    double x[n]; /* result vector on the host */

    int *d_csrRowPtrA = NULL;
    int *d_csrColIndA = NULL;
    double *d_csrValA = NULL;
    double *d_x = NULL; /* input vector */
    double *d_y = NULL; /* output vector */

    const double tol = 1.e-6; /* unused in this reduced version */
    const int max_ites = 30;  /* unused in this reduced version */
    const double h_one = 1.0;
    const double h_zero = 0.0;
    /* step 1: create cublas/cusparse handles, bind a stream */
    cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    assert(cudaSuccess == cudaStat1);

    cublasStat = cublasCreate(&cublasH);
    assert(CUBLAS_STATUS_SUCCESS == cublasStat);
    cublasStat = cublasSetStream(cublasH, stream);
    assert(CUBLAS_STATUS_SUCCESS == cublasStat);

    cusparseStat = cusparseCreate(&cusparseH);
    assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);
    cusparseStat = cusparseSetStream(cusparseH, stream);
    assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);

    /* step 2: configuration of matrix A */
    cusparseStat = cusparseCreateMatDescr(&descrA);
    assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);
    cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);

    /* step 3: copy A and x0 to device */
    cudaStat1 = cudaMalloc((void**)&d_csrRowPtrA, sizeof(int) * (n + 1));
    cudaStat2 = cudaMalloc((void**)&d_csrColIndA, sizeof(int) * nnzA);
    cudaStat3 = cudaMalloc((void**)&d_csrValA, sizeof(double) * nnzA);
    cudaStat4 = cudaMalloc((void**)&d_x, sizeof(double) * n);
    cudaStat5 = cudaMalloc((void**)&d_y, sizeof(double) * n);
    assert(cudaSuccess == cudaStat1);
    assert(cudaSuccess == cudaStat2);
    assert(cudaSuccess == cudaStat3);
    assert(cudaSuccess == cudaStat4);
    assert(cudaSuccess == cudaStat5);

    cudaStat1 = cudaMemcpy(d_csrRowPtrA, csrRowPtrA, sizeof(int) * (n + 1),
                           cudaMemcpyHostToDevice);
    cudaStat2 = cudaMemcpy(d_csrColIndA, csrColIndA, sizeof(int) * nnzA,
                           cudaMemcpyHostToDevice);
    cudaStat3 = cudaMemcpy(d_csrValA, csrValA, sizeof(double) * nnzA,
                           cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat1);
    assert(cudaSuccess == cudaStat2);
    assert(cudaSuccess == cudaStat3);

    /* step 4.1: initial guess x0 */
    cudaStat1 = cudaMemcpy(d_x, x0, sizeof(double) * n, cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat1);

    /* step 4.3: y = A*x */
    cusparseStat = cusparseDcsrmv_mp(cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                     n, n, nnzA, &h_one, descrA,
                                     d_csrValA, d_csrRowPtrA, d_csrColIndA,
                                     d_x, &h_zero, d_y);
    assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);

    /* step 5: report result */
    cudaStat1 = cudaMemcpy(x, d_y, sizeof(double) * n, cudaMemcpyDeviceToHost);
    assert(cudaSuccess == cudaStat1);

    printf("vector = \n");
    printMatrix(n, 1, x, n, "V0");
    printf("=====\n");

    /* free resources */
    if (d_csrRowPtrA) cudaFree(d_csrRowPtrA);
    if (d_csrColIndA) cudaFree(d_csrColIndA);
    if (d_csrValA)    cudaFree(d_csrValA);
    if (d_x)          cudaFree(d_x);
    if (d_y)          cudaFree(d_y);
    if (cublasH)      cublasDestroy(cublasH);
    if (cusparseH)    cusparseDestroy(cusparseH);
    if (stream)       cudaStreamDestroy(stream);
    if (descrA)       cusparseDestroyMatDescr(descrA);
    cudaDeviceReset();
    return 0;
}
The resulting vector is {1, 1, 1}, but doing the calculation by hand or with the Csrmv function I get {1, 4/3, 5/3}. I really don't understand why this happens. The only thing I can think of is that I wrote the matrix incorrectly in CSR format. Also, I don't use CUSPARSE_MATRIX_TYPE_SYMMETRIC because the function doesn't accept this matrix type (the documentation is wrong about that).
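To rule out a bad CSR encoding, here is a minimal host-side version of y = A*x (plain C, no cuSPARSE involved; my own sanity check, not part of the NVIDIA example). For exactly the arrays above it prints {1, 4/3, 5/3}, so the CSR layout itself looks correct:
#include <stdio.h>

/* Host-side CSR y = A*x for the 3x3 matrix above. */
int main(void)
{
    const int n = 3;
    const int rowPtr[4] = { 0, 2, 4, 6 };
    const int colInd[6] = { 1, 2, 0, 2, 0, 1 };
    const double val[6] = { 1.0, 2.0, 1.0, 3.0, 2.0, 3.0 };
    const double x[3] = { 1.0/3.0, 1.0/3.0, 1.0/3.0 };
    double y[3];
    for (int row = 0; row < n; row++) {
        double sum = 0.0;
        for (int j = rowPtr[row]; j < rowPtr[row + 1]; j++)
            sum += val[j] * x[colInd[j]];
        y[row] = sum;
    }
    /* prints 1.000000 1.333333 1.666667, i.e. {1, 4/3, 5/3} */
    printf("%f %f %f\n", y[0], y[1], y[2]);
    return 0;
}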
If someone could help me I would appreciate it very much.
Edit: I'm using CUDA 9.0, my OS is Windows 10 Home, and my GPU is a GTX 960M.
Upvotes: 0
Views: 193
Reputation: 43
I just updated to CUDA 9.1 and, as Robert Crovella said, the bug has been solved.
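In case it helps anyone else hitting this: you can confirm which runtime and driver a binary actually picks up with the standard runtime API calls cudaRuntimeGetVersion and cudaDriverGetVersion (the value 9010 corresponds to CUDA 9.1):
#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    int runtimeVersion = 0, driverVersion = 0;
    /* encoded as 1000*major + 10*minor, e.g. 9010 for CUDA 9.1 */
    cudaRuntimeGetVersion(&runtimeVersion);
    cudaDriverGetVersion(&driverVersion);
    printf("runtime: %d, driver: %d\n", runtimeVersion, driverVersion);
    return 0;
}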
Upvotes: 1