Reputation: 171
I'm busy working on a LS method, I manually implemented a conjugate gradient solver, but after updating my CUDA version, I saw that there is a new function (cusolverDnSSgels) which I assume is faster than my manual implementation. My first task was to try and run it on a test case (see below), I'd expect the result to be: -6.5, 9.7
according to MATlab. Unfortunately I cannot find what I did wrong, I also cannot find an example because it is a relatively new function.
The output says that niter= -3
, which would suggest too many iterations according to the documentation, however this would not make sense, as it is a very small matrix which should be easily solvable.
#include <iostream>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>
#include "device_launch_parameters.h"
int main()
{
//init id, handle and stat
int id = cudaGetDevice(&id);
cusolverDnHandle_t cusolverH;
cusolverStatus_t stat;
// create handle
stat = cusolverDnCreate(&cusolverH);
//params
const int C = 3;
const int M = 2;
long lda = C;
//init variables
float *Amat, *Ymat, *Xmat;
float *gAmat, *gYmat, *gXmat;
//allocate mem
Amat = (float*)malloc(M * C * sizeof(float));
Ymat = (float*)malloc(C * sizeof(float));
Xmat = (float*)malloc(M * sizeof(float));
srand(100);
#if 0
for (int i = 0; i < C * M; i++) {
Amat[i] = rand() % 10 + 1;
Amat[i] = (float)Amat[i];
}
for (int i = 0; i < C; i++) {
Ymat[i] = rand() % 10 + 1;
Ymat[i] = (float)Ymat[i];
}
#endif
Amat[0] = 6;
Amat[1] = 7;
Amat[2] = 6;
Amat[3] = 5;
Amat[4] = 5;
Amat[5] = 5;
Ymat[0] = 9;
Ymat[1] = 3;
Ymat[2] = 10;
//allocate mem
cudaMalloc(&gAmat, M * C * sizeof(float));
cudaMalloc(&gYmat, C * sizeof(float));
cudaMalloc(&gXmat, M * 1 * sizeof(float));
//copy mem
cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
float *gdwork;
size_t work_bytes;
stat = cusolverDnSSgels_bufferSize(cusolverH,C, M, 1, gAmat, lda, gYmat, C, gXmat, M, NULL, &work_bytes);
std::cout << "Status = " << stat << std::endl;
int niter = 0;
int dinfo = 0;
cudaMalloc(&gdwork, work_bytes * sizeof(float));
stat = cusolverDnSSgels(cusolverH, C, M, 1, gAmat, lda, gYmat, C, gXmat, M, gdwork, work_bytes, &niter, &dinfo);
std::cout << "Status = " << stat << std::endl;
std::cout << "niter = " << niter << std::endl;
std::cout << "dinfo = " << dinfo << std::endl;
cudaDeviceSynchronize();
cudaMemcpy(Xmat, gXmat, M * 1 * sizeof(float), cudaMemcpyDeviceToHost);
//Output printed
std::cout << Xmat[0] << ", " << Xmat[1] << std::endl;
//free memory
cudaFree(gdwork);
free(Amat);
free(Ymat);
free(Xmat);
cudaFree(gXmat);
cudaFree(gAmat);
cudaFree(gYmat);
//destory handle
cusolverDnDestroy(cusolverH);
return 0;
}
The results I get are:
Status = 0
Status = 0
niter = -3
dinfo = 0
-4.31602e+08, -4.31602e+08
Could someone point out what I am doing wrong?
Upvotes: 1
Views: 80
Reputation: 152269
You have a problem with your dinfo
parameter usage. Referring to the documentation, we see that:
Parameters of cusolverDngels() functions
parameter Memory In/out Meaning
dinfo device output Status of the IRS solver on the return. If 0 - solve was successful. If dinfo = -i then i-th argument is not valid.
the dinfo
parameter is expected to live in device memory. But you have it in host memory:
int dinfo = 0;
If I move the storage to the proper location, your code outputs the values you indicate as expected:
$ cat t143.cu
#include <iostream>
#include <cublas_v2.h>
#include <cusolverDn.h>
int main()
{
//init id, handle and stat
int id = cudaGetDevice(&id);
cusolverDnHandle_t cusolverH;
cusolverStatus_t stat;
// create handle
stat = cusolverDnCreate(&cusolverH);
//params
const int C = 3;
const int M = 2;
long lda = C;
//init variables
float *Amat, *Ymat, *Xmat;
float *gAmat, *gYmat, *gXmat;
//allocate mem
Amat = (float*)malloc(M * C * sizeof(float));
Ymat = (float*)malloc(C * sizeof(float));
Xmat = (float*)malloc(M * sizeof(float));
srand(100);
#if 0
for (int i = 0; i < C * M; i++) {
Amat[i] = rand() % 10 + 1;
Amat[i] = (float)Amat[i];
}
for (int i = 0; i < C; i++) {
Ymat[i] = rand() % 10 + 1;
Ymat[i] = (float)Ymat[i];
}
#endif
Amat[0] = 6;
Amat[1] = 7;
Amat[2] = 6;
Amat[3] = 5;
Amat[4] = 5;
Amat[5] = 5;
Ymat[0] = 9;
Ymat[1] = 3;
Ymat[2] = 10;
//allocate mem
cudaMalloc(&gAmat, M * C * sizeof(float));
cudaMalloc(&gYmat, C * sizeof(float));
cudaMalloc(&gXmat, M * 1 * sizeof(float));
//copy mem
cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
float *gdwork;
size_t work_bytes;
stat = cusolverDnSSgels_bufferSize(cusolverH,C, M, 1, gAmat, lda, gYmat, C, gXmat, M, NULL, &work_bytes);
std::cout << "Status = " << stat << std::endl;
int niter = 0;
int *dinfo, hinfo;
cudaMalloc(&gdwork, work_bytes * sizeof(float));
cudaMalloc(&dinfo, sizeof(int));
stat = cusolverDnSSgels(cusolverH, C, M, 1, gAmat, lda, gYmat, C, gXmat, M, gdwork, work_bytes, &niter, dinfo);
cudaMemcpy(&hinfo, dinfo, sizeof(int), cudaMemcpyDeviceToHost);
std::cout << "Status = " << stat << std::endl;
std::cout << "niter = " << niter << std::endl;
std::cout << "dinfo = " << hinfo << std::endl;
cudaDeviceSynchronize();
cudaMemcpy(Xmat, gXmat, M * 1 * sizeof(float), cudaMemcpyDeviceToHost);
//Output printed
std::cout << Xmat[0] << ", " << Xmat[1] << std::endl;
//free memory
cudaFree(gdwork);
free(Amat);
free(Ymat);
free(Xmat);
cudaFree(gXmat);
cudaFree(gAmat);
cudaFree(gYmat);
//destory handle
cusolverDnDestroy(cusolverH);
return 0;
}
$ nvcc -o t143 t143.cu -lcublas -lcusolver
$ cuda-memcheck ./t143
========= CUDA-MEMCHECK
Status = 0
Status = 0
niter = -51
dinfo = 0
-6.5, 9.7
========= ERROR SUMMARY: 0 errors
$
Notes:
I am using CUDA 11.3 for the above. If you are using an earlier version, I strongly recommend you move forward to CUDA 11.3 or newer for usage of this function.
You can get a hint as to the problem by running your code with cuda-memcheck
It was fairly quick to spot the problem by reviewing your parameter usage with the table of parameter locations (host/device) given in the documentation. You had a problem here which was similar in that you could focus in on the problem by reviewing your parameter locations (host/device) against the table given in the documentation. This may be a good thing to check to save yourself time in the future.
Upvotes: 2