Trying to run a CusolverSSgels testcase, however it is not working

Question

I'm busy working on a LS method, I manually implemented a conjugate gradient solver, but after updating my CUDA version, I saw that there is a new function (cusolverDnSSgels) which I assume is faster than my manual implementation. My first task was to try and run it on a test case (see below), I'd expect the result to be: -6.5, 9.7 according to MATlab. Unfortunately I cannot find what I did wrong, I also cannot find an example because it is a relatively new function.

The output says that niter= -3, which would suggest too many iterations according to the documentation, however this would not make sense, as it is a very small matrix which should be easily solvable.

#include 
#include 
#include 
#include 
#include "device_launch_parameters.h"


int main()
{   
    //init id, handle and stat
    int id = cudaGetDevice(&id);
    cusolverDnHandle_t cusolverH;
    cusolverStatus_t stat;

    // create handle
    stat = cusolverDnCreate(&cusolverH);

    //params
    const int C = 3;
    const int M = 2;
    long lda = C;

    //init variables
    float *Amat, *Ymat, *Xmat;
    float *gAmat, *gYmat, *gXmat;

    //allocate mem
    Amat = (float*)malloc(M * C * sizeof(float));
    Ymat = (float*)malloc(C * sizeof(float));
    Xmat = (float*)malloc(M * sizeof(float));

    srand(100);
#if 0
    for (int i = 0; i < C * M; i++) {
        Amat[i] = rand() % 10 + 1;
        Amat[i] = (float)Amat[i];

    }

    for (int i = 0; i < C; i++) {
        Ymat[i] = rand() % 10 + 1;
        Ymat[i] = (float)Ymat[i];
    }
#endif
    Amat[0] = 6;
    Amat[1] = 7;
    Amat[2] = 6;
    Amat[3] = 5;
    Amat[4] = 5;
    Amat[5] = 5;
    Ymat[0] = 9;
    Ymat[1] = 3;
    Ymat[2] = 10;

    //allocate mem
    cudaMalloc(&gAmat, M * C * sizeof(float));
    cudaMalloc(&gYmat, C * sizeof(float));
    cudaMalloc(&gXmat, M * 1 * sizeof(float));

    //copy mem
    cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);

    float *gdwork;
    size_t work_bytes;

    stat = cusolverDnSSgels_bufferSize(cusolverH,C, M, 1, gAmat, lda, gYmat, C, gXmat, M, NULL, &work_bytes);

    std::cout << "Status = " << stat << std::endl;

    int niter = 0;
    int dinfo = 0;

    cudaMalloc(&gdwork, work_bytes * sizeof(float));

    stat = cusolverDnSSgels(cusolverH, C, M, 1, gAmat, lda, gYmat, C, gXmat, M, gdwork, work_bytes, &niter, &dinfo);

    std::cout << "Status = " << stat  << std::endl;
    std::cout << "niter = "  << niter << std::endl;
    std::cout << "dinfo = "  << dinfo << std::endl;

    cudaDeviceSynchronize();

    cudaMemcpy(Xmat, gXmat, M * 1 * sizeof(float), cudaMemcpyDeviceToHost);


    //Output printed
    std::cout << Xmat[0] << ", " << Xmat[1] << std::endl;

    //free memory
    cudaFree(gdwork);
    free(Amat);
    free(Ymat);
    free(Xmat);


    cudaFree(gXmat);
    cudaFree(gAmat);
    cudaFree(gYmat);

    //destory handle
    cusolverDnDestroy(cusolverH);



    return 0;
}

The results I get are:

Status = 0
Status = 0
niter = -3
dinfo = 0
-4.31602e+08, -4.31602e+08

Could someone point out what I am doing wrong?

Trying to run a CusolverSSgels testcase, however it is not working

Answers (1)

Related Questions