user3185689
user3185689

Reputation: 11

cublas cublasDgetrfBatched() batched LU factorization doesn't work with matrices bigger than 32x32

I wrote a cuda function for Matlab to perform a LU factorization of a batch of matrices using cublasDgetrfBatched(). The toolkit documentation of this function is here.

It works fine for matrices up to size 32x32. But it fails with status code CUBLAS_STATUS_INVALID_VALUE for bigger matrices. Below is my source code (gpuBatchedLU.cu):

#include "mex.h"
#include "gpu/mxGPUArray.h"

/* Includes, cuda */
#include <cuda_runtime.h>
#include <cublas_v2.h>

#include <string>
#include <sstream>

static std::string cublasGetErrorString(cublasStatus_t error) {
 switch (error) {
 case CUBLAS_STATUS_SUCCESS:
  return "CUBLAS_STATUS_SUCCESS";

 case CUBLAS_STATUS_NOT_INITIALIZED:
  return "CUBLAS_STATUS_NOT_INITIALIZED";

 case CUBLAS_STATUS_ALLOC_FAILED:
  return "CUBLAS_STATUS_ALLOC_FAILED";

 case CUBLAS_STATUS_INVALID_VALUE:
  return "CUBLAS_STATUS_INVALID_VALUE";

 case CUBLAS_STATUS_ARCH_MISMATCH:
  return "CUBLAS_STATUS_ARCH_MISMATCH";

 case CUBLAS_STATUS_MAPPING_ERROR:
  return "CUBLAS_STATUS_MAPPING_ERROR";

 case CUBLAS_STATUS_EXECUTION_FAILED:
  return "CUBLAS_STATUS_EXECUTION_FAILED";

 case CUBLAS_STATUS_INTERNAL_ERROR:
  return "CUBLAS_STATUS_INTERNAL_ERROR";
 }

 return "<unknown>";
}

inline bool cublasAssert(cublasStatus_t code, const char* file, int line) {
 if (code != CUBLAS_STATUS_SUCCESS) {
  std::stringstream ss;
  ss << "cublasAssert: " << cublasGetErrorString(code) << " in "
    << std::string(file) << ", line " << line << ".";
  mexErrMsgTxt(ss.str().c_str());
 }

 return code == CUBLAS_STATUS_SUCCESS;
}

inline bool cudaAssert(cudaError_t code, const char* file, int line) {
 if (code != cudaSuccess) {
  std::stringstream ss;
  ss << "cudaAssert: " << cudaGetErrorString(code) << " in "
    << std::string(file) << ", line " << line << ".";
  mexErrMsgTxt(ss.str().c_str());
 }

 return code == cudaSuccess;
}

inline bool mexGPUAssert(int code, const char* file, int line) {
 if (code != MX_GPU_SUCCESS) {
  std::stringstream ss;
  ss << "mexGPUAssert: could not initialize the Mathworks GPU API in "
    << std::string(file) << ", line " << line << ".";
  mexErrMsgTxt(ss.str().c_str());
 }

 return code == MX_GPU_SUCCESS;
}

#define cublasErrchk(ans) { cublasAssert((ans), __FILE__, __LINE__); }
#define cudaErrchk(ans) { cudaAssert((ans), __FILE__, __LINE__); }
#define mxGPUErrchk(ans) { mexGPUAssert((ans), __FILE__, __LINE__); }

void mexFunction(int nlhs, mxArray *plhs[], /* Output variables */int nrhs,
  const mxArray *prhs[]) /* Input variables */{
 if (nrhs != 1) { /* end if not one function arguments */
  mexErrMsgTxt("This function requires one input argument.");
  return;
 }

 if (nlhs > 3) { /* take three outputs */
  mexErrMsgTxt("This function takes a maximum of three output variables.");
  return;
 }

 mxGPUErrchk(mxInitGPU());

 const mxGPUArray* in1_gpu = mxGPUCreateFromMxArray(prhs[0]);
 size_t ndims = mxGPUGetNumberOfDimensions(in1_gpu);
 const size_t* dim = (const size_t*) mxGPUGetDimensions(in1_gpu);

 if (ndims != 3) { /* end if input arguments are of different dimensions */
  mexErrMsgTxt("The input argument must be a 3-dimensional array.");
  return;
 }

 cublasHandle_t handle;

 cublasErrchk(cublasCreate(&handle));

 int no_matrices = dim[2];
 int nrow = dim[0];
 int ncol = dim[1];
 int matrix_size = nrow * ncol;
 size_t i;

 std::stringstream ss;
 ss << "dim[2] = " << dim[2] << "\nno_matrices = " << no_matrices << "\nnrow = " << nrow << "\nmatrix_size = " << nrow << " x " << ncol << " = " << matrix_size << std::endl;
 mexPrintf(ss.str().c_str());

 mxGPUArray* gpu_array_inout = mxGPUCopyFromMxArray(prhs[0]);
 double* inout_storage = (double*) mxGPUGetData(gpu_array_inout);

 size_t info_dimensions[1] = { no_matrices };
 mxGPUArray* gpu_array_info = mxGPUCreateGPUArray(1, (mwSize*) info_dimensions, mxINT32_CLASS, mxREAL,
   MX_GPU_INITIALIZE_VALUES);
 int* out_info = (int*) mxGPUGetData(gpu_array_info);

 mexPrintf("after defining gpu_array_info\n");

 size_t pivot_dimensions[2] = { nrow, no_matrices };
 mxGPUArray* gpu_array_pivot = mxGPUCreateGPUArray(2, (mwSize*) pivot_dimensions, mxINT32_CLASS, mxREAL,
   MX_GPU_DO_NOT_INITIALIZE);
 int* out_pivot = (int*) mxGPUGetData(gpu_array_pivot);

 mexPrintf("after defining gpu_array_pivot\n");

 double** inout_pointers_CPU = (double**) malloc(no_matrices * sizeof(double*));
 for (i = 0; i < no_matrices; i++) {
  inout_pointers_CPU[i] = (double*) ((char*) inout_storage + i * ((size_t) matrix_size) * sizeof(double));
 }
 double** inout_pointers_GPU;
 cudaErrchk(cudaMalloc((void** )&inout_pointers_GPU, no_matrices * sizeof(double*)));
 cudaErrchk(
   cudaMemcpy(inout_pointers_GPU, inout_pointers_CPU, no_matrices * sizeof(double*), cudaMemcpyHostToDevice));
 free(inout_pointers_CPU);

 ss.clear();
 ss << "check again before calling cublasDgetrfBatched:\nnrow = " << nrow << "\nno_matrices = " << no_matrices << std::endl;
 mexPrintf(ss.str().c_str());

 cublasErrchk(cublasDgetrfBatched(handle, nrow, inout_pointers_GPU, nrow, out_pivot, out_info, no_matrices));

 cublasErrchk(cublasDestroy(handle));

 cudaErrchk(cudaFree(inout_pointers_GPU));

 if (mxIsGPUArray(prhs[0])) {
  plhs[0] = mxGPUCreateMxArrayOnGPU(gpu_array_inout);
  if (nlhs > 1) {
   plhs[1] = mxGPUCreateMxArrayOnGPU(gpu_array_pivot);
   if (nlhs > 2) {
    plhs[2] = mxGPUCreateMxArrayOnGPU(gpu_array_info);
   }
  }
 } else {
  plhs[0] = mxGPUCreateMxArrayOnCPU(gpu_array_inout);
  if (nlhs > 1) {
   plhs[1] = mxGPUCreateMxArrayOnCPU(gpu_array_pivot);
   if (nlhs > 2) {
    plhs[2] = mxGPUCreateMxArrayOnCPU(gpu_array_info);
   }
  }
 }

 mxGPUDestroyGPUArray(gpu_array_inout);
 mxGPUDestroyGPUArray(gpu_array_pivot);
 mxGPUDestroyGPUArray(gpu_array_info);
 mxFree((void*) dim);

 return;
}

I compile as follows:

mex -L/usr/local/cuda/lib64 -lcudart -lcublas gpuBatchedLU.cu

And I call from MATLAB:

[a1,b1,c1]=gpuBatchedLU(randn(32,32,5)); %no problem
[a2,b2,c2]=gpuBatchedLU(randn(33,33,5)); %produces CUBLAS_STATUS_INVALID_VALUE

I use Matlab R2013b with the parallel toolbox, Cuda 5.5, and a NVS 5200M graphics chip.

Can anyone replicate this problem? I would appreciate any suggestions on how to solve this problem.

Upvotes: 0

Views: 919

Answers (1)

user3185689
user3185689

Reputation: 11

The problem seems to be with Matlab R2013b using libcublas.so in version 5.0. The file link is in /MATLAB/R2013b/bin/glnxa64/. Once I changed the link to the libcublas.so of my Cuda 5.5 installation it worked fine.

Upvotes: 1

Related Questions