cublas cublasDgetrfBatched() batched LU factorization doesn't work with matrices bigger than 32x32

Question

I wrote a cuda function for Matlab to perform a LU factorization of a batch of matrices using cublasDgetrfBatched(). The toolkit documentation of this function is here.

It works fine for matrices up to size 32x32. But it fails with status code CUBLAS_STATUS_INVALID_VALUE for bigger matrices. Below is my source code (gpuBatchedLU.cu):

#include "mex.h"
#include "gpu/mxGPUArray.h"

/* Includes, cuda */
#include 
#include 

#include 
#include 

static std::string cublasGetErrorString(cublasStatus_t error) {
 switch (error) {
 case CUBLAS_STATUS_SUCCESS:
  return "CUBLAS_STATUS_SUCCESS";

 case CUBLAS_STATUS_NOT_INITIALIZED:
  return "CUBLAS_STATUS_NOT_INITIALIZED";

 case CUBLAS_STATUS_ALLOC_FAILED:
  return "CUBLAS_STATUS_ALLOC_FAILED";

 case CUBLAS_STATUS_INVALID_VALUE:
  return "CUBLAS_STATUS_INVALID_VALUE";

 case CUBLAS_STATUS_ARCH_MISMATCH:
  return "CUBLAS_STATUS_ARCH_MISMATCH";

 case CUBLAS_STATUS_MAPPING_ERROR:
  return "CUBLAS_STATUS_MAPPING_ERROR";

 case CUBLAS_STATUS_EXECUTION_FAILED:
  return "CUBLAS_STATUS_EXECUTION_FAILED";

 case CUBLAS_STATUS_INTERNAL_ERROR:
  return "CUBLAS_STATUS_INTERNAL_ERROR";
 }

 return "";
}

inline bool cublasAssert(cublasStatus_t code, const char* file, int line) {
 if (code != CUBLAS_STATUS_SUCCESS) {
  std::stringstream ss;
  ss << "cublasAssert: " << cublasGetErrorString(code) << " in "
    << std::string(file) << ", line " << line << ".";
  mexErrMsgTxt(ss.str().c_str());
 }

 return code == CUBLAS_STATUS_SUCCESS;
}

inline bool cudaAssert(cudaError_t code, const char* file, int line) {
 if (code != cudaSuccess) {
  std::stringstream ss;
  ss << "cudaAssert: " << cudaGetErrorString(code) << " in "
    << std::string(file) << ", line " << line << ".";
  mexErrMsgTxt(ss.str().c_str());
 }

 return code == cudaSuccess;
}

inline bool mexGPUAssert(int code, const char* file, int line) {
 if (code != MX_GPU_SUCCESS) {
  std::stringstream ss;
  ss << "mexGPUAssert: could not initialize the Mathworks GPU API in "
    << std::string(file) << ", line " << line << ".";
  mexErrMsgTxt(ss.str().c_str());
 }

 return code == MX_GPU_SUCCESS;
}

#define cublasErrchk(ans) { cublasAssert((ans), __FILE__, __LINE__); }
#define cudaErrchk(ans) { cudaAssert((ans), __FILE__, __LINE__); }
#define mxGPUErrchk(ans) { mexGPUAssert((ans), __FILE__, __LINE__); }

void mexFunction(int nlhs, mxArray *plhs[], /* Output variables */int nrhs,
  const mxArray *prhs[]) /* Input variables */{
 if (nrhs != 1) { /* end if not one function arguments */
  mexErrMsgTxt("This function requires one input argument.");
  return;
 }

 if (nlhs > 3) { /* take three outputs */
  mexErrMsgTxt("This function takes a maximum of three output variables.");
  return;
 }

 mxGPUErrchk(mxInitGPU());

 const mxGPUArray* in1_gpu = mxGPUCreateFromMxArray(prhs[0]);
 size_t ndims = mxGPUGetNumberOfDimensions(in1_gpu);
 const size_t* dim = (const size_t*) mxGPUGetDimensions(in1_gpu);

 if (ndims != 3) { /* end if input arguments are of different dimensions */
  mexErrMsgTxt("The input argument must be a 3-dimensional array.");
  return;
 }

 cublasHandle_t handle;

 cublasErrchk(cublasCreate(&handle));

 int no_matrices = dim[2];
 int nrow = dim[0];
 int ncol = dim[1];
 int matrix_size = nrow * ncol;
 size_t i;

 std::stringstream ss;
 ss << "dim[2] = " << dim[2] << "
no_matrices = " << no_matrices << "
nrow = " << nrow << "
matrix_size = " << nrow << " x " << ncol << " = " << matrix_size << std::endl;
 mexPrintf(ss.str().c_str());

 mxGPUArray* gpu_array_inout = mxGPUCopyFromMxArray(prhs[0]);
 double* inout_storage = (double*) mxGPUGetData(gpu_array_inout);

 size_t info_dimensions[1] = { no_matrices };
 mxGPUArray* gpu_array_info = mxGPUCreateGPUArray(1, (mwSize*) info_dimensions, mxINT32_CLASS, mxREAL,
   MX_GPU_INITIALIZE_VALUES);
 int* out_info = (int*) mxGPUGetData(gpu_array_info);

 mexPrintf("after defining gpu_array_info
");

 size_t pivot_dimensions[2] = { nrow, no_matrices };
 mxGPUArray* gpu_array_pivot = mxGPUCreateGPUArray(2, (mwSize*) pivot_dimensions, mxINT32_CLASS, mxREAL,
   MX_GPU_DO_NOT_INITIALIZE);
 int* out_pivot = (int*) mxGPUGetData(gpu_array_pivot);

 mexPrintf("after defining gpu_array_pivot
");

 double** inout_pointers_CPU = (double**) malloc(no_matrices * sizeof(double*));
 for (i = 0; i < no_matrices; i++) {
  inout_pointers_CPU[i] = (double*) ((char*) inout_storage + i * ((size_t) matrix_size) * sizeof(double));
 }
 double** inout_pointers_GPU;
 cudaErrchk(cudaMalloc((void** )&inout_pointers_GPU, no_matrices * sizeof(double*)));
 cudaErrchk(
   cudaMemcpy(inout_pointers_GPU, inout_pointers_CPU, no_matrices * sizeof(double*), cudaMemcpyHostToDevice));
 free(inout_pointers_CPU);

 ss.clear();
 ss << "check again before calling cublasDgetrfBatched:
nrow = " << nrow << "
no_matrices = " << no_matrices << std::endl;
 mexPrintf(ss.str().c_str());

 cublasErrchk(cublasDgetrfBatched(handle, nrow, inout_pointers_GPU, nrow, out_pivot, out_info, no_matrices));

 cublasErrchk(cublasDestroy(handle));

 cudaErrchk(cudaFree(inout_pointers_GPU));

 if (mxIsGPUArray(prhs[0])) {
  plhs[0] = mxGPUCreateMxArrayOnGPU(gpu_array_inout);
  if (nlhs > 1) {
   plhs[1] = mxGPUCreateMxArrayOnGPU(gpu_array_pivot);
   if (nlhs > 2) {
    plhs[2] = mxGPUCreateMxArrayOnGPU(gpu_array_info);
   }
  }
 } else {
  plhs[0] = mxGPUCreateMxArrayOnCPU(gpu_array_inout);
  if (nlhs > 1) {
   plhs[1] = mxGPUCreateMxArrayOnCPU(gpu_array_pivot);
   if (nlhs > 2) {
    plhs[2] = mxGPUCreateMxArrayOnCPU(gpu_array_info);
   }
  }
 }

 mxGPUDestroyGPUArray(gpu_array_inout);
 mxGPUDestroyGPUArray(gpu_array_pivot);
 mxGPUDestroyGPUArray(gpu_array_info);
 mxFree((void*) dim);

 return;
}

I compile as follows:

mex -L/usr/local/cuda/lib64 -lcudart -lcublas gpuBatchedLU.cu

And I call from MATLAB:

[a1,b1,c1]=gpuBatchedLU(randn(32,32,5)); %no problem
[a2,b2,c2]=gpuBatchedLU(randn(33,33,5)); %produces CUBLAS_STATUS_INVALID_VALUE

I use Matlab R2013b with the parallel toolbox, Cuda 5.5, and a NVS 5200M graphics chip.

Can anyone replicate this problem? I would appreciate any suggestions on how to solve this problem.

cublas cublasDgetrfBatched() batched LU factorization doesn't work with matrices bigger than 32x32

Answers (1)

Related Questions

cublas cublasDgetrfBatched() batched LU factorization doesn&#39;t work with matrices bigger than 32x32

Answers (1)

Related Questions

cublas cublasDgetrfBatched() batched LU factorization doesn't work with matrices bigger than 32x32