Reputation: 11
I wrote a CUDA MEX function for MATLAB that performs an LU factorization of a batch of matrices using cublasDgetrfBatched(). The toolkit documentation of this function is here.
It works fine for matrices up to size 32x32, but fails with status code CUBLAS_STATUS_INVALID_VALUE for larger matrices. Below is my source code (gpuBatchedLU.cu):
#include "mex.h"
#include "gpu/mxGPUArray.h"
/* Includes, cuda */
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <string>
#include <sstream>
#include <cstdlib> /* malloc, free */
static std::string cublasGetErrorString(cublasStatus_t error) {
    switch (error) {
    case CUBLAS_STATUS_SUCCESS:
        return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:
        return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:
        return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:
        return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:
        return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:
        return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED:
        return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR:
        return "CUBLAS_STATUS_INTERNAL_ERROR";
    }
    return "<unknown>";
}
inline bool cublasAssert(cublasStatus_t code, const char* file, int line) {
    if (code != CUBLAS_STATUS_SUCCESS) {
        std::stringstream ss;
        ss << "cublasAssert: " << cublasGetErrorString(code) << " in "
           << std::string(file) << ", line " << line << ".";
        mexErrMsgTxt(ss.str().c_str());
    }
    return code == CUBLAS_STATUS_SUCCESS;
}
inline bool cudaAssert(cudaError_t code, const char* file, int line) {
    if (code != cudaSuccess) {
        std::stringstream ss;
        ss << "cudaAssert: " << cudaGetErrorString(code) << " in "
           << std::string(file) << ", line " << line << ".";
        mexErrMsgTxt(ss.str().c_str());
    }
    return code == cudaSuccess;
}
inline bool mexGPUAssert(int code, const char* file, int line) {
    if (code != MX_GPU_SUCCESS) {
        std::stringstream ss;
        ss << "mexGPUAssert: could not initialize the Mathworks GPU API in "
           << std::string(file) << ", line " << line << ".";
        mexErrMsgTxt(ss.str().c_str());
    }
    return code == MX_GPU_SUCCESS;
}
#define cublasErrchk(ans) { cublasAssert((ans), __FILE__, __LINE__); }
#define cudaErrchk(ans) { cudaAssert((ans), __FILE__, __LINE__); }
#define mxGPUErrchk(ans) { mexGPUAssert((ans), __FILE__, __LINE__); }
void mexFunction(int nlhs, mxArray* plhs[],       /* output variables */
                 int nrhs, const mxArray* prhs[]) /* input variables */ {
    if (nrhs != 1) { /* require exactly one input argument */
        mexErrMsgTxt("This function requires one input argument.");
        return;
    }
    if (nlhs > 3) { /* at most three output arguments */
        mexErrMsgTxt("This function takes a maximum of three output variables.");
        return;
    }
    mxGPUErrchk(mxInitGPU());
    const mxGPUArray* in1_gpu = mxGPUCreateFromMxArray(prhs[0]);
    size_t ndims = mxGPUGetNumberOfDimensions(in1_gpu);
    const size_t* dim = (const size_t*) mxGPUGetDimensions(in1_gpu);
    if (ndims != 3) { /* the input must be a batch of matrices */
        mexErrMsgTxt("The input argument must be a 3-dimensional array.");
        return;
    }
    cublasHandle_t handle;
    cublasErrchk(cublasCreate(&handle));
    int no_matrices = dim[2];
    int nrow = dim[0];
    int ncol = dim[1];
    int matrix_size = nrow * ncol;
    size_t i;
    std::stringstream ss;
    ss << "dim[2] = " << dim[2] << "\nno_matrices = " << no_matrices << "\nnrow = " << nrow
       << "\nmatrix_size = " << nrow << " x " << ncol << " = " << matrix_size << std::endl;
    mexPrintf(ss.str().c_str());
    mxGPUArray* gpu_array_inout = mxGPUCopyFromMxArray(prhs[0]);
    double* inout_storage = (double*) mxGPUGetData(gpu_array_inout);
    size_t info_dimensions[1] = { (size_t) no_matrices };
    mxGPUArray* gpu_array_info = mxGPUCreateGPUArray(1, (mwSize*) info_dimensions,
            mxINT32_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES);
    int* out_info = (int*) mxGPUGetData(gpu_array_info);
    mexPrintf("after defining gpu_array_info\n");
    size_t pivot_dimensions[2] = { (size_t) nrow, (size_t) no_matrices };
    mxGPUArray* gpu_array_pivot = mxGPUCreateGPUArray(2, (mwSize*) pivot_dimensions,
            mxINT32_CLASS, mxREAL, MX_GPU_DO_NOT_INITIALIZE);
    int* out_pivot = (int*) mxGPUGetData(gpu_array_pivot);
    mexPrintf("after defining gpu_array_pivot\n");
    /* Build the host-side array of device pointers, one per matrix in the batch. */
    double** inout_pointers_CPU = (double**) malloc(no_matrices * sizeof(double*));
    for (i = 0; i < (size_t) no_matrices; i++) {
        inout_pointers_CPU[i] = (double*) ((char*) inout_storage
                + i * ((size_t) matrix_size) * sizeof(double));
    }
    /* Copy the pointer array to the device; cublasDgetrfBatched expects device pointers. */
    double** inout_pointers_GPU;
    cudaErrchk(cudaMalloc((void**) &inout_pointers_GPU, no_matrices * sizeof(double*)));
    cudaErrchk(cudaMemcpy(inout_pointers_GPU, inout_pointers_CPU,
            no_matrices * sizeof(double*), cudaMemcpyHostToDevice));
    free(inout_pointers_CPU);
    ss.str(""); /* reset the buffer; clear() alone only resets the error flags */
    ss.clear();
    ss << "check again before calling cublasDgetrfBatched:\nnrow = " << nrow
       << "\nno_matrices = " << no_matrices << std::endl;
    mexPrintf(ss.str().c_str());
    cublasErrchk(cublasDgetrfBatched(handle, nrow, inout_pointers_GPU, nrow,
            out_pivot, out_info, no_matrices));
    cublasErrchk(cublasDestroy(handle));
    cudaErrchk(cudaFree(inout_pointers_GPU));
    if (mxIsGPUArray(prhs[0])) {
        plhs[0] = mxGPUCreateMxArrayOnGPU(gpu_array_inout);
        if (nlhs > 1) {
            plhs[1] = mxGPUCreateMxArrayOnGPU(gpu_array_pivot);
            if (nlhs > 2) {
                plhs[2] = mxGPUCreateMxArrayOnGPU(gpu_array_info);
            }
        }
    } else {
        plhs[0] = mxGPUCreateMxArrayOnCPU(gpu_array_inout);
        if (nlhs > 1) {
            plhs[1] = mxGPUCreateMxArrayOnCPU(gpu_array_pivot);
            if (nlhs > 2) {
                plhs[2] = mxGPUCreateMxArrayOnCPU(gpu_array_info);
            }
        }
    }
    mxGPUDestroyGPUArray(gpu_array_inout);
    mxGPUDestroyGPUArray(gpu_array_pivot);
    mxGPUDestroyGPUArray(gpu_array_info);
    mxFree((void*) dim);
    return;
}
I compile as follows:
mex -L/usr/local/cuda/lib64 -lcudart -lcublas gpuBatchedLU.cu
And I call it from MATLAB:
[a1,b1,c1]=gpuBatchedLU(randn(32,32,5)); %no problem
[a2,b2,c2]=gpuBatchedLU(randn(33,33,5)); %produces CUBLAS_STATUS_INVALID_VALUE
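For what it's worth, in the working 32x32 case I sanity-check one slice of the output against the P*L*U convention with a rough sketch like the one below (it assumes the pivot output holds LAPACK-style, 1-based row-swap indices; the variable names are just placeholders):
n = 32; k = 1;                              % check the first matrix of the batch
A = randn(n, n, 5);
[LU, piv, info] = gpuBatchedLU(A);
L = tril(double(LU(:,:,k)), -1) + eye(n);   % unit lower-triangular factor
U = triu(double(LU(:,:,k)));                % upper-triangular factor
p = 1:n;                                    % expand the row swaps into a permutation
for i = 1:n
    t = p(i); p(i) = p(piv(i,k)); p(piv(i,k)) = t;
end
disp(norm(A(p,:,k) - L*U) / norm(A(:,:,k))) % should be near machine precision
disp(info(k))                               % 0 means the factorization succeeded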
I use MATLAB R2013b with the Parallel Computing Toolbox, CUDA 5.5, and an NVS 5200M graphics chip.
Can anyone replicate this problem? I would appreciate any suggestions on how to solve it.
Upvotes: 0
Views: 919
Reputation: 11
The problem seems to be that MATLAB R2013b uses libcublas.so in version 5.0; the library link is located in /MATLAB/R2013b/bin/glnxa64/. Once I changed the link to point to the libcublas.so of my CUDA 5.5 installation, it worked fine.
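In case it helps, this is roughly how the mismatch can be spotted before changing anything. The commands are meant to be run from the MATLAB prompt on Linux; the paths come from the question, but the exact library file names are an assumption, so list the directories first and adjust accordingly:
% Library shipped with MATLAB (it was version 5.0 here):
!ls -l /MATLAB/R2013b/bin/glnxa64/libcublas.so*
% Library from the CUDA 5.5 toolkit:
!ls -l /usr/local/cuda/lib64/libcublas.so*
% Which copy the compiled MEX file resolves to (run in the folder containing it):
!ldd gpuBatchedLU.mexa64 | grep -i cublas
% If MATLAB's bundled copy is the 5.0 one, back it up and repoint the link to the
% CUDA 5.5 library from a shell with write access to the MATLAB installation, e.g.
% ln -sf /usr/local/cuda/lib64/libcublas.so /MATLAB/R2013b/bin/glnxa64/libcublas.so.5.0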
Upvotes: 1