sa112

Reputation: 65

Maximum value of batchsize allowed for cublasDgetrfBatched() from CUBLAS Library

Is there any maximum batchsize limitation for cublasDgetrfBatched() from the cuBLAS library? I am running a benchmark problem comparing timings between the CPU and the GPU. For a batchsize of 1000 the GPU timing I measure is greater than the CPU timing, but for a batchsize of 100 I get some speedup over the CPU.

I have posted below the code that I used for benchmarking.

1. main.cpp

/*main.cpp goes below*/
#include <stdio.h>
#include <stdlib.h>   /* malloc/calloc/free, rand, srand, system */
#include <time.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include "mathlib_blas.h"

int main(){

double **mat;
double **mat_scratch1;
int *ipvt;
double *fVec;
double *fVecSave;
double *fVec_scratch;
double *A;                      
double *B;
double **devPtrA;
double **devPtrB;
double **devPtrA_dev;
double **devPtrB_dev;
double *d_x;
double *x;
int *d_pivot_array ;


int *d_info_array;
int *h_info_array;
int batchsize;
int neqn;

cublasHandle_t handle;
cublasStatus_t status;
cudaError_t error;

clock_t start, end, start1, end1;

double rcond;
batchsize = 32;
neqn = 172;

mat = (double**) ArrayAlloc2d((size_t) neqn, (size_t) neqn, sizeof(double)); 
mat_scratch1 = (double**) ArrayAlloc2d((size_t) neqn, (size_t) neqn, sizeof(double));


ipvt = (int*) calloc((size_t) neqn, sizeof(int));
fVec = (double*) calloc((size_t) neqn, sizeof(double));
fVecSave = (double*) calloc((size_t) neqn, sizeof(double));
fVec_scratch = (double*) calloc((size_t) neqn, sizeof(double));

A = (double*)malloc( neqn*neqn*sizeof(A[0])); 
B = (double*)malloc( neqn*neqn*sizeof(B[0])); 

devPtrA = (double**)malloc(batchsize*sizeof(*devPtrA));
devPtrB = (double**)malloc(batchsize*sizeof(*devPtrB));

for(int b_count = 0; b_count < batchsize; b_count++){
    cudaMalloc((void **)&devPtrA[b_count], neqn*neqn * sizeof(devPtrA[0][0]));
    cudaMalloc((void **)&devPtrB[b_count], batchsize*neqn * sizeof(devPtrB[0][0]));
}

cudaMalloc((void **)&devPtrA_dev,  batchsize*sizeof(*devPtrA));
cudaMalloc((void **)&devPtrB_dev,  batchsize*sizeof(*devPtrB));


cudaMemcpy(devPtrA_dev, devPtrA, batchsize*sizeof(*devPtrA), cudaMemcpyHostToDevice);
cudaMemcpy(devPtrB_dev, devPtrB, batchsize*sizeof(*devPtrB), cudaMemcpyHostToDevice);


cudaMalloc((void **)&d_x, neqn*sizeof(double));
x =(double *)malloc(neqn*sizeof(double));

cudaMalloc((void **)&d_pivot_array, batchsize*neqn*sizeof(int));

cudaMalloc((void **)&d_info_array, batchsize*sizeof(int));

h_info_array =(int*)malloc(batchsize*sizeof(int));

cublasCreate(&handle);

srand(time(NULL));



/* Fill in the CPU and GPU Matrix */
for (int iRow = 0; iRow < neqn; iRow++) {
    double sumCol = 0;
    for (int iColumn = 0; iColumn < neqn; iColumn++) {
        for(int b_count =0; b_count<batchsize; b_count++){
            A[neqn*iColumn + iRow] = rand()%10 ;
            mat[iRow][iColumn] = A[neqn*iColumn + iRow];
        }
        sumCol +=A[neqn*iColumn + iRow];
    }
    fVec[iRow] = sumCol;
    fVecSave[iRow] = sumCol;
}

/*CPU_CODE GOES HERE */

start = clock();
for(int b_count = 0; b_count < batchsize; b_count++){

    for (int iRow = 0; iRow < neqn; iRow++) {
        for (int iColumn = 0; iColumn < neqn; iColumn++) {
            mat_scratch1[iColumn][iRow] = mat[iColumn][iRow];
        }
    }

    dgeco_blas(mat_scratch1, neqn, ipvt, &rcond, fVecSave);
}

for (int iRow = 0; iRow < neqn; iRow++) {
    for (int iColumn = 0; iColumn < neqn; iColumn++) {
        mat[iColumn][iRow] = mat_scratch1[iColumn][iRow];
    }
}

for(int b_count = 0; b_count < batchsize; b_count++){
    for(int i = 0; i < neqn; i++) fVec_scratch[i] = fVec[i];
    dgesl_blas(mat, neqn, ipvt, fVec_scratch, 0);
}

end = clock();
float seconds = (float)(end - start) / CLOCKS_PER_SEC;

printf("Time in seconds(CPU) : %lf \n", seconds);
/*CPU_CODE ENDS HERE */


start1 = clock();

for(int b_count =0; b_count<batchsize; b_count++){ 
    status = cublasSetMatrix(neqn, neqn, sizeof(A[0]), A, neqn, devPtrA[b_count], neqn);
}

status = cublasDgetrfBatched(handle, neqn, (double**)devPtrA_dev, neqn, d_pivot_array, d_info_array, batchsize);
if (status != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error in dgetrf %i\n",status);

cudaMemcpy(h_info_array, d_info_array, batchsize*sizeof(int), cudaMemcpyDeviceToHost);


for(int b_count = 0; b_count < batchsize; b_count++){
    cudaMemcpy(devPtrB[b_count], fVec, neqn*sizeof(double), cudaMemcpyHostToDevice);     /* for testing purpose only */
}

status = cublasDgetrsBatched(handle, CUBLAS_OP_N, neqn, batchsize, (const double**)devPtrA_dev, 
                     neqn, d_pivot_array,devPtrB_dev, neqn, h_info_array, batchsize);                                                                   

for(int b_count =0; b_count<batchsize; b_count++){ 
    cudaMemcpy( fVec,devPtrB[b_count], neqn*sizeof(double),cudaMemcpyDeviceToHost);     /* for testing purpose only */
}


end1 = clock();
float seconds1 = (float)(end1 - start1) / CLOCKS_PER_SEC;

printf("Time in seconds(GPU) : %lf \n", seconds1);

printf("Speedup(CPU/GPU) : %lf \n", seconds/seconds1);

system("pause");
/* End of the main portion of the code */

free(mat);
free(mat_scratch1);
free(ipvt);
free(fVec);
free(fVecSave);
free(fVec_scratch);
free(A);
free(B);
cudaFree(devPtrA[0]);
cudaFree(devPtrB[0]);
cudaFree(devPtrA_dev);
cudaFree(devPtrB_dev);
free(devPtrA);
free(devPtrB);
cudaFree(d_x);
free(x);
cudaFree(d_pivot_array);
cudaFree(d_info_array);
free(h_info_array);
cublasDestroy_v2(handle);


}

2. mathlib_blas.h

#include <stdio.h>
#include <stdlib.h>   /* calloc, free */
#include <math.h>

#define maxm(a,b)   (((a) > (b)) ? (a) : (b))
#define minm(a,b)   (((a) < (b)) ? (a) : (b))
#define signum(a,b)   (((b) < (0)) ? (-a) : (a)) 

void **ArrayAlloc2d( const int size1, const int size2, const size_t   sizeType);
void dgefa_blas(double **a,int n, int ipvt[],int *info);
void dgesl_blas(double **a,int n,int ipvt[],double b[],int job);
void dgeco_blas(double **a,int n, int *ipvt, double *rcond,double *z);



void **ArrayAlloc2d( const int size1, const int size2, const size_t sizeType )
 {
  void** array = nullptr;

  array = (void**)calloc(size1, sizeof(void*));

  if (array != nullptr) {
     if (size2 > 0) {
        void* data = calloc(size1*size2, sizeType);

        if (data != nullptr) {
           char* addr = (char*)data;

           for (int index1 = 0; index1 < size1; index1++) {
              array[index1] = (void*)addr;
              addr += sizeType*size2; /* char is always 1 byte */
           }
        } else {

           free(array);
           free(data);
           array = nullptr;


        }
     }
   } else {

   }


    return array;
 }

 void dgeco_blas(double **a,int n, int *ipvt, double *rcond,double *z)
 {
 double anorm,ek,s,sm,t,vecdot,vecsum,wk,wkm,ynorm;
 int i,info,j,k,kb,kp1,l;

 /* Compute 1-norm of a */
 anorm = 0.0;
 for (j = 0; j < n; j++) {
  vecsum = 0.0;
  for (i = 0;i < n; i++)
     vecsum += fabs(a[i][j]);
  anorm = maxm(anorm,vecsum);
 }

 /* Factor. */
 dgefa_blas(a,n,ipvt,&info);   

 /* rcond = 1/(norm(a) * (estimate of norm(inverse(a)))).
 * estimate = norm(z)/norm(y), where a*z=y and trans(a)*y=e.
 * trans(a) is the transpose of a. The components of e are
 * chosen to cause maximum local growth in the elements of
 * w, where trans(u)*w=e. The vectors are frequently rescaled
 * to avoid overflow.
 */
 ek = 1.0;
 for (j = 0; j < n; j++)
    z[j] = 0.0;

 for (k = 0; k < n; k++) {
  if (z[k] != 0.0)
     ek = signum(ek,-z[k]);
  if (fabs(ek-z[k]) > fabs(a[k][k])) {
     s = fabs(a[k][k])/fabs(ek-z[k]);
     /* dscal(n,s,z,1) */
     for (i = 0; i < n; i++)
        z[i] *= s; 
     ek *= s;
  }
  wk = ek - z[k];
  wkm = -ek - z[k];
  s = fabs(wk);
  sm = fabs(wkm);
  if (a[k][k] != 0.0) {
     wk /= a[k][k];
     wkm /= a[k][k];
  }
  else {
     wk = 1.0;
     wkm = 1.0;
  }
  kp1 = k + 1;
  if (kp1 < n) {
     for (j = kp1; j < n; j++) {
        sm += (fabs(z[j] + wkm * a[k][j]));
        z[j] += (wk * a[k][j]);
        s += fabs(z[j]);
     }
     if (s < sm) {
        t = wkm -wk;
        wk = wkm;
        for (j = kp1; j < n; j++)
           z[j] += (t * a[k][j]);
     }
  }
  z[k] = wk;
  }

   /* dasum(n,s,z,1) */
 vecsum = 0.0;

 for (i = 0;i < n; i++)
   vecsum += fabs(z[i]);
 s = 1.0/vecsum;

 /* dscal(n,s,z) */
 for (i = 0; i < n; i++)
  z[i] *= s;

 /* Solve trans(l)*y= w
 */
 for (kb = 0; kb < n; kb++) {
   k = n - kb - 1;
  if (k < (n-1)) {
     /* sdot(n-k,a(k+1,k),1,z(k+1),1) */
     vecdot = 0.0;
     for (i = k+1;i < n; i++)
        vecdot += (a[i][k] * z[i]);
     z[k] += vecdot;
    }
    if (fabs(z[k]) > 1.0) {
     s = 1.0/fabs(z[k]);
     /* dscal(n,s,z) */
     for (i = 0; i < n; i++)
        z[i] *= s;
    }
    l = ipvt[k];
    t = z[l];
    z[l] = z[k];
    z[k] = t;
  } /* endfor kb */
  /* dasum(n,z,1) */
  vecsum = 0.0;
  for (i = 0; i < n; i++)
     vecsum += fabs(z[i]);
  s = 1.0/vecsum;
  /* dscal(n,s,z) */
  for (i = 0; i < n; i++)
     z[i] *= s;

  ynorm = 1.0; 
  /*
  * Solve l * v = y
  */
  for (k = 0; k < n; k++) {
  l = ipvt[k];
  t = z[l];
  z[l] = z[k];
  z[k] = t;
  if (k < (n-1)) {
     /* daxpy(n-k,t,a[k+1][k],1,z[k+1],1) */
     for (i = k+1;i < n; i++)
        z[i] += (t * a[i][k]);
  }
  if (fabs(z[k]) > 1.0) {
     s = 1.0/fabs(z[k]);
     /* dscal(n,s,z,1) */
     for (i = 0; i < n; i++)
        z[i] *= s;
     ynorm *= s;
   }
  }
  /* dasum(n,z,1) */
  vecsum = 0.0;
  for (i = 0; i < n; i++)
     vecsum += fabs(z[i]);
  s = 1.0/vecsum;
  /* dscal(n,s,z,1) */
  for (i = 0; i < n; i++)
     z[i] *= s;
  ynorm *= s;

  /* Solve u * z = v */
  for (kb = 0; kb < n; kb++) {
  k = n - kb - 1;
  if (fabs(z[k]) > fabs(a[k][k])) {
     s = fabs(a[k][k])/fabs(z[k]);
     /* dscal(n,s,z,1) */
     for (i = 0; i < n; i++)
        z[i] *= s;
     ynorm *= s;
   }
   if (a[k][k] != 0.0)
     z[k] /= a[k][k];
   if (a[k][k] == 0.0)
     z[k] = 1.0;
   t = -z[k];
   /* daxpy(k-1,t,a[1][k],1,z[1],1) */
   for (i = 0; i < k; i++)
     z[i] += (t * a[i][k]);
  }
  /* Make znorm = 1.0 */
  /* dasum(n,z,1) */
  vecsum = 0.0;
  for (i = 0; i < n; i++)
     vecsum += fabs(z[i]);
  s = 1.0/vecsum;

  /* dscal(n,s,z,1) */
  for (i = 0; i < n; i++)
     z[i] *= s;
  ynorm *= s;

  if (anorm != 0.0) *rcond = ynorm/anorm;
  if (anorm == 0.0) *rcond = 0.0;
  }



  void dgefa_blas(double **a,int n, int ipvt[],int *info)
  {
  double dmax,t;
  int i,j,k,kp1,l,nm1;

 *info = 0;
 nm1 = n - 1;
 if (n > 0) {
  for (k = 0; k < nm1; k++) {
     kp1 = k + 1;
     /* Find l = pivot index. */
     dmax = fabs(a[k][k]);
     l = k;
     for (i = k+1; i < n; i++) {
        if (fabs(a[i][k]) <= dmax) continue;
        dmax = fabs(a[i][k]);   /* track the current largest pivot candidate */
        l = i;
     }
     ipvt[k] = l;

     /* Zero pivot implies this column already triangularized. */
     if (a[l][k] == 0.0) {
        *info = k;
        continue;
     }

     /* Interchange if necessary. */
     if (l != k) {
        t = a[l][k];
        a[l][k] = a[k][k];
        a[k][k] = t;
     }

     /* Compute multipliers. */
     if (a[k][k] == 0.0) printf("\n!ERROR. Singular matrix.\n");
     t = -1.0/a[k][k];
     for (i = k+1; i < n; i++)
        a[i][k] *= t;

     /* Row elimination with column indexing. */
     for (j = kp1; j < n; j++) {
        t = a[l][j];
        if (l != k) {
           a[l][j] = a[k][j];
           a[k][j] = t;
        }
        for (i = k+1; i < n; i++ )
           a[i][j] += (t * a[i][k]);
     }
    }
   } 
   ipvt[n-1] = n-1;
   if (a[n-1][n-1] == 0.0) *info = n-1;
}



 void dgesl_blas(double **a,int n,int ipvt[],double b[],int job)
{
 double t;
 int i,k,kb,l,nm1;

 nm1 = n - 1;
 if (job == 0) {

  /* job = 0, solve a * x = b.
  * First solve l * y = b.
  */
  if (n > 0) {
     for (k = 0; k < nm1; k++) {
        l = ipvt[k];
        t = b[l];
        if (l != k) {
           b[l] = b[k];
           b[k] = t;
        }
        /* saxpy(n-k,t,a(k+1,k),1,b(k+1),1); */
        for (i=k+1;i < n;i++)
           b[i] += (t * a[i][k]); 
     }
   } 

    /* Now solve u * x = y. */
    for (kb = 0; kb < n; kb++) {
     k = n - kb-1;
     b[k] /= a[k][k];
     t = -b[k];
     /*      saxpy(k-1,t,a(1,k),1,b(1),1); */
     for (i = 0; i < k ; i++)
        b[i] += (t * a[i][k]);
    }
    return;
  }
  /* job != 0, solve trans(a) * x = b.
 * First solve trans(u) * x = y.
 */
 for (k = 0; k < n; k++) {
  /*  t = ddot(k-1,a(1,k),1,b(1),1); */
  t = 0;
  for (i = 0; i < k; i++)
     t += (a[i][k] * b[i]);
  b[k] = (b[k] - t)/a[k][k];
 }

 /* Now solve trans(l) * x = y. */

 if (n > 0) {
  for (kb = 0; kb < nm1; kb++) {
     k = n - 2 - kb;
     /* b[k] = b[k] + ddot(n-k,a(k+1,k),1,b(k+1),1); */
     t = 0;

     for (i = k+1;i < n; i++)
        t += (a[i][k] * b[i]);
     b[k] += t;
     l = ipvt[k];

     if (l != k) {
        t = b[l];
        b[l] = b[k];
        b[k] = t;
     }
    } 
  }
}

Upvotes: 1

Views: 677

Answers (1)

Robert Crovella

Reputation: 151899

There should not be any behavioral differences between a batch size of 100 and a batch size of 1000. (Certainly there would be a performance difference - the batch size of 1000 should probably take longer.)
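If the goal is to compare just the factorization cost at different batch sizes, a minimal timing sketch (reusing the handle, devPtrA_dev, d_pivot_array, d_info_array, neqn, batchsize and status variables from the question's code, and assuming they are already set up) could bracket only the cublasDgetrfBatched() call with CUDA events, so the cublasSetMatrix/cudaMemcpy transfers are excluded from the measured interval:

/* Sketch: time only the batched LU factorization with CUDA events.
   handle, devPtrA_dev, d_pivot_array, d_info_array, neqn, batchsize and
   status are assumed to be set up as in the question's code. */
cudaEvent_t evStart, evStop;
cudaEventCreate(&evStart);
cudaEventCreate(&evStop);

cudaEventRecord(evStart, 0);
status = cublasDgetrfBatched(handle, neqn, devPtrA_dev, neqn,
                             d_pivot_array, d_info_array, batchsize);
cudaEventRecord(evStop, 0);
cudaEventSynchronize(evStop);   /* wait until the batched kernels have finished */

float msec = 0.0f;
cudaEventElapsedTime(&msec, evStart, evStop);
if (status != CUBLAS_STATUS_SUCCESS) fprintf(stderr, "dgetrf failed: %d\n", status);
printf("cublasDgetrfBatched, batchsize = %d : %f ms\n", batchsize, msec);

cudaEventDestroy(evStart);
cudaEventDestroy(evStop);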

There are no published limits to the batch size, other than implicit memory limits. In fact, unless the GPU is returning incorrect results, there is no reason to think that you've run into any hard limit anyway.
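If there were any doubt, a quick check (again just a sketch built on the question's own variable names) is to copy the per-matrix info values written by cublasDgetrfBatched() back to the host and inspect them; a non-zero entry indicates a zero pivot in that matrix's factorization, while a memory limit would already show up as a failed cudaMalloc or a non-success cuBLAS status code:

/* Sketch: verify every matrix in the batch factored successfully.
   Variable names follow the question's code. */
cudaMemcpy(h_info_array, d_info_array, batchsize*sizeof(int), cudaMemcpyDeviceToHost);

for (int b_count = 0; b_count < batchsize; b_count++) {
    if (h_info_array[b_count] != 0)
        fprintf(stderr, "matrix %d: info = %d (zero pivot, factorization failed)\n",
                b_count, h_info_array[b_count]);
}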

(If you wanted to explore some behavioral or performance issue, this question is not properly written to address that.)

Upvotes: 3
