sa112

Reputation: 65

Maximum value of batchsize allowed for cublasDgetrfBatched() from CUBLAS Library

Is there any maximum batchsize limitation for cublasDgetrfBatched() from the cuBLAS library? I am running a benchmark problem comparing timings between the CPU and the GPU. For a batchsize of 1000 the GPU timing I measure is greater than the CPU timing, but for a batchsize of 100 I get some speedup over the CPU.

I have posted below the code that I used for benchmarking.

1. main.cpp

/*main.cpp goes below*/
#include <stdio.h>
#include <stdlib.h>   /* malloc/calloc/free, rand, srand, system */
#include <time.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include "mathlib_blas.h"

int main(){

double **mat;
double **mat_scratch1;
int *ipvt;
double *fVec;
double *fVecSave;
double *fVec_scratch;
double *A;                      
double *B;
double **devPtrA;
double **devPtrB;
double **devPtrA_dev;
double **devPtrB_dev;
double *d_x;
double *x;
int *d_pivot_array ;


int *d_info_array;
int *h_info_array;
int batchsize;
int neqn;

cublasHandle_t handle;
cublasStatus_t status;
cudaError_t error;

clock_t start, end, start1, end1;

double rcond;
batchsize = 32;
neqn = 172;

mat = (double**) ArrayAlloc2d((size_t) neqn, (size_t) neqn, sizeof(double)); 
mat_scratch1 = (double**) ArrayAlloc2d((size_t) neqn, (size_t) neqn, sizeof(double));


ipvt = (int*) calloc((size_t) neqn, sizeof(int));
fVec = (double*) calloc((size_t) neqn, sizeof(double));
fVecSave = (double*) calloc((size_t) neqn, sizeof(double));
fVec_scratch = (double*) calloc((size_t) neqn, sizeof(double));

A = (double*)malloc( neqn*neqn*sizeof(A[0])); 
B = (double*)malloc( neqn*neqn*sizeof(B[0])); 

devPtrA = (double**)malloc(batchsize*sizeof(*devPtrA));
devPtrB = (double**)malloc(batchsize*sizeof(*devPtrB));

for(int b_count = 0; b_count < batchsize; b_count++){
    cudaMalloc((void **)&devPtrA[b_count], neqn*neqn * sizeof(devPtrA[0][0]));
    cudaMalloc((void **)&devPtrB[b_count], batchsize*neqn * sizeof(devPtrB[0][0]));
}

cudaMalloc((void **)&devPtrA_dev,  batchsize*sizeof(*devPtrA));
cudaMalloc((void **)&devPtrB_dev,  batchsize*sizeof(*devPtrB));


cudaMemcpy(devPtrA_dev, devPtrA, batchsize*sizeof(*devPtrA), cudaMemcpyHostToDevice);
cudaMemcpy(devPtrB_dev, devPtrB, batchsize*sizeof(*devPtrB), cudaMemcpyHostToDevice);


cudaMalloc((void **)&d_x, neqn*sizeof(double));
x =(double *)malloc(neqn*sizeof(double));

cudaMalloc((void **)&d_pivot_array, batchsize*neqn*sizeof(int));

cudaMalloc((void **)&d_info_array, batchsize*sizeof(int));

h_info_array =(int*)malloc(batchsize*sizeof(int));

cublasCreate(&handle);

srand(time(NULL));



/* Fill in the CPU and GPU Matrix */
for (int iRow = 0; iRow < neqn; iRow++) {
    double sumCol = 0;
    for (int iColumn = 0; iColumn < neqn; iColumn++) {
        for(int b_count =0; b_count<batchsize; b_count++){
            A[neqn*iColumn + iRow] = rand()%10 ;
            mat[iRow][iColumn] = A[neqn*iColumn + iRow];
        }
        sumCol +=A[neqn*iColumn + iRow];
    }
    fVec[iRow] = sumCol;
    fVecSave[iRow] = sumCol;
}

/*CPU_CODE GOES HERE */

start = clock();
for(int b_count = 0; b_count < batchsize; b_count++){

    for (int iRow = 0; iRow < neqn; iRow++) {
        for (int iColumn = 0; iColumn < neqn; iColumn++) {
            mat_scratch1[iColumn][iRow] = mat[iColumn][iRow];
        }
    }

    dgeco_blas(mat_scratch1, neqn, ipvt, &rcond, fVecSave);
}

for (int iRow = 0; iRow < neqn; iRow++) {
    for (int iColumn = 0; iColumn < neqn; iColumn++) {
        mat[iColumn][iRow] = mat_scratch1[iColumn][iRow];
    }
}

for(int b_count = 0; b_count < batchsize; b_count++){
    for(int i = 0; i < neqn; i++) fVec_scratch[i] = fVec[i];
    dgesl_blas(mat, neqn, ipvt, fVec_scratch, 0);
}

end = clock();
float seconds = (float)(end - start) / CLOCKS_PER_SEC;

printf("Time in seconds(CPU) : %lf \n", seconds);
/*CPU_CODE ENDS HERE */


start1 = clock();

for(int b_count =0; b_count<batchsize; b_count++){ 
    status = cublasSetMatrix(neqn, neqn, sizeof(A[0]), A, neqn, devPtrA[b_count], neqn);
}

status = cublasDgetrfBatched(handle, neqn, (double**)devPtrA_dev, neqn, d_pivot_array, d_info_array, batchsize);
if (status != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error in dgetrf %i\n",status);

cudaMemcpy(h_info_array, d_info_array, batchsize*sizeof(int), cudaMemcpyDeviceToHost);


for(int b_count = 0; b_count < batchsize; b_count++){
    cudaMemcpy(devPtrB[b_count], fVec, neqn*sizeof(double), cudaMemcpyHostToDevice);     /* for testing purpose only */
}

status = cublasDgetrsBatched(handle, CUBLAS_OP_N, neqn, batchsize, (const double**)devPtrA_dev, 
                     neqn, d_pivot_array,devPtrB_dev, neqn, h_info_array, batchsize);                                                                   

for(int b_count =0; b_count<batchsize; b_count++){ 
    cudaMemcpy( fVec,devPtrB[b_count], neqn*sizeof(double),cudaMemcpyDeviceToHost);     /* for testing purpose only */
}


end1 = clock();
float seconds1 = (float)(end1 - start1) / CLOCKS_PER_SEC;

printf("Time in seconds(GPU) : %lf \n", seconds1);

printf("Speedup(CPU/GPU) : %lf \n", seconds/seconds1);

system("pause");
/* End of the main portion of the code */

free(mat);
free(mat_scratch1);
free(ipvt);
free(fVec);
free(fVecSave);
free(fVec_scratch);
free(A);
free(B);
cudaFree(devPtrA[0]);
cudaFree(devPtrB[0]);
cudaFree(devPtrA_dev);
cudaFree(devPtrB_dev);
free(devPtrA);
free(devPtrB);
cudaFree(d_x);
free(x);
cudaFree(d_pivot_array);
cudaFree(d_info_array);
free(h_info_array);
cublasDestroy_v2(handle);


}

2. mathlib_blas.h

#include <stdio.h>
#include <stdlib.h>   /* calloc, free */
#include <math.h>

#define maxm(a,b)   (((a) > (b)) ? (a) : (b))
#define minm(a,b)   (((a) < (b)) ? (a) : (b))
#define signum(a,b)   (((b) < (0)) ? (-a) : (a)) 

void **ArrayAlloc2d( const int size1, const int size2, const size_t   sizeType);
void dgefa_blas(double **a,int n, int ipvt[],int *info);
void dgesl_blas(double **a,int n,int ipvt[],double b[],int job);
void dgeco_blas(double **a,int n, int *ipvt, double *rcond,double *z);



void **ArrayAlloc2d( const int size1, const int size2, const size_t sizeType )
 {
  void** array = nullptr;

  array = (void**)calloc(size1, sizeof(void*));

  if (array != nullptr) {
     if (size2 > 0) {
        void* data = calloc(size1*size2, sizeType);

        if (data != nullptr) {
           char* addr = (char*)data;

           for (int index1 = 0; index1 < size1; index1++) {
              array[index1] = (void*)addr;
              addr += sizeType*size2; /* char is always 1 byte */
           }
        } else {

           free(array);
           free(data);
           array = nullptr;


        }
     }
   } else {

   }


    return array;
 }

 void dgeco_blas(double **a,int n, int *ipvt, double *rcond,double *z)
 {
 double anorm,ek,s,sm,t,vecdot,vecsum,wk,wkm,ynorm;
 int i,info,j,k,kb,kp1,l;

 /* Compute 1-norm of a */
 anorm = 0.0;
 for (j = 0; j < n; j++) {
  vecsum = 0.0;
  for (i = 0;i < n; i++)
     vecsum += fabs(a[i][j]);
  anorm = maxm(anorm,vecsum);
 }

 /* Factor. */
 dgefa_blas(a,n,ipvt,&info);   

 /* rcond = 1/(norm(a) * (estimate of norm(inverse(a)))).
 * estimate = norm(z)/norm(y), where a*z=y and trans(a)*y=e.
 * trans(a) is the transpose of a. The components of e are
 * chosen to cause maximum local growth in the elements of
 * w, where trans(u)*w=e. The vectors are frequently rescaled
 * to avoid overflow.
 */
 ek = 1.0;
 for (j = 0; j < n; j++)
    z[j] = 0.0;

 for (k = 0; k < n; k++) {
  if (z[k] != 0.0)
     ek = signum(ek,-z[k]);
  if (fabs(ek-z[k]) > fabs(a[k][k])) {
     s = fabs(a[k][k])/fabs(ek-z[k]);
     /* dscal(n,s,z,1) */
     for (i = 0; i < n; i++)
        z[i] *= s; 
     ek *= s;
  }
  wk = ek - z[k];
  wkm = -ek - z[k];
  s = fabs(wk);
  sm = fabs(wkm);
  if (a[k][k] != 0.0) {
     wk /= a[k][k];
     wkm /= a[k][k];
  }
  else {
     wk = 1.0;
     wkm = 1.0;
  }
  kp1 = k + 1;
  if (kp1 < n) {
     for (j = kp1; j < n; j++) {
        sm += (fabs(z[j] + wkm * a[k][j]));
        z[j] += (wk * a[k][j]);
        s += fabs(z[j]);
     }
     if (s < sm) {
        t = wkm -wk;
        wk = wkm;
        for (j = kp1; j < n; j++)
           z[j] += (t * a[k][j]);
     }
  }
  z[k] = wk;
  }

   /* dasum(n,s,z,1) */
 vecsum = 0.0;

 for (i = 0;i < n; i++)
   vecsum += fabs(z[i]);
 s = 1.0/vecsum;

 /* dscal(n,s,z) */
 for (i = 0; i < n; i++)
  z[i] *= s;

 /* Solve trans(l)*y= w
 */
 for (kb = 0; kb < n; kb++) {
   k = n - kb - 1;
  if (k < (n-1)) {
     /* sdot(n-k,a(k+1,k),1,z(k+1),1) */
     vecdot = 0.0;
     for (i = k+1;i < n; i++)
        vecdot += (a[i][k] * z[i]);
     z[k] += vecdot;
    }
    if (fabs(z[k]) > 1.0) {
     s = 1.0/fabs(z[k]);
     /* dscal(n,s,z) */
     for (i = 0; i < n; i++)
        z[i] *= s;
    }
    l = ipvt[k];
    t = z[l];
    z[l] = z[k];
    z[k] = t;
  } /* endfor kb */
  /* dasum(n,z,1) */
  vecsum = 0.0;
  for (i = 0; i < n; i++)
     vecsum += fabs(z[i]);
  s = 1.0/vecsum;
  /* dscal(n,s,z) */
  for (i = 0; i < n; i++)
     z[i] *= s;

  ynorm = 1.0; 
  /*
  * Solve l * v = y
  */
  for (k = 0; k < n; k++) {
  l = ipvt[k];
  t = z[l];
  z[l] = z[k];
  z[k] = t;
  if (k < (n-1)) {
     /* daxpy(n-k,t,a[k+1][k],1,z[k+1],1) */
     for (i = k+1;i < n; i++)
        z[i] += (t * a[i][k]);
  }
  if (fabs(z[k]) > 1.0) {
     s = 1.0/fabs(z[k]);
     /* dscal(n,s,z,1) */
     for (i = 0; i < n; i++)
        z[i] *= s;
     ynorm *= s;
   }
  }
  /* dasum(n,z,1) */
  vecsum = 0.0;
  for (i = 0; i < n; i++)
     vecsum += fabs(z[i]);
  s = 1.0/vecsum;
  /* dscal(n,s,z,1) */
  for (i = 0; i < n; i++)
     z[i] *= s;
  ynorm *= s;

  /* Solve u * z = v */
  for (kb = 0; kb < n; kb++) {
  k = n - kb - 1;
  if (fabs(z[k]) > fabs(a[k][k])) {
     s = fabs(a[k][k])/fabs(z[k]);
     /* dscal(n,s,z,1) */
     for (i = 0; i < n; i++)
        z[i] *= s;
     ynorm *= s;
   }
   if (a[k][k] != 0.0)
     z[k] /= a[k][k];
   if (a[k][k] == 0.0)
     z[k] = 1.0;
   t = -z[k];
   /* daxpy(k-1,t,a[1][k],1,z[1],1) */
   for (i = 0; i < k; i++)
     z[i] += (t * a[i][k]);
  }
  /* Make znorm = 1.0 */
  /* dasum(n,z,1) */
  vecsum = 0.0;
  for (i = 0; i < n; i++)
     vecsum += fabs(z[i]);
  s = 1.0/vecsum;

  /* dscal(n,s,z,1) */
  for (i = 0; i < n; i++)
     z[i] *= s;
  ynorm *= s;

  if (anorm != 0.0) *rcond = ynorm/anorm;
  if (anorm == 0.0) *rcond = 0.0;
  }



  void dgefa_blas(double **a,int n, int ipvt[],int *info)
  {
  double dmax,t;
  int i,j,k,kp1,l,nm1;

 *info = 0;
 nm1 = n - 1;
 if (n > 0) {
  for (k = 0; k < nm1; k++) {
     kp1 = k + 1;
     /* Find l = pivot index. */
     dmax = fabs(a[k][k]);
     l = k;
     for (i = k+1; i < n; i++) {
        if (fabs(a[i][k]) <= dmax) continue;
        dmax = fabs(a[i][k]);   /* track the current largest pivot candidate */
        l = i;
     }
     ipvt[k] = l;

     /* Zero pivot implies this column already triangularized. */
     if (a[l][k] == 0.0) {
        *info = k;
        continue;
     }

     /* Interchange if necessary. */
     if (l != k) {
        t = a[l][k];
        a[l][k] = a[k][k];
        a[k][k] = t;
     }

     /* Compute multipliers. */
     if (a[k][k] == 0.0) printf("\n!ERROR. Singular matrix.\n");
     t = -1.0/a[k][k];
     for (i = k+1; i < n; i++)
        a[i][k] *= t;

     /* Row elimination with column indexing. */
     for (j = kp1; j < n; j++) {
        t = a[l][j];
        if (l != k) {
           a[l][j] = a[k][j];
           a[k][j] = t;
        }
        for (i = k+1; i < n; i++ )
           a[i][j] += (t * a[i][k]);
     }
    }
   } 
   ipvt[n-1] = n-1;
   if (a[n-1][n-1] == 0.0) *info = n-1;
}



 void dgesl_blas(double **a,int n,int ipvt[],double b[],int job)
{
 double t;
 int i,k,kb,l,nm1;

 nm1 = n - 1;
 if (job == 0) {

  /* job = 0, solve a * x = b.
  * First solve l * y = b.
  */
  if (n > 0) {
     for (k = 0; k < nm1; k++) {
        l = ipvt[k];
        t = b[l];
        if (l != k) {
           b[l] = b[k];
           b[k] = t;
        }
        /* saxpy(n-k,t,a(k+1,k),1,b(k+1),1); */
        for (i=k+1;i < n;i++)
           b[i] += (t * a[i][k]); 
     }
   } 

    /* Now solve u * x = y. */
    for (kb = 0; kb < n; kb++) {
     k = n - kb-1;
     b[k] /= a[k][k];
     t = -b[k];
     /*      saxpy(k-1,t,a(1,k),1,b(1),1); */
     for (i = 0; i < k ; i++)
        b[i] += (t * a[i][k]);
    }
    return;
  }
  /* job != 0, solve trans(a) * x = b.
 * First solve trans(u) * x = y.
 */
 for (k = 0; k < n; k++) {
  /*  t = ddot(k-1,a(1,k),1,b(1),1); */
  t = 0;
  for (i = 0; i < k; i++)
     t += (a[i][k] * b[i]);
  b[k] = (b[k] - t)/a[k][k];
 }

 /* Now solve trans(l) * x = y. */

 if (n > 0) {
  for (kb = 0; kb < nm1; kb++) {
     k = n - 2 - kb;
     /* b[k] = b[k] + ddot(n-k,a(k+1,k),1,b(k+1),1); */
     t = 0;

     for (i = k+1;i < n; i++)
        t += (a[i][k] * b[i]);
     b[k] += t;
     l = ipvt[k];

     if (l != k) {
        t = b[l];
        b[l] = b[k];
        b[k] = t;
     }
    } 
  }
}

Upvotes: 1

Views: 677

Answers (1)

Robert Crovella

Reputation: 151899

There should not be any behavioral differences between a batch size of 100 and a batch size of 1000. (Certainly there would be a performance difference - the batch size of 1000 should probably take longer.)
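If the goal is to compare just the factorization cost at different batch sizes, a minimal timing sketch (reusing the handle, devPtrA_dev, d_pivot_array, d_info_array, neqn, batchsize and status variables from the question's code, and assuming they are already set up) could bracket only the cublasDgetrfBatched() call with CUDA events, so the cublasSetMatrix/cudaMemcpy transfers are excluded from the measured interval:

/* Sketch: time only the batched LU factorization with CUDA events.
   handle, devPtrA_dev, d_pivot_array, d_info_array, neqn, batchsize and
   status are assumed to be set up as in the question's code. */
cudaEvent_t evStart, evStop;
cudaEventCreate(&evStart);
cudaEventCreate(&evStop);

cudaEventRecord(evStart, 0);
status = cublasDgetrfBatched(handle, neqn, devPtrA_dev, neqn,
                             d_pivot_array, d_info_array, batchsize);
cudaEventRecord(evStop, 0);
cudaEventSynchronize(evStop);   /* wait until the batched kernels have finished */

float msec = 0.0f;
cudaEventElapsedTime(&msec, evStart, evStop);
if (status != CUBLAS_STATUS_SUCCESS) fprintf(stderr, "dgetrf failed: %d\n", status);
printf("cublasDgetrfBatched, batchsize = %d : %f ms\n", batchsize, msec);

cudaEventDestroy(evStart);
cudaEventDestroy(evStop);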

There are no published limits to the batch size, other than implicit memory limits. In fact, unless the GPU is returning incorrect results, there is no reason to think that you've run into any hard limit anyway.
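If there were any doubt, a quick check (again just a sketch built on the question's own variable names) is to copy the per-matrix info values written by cublasDgetrfBatched() back to the host and inspect them; a non-zero entry indicates a zero pivot in that matrix's factorization, while a memory limit would already show up as a failed cudaMalloc or a non-success cuBLAS status code:

/* Sketch: verify every matrix in the batch factored successfully.
   Variable names follow the question's code. */
cudaMemcpy(h_info_array, d_info_array, batchsize*sizeof(int), cudaMemcpyDeviceToHost);

for (int b_count = 0; b_count < batchsize; b_count++) {
    if (h_info_array[b_count] != 0)
        fprintf(stderr, "matrix %d: info = %d (zero pivot, factorization failed)\n",
                b_count, h_info_array[b_count]);
}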

(If you wanted to explore some behavioral or performance issue, this question is not properly written to address that.)

Upvotes: 3
