Ya Squall

Reputation: 21

How can I minimize the overhead of the CUDA memory-related operations issued before and after launching a kernel?

To make things easy, here is a sample code ready to be compiled and run:

module elemWiseOps
   USE cudafor
   USE cublas
   !
   ! Definition of symbols for real types (RP) and complex ones (CP)
   !
   IMPLICIT NONE
   !
   INTEGER, PARAMETER :: SP = SELECTED_REAL_KIND(6,   37)     ! REAL32
   INTEGER, PARAMETER :: DP = SELECTED_REAL_KIND(15, 307)     ! REAL64
   !
   INTERFACE VMUL
       MODULE PROCEDURE dVmul
   END INTERFACE

   INTERFACE VADD
       MODULE PROCEDURE dVadd
   END INTERFACE

   Contains

       attributes(global) subroutine dVmul(a, b)
       !
       IMPLICIT NONE
       !
       REAL(DP), DIMENSION(:,:), DEVICE, INTENT(in)    :: a
       REAL(DP), DIMENSION(:,:), DEVICE, INTENT(inout) :: b
       !
       ! local variables
       !
       INTEGER, DEVICE                  :: i1, j, n(2)

       i1 = (blockIdx%x-1)*blockDim%x + threadIdx%x
       j = (blockIdx%y-1)*blockDim%y + threadIdx%y
       n(1) = size(a,1)
       n(2) = size(a,2)
       if (i1<=n(1) .and. j<=n(2)) b(i1,j) = a(i1,j) * b(i1,j)
     end subroutine dVmul

     attributes(global) subroutine dVadd(a, b)
       !
       IMPLICIT NONE
       !
       REAL(DP), DIMENSION(:,:), DEVICE, INTENT(in)    :: a
       REAL(DP), DIMENSION(:,:), DEVICE, INTENT(inout) :: b
       !
       ! local variables
       !
       INTEGER, DEVICE                  :: i1, j, n(2)

       i1 = (blockIdx%x-1)*blockDim%x + threadIdx%x
       j = (blockIdx%y-1)*blockDim%y + threadIdx%y
       n(1) = size(a,1)
       n(2) = size(a,2)
       if (i1<=n(1) .and. j<=n(2)) b(i1,j) = a(i1,j) + b(i1,j)
     end subroutine dVadd

     attributes(global) subroutine dPrintV(a)
       !
       IMPLICIT NONE
       !
       REAL(DP), DIMENSION(:,:), DEVICE, INTENT(in)  :: a
       !
       ! local variables
       !
       INTEGER, DEVICE    :: i1, j1, n(2)

       n(1) = size(a,1)
       n(2) = size(a,2)

       DO j1 = 1, n(2)
           Do i1 = 1, n(1)
              print*, a(i1,j1)
           ENDDO
       ENDDO
     end subroutine dPrintV

   end module elemWiseOps

   PROGRAM Test
   !
   ! This is the main program for Test
   !
   USE cudafor
   USE cublas
   USE elemWiseOps

   !
   IMPLICIT NONE
   !
   REAL(DP), ALLOCATABLE, DIMENSION(:,:)           :: a,   b
   REAL(DP), ALLOCATABLE, DEVICE, DIMENSION(:,:)   :: a_d, b_d
   !
   INTEGER, PARAMETER                              :: m = 500, n = 1000
   INTEGER    :: i1, i2, istat
   type(dim3) :: grid, tBlock
   !
   tBlock    = dim3(32,32,1)
   grid      = dim3(ceiling(real(m)/tBlock%x), &
                    ceiling(real(n)/tBlock%y), 1)
   !
   ! Allocate storage for the arrays
   !
   Allocate(a(m,n),b(m,n))
   Allocate(a_d(m,n),b_d(m,n))
   !
   ! Initialize the host arrays
   !
   Do i2 = 1, n
       Do i1 = 1, m
           a(i1, i2) = REAL(i1, DP)
           b(i1, i2) = REAL(i1, DP)
       Enddo
   Enddo
   !
   ! Copy to the device arrays
   !
   istat = cudaMemcpy2D(a_d, m, a, m, m, n)
   istat = cudaMemcpy2D(b_d, m, b, m, m, n)
   !!!
   !
   ! Now invoke the kernels
   !
   Call vadd<<<grid,tBlock>>>(a_d, b_d)
   Call vadd<<<grid,tBlock>>>(a_d, b_d)
   Call vadd<<<grid,tBlock>>>(a_d, b_d)
   !
   ! Free storage for the arrays
   !
   Deallocate(a,b)
   Deallocate(a_d,b_d)
   !
END PROGRAM Test

I compiled the code with

nvfortran -O3 -cuda -fast -gpu=cc60 -lcufft -lcublas -Minfo=accel test.f90

Then I profiled it with

nvprof --print-api-trace ./a.out

Here's what happens if I call the kernel 3 times in a row:

... ...
171.32ms  287.59ms  cudaMalloc
458.94ms  98.290us  cudaMalloc
462.69ms  816.76us  cudaMemcpy2D
463.51ms  869.13us  cudaMemcpy2D
464.39ms  134.45us  cudaMalloc
464.52ms  12.703us  cudaMemcpy
464.54ms  5.2460us  cudaMalloc
464.54ms  4.1840us  cudaMemcpy
464.55ms  52.608us  cudaLaunchKernel (elemwiseops_dvadd_ [119])
464.61ms  123.66us  cudaFree
464.73ms  78.753us  cudaFree
464.81ms  107.87us  cudaMalloc
464.92ms  6.6480us  cudaMemcpy
464.92ms  3.7720us  cudaMalloc
464.93ms  3.8070us  cudaMemcpy
464.93ms  10.245us  cudaLaunchKernel (elemwiseops_dvadd_ [126])
464.94ms  124.35us  cudaFree
465.07ms  56.498us  cudaFree
465.12ms  93.173us  cudaMalloc
465.22ms  6.2260us  cudaMemcpy
465.22ms  3.6010us  cudaMalloc
465.23ms  3.6800us  cudaMemcpy
465.23ms  7.6920us  cudaLaunchKernel (elemwiseops_dvadd_ [133])
465.24ms  125.72us  cudaFree
465.37ms  47.850us  cudaFree
465.64ms  60.294us  cudaFree
465.70ms  208.73us  cudaFree


My question is: is it possible to avoid the overhead of cudaMalloc, cudaMemcpy and cudaFree when invoking a kernel in CUDA Fortran? Many thanks!

Upvotes: 0

Views: 428

Answers (1)

Ya Squall

Reputation: 21

Reason found. Instead of using assumed-shape arrays as dummy arguments, using explicit-shape arrays eliminates the creation of array temporaries (here, presumably the device-side array descriptors that have to be allocated, copied and freed for every launch, which is what shows up as the small cudaMalloc/cudaMemcpy/cudaFree pairs around each cudaLaunchKernel). That is, revising the original kernel to the following form solves the problem:

attributes(global) subroutine dVadd(m, n, a, b)
!
IMPLICIT NONE
!
integer, value :: m, n
REAL(DP), DIMENSION(1:m,1:n), DEVICE, INTENT(in)    :: a
REAL(DP), DIMENSION(1:m,1:n), DEVICE, INTENT(inout) :: b
!
! local variables
!
INTEGER, DEVICE                  :: i1, j

i1 = (blockIdx%x-1)*blockDim%x + threadIdx%x
j = (blockIdx%y-1)*blockDim%y + threadIdx%y

if (i1<=m .and. j<=n) b(i1,j) = a(i1,j) + b(i1,j)

end subroutine dVadd
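
With the explicit-shape kernel, the launch site also has to pass the extents by value. A minimal sketch of the updated calls in the main program (not shown above), assuming the generic interface VADD is kept pointing at the revised dVadd:

! m and n are now passed by value to the kernel
Call vadd<<<grid,tBlock>>>(m, n, a_d, b_d)
Call vadd<<<grid,tBlock>>>(m, n, a_d, b_d)
Call vadd<<<grid,tBlock>>>(m, n, a_d, b_d)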

And here's the updated nvprof output:

... ...
171.32ms  287.59ms  cudaMalloc
458.94ms  98.290us  cudaMalloc
462.69ms  816.76us  cudaMemcpy2D
463.51ms  869.13us  cudaMemcpy2D
464.55ms  52.608us  cudaLaunchKernel (elemwiseops_dvadd_ [119])
465.12ms  52.608us  cudaLaunchKernel (elemwiseops_dvadd_ [126])
465.61ms  52.608us  cudaLaunchKernel (elemwiseops_dvadd_ [133])
466.93ms  123.66us  cudaFree
467.75ms  78.753us  cudaFree

This conclusion somewhat conflicts with my experience in ordinary Fortran code, where no temporary arrays are created when passing allocatable arrays as actual arguments to a subroutine that declares assumed-shape dummy arguments.
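
For comparison, here is a minimal host-only sketch (hypothetical names) of the ordinary Fortran case I mean: an allocatable actual argument passed to an assumed-shape dummy is handed over via its descriptor, with no temporary copy:

module host_demo
    implicit none
contains
    subroutine scale2(a)
        ! assumed-shape dummy: only the array descriptor is received
        real(8), intent(inout) :: a(:,:)
        a = 2.0d0 * a
    end subroutine scale2
end module host_demo

program demo
    use host_demo
    implicit none
    real(8), allocatable :: x(:,:)
    allocate(x(500,1000))
    x = 1.0d0
    call scale2(x)   ! no array temporary is created for this call
    print *, x(1,1)
end program demo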

Upvotes: 1
