Ya Squall

Reputation: 21

How can I minimize the overhead of the CUDA memory-related operations issued before and after launching a kernel?

To make things easy, here is a sample code ready to be compiled and run:

module elemWiseOps
   USE cudafor
   USE cublas
   !
   ! Definition of symbols for real types (RP) and complex ones (CP)
   !
   IMPLICIT NONE
   !
   INTEGER, PARAMETER :: SP = SELECTED_REAL_KIND(6,   37)     ! REAL32
   INTEGER, PARAMETER :: DP = SELECTED_REAL_KIND(15, 307)     ! REAL64
   !
   INTERFACE VMUL
       MODULE PROCEDURE dVmul
   END INTERFACE

   INTERFACE VADD
       MODULE PROCEDURE dVadd
   END INTERFACE

   Contains

       attributes(global) subroutine dVmul(a, b)
       !
       IMPLICIT NONE
       !
       REAL(DP), DIMENSION(:,:), DEVICE, INTENT(in)    :: a
       REAL(DP), DIMENSION(:,:), DEVICE, INTENT(inout) :: b
       !
       ! local variables
       !
       INTEGER, DEVICE                  :: i1, j, n(2)

       i1 = (blockIdx%x-1)*blockDim%x + threadIdx%x
       j = (blockIdx%y-1)*blockDim%y + threadIdx%y
       n(1) = size(a,1)
       n(2) = size(a,2)
       if (i1<=n(1) .and. j<=n(2)) b(i1,j) = a(i1,j) * b(i1,j)
     end subroutine dVmul

     attributes(global) subroutine dVadd(a, b)
       !
       IMPLICIT NONE
       !
       REAL(DP), DIMENSION(:,:), DEVICE, INTENT(in)    :: a
       REAL(DP), DIMENSION(:,:), DEVICE, INTENT(inout) :: b
       !
       ! local variables
       !
       INTEGER, DEVICE                  :: i1, j, n(2)

       i1 = (blockIdx%x-1)*blockDim%x + threadIdx%x
       j = (blockIdx%y-1)*blockDim%y + threadIdx%y
       n(1) = size(a,1)
       n(2) = size(a,2)
       if (i1<=n(1) .and. j<=n(2)) b(i1,j) = a(i1,j) + b(i1,j)
     end subroutine dVadd

     attributes(global) subroutine dPrintV(a)
       !
       IMPLICIT NONE
       !
       REAL(DP), DIMENSION(:,:), DEVICE, INTENT(in)  :: a
       !
       ! local variables
       !
       INTEGER, DEVICE    :: i1, j1, n(2)

       n(1) = size(a,1)
       n(2) = size(a,2)

       DO j1 = 1, n(2)
           Do i1 = 1, n(1)
              print*, a(i1,j1)
           ENDDO
       ENDDO
     end subroutine dPrintV

   end module elemWiseOps

   PROGRAM Test
   !
   ! This is the main program for Test
   !
   USE cudafor
   USE cublas
   USE elemWiseOps

   !
   IMPLICIT NONE
   !
   REAL(DP), ALLOCATABLE, DIMENSION(:,:)           :: a,   b
   REAL(DP), ALLOCATABLE, DEVICE, DIMENSION(:,:)   :: a_d, b_d
   !
   INTEGER, PARAMETER                              :: m = 500, n = 1000
   INTEGER    :: i1, i2, istat
   type(dim3) :: grid, tBlock
   !
   tBlock    = dim3(32,32,1)
   grid      = dim3(ceiling(real(m)/tBlock%x), &
                    ceiling(real(n)/tBlock%y), 1)
   !
   ! Allocate storage for the arrays
   !
   Allocate(a(m,n),b(m,n))
   Allocate(a_d(m,n),b_d(m,n))
   !
   ! Initialize the host arrays
   !
   Do i2 = 1, n
       Do i1 = 1, m
           a(i1, i2) = REAL(i1, DP)
           b(i1, i2) = REAL(i1, DP)
       Enddo
   Enddo
   !
   ! Copy to the device arrays
   !
   istat = cudaMemcpy2D(a_d, m, a, m, m, n)
   istat = cudaMemcpy2D(b_d, m, b, m, m, n)
   !!!
   !
   ! Now invoke the kernels
   !
   Call vadd<<<grid,tBlock>>>(a_d, b_d)
   Call vadd<<<grid,tBlock>>>(a_d, b_d)
   Call vadd<<<grid,tBlock>>>(a_d, b_d)
   !
   ! Free storage for the arrays
   !
   Deallocate(a,b)
   Deallocate(a_d,b_d)
   !
END PROGRAM Test

I compiled the code with

nvfortran -O3 -cuda -fast -gpu=cc60 -lcufft -lcublas -Minfo=accel test.f90

Then I profiled it with

nvprof --print-api-trace ./a.out

Here's what happens if I call the kernel 3 times in a row:

... ...
171.32ms  287.59ms  cudaMalloc
458.94ms  98.290us  cudaMalloc
462.69ms  816.76us  cudaMemcpy2D
463.51ms  869.13us  cudaMemcpy2D
464.39ms  134.45us  cudaMalloc
464.52ms  12.703us  cudaMemcpy
464.54ms  5.2460us  cudaMalloc
464.54ms  4.1840us  cudaMemcpy
464.55ms  52.608us  cudaLaunchKernel (elemwiseops_dvadd_ [119])
464.61ms  123.66us  cudaFree
464.73ms  78.753us  cudaFree
464.81ms  107.87us  cudaMalloc
464.92ms  6.6480us  cudaMemcpy
464.92ms  3.7720us  cudaMalloc
464.93ms  3.8070us  cudaMemcpy
464.93ms  10.245us  cudaLaunchKernel (elemwiseops_dvadd_ [126])
464.94ms  124.35us  cudaFree
465.07ms  56.498us  cudaFree
465.12ms  93.173us  cudaMalloc
465.22ms  6.2260us  cudaMemcpy
465.22ms  3.6010us  cudaMalloc
465.23ms  3.6800us  cudaMemcpy
465.23ms  7.6920us  cudaLaunchKernel (elemwiseops_dvadd_ [133])
465.24ms  125.72us  cudaFree
465.37ms  47.850us  cudaFree
465.64ms  60.294us  cudaFree
465.70ms  208.73us  cudaFree


My question is: is it possible to avoid the overhead of cudaMalloc, cudaMemcpy and cudaFree when invoking a kernel in CUDA Fortran? Many thanks!

Upvotes: 0

Views: 428

Answers (1)

Ya Squall

Reputation: 21

Reason found. Instead of using assumed-shape arrays as dummy arguments, using explicit-shape arrays eliminates the creation of array temporaries (here, presumably the device-side array descriptors that have to be allocated, copied and freed for every launch, which is what shows up as the small cudaMalloc/cudaMemcpy/cudaFree pairs around each cudaLaunchKernel). That is, revising the original kernel to the following form solves the problem:

attributes(global) subroutine dVadd(m, n, a, b)
!
IMPLICIT NONE
!
integer, value :: m, n
REAL(DP), DIMENSION(1:m,1:n), DEVICE, INTENT(in)    :: a
REAL(DP), DIMENSION(1:m,1:n), DEVICE, INTENT(inout) :: b
!
! local variables
!
INTEGER, DEVICE                  :: i1, j

i1 = (blockIdx%x-1)*blockDim%x + threadIdx%x
j = (blockIdx%y-1)*blockDim%y + threadIdx%y

if (i1<=m .and. j<=n) b(i1,j) = a(i1,j) + b(i1,j)

end subroutine dVadd
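
With the explicit-shape kernel, the launch site also has to pass the extents by value. A minimal sketch of the updated calls in the main program (not shown above), assuming the generic interface VADD is kept pointing at the revised dVadd:

! m and n are now passed by value to the kernel
Call vadd<<<grid,tBlock>>>(m, n, a_d, b_d)
Call vadd<<<grid,tBlock>>>(m, n, a_d, b_d)
Call vadd<<<grid,tBlock>>>(m, n, a_d, b_d)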

And here's the updated nvprof output:

... ...
171.32ms  287.59ms  cudaMalloc
458.94ms  98.290us  cudaMalloc
462.69ms  816.76us  cudaMemcpy2D
463.51ms  869.13us  cudaMemcpy2D
464.55ms  52.608us  cudaLaunchKernel (elemwiseops_dvadd_ [119])
465.12ms  52.608us  cudaLaunchKernel (elemwiseops_dvadd_ [126])
465.61ms  52.608us  cudaLaunchKernel (elemwiseops_dvadd_ [133])
466.93ms  123.66us  cudaFree
467.75ms  78.753us  cudaFree

This conclusion somewhat conflicts with my experience in ordinary Fortran code, where no temporary arrays are created when passing allocatable arrays as actual arguments to a subroutine that declares assumed-shape dummy arguments.
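
For comparison, here is a minimal host-only sketch (hypothetical names) of the ordinary Fortran case I mean: an allocatable actual argument passed to an assumed-shape dummy is handed over via its descriptor, with no temporary copy:

module host_demo
    implicit none
contains
    subroutine scale2(a)
        ! assumed-shape dummy: only the array descriptor is received
        real(8), intent(inout) :: a(:,:)
        a = 2.0d0 * a
    end subroutine scale2
end module host_demo

program demo
    use host_demo
    implicit none
    real(8), allocatable :: x(:,:)
    allocate(x(500,1000))
    x = 1.0d0
    call scale2(x)   ! no array temporary is created for this call
    print *, x(1,1)
end program demo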

Upvotes: 1
