To make things easy, here's the sample code, ready to be compiled and run:
module elemWiseOps
   USE cudafor
   USE cublas
   !
   ! Definition of kind parameters for single (SP) and double (DP) precision reals
   !
   IMPLICIT NONE
   !
   INTEGER, PARAMETER :: SP = SELECTED_REAL_KIND(6, 37)    ! REAL32
   INTEGER, PARAMETER :: DP = SELECTED_REAL_KIND(15, 307)  ! REAL64
   !
   INTERFACE VMUL
      MODULE PROCEDURE dVmul
   END INTERFACE
   INTERFACE VADD
      MODULE PROCEDURE dVadd
   END INTERFACE
Contains
   attributes(global) subroutine dVmul(a, b)
      !
      IMPLICIT NONE
      !
      REAL(DP), DIMENSION(:,:), DEVICE, INTENT(in)    :: a
      REAL(DP), DIMENSION(:,:), DEVICE, INTENT(inout) :: b
      !
      ! local variables
      !
      INTEGER, DEVICE :: i1, j, n(2)
      i1 = (blockIdx%x-1)*blockDim%x + threadIdx%x
      j  = (blockIdx%y-1)*blockDim%y + threadIdx%y
      n(1) = size(a,1)
      n(2) = size(a,2)
      if (i1<=n(1) .and. j<=n(2)) b(i1,j) = a(i1,j) * b(i1,j)
   end subroutine dVmul
   attributes(global) subroutine dVadd(a, b)
      !
      IMPLICIT NONE
      !
      REAL(DP), DIMENSION(:,:), DEVICE, INTENT(in)    :: a
      REAL(DP), DIMENSION(:,:), DEVICE, INTENT(inout) :: b
      !
      ! local variables
      !
      INTEGER, DEVICE :: i1, j, n(2)
      i1 = (blockIdx%x-1)*blockDim%x + threadIdx%x
      j  = (blockIdx%y-1)*blockDim%y + threadIdx%y
      n(1) = size(a,1)
      n(2) = size(a,2)
      if (i1<=n(1) .and. j<=n(2)) b(i1,j) = a(i1,j) + b(i1,j)
   end subroutine dVadd
   attributes(global) subroutine dPrintV(a)
      !
      IMPLICIT NONE
      !
      REAL(DP), DIMENSION(:,:), DEVICE, INTENT(in) :: a
      !
      ! local variables
      !
      INTEGER, DEVICE :: i1, j1, n(2)
      n(1) = size(a,1)
      n(2) = size(a,2)
      DO j1 = 1, n(2)
         DO i1 = 1, n(1)
            print*, a(i1,j1)
         ENDDO
      ENDDO
   end subroutine dPrintV
end module elemWiseOps
PROGRAM Test
   !
   ! This is the main program for Test
   !
   USE cudafor
   USE cublas
   USE elemWiseOps
   !
   IMPLICIT NONE
   !
   REAL(DP), ALLOCATABLE, DIMENSION(:,:)         :: a, b
   REAL(DP), ALLOCATABLE, DEVICE, DIMENSION(:,:) :: a_d, b_d
   !
   INTEGER, PARAMETER :: m = 500, n = 1000
   INTEGER :: i1, i2, istat
   type(dim3) :: grid, tBlock
   !
   tBlock = dim3(32,32,1)
   grid = dim3(ceiling(real(m)/tBlock%x), &
               ceiling(real(n)/tBlock%y), 1)
   !
   ! Allocate storage for the arrays
   !
   Allocate(a(m,n),b(m,n))
   Allocate(a_d(m,n),b_d(m,n))
   !
   ! Initialize the host arrays
   !
   Do i2 = 1, n
      Do i1 = 1, m
         a(i1, i2) = REAL(i1, DP)
         b(i1, i2) = REAL(i1, DP)
      Enddo
   Enddo
   !
   ! Copy to the device arrays
   !
   istat = cudaMemcpy2D(a_d, m, a, m, m, n)
   istat = cudaMemcpy2D(b_d, m, b, m, m, n)
   !
   ! Now invoke the kernels
   !
   Call vadd<<<grid,tBlock>>>(a_d, b_d)
   Call vadd<<<grid,tBlock>>>(a_d, b_d)
   Call vadd<<<grid,tBlock>>>(a_d, b_d)
   !
   ! Free storage for the arrays
   !
   Deallocate(a,b)
   Deallocate(a_d,b_d)
   !
END PROGRAM Test
I compiled the code with
nvfortran -O3 -cuda -fast -gpu=cc60 -lcufft -lcublas -Minfo=accel test.f90
Then I profiled it with
nvprof --print-api-trace ./a.out
Here's what happened when I called the kernel 3 times in a row:
... ...
171.32ms 287.59ms cudaMalloc
458.94ms 98.290us cudaMalloc
462.69ms 816.76us cudaMemcpy2D
463.51ms 869.13us cudaMemcpy2D
464.39ms 134.45us cudaMalloc
464.52ms 12.703us cudaMemcpy
464.54ms 5.2460us cudaMalloc
464.54ms 4.1840us cudaMemcpy
464.55ms 52.608us cudaLaunchKernel (elemwiseops_dvadd_ [119])
464.61ms 123.66us cudaFree
464.73ms 78.753us cudaFree
464.81ms 107.87us cudaMalloc
464.92ms 6.6480us cudaMemcpy
464.92ms 3.7720us cudaMalloc
464.93ms 3.8070us cudaMemcpy
464.93ms 10.245us cudaLaunchKernel (elemwiseops_dvadd_ [126])
464.94ms 124.35us cudaFree
465.07ms 56.498us cudaFree
465.12ms 93.173us cudaMalloc
465.22ms 6.2260us cudaMemcpy
465.22ms 3.6010us cudaMalloc
465.23ms 3.6800us cudaMemcpy
465.23ms 7.6920us cudaLaunchKernel (elemwiseops_dvadd_ [133])
465.24ms 125.72us cudaFree
465.37ms 47.850us cudaFree
465.64ms 60.294us cudaFree
465.70ms 208.73us cudaFree
My question is: is it possible to avoid the overhead of cudaMalloc, cudaMemcpy, and cudaFree when invoking a kernel in CUDA Fortran? Many thanks!
Reason found: passing explicit-shape arrays instead of deferred-shape (assumed-shape) arrays as dummy arguments eliminates the creation of array temporaries. That is, revising the original kernel to the following form solves the problem:
attributes(global) subroutine dVadd(m, n, a, b)
   !
   IMPLICIT NONE
   !
   integer, value :: m, n
   REAL(DP), DIMENSION(1:m,1:n), DEVICE, INTENT(in)    :: a
   REAL(DP), DIMENSION(1:m,1:n), DEVICE, INTENT(inout) :: b
   !
   ! local variables
   !
   INTEGER, DEVICE :: i1, j
   i1 = (blockIdx%x-1)*blockDim%x + threadIdx%x
   j  = (blockIdx%y-1)*blockDim%y + threadIdx%y
   if (i1<=m .and. j<=n) b(i1,j) = a(i1,j) + b(i1,j)
end subroutine dVadd
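Since the kernel now receives the extents by value, the launch in the main program has to pass them explicitly. A minimal sketch of the updated call site (assuming the generic interface VADD still lists dVadd as its module procedure):

   ! Updated launch: pass the array extents m and n by value
   Call vadd<<<grid,tBlock>>>(m, n, a_d, b_d)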
And here's the updated nvprof output:
... ...
171.32ms 287.59ms cudaMalloc
458.94ms 98.290us cudaMalloc
462.69ms 816.76us cudaMemcpy2D
463.51ms 869.13us cudaMemcpy2D
464.55ms 52.608us cudaLaunchKernel (elemwiseops_dvadd_ [119])
465.12ms 52.608us cudaLaunchKernel (elemwiseops_dvadd_ [126])
465.61ms 52.608us cudaLaunchKernel (elemwiseops_dvadd_ [133])
466.93ms 123.66us cudaFree
467.75ms 78.753us cudaFree
This conclusion is somewhat at odds with my experience in ordinary Fortran code, where no temporary arrays should be created when allocatable arrays are passed as actual arguments to a subroutine that declares deferred-shape (assumed-shape) arrays as dummy arguments.
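To illustrate the host-side behaviour referred to above, here is a minimal ordinary-Fortran sketch (hypothetical names, not part of the original post): associating an allocatable actual argument with an assumed-shape dummy passes only the array descriptor, so no temporary copy is made.

module hostOps
   implicit none
   integer, parameter :: dp = selected_real_kind(15, 307)
contains
   subroutine hVadd(a, b)
      ! Assumed-shape dummies: on the host only the array descriptor
      ! is passed, so no temporary copy of a or b is created.
      real(dp), dimension(:,:), intent(in)    :: a
      real(dp), dimension(:,:), intent(inout) :: b
      b = a + b
   end subroutine hVadd
end module hostOps

In a CUDA Fortran global subroutine, by contrast, that descriptor apparently has to be set up in device memory before every launch, which is presumably what the extra cudaMalloc/cudaMemcpy/cudaFree calls around each cudaLaunchKernel in the first trace correspond to.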