Nick Brady
Nick Brady

Reputation: 151

Why does a manually programmed matrix multiplication combined with matrix addition give better performance than the intrinsic functions?

I have some legacy code which performs the matrix operation of B = B + A*E as

DO I = 1,N
  DO L = 1,N
    DO K = 1,N
      B(I,K) = B(I,K) + A(I,L)*E(L,K,J-1)
    end do
  end do
end do

To improve readability as well as take advantage of modern fortran intrinsic functions, I would like to write the above code as

B = B + matmul( A, E(:, 1:N, J-1) )

I noticed that the improved readability comes at the cost of performance. I determined that the problem is not with the intrinsic function matmul - the left figure shows that matmul performs just as well as the manually written operation for all values of N.

When matrix multiplication is combined with matrix addition, then for small values of N the manually written operation performs better than the intrinsic functions. For my uses, usually N < 10; I would like to improve the readability without losing the performance. Might there be a suggestion for that?

The code I am using is below. I am using Mac OS 10.14.6 with gfortran 8.2.0 and compiling with the -O3 optimization option.

Execution time for the manual vs intrinsic functions. Left is is just matrix multiplication (C = AB). Right is matrix multiplication with addition (C = C + AB).

program test
  implicit none

  integer :: loop_max = 1000
  integer :: j                                        ! loop index
  integer :: i                                        ! loop index
  real    :: t1, t2                                   ! start and end times
  real    :: t_manual, t_intrinsic, t_man_add, t_intrn_add


  integer                               :: N          ! matrix dimension
  integer, parameter                    :: NJ = 12

  real, dimension(:, :),    allocatable :: A, B       ! matrices
  real, dimension(:, :),    allocatable :: D
  real, dimension(:),       allocatable :: G
  real, dimension(:, :, :), allocatable :: E

  open(1, file = 'Delete.txt', status = 'unknown')


  do N = 1, 40
    allocate(A(N,N), B(N,N), G(N), D(N, 2*N+1), E(N, N+1, NJ))


    ! ##########################################################################
    ! manual matrix multiplication vs matmul
    call rand_fill
    call CPU_time(t1)

    do i = 1, loop_max
      do j = 2, 12
        call matmul_manual(j, N, NJ, A, B, D, G, E)
      end do
    end do

    call CPU_time(t2)
    t_manual = t2 - t1
    write(1, *) A, B, D, G, E


    call rand_fill
    call CPU_time(t1)

    do i = 1, loop_max
      do j = 2, 12
        B         =  matmul( A, E(:, 1:N, j-1) )
      end do
    end do

    call CPU_time(t2)
    t_intrinsic = t2 - t1
    write(1, *) A, B, D, G, E
    ! --------------------------------------------------------------------------




    ! ##########################################################################
    ! manual matrix multiplication with matrix addition
    call rand_fill
    call CPU_time(t1)

    do i = 1, loop_max
      do j = 2, 12
        call manual_matmul_add(j, N, NJ, A, B, D, G, E)
      end do
    end do

    call CPU_time(t2)
    t_man_add = t2 - t1
    write(1, *) A, B, D, G, E
    ! --------------------------------------------------------------------------



    ! ##########################################################################
    ! intrinsic matrix multiplication (matmul) with matrix addition
    call rand_fill
    call CPU_time(t1)

    do i = 1, loop_max
      do j = 2, 12
        call intrinsic_matmul_add(j, N, NJ, A, B, D, G, E)
      end do
    end do

    call CPU_time(t2)
    t_intrn_add = t2 - t1
    write(1, *) A, B, D, G, E
    ! --------------------------------------------------------------------------


    deallocate(A, B, D, G, E)

    print*, N, t_manual, t_intrinsic, t_man_add, t_intrn_add

  end do





contains
  subroutine rand_fill
    ! fill the matrices with random numbers
    call random_number(A)
    call random_number(B)
    call random_number(D)
    call random_number(G)
    call random_number(E)

  end subroutine


end program test











subroutine matmul_manual(j, N, NJ, A, B, D, G, E)
  implicit none

  integer, intent(in)                          :: j
  integer, intent(in)                          :: N, NJ
  real, dimension(N, N),        intent(in out) :: A, B
  real, dimension(N, 2*N+1),    intent(in out) :: D
  real, dimension(N),           intent(in out) :: G
  real, dimension(N, N+1, NJ),  intent(in out) :: E

  integer :: I, L, K  ! loop indices

  B = 0.0
  DO I = 1,N
    DO L = 1,N
      DO K = 1,N
        B(I,K) = B(I,K) + A(I,L)*E(L,K,J-1)
      end do
    end do
  end do

end subroutine matmul_manual






subroutine manual_matmul_add(j, N, NJ, A, B, D, G, E)
  implicit none

  integer, intent(in)                          :: j
  integer, intent(in)                          :: N, NJ
  real, dimension(N, N),        intent(in out) :: A, B
  real, dimension(N, 2*N+1),    intent(in out) :: D
  real, dimension(N),           intent(in out) :: G
  real, dimension(N, N+1, NJ),  intent(in out) :: E

  integer :: I, L, K  ! loop indices

  DO I = 1,N
    D(I,N+1) = -G(I)
    DO L = 1,N
      D(I,N+1) = D(I,N+1)+A(I,L)*E(L,N+1,J-1)
      DO K = 1,N
        B(I,K) = B(I,K) + A(I,L)*E(L,K,J-1)
      end do
    end do
  end do

end subroutine manual_matmul_add




subroutine intrinsic_matmul_add(j, N, NJ, A, B, D, G, E)
  implicit none

  integer, intent(in)                          :: j
  integer, intent(in)                          :: N, NJ
  real, dimension(N, N),        intent(in out) :: A, B
  real, dimension(N, 2*N+1),    intent(in out) :: D
  real, dimension(N),           intent(in out) :: G
  real, dimension(N, N+1, NJ),  intent(in out) :: E

  real, dimension(N, N+1) :: temp1
  real, dimension(N, N)   :: temp2

  D(:, N+1) = -G + matmul( A, E(:, N+1, j-1) )
  B         =  B + matmul( A, E(:, 1:N, j-1) )

end subroutine intrinsic_matmul_add




subroutine mat_sub_new(j, N, NJ, A, B, D, G, E)
  implicit none

  integer, intent(in)                          :: j
  integer, intent(in)                          :: N, NJ
  real, dimension(N, N),        intent(in out) :: A, B
  real, dimension(N, 2*N+1),    intent(in out) :: D
  real, dimension(N),           intent(in out) :: G
  real, dimension(N, N+1, NJ),  intent(in out) :: E


  if (N == 1) then        ! matmul seems to be inefficient when N = 1
    D(N,N+1) = -G(N)   + A(N,N)*E(N, N+1, J-1)
    B(N,N)   =  B(N,N) + A(N,N)*E(N, N,   J-1)

  else
    D(:, N+1) = -G + matmul( A, E(:, N+1, j-1) )
    B         =  B + matmul( A, E(:, 1:N, j-1) )
  end if

end subroutine mat_sub_new

Upvotes: 3

Views: 373

Answers (1)

Federico Perini
Federico Perini

Reputation: 1416

I suspect this has to do with two issues:

  1. How the compiler resolves the MATMUL generic interface to a proper call to the correct routine (REAL vs. COMPLEX vs. INTEGER, or matrix times vector vs. matrix times matrix): I have no idea whether this is done systematically at compilation or at runtime, or whether this choice is made based on the optimization level; (this would justify the additional overhead especially for the low-size cases)

  2. The internal "general purpose" algorithm may not be in general the best suited for your problem, as it looks like in some cases, brute-force compiler optimization does a better job. Is gfortran's MATMUL intrinsics based on BLAS, for example? If so, that may not be the fastest.

I've done a similar test with NORM2 on my PC (Windows, i7, gfortran 9.2.0, compiled with -O3 -march=core-avx2): Turns out that, for NORM2:

  • The BLAS implementation is always slowest, despite having been slightly refactored to feed the compiler a PURE version

  • Usage of the intrinsics (both NORM2 or SQRT(SUM(X**2))) is always slow, regardless of the array size

  • The fastest cases are when either using a simple loop, or the intrinsics with a fixed-size array:

    ARRAY SIZE   assumed-shape      fixed-size intrinsic NORM2            LOOP            BLAS
             N          [ms/N]          [ms/N]          [ms/N]          [ms/N]          [ms/N]
      2             5.93750E-09     4.06250E-09     8.43750E-09     4.06250E-09     1.03125E-08
     12             1.03125E-08     7.81250E-09     3.12500E-08     7.81250E-09     5.09375E-08
     22             1.65625E-08     1.40625E-08     5.50000E-08     1.43750E-08     9.15625E-08
     32             2.25000E-08     2.00000E-08     7.81250E-08     2.00000E-08     1.29375E-07
    

BTW The code is pasted here below (beware of the large memory footprint!):

program test_norm
   use iso_fortran_env
   implicit none

   integer :: xsize,i,iunit,icase
   integer, parameter :: testSize = 50000000
   real(real64), allocatable :: set(:,:),setNorm(:)
   real(real64) :: t0,t1,setSum(5),timeTable(5,35)
   intrinsic :: norm2 

   open(newunit=iunit,file='garbage.txt',action='write')

   print '(6(1x,a15))' ,'ARRAY SIZE','assumed-shape','fixed-size','intrinsic NORM2','LOOP','BLAS'
   print '(6(1x,a15))' ,'N',('[ms/N]',i=1,5)

   icase = 0
   do xsize = 2,32,10

     ! Initialize test set 
     icase = icase+1
     allocate(set(xsize,testSize),setNorm(testSize))
     call random_number(set)

     ! Test 1: intrinsic SQRT/SUM, assumed-shape array
     call cpu_time(t0); forall(i=1:testSize) setNorm(i) = norm_v1(set(:,i)); call cpu_time(t1)
     setSum(1) = sum(setNorm); timeTable(1,icase) = t1-t0
      
     ! Test 2: intrinsic SQRT/SUM, fixed-size array 
     call cpu_time(t0); forall(i=1:testSize) setNorm(i) = norm_v2(xsize,set(:,i)); call cpu_time(t1)
     setSum(2) = sum(setNorm); timeTable(2,icase) = t1-t0

     ! Test 3: intrinsic NORM2
     call cpu_time(t0); forall(i=1:testSize) setNorm(i) = norm2(set(:,i)); call cpu_time(t1)
     setSum(3) = sum(setNorm); timeTable(3,icase) = t1-t0

     ! Test 4: LOOP 
     call cpu_time(t0); forall(i=1:testSize) setNorm(i) = norm_v3(xsize,set(:,i)); call cpu_time(t1)
     setSum(4) = sum(setNorm); timeTable(4,icase) = t1-t0

     ! Test 5: BLAS
     call cpu_time(t0); forall(i=1:testSize) setNorm(i) = DNRM2(xsize,set(:,i),1); call cpu_time(t1)
     setSum(5) = sum(setNorm); timeTable(5,icase) = t1-t0

     ! Print output
     print '(7x,i2,9x,5(2x,1pe13.5e2,1x))', xsize,timeTable(:,icase)/testSize
     write (iunit,*) 'setSum = ',setSum
     deallocate(set,setNorm)
      
   end do 
   
   close(iunit)

   contains

   pure real(real64) function norm_v1(x) result(L2)
      real(real64), intent(in) :: x(:)

      L2 = sqrt(sum(x**2))
   end function norm_v1      

   pure real(real64) function norm_v2(n,x) result(L2)
      integer, intent(in) :: n 
      real(real64), intent(in) :: x(n)

      L2 = sqrt(sum(x**2))
   end function norm_v2
       
   pure real(real64) function norm_v3(n,x) result(L2)
      integer, intent(in) :: n 
      real(real64), intent(in) :: x(n)
      integer :: i

      L2 = 0.0_real64
      do i=1,n
        L2 = L2 + x(i)**2
      end do  
      L2 = sqrt(L2)
       
   end function norm_v3

   PURE REAL(REAL64) FUNCTION DNRM2 ( N, X, INCX )
   INTEGER, INTENT(IN) :: N,INCX
   REAL(REAL64), INTENT(IN) :: X( * )
   REAL(REAL64), PARAMETER :: ONE  = 1.0_REAL64
   REAL(REAL64), PARAMETER :: ZERO = 0.0_REAL64
   
   INTEGER :: IX
   REAL(REAL64) :: ABSXI, NORM, SCALE, SSQ
   INTRINSIC :: ABS, SQRT
   IF( N<1 .OR. INCX<1 )THEN
     NORM  = ZERO
   ELSE IF( N==1 )THEN
     NORM  = ABS( X( 1 ) )
   ELSE
     SCALE = ZERO
     SSQ   = ONE
     DO IX = 1, 1 + ( N - 1 )*INCX, INCX
        IF( X( IX )/=ZERO )THEN
           ABSXI = ABS( X( IX ) )
           IF( SCALE<ABSXI )THEN
              SSQ   = ONE   + SSQ*( SCALE/ABSXI )**2
              SCALE = ABSXI
           ELSE
              SSQ   = SSQ   +     ( ABSXI/SCALE )**2
           END IF
        END IF
     END DO
     NORM  = SCALE * SQRT( SSQ )
  END IF
  DNRM2 = NORM
  RETURN
  END FUNCTION DNRM2

end program test_norm

Upvotes: 1

Related Questions