Why does a manually programmed matrix multiplication combined with matrix addition give better performance than the intrinsic functions?

Question

I have some legacy code which performs the matrix operation of B = B + A*E as

! Legacy kernel: for every (I, K) pair, add the sum over L of
! A(I,L)*E(L,K,J-1) to B(I,K), i.e. accumulate the product of A with
! columns 1..N of slab J-1 of E into B.
DO I = 1,N
  DO L = 1,N
    DO K = 1,N
      B(I,K) = B(I,K) + A(I,L)*E(L,K,J-1)
    end do
  end do
end do

To improve readability and take advantage of modern Fortran intrinsic functions, I would like to write the above code as

! Whole-array form: add the product of A and columns 1..N of slab J-1 of E to B.
B = B + matmul( A, E(:, 1:N, J-1) )

I noticed that the improved readability comes at the cost of performance. I determined that the problem is not with the intrinsic function matmul - the left figure shows that matmul performs just as well as the manually written operation for all values of N.

When matrix multiplication is combined with matrix addition, the manually written operation performs better than the intrinsic functions for small values of N. In my use case N is usually less than 10, and I would like to improve readability without losing performance. Is there a way to achieve that?

The code I am using is below. I am using Mac OS 10.14.6 with gfortran 8.2.0 and compiling with the -O3 optimization option.

program test
  ! Benchmark driver: for every matrix order N = 1..40 it times four kernels
  ! and prints one line "N, t_manual, t_intrinsic, t_man_add, t_intrn_add".
  !
  !   t_manual    - explicit-loop product          (matmul_manual)
  !   t_intrinsic - matmul intrinsic, product only (inline)
  !   t_man_add   - explicit-loop product + add    (manual_matmul_add)
  !   t_intrn_add - matmul intrinsic + add         (intrinsic_matmul_add)
  !
  ! The three named kernels are external subroutines defined outside this
  ! program unit.
  implicit none

  integer, parameter :: loop_max = 1000               ! repetitions of each timed kernel
  integer, parameter :: NJ = 12                       ! extent of the third dimension of E
  character(len=*), parameter :: sink_file = 'Delete.txt'

  integer :: j                                        ! loop index
  integer :: i                                        ! loop index
  integer :: N                                        ! matrix dimension
  integer :: sink                                     ! unit number chosen by newunit=
  integer :: ios                                      ! status returned by open
  character(len=256) :: msg                           ! diagnostic returned by open
  real    :: t1, t2                                   ! start and end times
  real    :: t_manual, t_intrinsic, t_man_add, t_intrn_add

  real, dimension(:, :),    allocatable :: A, B       ! matrices
  real, dimension(:, :),    allocatable :: D
  real, dimension(:),       allocatable :: G
  real, dimension(:, :, :), allocatable :: E

  ! Let the runtime pick a free unit instead of hard-coding unit 1, and stop
  ! with a diagnostic if the file cannot be opened.
  open(newunit = sink, file = sink_file, status = 'unknown', &
       iostat = ios, iomsg = msg)
  if (ios /= 0) error stop 'test: cannot open Delete.txt'

  do N = 1, 40
    allocate(A(N,N), B(N,N), G(N), D(N, 2*N+1), E(N, N+1, NJ))

    ! ##########################################################################
    ! manual matrix multiplication vs matmul
    call rand_fill
    call cpu_time(t1)

    do i = 1, loop_max
      do j = 2, NJ
        call matmul_manual(j, N, NJ, A, B, D, G, E)
      end do
    end do

    call cpu_time(t2)
    t_manual = t2 - t1
    ! The arrays are dumped after each timed section; presumably this keeps
    ! the optimizer from discarding the timed work -- confirm with the author.
    write(sink, *) A, B, D, G, E

    call rand_fill
    call cpu_time(t1)

    do i = 1, loop_max
      do j = 2, NJ
        B = matmul( A, E(:, 1:N, j-1) )
      end do
    end do

    call cpu_time(t2)
    t_intrinsic = t2 - t1
    write(sink, *) A, B, D, G, E
    ! --------------------------------------------------------------------------

    ! ##########################################################################
    ! manual matrix multiplication with matrix addition
    call rand_fill
    call cpu_time(t1)

    do i = 1, loop_max
      do j = 2, NJ
        call manual_matmul_add(j, N, NJ, A, B, D, G, E)
      end do
    end do

    call cpu_time(t2)
    t_man_add = t2 - t1
    write(sink, *) A, B, D, G, E
    ! --------------------------------------------------------------------------

    ! ##########################################################################
    ! intrinsic matrix multiplication (matmul) with matrix addition
    call rand_fill
    call cpu_time(t1)

    do i = 1, loop_max
      do j = 2, NJ
        call intrinsic_matmul_add(j, N, NJ, A, B, D, G, E)
      end do
    end do

    call cpu_time(t2)
    t_intrn_add = t2 - t1
    write(sink, *) A, B, D, G, E
    ! --------------------------------------------------------------------------

    deallocate(A, B, D, G, E)

    print*, N, t_manual, t_intrinsic, t_man_add, t_intrn_add

  end do

  close(sink)

contains

  subroutine rand_fill
    ! fill the matrices with random numbers (host-associated A, B, D, G, E)
    call random_number(A)
    call random_number(B)
    call random_number(D)
    call random_number(G)
    call random_number(E)
  end subroutine rand_fill

end program test











subroutine matmul_manual(j, N, NJ, A, B, D, G, E)
  ! Overwrite B with the product of A and columns 1..N of slab j-1 of E,
  ! using explicit loops instead of the matmul intrinsic.
  !
  !   j    : selects slab j-1 of E
  !   N    : order of the square matrices A and B
  !   NJ   : extent of the third dimension of E
  !   A    : left factor (only read here)
  !   B    : receives the product; its previous contents are discarded
  !   D, G : not referenced by this routine
  !   E    : right factor taken from E(:, 1:N, j-1) (only read here)
  implicit none

  integer, intent(in)  :: j
  integer, intent(in)  :: N, NJ
  real, intent(in out) :: A(N, N), B(N, N)
  real, intent(in out) :: D(N, 2*N+1)
  real, intent(in out) :: G(N)
  real, intent(in out) :: E(N, N+1, NJ)

  integer :: row, mid, col   ! loop indices
  integer :: slab            ! index of the slab of E that is used
  real    :: a_rm            ! A(row, mid), reused across the column loop

  slab = j - 1
  B(:, :) = 0.0

  do row = 1, N
    do mid = 1, N
      a_rm = A(row, mid)
      do col = 1, N
        B(row, col) = B(row, col) + a_rm * E(mid, col, slab)
      end do
    end do
  end do

end subroutine matmul_manual






subroutine manual_matmul_add(j, N, NJ, A, B, D, G, E)
  ! Explicit-loop kernel that performs two updates in one pass over A:
  !   D(:, N+1) <- -G + A * E(:, N+1, j-1)
  !   B         <-  B + A * E(:, 1:N, j-1)
  !
  !   j  : selects slab j-1 of E
  !   N  : order of the square matrices A and B
  !   NJ : extent of the third dimension of E
  !   A  : left factor (only read here)
  !   B  : accumulator, updated in place
  !   D  : only column N+1 is written
  !   G  : vector subtracted from the matrix-vector product (only read here)
  !   E  : slab j-1 supplies both right-hand factors (only read here)
  implicit none

  integer, intent(in)  :: j
  integer, intent(in)  :: N, NJ
  real, intent(in out) :: A(N, N), B(N, N)
  real, intent(in out) :: D(N, 2*N+1)
  real, intent(in out) :: G(N)
  real, intent(in out) :: E(N, N+1, NJ)

  integer :: row, mid, col   ! loop indices
  integer :: slab            ! index of the slab of E that is used
  real    :: a_rm            ! A(row, mid), shared by both updates
  real    :: acc             ! running value of D(row, N+1)

  slab = j - 1

  do row = 1, N
    acc = -G(row)
    do mid = 1, N
      a_rm = A(row, mid)
      acc = acc + a_rm * E(mid, N+1, slab)
      do col = 1, N
        B(row, col) = B(row, col) + a_rm * E(mid, col, slab)
      end do
    end do
    D(row, N+1) = acc
  end do

end subroutine manual_matmul_add




subroutine intrinsic_matmul_add(j, N, NJ, A, B, D, G, E)
  ! matmul-based kernel performing two updates:
  !   D(:, N+1) <- -G + A * E(:, N+1, j-1)
  !   B         <-  B + A * E(:, 1:N, j-1)
  !
  !   j  : selects slab j-1 of E
  !   N  : order of the square matrices A and B
  !   NJ : extent of the third dimension of E
  !   A  : left factor (only read here)
  !   B  : accumulator, updated in place
  !   D  : only column N+1 is written
  !   G  : vector subtracted from the matrix-vector product (only read here)
  !   E  : slab j-1 supplies both right-hand factors (only read here)
  !
  ! The unused automatic work arrays of the earlier version were removed:
  ! they were never referenced, yet were sized from N on every call.
  implicit none

  integer, intent(in)                          :: j
  integer, intent(in)                          :: N, NJ
  real, dimension(N, N),        intent(in out) :: A, B
  real, dimension(N, 2*N+1),    intent(in out) :: D
  real, dimension(N),           intent(in out) :: G
  real, dimension(N, N+1, NJ),  intent(in out) :: E

  D(:, N+1) = -G + matmul( A, E(:, N+1, j-1) )
  B         =  B + matmul( A, E(:, 1:N, j-1) )

end subroutine intrinsic_matmul_add




subroutine mat_sub_new(j, N, NJ, A, B, D, G, E)
  ! Variant of the matmul-based kernel with a scalar shortcut for N = 1:
  !   D(:, N+1) <- -G + A * E(:, N+1, j-1)
  !   B         <-  B + A * E(:, 1:N, j-1)
  !
  !   j  : selects slab j-1 of E
  !   N  : order of the square matrices A and B
  !   NJ : extent of the third dimension of E
  !   A  : left factor (only read here)
  !   B  : accumulator, updated in place
  !   D  : only column N+1 is written
  !   G  : vector subtracted from the matrix-vector product (only read here)
  !   E  : slab j-1 supplies both right-hand factors (only read here)
  implicit none

  integer, intent(in)  :: j
  integer, intent(in)  :: N, NJ
  real, intent(in out) :: A(N, N), B(N, N)
  real, intent(in out) :: D(N, 2*N+1)
  real, intent(in out) :: G(N)
  real, intent(in out) :: E(N, N+1, NJ)

  integer :: slab   ! index of the slab of E that is used

  slab = j - 1

  select case (N)
  case (1)
    ! 1x1 system: plain scalar arithmetic, because matmul seems to be
    ! inefficient when N = 1 (the original author's observation).
    D(1, 2) = -G(1)   + A(1, 1) * E(1, 2, slab)
    B(1, 1) =  B(1, 1) + A(1, 1) * E(1, 1, slab)

  case default
    D(:, N+1) = -G + matmul(A, E(:, N+1, slab))
    B         =  B + matmul(A, E(:, 1:N, slab))
  end select

end subroutine mat_sub_new

Why does a manually programmed matrix multiplication combined with matrix addition give better performance than the intrinsic functions?

Answers (1)

Related Questions