Unexpected performance in Fortran routines

Question

I have been playing with some Fortran routines and I would like to ask for guidance as to understand why some of them take more time than others.

The problem all routines solve is the following. Given a set of points find those that are too close. For that, the components of the points are compared and, if all of them are lower than a threshold, the two points are considered to be too close.

The first routine is the following:

subroutine routine1(npoints, ndis, points, r, is_close)
    implicit none
    integer, parameter :: dp = kind(0d0)
    integer, intent(in) :: npoints, ndis
    real(dp), dimension(npoints, ndis), intent(in) :: points
    real(dp), intent(in) :: r
    logical :: clos
    logical, dimension(npoints) :: is_close

    integer :: i, i_check
    
    is_close = .FALSE.
    do i=1,npoints
       clos = .FALSE.
       i_check = i
       do while (.not. clos .and. i_check < npoints)
          i_check = i_check + 1
          clos = all(abs(points(i,:) - points(i_check,:)) < r)
       enddo
       is_close(i) = clos
    enddo
end subroutine

In the second routine there is just one "small" change. I break the clos = all(abs(points(i,:) - points(i_check,:)) < r) into two lines. First, the difference is calculated and later the all is evaluated:

subroutine routine2(npoints, ndis, points, r, is_close)
    implicit none
    integer, parameter :: dp = kind(0d0)
    integer, intent(in) :: npoints, ndis
    real(dp), dimension(npoints, ndis), intent(in) :: points
    real(dp), intent(in) :: r
    logical :: clos
    logical, dimension(npoints) :: is_close

    integer :: i, i_check
    real(dp), dimension(ndis) :: diff
    
    is_close = .FALSE.
    do i=1,npoints
       clos = .FALSE.
       i_check = i
       do while (.not. clos .and. i_check < npoints)
          i_check = i_check + 1
          diff = abs(points(i,:) - points(i_check,:))
          clos = all(diff < r)
       enddo
       is_close(i) = clos
    enddo
end subroutine

The last routine starts from routine1 but since in Fortran one should run from outer to inner loops I first transpose the points so I can execute the loop from the outer index:

subroutine routine3(npoints, ndis, points, r, is_close)
    implicit none
    integer, parameter :: dp = kind(0d0)
    integer, intent(in) :: npoints, ndis
    real(dp), dimension(npoints, ndis), intent(in) :: points
    real(dp), intent(in) :: r
    logical :: clos
    logical, dimension(npoints) :: is_close
    real(dp), dimension(ndis,npoints) :: points_T

    integer :: i, i_check
    
    is_close = .FALSE.
    points_T = transpose(points)
    do i=1,npoints
       clos = .FALSE.
       i_check = i
       do while (.not. clos .and. i_check < npoints)
          i_check = i_check + 1
          clos = all( abs(points_T(:,i) - points_T(:,i_check))< r)
       enddo
       is_close(i) = clos
    enddo
end subroutine

With this, I would expect routines 1 and 2 to perform basically the same and 3 slightly better. But when the performance is measured the results I get are the following (the full code to reproduce this calculation is available at the end of the question):

   npoints         routine1/s                 routine2/s                   routine3/s
      10   1.2979999999999999E-006   3.1230000000000000E-006   6.3010000000000003E-006
     100   2.7813000000000000E-005   1.9997000000000000E-004   3.8600999999999999E-005
    1000   2.5518790000000000E-003   1.4855343999999999E-002   1.1216130000000000E-003
   10000   8.0283021999999996E-002  0.64925195400000002        5.5415089000000001E-002
  100000   7.8920265199999999        79.243228029999997        16.829428664999998

My questions are:

Why routine2 takes so much longer than routine1 being basically the same?
What am I missing with the order of indexes that explains routine3 being so slow?

Full code to reproduce this calculations, compiled with gfortran and -O3 flag:

program test                                                                               
    implicit none
    integer, parameter :: dp = kind(0.d0)
    integer, parameter :: ndis = 21
    integer :: ipoint, npoints
    real(dp), parameter :: r = 1e-5
    real(dp), dimension(:, :), allocatable :: points
    logical, dimension(:), allocatable :: is_close
    real(dp) :: t1, t2, t3
    integer(8) :: ini, fin, count_rate

    do ipoint = 1,5
        npoints = 10**ipoint
        allocate(points(npoints, ndis), is_close(npoints))
        ! Generate data
        call RANDOM_NUMBER(points)

        ! routine 1
        call SYSTEM_CLOCK(ini,count_rate=count_rate)
        call routine1(npoints, ndis, points, r, is_close)
        call SYSTEM_CLOCK(fin)
        t1 = fin - ini

        ! routine 2
        call SYSTEM_CLOCK(ini,count_rate=count_rate)
        call routine2(npoints, ndis, points, r, is_close)
        call SYSTEM_CLOCK(fin)
        t2 = fin - ini

        ! routine 3
        call SYSTEM_CLOCK(ini,count_rate=count_rate)
        call routine3(npoints, ndis, points, r, is_close)
        call SYSTEM_CLOCK(fin)
        t3 = fin - ini

        deallocate(points, is_close)

        print *, npoints, dble(t1/count_rate), dble(t2/count_rate), dble(t3/count_rate)
    end do
end program

subroutine routine1(npoints, ndis, points, r, is_close)
    implicit none
    integer, parameter :: dp = kind(0d0)
    integer, intent(in) :: npoints, ndis
    real(dp), dimension(npoints, ndis), intent(in) :: points
    real(dp), intent(in) :: r
    logical :: clos
    logical, dimension(npoints) :: is_close

    integer :: i, i_check
    
    is_close = .FALSE.
    do i=1,npoints
       clos = .FALSE.
       i_check = i
       do while (.not. clos .and. i_check < npoints)
          i_check = i_check + 1
          clos = all(abs(points(i,:) - points(i_check,:)) < r)
       enddo
       is_close(i) = clos
    enddo
end subroutine

subroutine routine2(npoints, ndis, points, r, is_close)
    implicit none
    integer, parameter :: dp = kind(0d0)
    integer, intent(in) :: npoints, ndis
    real(dp), dimension(npoints, ndis), intent(in) :: points
    real(dp), intent(in) :: r
    logical :: clos
    logical, dimension(npoints) :: is_close

    integer :: i, i_check
    real(dp), dimension(ndis) :: diff
    
    is_close = .FALSE.
    do i=1,npoints
       clos = .FALSE.
       i_check = i
       do while (.not. clos .and. i_check < npoints)
          i_check = i_check + 1
          diff = abs(points(i,:) - points(i_check,:))
          clos = all(diff < r)
       enddo
       is_close(i) = clos
    enddo
end subroutine

subroutine routine3(npoints, ndis, points, r, is_close)
    implicit none
    integer, parameter :: dp = kind(0d0)
    integer, intent(in) :: npoints, ndis
    real(dp), dimension(npoints, ndis), intent(in) :: points
    real(dp), intent(in) :: r
    logical :: clos
    logical, dimension(npoints) :: is_close
    real(dp), dimension(ndis,npoints) :: points_T

    integer :: i, i_check
    
    is_close = .FALSE.
    points_T = transpose(points)
    do i=1,npoints
       clos = .FALSE.
       i_check = i
       do while (.not. clos .and. i_check < npoints)
          i_check = i_check + 1
          clos = all( abs(points_T(:,i) - points_T(:,i_check))< r)
       enddo
       is_close(i) = clos
    enddo
end subroutine

Unexpected performance in Fortran routines

Answers (1)

Related Questions