pablo
pablo

Reputation: 69

Unexpected performance in Fortran routines

I have been playing with some Fortran routines and I would like to ask for guidance as to understand why some of them take more time than others.

The problem all routines solve is the following. Given a set of points find those that are too close. For that, the components of the points are compared and, if all of them are lower than a threshold, the two points are considered to be too close.

The first routine is the following:

subroutine routine1(npoints, ndis, points, r, is_close)
    implicit none
    integer, parameter :: dp = kind(0d0)
    integer, intent(in) :: npoints, ndis
    real(dp), dimension(npoints, ndis), intent(in) :: points
    real(dp), intent(in) :: r
    logical :: clos
    logical, dimension(npoints) :: is_close

    integer :: i, i_check
    
    is_close = .FALSE.
    do i=1,npoints
       clos = .FALSE.
       i_check = i
       do while (.not. clos .and. i_check < npoints)
          i_check = i_check + 1
          clos = all(abs(points(i,:) - points(i_check,:)) < r)
       enddo
       is_close(i) = clos
    enddo
end subroutine

In the second routine there is just one "small" change. I break the clos = all(abs(points(i,:) - points(i_check,:)) < r) into two lines. First, the difference is calculated and later the all is evaluated:

subroutine routine2(npoints, ndis, points, r, is_close)
    implicit none
    integer, parameter :: dp = kind(0d0)
    integer, intent(in) :: npoints, ndis
    real(dp), dimension(npoints, ndis), intent(in) :: points
    real(dp), intent(in) :: r
    logical :: clos
    logical, dimension(npoints) :: is_close

    integer :: i, i_check
    real(dp), dimension(ndis) :: diff
    
    is_close = .FALSE.
    do i=1,npoints
       clos = .FALSE.
       i_check = i
       do while (.not. clos .and. i_check < npoints)
          i_check = i_check + 1
          diff = abs(points(i,:) - points(i_check,:))
          clos = all(diff < r)
       enddo
       is_close(i) = clos
    enddo
end subroutine

The last routine starts from routine1 but since in Fortran one should run from outer to inner loops I first transpose the points so I can execute the loop from the outer index:

subroutine routine3(npoints, ndis, points, r, is_close)
    implicit none
    integer, parameter :: dp = kind(0d0)
    integer, intent(in) :: npoints, ndis
    real(dp), dimension(npoints, ndis), intent(in) :: points
    real(dp), intent(in) :: r
    logical :: clos
    logical, dimension(npoints) :: is_close
    real(dp), dimension(ndis,npoints) :: points_T

    integer :: i, i_check
    
    is_close = .FALSE.
    points_T = transpose(points)
    do i=1,npoints
       clos = .FALSE.
       i_check = i
       do while (.not. clos .and. i_check < npoints)
          i_check = i_check + 1
          clos = all( abs(points_T(:,i) - points_T(:,i_check))< r)
       enddo
       is_close(i) = clos
    enddo
end subroutine  

With this, I would expect routines 1 and 2 to perform basically the same and 3 slightly better. But when the performance is measured the results I get are the following (the full code to reproduce this calculation is available at the end of the question):

   npoints         routine1/s                 routine2/s                   routine3/s
      10   1.2979999999999999E-006   3.1230000000000000E-006   6.3010000000000003E-006
     100   2.7813000000000000E-005   1.9997000000000000E-004   3.8600999999999999E-005
    1000   2.5518790000000000E-003   1.4855343999999999E-002   1.1216130000000000E-003
   10000   8.0283021999999996E-002  0.64925195400000002        5.5415089000000001E-002
  100000   7.8920265199999999        79.243228029999997        16.829428664999998     

My questions are:

  1. Why routine2 takes so much longer than routine1 being basically the same?
  2. What am I missing with the order of indexes that explains routine3 being so slow?

Full code to reproduce this calculations, compiled with gfortran and -O3 flag:

program test                                                                               
    implicit none
    integer, parameter :: dp = kind(0.d0)
    integer, parameter :: ndis = 21
    integer :: ipoint, npoints
    real(dp), parameter :: r = 1e-5
    real(dp), dimension(:, :), allocatable :: points
    logical, dimension(:), allocatable :: is_close
    real(dp) :: t1, t2, t3
    integer(8) :: ini, fin, count_rate

    do ipoint = 1,5
        npoints = 10**ipoint
        allocate(points(npoints, ndis), is_close(npoints))
        ! Generate data
        call RANDOM_NUMBER(points)

        ! routine 1
        call SYSTEM_CLOCK(ini,count_rate=count_rate)
        call routine1(npoints, ndis, points, r, is_close)
        call SYSTEM_CLOCK(fin)
        t1 = fin - ini

        ! routine 2
        call SYSTEM_CLOCK(ini,count_rate=count_rate)
        call routine2(npoints, ndis, points, r, is_close)
        call SYSTEM_CLOCK(fin)
        t2 = fin - ini

        ! routine 3
        call SYSTEM_CLOCK(ini,count_rate=count_rate)
        call routine3(npoints, ndis, points, r, is_close)
        call SYSTEM_CLOCK(fin)
        t3 = fin - ini

        deallocate(points, is_close)

        print *, npoints, dble(t1/count_rate), dble(t2/count_rate), dble(t3/count_rate)
    end do
end program

subroutine routine1(npoints, ndis, points, r, is_close)
    implicit none
    integer, parameter :: dp = kind(0d0)
    integer, intent(in) :: npoints, ndis
    real(dp), dimension(npoints, ndis), intent(in) :: points
    real(dp), intent(in) :: r
    logical :: clos
    logical, dimension(npoints) :: is_close

    integer :: i, i_check
    
    is_close = .FALSE.
    do i=1,npoints
       clos = .FALSE.
       i_check = i
       do while (.not. clos .and. i_check < npoints)
          i_check = i_check + 1
          clos = all(abs(points(i,:) - points(i_check,:)) < r)
       enddo
       is_close(i) = clos
    enddo
end subroutine

subroutine routine2(npoints, ndis, points, r, is_close)
    implicit none
    integer, parameter :: dp = kind(0d0)
    integer, intent(in) :: npoints, ndis
    real(dp), dimension(npoints, ndis), intent(in) :: points
    real(dp), intent(in) :: r
    logical :: clos
    logical, dimension(npoints) :: is_close

    integer :: i, i_check
    real(dp), dimension(ndis) :: diff
    
    is_close = .FALSE.
    do i=1,npoints
       clos = .FALSE.
       i_check = i
       do while (.not. clos .and. i_check < npoints)
          i_check = i_check + 1
          diff = abs(points(i,:) - points(i_check,:))
          clos = all(diff < r)
       enddo
       is_close(i) = clos
    enddo
end subroutine

subroutine routine3(npoints, ndis, points, r, is_close)
    implicit none
    integer, parameter :: dp = kind(0d0)
    integer, intent(in) :: npoints, ndis
    real(dp), dimension(npoints, ndis), intent(in) :: points
    real(dp), intent(in) :: r
    logical :: clos
    logical, dimension(npoints) :: is_close
    real(dp), dimension(ndis,npoints) :: points_T

    integer :: i, i_check
    
    is_close = .FALSE.
    points_T = transpose(points)
    do i=1,npoints
       clos = .FALSE.
       i_check = i
       do while (.not. clos .and. i_check < npoints)
          i_check = i_check + 1
          clos = all( abs(points_T(:,i) - points_T(:,i_check))< r)
       enddo
       is_close(i) = clos
    enddo
end subroutine

Upvotes: 2

Views: 134

Answers (1)

Serge3leo
Serge3leo

Reputation: 549

  1. The optimizer's constraints probably lead to a complete calculation of the diff array;
  2. Of course, points_T(k,i) and points_T(k+1,i) are directly adjacent in RAM, so the overhead is substantially less. In particular, for most computers, the amount of data sent between RAM and cache is several times less. But the cost of transposing is just as high, and judging by the time difference between routine1() and routine2(), all() in the main loop processes no more than 1/10 of the row of the points() array.

Upvotes: 3

Related Questions