MPI_Scatterv from Intel MPI (mpiifort) using MPI data types is much slower (23 times) compared to flattening array and scattering. Why it could be?

Question

ifort version: (IFORT) 2021.8.0 20221119 OS: WSL Ubuntu 20.04LTS

I have a (1000x1000x1000) 3D array to distribute among procs. In the first approach, I flatten the array and then distribute arrays among procs and it takes about 7.86sec
In the second approach I use MPI-derived data types to Scatterv the 3D array and I noticed that it takes about 165.34sec. But Gatherv of the same data took about 14.24 sec.
What could be the reason for this inconsistency? I expected Scatterv to take similar time as Gatherv

Here is the code

program ex_scatterv
  use mpi
  use iso_fortran_env, only : real64
  implicit none

!allocate arrays
  real(real64), allocatable,dimension(:,:,:) :: array, array_local
  real(real64), allocatable,dimension(:) :: array_flat, array_local_flat
  integer :: rank, num_procs, i, j, k
  integer :: nx, ny, nz, str_idx, end_idx, local_size, local_size_flat
  integer, dimension(:), allocatable :: sendcounts, displacements
  integer :: sizes(3), sub_sizes(3), starts(3), recv_starts(3), recv_sizes(3), &
    send_type, resize_send_type, recv_type, resize_recv_type
  integer(kind=8) :: lb, extent, lb_resize
  real(real64) :: start_time
  integer :: mpierr
  call mpi_init(mpierr)
  call mpi_comm_size(mpi_comm_world, num_procs, mpierr)
  call mpi_comm_rank(mpi_comm_world, rank, mpierr)

!size of array
  nx=1000
  ny=1000
  nz=1000

  if(rank==0) then
    if(num_procs>nx) then
      print*, "Number of procs should be less than or equal to first dimension of the array"
      call MPI_Abort(mpi_comm_world, 1, mpierr)
    endif
  endif

  start_time=MPI_Wtime()
!allocate in the root rank
  if(rank==0) then
    allocate(array(nx,ny,nz))
    allocate(array_flat(nx*ny*nz))
  else !for other procs allocate with zero size
    allocate(array(0,0,0))
  endif

!assign values to the array
  if(rank==0) then
    do k=1,nz
      do j=1,ny
        do i=1,nx
          array(i,j,k) = (i-1)+(j-1)*nx+(k-1)*nx*ny
        end do
      end do
    end do
!print*, "Before scattering..."
!print*, array
!flatten the 3D array
    forall(k=1:nz, j=1:ny, i=1:nx) array_flat(k+(j-1)*nz+(i-1)*ny*nz)=array(i,j,k)
  endif

!distribute the 3d array among different procs
  call distribute_points(nx, rank, num_procs, str_idx, end_idx)
  local_size = end_idx - str_idx + 1
  local_size_flat = local_size*ny*nz

!allocate local(for each rank) arrays
  allocate(array_local_flat(local_size_flat))
  allocate(array_local(local_size, ny, nz))

!allocate sendcoutns and displacements arrays for braodcasting
  allocate(sendcounts(num_procs), displacements(num_procs))

!gather displacements and sendcounts for all ranks
  call MPI_Allgather(str_idx, 1, MPI_INTEGER, displacements, 1, MPI_INTEGER, &
    MPI_COMM_WORLD, mpierr)
  call MPI_Allgather(local_size, 1, MPI_INTEGER, sendcounts, 1, &
    MPI_INTEGER, MPI_COMM_WORLD, mpierr)

!total sendcounts and displacements
  sendcounts = sendcounts*ny*nz
  displacements = displacements - 1 !Array index starts with 0 in MPI (C)
  displacements = displacements*ny*nz

!scatter the flattened array among procs
  call MPI_Scatterv(array_flat, sendcounts, displacements, MPI_DOUBLE_PRECISION, &
    array_local_flat, local_size*ny*nz, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, &
    mpierr)

!form 3D array from flattened local array
  forall(k=1:nz, j=1:ny, i=1:local_size) array_local(i,j,k) = &
    array_local_flat(k+(j-1)*nz+(i-1)*ny*nz)

!print*, "Scattered array: ", rank
!print*, array_local
  if(rank==0) then
    print*, "Time taken by flatten and scatter: ", MPI_Wtime()-start_time
  endif

  call MPI_Barrier(mpi_comm_world, mpierr)
!deallocate(array_flat, array_local_flat)

  start_time=MPI_Wtime()
!Scatterning using subarray type
  sizes = [nx, ny, nz]
  recv_sizes=[local_size, ny, nz]
  sub_sizes = [1, ny, nz]
  starts = [0, 0, 0]
  recv_starts = [0, 0, 0]

!to get extent of MPI_DOUBLE_PRECISION
  call MPI_Type_get_extent(MPI_DOUBLE_PRECISION, lb, extent, mpierr)

!create a mpi subarray data type for sending data
  call MPI_Type_create_subarray(3, sizes, sub_sizes, starts, &
    MPI_ORDER_FORTRAN, MPI_DOUBLE_PRECISION, send_type, mpierr)
  lb_resize=0
!resize the send subarray for starting at correct location for next send
  call MPI_Type_create_resized(send_type, lb_resize, extent, &
    resize_send_type, mpierr)
  call MPI_Type_commit(resize_send_type, mpierr)

!create a mpi subarray data type for receiving data
  call MPI_Type_create_subarray(3, recv_sizes, sub_sizes, recv_starts, &
    MPI_ORDER_FORTRAN, MPI_DOUBLE_PRECISION, recv_type, mpierr)

!resize the receive subarray for starting at correct location for next receive
  call MPI_Type_create_resized(recv_type, lb_resize, extent, &
    resize_recv_type, mpierr)
  call MPI_Type_commit(resize_recv_type, mpierr)

!sendcounts and displacement for sending and receiving subarrays
  sendcounts=sendcounts/(ny*nz)
  displacements = displacements/(ny*nz)

  if(rank==0) then
    print*, "Time taken for creating MPI type subarrays: ", MPI_Wtime()-start_time
  endif

  call MPI_Barrier(mpi_comm_world, mpierr)
  start_time=MPI_Wtime()
!scatter the subarrays
  call MPI_Scatterv(array, sendcounts, displacements, resize_send_type, &
    array_local, sendcounts, resize_recv_type, 0, MPI_COMM_WORLD, mpierr)

  if(rank==0) then
    print*, "Time taken for scattering using MPI type subarrays: ", MPI_Wtime()-start_time
  endif
  call MPI_Barrier(mpi_comm_world, mpierr)
!print the scattered array
!print*, "Scattered array with subarray: ", rank
!print*, array_local

!do some computations on the scattered local arrays
  array_local = array_local+1

  call MPI_Barrier(mpi_comm_world, mpierr)
  start_time=MPI_Wtime()
!Gather the local arrays to global (array) using the same subarrays
  call MPI_Gatherv(array_local, local_size, resize_recv_type, array, &
    sendcounts, displacements, resize_send_type, 0, MPI_COMM_WORLD, mpierr)

  if(rank==0) then
    print*, "Time taken by MPI_Type_create_subarray Gathering: ", MPI_Wtime()-start_time
  endif

!if(rank==0) then
! print*, "Gathered array: ------------------"
! print*, array
!endif
  call MPI_Finalize(mpierr)



  contains

  subroutine distribute_points(npts, rank, size, start_idx, end_idx)
    implicit none

    integer, intent(in) :: npts, size, rank
    integer, intent(out) :: start_idx, end_idx
    integer :: pts_per_proc

    pts_per_proc = npts/size

    if(rank < mod(npts, size)) then
      pts_per_proc=pts_per_proc + 1
    end if

    if(rank < mod(npts, size)) then
      start_idx = rank * pts_per_proc + 1
      end_idx = (rank + 1) * pts_per_proc
    else
      start_idx = mod(npts, size) + rank*pts_per_proc + 1
      end_idx = mod(npts, size) + (rank + 1) * pts_per_proc
    end if

  end subroutine distribute_points
end program ex_scatterv

MPI_Scatterv from Intel MPI (mpiifort) using MPI data types is much slower (23 times) compared to flattening array and scattering. Why it could be?

Answers (1)

Related Questions