Reputation: 33
I have a Fortran program that uses MPI derived types to describe subarrays for data transfers, which saves me from creating send/receive buffers manually. This works nicely, but when accelerating the program with OpenACC, the subarrays may not be contiguous in memory and can take more or less time to be copied from GPU memory depending on their layout.
Here is a (rather long) minimal example of two cubic domains of n**3 elements exchanging one face with each other (domain 0 sends face n - 1 and receives face n; domain 1 sends face 2 and receives face 1). The face to use is either an i, j, or k face, and must be set at compile time with -DORDER_I, -DORDER_J, or -DORDER_K. The subarray MPI types are created with mpi_type_create_subarray and the data are exchanged with mpi_sendrecv:
#if defined(ORDER_I)
#define SUBARRAY_ELEMENTS(size) [size, elements, elements]
#define SUBARRAY_INDEX(index) [index, 0, 0]
#define ARRAY(array, dim, other) array(dim, other, other)
#elif defined(ORDER_J)
#define SUBARRAY_ELEMENTS(size) [elements, size, elements]
#define SUBARRAY_INDEX(index) [0, index, 0]
#define ARRAY(array, dim, other) array(other, dim, other)
#elif defined(ORDER_K)
#define SUBARRAY_ELEMENTS(size) [elements, elements, size]
#define SUBARRAY_INDEX(index) [0, 0, index]
#define ARRAY(array, dim, other) array(other, other, dim)
#else
#error "Define ORDER_I, ORDER_J, or ORDER_K"
#endif

#ifndef ELEMENTS
#define ELEMENTS 100
#endif

#ifndef ITERATIONS
#define ITERATIONS 10
#endif

program test
    use mpi
    use openacc

    implicit none

    integer, allocatable :: array(:, :, :)
    integer, parameter :: elements = ELEMENTS
    integer, parameter :: iterations = ITERATIONS
    integer :: ierror
    integer :: rank, rank_other
    integer :: recv_index, send_index
    integer :: recv_type, send_type
    integer :: gpu_device, n_gpu
    integer :: k, i

    call mpi_init(ierror)
    call mpi_comm_rank(MPI_COMM_WORLD, rank, ierror)

    ! assign GPU to MPI rank
    gpu_device = acc_get_device_type()
    n_gpu = acc_get_num_devices(gpu_device)
    call acc_set_device_num(mod(rank, n_gpu), gpu_device)

    ! set parameters according to MPI rank
    if (rank == 0) then
        rank_other = 1
        send_index = elements - 1
        recv_index = elements
    else
        rank_other = 0
        send_index = 2
        recv_index = 1
    end if

    ! create mpi types
    call mpi_type_create_subarray( &
        3, &
        [elements, elements, elements], &
        SUBARRAY_ELEMENTS(1), &
        SUBARRAY_INDEX(send_index - 1), &
        MPI_ORDER_FORTRAN, &
        MPI_INTEGER, &
        send_type, &
        ierror &
    )
    call mpi_type_commit(send_type, ierror)

    call mpi_type_create_subarray( &
        3, &
        [elements, elements, elements], &
        SUBARRAY_ELEMENTS(1), &
        SUBARRAY_INDEX(recv_index - 1), &
        MPI_ORDER_FORTRAN, &
        MPI_INTEGER, &
        recv_type, &
        ierror &
    )
    call mpi_type_commit(recv_type, ierror)

    ! create arrays
    allocate(array(elements, elements, elements))

    !$acc data copyout(array)

    !$acc kernels loop independent
    do i = 1, elements
        ARRAY(array, i, :) = rank * 1000 + i
    end do
    !$acc end kernels

    !$acc host_data use_device(array)
    do k = 1, iterations
        !$acc kernels
        array(:, :, :) = array(:, :, :) + 1
        !$acc end kernels

        ! transfer data
        call mpi_sendrecv( &
            array, &
            1, &
            send_type, &
            rank_other, &
            10, &
            array, &
            1, &
            recv_type, &
            rank_other, &
            10, &
            MPI_COMM_WORLD, &
            MPI_STATUS_IGNORE, &
            ierror &
        )
    end do
    !$acc end host_data

    !$acc end data

    ! print outcome
    print "(i0, ':', 2(x, i5), ' ...', 2(x, i5))", rank, ARRAY(array, 1:2, 1), ARRAY(array, elements-1:elements, 1)

    deallocate(array)
    call mpi_finalize(ierror)
end program
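Each variant can be built and run with something like the following (the compiler wrapper, source file name, and flags shown here are indicative, not necessarily the exact ones I used):

# build the ORDER_I variant; the .F90 extension enables preprocessing
mpif90 -acc -Minfo=accel -DORDER_I -o test_i test.F90
# run with 2 ranks, one GPU each
mpirun -np 2 ./test_i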
Compiled with the NVHPC SDK 22.7 and executed on 2 NVIDIA P100 GPUs, the code shows that memory copy times differ significantly: order i is the worst (86 % of the time is spent copying), followed by order j (6 %) and order k (less than 1 %). This makes sense, as a k face is contiguous in memory. Visualizing the runs in NVIDIA Nsight Systems gives:
The graph shows the memory operations sandwiched between two dummy computations.
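A command along these lines can be used to capture such a timeline, producing one report per rank (the tracing options are indicative, and the OMPI_COMM_WORLD_RANK variable assumes an Open MPI launcher):

# one Nsight Systems report per MPI rank (rank variable name assumes Open MPI)
mpirun -np 2 nsys profile --trace=cuda,openacc,mpi --output=report_i.%q{OMPI_COMM_WORLD_RANK} ./test_i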
I suppose that using MPI types implies the creation of buffers under the hood. Unfortunately, the way these buffers are handled does not seem optimal, as many separate memory operations to and from GPU memory may be performed.
Rewriting the example with manual buffer creation, the code becomes:
#if defined(ORDER_I)
#define SUBARRAY_ELEMENTS(size) [size, elements, elements]
#define SUBARRAY_INDEX(index) [index, 0, 0]
#define ARRAY(array, dim, other) array(dim, other, other)
#define ARRAY2(array, dim, other1, other2) array(dim, other1, other2)
#elif defined(ORDER_J)
#define SUBARRAY_ELEMENTS(size) [elements, size, elements]
#define SUBARRAY_INDEX(index) [0, index, 0]
#define ARRAY(array, dim, other) array(other, dim, other)
#define ARRAY2(array, dim, other1, other2) array(other1, dim, other2)
#elif defined(ORDER_K)
#define SUBARRAY_ELEMENTS(size) [elements, elements, size]
#define SUBARRAY_INDEX(index) [0, 0, index]
#define ARRAY(array, dim, other) array(other, other, dim)
#define ARRAY2(array, dim, other1, other2) array(other1, other2, dim)
#else
#error "Define ORDER_I, ORDER_J, or ORDER_K"
#endif

#ifndef ELEMENTS
#define ELEMENTS 100
#endif

#ifndef ITERATIONS
#define ITERATIONS 10
#endif

program test
    use mpi
    use openacc

    implicit none

    integer, allocatable :: array(:, :, :)
    integer, allocatable :: send_buffer(:), recv_buffer(:)
    integer, parameter :: elements = ELEMENTS
    integer, parameter :: iterations = ITERATIONS
    integer :: ierror
    integer :: rank, rank_other
    integer :: recv_index, send_index
    integer :: gpu_device, n_gpu
    integer :: k, i, j

    call mpi_init(ierror)
    call mpi_comm_rank(MPI_COMM_WORLD, rank, ierror)

    ! assign GPU to MPI rank
    gpu_device = acc_get_device_type()
    n_gpu = acc_get_num_devices(gpu_device)
    call acc_set_device_num(mod(rank, n_gpu), gpu_device)

    ! set parameters according to MPI rank
    if (rank == 0) then
        rank_other = 1
        send_index = elements - 1
        recv_index = elements
    else
        rank_other = 0
        send_index = 2
        recv_index = 1
    end if

    ! create arrays
    allocate(array(elements, elements, elements))
    allocate(send_buffer(elements * elements))
    allocate(recv_buffer(elements * elements))

    !$acc data copyout(array) copyout(send_buffer, recv_buffer)

    !$acc kernels loop independent
    do i = 1, elements
        ARRAY(array, i, :) = rank * 1000 + i
    end do
    !$acc end kernels

    !$acc host_data use_device(send_buffer, recv_buffer)
    do k = 1, iterations
        !$acc kernels
        array(:, :, :) = array(:, :, :) + 1
        !$acc end kernels

        ! data to buffer
        !$acc kernels loop independent collapse(2)
        do j = 1, elements
            do i = 1, elements
                send_buffer((j - 1) * elements + i) = ARRAY2(array, send_index, i, j)
            end do
        end do
        !$acc end kernels

        ! transfer data
        call mpi_sendrecv( &
            send_buffer, &
            size(send_buffer), &
            MPI_INTEGER, &
            rank_other, &
            10, &
            recv_buffer, &
            size(recv_buffer), &
            MPI_INTEGER, &
            rank_other, &
            10, &
            MPI_COMM_WORLD, &
            MPI_STATUS_IGNORE, &
            ierror &
        )

        ! buffer to data
        !$acc kernels loop independent collapse(2)
        do j = 1, elements
            do i = 1, elements
                ARRAY2(array, recv_index, i, j) = recv_buffer((j - 1) * elements + i)
            end do
        end do
        !$acc end kernels
    end do
    !$acc end host_data

    !$acc end data

    ! print outcome
    print "(i0, ':', 2(x, i5), ' ...', 2(x, i5))", rank, ARRAY(array, 1:2, 1), ARRAY(array, elements-1:elements, 1)

    deallocate(array)
    call mpi_finalize(ierror)
end program
Executed in the same environment, the time spent in copying is almost the same (less than 1 %) whichever face is used.
So my question is: is there a way to speed up the version that uses MPI types (the first code)?
Upvotes: 3
Views: 169