Reputation: 23
I don't get why the program deadlocks on rank=3
in first iteration. (Possibly different for you).
I want to do some batch processing, and if the batch size isn't a multiple of the number of processes I have to handle that case.
I would expect the following program flow:

1. rank=1 exits: disconnects from activ_com, waits on MPI_Finalize
2. rank=0 sends data, rank=2,3 receive data
3. rank=3 exits: disconnects from activ_com, waits on MPI_Finalize (in real life it is possible that 5. happens before the disconnect)
4. rank=0 sends data, rank=2 receives data
5. rank=0,1 exits

It suggests that I didn't understand something fundamental, but I can't figure out what is happening here and why.
I tried to study the docs [3] as suggested in the solution of [2]. Below you find an MRE that shows my case.
As you can see, I try to use a sub-communicator to Bcast only on the active processes. The deadlock appears in step (3), in the first iteration.
I already got an explanation how to handle such case with MPI_Scatterv
and MPI_Gatherv
. [1]
But I think there should be another possibility than using MPI_Scatterv
instead of MPI_Bcast
Another possible solution could be the use of Session Model
, but I think it is crucial to understand what is happening here.
Thank you very much for your help and time.
#include <mpi.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <time.h>
#include <stdbool.h>
#define N 32                /* number of longs broadcast per round */
static int world_size;      /* size of MPI_COMM_WORLD, set once in main() */
MPI_Comm activ_com;         /* communicator meant to hold only the still-active ranks */
#define COM_ACTIVE_TAG 1    /* color value for the (commented-out) MPI_Comm_split */
/*
 * Detach the calling rank from activ_com before it leaves the program.
 *
 * NOTE(review): MPI_Comm_disconnect is a collective call over the
 * communicator, so every member of activ_com must reach it.  Here rank 1
 * calls it alone while ranks 0, 2 and 3 are still issuing MPI_Bcast on
 * the same communicator, so the collectives do not match up -- this is
 * the source of the hang described above.  In addition,
 * MPI_Comm_disconnect is intended for communicators created via the
 * dynamic-process interface (connect/accept/spawn); for a plain dup of
 * MPI_COMM_WORLD, MPI_Comm_free would be the appropriate call.
 */
static void signoff(int rank)
{
printf("Registered for Signoff: rank %d out of %d processors\n", rank, world_size);
/* tested splitting for signoff */
//MPI_Comm_split(MPI_COMM_WORLD, MPI_UNDEFINED, rank, &activ_com);
if (rank != 0) {
/* Diagnostic only: report (without receiving) any message that is
 * still in flight on activ_com for this rank. */
int process = 0;
MPI_Status status;
MPI_Iprobe(MPI_ANY_SOURCE,MPI_ANY_TAG, activ_com, &process, &status);
if (process)
printf("Communication still pending from %d on %d\n", status.MPI_SOURCE, rank);
}
/* Collective over ALL of activ_com; blocks until pending communication
 * on the communicator has completed. */
MPI_Comm_disconnect(&activ_com);
}
/*
 * MRE demonstrating the deadlock: rank 1 leaves early via signoff(),
 * while the remaining ranks keep broadcasting on activ_com.
 */
int main(int argc, char** argv) {
MPI_Init(NULL, NULL); // initialize MPI environment
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
int world_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
//MPI_Comm_split(MPI_COMM_WORLD, COM_ACTIVE_TAG, world_rank, &activ_com);
/* activ_com is a duplicate of MPI_COMM_WORLD, so it still CONTAINS
 * rank 1 even after rank 1 "signs off" below -- collectives on
 * activ_com therefore still require rank 1's participation. */
MPI_Comm_dup(MPI_COMM_WORLD, &activ_com);
/* NOTE(review): 'workers' and 't' are never used below. */
int workers[world_size];
for (int i = 0; i < world_size; ++i)
workers[i] = true;
long dummy[N];
time_t t;
/* Different seeds so the test ranks sleep for different times. */
if (world_rank == 0)
srand(3);
if (world_rank == 2)
srand(7);
if (world_rank == 0) {
for (int i = 0; i < N; ++i)
dummy[i] = i;
}
/* Rank 1 exits immediately: it disconnects from activ_com and never
 * takes part in the broadcasts below. */
if (world_rank == 1) {
//printf("Registered for Signoff: rank %d out of %d processors\n", world_rank, world_size);
signoff(world_rank);
MPI_Finalize();
printf("Signoff: rank %d out of %d processors\n", world_rank, world_size);
return 0;
}
/* Batch loop: each rank handles items world_rank, world_rank+size, ...
 * Since 7 is not a multiple of world_size, ranks drop out of the loop
 * at different iterations. */
for(int i = world_rank; i < 7; i+=world_size) {
int wait = 0;
/* use different wait times for testing */
if (world_rank != 3)
wait = rand() % 10;
printf("Bcast: %d/%d, i=%d, waittime=%d\n", world_rank, world_size, i, wait);
sleep(wait);
/* rank=3 is hanging here in the first iteration */
/* NOTE(review): MPI_Bcast is collective over ALL members of
 * activ_com -- including rank 1, which has already called
 * MPI_Comm_disconnect and will never post this Bcast.  The Bcast
 * waits for rank 1 while rank 1's (collective) disconnect waits for
 * the others: the collectives are mismatched, hence the deadlock.
 * ('&dummy' happens to work because the address of an array equals
 * the address of its first element, but plain 'dummy' is the usual
 * spelling.) */
MPI_Bcast(&dummy, N, MPI_LONG, 0, activ_com);
printf("Bcast: %d/%d, i=%d, processed\n", world_rank, world_size, i);
}
/* keep rank alive for testing */
if (world_rank == 3)
sleep(5);
signoff(world_rank);
/* NOTE(review): signoff() already disconnected activ_com, so this
 * frees an invalid handle -- TODO confirm against the MPI standard. */
MPI_Comm_free(&activ_com);
MPI_Finalize();
printf("Signoff: rank %d out of %d processors\n", world_rank, world_size);
return 0;
}
using:
$ mpirun --version
mpirun (Open MPI) 4.1.2
$ mpicc --version
gcc (Ubuntu 11.3.0-1ubuntu1~22.04.1) 11.3.0
[1] Better way than loop over MPI_Send/MPI_Recv if data not divisable my COMM_SIZE?
[2] MPI Bcast and the number of processes
[3] https://www.mpi-forum.org/docs/mpi-4.0/mpi40-report.pdf
Upvotes: 0
Views: 74
Reputation: 23
Thanks to @janneb and @GillesGouaillardet — their hints let me experiment with the right things, it seems.
The following example shows how (I think) to use MPI_Comm_split
correctly to create different communicators that handle early exits of processes.
The key idea is to create the communicators up front, before the loop, instead of creating them inside the loop.
We create two communicators: activ_com
, which contains every process that may loop at all, and loop_com
, a subset of activ_com
containing only the processes that take part in the last iteration.
#include <mpi.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <time.h>
#include <stdbool.h>
#define N 32              /* number of longs broadcast per round */
static int world_size;    /* size of MPI_COMM_WORLD, set once in main() */
MPI_Comm activ_com;       /* communicator of the ranks that enter the loop at all */
#define COM_ACTIVE 1      /* split color for activ_com membership */
#define COM_LOOP 2        /* split color for loop_com membership */
/*
 * Early exit path for a rank that does no work.
 *
 * The MPI_Comm_split below matches the split the active ranks perform
 * in main(); passing MPI_UNDEFINED as the color means this rank is not
 * placed in any resulting communicator (activ_com becomes
 * MPI_COMM_NULL here).  Because MPI_Comm_split is collective over
 * MPI_COMM_WORLD, every rank -- including this one -- must call it.
 */
static void signoff(int rank)
{
printf("Registered for Signoff: rank %d out of %d processors\n", rank, world_size);
MPI_Comm_split(MPI_COMM_WORLD, MPI_UNDEFINED, rank, &activ_com);
MPI_Finalize();
printf("Signoff: rank %d out of %d processors\n", rank, world_size);
}
int main(int argc, char** argv) {
MPI_Init(NULL, NULL); // initialize MPI environment
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
int world_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
int workers[world_size];
for (int i = 0; i < world_size; ++i)
workers[i] = true;
long dummy[N];
time_t t;
if (world_rank == 0)
srand(3);
if (world_rank == 2)
srand(7);
if (world_rank == 0) {
for (int i = 0; i < N; ++i)
dummy[i] = i;
}
if (world_rank == 1) {
signoff(world_rank);
return 0;
}
MPI_Comm_split(MPI_COMM_WORLD, COM_ACTIVE, world_rank, &activ_com);
const int nmax = 7;
MPI_Comm loop_com;
if (world_rank > (nmax % world_size)) {
MPI_Comm_split(activ_com, MPI_UNDEFINED, world_rank, &loop_com);
} else {
MPI_Comm_split(activ_com, COM_LOOP, world_rank, &loop_com);
}
for(int i = world_rank; i < nmax; i+=world_size) {
int wait = 0;
/* use different wait times for testing */
if (world_rank != 3)
wait = rand() % 10;
printf("Bcast: %d/%d, i=%d, waittime=%d\n", world_rank, world_size, i, wait);
sleep(wait);
MPI_Bcast(&dummy, N, MPI_LONG, 0, (i > (nmax%world_size))?loop_com:activ_com);
printf("Bcast: %d/%d, i=%d, processed\n", world_rank, world_size, i);
}
if (world_rank == 3)
sleep(5);
MPI_Comm_free(&activ_com);
MPI_Comm_free(&loop_com);
MPI_Finalize();
printf("Signoff: rank %d out of %d processors\n", world_rank, world_size);
return 0;
}
Upvotes: 0