VladMir

Reputation: 23

MPI Broadcast on subset of MPI_COMM_WORLD results in deadlock

I don't get why the program deadlocks on rank=3 in the first iteration (possibly a different rank for you).

I want to do some batch processing, and if the batch size isn't a multiple of the number of processes I have to handle that case.

I would expect the following program flow:

  1. sign up all processes to activ_com
  2. rank=1 exits: disconnects from activ_com, waits on MPI_Finalize
  3. first iteration: rank=0 sends data, rank=2,3 receive data
  4. rank=3 exits: disconnects from activ_com, waits on MPI_Finalize; in real life it is possible that 5. happens before the disconnect
  5. second iteration: rank=0 sends data, rank=2 receives data
  6. rank=0,2 exit

This suggests that I haven't understood something fundamental, but I can't figure out what is happening here and why.

I tried to study the docs [3], as suggested in the answer to [2]. Below is an MRE that shows my case.

As you can see, I try to use a sub-communicator to Bcast only on the active processes. The deadlock appears in step (3), in the first iteration.

I already got an explanation of how to handle such a case with MPI_Scatterv and MPI_Gatherv [1], but I think there should be another possibility besides using MPI_Scatterv instead of MPI_Bcast (see the sketch below).
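For reference, this is roughly how I understand the MPI_Scatterv approach from [1], reusing the variables from the MRE below; the counts/displacements logic is my own illustration, not code from the linked answer:

/* Sketch only: distribute N elements over world_size ranks even when N is not
 * a multiple of world_size, by giving the first (N % world_size) ranks one
 * extra element each. */
int counts[world_size], displs[world_size];
int base = N / world_size, rest = N % world_size, offset = 0;
for (int r = 0; r < world_size; ++r) {
        counts[r] = base + (r < rest ? 1 : 0);
        displs[r] = offset;
        offset += counts[r];
}
long chunk[base + 1];
MPI_Scatterv(dummy, counts, displs, MPI_LONG,
             chunk, counts[world_rank], MPI_LONG,
             0, MPI_COMM_WORLD);

Every rank that is still in the communicator has to take part in the call; ranks with nothing to do simply get a count of 0.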

Another possible solution could be the MPI Sessions model, but I think it is crucial to understand what is happening here first.

Thank you very much for your help and time.

#include <mpi.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <time.h>
#include <stdbool.h>

#define N 32

static int world_size;

MPI_Comm activ_com;
#define COM_ACTIVE_TAG 1

static void signoff(int rank)
{
        printf("Registered for Signoff: rank %d out of %d processors\n", rank, world_size);
        /* tested splitting for signoff */        
        //MPI_Comm_split(MPI_COMM_WORLD, MPI_UNDEFINED, rank, &activ_com);
        if (rank != 0) {
                int process = 0;
                MPI_Status status;
                MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, activ_com, &process, &status);
                if (process)
                        printf("Communication still pending from %d on %d\n", status.MPI_SOURCE, rank);
        }
        MPI_Comm_disconnect(&activ_com);
}

int main(int argc, char** argv) {

        MPI_Init(NULL, NULL);      // initialize MPI environment
        MPI_Comm_size(MPI_COMM_WORLD, &world_size);
        int world_rank;
        MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

        //MPI_Comm_split(MPI_COMM_WORLD, COM_ACTIVE_TAG, world_rank, &activ_com);
        MPI_Comm_dup(MPI_COMM_WORLD, &activ_com);

        int workers[world_size];
        for (int i = 0; i < world_size; ++i)
                workers[i] = true;
        
        long dummy[N];
        time_t t;
        if (world_rank == 0)
                srand(3);
        if (world_rank == 2)
                srand(7);
        if (world_rank == 0) {
                for (int i = 0; i < N; ++i)
                        dummy[i] = i;
        }
        if (world_rank == 1) {
                
                //printf("Registered for Signoff: rank %d out of %d processors\n", world_rank, world_size);
                signoff(world_rank);
                MPI_Finalize();
                printf("Signoff: rank %d out of %d processors\n", world_rank, world_size);
                return 0;
        }

        for(int i = world_rank; i < 7; i+=world_size) {
                int wait = 0;

                /* use different wait times for testing */
                if (world_rank != 3)
                        wait = rand() % 10;
                
                printf("Bcast: %d/%d, i=%d, waittime=%d\n", world_rank, world_size, i, wait);
                sleep(wait);
                
                /* rank=3 is hanging here in the first iteration */              
                MPI_Bcast(&dummy, N, MPI_LONG, 0, activ_com);
                
                printf("Bcast: %d/%d, i=%d, processed\n", world_rank, world_size, i);
        }

        /* keep rank alive for testing */
        if (world_rank == 3)
                sleep(5);


        signoff(world_rank);
        MPI_Comm_free(&activ_com);
        MPI_Finalize();
        printf("Signoff: rank %d out of %d processors\n", world_rank, world_size);
        return 0;
}

using:


$ mpirun --version
mpirun (Open MPI) 4.1.2

$ mpicc --version
gcc (Ubuntu 11.3.0-1ubuntu1~22.04.1) 11.3.0
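
I compile and run the MRE like this (mre.c is just the name I saved it under):

$ mpicc mre.c -o mre
$ mpirun -np 4 ./mre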

[1] Better way than loop over MPI_Send/MPI_Recv if data not divisable my COMM_SIZE?

[2] MPI Bcast and the number of processes

[3] https://www.mpi-forum.org/docs/mpi-4.0/mpi40-report.pdf

Upvotes: 0

Views: 74

Answers (1)

VladMir

Reputation: 23

Thanks to @janneb and @GillesGouaillardet; their hints let me play around with the right things, it seems.

As far as I understand it now, the underlying problem in my original code is that MPI_Bcast (like MPI_Comm_disconnect) is collective over every process in the communicator: while rank 1 sits in MPI_Comm_disconnect on activ_com, the remaining ranks call MPI_Bcast on the same communicator, the collective calls never match, and the program hangs.

The following example shows how (I think) to use MPI_Comm_split correctly to create different communicators that handle the early exits of processes.

The key idea is to create the communicators up front, before the loop, instead of creating them inside the loop.

We create two communicators: activ_com, which contains all processes that will possibly loop, and loop_com, which is a subset of activ_com containing only the processes that take part in the last iteration.

#include <mpi.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <time.h>
#include <stdbool.h>

#define N 32

static int world_size;

MPI_Comm activ_com;
#define COM_ACTIVE 1
#define COM_LOOP 2

static void signoff(int rank)
{
    printf("Registered for Signoff: rank %d out of %d processors\n", rank, world_size);
    /* this split matches the COM_ACTIVE split in main(): rank 1 passes
     * MPI_UNDEFINED, so it does not become part of activ_com */
    MPI_Comm_split(MPI_COMM_WORLD, MPI_UNDEFINED, rank, &activ_com);
    MPI_Finalize();
    printf("Signoff: rank %d out of %d processors\n", rank, world_size);
}

int main(int argc, char** argv) {

    MPI_Init(NULL, NULL);      // initialize MPI environment
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    int workers[world_size];
    for (int i = 0; i < world_size; ++i)
            workers[i] = true;
    
    long dummy[N];
    time_t t;
    if (world_rank == 0)
            srand(3);
    if (world_rank == 2)
            srand(7);
    if (world_rank == 0) {
            for (int i = 0; i < N; ++i)
                    dummy[i] = i;
    }
    if (world_rank == 1) {
            signoff(world_rank);
            return 0;
    }
    /* every process that is still active after the early sign-off joins activ_com */
    MPI_Comm_split(MPI_COMM_WORLD, COM_ACTIVE, world_rank, &activ_com);

    const int nmax = 7;
    MPI_Comm loop_com;
    /* only ranks that also take part in the last round join loop_com; the
     * others (rank 3 here) pass MPI_UNDEFINED and get MPI_COMM_NULL back */
    if (world_rank >= (nmax % world_size)) {
            MPI_Comm_split(activ_com, MPI_UNDEFINED, world_rank, &loop_com);
    } else {
            MPI_Comm_split(activ_com, COM_LOOP, world_rank, &loop_com);
    }
    for(int i = world_rank; i < nmax; i+=world_size) {
            int wait = 0;

            /* use different wait times for testing */
            if (world_rank != 3)
                    wait = rand() % 10;
            
            printf("Bcast: %d/%d, i=%d, waittime=%d\n", world_rank, world_size, i, wait);
            sleep(wait);
            
            /* full rounds use activ_com, the shortened last round uses loop_com */
            MPI_Bcast(&dummy, N, MPI_LONG, 0, (i > (nmax % world_size)) ? loop_com : activ_com);

            printf("Bcast: %d/%d, i=%d, processed\n", world_rank, world_size, i);
    }

    if (world_rank == 3)
            sleep(5);

    MPI_Comm_free(&activ_com);
    /* ranks that were excluded from loop_com hold MPI_COMM_NULL, which must not be freed */
    if (loop_com != MPI_COMM_NULL)
            MPI_Comm_free(&loop_com);
    MPI_Finalize();
    printf("Signoff: rank %d out of %d processors\n", world_rank, world_size);
    return 0;
}

Upvotes: 0
