nosmaS
nosmaS

Reputation: 23

segmentation fault MPI create struct

I was writing an MPI program that solves the N-queens problem, process 0 is supposed to solve the problem partially and let the other processes finish it. My program compiles but it is showing me a segmentation fault after creating the struct type and while trying to send the struct. this happens in the spawn_processes function (which is executed by process 0, thus the segfault happens in process 0), while trying to send the subProblemType.

here is my code:

#include <stdio.h>
#include <mpi.h>


#define MAXBOARDSIZE 8

static int board_size=8;
typedef struct boardplus{
  int size; // board size
  int x; // this is where we need to restart
  int y; // idem
  int board[MAXBOARDSIZE]; // the actual board, padded to largest instance
} subproblem;




#define  INIT   1  // Message to client:  subproblem
#define  DATA   2  // Message from client with results
#define  EXIT   4  // Message from client with CPU time
                   // Also to client, giving permission to exit
static long int N_solutions;
int solution_count;

void spawn_processes(int rows[board_size],int y){
    //printf("_______________________");
    subproblem subP;
    int col,//column number to start from
        count,//number of solutions recieved from a worker
        nProc,//total number of processes
        proc,
        nActive,i;// number of active processes


    MPI_Status status;
    MPI_Datatype subProblemType;
    MPI_Datatype type[4]={MPI_INT,MPI_INT,MPI_INT,MPI_INT};
    int block_len[4]={1,1,1,MAXBOARDSIZE};
    MPI_Aint disp[4];


    MPI_Address(&subP,disp);
    MPI_Address(&subP.x,disp+1);
    MPI_Address(&subP.y,disp+2);
    MPI_Address(&subP.board,disp+3);
    int base=disp[0];
    for(i=0;i<4;i++) disp[i]-=base;

    MPI_Type_create_struct(4,block_len,disp,type,&subProblemType);
    MPI_Type_commit(&subProblemType);
    MPI_Comm_size(MPI_COMM_WORLD,&nProc);
    subP.size=board_size;
    subP.y=y;
    //subP.board=rows;
    for(i=0;i<y;i++)subP.board[i]=rows[i];

    printf("spawning processes  ....\n");

    for(col=0,proc=1;proc<nProc && col<board_size;proc++, col++){
        printf("sending to process %d \n ",proc);
        fflush(stdout);
       subP.x=col;
       MPI_Send(&subP,1,subProblemType,proc,INIT,MPI_COMM_WORLD);
    }

    nActive=proc-1;
    // Receive back results and send out new problems
    while(col<board_size){
        MPI_Recv(&count,1,MPI_INT,MPI_ANY_SOURCE,DATA,MPI_COMM_WORLD,&status);
        proc=status.MPI_SOURCE;
        printf("recieved from process %d, found %d solutions \n",proc,count);
        N_solutions+=count;
        subP.x=col++;
        MPI_Send(&subP,1,subProblemType,proc,INIT,MPI_COMM_WORLD);
    }

    // Finally, receive back pending results and send termination
    // indication (message with size of zero).
    subP.size=0;
    while(nActive>0){
        MPI_Recv(&count,1,MPI_INT,MPI_ANY_SOURCE,DATA,MPI_COMM_WORLD,&status);
        proc=status.MPI_SOURCE;
        printf("recieved from process %d, found %d solutions \n",proc,count);
        --nActive;
        N_solutions+=count;
        //send a subproblem with size=0 (termination messages)
        MPI_Send(&subP,1,subProblemType,proc,INIT,MPI_COMM_WORLD);
    }

    for (proc = 1; proc < nProc; proc++)  
        MPI_Send(&proc, 0, MPI_INT, proc, EXIT, MPI_COMM_WORLD);


}




void process_queens(int my_id){

    int root=0;
    subproblem subP;
    MPI_Status status;
    int rows[board_size];
    int x,y,i;  

    MPI_Datatype subProblemType;
    MPI_Datatype type[4]={MPI_INT,MPI_INT,MPI_INT,MPI_INT};
    int block_len[4]={1,1,1,MAXBOARDSIZE};
    MPI_Aint disp[4];


    MPI_Address(&subP,disp);
    MPI_Address(&subP.x,disp+1);
    MPI_Address(&subP.y,disp+2);
    MPI_Address(&subP.board,disp+3);
    int base=disp[0];

    for(i=0;i<4;i++) disp[i]-=base;

    MPI_Type_create_struct(4,block_len,disp,type,&subProblemType);
    MPI_Type_commit(&subProblemType);

    printf("process %d waiting to recieve a task\n",my_id);
    fflush(stdout);
    MPI_Recv(&subP,1,subProblemType,root,INIT,MPI_COMM_WORLD,&status);

    while(subP.size>0){
    x=subP.x;
    y=subP.y;
    for(i=0;i<y;i++)rows[i]=subP.board[i];
    //rows=subP.board;


    if(is_safe(rows,x,y)){
    rows[y]=x;
    n_queens_solver(rows,y+1);
    }
    MPI_Send(&N_solutions,1,MPI_INT,root,DATA,MPI_COMM_WORLD);
    }

    // Final hand-shake:  get permission to terminate
   MPI_Recv(&N_solutions, 0, MPI_INT, 0, EXIT, MPI_COMM_WORLD, &status);

}

int is_safe(int rows[board_size], int x, int y)  
{
    int i;
    if (y == 0)
            return 1;
    for (i=0; i < y; ++i) {
       if (rows[i] == x || rows[i] == x + y - i || rows[i] == x - y +i)
            return 0;
    } 
    return 1;
}



void n_queens_solver(int rows[board_size], int y) 
{
    int x;

    for (x=0; x < board_size; ++x) {
        if (is_safe(rows, x, y)) {
            rows[y] = x;
            if (y == board_size-1) {

        ++N_solutions;
        }
            else
              n_queens_solver(rows, y+1);
        }
    }
}


void n_queens_expander(int rows[board_size], int y, int expand_levels)
{
  int x;
 if(y == expand_levels-1){

    spawn_processes(rows,y);

    }
else{
     for (x=0; x < board_size; ++x) {
    if (is_safe(rows, x, y)) 
      {
    rows[y] = x;
    n_queens_expander(rows, y+1, expand_levels-1);
      }
  }
  }
}



int main(int argc,char *argv[]) {

int rows[board_size];
//int expand_levels=1;
int numproc,my_id;
MPI_Status status;
MPI_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD,&numproc);
    MPI_Comm_rank(MPI_COMM_WORLD,&my_id);
    //printf("number of processes:%d \n",numproc);
    if(my_id==0){
        //printf("process 0 starting...\n");
        n_queens_expander(rows,0,1);
    }
    else{

        process_queens(my_id);
}
MPI_Finalize();

return 0;
}

Upvotes: 1

Views: 394

Answers (3)

Hristo Iliev
Hristo Iliev

Reputation: 74495

Your error is right...

MPI_Address(&subP,disp);
MPI_Address(&subP.x,disp+1);
MPI_Address(&subP.y,disp+2);
MPI_Address(&subP.board,disp+3);
int base=disp[0]; // <--------------------- HERE
for(i=0;i<4;i++) disp[i]-=base;

On LP64 systems (which includes 64-bit x86 systems running OS X, FreeBSD, Solaris or Linux) MPI_Aint is 8 bytes long while int is only 4 bytes long. subP is a stack variable and the stack of the main thread on x64 is located high in the virtual address space, therefore a truncation occurs in the assignment to base and the computed displacements have nothing to do with the actual ones.

Solution: base should be of type MPI_Aint.

Solution 2:

for(i=1;i<4;i++) disp[i]-=disp[0];
disp[0] = 0;

The same problem is present in all ranks but since MPI_Recv in the workers never writes to the memory due to the master crashing and not sending data, they do not segfault.

Remember to always compile with -Wall and to pay attention to the warning messages produced by your compiler.

Upvotes: 1

Rob Latham
Rob Latham

Reputation: 5223

You have this:

static int board_size=8;
typedef struct boardplus{
  int size; // board size
  int x; // this is where we need to restart
  int y; // idem
 int board[MAXBOARDSIZE]; // the actual board, padded to largest instance
} subproblem;

and make a type like this:

MPI_Datatype subProblemType;
MPI_Datatype type[4]={MPI_INT,MPI_INT,MPI_INT,MPI_INT};
int block_len[4]={1,1,1,MAXBOARDSIZE};
MPI_Aint disp[4];


MPI_Address(&subP,disp);
MPI_Address(&subP.x,disp+1);
MPI_Address(&subP.y,disp+2);
MPI_Address(&subP.board,disp+3);
int base=disp[0];
for(i=0;i<4;i++) disp[i]-=base;

MPI_Type_create_struct(4,block_len,disp,type,&subProblemType);

I'm skeptical of your fixup of displacement. Far more commonly, the way to deal with MPI_Address kinds of things is to then use MPI_BOTTOM as your buffer. So instead of sending subP, your send would look like this:

MPI_Send(MPI_BOTTOM,1,subProblemType,proc,INIT,MPI_COMM_WORLD);

Upvotes: 0

Marshall Conover
Marshall Conover

Reputation: 845

While I'm not familiar with MPI, a hunch and some quick googling suggests you need to declare a type for that array in the struct, instead of just passing it as an integer.* Passing it as an integer will likely just send the pointer to the first item in the array to the remote machine, which will then probably be some bogus memory address on the remote machine, resulting in a segfault when the remote machine uses that address.

Take a look at this answer: Creating an MPI_Datatype for a structure containing pointers. You'll probably be able to work off of that basic idea. Note - from a brief skimming, Hristo Iliev's response in that thread may be the better approach structurally for your program.

* I'm assuming that's what the line MPI_Datatype type[4]={MPI_INT,MPI_INT,MPI_INT,MPI_INT}; means

Upvotes: 0

Related Questions