Reputation: 23
I was writing an MPI program that solves the N-queens problem, process 0 is supposed to solve the problem partially and let the other processes finish it. My program compiles but it is showing me a segmentation fault after creating the struct type and while trying to send the struct.
this happens in the spawn_processes
function (which is executed by process 0, thus the segfault happens in process 0), while trying to send the subProblemType
.
here is my code:
#include <stdio.h>
#include <mpi.h>
#define MAXBOARDSIZE 8
static int board_size=8;
typedef struct boardplus{
int size; // board size
int x; // this is where we need to restart
int y; // idem
int board[MAXBOARDSIZE]; // the actual board, padded to largest instance
} subproblem;
#define INIT 1 // Message to client: subproblem
#define DATA 2 // Message from client with results
#define EXIT 4 // Message from client with CPU time
// Also to client, giving permission to exit
static long int N_solutions;
int solution_count;
void spawn_processes(int rows[board_size],int y){
//printf("_______________________");
subproblem subP;
int col,//column number to start from
count,//number of solutions recieved from a worker
nProc,//total number of processes
proc,
nActive,i;// number of active processes
MPI_Status status;
MPI_Datatype subProblemType;
MPI_Datatype type[4]={MPI_INT,MPI_INT,MPI_INT,MPI_INT};
int block_len[4]={1,1,1,MAXBOARDSIZE};
MPI_Aint disp[4];
MPI_Address(&subP,disp);
MPI_Address(&subP.x,disp+1);
MPI_Address(&subP.y,disp+2);
MPI_Address(&subP.board,disp+3);
int base=disp[0];
for(i=0;i<4;i++) disp[i]-=base;
MPI_Type_create_struct(4,block_len,disp,type,&subProblemType);
MPI_Type_commit(&subProblemType);
MPI_Comm_size(MPI_COMM_WORLD,&nProc);
subP.size=board_size;
subP.y=y;
//subP.board=rows;
for(i=0;i<y;i++)subP.board[i]=rows[i];
printf("spawning processes ....\n");
for(col=0,proc=1;proc<nProc && col<board_size;proc++, col++){
printf("sending to process %d \n ",proc);
fflush(stdout);
subP.x=col;
MPI_Send(&subP,1,subProblemType,proc,INIT,MPI_COMM_WORLD);
}
nActive=proc-1;
// Receive back results and send out new problems
while(col<board_size){
MPI_Recv(&count,1,MPI_INT,MPI_ANY_SOURCE,DATA,MPI_COMM_WORLD,&status);
proc=status.MPI_SOURCE;
printf("recieved from process %d, found %d solutions \n",proc,count);
N_solutions+=count;
subP.x=col++;
MPI_Send(&subP,1,subProblemType,proc,INIT,MPI_COMM_WORLD);
}
// Finally, receive back pending results and send termination
// indication (message with size of zero).
subP.size=0;
while(nActive>0){
MPI_Recv(&count,1,MPI_INT,MPI_ANY_SOURCE,DATA,MPI_COMM_WORLD,&status);
proc=status.MPI_SOURCE;
printf("recieved from process %d, found %d solutions \n",proc,count);
--nActive;
N_solutions+=count;
//send a subproblem with size=0 (termination messages)
MPI_Send(&subP,1,subProblemType,proc,INIT,MPI_COMM_WORLD);
}
for (proc = 1; proc < nProc; proc++)
MPI_Send(&proc, 0, MPI_INT, proc, EXIT, MPI_COMM_WORLD);
}
void process_queens(int my_id){
int root=0;
subproblem subP;
MPI_Status status;
int rows[board_size];
int x,y,i;
MPI_Datatype subProblemType;
MPI_Datatype type[4]={MPI_INT,MPI_INT,MPI_INT,MPI_INT};
int block_len[4]={1,1,1,MAXBOARDSIZE};
MPI_Aint disp[4];
MPI_Address(&subP,disp);
MPI_Address(&subP.x,disp+1);
MPI_Address(&subP.y,disp+2);
MPI_Address(&subP.board,disp+3);
int base=disp[0];
for(i=0;i<4;i++) disp[i]-=base;
MPI_Type_create_struct(4,block_len,disp,type,&subProblemType);
MPI_Type_commit(&subProblemType);
printf("process %d waiting to recieve a task\n",my_id);
fflush(stdout);
MPI_Recv(&subP,1,subProblemType,root,INIT,MPI_COMM_WORLD,&status);
while(subP.size>0){
x=subP.x;
y=subP.y;
for(i=0;i<y;i++)rows[i]=subP.board[i];
//rows=subP.board;
if(is_safe(rows,x,y)){
rows[y]=x;
n_queens_solver(rows,y+1);
}
MPI_Send(&N_solutions,1,MPI_INT,root,DATA,MPI_COMM_WORLD);
}
// Final hand-shake: get permission to terminate
MPI_Recv(&N_solutions, 0, MPI_INT, 0, EXIT, MPI_COMM_WORLD, &status);
}
int is_safe(int rows[board_size], int x, int y)
{
int i;
if (y == 0)
return 1;
for (i=0; i < y; ++i) {
if (rows[i] == x || rows[i] == x + y - i || rows[i] == x - y +i)
return 0;
}
return 1;
}
void n_queens_solver(int rows[board_size], int y)
{
int x;
for (x=0; x < board_size; ++x) {
if (is_safe(rows, x, y)) {
rows[y] = x;
if (y == board_size-1) {
++N_solutions;
}
else
n_queens_solver(rows, y+1);
}
}
}
void n_queens_expander(int rows[board_size], int y, int expand_levels)
{
int x;
if(y == expand_levels-1){
spawn_processes(rows,y);
}
else{
for (x=0; x < board_size; ++x) {
if (is_safe(rows, x, y))
{
rows[y] = x;
n_queens_expander(rows, y+1, expand_levels-1);
}
}
}
}
int main(int argc,char *argv[]) {
int rows[board_size];
//int expand_levels=1;
int numproc,my_id;
MPI_Status status;
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&numproc);
MPI_Comm_rank(MPI_COMM_WORLD,&my_id);
//printf("number of processes:%d \n",numproc);
if(my_id==0){
//printf("process 0 starting...\n");
n_queens_expander(rows,0,1);
}
else{
process_queens(my_id);
}
MPI_Finalize();
return 0;
}
Upvotes: 1
Views: 394
Reputation: 74495
Your error is right...
MPI_Address(&subP,disp);
MPI_Address(&subP.x,disp+1);
MPI_Address(&subP.y,disp+2);
MPI_Address(&subP.board,disp+3);
int base=disp[0]; // <--------------------- HERE
for(i=0;i<4;i++) disp[i]-=base;
On LP64 systems (which includes 64-bit x86 systems running OS X, FreeBSD, Solaris or Linux) MPI_Aint
is 8 bytes long while int
is only 4 bytes long. subP
is a stack variable and the stack of the main thread on x64 is located high in the virtual address space, therefore a truncation occurs in the assignment to base
and the computed displacements have nothing to do with the actual ones.
Solution: base
should be of type MPI_Aint
.
Solution 2:
for(i=1;i<4;i++) disp[i]-=disp[0];
disp[0] = 0;
The same problem is present in all ranks but since MPI_Recv
in the workers never writes to the memory due to the master crashing and not sending data, they do not segfault.
Remember to always compile with -Wall
and to pay attention to the warning messages produced by your compiler.
Upvotes: 1
Reputation: 5223
You have this:
static int board_size=8;
typedef struct boardplus{
int size; // board size
int x; // this is where we need to restart
int y; // idem
int board[MAXBOARDSIZE]; // the actual board, padded to largest instance
} subproblem;
and make a type like this:
MPI_Datatype subProblemType;
MPI_Datatype type[4]={MPI_INT,MPI_INT,MPI_INT,MPI_INT};
int block_len[4]={1,1,1,MAXBOARDSIZE};
MPI_Aint disp[4];
MPI_Address(&subP,disp);
MPI_Address(&subP.x,disp+1);
MPI_Address(&subP.y,disp+2);
MPI_Address(&subP.board,disp+3);
int base=disp[0];
for(i=0;i<4;i++) disp[i]-=base;
MPI_Type_create_struct(4,block_len,disp,type,&subProblemType);
I'm skeptical of your fixup of displacement. Far more commonly, the way to deal with MPI_Address kinds of things is to then use MPI_BOTTOM as your buffer. So instead of sending subP, your send would look like this:
MPI_Send(MPI_BOTTOM,1,subProblemType,proc,INIT,MPI_COMM_WORLD);
Upvotes: 0
Reputation: 845
While I'm not familiar with MPI, a hunch and some quick googling suggests you need to declare a type for that array in the struct, instead of just passing it as an integer.* Passing it as an integer will likely just send the pointer to the first item in the array to the remote machine, which will then probably be some bogus memory address on the remote machine, resulting in a segfault when the remote machine uses that address.
Take a look at this answer: Creating an MPI_Datatype for a structure containing pointers. You'll probably be able to work off of that basic idea. Note - from a brief skimming, Hristo Iliev's response in that thread may be the better approach structurally for your program.
* I'm assuming that's what the line MPI_Datatype type[4]={MPI_INT,MPI_INT,MPI_INT,MPI_INT};
means
Upvotes: 0