Reputation: 29
I am having trouble using MPI to multiply matrices.
The program reads two n x n matrices from two files and is supposed to multiply them with MPI, but I am getting a segmentation fault in one of the processes. This is the output I get when I run my code:
read matrix A from matrixA
read matrix B from matrixB
mpirun noticed that process rank 1 with PID 15599 on node VirtualBox exited on signal 11 (Segmentation fault).
Here is my code:
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MASTER_TO_SLAVE_TAG 1   // tag for messages sent from master to slaves
#define SLAVE_TO_MASTER_TAG 4   // tag for messages sent from slaves to master

int rank, size;                 /* Process rank and number of processes */
int low_bound, upper_bound;     /* Row range handed to each slave */
int portion;                    /* Number of rows per slave */
double start_time, end_time;
MPI_Status status;
MPI_Request request;

void read_matrix(char *filename, float ***a, float **sa, int *rows, int *cols); /* defined elsewhere */

int main (int argc, char * argv[])
{
    int n;                      /* Dimension of the matrix */
    float *sa, *sb, *sc;        /* Storage for matrix A, B, and C */
    float **a, **b, **c;        /* 2D array to access matrix A, B, and C */
    int i, j, k;

    MPI_Init(&argc, &argv);                 // Initialize MPI operations
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);   // Get the rank
    MPI_Comm_size(MPI_COMM_WORLD, &size);   // Get number of processes

    /* Check the number of arguments */
    if (argc != 4) {
        printf("Usage: %s fileA fileB fileC\n", argv[0]);
        return 1;
    }
    if (rank == 0)
    {
        /* Read matrix A */
        printf("read matrix A from %s\n", argv[1]);
        read_matrix(argv[1], &a, &sa, &i, &j);
        if (i != j) {
            printf("ERROR: matrix A not square\n");
            return 2;
        }
        n = i;
        //printf("%d", n);

        /* Read matrix B */
        printf("Read matrix B from %s\n", argv[2]);
        read_matrix(argv[2], &b, &sb, &i, &j);
        if (i != j) {
            printf("ERROR: matrix B not square\n");
            return 2;
        }
        if (n != i) {
            printf("ERROR: matrix A and B incompatible\n");
            return 2;
        }
    }

    printf("test");

    if (rank == 0)
    {
        /* Initialize matrix C */
        sc = (float*)malloc(n*n*sizeof(float));
        memset(sc, 0, n*n*sizeof(float));
        c = (float**)malloc(n*sizeof(float*));
        for (i = 0; i < n; i++) c[i] = &sc[i*n];
    }
    ////////////////////////////////////////////////////////////////////////////////////////////
    float matrA[n][n];
    float matrB[n][n];
    float matrC[n][n];

    for (i = 0; i < n; i++)
    {
        for (j = 0; j < n; j++)
        {
            matrA[i][j] = sa[(i*n) + j];
            matrB[i][j] = sb[(i*n) + j];
        }
    }
    /* Master initializes work */
    if (rank == 0)
    {
        start_time = MPI_Wtime();
        for (i = 1; i < size; i++)
        {
            // For each slave other than the master
            portion = (n / (size - 1));            // Calculate portion without master
            low_bound = (i - 1) * portion;
            if (((i + 1) == size) && ((n % (size - 1)) != 0))
            {
                // If rows of [A] cannot be equally divided among slaves,
                upper_bound = n;                   // the last slave gets all the remaining rows.
            }
            else
            {
                upper_bound = low_bound + portion; // Rows of [A] are equally divisible among slaves
            }
            // Send the low bound first without blocking, to the intended slave.
            MPI_Isend(&low_bound, 1, MPI_INT, i, MASTER_TO_SLAVE_TAG, MPI_COMM_WORLD, &request);
            // Next send the upper bound without blocking, to the intended slave
            MPI_Isend(&upper_bound, 1, MPI_INT, i, MASTER_TO_SLAVE_TAG + 1, MPI_COMM_WORLD, &request);
            // Finally send the allocated row portion of [A] without blocking, to the intended slave
            MPI_Isend(&matrA[low_bound][0], (upper_bound - low_bound) * n, MPI_FLOAT, i, MASTER_TO_SLAVE_TAG + 2, MPI_COMM_WORLD, &request);
        }
    }

    // Broadcast [B] to all the slaves
    MPI_Bcast(&matrB, n*n, MPI_FLOAT, 0, MPI_COMM_WORLD);
    /* Work done by slaves */
    if (rank > 0)
    {
        // Receive low bound from the master
        MPI_Recv(&low_bound, 1, MPI_INT, 0, MASTER_TO_SLAVE_TAG, MPI_COMM_WORLD, &status);
        // Next receive upper bound from the master
        MPI_Recv(&upper_bound, 1, MPI_INT, 0, MASTER_TO_SLAVE_TAG + 1, MPI_COMM_WORLD, &status);
        // Finally receive row portion of [A] to be processed from the master
        MPI_Recv(&matrA[low_bound][0], (upper_bound - low_bound) * n, MPI_FLOAT, 0, MASTER_TO_SLAVE_TAG + 2, MPI_COMM_WORLD, &status);

        for (i = low_bound; i < upper_bound; i++)
        {
            // Iterate through a given set of rows of [A]
            for (j = 0; j < n; j++)
            {
                // Iterate through columns of [B]
                for (k = 0; k < n; k++)
                {
                    // Iterate through rows of [B]
                    matrC[i][j] += (matrA[i][k] * matrB[k][j]);
                }
            }
        }

        // Send back the low bound first without blocking, to the master
        MPI_Isend(&low_bound, 1, MPI_INT, 0, SLAVE_TO_MASTER_TAG, MPI_COMM_WORLD, &request);
        // Send the upper bound next without blocking, to the master
        MPI_Isend(&upper_bound, 1, MPI_INT, 0, SLAVE_TO_MASTER_TAG + 1, MPI_COMM_WORLD, &request);
        // Finally send the processed portion of data without blocking, to the master
        MPI_Isend(&matrC[low_bound][0], (upper_bound - low_bound) * n, MPI_FLOAT, 0,
                  SLAVE_TO_MASTER_TAG + 2, MPI_COMM_WORLD, &request);
    }
    /* Master gathers processed work */
    if (rank == 0)
    {
        for (i = 1; i < size; i++)
        {
            // Until all slaves have handed back the processed data,
            // receive low bound from a slave.
            MPI_Recv(&low_bound, 1, MPI_INT, i, SLAVE_TO_MASTER_TAG, MPI_COMM_WORLD, &status);
            // Receive upper bound from a slave
            MPI_Recv(&upper_bound, 1, MPI_INT, i, SLAVE_TO_MASTER_TAG + 1, MPI_COMM_WORLD, &status);
            // Receive processed data from a slave
            MPI_Recv(&matrC[low_bound][0], (upper_bound - low_bound) * n, MPI_FLOAT, i,
                     SLAVE_TO_MASTER_TAG + 2, MPI_COMM_WORLD, &status);
        }
        end_time = MPI_Wtime();
        printf("\nRunning Time = %f\n\n", end_time - start_time);
    }

    MPI_Finalize(); // Finalize MPI operations

    /* Do the multiplication */
    //////////////////////////////////////////////////// matmul(a, b, c, n);
    for (i = 0; i < n; i++)
    {
        for (j = 0; j < n; j++)
        {
            sc[(i*n) + j] = matrC[i][j];
        }
    }
}
Upvotes: 2
Views: 973
Reputation: 51443
Every process declares the pointers to the matrices, namely:

float *sa, *sb, *sc; /* Storage for matrix A, B, and C */

but only process 0 (allocates and) fills the arrays sa and sb:
if (rank == 0)
{
    ...
    read_matrix(argv[1], &a, &sa, &i, &j);
    ...
    read_matrix(argv[2], &b, &sb, &i, &j);
    ...
}
However, afterwards every process tries to access the elements of the sa and sb arrays:
for(i = 0; i < n; i++)
{
for(j = 0; j < n; j++)
{
matrA[i][j] = sa[(i*n) + j];
matrB[i][j] = sb[(i*n) + j];
}
}
Since only process 0 has (allocated and) filled the arrays sa and sb, the remaining processes dereference memory (sa[(i*n) + j] and sb[(i*n) + j]) that they never allocated. Hence the segmentation fault.
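A minimal sketch of one way to fix it, assuming you keep the flat sa/sb buffers: broadcast n right after rank 0 has read the files, let every other rank allocate its own buffers, and only then broadcast the actual data, before matrA/matrB are declared and filled:

/* Sketch only: place this after the rank-0 block that reads the files */
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);   /* every rank learns n */

if (rank != 0) {
    /* ranks other than 0 must own the memory they are about to read */
    sa = (float*)malloc(n * n * sizeof(float));
    sb = (float*)malloc(n * n * sizeof(float));
}

MPI_Bcast(sa, n * n, MPI_FLOAT, 0, MPI_COMM_WORLD);
MPI_Bcast(sb, n * n, MPI_FLOAT, 0, MPI_COMM_WORLD);

Alternatively, note that only rank 0 really needs sa and sb at all: you could fill matrA and matrB on rank 0 alone and let the workers rely on the rows of [A] they receive and on the MPI_Bcast of matrB you already perform.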
On a side note, there is another problem in your program: you initiate non-blocking sends with MPI_Isend but never wait on the completion of the returned request handles. MPI implementations are not even required to start the send operation until it is properly progressed to completion, typically by a call to one of the wait or test operations (MPI_Wait, MPI_Waitsome, MPI_Waitall, and so on). Even worse, you reuse the same handle variable request, effectively losing the handles to all previously initiated requests, which makes them unwaitable/untestable. Use an array of requests instead and wait for all of them to finish with MPI_Waitall after the send loop.
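For example, a sketch of the master's send loop with a request array; the lows/uppers arrays are hypothetical helpers introduced here so that the send buffers stay valid until the waits complete:

int lows[size], uppers[size];        /* hypothetical per-worker bounds */
MPI_Request reqs[3 * (size - 1)];    /* three sends per worker */
int nreq = 0;

portion = n / (size - 1);
for (i = 1; i < size; i++) {
    lows[i]   = (i - 1) * portion;
    uppers[i] = (i + 1 == size) ? n : lows[i] + portion;
    MPI_Isend(&lows[i],   1, MPI_INT, i, MASTER_TO_SLAVE_TAG,     MPI_COMM_WORLD, &reqs[nreq++]);
    MPI_Isend(&uppers[i], 1, MPI_INT, i, MASTER_TO_SLAVE_TAG + 1, MPI_COMM_WORLD, &reqs[nreq++]);
    MPI_Isend(&matrA[lows[i]][0], (uppers[i] - lows[i]) * n, MPI_FLOAT, i,
              MASTER_TO_SLAVE_TAG + 2, MPI_COMM_WORLD, &reqs[nreq++]);
}

MPI_Waitall(nreq, reqs, MPI_STATUSES_IGNORE);   /* complete every send */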
Also think about this: do you really need non-blocking operations to send the data back from the workers?
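A sketch of what the worker's send-back could look like with plain blocking sends (same buffers and tags as in your code); the worker has nothing to overlap the communication with, so nothing is lost:

MPI_Send(&low_bound,   1, MPI_INT, 0, SLAVE_TO_MASTER_TAG,     MPI_COMM_WORLD);
MPI_Send(&upper_bound, 1, MPI_INT, 0, SLAVE_TO_MASTER_TAG + 1, MPI_COMM_WORLD);
MPI_Send(&matrC[low_bound][0], (upper_bound - low_bound) * n, MPI_FLOAT, 0,
         SLAVE_TO_MASTER_TAG + 2, MPI_COMM_WORLD);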
Upvotes: 3