CUDA: Matrix + Matrix, segmentation fault when printing solution matrix in host

Question

I'm trying to perform a simple operation in CUDA, adding one matrix to another, but I get a segmentation fault when I try to check the result. Here's the code:

/* Includes, system */
#include <stdio.h>

#include <stdlib.h>
#include <cuda_runtime.h>
#define N 15000

/* DEVICE CODE */
/*
 * Element-wise addition kernel. Each thread derives one flattened index from
 * its block and thread coordinates and, when that index is below N, stores
 * d_matrix1[index] + d_matrix2[index] into d_matrixSolucion[index].
 * NOTE(review): the guard compares against N, so only indices 0..N-1 are ever
 * written, while the host allocates N*N ints for each device buffer.
 */
__global__ void sumaMatrices(int *d_matrix1, int *d_matrix2, int *d_matrixSolucion){

    // Flattened global thread index for a 1-D grid of 1-D blocks.
    int idThread = blockIdx.x*blockDim.x + threadIdx.x;

    // Threads whose index is N or larger do nothing.
    if (idThread < N)
    {
        d_matrixSolucion[idThread] = d_matrix1[idThread] + d_matrix2[idThread];
    }
}

/*
 * Print the top-left 4x4 corner of a row-pointer matrix to stdout.
 * Every value is followed by two spaces, every row ends with a newline, and
 * one blank line follows the block. The caller must pass at least 4 rows of
 * at least 4 ints each.
 */
__host__ void printMatrix(int **matrix)
{
    int i, j;
    //only 4 so the file is not too big
    for (i = 0; i < 4; i++)
    {
        for (j = 0; j < 4; j++)
        {
            printf("%d", matrix[i][j]);
            printf("  ");
        }
        printf("\n");   // end of row
    }
    printf("\n");       // blank separator line after the matrix
}

/* HOST CODE*/
/*
 * Host driver as posted in the question: allocates three N x N host matrices
 * (row by row) and three flat device buffers, uploads the two inputs, launches
 * sumaMatrices, copies the result back and prints the 4x4 corner of each matrix.
 *
 * NOTE(review): only the launch configuration, lost when the page was
 * extracted, has been restored. The sizing problems that the accepted answer
 * discusses (row allocation size, grid size, copy-back size) are left exactly
 * as the asker wrote them so that the answer still applies.
 */
int main(int argc, char** argv)
{
    int i;

    /* Host matrices: an array of N row pointers, each row allocated separately. */
    int **h_matrix1;
    int **h_matrix2;
    int **h_matrixSolucion;

    /* Device matrices: flat buffers of N*N ints. */
    int *d_matrix1;
    int *d_matrix2;
    int *d_matrixSolucion;

    h_matrix1 = (int**)malloc(N * sizeof(int*));
    for (i = 0; i < N; i++)
    {
        h_matrix1[i] = (int*)malloc(N * sizeof(int*));
    }

    h_matrix2 = (int**)malloc(N * sizeof(int*));
    for (i = 0; i < N; i++)
    {
        h_matrix2[i] = (int*)malloc(N * sizeof(int*));
    }

    h_matrixSolucion = (int**)malloc(N * sizeof(int*));
    for (i = 0; i < N; i++)
    {
        h_matrixSolucion[i] = (int*)malloc(N * sizeof(int*));
    }

    cudaMalloc((void**)& d_matrix1,N*N*sizeof(int));
    cudaMalloc((void**)& d_matrix2,N*N*sizeof(int));
    cudaMalloc((void**)& d_matrixSolucion,N*N*sizeof(int));

    /* fillMatrix / fillMatrixTo0 are not defined in the posted code. */
    fillMatrix(h_matrix1);
    fillMatrix(h_matrix2);
    fillMatrixTo0(h_matrixSolucion);

    /* Upload the inputs one row at a time into the flat device buffers. */
    for(i = 0; i < N; i++)
    {
        cudaMemcpy(&d_matrix1[i*N], h_matrix1[i], N*sizeof(int), cudaMemcpyHostToDevice);
        cudaMemcpy(&d_matrix2[i*N], h_matrix2[i], N*sizeof(int), cudaMemcpyHostToDevice);
    }

    int tamBloque = 256;            /* threads per block */
    int tamGrid = N/tamBloque + 1;  /* number of blocks */
    sumaMatrices<<<tamGrid, tamBloque>>>(d_matrix1, d_matrix2, d_matrixSolucion);

    // bring the information back from the device
    cudaThreadSynchronize();
    for(i = 0; i < N; i++)
    {
        cudaMemcpy(h_matrixSolucion[i], &d_matrixSolucion[i*N],tamGrid*sizeof(h_matrixSolucion[0]),cudaMemcpyDeviceToHost);
    }

    printMatrix(h_matrix1);
    printMatrix(h_matrix2);
    printMatrix(h_matrixSolucion);
}

If I comment out that last line, the program doesn't give any error.

I'm guessing the problem is that I don't store the information properly in the kernel (this line: d_matrixSolucion[idThread] = d_matrix1[idThread] + d_matrix2[idThread];), but I am pretty new to CUDA and I don't really know how to solve it.

EDIT: Now that I've changed the way I get the information back from the device this is what is printing:

0 1 2 3
1 2 3 4
2 3 4 5
3 4 5 6

2 3 4 5
3 4 5 6
4 5 6 7
5 6 7 8

2 4 6 8
0 0 0 0
0 0 0 0
0 0 0 0

The first two matrices are the inputs and the third one is the solution, but only its first row is filled.

Robert Crovella · Accepted Answer

There are a variety of errors in your code.

There was no definition for fillMatrix.
Your underlying host allocations performed with malloc are not guaranteed to be contiguous, so you cannot transfer the data back in a single cudaMemcpy operation; you must use a loop, like the one you used to transfer the data to the GPU.
Your host allocations aren't quite right but they don't present an actual problem. This:
```
h_matrix1[i] = (int*)malloc(N * sizeof(int*));
```
should be this:
```
h_matrix1[i] = (int*)malloc(N * sizeof(int));
```
and likewise for the other similar instances.
Your grid (total number of threads) sizing is not correct. Your kernel uses one thread to perform one elementwise addition. Therefore, for a NxN matrix, you need NxN threads, not just N as you are creating and testing against.

The following code has these issues fixed and seems to work correctly for me:

$ cat t2.cu
/* Includes, system */
#include <stdio.h>

#include <stdlib.h>
#include <cuda_runtime.h>
#define N 15000

/* DEVICE CODE */
/*
 * Element-wise sum of two N x N int matrices stored as flat arrays of N*N
 * elements: d_matrixSolucion[k] = d_matrix1[k] + d_matrix2[k].
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per element, so the launch
 * must provide at least N*N threads in total. Threads past the end of the
 * data are masked off by the bounds check. No shared memory is used.
 */
__global__ void sumaMatrices(const int *d_matrix1, const int *d_matrix2, int *d_matrixSolucion){

    // Do the index arithmetic in size_t so that neither N*N nor
    // blockIdx.x*blockDim.x can wrap around when N is made larger.
    const size_t totalElements = (size_t)N * N;
    const size_t idThread = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

    if (idThread < totalElements)
    {
        d_matrixSolucion[idThread] = d_matrix1[idThread] + d_matrix2[idThread];
    }
}

/*
 * Print the top-left 4x4 corner of a row-pointer matrix to stdout.
 * Every value is followed by two spaces, every row ends with a newline, and
 * one blank line follows the block. The caller must pass at least 4 rows of
 * at least 4 ints each.
 */
__host__ void printMatrix(int **matrix)
{
    // Only a 4x4 corner is shown so the output stays small.
    for (int row = 0; row < 4; row++)
    {
        for (int col = 0; col < 4; col++)
        {
            printf("%d  ", matrix[row][col]);
        }
        printf("\n");
    }
    printf("\n");
}

/* HOST CODE*/
/* Abort with a readable message when a CUDA runtime call did not succeed. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/* Allocate an N x N host matrix as N separately malloc'd rows, every element set to value. */
static int **newHostMatrix(int value)
{
    int **matrix = (int**)malloc(N * sizeof(int*));
    if (matrix == NULL)
    {
        fprintf(stderr, "Out of host memory allocating row pointers\n");
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < N; i++)
    {
        matrix[i] = (int*)malloc(N * sizeof(int));
        if (matrix[i] == NULL)
        {
            fprintf(stderr, "Out of host memory allocating row %d\n", i);
            exit(EXIT_FAILURE);
        }
        for (int j = 0; j < N; j++) matrix[i][j] = value;
    }
    return matrix;
}

/* Release a matrix obtained from newHostMatrix. */
static void freeHostMatrix(int **matrix)
{
    for (int i = 0; i < N; i++) free(matrix[i]);
    free(matrix);
}

/*
 * Host driver: builds two N x N input matrices (filled with 1 and 2), adds
 * them on the GPU with sumaMatrices, copies the result back and prints the
 * 4x4 corner of each matrix. Returns 0 on success; exits with EXIT_FAILURE
 * as soon as a host allocation or a CUDA call fails.
 */
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    const size_t rowBytes = N * sizeof(int);
    const size_t matrixBytes = (size_t)N * N * sizeof(int);

    int **h_matrix1 = newHostMatrix(1);
    int **h_matrix2 = newHostMatrix(2);
    int **h_matrixSolucion = newHostMatrix(0);

    int *d_matrix1 = NULL;
    int *d_matrix2 = NULL;
    int *d_matrixSolucion = NULL;

    checkCuda(cudaMalloc((void**)&d_matrix1, matrixBytes), "cudaMalloc of d_matrix1");
    checkCuda(cudaMalloc((void**)&d_matrix2, matrixBytes), "cudaMalloc of d_matrix2");
    checkCuda(cudaMalloc((void**)&d_matrixSolucion, matrixBytes), "cudaMalloc of d_matrixSolucion");

    // Host rows are separate allocations (not contiguous), so each row is
    // copied on its own into its slot of the flat device buffer.
    for (int i = 0; i < N; i++)
    {
        const size_t rowStart = (size_t)i * N;
        checkCuda(cudaMemcpy(&d_matrix1[rowStart], h_matrix1[i], rowBytes, cudaMemcpyHostToDevice), "upload of matrix 1");
        checkCuda(cudaMemcpy(&d_matrix2[rowStart], h_matrix2[i], rowBytes, cudaMemcpyHostToDevice), "upload of matrix 2");
    }

    // One thread per element: round the block count up so all N*N elements are covered.
    const int tamBloque = 256;
    const int tamGrid = (int)(((size_t)N * N + tamBloque - 1) / tamBloque);
    sumaMatrices<<<tamGrid, tamBloque>>>(d_matrix1, d_matrix2, d_matrixSolucion);
    checkCuda(cudaGetLastError(), "kernel launch");          // invalid launch configuration
    checkCuda(cudaDeviceSynchronize(), "kernel execution");  // faults raised while the kernel ran

    for (int i = 0; i < N; i++)
    {
        checkCuda(cudaMemcpy(h_matrixSolucion[i], &d_matrixSolucion[(size_t)i * N], rowBytes, cudaMemcpyDeviceToHost), "download of result");
    }

    printMatrix(h_matrix1);
    printMatrix(h_matrix2);
    printMatrix(h_matrixSolucion);

    checkCuda(cudaFree(d_matrix1), "cudaFree of d_matrix1");
    checkCuda(cudaFree(d_matrix2), "cudaFree of d_matrix2");
    checkCuda(cudaFree(d_matrixSolucion), "cudaFree of d_matrixSolucion");

    freeHostMatrix(h_matrix1);
    freeHostMatrix(h_matrix2);
    freeHostMatrix(h_matrixSolucion);

    return 0;
}
$ nvcc -o t2 t2.cu
$ cuda-memcheck ./t2
========= CUDA-MEMCHECK
1  1  1  1
1  1  1  1
1  1  1  1
1  1  1  1

2  2  2  2
2  2  2  2
2  2  2  2
2  2  2  2

3  3  3  3
3  3  3  3
3  3  3  3
3  3  3  3

========= ERROR SUMMARY: 0 errors
$

CUDA: Matrix + Matrix, segmentation fault when printing solution matrix in host

Answers (1)

Related Questions