Reputation: 661
I'm trying to perform a simple operation in CUDA, adding one matrix to another, but I get a segmentation fault when I try to check the result. Here's the code:
/* Includes, system */
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define N 15000
/* DEVICE CODE */
/* Element-wise addition kernel: d_matrixSolucion[k] = d_matrix1[k] + d_matrix2[k].
 * Each thread derives one flat index k from its block and thread coordinates (1D launch)
 * and handles that single element.
 * NOTE(review): the guard below compares against N, so only indices 0..N-1 are ever
 * written, while the host code in this listing allocates N*N elements per buffer. */
__global__ void sumaMatrices(int *d_matrix1, int *d_matrix2, int *d_matrixSolucion){
// Flat global index of this thread within the 1D grid.
int idThread = blockIdx.x*blockDim.x + threadIdx.x;
// Threads with an index of N or more do nothing.
if (idThread < N)
{
d_matrixSolucion[idThread] = d_matrix1[idThread] + d_matrix2[idThread];
}
}
/* Print the top-left 4x4 corner of `matrix` to stdout: one text line per row,
 * each value followed by a space, and a blank line after the last row.
 * `matrix` is an array of row pointers; at least 4 rows of 4 ints are read. */
__host__ void printMatrix(int **matrix)
{
    // Only a 4x4 window is shown so the output does not get too big.
    const int shown = 4;
    for (int row = 0; row < shown; ++row)
    {
        const int *cells = matrix[row];
        for (int col = 0; col < shown; ++col)
        {
            printf("%d", cells[col]);
            printf(" ");
        }
        printf("\n");
    }
    printf("\n");
}
/* HOST CODE*/
/* Host driver: allocates three N x N host matrices (as N separately malloc'ed rows) and
 * three flat N*N device buffers, copies the two inputs to the device row by row, launches
 * sumaMatrices, copies the result back row by row and prints a 4x4 corner of each matrix.
 * No return value of malloc or of any CUDA call is checked, and nothing is freed. */
int main(int argc, char** argv)
{
int i;
// Host matrices: arrays of N row pointers.
int **h_matrix1;
int **h_matrix2;
int **h_matrixSolucion;
// Device matrices: flat buffers of N*N ints.
int *d_matrix1;
int *d_matrix2;
int *d_matrixSolucion;
h_matrix1 = (int**)malloc(N * sizeof(int*));
for (i = 0; i < N; i++)
{
// NOTE(review): the row holds ints but is sized with sizeof(int*), not sizeof(int).
h_matrix1[i] = (int*)malloc(N * sizeof(int*));
}
h_matrix2 = (int**)malloc(N * sizeof(int*));
for (i = 0; i < N; i++)
{
h_matrix2[i] = (int*)malloc(N * sizeof(int*));
}
h_matrixSolucion = (int**)malloc(N * sizeof(int*));
for (i = 0; i < N; i++)
{
h_matrixSolucion[i] = (int*)malloc(N * sizeof(int*));
}
cudaMalloc((void**)& d_matrix1,N*N*sizeof(int));
cudaMalloc((void**)& d_matrix2,N*N*sizeof(int));
cudaMalloc((void**)& d_matrixSolucion,N*N*sizeof(int));
// fillMatrix / fillMatrixTo0 are not defined in this listing; presumably they
// initialise the inputs and zero the result matrix -- TODO confirm.
fillMatrix(h_matrix1);
fillMatrix(h_matrix2);
fillMatrixTo0(h_matrixSolucion);
// Host rows are separate allocations, so each row is copied on its own to
// offset i*N of the flat device buffer.
for(i = 0; i < N; i++)
{
cudaMemcpy(&d_matrix1[i*N], h_matrix1[i], N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(&d_matrix2[i*N], h_matrix2[i], N*sizeof(int), cudaMemcpyHostToDevice);
}
int tamBloque = 256;
// NOTE(review): the grid is sized from N (N/256 + 1 blocks), not from N*N.
int tamGrid = N/tamBloque + 1;
sumaMatrices<<<tamGrid, tamBloque>>>(d_matrix1, d_matrix2, d_matrixSolucion);
// bring the results back from the device
cudaThreadSynchronize();
for(i = 0; i < N; i++)
{
// NOTE(review): the byte count here is tamGrid * sizeof(int*) (h_matrixSolucion[0]
// is an int*), not N * sizeof(int) as used for the host-to-device copies above.
cudaMemcpy(h_matrixSolucion[i], &d_matrixSolucion[i*N],tamGrid*sizeof(h_matrixSolucion[0]),cudaMemcpyDeviceToHost);
}
printMatrix(h_matrix1);
printMatrix(h_matrix2);
printMatrix(h_matrixSolucion);
}
If I comment out that last line, the program doesn't give any error.
I guess the problem is that I don't store the information properly in the kernel (this line: d_matrixSolucion[idThread] = d_matrix1[idThread] + d_matrix2[idThread];), but I am pretty new to CUDA and I don't really know how to solve it.
EDIT: Now that I've changed the way I get the information back from the device, this is what it prints:
0 1 2 3
1 2 3 4
2 3 4 5
3 4 5 6
2 3 4 5
3 4 5 6
4 5 6 7
5 6 7 8
2 4 6 8
0 0 0 0
0 0 0 0
0 0 0 0
The first two matrices are the inputs and the third one is the solution, but only its first row is filled.
Upvotes: 1
Views: 60
Reputation: 152143
There are a variety of errors in your code.
The definition of fillMatrix is not shown in your question, so the corrected code below initializes the matrices inline instead.
Host rows allocated with separate malloc calls are not guaranteed to be contiguous, therefore you cannot transfer the data back in a single cudaMemcpy operation; you must use a loop, like the loop you used to transfer data to the GPU.
Your host allocations aren't quite right, but they don't present an actual problem. This:
h_matrix1[i] = (int*)malloc(N * sizeof(int*));
should be this:
h_matrix1[i] = (int*)malloc(N * sizeof(int));
and likewise for the other similar instances.
Your grid (total number of threads) sizing is not correct. Your kernel uses one thread to perform one elementwise addition. Therefore, for an NxN matrix, you need NxN threads, not just the N that you are creating and testing against.
The following code has these issues fixed and seems to work correctly for me:
$ cat t2.cu
/* Includes, system */
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define N 15000
/* DEVICE CODE */
/* Element-wise sum of two matrices stored as flat arrays of N*N ints:
 *   d_matrixSolucion[k] = d_matrix1[k] + d_matrix2[k]   for 0 <= k < N*N.
 * Expects a 1D launch; each thread handles at most one element, chosen by its
 * flat global index. Threads whose index falls past the last element do nothing. */
__global__ void sumaMatrices(int *d_matrix1, int *d_matrix2, int *d_matrixSolucion){
    const int element = blockDim.x * blockIdx.x + threadIdx.x;
    if (element >= N*N)
    {
        return;   // surplus thread beyond the end of the data
    }
    d_matrixSolucion[element] = d_matrix1[element] + d_matrix2[element];
}
/* Write the first 4 rows and 4 columns of `matrix` to stdout.
 * Every value is followed by a single space, every row ends with a newline,
 * and one extra empty line closes the output. */
__host__ void printMatrix(int **matrix)
{
    // Only a 4x4 corner is printed so the output does not get too big.
    const int side = 4;
    for (int k = 0; k < side * side; ++k)
    {
        printf("%d", matrix[k / side][k % side]);
        printf(" ");
        if (k % side == side - 1)
        {
            printf("\n");   // end of the current row
        }
    }
    printf("\n");
}
/* HOST CODE*/
/* Abort with a readable message when a CUDA runtime call reports an error.
 * `what` names the operation that was attempted. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/* Allocate an N x N host matrix as N separately malloc'ed rows and set every
 * cell to `value`. Exits the program if any allocation fails. */
static int **allocHostMatrix(int value)
{
    int **matrix = (int**)malloc(N * sizeof(int*));
    if (matrix == NULL)
    {
        fprintf(stderr, "host allocation of row table failed\n");
        exit(EXIT_FAILURE);
    }
    for (int i = 0; i < N; i++)
    {
        matrix[i] = (int*)malloc(N * sizeof(int));
        if (matrix[i] == NULL)
        {
            fprintf(stderr, "host allocation of row %d failed\n", i);
            exit(EXIT_FAILURE);
        }
        for (int j = 0; j < N; j++) matrix[i][j] = value;
    }
    return matrix;
}

/* Release a matrix obtained from allocHostMatrix. */
static void freeHostMatrix(int **matrix)
{
    for (int i = 0; i < N; i++) free(matrix[i]);
    free(matrix);
}

/* Host driver: builds two N x N input matrices (all 1s and all 2s), adds them on
 * the GPU with sumaMatrices, and prints a 4x4 corner of both inputs and of the result.
 * Every host allocation and CUDA call is checked; the program exits with a
 * diagnostic on the first failure. Returns 0 on success. */
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    const size_t rowBytes = N * sizeof(int);
    // Widen before multiplying so the byte count is computed in size_t.
    const size_t matrixBytes = (size_t)N * N * sizeof(int);

    int **h_matrix1 = allocHostMatrix(1);
    int **h_matrix2 = allocHostMatrix(2);
    int **h_matrixSolucion = allocHostMatrix(0);

    int *d_matrix1 = NULL;
    int *d_matrix2 = NULL;
    int *d_matrixSolucion = NULL;
    checkCuda(cudaMalloc((void**)&d_matrix1, matrixBytes), "cudaMalloc d_matrix1");
    checkCuda(cudaMalloc((void**)&d_matrix2, matrixBytes), "cudaMalloc d_matrix2");
    checkCuda(cudaMalloc((void**)&d_matrixSolucion, matrixBytes), "cudaMalloc d_matrixSolucion");

    // Host rows are independent allocations (not contiguous), so each row is
    // copied separately into its slot of the flat device buffer.
    for (int i = 0; i < N; i++)
    {
        const size_t offset = (size_t)i * N;
        checkCuda(cudaMemcpy(d_matrix1 + offset, h_matrix1[i], rowBytes, cudaMemcpyHostToDevice),
                  "copy h_matrix1 row to device");
        checkCuda(cudaMemcpy(d_matrix2 + offset, h_matrix2[i], rowBytes, cudaMemcpyHostToDevice),
                  "copy h_matrix2 row to device");
    }

    // One thread per element: N*N threads, rounded up to whole blocks.
    int tamBloque = 256;
    int tamGrid = (N*N + tamBloque - 1) / tamBloque;
    sumaMatrices<<<tamGrid, tamBloque>>>(d_matrix1, d_matrix2, d_matrixSolucion);
    checkCuda(cudaGetLastError(), "sumaMatrices launch");          // bad launch configuration
    checkCuda(cudaDeviceSynchronize(), "sumaMatrices execution");  // faults inside the kernel

    for (int i = 0; i < N; i++)
    {
        checkCuda(cudaMemcpy(h_matrixSolucion[i], d_matrixSolucion + (size_t)i * N, rowBytes,
                             cudaMemcpyDeviceToHost),
                  "copy result row to host");
    }

    printMatrix(h_matrix1);
    printMatrix(h_matrix2);
    printMatrix(h_matrixSolucion);

    checkCuda(cudaFree(d_matrix1), "cudaFree d_matrix1");
    checkCuda(cudaFree(d_matrix2), "cudaFree d_matrix2");
    checkCuda(cudaFree(d_matrixSolucion), "cudaFree d_matrixSolucion");
    freeHostMatrix(h_matrix1);
    freeHostMatrix(h_matrix2);
    freeHostMatrix(h_matrixSolucion);
    return 0;
}
$ nvcc -o t2 t2.cu
$ cuda-memcheck ./t2
========= CUDA-MEMCHECK
1 1 1 1
1 1 1 1
1 1 1 1
1 1 1 1
2 2 2 2
2 2 2 2
2 2 2 2
2 2 2 2
3 3 3 3
3 3 3 3
3 3 3 3
3 3 3 3
========= ERROR SUMMARY: 0 errors
$
Upvotes: 2