Reputation: 1023
I have written the following code to sum two 4x4 matrices in cuda.
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
__global__ void Matrix_add(double* a, double* b, double* c,int n)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
int col = blockIdx.y * blockDim.y + threadIdx.y;
int index = row * n + col;
if(col<n && row <n)
c[index] = a[index] + b[index];
}
int main()
{
int n=4;
double **h_a;
double **h_b;
double **h_c;
double *d_a, *d_b, *d_c;
int size = n*n*sizeof(double);
h_a = (double **) malloc(n*sizeof(double*));
h_b = (double **) malloc(n*sizeof(double*));
h_c = (double **) malloc(n*sizeof(double*));
cudaMalloc((void**)&d_a,size);
cudaMalloc((void**)&d_b,size);
cudaMalloc((void**)&d_c,size);
int t=0;
for (t=0;t<n;t++)
{
h_a[t]= (double *)malloc(n*sizeof(double));
h_b[t]= (double *)malloc(n*sizeof(double));
h_c[t]= (double *)malloc(n*sizeof(double));
}
int i=0,j=0;
for(i=0;i<n;i++)
{
for(j=0;j<n;j++)
{
h_a[i][j]=sin(i)*sin(i);
h_b[i][j]=cos(i)*cos(i);
}
}
cudaMemcpy(d_a,h_a+n,size,cudaMemcpyHostToDevice);
cudaMemcpy(d_b,h_b+n,size,cudaMemcpyHostToDevice);
dim3 dimBlock(4,4);
dim3 dimGrid(1,1);
Matrix_add<<<dimGrid, dimBlock>>>(d_a,d_b,d_c,n);
cudaMemcpy(h_c+n,d_c,size,cudaMemcpyDeviceToHost);
for(i=0;i<n;i++)
{
for( j=0;j<n;j++)
{
printf("%f",h_c[i][j]);
printf("\t");
}
printf("\n");
}
for(i=0;i<n;i++)
{
free(h_a[i]);
free(h_b[i]);
free(h_c[i]);
}
free(h_a);
free(h_b);
free(h_c);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
Result of this addition should be a 2x2 all-ones matrix but in the result all the elements of matrix are 0. Also I get this message after getting result:
Segmentation fault (core dumped)
Can anyone please help me to find out the problem.
Thank you
Upvotes: 3
Views: 1192
Reputation: 106
Your host arrays (h_a, h_b, h_c) are not contiguous in memory, so your initial cudaMemcpy() calls will read garbage into GPU memory (apparently zeros in your case).
The reason is that your hosts arrays are not actually flat, but instead are represented as arrays of pointers. I guess to fake two-dimensional arrays in C? In any case, you either need to be more careful with your cudaMemcpy()s and copy the host arrays row-by-row, or use a flat representation on the host.
Upvotes: 6