Reputation: 21
I am having problems with my first CUDA program. In it I try to add two arrays but when I print the results are completly wrong.
This is my code:
#include <stdio.h>
const int N=10;
__global__ void add(int *a, int *b, int *c) {
int tid = threadIdx.x;
c[tid] = a[tid] + b[tid];
}
int main(){
int a[N], b[N], c[N];
int *dev_a, *dev_b, *dev_c;
cudaMalloc( (void**)&dev_a, N * sizeof(int) );
cudaMalloc( (void**)&dev_b, N * sizeof(int) );
cudaMalloc( (void**)&dev_c, N * sizeof(int) );
for (int i=0; i<N; i++) {
a[i] = -i; b[i] = i * i;
}
cudaMemcpy ( dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice );
cudaMemcpy ( dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice );
add<<<1,N>>>(dev_a, dev_b, dev_c);
cudaMemcpy(c, dev_c, N * sizeof(int),cudaMemcpyDeviceToHost );
for (int i=0; i<N; i++) {
printf("%d + %d = %d\n", a[i],b[i],c[i]);
}
cudaFree (dev_a); cudaFree (dev_b); cudaFree (dev_c);
return 0;
}
As you will see I try to add A and B in C, but, I don't know why,it don't work.
And this is the output:
0 + 0 = 0
-1 + 1 = 0
-2 + 4 = 1139031032
-3 + 9 = 32737
-4 + 16 = 0
-5 + 25 = 0
-6 + 36 = 1203890432
-7 + 49 = 32767
-8 + 64 = 6299112
-9 + 81 = 0
Finally, I apologize for my English ... after learn cuda the next thing I will do is improve it.
Upvotes: 1
Views: 5133
Reputation: 151799
There is something wrong with your machine. Please do error checking, and you will get an idea of what it is.
When I compile and run your exact code, I get the following:
$ ./t81
0 + 0 = 0
-1 + 1 = 0
-2 + 4 = 2
-3 + 9 = 6
-4 + 16 = 12
-5 + 25 = 20
-6 + 36 = 30
-7 + 49 = 42
-8 + 64 = 56
-9 + 81 = 72
$
Here is your code modified with error checking. If you compile it and run it, I'm sure you'll discover a problem with your machine:
#include <stdio.h>
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
const int N=10;
__global__ void add(int *a, int *b, int *c) {
int tid = threadIdx.x;
c[tid] = a[tid] + b[tid];
}
int main(){
int a[N], b[N], c[N];
int *dev_a, *dev_b, *dev_c;
cudaMalloc( (void**)&dev_a, N * sizeof(int) );
cudaMalloc( (void**)&dev_b, N * sizeof(int) );
cudaMalloc( (void**)&dev_c, N * sizeof(int) );
cudaCheckErrors("cudamalloc fail");
for (int i=0; i<N; i++) {
a[i] = -i; b[i] = i * i;
}
cudaMemcpy ( dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice );
cudaMemcpy ( dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice );
cudaCheckErrors("cuda memcpy fail");
add<<<1,N>>>(dev_a, dev_b, dev_c);
cudaMemcpy(c, dev_c, N * sizeof(int),cudaMemcpyDeviceToHost );
cudaCheckErrors("cudamemcpy or cuda kernel fail");
for (int i=0; i<N; i++) {
printf("%d + %d = %d\n", a[i],b[i],c[i]);
}
cudaFree (dev_a); cudaFree (dev_b); cudaFree (dev_c);
return 0;
}
This poster did the same thing and discovered a problem with their machine.
Upvotes: 3