Reputation: 11
I try to code a simple example with cuda C, I follow a screencast about this but I have wrong result
this is an the example :
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<windows.h>
#define SIZE 1024
__global__ void VectorAdd(int *a, int *b, int *c, int n)
{
int i = threadIdx.x;
if (i < n){
c[i] = a[i] + b[i];
}
}
int main()
{
int *a, *b, *c;
int *d_a, *d_b, *d_c;
cudaError_t cudaStatus;
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
}
a = (int *)malloc(SIZE*sizeof(int));
b = (int *)malloc(SIZE*sizeof(int));
c = (int *)malloc(SIZE*sizeof(int));
cudaMalloc(&d_a, SIZE*sizeof(int));
cudaMalloc(&d_b, SIZE*sizeof(int));
cudaMalloc(&d_c, SIZE*sizeof(int));
for (int i = 0; i < SIZE; i++)
{
a[i] = i;
b[i] = i;
c[i] = 0;
}
cudaMemcpy(d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_c, c, SIZE*sizeof(int), cudaMemcpyHostToDevice);
VectorAdd<<< 1, SIZE >>>(d_a, d_b, d_c, SIZE);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
cudaMemcpy(c, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < 10; ++i)
printf("c[%d] = %d\n", i, c[i]);
free(a);
free(b);
free(c);
enter code here
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
return 0;
}
the result is :
c[0]=0
c[1]=0
c[2]=0
c[3]=0
c[4]=0
c[5]=0
c[6]=0
c[7]=0
c[8]=0
c[9]=0
but I expect this result :
c[0]=0
c[1]=2
c[2]=4
c[3]=6
c[4]=8
c[5]=10
c[6]=12
c[7]=14
c[8]=16
c[9]=18
please any one can help about this !
Upvotes: 0
Views: 1489
Reputation: 3137
I did some wrong comments, so I will try fix my errors and give a correct answer here. First all, please, attend the comments related to proper CUDA error checking.
Second, the Maximum Thread Block Size for a GT210 (CC 1.2) is 512, not 256 as I commented in a moment of confusion.
That said, You should get the following error by doing the mentioned error checking:
GPUassert: invalid device function
In this case, this error indicates the architecture for which you have compiled your code is higher than the one you are using to run the example. You are compiling the example for devices of compute capability = 2.0
or above (as you commented), but then you execute the code in your GT210 which has a compute capability = 1.2
.
So, first, re-compile your example for the corresponding architecture. Change the
-gencode=arch=compute_20 TO -gencode=arch=compute_12
Once you have successfully compiled the example for your architecture, you will get the following error (because you ALREADY are doing proper error checking ;)
GPUassert: invalid configuration argument
In this case, the error indicates that you are using more resources than the ones available for your architecture (compute capability 1.2) because you are trying to launch blocks of SIZE = 1024
but the Maximum Thread Block Size is 512
, that is, you can not configure a block with more than 512 threads.
So, adjust the SIZE to 512 and everything should work as expected. Below is your example, doing proper CUDA error checking.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<windows.h>
#define SIZE 1024
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void VectorAdd(int *a, int *b, int *c, int n)
{
int i = threadIdx.x;
if (i < n){
c[i] = a[i] + b[i];
}
}
int main()
{
int *a, *b, *c;
int *d_a, *d_b, *d_c;
cudaError_t cudaStatus;
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
}
a = (int *)malloc(SIZE*sizeof(int));
b = (int *)malloc(SIZE*sizeof(int));
c = (int *)malloc(SIZE*sizeof(int));
gpuErrchk( cudaMalloc(&d_a, SIZE*sizeof(int)) );
gpuErrchk( cudaMalloc(&d_b, SIZE*sizeof(int)) );
gpuErrchk( cudaMalloc(&d_c, SIZE*sizeof(int)) );
for (int i = 0; i < SIZE; i++)
{
a[i] = i;
b[i] = i;
c[i] = 0;
}
gpuErrchk( cudaMemcpy(d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice) );
gpuErrchk( cudaMemcpy(d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice) );
gpuErrchk( cudaMemcpy(d_c, c, SIZE*sizeof(int), cudaMemcpyHostToDevice) );
VectorAdd<<< 1, SIZE >>>(d_a, d_b, d_c, SIZE);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
gpuErrchk( cudaMemcpy(c, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost) );
for (int i = 0; i < 10; ++i)
printf("c[%d] = %d\n", i, c[i]);
free(a);
free(b);
free(c);
// enter code here
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
return 0;
}
Upvotes: 2