wrong results in cuda

Question

I try to code a simple example with cuda C, I follow a screencast about this but I have wrong result

this is an the example :

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include 
#include
#define SIZE    1024

__global__ void VectorAdd(int *a, int *b, int *c, int n)
{
    int i = threadIdx.x;

    if (i < n){
        c[i] = a[i] + b[i];
    }

}

int main()
{
    int *a, *b, *c;
    int *d_a, *d_b, *d_c;
    cudaError_t cudaStatus;
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU         installed?");

    }
    a = (int *)malloc(SIZE*sizeof(int));
    b = (int *)malloc(SIZE*sizeof(int));
    c = (int *)malloc(SIZE*sizeof(int));

    cudaMalloc(&d_a, SIZE*sizeof(int));
    cudaMalloc(&d_b, SIZE*sizeof(int));
    cudaMalloc(&d_c, SIZE*sizeof(int));

    for (int i = 0; i < SIZE; i++)
    {
        a[i] = i;
        b[i] = i;
        c[i] = 0;
    }

    cudaMemcpy(d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_c, c, SIZE*sizeof(int), cudaMemcpyHostToDevice);

    VectorAdd<<< 1, SIZE >>>(d_a, d_b, d_c, SIZE);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
    }
    cudaMemcpy(c, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost);

    for (int i = 0; i < 10; ++i)
        printf("c[%d] = %d
", i, c[i]);

    free(a);
    free(b);
    free(c);
    enter code here
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}

the result is :

c[0]=0
c[1]=0
c[2]=0
c[3]=0
c[4]=0
c[5]=0
c[6]=0
c[7]=0
c[8]=0
c[9]=0

but I expect this result :

c[0]=0
c[1]=2
c[2]=4
c[3]=6
c[4]=8
c[5]=10
c[6]=12
c[7]=14
c[8]=16
c[9]=18

please any one can help about this !

pQB · Accepted Answer

I did some wrong comments, so I will try fix my errors and give a correct answer here. First all, please, attend the comments related to proper CUDA error checking.

Second, the Maximum Thread Block Size for a GT210 (CC 1.2) is 512, not 256 as I commented in a moment of confusion.

That said, You should get the following error by doing the mentioned error checking:

GPUassert: invalid device function

In this case, this error indicates the architecture for which you have compiled your code is higher than the one you are using to run the example. You are compiling the example for devices of compute capability = 2.0 or above (as you commented), but then you execute the code in your GT210 which has a compute capability = 1.2.

So, first, re-compile your example for the corresponding architecture. Change the

-gencode=arch=compute_20 TO -gencode=arch=compute_12

Once you have successfully compiled the example for your architecture, you will get the following error (because you ALREADY are doing proper error checking ;)

GPUassert: invalid configuration argument

In this case, the error indicates that you are using more resources than the ones available for your architecture (compute capability 1.2) because you are trying to launch blocks of SIZE = 1024 but the Maximum Thread Block Size is 512, that is, you can not configure a block with more than 512 threads.

So, adjust the SIZE to 512 and everything should work as expected. Below is your example, doing proper CUDA error checking.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include 
#include
#define SIZE    1024

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d
", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

__global__ void VectorAdd(int *a, int *b, int *c, int n)
{
    int i = threadIdx.x;

    if (i < n){
        c[i] = a[i] + b[i];
    }
}

int main()
{
    int *a, *b, *c;
    int *d_a, *d_b, *d_c;
    cudaError_t cudaStatus;
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU         installed?");
    }
    a = (int *)malloc(SIZE*sizeof(int));
    b = (int *)malloc(SIZE*sizeof(int));
    c = (int *)malloc(SIZE*sizeof(int));

    gpuErrchk( cudaMalloc(&d_a, SIZE*sizeof(int)) );
    gpuErrchk( cudaMalloc(&d_b, SIZE*sizeof(int)) );
    gpuErrchk( cudaMalloc(&d_c, SIZE*sizeof(int)) );

    for (int i = 0; i < SIZE; i++)
    {
        a[i] = i;
        b[i] = i;
        c[i] = 0;
    }

    gpuErrchk( cudaMemcpy(d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice) );
    gpuErrchk( cudaMemcpy(d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice) );
    gpuErrchk( cudaMemcpy(d_c, c, SIZE*sizeof(int), cudaMemcpyHostToDevice) );

    VectorAdd<<< 1, SIZE >>>(d_a, d_b, d_c, SIZE);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

    gpuErrchk( cudaMemcpy(c, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost) );

    for (int i = 0; i < 10; ++i)
        printf("c[%d] = %d
", i, c[i]);

    free(a);
    free(b);
    free(c);
    // enter code here 
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}

wrong results in cuda

Answers (1)

Related Questions