cuda kernel seems not to be called

Question

Why does the following simple CUDA program fail on a device with compute capability 5.2?

#include <cmath>
#include <cstdlib>
#include <iostream>
#define N 1

// Element-wise addition kernel: each thread computes one index i and, when
// i < N, stores a[i] + b[i] into c[i].
//
// a, b : input arrays, read at index i
// c    : output array, written at index i
__global__ void vector_addition( int *a, int *b, int *c )
{
  // NOTE(review): this is the line the accepted answer identifies as wrong.
  // It ADDS blockDim.x to blockIdx.x instead of multiplying them, so the
  // smallest index any thread can produce is blockDim.x (1024 with the launch
  // configuration used in main), never 0.
  int i = blockDim.x + blockIdx.x + threadIdx.x;
  // Bounds guard. With N == 1 and the index above always >= blockDim.x, no
  // thread passes this test, so c is never written.
  if ( i < N )
    c[ i ] = a[ i ] + b[ i ];
}

// Terminates the program with a diagnostic if a CUDA runtime call failed.
// status : return value of the CUDA runtime call being checked
// what   : short description of the call, included in the message
static void check_cuda( cudaError_t status, const char *what )
{
  if ( status != cudaSuccess ) {
    std::cerr << "CUDA error in " << what << ": "
              << cudaGetErrorString( status ) << "\n";
    std::exit( EXIT_FAILURE );
  }
}

// Host driver: fills A with 1 and B with 2, runs vector_addition on the
// device, copies the result back into C and prints "error" for every element
// that is not 3.
int main()
{
  size_t bytes = N*sizeof( int );

  // Host buffers.
  int *A = (int *)malloc( bytes );
  int *B = (int *)malloc( bytes );
  int *C = (int *)malloc( bytes );
  if ( A == NULL || B == NULL || C == NULL ) {
    std::cerr << "host allocation failed\n";
    free( A ); free( B ); free( C );
    return EXIT_FAILURE;
  }

  // Device buffers.
  int *d_A, *d_B, *d_C;
  check_cuda( cudaMalloc( &d_A, bytes ), "cudaMalloc d_A" );
  check_cuda( cudaMalloc( &d_B, bytes ), "cudaMalloc d_B" );
  check_cuda( cudaMalloc( &d_C, bytes ), "cudaMalloc d_C" );

  for ( int i = 0; i < N; i++ ) {
    A[ i ] = 1; B[ i ] = 2; C[ i ] = 0;
  }

  check_cuda( cudaMemcpy( d_A, A, bytes, cudaMemcpyHostToDevice ), "copy A to device" );
  check_cuda( cudaMemcpy( d_B, B, bytes, cudaMemcpyHostToDevice ), "copy B to device" );
  // The host-side zeroing of C is never copied to the device, so d_C would
  // otherwise hold whatever was left in that memory. Zero it so the check
  // below does not depend on leftover contents when the kernel writes nothing.
  check_cuda( cudaMemset( d_C, 0, bytes ), "cudaMemset d_C" );

  int thr_per_blk = 1024;
  // Integer ceil-division: avoids the float rounding of ceil( float( N ) / ... ).
  int blk_in_grid = ( N + thr_per_blk - 1 ) / thr_per_blk;

  vector_addition<<< blk_in_grid, thr_per_blk >>>( d_A, d_B, d_C );
  // A launch does not return a status; query it explicitly, then wait for the
  // kernel so that execution errors are reported here rather than later.
  check_cuda( cudaGetLastError(), "vector_addition launch" );
  check_cuda( cudaDeviceSynchronize(), "vector_addition execution" );

  check_cuda( cudaMemcpy( C, d_C, bytes, cudaMemcpyDeviceToHost ), "copy C to host" );

  // Every element is expected to be 1 + 2 = 3.
  for ( int i = 0; i < N; i++ ) {
    if ( C[ i ] != 3 ) {
      std::cout << "error\n";
    }
  }

  free( A ); free( B ); free( C );
  check_cuda( cudaFree( d_A ), "cudaFree d_A" );
  check_cuda( cudaFree( d_B ), "cudaFree d_B" );
  check_cuda( cudaFree( d_C ), "cudaFree d_C" );

  return 0;
}

The program prints "error", i.e. the value copied back into C is not the expected 3.

Robert Crovella · Accepted Answer

This line in your kernel is not correct:

int i = blockDim.x + blockIdx.x + threadIdx.x;

That is not the proper way to generate a 1D index. It should be:

int i = blockDim.x * blockIdx.x + threadIdx.x;

With your incorrect indexing, the first thread, which should generate a 0 for a globally unique index, generates 1024+0+0 = 1024. This fails the if test in your kernel, so no threads actually do anything.

cuda kernel seems not to be called

Answers (1)

Related Questions