CUDA atomicAdd across blocks

Question

I cannot get the atomicAdd function to work over all blocks. It turns out that the following kernel code gives me the total number of threads in a block (< 5000 for example):

// Adds 1.0f to *result once for every thread whose flat global index is
// below 5000. The index is built from blockIdx.x, blockDim.x and threadIdx.x,
// so the guard applies across the whole 1D grid, not to a single block.
// result: pointer to the single float accumulator updated via atomicAdd.
__global__ void kernelCode(float *result)
{
    int index = threadIdx.x+blockIdx.x*blockDim.x;
    // Threads with an index of 5000 or more do nothing.
    if (index < 5000)
    {
        atomicAdd(result, 1.0f);
    }
}

Can you please tell me how to add something to a value but without allocating the whole array of 1.0f? This is because I'm using this code on a system with very limited resources - every bit counts.

Robert Crovella · Accepted Answer

This code can work across multiple blocks without allocating an array of 1.0f. The if (index < 5000) statement is not intended to limit you to a single threadblock. It is intended to make sure that only legitimate threads in the entire grid take part in the operation.

Try something like this:

#include <cstdio>
#include <cstdlib>
#include <iostream>
// Upper bound on the global thread index allowed to contribute in kernelCode;
// also used by main to compute how many blocks to launch.
#define TOTAL_SIZE 100000
// Threads per block used for the kernel launch in main.
#define nTPB 256

// Checks the CUDA runtime's last recorded error via cudaGetLastError().
// If it is not cudaSuccess, prints `msg`, the CUDA error string and the
// file/line of the macro invocation to stderr, then terminates with exit(1).
// msg: C string describing the operation that was just attempted.
// Comments are kept outside the macro body because a `//` comment on a
// backslash-continued line would swallow the following lines.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t err_ = cudaGetLastError(); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(err_), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

// Each thread whose flat global index is below TOTAL_SIZE atomically adds
// 1.0f to the single float pointed to by `result`; all other threads exit
// without touching memory.
__global__ void kernelCode(float *result)
{
    // Flat index of this thread across the whole 1D grid.
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have nothing to contribute.
    if (gid >= TOTAL_SIZE)
        return;

    atomicAdd(result, 1.0f);
}

// Host driver: allocates one float on the device, zeroes it, launches
// kernelCode with enough nTPB-sized blocks to cover TOTAL_SIZE threads,
// copies the accumulated value back, frees the device buffer and prints
// the result. Each CUDA call is followed by cudaCheckErrors, which exits
// the process with status 1 if an error was recorded.
int main(){

  float h_result, *d_result;
  cudaMalloc((void **)&d_result, sizeof(float));
  cudaCheckErrors("cuda malloc fail");
  h_result = 0.0f;
  cudaMemcpy(d_result, &h_result, sizeof(float), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy 1 fail");
  // Ceiling division so a partial final block is still launched.
  kernelCode<<<(TOTAL_SIZE+nTPB-1)/nTPB, nTPB>>>(d_result);
  // Launch-configuration errors are reported immediately after the launch.
  cudaCheckErrors("kernel launch fail");
  // Errors raised while the kernel runs surface at the synchronization point.
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel fail");
  cudaMemcpy(&h_result, d_result, sizeof(float), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy 2 fail");
  // Release the device allocation; the original never freed it.
  cudaFree(d_result);
  cudaCheckErrors("cudaFree fail");
  std::cout<< "result = " << h_result << std::endl;
  return 0;
}

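You can change TOTAL_SIZE to any number that will conveniently fit in a float. Keep in mind that a float represents consecutive integers exactly only up to 2^24 (16,777,216); beyond that, adding 1.0f no longer changes the sum.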

Note that I typed this code in the browser, so there may be typographical errors.

CUDA atomicAdd across blocks

Answers (1)

Related Questions