Atomic Operation failed in CUDA

Question

As the compute ability is 2.1, the atomicAdd and atomicMax operations do not support double precision, then I define both functions based on some answers on stack overflow.

It is strange that the atomicAdd function works well but the atomicMax doesn't work, here is my code.

The test of my code is to generate random number on each block, and then sum the random numbers on each block, we have block sum, I want to test the atomicAdd and atomicMax on the block sum.

#include 
#include 
#include 
#include 
#include 


#define num_of_blocks 2
#define threads_per_block 2
#define tot_threads 4


__device__ double gsum[num_of_blocks];

__device__ double dev_sum;

__device__ double dev_max;

// set seed for random number generator
__global__ void initcuRand(curandState* globalState, unsigned long seed){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curand_init(seed, idx, 0, &globalState[idx]);
}

// atomiMax for double
__device__ double atomicMax_d(double* address, double val)
{
    unsigned long long int* address_as_i = (unsigned long long int*)address;
    unsigned long long int old = *address_as_i, assumed;
    do {
        assumed = old;
        old = ::atomicCAS(address_as_i, assumed, __double_as_longlong(::fmax(val, __longlong_as_double(assumed))));
    } while (assumed != old);
    return __longlong_as_double(old);
}

// atomicAdd for double
__device__ double atomicAdd_d(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do{
        assumed = old;
        old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
    }while(assumed != old);
    return __longlong_as_double(old);
}

__global__ void kernel(curandState *globalState){
    // global id
    int gidx    = threadIdx.x + blockIdx.x * blockDim.x;
    // local id
    int lidx    = threadIdx.x;

    // creat shared memory to store seeds
    __shared__ curandState localState[tot_threads];

    __shared__ double srandnum[threads_per_block];

    // copy global seed to local
    localState[lidx]    = globalState[gidx];

    //synchronize the local threads writing to the local memory cache
    __syncthreads();

    // generate random number from normal distribution in shared memory
    srandnum[lidx]  = curand_normal(&localState[lidx]);
    __syncthreads();

    if(lidx == 0){srandnum[lidx] += srandnum[lidx + 1];}   // sum of each block
    if(lidx == 0){gsum[blockIdx.x] = srandnum[lidx];}      // copy the sums back to global memory

    __threadfence();

    if( gidx < num_of_blocks){
        atomicAdd_d(&dev_sum, gsum[gidx]);
    }

    if( gidx < num_of_blocks){
        atomicMax_d(&dev_max, gsum[gidx]);
    }

    if( gidx == 0){
        printf("Sum is: %lf
", dev_sum);
    }

    if( gidx == 1){
        printf("Max is: %lf
", dev_max);
    }
}


int main(){
    // set seed on device
    curandState *globalState;
    cudaMalloc((void**)&globalState, tot_threads*sizeof(curandState));
    initcuRand<<>>(globalState, 1);

    // launch kernel
    kernel<<>>(globalState);
    double randnum[num_of_blocks];

    cudaMemcpyFromSymbol(randnum, gsum, num_of_blocks*sizeof(double), 0, cudaMemcpyDeviceToHost);

    std::cout << "Sum of each block:
";
    for (int i = 0; i < num_of_blocks; ++i){
        std::cout << randnum[i] << std::endl;
    }

    cudaFree(globalState);
    return 0;
}

The result I get is

Sum is: -0.898329
Max is: 0.000000
Sum of each block:
-0.0152994
-0.88303

From the result, I know that the atomicAdd function works but the atomicMax function doesn't work, I have no idea of this. Thanks beforehand.

Robert Crovella · Accepted Answer

You don't ever initialize dev_max or dev_sum. You can't sensibly do these types of atomic operations on them if they don't start with a known value.

Try something like this instead:

__device__ double dev_sum = 0.0;

__device__ double dev_max = -1e99;

and I think you'll be happier with the results.

Atomic Operation failed in CUDA

Answers (1)

Related Questions