Reputation: 269
As the compute ability is 2.1, the atomicAdd
and atomicMax
operations do not support double precision, then I define both functions based on some answers on stack overflow.
It is strange that the atomicAdd
function works well but the atomicMax
doesn't work, here is my code.
The test of my code is to generate random number on each block, and then sum the random numbers on each block, we have block sum, I want to test the atomicAdd
and atomicMax
on the block sum.
#include <iostream>
#include <curand.h>
#include <curand_kernel.h>
#include <stdio.h>
#include <stdlib.h>
#define num_of_blocks 2
#define threads_per_block 2
#define tot_threads 4
__device__ double gsum[num_of_blocks];
__device__ double dev_sum;
__device__ double dev_max;
// set seed for random number generator
__global__ void initcuRand(curandState* globalState, unsigned long seed){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
curand_init(seed, idx, 0, &globalState[idx]);
}
// atomiMax for double
__device__ double atomicMax_d(double* address, double val)
{
unsigned long long int* address_as_i = (unsigned long long int*)address;
unsigned long long int old = *address_as_i, assumed;
do {
assumed = old;
old = ::atomicCAS(address_as_i, assumed, __double_as_longlong(::fmax(val, __longlong_as_double(assumed))));
} while (assumed != old);
return __longlong_as_double(old);
}
// atomicAdd for double
__device__ double atomicAdd_d(double* address, double val)
{
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do{
assumed = old;
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
}while(assumed != old);
return __longlong_as_double(old);
}
__global__ void kernel(curandState *globalState){
// global id
int gidx = threadIdx.x + blockIdx.x * blockDim.x;
// local id
int lidx = threadIdx.x;
// creat shared memory to store seeds
__shared__ curandState localState[tot_threads];
__shared__ double srandnum[threads_per_block];
// copy global seed to local
localState[lidx] = globalState[gidx];
//synchronize the local threads writing to the local memory cache
__syncthreads();
// generate random number from normal distribution in shared memory
srandnum[lidx] = curand_normal(&localState[lidx]);
__syncthreads();
if(lidx == 0){srandnum[lidx] += srandnum[lidx + 1];} // sum of each block
if(lidx == 0){gsum[blockIdx.x] = srandnum[lidx];} // copy the sums back to global memory
__threadfence();
if( gidx < num_of_blocks){
atomicAdd_d(&dev_sum, gsum[gidx]);
}
if( gidx < num_of_blocks){
atomicMax_d(&dev_max, gsum[gidx]);
}
if( gidx == 0){
printf("Sum is: %lf\n", dev_sum);
}
if( gidx == 1){
printf("Max is: %lf\n", dev_max);
}
}
int main(){
// set seed on device
curandState *globalState;
cudaMalloc((void**)&globalState, tot_threads*sizeof(curandState));
initcuRand<<<num_of_blocks, threads_per_block>>>(globalState, 1);
// launch kernel
kernel<<<num_of_blocks, threads_per_block>>>(globalState);
double randnum[num_of_blocks];
cudaMemcpyFromSymbol(randnum, gsum, num_of_blocks*sizeof(double), 0, cudaMemcpyDeviceToHost);
std::cout << "Sum of each block:\n";
for (int i = 0; i < num_of_blocks; ++i){
std::cout << randnum[i] << std::endl;
}
cudaFree(globalState);
return 0;
}
The result I get is
Sum is: -0.898329
Max is: 0.000000
Sum of each block:
-0.0152994
-0.88303
From the result, I know that the atomicAdd
function works but the atomicMax
function doesn't work, I have no idea of this. Thanks beforehand.
Upvotes: 0
Views: 680
Reputation: 152164
You don't ever initialize dev_max
or dev_sum
. You can't sensibly do these types of atomic operations on them if they don't start with a known value.
Try something like this instead:
__device__ double dev_sum = 0.0;
__device__ double dev_max = -1e99;
and I think you'll be happier with the results.
Upvotes: 1