Reputation: 376
I am somewhat confused, I have used the warp reduction as outlined by the online tutorial for quite a while now and it never caused problems. These are the snippets:
while (r < total_rotations){
rot_index(d_refinements, h_num_refinements, abg,&rot_linear_index, r,s);
concat[threadIdx.x] = min(concat[threadIdx.x],score_offset[rot_linear_index]);
r += blockDim.x;
}
__syncthreads();
if (BLOCKSIZE >= 1024){if (tid < 512) { concat[tid] = min(concat[tid],concat[tid + 512]);} __syncthreads();}
if (BLOCKSIZE >= 512){if (tid < 256) { concat[tid] = min(concat[tid],concat[tid + 256]);} __syncthreads();}
if (BLOCKSIZE >= 256){if (tid < 128) { concat[tid] = min(concat[tid],concat[tid + 128]);} __syncthreads();}
if (BLOCKSIZE >= 128){if (tid < 64) { concat[tid] = min(concat[tid],concat[tid + 64]);} __syncthreads();}
if (tid < 32) min_warp_reduce<float,BLOCKSIZE>(concat,tid); __syncthreads();
if (tid==0){
min_offset[0] = concat[0];
}
And the __device__
code.
template <class T, unsigned int blockSize>
__device__
void min_warp_reduce(volatile T * sdata, int tid){
if (blockSize >= 64) sdata[tid] = min(sdata[tid],sdata[tid + 32]);
if (blockSize >= 32) sdata[tid] = min(sdata[tid],sdata[tid + 16]);
if (blockSize >= 16) sdata[tid] = min(sdata[tid],sdata[tid + 8]);
if (blockSize >= 8) sdata[tid] = min(sdata[tid],sdata[tid + 4]);
if (blockSize >= 4) sdata[tid] = min(sdata[tid],sdata[tid + 2]);
if (blockSize >= 2) sdata[tid] = min(sdata[tid],sdata[tid + 1]);
}
To me I have copied the tutorial code faithfully, yet a race condition check tells me that there are several conflicts. What am I missing?
Upvotes: 0
Views: 158
Reputation: 152164
The tutorial you are referring to is quite old and does not take into account the Volta execution model. It assumes that warps will remain in lockstep.
The volta execution model does not guarantee this particularly in the presence of conditional code.
You should be able to fix this (get rid of the race check errors) with addition of __syncwarp()
:
__device__
void min_warp_reduce(volatile T * sdata, int tid){
if (blockSize >= 64) {sdata[tid] = min(sdata[tid],sdata[tid + 32]); __syncwarp();}
if (blockSize >= 32) {sdata[tid] = min(sdata[tid],sdata[tid + 16]); __syncwarp();}
if (blockSize >= 16) {sdata[tid] = min(sdata[tid],sdata[tid + 8]); __syncwarp();}
if (blockSize >= 8) {sdata[tid] = min(sdata[tid],sdata[tid + 4]); __syncwarp();}
if (blockSize >= 4) {sdata[tid] = min(sdata[tid],sdata[tid + 2]); __syncwarp();}
if (blockSize >= 2) {sdata[tid] = min(sdata[tid],sdata[tid + 1]); __syncwarp();}
}
__syncwarp()
implies a memory barrier, so you may optionally remove the volatile
decorator if you wish; but that is not required for the correctness/race being discussed here.
If that doesn't fix it, you will need to provide a mcve
Upvotes: 3