I am trying to implement my own reduce sum for big 1D arrays. I came up with a reduce kernel and a way to call it several times for a step-by-step reduction down to a single value. I know this is not the optimal way to compute this (with my code it may get to the point where a kernel launch is needed just to add 3 values), but let's set that aside for a moment and work with it.
Basically, in short: each call to the reduce kernel reduces the array by a factor of MAXTHREADS, in this case 1024. So the size of the array shrinks by a factor of 1024 each time. When the size is smaller than 1024 it seems to work properly, but for bigger sizes it miserably fails to get the right sum.
This happens with all of the array sizes I tried. What am I missing?
I will also gladly accept comments about the quality of the code but mainly interested in fixing it.
Below I have posted the whole kernel and a snippet of the kernel call. If I have missed some relevant part of the kernel call, please comment and I will fix it.
The original code has error checks; all the kernels always run and never return a CUDA error.
__global__ void reduce6(float *g_idata, float *g_odata, unsigned int n){
    extern __shared__ float sdata[];

    // perform first level of reduction,
    // reading from global memory, writing to shared memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(MAXTREADS*2) + tid;
    unsigned int gridSize = MAXTREADS*2*gridDim.x;
    //blockSize==MAXTHREADS

    sdata[tid] = 0;
    float mySum = 0;

    if (tid>=n) return;

    if ( (gridDim.x>0) & ((i+MAXTREADS)<n)){
        while (i < n) {
            sdata[tid] += g_idata[i] + g_idata[i+MAXTREADS];
            i += gridSize;
        }
    }else{
        sdata[tid] += g_idata[i];
    }
    __syncthreads();

    // do reduction in shared mem
    if (tid < 512)
        sdata[tid] += sdata[tid + 512];
    __syncthreads();
    if (tid < 256)
        sdata[tid] += sdata[tid + 256];
    __syncthreads();
    if (tid < 128)
        sdata[tid] += sdata[tid + 128];
    __syncthreads();
    if (tid < 64)
        sdata[tid] += sdata[tid + 64];
    __syncthreads();

#if (__CUDA_ARCH__ >= 300)
    if (tid < 32)
    {
        // Fetch final intermediate sum from 2nd warp
        mySum = sdata[tid] + sdata[tid + 32];
        // Reduce final warp using shuffle
        for (int offset = warpSize/2; offset > 0; offset /= 2)
            mySum += __shfl_down(mySum, offset);
    }
    sdata[0] = mySum;
#else
    // fully unroll reduction within a single warp
    if (tid < 32) {
        warpReduce(sdata, tid);
    }
#endif
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

__device__ void warpReduce(volatile float *sdata, unsigned int tid) {
    sdata[tid] += sdata[tid + 32];
    sdata[tid] += sdata[tid + 16];
    sdata[tid] += sdata[tid + 8];
    sdata[tid] += sdata[tid + 4];
    sdata[tid] += sdata[tid + 2];
    sdata[tid] += sdata[tid + 1];
}
And the host code, with n set to total_pixels:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

#define MAXTREADS 1024

__global__ void initKernel(float * devPtr, const float val, const size_t nwords)
{
    //https://stackoverflow.com/questions/10589925/initialize-device-array-in-cuda
    int tidx = threadIdx.x + blockDim.x * blockIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (; tidx < nwords; tidx += stride)
        devPtr[tidx] = val;
}

int main()
{
    size_t total_pixels = 5000;
    unsigned long int n = (unsigned long int)total_pixels;
    float* d_image, *d_aux;
    float sum;
    cudaMalloc(&d_image, total_pixels*sizeof(float));
    cudaMalloc(&d_aux, sizeof(float)*(n + MAXTREADS - 1) / MAXTREADS);

    for (int i = 0; i < 10; i++){
        sum = 0;
        cudaMemset(&d_image, 1, total_pixels*sizeof(float));
        int dimblockRed = MAXTREADS;
        int dimgridRed = (total_pixels + MAXTREADS - 1) / MAXTREADS;
        int reduceCont = 0;
        initKernel << < dimgridRed, dimblockRed >> >(d_image, 1.0, total_pixels);

        while (dimgridRed > 1) {
            if (reduceCont % 2 == 0){
                reduce6 << <dimgridRed, dimblockRed, MAXTREADS*sizeof(float) >> >(d_image, d_aux, n);
            }
            else{
                reduce6 << <dimgridRed, dimblockRed, MAXTREADS*sizeof(float) >> >(d_aux, d_image, n);
            }
            n = dimgridRed;
            dimgridRed = (n + MAXTREADS - 1) / MAXTREADS;
            reduceCont++;
        }

        if (reduceCont % 2 == 0){
            reduce6 << <1, dimblockRed, MAXTREADS*sizeof(float) >> >(d_image, d_aux, n);
            cudaMemcpy(&sum, d_aux, sizeof(float), cudaMemcpyDeviceToHost);
        }
        else{
            reduce6 << <1, dimblockRed, MAXTREADS*sizeof(float) >> >(d_aux, d_image, n);
            cudaMemcpy(&sum, d_image, sizeof(float), cudaMemcpyDeviceToHost);
        }
        printf("%f ", sum);
    }
    cudaDeviceReset();
    return 0;
}
Upvotes: 0
Views: 316
There is a lot broken in both your host code and your device code here, and I am not going to attempt to go through all of the problems, but I can at least see:
extern __shared__ float sdata[]; // must be declared volatile for the warp reduct to work reliably
This accumulation code is broken in a lot of ways, but at a minimum:
if (tid>=n) return; // undefined behaviour with __syncthreads() below

if ( (gridDim.x>0) & ((i+MAXTREADS)<n)){
    while (i < n) {
        sdata[tid] += g_idata[i] + g_idata[i+MAXTREADS]; // out of bounds if i > n - MAXTREADS
        i += gridSize;
    }
}else{
    sdata[tid] += g_idata[i]; // out of bounds if i > n
}
__syncthreads(); // potential deadlock
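A minimal sketch of the usual fix for the early-return problem (the kernel name is made up, and the grid-stride loop is dropped for brevity): guard the loads rather than the threads, so every thread in the block still reaches the barrier:

```cuda
// Sketch: out-of-range threads contribute a neutral 0 instead of
// returning early, so all threads in the block reach __syncthreads()
// together and no out-of-bounds read is performed.
__global__ void reduce_guarded(const float *g_idata, float *g_odata, unsigned int n)
{
    extern __shared__ volatile float sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + tid;

    sdata[tid] = (i < n) ? g_idata[i] : 0.0f;  // guarded load, no early return
    __syncthreads();                           // reached by every thread

    // ... shared-memory tree reduction as in the full kernel below ...
}
```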
The shuffle based reduction is not correct either:
if (tid < 32)
{
    // Fetch final intermediate sum from 2nd warp
    mySum = sdata[tid] + sdata[tid + 32];
    // Reduce final warp using shuffle
    for (int offset = warpSize/2; offset > 0; offset /= 2)
        mySum += __shfl_down(mySum, offset);
}
sdata[0] = mySum; // must be conditionally executed only by thread 0, otherwise a memory race
Your host code for calling the reduction kernels is a complete mystery. The reduction kernel only ever needs to be called at most twice, so the loop is redundant. Then, in the final phase of the reduction, you call the kernel like this:
reduce6 << <1, dimblockRed, MAXTREADS*sizeof(float) >> >(d_aux, d_image, n);
but d_aux only has at most dimgridRed entries, so that is a memory overflow as well.
I think what you really want is something like this:
#include <cstdio>
#include <assert.h>

#define MAXTHREADS 1024

__device__ void warpReduce(volatile float *sdata, unsigned int tid) {
    sdata[tid] += sdata[tid + 32];
    sdata[tid] += sdata[tid + 16];
    sdata[tid] += sdata[tid + 8];
    sdata[tid] += sdata[tid + 4];
    sdata[tid] += sdata[tid + 2];
    sdata[tid] += sdata[tid + 1];
}

__global__ void mymemset(float* image, const float val, unsigned int N)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while (tid < N) {
        image[tid] = val;
        tid += gridDim.x * blockDim.x;
    }
}

__global__ void reduce6(float *g_idata, float *g_odata, unsigned int n)
{
    extern __shared__ volatile float sdata[];

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + tid;
    unsigned int gridSize = blockDim.x*gridDim.x;

    float mySum = 0;
    while (i < n) {
        mySum += g_idata[i];
        i += gridSize;
    }
    sdata[tid] = mySum;
    __syncthreads();

    if (tid < 512)
        sdata[tid] += sdata[tid + 512];
    __syncthreads();
    if (tid < 256)
        sdata[tid] += sdata[tid + 256];
    __syncthreads();
    if (tid < 128)
        sdata[tid] += sdata[tid + 128];
    __syncthreads();
    if (tid < 64)
        sdata[tid] += sdata[tid + 64];
    __syncthreads();

#if (__CUDA_ARCH__ >= 300)
    if (tid < 32)
    {
        mySum = sdata[tid] + sdata[tid + 32];
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            mySum += __shfl_down(mySum, offset);
        }
    }
#else
    if (tid < 32) {
        warpReduce(sdata, tid);
        mySum = sdata[0];
    }
#endif
    if (tid == 0) g_odata[blockIdx.x] = mySum;
}

int main()
{
    size_t total_pixels = 8000;
    unsigned long int n = (unsigned long int)total_pixels;
    float* d_image, *d_aux;

    cudaMalloc(&d_image, total_pixels*sizeof(float));
    cudaMalloc(&d_aux, sizeof(float)*(n + MAXTHREADS - 1) / MAXTHREADS);

    for (int i = 0; i < 10000; i++){
        {
            dim3 bsz = dim3(1024);
            dim3 gsz = dim3(total_pixels / bsz.x + ((total_pixels % bsz.x > 0) ? 1 : 0));
            mymemset<<<gsz, bsz>>>(d_image, 1.0f, total_pixels);
            cudaDeviceSynchronize();
        }

        int dimblockRed = MAXTHREADS;
        int dimgridRed = (n + MAXTHREADS - 1) / MAXTHREADS;
        float sum;

        reduce6<<<dimgridRed, dimblockRed, MAXTHREADS*sizeof(float)>>>(d_image, d_aux, n);
        if (dimgridRed > 1) {
            reduce6<<<1, dimblockRed, MAXTHREADS*sizeof(float)>>>(d_aux, d_image, dimgridRed);
            cudaMemcpy(&sum, d_image, sizeof(float), cudaMemcpyDeviceToHost);
        } else {
            cudaMemcpy(&sum, d_aux, sizeof(float), cudaMemcpyDeviceToHost);
        }
        assert(sum == float(total_pixels));
    }
    cudaDeviceReset();
    return 0;
}
[for future reference, that is what an MCVE looks like]
But I am not going to waste any more time trying to decipher whatever twisted logic the accumulation phase of the kernel and the host code were following. There are other things that should be fixed (the grid size only needs to be as large as the maximum number of concurrently resident blocks that will fit on your device), but I leave that as an exercise to the reader.
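As a hedged sketch of that last point (host-side fragment only; it assumes the reduce6 kernel and names from the code above, and uses the occupancy API available since CUDA 6.5):

```cuda
// Sketch: size the grid to the number of blocks the device can actually
// run concurrently, instead of ceil(n / blockDim). The grid-stride loop
// in reduce6 then covers the rest of the input.
int blocksPerSM = 0;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
    &blocksPerSM, reduce6, MAXTHREADS, MAXTHREADS * sizeof(float));

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
int maxBlocks = blocksPerSM * prop.multiProcessorCount;

int wanted = (int)((n + MAXTHREADS - 1) / MAXTHREADS);
int grid = (wanted < maxBlocks) ? wanted : maxBlocks;  // clamp to resident blocks

reduce6<<<grid, MAXTHREADS, MAXTHREADS * sizeof(float)>>>(d_image, d_aux, n);
```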
Upvotes: 1