getting the wrong output with CUDA when using more than one block

Question

I am trying to develop an implementation of the FFT in CUDA using visual studio 2010, so far I've made it work for as far as 1024 points inside one block. The issue is that whenever I use more than one block the results for Block 1 will be ok and the others will return a wrong value (doesn't seem random, they don't change in multiple runs.) Here is my kernel

__device__ void FFT(int idxS,int bfsize, Complex* data1, Complex* data0, int k, int N ){
        Complex alpha;
        if((idxS % bfsize) < (bfsize/2)){
            data1[idxS] = ComplexAdd(data0[idxS],data0[idxS+bfsize/2]);
        }
        else
        {
            float angle = -PI*2*((idxS*(1<>>( N, m, d_saida, data1_d, k);
        HANDLE_ERROR (cudaDeviceSynchronize()); 
    }
        HANDLE_ERROR (cudaDeviceSynchronize()); 
        HANDLE_ERROR (cudaMemcpy(saida,data1_d, sizeof(Complex)*N, cudaMemcpyDeviceToHost));
        HANDLE_ERROR (cudaDeviceSynchronize());
        }
        else{
        HANDLE_ERROR (cudaMalloc((void**)&d_saida,   sizeof(Complex) * N));
        HANDLE_ERROR (cudaMemcpy(d_saida,saida, sizeof(Complex)*N, cudaMemcpyHostToDevice));
        HANDLE_ERROR (cudaMalloc((void**)&data1_d,   sizeof(Complex) * N));
        HANDLE_ERROR (cudaMemcpy(data1_d,data1, sizeof(Complex)*N, cudaMemcpyHostToDevice));
        const dim3 numThreads (1024,1,1);
        const dim3 numBlocks(N/1024 +1,1,1);
            for(int k = 0;k < m;k++)
    {
        GPU_FFT<<>>( N, m, d_saida, data1_d, k);
        HANDLE_ERROR (cudaDeviceSynchronize()); 
    }
        HANDLE_ERROR (cudaMemcpy(saida,data1_d, sizeof(Complex)*N, cudaMemcpyDeviceToHost));
        HANDLE_ERROR (cudaDeviceSynchronize());     
        cudaFree(data1_d);
        cudaFree(d_saida);
        delete data1;

        }
        return 1;
    }
    else
        return 0;
}

I've tried using Shared memory, however it would return all 0s and I figured CUDA wasn't copying from global to shared ( NSight would tell me that the value for that memory position was ????). This code should be just a proof of concept for now, doesn't have to be optimized, just return the right values. If you guys need the full code I'll provide it. I've been searching for a solution for this for over a month now, this is my desperate call.

Thanks, John

------- Update --------

I changed the code for debugging purposes launching 2 threads in each of the 2 blocks.

int prepFFT(float *Entrada, Complex* saida, int N ){
    if(ceilf(log2((float)N)) == log2((float)N) ){
        for (int i=0; i>>( N, m, d_saida, data1_d, k);
        HANDLE_ERROR (cudaDeviceSynchronize()); 
    }
        HANDLE_ERROR (cudaDeviceSynchronize()); 
        HANDLE_ERROR (cudaMemcpy(saida,data1_d, sizeof(Complex)*N, cudaMemcpyDeviceToHost));
        HANDLE_ERROR (cudaDeviceSynchronize());
        }
        else{
        HANDLE_ERROR (cudaMalloc((void**)&d_saida,   sizeof(Complex) * N));
        HANDLE_ERROR (cudaMemcpy(d_saida,saida, sizeof(Complex)*N, cudaMemcpyHostToDevice));
        HANDLE_ERROR (cudaMalloc((void**)&data1_d,   sizeof(Complex) * N));
        HANDLE_ERROR (cudaMemcpy(data1_d,data1, sizeof(Complex)*N, cudaMemcpyHostToDevice));
        const dim3 numThreads (1024,1,1);
        const dim3 numBlocks(N/1024 +1,1,1);
            for(int k = 0;k < m;k++)
    {
        GPU_FFT<<>>( N, m, d_saida, data1_d, k);
        HANDLE_ERROR (cudaDeviceSynchronize()); 
    }
        HANDLE_ERROR (cudaMemcpy(saida,data1_d, sizeof(Complex)*N, cudaMemcpyDeviceToHost));
        HANDLE_ERROR (cudaDeviceSynchronize());     
        cudaFree(data1_d);
        cudaFree(d_saida);
        delete data1;

        }
        return 1;
    }
    else
        return 0;

}

---------------------Edit 2 ---------------------

What is really weird is that when I use memcheck (in any mode) the program returns the right results.

----Final Edit ---------------

I found out that the problem was in this bit of code

FFT(idxS, bfsize,  data1,  data0, k, N);
data0[idxS] = data1[idxS];

I found out that separating the last line in a new function and calling it with the CPU produced correct results for me. Thank you for the help!! Best Regards!

Rodion · Accepted Answer

First of all you should be checking your main kernel function __global__ void GPU_FFTfor the issue

Just change it to this:

__global__ void GPU_FFT(int N, int r, Complex* data0, Complex* data1, int k) {
    int idxS = threadIdx.x+ blockIdx.x * blockDim.x;
        int bfsize = 1<<(r - k);
        //FFT(idxS, bfsize,  data1,  data0, k, N);
        //data0[idxS] = data1[idxS];
        if (idxS  <= N) data0[idxS] = idxS;
   }

What happens in the second block now?

If it's OK uncomment //FFT(idxS, bfsize, data1, data0, k, N);

and change last line to:

if (idxS <= N) data0[idxS] = data1[idxS];

What happens now? Still old error?

p.s. and you don't need __syncthreads; just after retrieving your thread indexes

upd.

if((idxS % bfsize) < (bfsize/2)){
__syncthreads;
...}

getting the wrong output with CUDA when using more than one block

Answers (1)

Related Questions