Reputation: 346
I am trying to develop an implementation of the FFT in CUDA using visual studio 2010, so far I've made it work for as far as 1024 points inside one block. The issue is that whenever I use more than one block the results for Block 1 will be ok and the others will return a wrong value (doesn't seem random, they don't change in multiple runs.) Here is my kernel
__device__ void FFT(int idxS,int bfsize, Complex* data1, Complex* data0, int k, int N ){
Complex alpha;
if((idxS % bfsize) < (bfsize/2)){
data1[idxS] = ComplexAdd(data0[idxS],data0[idxS+bfsize/2]);
}
else
{
float angle = -PI*2*((idxS*(1<<k)%(bfsize/2)))/N;
alpha.x = cos(angle);
alpha.y= sin(angle);
Complex v0;
v0 = ComplexAdd(data0[idxS-bfsize/2] ,ComplexScale(data0[idxS],-1));
data1[idxS] = ComplexMul(v0, alpha);
}
}
__device__ void Ordenador(int r, int idxS ,Complex* data1, Complex* data0 ){
int p = 0;
for(int k = 0;k < r;k++)
{
if(idxS & (1<<k))
p+=1<<(r - k - 1);
}
data1[idxS] = data0[p];
__syncthreads();
}
__global__ void GPU_FFT(int N, int r, Complex* data0, Complex* data1, int k) {
int idxS = threadIdx.x+ blockIdx.x * blockDim.x;
__syncthreads;
int bfsize = 1<<(r - k);
FFT(idxS, bfsize, data1, data0, k, N);
data0[idxS] = data1[idxS];
}
int prepFFT(float *Entrada, Complex* saida, int N ){
if(ceilf(log2((float)N)) == log2((float)N) ){
for (int i=0; i<N; i++){
saida[i].x = Entrada[i];
saida[i].y = 0;
}
Complex *d_saida;
int m = (int)log2((float)N);
Complex *data1 = new Complex[N];
Complex *data1_d;
if (N<1024){
HANDLE_ERROR (cudaMalloc((void**)&d_saida, sizeof(Complex) * N));
HANDLE_ERROR (cudaMemcpy(d_saida,saida, sizeof(Complex)*N, cudaMemcpyHostToDevice));
HANDLE_ERROR (cudaMalloc((void**)&data1_d, sizeof(Complex) * N));
HANDLE_ERROR (cudaMemcpy(data1_d,data1, sizeof(Complex)*N, cudaMemcpyHostToDevice));
const dim3 numThreads (N,1,1);
const dim3 numBlocks(1,1,1);
for(int k = 0 ;k < m ; k++)
{
GPU_FFT<<<numBlocks,numThreads, N*2>>>( N, m, d_saida, data1_d, k);
HANDLE_ERROR (cudaDeviceSynchronize());
}
HANDLE_ERROR (cudaDeviceSynchronize());
HANDLE_ERROR (cudaMemcpy(saida,data1_d, sizeof(Complex)*N, cudaMemcpyDeviceToHost));
HANDLE_ERROR (cudaDeviceSynchronize());
}
else{
HANDLE_ERROR (cudaMalloc((void**)&d_saida, sizeof(Complex) * N));
HANDLE_ERROR (cudaMemcpy(d_saida,saida, sizeof(Complex)*N, cudaMemcpyHostToDevice));
HANDLE_ERROR (cudaMalloc((void**)&data1_d, sizeof(Complex) * N));
HANDLE_ERROR (cudaMemcpy(data1_d,data1, sizeof(Complex)*N, cudaMemcpyHostToDevice));
const dim3 numThreads (1024,1,1);
const dim3 numBlocks(N/1024 +1,1,1);
for(int k = 0;k < m;k++)
{
GPU_FFT<<<numBlocks,numThreads, N*2>>>( N, m, d_saida, data1_d, k);
HANDLE_ERROR (cudaDeviceSynchronize());
}
HANDLE_ERROR (cudaMemcpy(saida,data1_d, sizeof(Complex)*N, cudaMemcpyDeviceToHost));
HANDLE_ERROR (cudaDeviceSynchronize());
cudaFree(data1_d);
cudaFree(d_saida);
delete data1;
}
return 1;
}
else
return 0;
}
I've tried using Shared memory, however it would return all 0s and I figured CUDA wasn't copying from global to shared ( NSight would tell me that the value for that memory position was ????). This code should be just a proof of concept for now, doesn't have to be optimized, just return the right values. If you guys need the full code I'll provide it. I've been searching for a solution for this for over a month now, this is my desperate call.
Thanks, John
------- Update --------
I changed the code for debugging purposes launching 2 threads in each of the 2 blocks.
int prepFFT(float *Entrada, Complex* saida, int N ){
if(ceilf(log2((float)N)) == log2((float)N) ){
for (int i=0; i<N; i++){
saida[i].x = Entrada[i];
saida[i].y = 0;
}
Complex *d_saida;
int m = (int)log2((float)N);
Complex *data1 = new Complex[N];
Complex *data1_d;
if (N<1024){
HANDLE_ERROR (cudaMalloc((void**)&d_saida, sizeof(Complex) * N));
HANDLE_ERROR (cudaMemcpy(d_saida,saida, sizeof(Complex)*N, cudaMemcpyHostToDevice));
HANDLE_ERROR (cudaMalloc((void**)&data1_d, sizeof(Complex) * N));
HANDLE_ERROR (cudaMemcpy(data1_d,data1, sizeof(Complex)*N, cudaMemcpyHostToDevice));
const dim3 numThreads (2,1,1);
const dim3 numBlocks(2,1,1);
for(int k = 0 ;k < m ; k++)
{
GPU_FFT<<<numBlocks,numThreads, N*2>>>( N, m, d_saida, data1_d, k);
HANDLE_ERROR (cudaDeviceSynchronize());
}
HANDLE_ERROR (cudaDeviceSynchronize());
HANDLE_ERROR (cudaMemcpy(saida,data1_d, sizeof(Complex)*N, cudaMemcpyDeviceToHost));
HANDLE_ERROR (cudaDeviceSynchronize());
}
else{
HANDLE_ERROR (cudaMalloc((void**)&d_saida, sizeof(Complex) * N));
HANDLE_ERROR (cudaMemcpy(d_saida,saida, sizeof(Complex)*N, cudaMemcpyHostToDevice));
HANDLE_ERROR (cudaMalloc((void**)&data1_d, sizeof(Complex) * N));
HANDLE_ERROR (cudaMemcpy(data1_d,data1, sizeof(Complex)*N, cudaMemcpyHostToDevice));
const dim3 numThreads (1024,1,1);
const dim3 numBlocks(N/1024 +1,1,1);
for(int k = 0;k < m;k++)
{
GPU_FFT<<<numBlocks,numThreads, N*2>>>( N, m, d_saida, data1_d, k);
HANDLE_ERROR (cudaDeviceSynchronize());
}
HANDLE_ERROR (cudaMemcpy(saida,data1_d, sizeof(Complex)*N, cudaMemcpyDeviceToHost));
HANDLE_ERROR (cudaDeviceSynchronize());
cudaFree(data1_d);
cudaFree(d_saida);
delete data1;
}
return 1;
}
else
return 0;
}
---------------------Edit 2 ---------------------
What is really weird is that when I use memcheck (in any mode) the program returns the right results.
----Final Edit ---------------
I found out that the problem was in this bit of code
FFT(idxS, bfsize, data1, data0, k, N);
data0[idxS] = data1[idxS];
I found out that separating the last line in a new function and calling it with the CPU produced correct results for me. Thank you for the help!! Best Regards!
Upvotes: 2
Views: 402
Reputation: 886
First of all you should be checking your main kernel function __global__ void GPU_FFT
for the issue
Just change it to this:
__global__ void GPU_FFT(int N, int r, Complex* data0, Complex* data1, int k) {
int idxS = threadIdx.x+ blockIdx.x * blockDim.x;
int bfsize = 1<<(r - k);
//FFT(idxS, bfsize, data1, data0, k, N);
//data0[idxS] = data1[idxS];
if (idxS <= N) data0[idxS] = idxS;
}
What happens in the second block now?
If it's OK uncomment //FFT(idxS, bfsize, data1, data0, k, N);
and change last line to:
if (idxS <= N) data0[idxS] = data1[idxS];
What happens now? Still old error?
p.s. and you don't need __syncthreads;
just after retrieving your thread indexes
upd.
if((idxS % bfsize) < (bfsize/2)){
__syncthreads;
...}
Upvotes: 2