Reputation: 1
I have used the cufft to do my research, but there some problem about to use it. The steps of mine is under below:
But, when I used the complex results to multiply the kernel, a serious problem happened, the cufft complex results is not equal to the results of fftw and there are lots of zero in the result. I know the size of result of R2C is N1(N2/2+1), but I want to got the complete complex results. How to solve this problem? i.e. How to restore the R2C results? And how to put the multiplying results into C2R and get the right answer?
My implement program code is under below:
__global__ void MultiplyKernel(cufftComplex *data, float *data1,cufftComplex *data2, unsigned vectorSize) {
unsigned idx = blockIdx.x*blockDim.x+threadIdx.x;
if (idx < vectorSize){
data[idx].x = data2[idx].x*data1[idx];
data[idx].y = data2[idx].y*data1[idx];
}
}
__global__ void Scale(cufftReal *data, unsigned vectorSize) {
unsigned idx = blockIdx.x*blockDim.x+threadIdx.x;
if (idx < vectorSize){
data[idx] = data[idx]/vectorSize;
}
}
void ApplyKernel1(cufftReal *data2, float *ImageBuffer, float *KernelBuffer, unsigned int NX, unsigned int NY,unsigned int NZ)
{
float *Akernel;
cufftComplex *data_dev1, *data_dev2;
cufftReal *data_dev3, *data_dev;
cudaMalloc((void **)&Akernel, NX * NY * NZ * sizeof(float));
cudaMalloc((void **)&data_dev3, NX * NY * NZ * sizeof(cufftReal));
cudaMalloc((void **)&data_dev, NX * NY * NZ * sizeof(cufftComplex));
cudaMalloc((void **)&data_dev1, NX * NY * NZ * sizeof(cufftComplex));
cudaMalloc((void **)&data_dev2, NX * NY * NZ * sizeof(cufftComplex));
cudaMemset(data_dev, 0, NX * NY * NZ * sizeof(cufftReal));
cudaMemset(data_dev1, 0, NX * NY * NZ * sizeof(cufftComplex));
cudaMemset(data_dev2, 0, NX * NY * NZ * sizeof(cufftComplex));
//cufftComplex *resultFFT = (cufftComplex*)malloc(NX * NY * NZ * sizeof(cufftComplex));
//cufftReal *resultIFFT = (cufftReal*)malloc(NX * NY * NZ * sizeof(cufftReal));
cudaMemcpy(data_dev, ImageBuffer, NX * NY * NZ * sizeof(cufftReal), cudaMemcpyHostToDevice);
cufftHandle plan;
cufftPlan3d(&plan, NZ, NY, NX, CUFFT_R2C);
cufftExecR2C(plan, data_dev, data_dev1);
//Multiply kernel
cudaMemcpy(Akernel, KernelBuffer, NX * NY * NZ * sizeof(float), cudaMemcpyHostToDevice);
static const int BLOCK_SIZE = 1000;
const int blockCount = (NX*NY*NZ+BLOCK_SIZE-1)/BLOCK_SIZE;
MultiplyKernel <<<blockCount, BLOCK_SIZE>>> (data_dev2, Akernel, data_dev1, NX*NY*NZ);
cufftDestroy(plan);
//cufftPlan3d(&plan, NZ, NY, NX, CUFFT_C2R);
cufftPlan3d(&plan, NZ,NY,NX, CUFFT_C2R);
cufftExecC2R(plan, data_dev2, data_dev3 );
Scale <<<blockCount, BLOCK_SIZE>>> (data_dev3, NX*NY*NZ);
cudaMemcpy(data2, data_dev3, NZ * NY * NX * sizeof(cufftReal), cudaMemcpyDeviceToHost);
cufftDestroy(plan);
cudaFree(data_dev);
cudaFree(data_dev1);
cudaFree(data_dev2);
cudaFree(data_dev3);
cudaFree(Akernel);
}
Upvotes: 0
Views: 2028
Reputation: 381
When you multiply the results of the R2C fft by a complex number, the results no longer correspond to a symmetric array.
Upvotes: 1