About cufft R2C and C2R

Question

I have used the cufft to do my research, but there some problem about to use it. The steps of mine is under below:

do forward FFT on the image by using R2C
multiply the kernel coefficients with the complex results
do the inverse FFT on the multiplying results by using C2R

But, when I used the complex results to multiply the kernel, a serious problem happened, the cufft complex results is not equal to the results of fftw and there are lots of zero in the result. I know the size of result of R2C is N1(N2/2+1), but I want to got the complete complex results. How to solve this problem? i.e. How to restore the R2C results? And how to put the multiplying results into C2R and get the right answer?

My implement program code is under below:

__global__ void MultiplyKernel(cufftComplex *data, float *data1,cufftComplex *data2, unsigned vectorSize) {
    unsigned idx = blockIdx.x*blockDim.x+threadIdx.x;
    if (idx < vectorSize){
        data[idx].x = data2[idx].x*data1[idx];
        data[idx].y = data2[idx].y*data1[idx];
    }
}

__global__ void Scale(cufftReal *data, unsigned vectorSize) {
    unsigned idx = blockIdx.x*blockDim.x+threadIdx.x;
    if (idx < vectorSize){
        data[idx] = data[idx]/vectorSize;
    }
}

void ApplyKernel1(cufftReal *data2, float *ImageBuffer, float *KernelBuffer, unsigned int NX, unsigned int NY,unsigned int NZ)
{
      float *Akernel;
      cufftComplex *data_dev1, *data_dev2;
      cufftReal *data_dev3, *data_dev;
      cudaMalloc((void **)&Akernel, NX * NY * NZ * sizeof(float));
      cudaMalloc((void **)&data_dev3, NX * NY * NZ * sizeof(cufftReal));
      cudaMalloc((void **)&data_dev, NX * NY * NZ * sizeof(cufftComplex));
      cudaMalloc((void **)&data_dev1, NX * NY * NZ * sizeof(cufftComplex));
      cudaMalloc((void **)&data_dev2, NX * NY * NZ * sizeof(cufftComplex));
      cudaMemset(data_dev, 0, NX * NY * NZ * sizeof(cufftReal));
      cudaMemset(data_dev1, 0, NX * NY * NZ * sizeof(cufftComplex));
      cudaMemset(data_dev2, 0, NX * NY * NZ * sizeof(cufftComplex));
      //cufftComplex *resultFFT = (cufftComplex*)malloc(NX * NY * NZ * sizeof(cufftComplex));
      //cufftReal *resultIFFT = (cufftReal*)malloc(NX * NY * NZ * sizeof(cufftReal));

      cudaMemcpy(data_dev, ImageBuffer, NX * NY * NZ * sizeof(cufftReal), cudaMemcpyHostToDevice);

      cufftHandle plan;
      cufftPlan3d(&plan, NZ, NY, NX, CUFFT_R2C);
      cufftExecR2C(plan, data_dev, data_dev1);

      //Multiply kernel
      cudaMemcpy(Akernel, KernelBuffer, NX * NY * NZ * sizeof(float), cudaMemcpyHostToDevice);
      static const int BLOCK_SIZE = 1000;
      const int blockCount = (NX*NY*NZ+BLOCK_SIZE-1)/BLOCK_SIZE;
      MultiplyKernel <<>> (data_dev2, Akernel, data_dev1, NX*NY*NZ);


      cufftDestroy(plan);
      //cufftPlan3d(&plan, NZ, NY, NX, CUFFT_C2R);
      cufftPlan3d(&plan, NZ,NY,NX, CUFFT_C2R);
      cufftExecC2R(plan, data_dev2, data_dev3 );
      Scale <<>> (data_dev3, NX*NY*NZ);
      cudaMemcpy(data2, data_dev3, NZ * NY * NX * sizeof(cufftReal), cudaMemcpyDeviceToHost);

      cufftDestroy(plan);
      cudaFree(data_dev);
      cudaFree(data_dev1);
      cudaFree(data_dev2);
      cudaFree(data_dev3);
      cudaFree(Akernel);


}

About cufft R2C and C2R

Answers (1)

Related Questions