Reputation: 77
i want to make a FFT from double to std::complex with the CuFFT Lib. My Code looks like
#include <complex>
#include <iostream>
#include <cufft.h>
#include <cuda_runtime_api.h>
typedef std::complex<double> Complex;
using namespace std;
int main(){
int n = 100;
double* in;
Complex* out;
in = (double*) malloc(sizeof(double) * n);
out = (Complex*) malloc(sizeof(Complex) * n/2+1);
for(int i=0; i<n; i++){
in[i] = 1;
}
cufftHandle plan;
plan = cufftPlan1d(&plan, n, CUFFT_D2Z, 1);
unsigned int mem_size = sizeof(double)*n;
cufftDoubleReal *d_in;
cufftDoubleComplex *d_out;
cudaMalloc((void **)&d_in, mem_size);
cudaMalloc((void **)&d_out, mem_size);
cudaMemcpy(d_in, in, mem_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_out, out, mem_size, cudaMemcpyHostToDevice);
int succes = cufftExecD2Z(plan,(cufftDoubleReal *) d_in,(cufftDoubleComplex *) d_out);
cout << succes << endl;
cudaMemcpy(out, d_out, mem_size, cudaMemcpyDeviceToHost);
for(int i=0; i<n/2; i++){
cout << "out: " << i << " " << out[i].real() << " " << out[i].imag() << endl;
}
return 0;
}
but it seems to me this must be wrong, because i think the transformed values should be 1 0 0 0 0 .... or without the normalization 100 0 0 0 0 .... but i just get 0 0 0 0 0 ...
Furthermore i would like it more if the cufftExecD2Z would work in place, which should be possible but i haven't figured out how to correctly do so. Can anybody help?
Upvotes: 0
Views: 1516
Reputation: 151799
Your code has a variety of errors. You should probably review cufft documentation as well as the sample codes.
The return value of the cufftPlan1d
function does not go into the plan:
plan = cufftPlan1d(&plan, n, CUFFT_D2Z, 1);
The function itself sets the plan (that is why you pass &plan
to the function), then when you assign the return value into the plan, it ruins the plan set up by the function.
You correctly identified that the output can be of size ((N/2)+1)
, but then you didn't allocate space for it properly either on the host side:
out = (Complex*) malloc(sizeof(Complex) * n/2+1);
or on the device side:
unsigned int mem_size = sizeof(double)*n;
...
cudaMalloc((void **)&d_out, mem_size);
The following code has some of the above problems fixed, enough to get your desired result (100, 0, 0, ...)
#include <complex>
#include <iostream>
#include <cufft.h>
#include <cuda_runtime_api.h>
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
typedef std::complex<double> Complex;
using namespace std;
int main(){
int n = 100;
double* in;
Complex* out;
#ifdef IN_PLACE
in = (double*) malloc(sizeof(Complex) * (n/2+1));
out = (Complex*)in;
#else
in = (double*) malloc(sizeof(double) * n);
out = (Complex*) malloc(sizeof(Complex) * (n/2+1));
#endif
for(int i=0; i<n; i++){
in[i] = 1;
}
cufftHandle plan;
cufftResult res = cufftPlan1d(&plan, n, CUFFT_D2Z, 1);
if (res != CUFFT_SUCCESS) {cout << "cufft plan error: " << res << endl; return 1;}
cufftDoubleReal *d_in;
cufftDoubleComplex *d_out;
unsigned int out_mem_size = (n/2 + 1)*sizeof(cufftDoubleComplex);
#ifdef IN_PLACE
unsigned int in_mem_size = out_mem_size;
cudaMalloc((void **)&d_in, in_mem_size);
d_out = (cufftDoubleComplex *)d_in;
#else
unsigned int in_mem_size = sizeof(cufftDoubleReal)*n;
cudaMalloc((void **)&d_in, in_mem_size);
cudaMalloc((void **)&d_out, out_mem_size);
#endif
cudaCheckErrors("cuda malloc fail");
cudaMemcpy(d_in, in, in_mem_size, cudaMemcpyHostToDevice);
cudaCheckErrors("cuda memcpy H2D fail");
res = cufftExecD2Z(plan,d_in, d_out);
if (res != CUFFT_SUCCESS) {cout << "cufft exec error: " << res << endl; return 1;}
cudaMemcpy(out, d_out, out_mem_size, cudaMemcpyDeviceToHost);
cudaCheckErrors("cuda memcpy D2H fail");
for(int i=0; i<n/2; i++){
cout << "out: " << i << " " << out[i].real() << " " << out[i].imag() << endl;
}
return 0;
}
Review the documentation on what is necessary to do an in-place transform in the real to complex case. The above code can be recompiled with -DIN_PLACE
to see the behavior for an in-place transform, and the necessary code changes.
Upvotes: 1