Reputation: 823
I'm writing a frequency filtering application for a school assignment in C++ and Cuda using cuFFT and I can't get it to work. You can find the whole Visual Studio 2010 solution here. (Needs glut.)
Here is the part I think is relevant: (fourierUtils.cu/194)
//////////////////////////////////////////////////////////////////////////////
// Function to help invoking the kernel, creates the parameters and gets
// the result
__host__
void Process(
BitmapStruct& in_img, // these contain an image in an rgba byte array
BitmapStruct& out_img,
MaskGenerator maskGenerator, // this is a pointer to a device function
float param1, // mask parameters
float param2)
{
// Declare and allocate variables
cufftHandle plan;
cufftReal* img;
cufftReal* dev_img;
cufftComplex* dev_freq_img;
int imgsize = in_img.image_size();
int pixelcount = imgsize / 4;
img = new float[pixelcount];
checkResult(
cudaMalloc(&dev_img, sizeof(cufftReal) * pixelcount));
checkResult(
cudaMalloc(&dev_freq_img, sizeof(cufftComplex) * pixelcount));
// Optimize execution
cudaFuncAttributes attrs;
checkResult(
cudaFuncGetAttributes(&attrs, &Filter));
std::pair<dim3, dim3> params
= Optimizer::GetOptimalParameters(pixelcount, attrs);
// Process r, g, b channels
for(int chan = 0; chan <= 2; chan++)
{
// Init
for(int i = 0; i < pixelcount; i++)
{
img[i] = in_img.pixels[4 * i + chan];
}
checkResult(
cudaMemcpy(dev_img, img, pixelcount, cudaMemcpyHostToDevice));
// Create frequency image
checkResult(
cufftPlan1d(&plan, pixelcount, CUFFT_R2C, 1));
checkResult(
cufftExecR2C(plan, dev_img, dev_freq_img));
checkResult(
cudaThreadSynchronize());
checkResult(
cufftDestroy(plan));
// Mask frequency image
Filter<<<params.first, params.second>>>(
dev_freq_img, in_img.x, in_img.y, maskGenerator, param1, param2);
getLastCudaError("Filtering the image failed.");
// Get result
checkResult(
cufftPlan1d(&plan, pixelcount, CUFFT_C2R, 1));
checkResult(
cufftExecC2R(plan, dev_freq_img, dev_img));
checkResult(
cudaThreadSynchronize());
checkResult(
cufftDestroy(plan));
checkResult(
cudaMemcpy(img, dev_img, pixelcount, cudaMemcpyDeviceToHost));
for(int i = 0; i < pixelcount; i++)
{
out_img.pixels[4 * i + chan] = img[i];
}
}
// Copy alpha channel
for(int i = 0; i < pixelcount; i++)
{
out_img.pixels[4 * i + 3] = in_img.pixels[4 * i + 3];
}
// Free memory
checkResult(
cudaFree(dev_freq_img));
checkResult(
cudaFree(dev_img));
delete img;
getLastCudaError("An error occured during processing the image.");
}
I can't see any practical differences compared to the official examples I've seen, yet when I debug into it with Nsight, all the cufftComplex values received by my kernel are NaNs and the only difference between the input and the result images are that the result has a black bar at the bottom, no matter which filtering mask and what parameters I use. All Cuda and cuFFT calls return success and there is no error reported after the kernel invocation either.
What do I do wrong?
I've tried replacing img and dev_img with complex arrays and using C2C conversions and also doing them inplace, but it only changed the size of the black bar on the result image.
Thank you for your help.
Edit: here is a reduced version that doesn't need glut and should also compile on linux.
Upvotes: 0
Views: 1333
Reputation: 21455
I haven't compiled and run your reduced version, but I think the problem is in the size of dev_img
and dev_freq_imag
.
Consider the example on Section 4.2 of the CUFFT Library User's Guide. It performs an in-place real-to-complex transform, which is the same step you are performing first.
#define NX 256
cufftHandle plan;
cufftComplex *data;
cudaMalloc((void**)&data, sizeof(cufftComplex)*(NX/2+1)*BATCH);
cufftPlan1d(&plan, NX, CUFFT_R2C, BATCH);
cufftExecR2C(plan, (cufftReal*)data, data);
Due to the symmetry properties of the transform, cufftExecR2C
fills only NX/2+1
output elements, where NX
is the size of the input array.
In your case, you are doing the following:
cufftHandle plan;
cufftReal* dev_img;
cufftComplex* dev_freq_img;
cudaMalloc(&dev_img, sizeof(cufftReal) * pixelcount);
cudaMalloc(&dev_freq_img, sizeof(cufftComplex) * pixelcount);
so you are allocating a cufftReal
array and a cufftComplex
array of the same size. When you use
cufftPlan1d(&plan, pixelcount, CUFFT_R2C, 1);
cufftExecR2C(plan, dev_img, dev_freq_img);
then only half of the dev_freq_img
is filled by cufftExecR2C
, the remaining part containing garbage. If you use the full extent of dev_freq_img
in the Filter
__global__
function, then this will be probably the cause of your NaN
s.
Upvotes: 2
Reputation: 823
My mistake was forgetting to multiply the number of items with their size in some of the cudaMemcpy
calls, thus the end of the vectors fed to cuFFT was made up of NaNs. Fixing those has solved the problem.
I also replaced the cufftReal arrays with cufftComplex ones as the C2C transformations seem to be more predictable and added normalization for the values.
So the final working method is:
///////////////////////////////////////////////////////////////////////////////
// Function to help invoking the kernel, creates the parameters and gets
// the result
__host__
void Process(
BitmapStruct& in_img,
BitmapStruct& out_img,
MaskGenerator maskGenerator,
float param1,
float param2)
{
// Declare and allocate variables
cufftHandle plan;
cufftComplex* img;
cufftComplex* dev_img;
cufftComplex* dev_freq_img;
int imgsize = in_img.image_size();
int pixelcount = imgsize / 4;
img = new cufftComplex[pixelcount];
checkResult(
cudaMalloc(&dev_img, sizeof(cufftComplex) * pixelcount));
checkResult(
cudaMalloc(&dev_freq_img, sizeof(cufftComplex) * pixelcount));
// Optimize execution
cudaFuncAttributes attrs;
checkResult(
cudaFuncGetAttributes(&attrs, &Filter));
std::pair<dim3, dim3> params =
Optimizer::GetOptimalParameters(pixelcount, attrs);
// Process r, g, b channels
for(int chan = 0; chan <= 2; chan++)
{
// Init
for(int i = 0; i < pixelcount; i++)
{
img[i].x = in_img.pixels[4 * i + chan];
img[i].y = 0;
}
checkResult(
cudaMemcpy(
dev_img,
img,
pixelcount * sizeof(cufftComplex),
cudaMemcpyHostToDevice));
// Create frequency image
checkResult(
cufftPlan1d(&plan, pixelcount, CUFFT_C2C, 1));
checkResult(
cufftExecC2C(plan, dev_img, dev_freq_img, CUFFT_FORWARD));
checkResult(
cudaThreadSynchronize());
checkResult(
cufftDestroy(plan));
// Mask frequency image
Filter<<<params.first, params.second>>>(
dev_freq_img,
in_img.x,
in_img.y,
maskGenerator,
param1,
param2);
getLastCudaError("Filtering the image failed.");
// Get result
checkResult(
cufftPlan1d(&plan, pixelcount, CUFFT_C2C, 1));
checkResult(
cufftExecC2C(plan, dev_freq_img, dev_img, CUFFT_INVERSE));
checkResult(
cudaThreadSynchronize());
checkResult(
cufftDestroy(plan));
checkResult(
cudaMemcpy(
img,
dev_img,
pixelcount * sizeof(cufftComplex),
cudaMemcpyDeviceToHost));
for(int i = 0; i < pixelcount; i++)
{
out_img.pixels[4 * i + chan] = img[i].x / pixelcount;
}
}
// Copy alpha channel
for(int i = 0; i < pixelcount; i++)
{
out_img.pixels[4 * i + 3] = in_img.pixels[4 * i + 3];
}
// Free memory
checkResult(
cudaFree(dev_freq_img));
checkResult(
cudaFree(dev_img));
delete img;
getLastCudaError("An error occured during processing the image.");
}
Thank you for the help.
Upvotes: 1