(Updated) I am trying to understand how CUDA parallelism works in cuFFT while learning CUDA programming.
I wrote my own version of a 1-D FFT in CUDA C++ and compared it with cuFFT. Below are the throughputs I measured when comparing the two methods on single-batch (batch = 1) FFT tests.
1-D FFT size: throughput (time) for my version vs. cuFFT. GF = GFLOPS, measured with cudaEventElapsedTime on a GTX 1050; the cuFFT time includes cufftPlan1d (see code below).
(1) 2^16: 1.3 GF (3.8 msec) vs. 0.19 GF (27 msec)
(2) 2^17: 1.6 GF (6.7 msec) vs. 0.5 GF (23 msec)
(3) 2^18: 1.8 GF (13 msec) vs. 1.1 GF (22 msec)
(4) 2^19: 1.8 GF (27 msec) vs. 2.1 GF (23 msec)
(5) 2^20: 1.8 GF (56 msec) vs. 4.5 GF (23 msec)
(6) 2^21: 1.8 GF (121 msec) vs. 9.6 GF (22 msec)
(7) 2^23: 1.8 GF (523 msec) vs. 31 GF (30 msec)
(8) 2^25: 1.8 GF (2298 msec) vs. 106 GF (39 msec)
I am wondering why the cuFFT throughput increases almost linearly with the transform size. Since throughput is the computational complexity of the transform divided by the computation time, I expected it to stay roughly constant, as my version does, if everything else stays the same. What technique should I explore (e.g., a grid-size strategy) to make my version behave like 1-D cuFFT? My code is included below. I believe it is correct, since an FFT-IFFT round trip shows errors below 1e-8, with the computation in double precision and the input vector data[i] = Complex(i, 0).
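For reference, here is a minimal sketch (reusing sizeFFT, d_data, start, and stop from the full code below) of how plan creation and a warm-up run could be kept outside the timed region, so that only the transform itself is measured:

cufftHandle plan;
cufftPlan1d(&plan, sizeFFT, CUFFT_C2C, 1);         // plan created before timing starts
cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD); // warm-up launch, not timed
cudaDeviceSynchronize();                           // make sure the warm-up has finished
cudaEventRecord(start);
cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD); // the only timed call
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
cufftDestroy(plan);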
#include <iostream>
#include <vector>
#include <cuda_runtime.h>
#include <thrust/complex.h>
#include <cmath>
#include <iomanip> // For std::fixed and std::setprecision
#include <cufft.h>
using Complex = thrust::complex<double>;
#define USE_SHARED_MEM 0 // disabled: using shared memory didn't improve performance
#define BLOCK_SIZE 1024
// #define BLOCK_SIZE 512
// #define BLOCK_SIZE 32
#if USE_SHARED_MEM
#define SHARED_MEM_SIZE_FACTOR 4 // relative to BLOCK_SIZE
#endif
unsigned int mixedRadixReverse(unsigned int x, int numBits) {
    unsigned int rev = 0;
    int numRadix4Stages = numBits / 2; // number of radix-4 stages
    int useRadix2 = numBits % 2;       // if numBits is odd, there is an extra radix-2 stage
    // First, reverse the radix-2 digit if present (take the LSB first)
    if (useRadix2) {
        rev = (rev << 1) | (x & 1); // extract the LSB for the radix-2 digit
        x >>= 1;
    }
    // Then reverse the radix-4 digit groups (higher bits)
    for (int i = 0; i < numRadix4Stages; i++) {
        rev = (rev << 2) | (x & 3); // extract 2 bits at a time
        x >>= 2;
    }
    return rev;
}
// Scale by 1/N after the inverse transform (double precision to match the data)
__global__ void scale(Complex* d_data, double scale, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) {
        d_data[idx] *= scale;
    }
}
// Radix-2 stage; in this code it is only launched as the final stage
// (inFFTSize == sizeFFT/2), the only case where the pairing (k, k + inFFTSize)
// is conflict-free across threads.
__global__ void fft_radix2(Complex* data, int sizeFFT, int inFFTSize, int direction) {
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k >= sizeFFT / 2) return;
    int outFFTSize = 2 * inFFTSize;
    float sign = (direction == 1) ? -1.0 : 1.0;
    Complex tw = Complex(cos(2 * M_PI * k / outFFTSize), sign * sin(2 * M_PI * k / outFFTSize));
    Complex t = data[k + inFFTSize] * tw;
    Complex tmp = data[k];
    data[k] = tmp + t;
    data[k + inFFTSize] = tmp - t;
}
__global__ void fft_radix4(Complex* data, int sizeFFT, int inFFTSize, int direction) {
    float sign = (direction == 1) ? -1.0 : 1.0;
    int outFFTSize = 4 * inFFTSize;
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k >= sizeFFT / 4) return; // a radix-4 stage needs at most sizeFFT/4 threads
    // k = 0, 1, 2, ..., 7 when sizeFFT = 32
    int k_offset_inFFTSize = k % inFFTSize;         // 0, 1, ..., 3 when inFFTSize = 4
    // printf("k=%d: inFFTSize=%d, outFFTSize=%d\n", k, inFFTSize, outFFTSize);
    int blockOffset = (k / inFFTSize) * outFFTSize; // replaces the outer loop
    int offset = blockOffset + k_offset_inFFTSize;  // replaces the inner loop
    // dragonfly (radix-4 butterfly) inputs
    int idx0 = offset;
    int idx1 = offset + inFFTSize;
    int idx2 = offset + 2 * inFFTSize;
    int idx3 = offset + 3 * inFFTSize;
    Complex tw = Complex(cos(2.0 * M_PI * k_offset_inFFTSize / outFFTSize),
                         sign * sin(2.0 * M_PI * k_offset_inFFTSize / outFFTSize));
    Complex t0 = data[idx0];
    Complex t1 = data[idx1] * tw;
    Complex t2 = data[idx2] * tw * tw;
    Complex t3 = data[idx3] * tw * tw * tw;
    Complex a = t0 + t2;
    Complex b = t1 + t3;
    Complex c = t0 - t2;
    Complex d = (t1 - t3) * Complex(0, 1.0 * sign);
    // in-place computation
    data[idx0] = a + b;
    data[idx1] = c + d;
    data[idx2] = a - b;
    data[idx3] = c - d;
}
__global__ void fft_radix2_shared(Complex* data, int sizeFFT, int inFFTSize, int direction) {
    extern __shared__ Complex s_data[];
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k >= sizeFFT / 2) return;
    int outFFTSize = 2 * inFFTSize;
    float sign = (direction == 1) ? -1.0 : 1.0;
    Complex tw = Complex(cos(2 * M_PI * k / outFFTSize), sign * sin(2 * M_PI * k / outFFTSize));
    // Load data into shared memory
    int idx1 = k + inFFTSize;
    s_data[threadIdx.x] = data[k];
    s_data[threadIdx.x + blockDim.x] = data[idx1];
    __syncthreads();
    // Perform the butterfly in shared memory
    Complex t = s_data[threadIdx.x + blockDim.x] * tw;
    Complex tmp = s_data[threadIdx.x];
    s_data[threadIdx.x] = tmp + t;
    s_data[threadIdx.x + blockDim.x] = tmp - t;
    __syncthreads();
    // Write back to global memory
    data[k] = s_data[threadIdx.x];
    data[idx1] = s_data[threadIdx.x + blockDim.x];
}
__global__ void fft_radix4_shared(Complex* data, int sizeFFT, int inFFTSize, int direction) {
    extern __shared__ Complex s_data[];
    float sign = (direction == 1) ? -1.0 : 1.0;
    int outFFTSize = 4 * inFFTSize;
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k >= sizeFFT / 4) return;
    int k_offset_inFFTSize = k % inFFTSize;
    int blockOffset = (k / inFFTSize) * outFFTSize;
    int offset = blockOffset + k_offset_inFFTSize;
    // Indices of the dragonfly (radix-4 butterfly) inputs
    int idx0 = offset;
    int idx1 = offset + inFFTSize;
    int idx2 = offset + 2 * inFFTSize;
    int idx3 = offset + 3 * inFFTSize;
    // Load into shared memory
    s_data[threadIdx.x] = data[idx0];
    s_data[threadIdx.x + blockDim.x] = data[idx1];
    s_data[threadIdx.x + 2 * blockDim.x] = data[idx2];
    s_data[threadIdx.x + 3 * blockDim.x] = data[idx3];
    __syncthreads();
    // Twiddle factors
    Complex tw = Complex(cos(2.0 * M_PI * k_offset_inFFTSize / outFFTSize),
                         sign * sin(2.0 * M_PI * k_offset_inFFTSize / outFFTSize));
    Complex t0 = s_data[threadIdx.x];
    Complex t1 = s_data[threadIdx.x + blockDim.x] * tw;
    Complex t2 = s_data[threadIdx.x + 2 * blockDim.x] * tw * tw;
    Complex t3 = s_data[threadIdx.x + 3 * blockDim.x] * tw * tw * tw;
    // Compute the butterfly
    Complex a = t0 + t2;
    Complex b = t1 + t3;
    Complex c = t0 - t2;
    Complex d = (t1 - t3) * Complex(0, sign);
    // Store results back to shared memory
    s_data[threadIdx.x] = a + b;
    s_data[threadIdx.x + blockDim.x] = c + d;
    s_data[threadIdx.x + 2 * blockDim.x] = a - b;
    s_data[threadIdx.x + 3 * blockDim.x] = c - d;
    __syncthreads();
    // Write results back to global memory
    data[idx0] = s_data[threadIdx.x];
    data[idx1] = s_data[threadIdx.x + blockDim.x];
    data[idx2] = s_data[threadIdx.x + 2 * blockDim.x];
    data[idx3] = s_data[threadIdx.x + 3 * blockDim.x];
}
void fft_1d(Complex* data, int sizeFFT, int direction) {
    // (1) host memory allocation for digit reversal and DMA
    Complex* h_pinned_data;
    cudaMallocHost((void**)&h_pinned_data, sizeFFT * sizeof(Complex));
    // digit-reverse in host memory
    int numBits = __builtin_ctz(sizeFFT);
    #pragma omp parallel for // parallel processing for CPU speedup
    for (int i = 0; i < sizeFFT; i++) {
        unsigned int revIdx = mixedRadixReverse(i, numBits);
        h_pinned_data[revIdx] = data[i];
    }
    // (2) device memory allocation
    Complex* d_data;
    cudaError_t err;
    err = cudaMalloc((void**)&d_data, sizeFFT * sizeof(Complex));
    if (err != cudaSuccess) { std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl; return; }
    // asynchronous copy from pinned memory for a performance boost
    cudaMemcpyAsync(d_data, h_pinned_data, sizeFFT * sizeof(Complex), cudaMemcpyHostToDevice, 0);
    // (3) plan
    // dim3 blockSize(256);
    dim3 blockSize(BLOCK_SIZE);
    dim3 gridSize((sizeFFT + blockSize.x - 1) / blockSize.x);
    int numStagesRadix4 = 0;
    int temp_sizeFFT = sizeFFT;
    while (temp_sizeFFT % 4 == 0) {
        temp_sizeFFT /= 4;
        numStagesRadix4++;
    }
    // std::cout << "number of radix4 stages = " << numStagesRadix4 << std::endl;
#if USE_SHARED_MEM
    // shared memory allocation
    size_t sharedMemSize = blockSize.x * sizeof(Complex) * SHARED_MEM_SIZE_FACTOR;
#endif
    // (4) launch the radix stages
    int inFFTSize = 1;
    for (int i = 0; i < numStagesRadix4; ++i) {
        // printf("radix-4 stage = %d\n", i); // debug output; inflates the measured time
#if USE_SHARED_MEM
        fft_radix4_shared<<<gridSize, blockSize, sharedMemSize>>>(d_data, sizeFFT, inFFTSize, direction);
#else
        fft_radix4<<<gridSize, blockSize>>>(d_data, sizeFFT, inFFTSize, direction);
#endif
        err = cudaDeviceSynchronize();
        if (err != cudaSuccess) { std::cerr << "fft_radix4 execution at stage " << i << " failed!" << std::endl; return; }
        inFFTSize *= 4;
        // display_device_array_cmplx(d_data, sizeFFT, "d_data =");
    }
    if (temp_sizeFFT == 2) {
        // std::cout << "final radix2 stage: inFFTSize = " << inFFTSize << std::endl; // debug output
#if USE_SHARED_MEM
        fft_radix2_shared<<<gridSize, blockSize, sharedMemSize>>>(d_data, sizeFFT, inFFTSize, direction);
#else
        fft_radix2<<<gridSize, blockSize>>>(d_data, sizeFFT, inFFTSize, direction);
#endif
        err = cudaDeviceSynchronize();
        if (err != cudaSuccess) { std::cerr << "fft_radix2 execution failed!" << std::endl; return; }
    }
    // (5) scale if necessary (inverse transform)
    if (direction == -1) scale<<<gridSize, blockSize>>>(d_data, 1.0 / (double)sizeFFT, sizeFFT);
    // copy the result back to the host
    err = cudaMemcpy(data, d_data, sizeFFT * sizeof(Complex), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) { std::cerr << "cudaMemcpyDeviceToHost failed!" << std::endl; return; }
    cudaFree(d_data);
    cudaFreeHost(h_pinned_data); // release the pinned staging buffer
}
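// Note on grid sizing: gridSize above is computed from sizeFFT, but a radix-4
// stage only does work in sizeFFT/4 threads (and the radix-2 stage in
// sizeFFT/2); the remaining threads exit immediately at the guard. A
// hypothetical variant sized to the actual butterfly count would be:
// dim3 gridSizeRadix4((sizeFFT / 4 + blockSize.x - 1) / blockSize.x);
// dim3 gridSizeRadix2((sizeFFT / 2 + blockSize.x - 1) / blockSize.x);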
int main() {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    float milliseconds = 0;
    // define input data ==================
    // int sizeFFT = 1 << 28; // not working: cudaMalloc fails
    // int sizeFFT = 1 << 27;
    // int sizeFFT = 1 << 26;
    // int sizeFFT = 1 << 25;
    // int sizeFFT = 1 << 23;
    // int sizeFFT = 1 << 21;
    // int sizeFFT = 1 << 20;
    // int sizeFFT = 1 << 19;
    // int sizeFFT = 1 << 18;
    // int sizeFFT = 1 << 17;
    int sizeFFT = 1 << 16;
    // test vector
    std::vector<Complex> data(sizeFFT, Complex(1.0, 0.0));
    std::vector<Complex> data1(sizeFFT, Complex(0.0, 0.0));
    for (int i = 0; i < sizeFFT; ++i) { data[i] = Complex(i, 0); }
    data1 = data;
    cudaEventRecord(start); // =============>
    // FFT
    int direction = 1; // 1: forward, -1: inverse
    fft_1d(data.data(), sizeFFT, direction);
    cudaEventRecord(stop); // <===============
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);
    double N = sizeFFT;
    double gflops = (5.0 * N * log2(N)) / (milliseconds * 1.0e6);
    std::cout << "Throughput: " << gflops << " GFLOPS, time: " << milliseconds << " msec, BLOCK_SIZE: " << BLOCK_SIZE << std::endl;
    ///////////////////////////////////////////
    // Comparison with cuFFT
    ///////////////////////////////////////////
    // (1) copy data: cuFFT C2C is single precision, so convert the
    // double-precision input to cufftComplex before copying to the device
    std::vector<cufftComplex> fdata(sizeFFT);
    for (int i = 0; i < sizeFFT; ++i) {
        fdata[i].x = (float)data[i].real();
        fdata[i].y = (float)data[i].imag();
    }
    cufftComplex* d_data;
    cudaError_t err;
    err = cudaMalloc((void**)&d_data, sizeFFT * sizeof(cufftComplex));
    if (err != cudaSuccess) { std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl; return 1; }
    // copy input to the device (note: std::vector memory is pageable, so this copy is effectively synchronous)
    cudaMemcpyAsync(d_data, fdata.data(), sizeFFT * sizeof(cufftComplex), cudaMemcpyHostToDevice, 0);
    // (2) create the cuFFT plan (included in the timed region)
    cudaEventRecord(start); // =============>
    cufftHandle plan;
    cufftPlan1d(&plan, sizeFFT, CUFFT_C2C, 1);
    size_t workspace_size;
    cufftGetSize(plan, &workspace_size);
    std::cout << "Workspace size: " << workspace_size / 1024 << " KB" << std::endl;
    // (3) execute the FFT
    cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD);
    cudaEventRecord(stop); // <===============
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);
    cufftDestroy(plan);
    cudaFree(d_data);
    N = sizeFFT;
    gflops = (5.0 * N * log2(N)) / (milliseconds * 1.0e6);
    std::cout << "Throughput (cuFFT): " << gflops << " GFLOPS, time: " << milliseconds << " msec" << std::endl;
    // IFFT round trip to check accuracy
    direction = -1; // inverse
    fft_1d(data.data(), sizeFFT, direction);
    double maxDiff = 0;
    for (int i = 0; i < sizeFFT; ++i) {
        thrust::complex<double> diff = data[i] - data1[i];
        double errSquare = diff.real() * diff.real() + diff.imag() * diff.imag();
        maxDiff = fmax(maxDiff, sqrt(errSquare));
    }
    std::cout << "Max difference: " << maxDiff << std::endl;
    return 0;
}
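As a sanity check on the input reordering, here is a minimal standalone snippet (a hypothetical test main, assuming the mixedRadixReverse function above) verifying that the mixed-radix digit reversal is a bijection on [0, N):

#include <cassert>
#include <vector>
int main() {
    const unsigned int N = 32; // numBits = 5: two radix-4 stages plus one radix-2 stage
    const int numBits = 5;
    std::vector<bool> seen(N, false);
    for (unsigned int i = 0; i < N; ++i) {
        unsigned int r = mixedRadixReverse(i, numBits);
        assert(r < N && !seen[r]); // each input index maps to a distinct output index
        seen[r] = true;
    }
    return 0;
}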