Why CuFFT throughput increases as the transform size gets larger?

Question

(Updated) I am trying to understand how CUDA parallelism works in CuFFT while learning CUDA coding.

I wrote my version of 1-D FFT in CUDA C++ and compared it with cuFFT. Below are the throughputs I got when I compared the two methods for a single block FFT tests.

1-D FFT Size: throughput-1 (time) in my version, throughput-2 (time) in CuFFT. Gf = GFlops (using cudaEventElapsedTime, GTX 1050, includes cufftPlan1d in cuFFT - see code below)

(1)2^16: 1.3 Gf(3.8 msec), 0.19 Gf(27 msec)
(2)2^17: 1.6 Gf(6.7 msec), 0.5 Gf(23 msec)
(3)2^18: 1.8 Gf(13 msec), 1.1 Gf(22 msec)
(4)2^19: 1.8 Gf(27 msec), 2.1 Gf(23 msec)
(5)2^20: 1.8 Gf(56 msec), 4.5 Gf(23 msec)
(6)2^21: 1.8 Gf(121 msec), 9.6 Gf(22 msec)
(7)2^23: 1.8 Gf(523 msec), 31 Gf (30 msec)
(8)2^25: 1.8 Gf(2298 msec), 106 Gf(39 msec)

I am wondering why the throughput from CuFFT increases almost linearly as the transform size? Since the throughput is computed by computational complexity of the transform divided by computation time, I thought it should stay almost constant as my version shows if everything remain same. What technique should I explore to improve my version behave like 1-D CuFFT (grid size strategy)? My code is included below ( I think the code works fine since FFT-IFFT shows the errors less than 1e-8 where the computation is double precision with the input vector data[i] = Complex(i.0, 0))

#include 
#include 
#include 
#include 
#include 
#include   // For std::fixed and std::setprecision
#include 

using Complex = thrust::complex;
#define USE_SHARED_MEM 0 // disabled: using shared memory didn't improve performance

#define BLOCK_SIZE 1024  
// #define BLOCK_SIZE 512
// #define BLOCK_SIZE 32
#if USE_SHARED_MEM
    #define SHARED_MEM_SIZE_FACTOR 4 // relative to BLOCK_SIZE
#endif

unsigned int mixedRadixReverse(unsigned int x, int numBits) {
    unsigned int rev = 0;
    int numRadix4Stages = numBits / 2;  // Number of radix-4 stages
    int useRadix2 = numBits % 2;  // If numBits is odd, we have an extra radix-2 stage

    // First, reverse the radix-2 bit if present (apply to LSB first)
    if (useRadix2) {
        rev = (rev << 1) | (x & 1);  // Extract LSB for radix-2
        x >>= 1;
    }

    // Then, reverse the radix-4 groups (higher bits)
    for (int i = 0; i < numRadix4Stages; i++) {
        rev = (rev << 2) | (x & 3);  // Extract 2 bits at a time
        x >>= 2;
    }

    return rev;
}
__global__ void scale(Complex* d_data, float scale, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) {
        d_data[idx] *= scale;
    }
}

__global__ void fft_radix2(Complex* data, int sizeFFT, int inFFTSize, int direction) {
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k >= sizeFFT / 2) return;
    int outFFTSize = 2*inFFTSize;
    float sign = direction == 1 ? -1.0: 1.0;
    Complex tw = Complex(cos(2*M_PI*k/outFFTSize), sign*sin(2*M_PI*k/outFFTSize));
    Complex t = data[k + inFFTSize]*tw;
    Complex tmp = data[k];
    data[k] = tmp + t;
    data[k + inFFTSize] = tmp - t;
}

__global__ void fft_radix4(Complex* data, int sizeFFT, int inFFTSize, int direction) {
    float sign = direction == 1 ? -1.0: 1.0;
    int outFFTSize = 4*inFFTSize;
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k >= sizeFFT / 4) return; // max number of threads is sizeFFT /4 for radix-4 stage
    // k = 0, 1, 2, ..., 7 when sizeFFT = 32
    int k_offset_inFFTSize = k%inFFTSize; // k_inFFTSize= 0, 1 ..., 3 when inFFTSize = 4
    
    // printf("k=%d: inFFTSize=%d, outFFTSize=%d
",k, inFFTSize,outFFTSize);
    
    int blockOffset = (k/inFFTSize)*outFFTSize; // replaces outer loop
    int offset = blockOffset + k_offset_inFFTSize; // replaces inner loop
    // dragonfly inputs
    int idx0 = offset;
    int idx1 = offset + inFFTSize;
    int idx2 = offset + 2 * inFFTSize;
    int idx3 = offset + 3 * inFFTSize;

    Complex tw = Complex(cos(2.0*M_PI*k_offset_inFFTSize/outFFTSize), 
                    sign*sin(2.0*M_PI*k_offset_inFFTSize/outFFTSize));
    Complex t0 = data[idx0];
    Complex t1 = data[idx1]*tw;
    Complex t2 = data[idx2]*tw*tw;
    Complex t3 = data[idx3]*tw*tw*tw;

    Complex a = t0 + t2;
    Complex b = t1 + t3;
    Complex c = t0 - t2;
    Complex d = (t1 - t3) * Complex(0, 1.0*sign);
    // in-place computation
    data[idx0] = a + b;
    data[idx1] = c + d;
    data[idx2] = a - b;
    data[idx3] = c - d;
}
__global__ void fft_radix2_shared(Complex* data, int sizeFFT, int inFFTSize, int direction) {
    extern __shared__ Complex s_data[];
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k >= sizeFFT / 2) return;

    int outFFTSize = 2 * inFFTSize;
    float sign = (direction == 1) ? -1.0 : 1.0;
    Complex tw = Complex(cos(2 * M_PI * k / outFFTSize), sign * sin(2 * M_PI * k / outFFTSize));

    // Load data into shared memory
    int idx1 = k + inFFTSize;
    s_data[threadIdx.x] = data[k];
    s_data[threadIdx.x + blockDim.x] = data[idx1];
    __syncthreads();

    // Perform FFT operation in shared memory
    Complex t = s_data[threadIdx.x + blockDim.x] * tw;
    Complex tmp = s_data[threadIdx.x];

    s_data[threadIdx.x] = tmp + t;
    s_data[threadIdx.x + blockDim.x] = tmp - t;

    __syncthreads();

    // Write back to global memory
    data[k] = s_data[threadIdx.x];
    data[idx1] = s_data[threadIdx.x + blockDim.x];
}

__global__ void fft_radix4_shared(Complex* data, int sizeFFT, int inFFTSize, int direction) {
    extern __shared__ Complex s_data[];
    float sign = (direction == 1) ? -1.0 : 1.0;
    int outFFTSize = 4 * inFFTSize;
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k >= sizeFFT / 4) return;

    int k_offset_inFFTSize = k % inFFTSize;
    int blockOffset = (k / inFFTSize) * outFFTSize;
    int offset = blockOffset + k_offset_inFFTSize;

    // Indices for dragonfly inputs
    int idx0 = offset;
    int idx1 = offset + inFFTSize;
    int idx2 = offset + 2 * inFFTSize;
    int idx3 = offset + 3 * inFFTSize;

    // Load into shared memory
    s_data[threadIdx.x] = data[idx0];
    s_data[threadIdx.x + blockDim.x] = data[idx1];
    s_data[threadIdx.x + 2 * blockDim.x] = data[idx2];
    s_data[threadIdx.x + 3 * blockDim.x] = data[idx3];

    __syncthreads();

    // Twiddle factors
    Complex tw = Complex(cos(2.0 * M_PI * k_offset_inFFTSize / outFFTSize), 
                        sign * sin(2.0 * M_PI * k_offset_inFFTSize / outFFTSize));

    Complex t0 = s_data[threadIdx.x];
    Complex t1 = s_data[threadIdx.x + blockDim.x] * tw;
    Complex t2 = s_data[threadIdx.x + 2 * blockDim.x] * tw * tw;
    Complex t3 = s_data[threadIdx.x + 3 * blockDim.x] * tw * tw * tw;

    // Compute FFT butterfly operations
    Complex a = t0 + t2;
    Complex b = t1 + t3;
    Complex c = t0 - t2;
    Complex d = (t1 - t3) * Complex(0, sign);

    // Store results back to shared memory
    s_data[threadIdx.x] = a + b;
    s_data[threadIdx.x + blockDim.x] = c + d;
    s_data[threadIdx.x + 2 * blockDim.x] = a - b;
    s_data[threadIdx.x + 3 * blockDim.x] = c - d;

    __syncthreads();

    // Write results back to global memory
    data[idx0] = s_data[threadIdx.x];
    data[idx1] = s_data[threadIdx.x + blockDim.x];
    data[idx2] = s_data[threadIdx.x + 2 * blockDim.x];
    data[idx3] = s_data[threadIdx.x + 3 * blockDim.x];
}


void fft_1d(Complex* data, int sizeFFT, int direction) {

    //(1) Host memory allocation for radix reversal and DMA 
    Complex * h_pinned_data;
    cudaMallocHost((void**)&h_pinned_data, sizeFFT*sizeof(Complex));
    // radix-reverse in the host memory
    int numBits = __builtin_ctz(sizeFFT);  
    #pragma omp parallel for  // Parallel processing for CPU speedup
    for (int i = 0; i < sizeFFT; i++) {
        unsigned int revIdx = mixedRadixReverse(i, numBits);
        h_pinned_data[revIdx] = data[i]; 
    }
    //(2) device memory allocaiton
    Complex* d_data;
    cudaError_t err;
    err = cudaMalloc((void**)&d_data, sizeFFT * sizeof(Complex));
    if (err != cudaSuccess) { std::cerr << "cudaMalloc failed: "  << std::endl;  return; }
    // Asynchronous copy using pinned memory for performance boost
    cudaMemcpyAsync(d_data, h_pinned_data, sizeFFT * sizeof(Complex), cudaMemcpyHostToDevice, 0);
  

    // (3) plan
    // dim3 blockSize(256);
    dim3 blockSize(BLOCK_SIZE);
    dim3 gridSize((sizeFFT + blockSize.x - 1) / blockSize.x);
    
    int numStagesRadix4 = 0;
    int temp_sizeFFT = sizeFFT;
    while (temp_sizeFFT % 4 == 0) {
        temp_sizeFFT /= 4;
        numStagesRadix4++;
    }
    // std::cout << "number of radix4 stages = " << numStagesRadix4 << std::endl;
    #if USE_SHARED_MEM
    // shared memory allocation
    size_t sharedMemSize = blockSize.x * sizeof(Complex) *SHARED_MEM_SIZE_FACTOR;//256;//4;
    #endif
    // (4) Call radix computations
    int inFFTSize = 1;
    for (int i = 0; i < numStagesRadix4; ++i) {   
        printf("radix-4 stage = %d
", i);     
        
        #if USE_SHARED_MEM
        fft_radix4_shared<<>>(d_data, sizeFFT, inFFTSize, direction);
        #else
        fft_radix4<<>>(d_data, sizeFFT, inFFTSize, direction);
        #endif

        err = cudaDeviceSynchronize();
        if (err != cudaSuccess) { std::cerr << "fft_radix4 execution at stage " << i << " failed!" << std::endl; return; }

        inFFTSize *= 4;
        // display_device_array_cmplx(d_data, sizeFFT, "d_data =");
    }
    
    if (temp_sizeFFT == 2) {
        std::cout << "final radix2 stage: inFFTSize = " << inFFTSize  << std::endl;
        #if USE_SHARED_MEM
        fft_radix2_shared<<>>(d_data, sizeFFT, inFFTSize, direction);
        #else
        fft_radix2<<>>(d_data, sizeFFT, inFFTSize, direction);
        #endif

        err = cudaDeviceSynchronize();
        if (err != cudaSuccess) { std::cerr << "fft_radix2 execution failed!" << std::endl; return; }
    }
    // (5) scale if necessary
    if (direction == -1) scale<<>>(d_data, 1.0/(float) sizeFFT, sizeFFT);
    // send the result back to host
    err = cudaMemcpy(data, d_data, sizeFFT * sizeof(Complex), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) { std::cerr << "cudaMemcpyDeviceToHost failed!" << std::endl; return; }
    
    cudaFree(d_data);
}

int main() {
    
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    float milliseconds = 0;

    // define input data ==================
    // int sizeFFT = 1<< 28; //Not working: cudaMalloc fails
    // int sizeFFT = 1<< 27; 
    // int sizeFFT = 1<< 26; 
    // int sizeFFT = 1<< 25; 
    // int sizeFFT = 1<< 23; 
    // int sizeFFT = 1<< 21; 
    // int sizeFFT = 1<< 20; 
    // int sizeFFT = 1<< 19; 
    // int sizeFFT = 1<< 18; 
    // int sizeFFT = 1<< 17; 
    int sizeFFT = 1<< 16; 

    // test vector
    std::vector data(sizeFFT, Complex(1.0, 0.0));
    std::vector data1(sizeFFT, Complex(0.0, 0.0));
    for (int i = 0; i < sizeFFT; ++i) {data[i] = Complex(i,0);}
    data1 = data;

    cudaEventRecord(start); // =============>
    // FFT
    int direction = 1; // 1: forward, -1: reverse
    fft_1d(data.data(), sizeFFT, direction);

    cudaEventRecord(stop); // <===============
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);
    double N = sizeFFT;
    double gflops = (5.0 * N * log2(N)) / (milliseconds * 1.0e6);
    std::cout << "Throughput: " << gflops << " GFLOPS, time:" << milliseconds<<  ", BLOCK_SIZE:" << BLOCK_SIZE << std::endl;


    ///////////////////////////////////////////
    // Comparison with cuFFT
    ///////////////////////////////////////////
    //(1) copy data
    cufftComplex* d_data;
    cudaError_t err;
    err = cudaMalloc((void**)&d_data, sizeFFT * sizeof(cufftComplex));
    if (err != cudaSuccess) { std::cerr << "cudaMalloc failed: "  << std::endl;  return; }
    // Asynchronous copy using pinned memory for performance boost
    cudaMemcpyAsync(d_data, data.data(), sizeFFT * sizeof(cufftComplex), cudaMemcpyHostToDevice, 0);
    //(2) Create cuFFT plan
    cudaEventRecord(start); // =============>
    cufftHandle plan;
    cufftPlan1d(&plan, sizeFFT, CUFFT_C2C, 1);
    size_t workspace_size;
    cufftGetSize(plan, &workspace_size);
    std::cout << "Workspace size: " << workspace_size/1024 << " K bytes" << std::endl;

    //(3) Execute FFT
    cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD);
    cudaEventRecord(stop); // <===============
    cufftDestroy(plan);
    cudaFree(d_data);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);
    N = sizeFFT;
    gflops = (5.0 * N * log2(N)) / (milliseconds * 1.0e6);
    std::cout << "Throughput (cuFFT): " << gflops << " GFLOPS, time:" << milliseconds << " msec" <  err = data[i] - data1[i];
        double errSquare = err.real()*err.real() + err.imag()*err.imag() ;
        maxDiff = fmax(maxDiff, sqrt(errSquare));
    }
    std::cout << "Max difference: " << maxDiff << std::endl;

    return 0;
}

Why CuFFT throughput increases as the transform size gets larger?

Answers (0)

Related Questions