Vitality
Vitality

Reputation: 21515

One dimensional fftshift in CUDA

I'm setting up a one dimensional fftshift in CUDA. My code is the following

__global__ void fftshift(double2 *u_d, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    double2 temp;

    if(i< N/2)
    {
        temp.x =  u_d[i].x;
        temp.y =  u_d[i].y;

        u_d[i].x =u_d[i+N/2].x;
        u_d[i].y =u_d[i+N/2].y;

        u_d[i+N/2].x = temp.x;
        u_d[i+N/2].y = temp.y;
    }
}

Is there any way, smarter than that shown above, to perform the fftshift in CUDA?

Thanks in advance.

A PERHAPS BETTER SOLUTION

I found that perhaps the following solution could be a good alternative

__global__ void fftshift(double2 *u_d, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if(i < N)
    {
        double a = pow(-1.0,i&1);
        u_d[i].x *= a;
        u_d[i].y *= a;
    }
}

It consists in multiplying the vector to be transformed by a sequence of 1s and -1s which is equivalent to the multiplication by exp(-jnpi) and thus to a shift in the conjugate domain.

You have to call this kernel before and after the application of the CUFFT.

One pro is that memory movements/swapping are avoided and the idea can be immediately extended to the 2D case, see CUDA Device To Device transfer expensive.

CONCERNING SYMMETRIC DATA

This solution seems not to be limited to symmetric data. Try for example the following Matlab code, applying the idea to a completely complex random matrix (Gaussian amplitude and uniform phase).

N1=512;
N2=256;

Phase=(rand(N1,N2)-0.5)*2*pi;
Magnitude=randn(N1,N2);

Im=Magnitude.*exp(j*Phase);

Transform=fftshift(fft2(ifftshift(Im)));

n1=0:(N1-1);
n2=0:(N2-1);
[N2,N1]=meshgrid(n2,n1);
Im2=Im.*(-1).^(N1+N2);
Im3=fft2(Im2);
Im4=Im3.*(-1).^(N1+N2);

100*sqrt(sum(abs(Im4-Transform).^2)/sum(abs(Transform).^2))

The returned normalized root mean square error will be 0, confirming that Transform=Im4.

IMPROVEMENT TO THE SPEED

Following the suggestion received at the NVIDIA Forum, improved speed can be achieved as by changing the instruction

double a = pow(-1.0,i&1);

to

double a = 1-2*(i&1);

to avoid the use of the slow routine pow.

Upvotes: 2

Views: 2973

Answers (2)

Vitality
Vitality

Reputation: 21515

After much time and the introduction of the callback functionality of cuFFT, I can provide a meaningful answer to my own question.

Above I was proposing a "perhaps better solution". After some testing, I have realized that, without using the callback cuFFT functionality, that solution is slower because it uses pow. Then, I have explored two alternatives to the use of pow, something like

float a     = (float)(1-2*((int)offset%2));

float2  out = ((float2*)d_in)[offset];
out.x = out.x * a;
out.y = out.y * a;

and

float2  out = ((float2*)d_in)[offset];

if ((int)offset&1) {

    out.x = -out.x;
    out.y = -out.y;

}

But, with standard cuFFT, all the above solutions require two separate kernel calls, one for the fftshift and one for the cuFFT execution call. However, with the new cuFFT callback functionality, the above alternative solutions can be embedded in the code as __device__ functions.

So, finally I ended up with the below comparison code

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <assert.h>

#include <cufft.h>
#include <cufftXt.h>

//#define DEBUG

#define BLOCKSIZE 256

/**********/
/* iDivUp */
/**********/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/*********************/
/* CUFFT ERROR CHECK */
/*********************/
// See http://stackoverflow.com/questions/16267149/cufft-error-handling
#ifdef _CUFFT_H_
// cuFFT API errors
static const char *_cudaGetErrorEnum(cufftResult error)
{
    switch (error)
    {
        case CUFFT_SUCCESS:
            return "CUFFT_SUCCESS";

        case CUFFT_INVALID_PLAN:
            return "CUFFT_INVALID_PLAN";

        case CUFFT_ALLOC_FAILED:
            return "CUFFT_ALLOC_FAILED";

        case CUFFT_INVALID_TYPE:
            return "CUFFT_INVALID_TYPE";

        case CUFFT_INVALID_VALUE:
            return "CUFFT_INVALID_VALUE";

        case CUFFT_INTERNAL_ERROR:
            return "CUFFT_INTERNAL_ERROR";

        case CUFFT_EXEC_FAILED:
            return "CUFFT_EXEC_FAILED";

        case CUFFT_SETUP_FAILED:
            return "CUFFT_SETUP_FAILED";

        case CUFFT_INVALID_SIZE:
             return "CUFFT_INVALID_SIZE";

        case CUFFT_UNALIGNED_DATA:
            return "CUFFT_UNALIGNED_DATA";
    }

    return "<unknown>";
}
#endif

#define cufftSafeCall(err)      __cufftSafeCall(err, __FILE__, __LINE__)
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
    if( CUFFT_SUCCESS != err) {
        fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
                            _cudaGetErrorEnum(err)); \
        cudaDeviceReset(); assert(0); \
    }
}

/****************************************/
/* FFTSHIFT 1D INPLACE MEMORY MOVEMENTS */
/****************************************/
__global__ void fftshift_1D_inplace_memory_movements(float2 *d_inout, unsigned int N)
{
    unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N/2)
    {
        float2 temp = d_inout[tid];
        d_inout[tid] = d_inout[tid + (N / 2)];
        d_inout[tid + (N / 2)] = temp;
    }
}

/**********************************************/
/* FFTSHIFT 1D INPLACE CHESSBOARD - VERSION 1 */
/**********************************************/
__device__ float2 fftshift_1D_chessboard_callback_v1(void *d_in, size_t offset, void *callerInfo, void *sharedPtr) {

    float a     = (float)(1-2*((int)offset%2));

    float2  out = ((float2*)d_in)[offset];
    out.x = out.x * a;
    out.y = out.y * a;
    return out;
}

__device__ cufftCallbackLoadC fftshift_1D_chessboard_callback_v1_Ptr = fftshift_1D_chessboard_callback_v1;

/**********************************************/
/* FFTSHIFT 1D INPLACE CHESSBOARD - VERSION 2 */
/**********************************************/
__device__ float2 fftshift_1D_chessboard_callback_v2(void *d_in, size_t offset, void *callerInfo, void *sharedPtr) {

    float a = pow(-1.,(double)(offset&1));

    float2  out = ((float2*)d_in)[offset];
    out.x = out.x * a;
    out.y = out.y * a;
    return out;
}

__device__ cufftCallbackLoadC fftshift_1D_chessboard_callback_v2_Ptr = fftshift_1D_chessboard_callback_v2;

/**********************************************/
/* FFTSHIFT 1D INPLACE CHESSBOARD - VERSION 3 */
/**********************************************/
__device__ float2 fftshift_1D_chessboard_callback_v3(void *d_in, size_t offset, void *callerInfo, void *sharedPtr) {

    float2  out = ((float2*)d_in)[offset];

    if ((int)offset&1) {

        out.x = -out.x;
        out.y = -out.y;

    }
return out;
}

__device__ cufftCallbackLoadC fftshift_1D_chessboard_callback_v3_Ptr = fftshift_1D_chessboard_callback_v3;

/********/
/* MAIN */
/********/
int main()
{
    const int N = 131072;

    printf("N = %d\n", N);

    // --- Host side input array
    float2 *h_vect = (float2 *)malloc(N*sizeof(float2));
    for (int i=0; i<N; i++) {
        h_vect[i].x = (float)rand() / (float)RAND_MAX;
        h_vect[i].y = (float)rand() / (float)RAND_MAX;
    }

    // --- Host side output arrays
    float2 *h_out1 = (float2 *)malloc(N*sizeof(float2));
    float2 *h_out2 = (float2 *)malloc(N*sizeof(float2));
    float2 *h_out3 = (float2 *)malloc(N*sizeof(float2));
    float2 *h_out4 = (float2 *)malloc(N*sizeof(float2));

    // --- Device side input arrays
    float2 *d_vect1; gpuErrchk(cudaMalloc((void**)&d_vect1, N*sizeof(float2)));
    float2 *d_vect2; gpuErrchk(cudaMalloc((void**)&d_vect2, N*sizeof(float2)));
    float2 *d_vect3; gpuErrchk(cudaMalloc((void**)&d_vect3, N*sizeof(float2)));
    float2 *d_vect4; gpuErrchk(cudaMalloc((void**)&d_vect4, N*sizeof(float2)));
    gpuErrchk(cudaMemcpy(d_vect1, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_vect2, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_vect3, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_vect4, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));

    // --- Device side output arrays
    float2 *d_out1; gpuErrchk(cudaMalloc((void**)&d_out1, N*sizeof(float2)));
    float2 *d_out2; gpuErrchk(cudaMalloc((void**)&d_out2, N*sizeof(float2)));
    float2 *d_out3; gpuErrchk(cudaMalloc((void**)&d_out3, N*sizeof(float2)));
    float2 *d_out4; gpuErrchk(cudaMalloc((void**)&d_out4, N*sizeof(float2)));

    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    /*******************************************/
    /* cuFFT + MEMORY MOVEMENTS BASED FFTSHIFT */
    /*******************************************/
    cufftHandle planinverse; cufftSafeCall(cufftPlan1d(&planinverse, N, CUFFT_C2C, 1));
    cudaEventRecord(start, 0);
    cufftSafeCall(cufftExecC2C(planinverse, d_vect1, d_vect1, CUFFT_INVERSE));
    fftshift_1D_inplace_memory_movements<<<iDivUp(N/2, BLOCKSIZE), BLOCKSIZE>>>(d_vect1, N);
#ifdef DEBUG
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
#endif
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Memory movements elapsed time:  %3.3f ms \n", time);
    gpuErrchk(cudaMemcpy(h_out1, d_vect1, N*sizeof(float2), cudaMemcpyDeviceToHost));

    /****************************************/
    /* CHESSBOARD MULTIPLICATION V1 + cuFFT */
    /****************************************/
    cufftCallbackLoadC hfftshift_1D_chessboard_callback_v1_Ptr;

    gpuErrchk(cudaMemcpyFromSymbol(&hfftshift_1D_chessboard_callback_v1_Ptr, fftshift_1D_chessboard_callback_v1_Ptr, sizeof(hfftshift_1D_chessboard_callback_v1_Ptr)));
    cufftHandle planinverse_v1; cufftSafeCall(cufftPlan1d(&planinverse_v1, N, CUFFT_C2C, 1));
    cufftResult status = cufftXtSetCallback(planinverse_v1, (void **)&hfftshift_1D_chessboard_callback_v1_Ptr, CUFFT_CB_LD_COMPLEX, 0);
    if (status == CUFFT_LICENSE_ERROR) {
        printf("This sample requires a valid license file.\n");
        printf("The file was either not found, out of date, or otherwise invalid.\n");
        exit(EXIT_FAILURE);
    } else {
        cufftSafeCall(status);
    }
    cudaEventRecord(start, 0);
    cufftSafeCall(cufftExecC2C(planinverse_v1, d_vect2, d_out2, CUFFT_INVERSE));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Chessboard v1 elapsed time:  %3.3f ms \n", time);

    gpuErrchk(cudaMemcpy(h_out2, d_out2, N*sizeof(float2), cudaMemcpyDeviceToHost));

    // --- Checking the results
    for (int i=0; i<N; i++) if ((h_out1[i].x != h_out2[i].x)||(h_out1[i].y != h_out2[i].y)) { printf("Chessboard v1 test failed!\n"); return 0; }

    printf("Chessboard v1 test passed!\n");

    /****************************************/
    /* CHESSBOARD MULTIPLICATION V2 + cuFFT */
    /****************************************/
    cufftCallbackLoadC hfftshift_1D_chessboard_callback_v2_Ptr;

    gpuErrchk(cudaMemcpyFromSymbol(&hfftshift_1D_chessboard_callback_v2_Ptr, fftshift_1D_chessboard_callback_v2_Ptr, sizeof(hfftshift_1D_chessboard_callback_v2_Ptr)));
    cufftHandle planinverse_v2; cufftSafeCall(cufftPlan1d(&planinverse_v2, N, CUFFT_C2C, 1));
    status = cufftXtSetCallback(planinverse_v2, (void **)&hfftshift_1D_chessboard_callback_v2_Ptr, CUFFT_CB_LD_COMPLEX, 0);
    if (status == CUFFT_LICENSE_ERROR) {
        printf("This sample requires a valid license file.\n");
        printf("The file was either not found, out of date, or otherwise invalid.\n");
        exit(EXIT_FAILURE);
    } else {
        cufftSafeCall(status);
    }
    cudaEventRecord(start, 0);
    cufftSafeCall(cufftExecC2C(planinverse_v2, d_vect3, d_out3, CUFFT_INVERSE));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Chessboard v2 elapsed time:  %3.3f ms \n", time);

    gpuErrchk(cudaMemcpy(h_out3, d_out3, N*sizeof(float2), cudaMemcpyDeviceToHost));

    // --- Checking the results
    for (int i=0; i<N; i++) if ((h_out1[i].x != h_out3[i].x)||(h_out1[i].y != h_out3[i].y)) { printf("Chessboard v2 test failed!\n"); return 0; }

    printf("Chessboard v2 test passed!\n");

    /****************************************/
    /* CHESSBOARD MULTIPLICATION V3 + cuFFT */
    /****************************************/
    cufftCallbackLoadC hfftshift_1D_chessboard_callback_v3_Ptr;

    gpuErrchk(cudaMemcpyFromSymbol(&hfftshift_1D_chessboard_callback_v3_Ptr, fftshift_1D_chessboard_callback_v3_Ptr, sizeof(hfftshift_1D_chessboard_callback_v3_Ptr)));
    cufftHandle planinverse_v3; cufftSafeCall(cufftPlan1d(&planinverse_v3, N, CUFFT_C2C, 1));
    status = cufftXtSetCallback(planinverse_v3, (void **)&hfftshift_1D_chessboard_callback_v3_Ptr, CUFFT_CB_LD_COMPLEX, 0);
    if (status == CUFFT_LICENSE_ERROR) {
        printf("This sample requires a valid license file.\n");
        printf("The file was either not found, out of date, or otherwise invalid.\n");
        exit(EXIT_FAILURE);
    } else {
        cufftSafeCall(status);
    }
    cudaEventRecord(start, 0);
    cufftSafeCall(cufftExecC2C(planinverse_v3, d_vect4, d_out4, CUFFT_INVERSE));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Chessboard v3 elapsed time:  %3.3f ms \n", time);

    gpuErrchk(cudaMemcpy(h_out4, d_out4, N*sizeof(float2), cudaMemcpyDeviceToHost));

    // --- Checking the results
    for (int i=0; i<N; i++) if ((h_out1[i].x != h_out4[i].x)||(h_out1[i].y != h_out4[i].y)) { printf("Chessboard v3 test failed!\n"); return 0; }

    printf("Chessboard v3 test passed!\n");

    return 0;
}

RESULTS ON A GTX 480

N         Mem mov   v1      v2      v3 
131072    0.552     0.136   0.354   0.183 
262144    0.536     0.175   0.451   0.237 
524288    0.661     0.283   0.822   0.290 
1048576   0.784     0.565   1.548   0.548 
2097152   1.298     0.952   2.973   0.944 

RESULTS ON A TESLA C2050

N         Mem mov   v1      v2      v3 
131072    0.278     0.130   0.236   0.132 
262144    0.344     0.202   0.374   0.206 
524288    0.544     0.378   0.696   0.387 
1048576   0.909     0.695   1.294   0.695 
2097152   1.656     1.349   2.531   1.349 

RESULTS ON A KEPLER K20c

N         Mem mov   v1      v2      v3 
131072    0.077     0.076   0.136   0.076 
262144    0.142     0.128   0.202   0.127 
524288    0.268     0.229   0.374   0.230 
1048576   0.516     0.433   0.717   0.435 
2097152   1.019     0.853   1.400   0.855 

Some more details have recently appeared at The 1D fftshift in CUDA by chessboard multiplication and at the GitHub page.

Upvotes: 3

Pavan Yalamanchili
Pavan Yalamanchili

Reputation: 12109

If space is not a concern (and are using fftshift for only one dimension), create u_d with size 1.5 x N, and write the first N/2 elements at the end. You can then move u_d to u_d + N / 2

Here is how you could do it.

double2 *u_d, *u_d_begin;
size_t bytes = N * sizeof(double2);
// This is different from bytes / 2 when N is odd
size_t half_bytes = (N / 2) * sizeof(double2);
CUDA_CHK(cudaMalloc( &u_d, bytes + half_bytes ));
u_d_begin = u_d;
...
// Do some processing and populate u_d;
...
// Copy first half to the end
CUDA_CHK(cudaMemcpy(u_d + N, u_d, half_bytes, cudaMemcpyDeviceToDevice));
u_d = u_d + N /2;

Upvotes: 2

Related Questions