cuda - CUDA 中的一维 fftshift

标签 cuda fft

我正在 CUDA 中设置一维 fftshift。我的代码如下

__global__ void fftshift(double2 *u_d, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    double2 temp;

    if(i< N/2)
    {
        temp.x =  u_d[i].x;
        temp.y =  u_d[i].y;

        u_d[i].x =u_d[i+N/2].x;
        u_d[i].y =u_d[i+N/2].y;

        u_d[i+N/2].x = temp.x;
        u_d[i+N/2].y = temp.y;
    }
}

有没有比上面显示的方法更智能的方法来在 CUDA 中执行 fftshift?

提前致谢。

也许是更好的解决方案

我发现以下解决方案可能是一个不错的选择

__global__ void fftshift(double2 *u_d, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if(i < N)
    {
        double a = pow(-1.0,i&1);
        u_d[i].x *= a;
        u_d[i].y *= a;
    }
}

它包括将要变换的向量乘以 1-1 的序列,这相当于乘以 exp(-j npi),从而导致共轭域的转变。

您必须在应用 CUFFT 之前和之后调用此内核。

一个优点是可以避免内存移动/交换,并且该想法可以立即扩展到 2D 情况,请参阅 CUDA Device To Device transfer expensive .

关于对称数据

这个解决方案似乎并不局限于对称数据。例如,尝试以下 Matlab 代码,将该思想应用于完全复杂的随机矩阵(高斯幅度和均匀相位)。

N1=512;
N2=256;

Phase=(rand(N1,N2)-0.5)*2*pi;
Magnitude=randn(N1,N2);

Im=Magnitude.*exp(j*Phase);

Transform=fftshift(fft2(ifftshift(Im)));

n1=0:(N1-1);
n2=0:(N2-1);
[N2,N1]=meshgrid(n2,n1);
Im2=Im.*(-1).^(N1+N2);
Im3=fft2(Im2);
Im4=Im3.*(-1).^(N1+N2);

100*sqrt(sum(abs(Im4-Transform).^2)/sum(abs(Transform).^2))

返回的归一化均方根误差将为 0,确认 Transform=Im4

速度提高

按照 NVIDIA Forum 收到的建议,通过改变指令可以提高速度

double a = pow(-1.0,i&1);

double a = 1-2*(i&1);

避免使用慢速例程pow

最佳答案

经过很长时间并引入了 cuFFT 的回调功能,我可以为我自己的问题提供有意义的答案。

上面我提出了一个“也许更好的解决方案”。经过一些测试,我意识到,如果不使用回调 cuFFT 功能,该解决方案会更慢,因为它使用 pow。然后,我探索了使用 pow 的两种替代方案,例如

float a     = (float)(1-2*((int)offset%2));

float2  out = ((float2*)d_in)[offset];
out.x = out.x * a;
out.y = out.y * a;

float2  out = ((float2*)d_in)[offset];

if ((int)offset&1) {

    out.x = -out.x;
    out.y = -out.y;

}

但是,对于标准 cuFFT,上述所有解决方案都需要两个单独的内核调用,一个用于 fftshift,另一个用于 cuFFT 执行调用。但是,借助新的 cuFFT 回调功能,上述替代解决方案可以作为 __device__ 函数嵌入到代码中。

所以,最后我得到了下面的比较代码

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <assert.h>

#include <cufft.h>
#include <cufftXt.h>

//#define DEBUG

#define BLOCKSIZE 256

/**********/
/* iDivUp */
/**********/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/*********************/
/* CUFFT ERROR CHECK */
/*********************/
// See http://stackoverflow.com/questions/16267149/cufft-error-handling
#ifdef _CUFFT_H_
// cuFFT API errors
static const char *_cudaGetErrorEnum(cufftResult error)
{
    switch (error)
    {
        case CUFFT_SUCCESS:
            return "CUFFT_SUCCESS";

        case CUFFT_INVALID_PLAN:
            return "CUFFT_INVALID_PLAN";

        case CUFFT_ALLOC_FAILED:
            return "CUFFT_ALLOC_FAILED";

        case CUFFT_INVALID_TYPE:
            return "CUFFT_INVALID_TYPE";

        case CUFFT_INVALID_VALUE:
            return "CUFFT_INVALID_VALUE";

        case CUFFT_INTERNAL_ERROR:
            return "CUFFT_INTERNAL_ERROR";

        case CUFFT_EXEC_FAILED:
            return "CUFFT_EXEC_FAILED";

        case CUFFT_SETUP_FAILED:
            return "CUFFT_SETUP_FAILED";

        case CUFFT_INVALID_SIZE:
             return "CUFFT_INVALID_SIZE";

        case CUFFT_UNALIGNED_DATA:
            return "CUFFT_UNALIGNED_DATA";
    }

    return "<unknown>";
}
#endif

#define cufftSafeCall(err)      __cufftSafeCall(err, __FILE__, __LINE__)
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
    if( CUFFT_SUCCESS != err) {
        fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
                            _cudaGetErrorEnum(err)); \
        cudaDeviceReset(); assert(0); \
    }
}

/****************************************/
/* FFTSHIFT 1D INPLACE MEMORY MOVEMENTS */
/****************************************/
__global__ void fftshift_1D_inplace_memory_movements(float2 *d_inout, unsigned int N)
{
    unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N/2)
    {
        float2 temp = d_inout[tid];
        d_inout[tid] = d_inout[tid + (N / 2)];
        d_inout[tid + (N / 2)] = temp;
    }
}

/**********************************************/
/* FFTSHIFT 1D INPLACE CHESSBOARD - VERSION 1 */
/**********************************************/
__device__ float2 fftshift_1D_chessboard_callback_v1(void *d_in, size_t offset, void *callerInfo, void *sharedPtr) {

    float a     = (float)(1-2*((int)offset%2));

    float2  out = ((float2*)d_in)[offset];
    out.x = out.x * a;
    out.y = out.y * a;
    return out;
}

__device__ cufftCallbackLoadC fftshift_1D_chessboard_callback_v1_Ptr = fftshift_1D_chessboard_callback_v1;

/**********************************************/
/* FFTSHIFT 1D INPLACE CHESSBOARD - VERSION 2 */
/**********************************************/
__device__ float2 fftshift_1D_chessboard_callback_v2(void *d_in, size_t offset, void *callerInfo, void *sharedPtr) {

    float a = pow(-1.,(double)(offset&1));

    float2  out = ((float2*)d_in)[offset];
    out.x = out.x * a;
    out.y = out.y * a;
    return out;
}

__device__ cufftCallbackLoadC fftshift_1D_chessboard_callback_v2_Ptr = fftshift_1D_chessboard_callback_v2;

/**********************************************/
/* FFTSHIFT 1D INPLACE CHESSBOARD - VERSION 3 */
/**********************************************/
__device__ float2 fftshift_1D_chessboard_callback_v3(void *d_in, size_t offset, void *callerInfo, void *sharedPtr) {

    float2  out = ((float2*)d_in)[offset];

    if ((int)offset&1) {

        out.x = -out.x;
        out.y = -out.y;

    }
return out;
}

__device__ cufftCallbackLoadC fftshift_1D_chessboard_callback_v3_Ptr = fftshift_1D_chessboard_callback_v3;

/********/
/* MAIN */
/********/
int main()
{
    const int N = 131072;

    printf("N = %d\n", N);

    // --- Host side input array
    float2 *h_vect = (float2 *)malloc(N*sizeof(float2));
    for (int i=0; i<N; i++) {
        h_vect[i].x = (float)rand() / (float)RAND_MAX;
        h_vect[i].y = (float)rand() / (float)RAND_MAX;
    }

    // --- Host side output arrays
    float2 *h_out1 = (float2 *)malloc(N*sizeof(float2));
    float2 *h_out2 = (float2 *)malloc(N*sizeof(float2));
    float2 *h_out3 = (float2 *)malloc(N*sizeof(float2));
    float2 *h_out4 = (float2 *)malloc(N*sizeof(float2));

    // --- Device side input arrays
    float2 *d_vect1; gpuErrchk(cudaMalloc((void**)&d_vect1, N*sizeof(float2)));
    float2 *d_vect2; gpuErrchk(cudaMalloc((void**)&d_vect2, N*sizeof(float2)));
    float2 *d_vect3; gpuErrchk(cudaMalloc((void**)&d_vect3, N*sizeof(float2)));
    float2 *d_vect4; gpuErrchk(cudaMalloc((void**)&d_vect4, N*sizeof(float2)));
    gpuErrchk(cudaMemcpy(d_vect1, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_vect2, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_vect3, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_vect4, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));

    // --- Device side output arrays
    float2 *d_out1; gpuErrchk(cudaMalloc((void**)&d_out1, N*sizeof(float2)));
    float2 *d_out2; gpuErrchk(cudaMalloc((void**)&d_out2, N*sizeof(float2)));
    float2 *d_out3; gpuErrchk(cudaMalloc((void**)&d_out3, N*sizeof(float2)));
    float2 *d_out4; gpuErrchk(cudaMalloc((void**)&d_out4, N*sizeof(float2)));

    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    /*******************************************/
    /* cuFFT + MEMORY MOVEMENTS BASED FFTSHIFT */
    /*******************************************/
    cufftHandle planinverse; cufftSafeCall(cufftPlan1d(&planinverse, N, CUFFT_C2C, 1));
    cudaEventRecord(start, 0);
    cufftSafeCall(cufftExecC2C(planinverse, d_vect1, d_vect1, CUFFT_INVERSE));
    fftshift_1D_inplace_memory_movements<<<iDivUp(N/2, BLOCKSIZE), BLOCKSIZE>>>(d_vect1, N);
#ifdef DEBUG
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
#endif
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Memory movements elapsed time:  %3.3f ms \n", time);
    gpuErrchk(cudaMemcpy(h_out1, d_vect1, N*sizeof(float2), cudaMemcpyDeviceToHost));

    /****************************************/
    /* CHESSBOARD MULTIPLICATION V1 + cuFFT */
    /****************************************/
    cufftCallbackLoadC hfftshift_1D_chessboard_callback_v1_Ptr;

    gpuErrchk(cudaMemcpyFromSymbol(&hfftshift_1D_chessboard_callback_v1_Ptr, fftshift_1D_chessboard_callback_v1_Ptr, sizeof(hfftshift_1D_chessboard_callback_v1_Ptr)));
    cufftHandle planinverse_v1; cufftSafeCall(cufftPlan1d(&planinverse_v1, N, CUFFT_C2C, 1));
    cufftResult status = cufftXtSetCallback(planinverse_v1, (void **)&hfftshift_1D_chessboard_callback_v1_Ptr, CUFFT_CB_LD_COMPLEX, 0);
    if (status == CUFFT_LICENSE_ERROR) {
        printf("This sample requires a valid license file.\n");
        printf("The file was either not found, out of date, or otherwise invalid.\n");
        exit(EXIT_FAILURE);
    } else {
        cufftSafeCall(status);
    }
    cudaEventRecord(start, 0);
    cufftSafeCall(cufftExecC2C(planinverse_v1, d_vect2, d_out2, CUFFT_INVERSE));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Chessboard v1 elapsed time:  %3.3f ms \n", time);

    gpuErrchk(cudaMemcpy(h_out2, d_out2, N*sizeof(float2), cudaMemcpyDeviceToHost));

    // --- Checking the results
    for (int i=0; i<N; i++) if ((h_out1[i].x != h_out2[i].x)||(h_out1[i].y != h_out2[i].y)) { printf("Chessboard v1 test failed!\n"); return 0; }

    printf("Chessboard v1 test passed!\n");

    /****************************************/
    /* CHESSBOARD MULTIPLICATION V2 + cuFFT */
    /****************************************/
    cufftCallbackLoadC hfftshift_1D_chessboard_callback_v2_Ptr;

    gpuErrchk(cudaMemcpyFromSymbol(&hfftshift_1D_chessboard_callback_v2_Ptr, fftshift_1D_chessboard_callback_v2_Ptr, sizeof(hfftshift_1D_chessboard_callback_v2_Ptr)));
    cufftHandle planinverse_v2; cufftSafeCall(cufftPlan1d(&planinverse_v2, N, CUFFT_C2C, 1));
    status = cufftXtSetCallback(planinverse_v2, (void **)&hfftshift_1D_chessboard_callback_v2_Ptr, CUFFT_CB_LD_COMPLEX, 0);
    if (status == CUFFT_LICENSE_ERROR) {
        printf("This sample requires a valid license file.\n");
        printf("The file was either not found, out of date, or otherwise invalid.\n");
        exit(EXIT_FAILURE);
    } else {
        cufftSafeCall(status);
    }
    cudaEventRecord(start, 0);
    cufftSafeCall(cufftExecC2C(planinverse_v2, d_vect3, d_out3, CUFFT_INVERSE));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Chessboard v2 elapsed time:  %3.3f ms \n", time);

    gpuErrchk(cudaMemcpy(h_out3, d_out3, N*sizeof(float2), cudaMemcpyDeviceToHost));

    // --- Checking the results
    for (int i=0; i<N; i++) if ((h_out1[i].x != h_out3[i].x)||(h_out1[i].y != h_out3[i].y)) { printf("Chessboard v2 test failed!\n"); return 0; }

    printf("Chessboard v2 test passed!\n");

    /****************************************/
    /* CHESSBOARD MULTIPLICATION V3 + cuFFT */
    /****************************************/
    cufftCallbackLoadC hfftshift_1D_chessboard_callback_v3_Ptr;

    gpuErrchk(cudaMemcpyFromSymbol(&hfftshift_1D_chessboard_callback_v3_Ptr, fftshift_1D_chessboard_callback_v3_Ptr, sizeof(hfftshift_1D_chessboard_callback_v3_Ptr)));
    cufftHandle planinverse_v3; cufftSafeCall(cufftPlan1d(&planinverse_v3, N, CUFFT_C2C, 1));
    status = cufftXtSetCallback(planinverse_v3, (void **)&hfftshift_1D_chessboard_callback_v3_Ptr, CUFFT_CB_LD_COMPLEX, 0);
    if (status == CUFFT_LICENSE_ERROR) {
        printf("This sample requires a valid license file.\n");
        printf("The file was either not found, out of date, or otherwise invalid.\n");
        exit(EXIT_FAILURE);
    } else {
        cufftSafeCall(status);
    }
    cudaEventRecord(start, 0);
    cufftSafeCall(cufftExecC2C(planinverse_v3, d_vect4, d_out4, CUFFT_INVERSE));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Chessboard v3 elapsed time:  %3.3f ms \n", time);

    gpuErrchk(cudaMemcpy(h_out4, d_out4, N*sizeof(float2), cudaMemcpyDeviceToHost));

    // --- Checking the results
    for (int i=0; i<N; i++) if ((h_out1[i].x != h_out4[i].x)||(h_out1[i].y != h_out4[i].y)) { printf("Chessboard v3 test failed!\n"); return 0; }

    printf("Chessboard v3 test passed!\n");

    return 0;
}

GTX 480 的结果

N         Mem mov   v1      v2      v3 
131072    0.552     0.136   0.354   0.183 
262144    0.536     0.175   0.451   0.237 
524288    0.661     0.283   0.822   0.290 
1048576   0.784     0.565   1.548   0.548 
2097152   1.298     0.952   2.973   0.944 

特斯拉 C2050 的结果

N         Mem mov   v1      v2      v3 
131072    0.278     0.130   0.236   0.132 
262144    0.344     0.202   0.374   0.206 
524288    0.544     0.378   0.696   0.387 
1048576   0.909     0.695   1.294   0.695 
2097152   1.656     1.349   2.531   1.349 

KEPLER K20c 的结果

N         Mem mov   v1      v2      v3 
131072    0.077     0.076   0.136   0.076 
262144    0.142     0.128   0.202   0.127 
524288    0.268     0.229   0.374   0.230 
1048576   0.516     0.433   0.717   0.435 
2097152   1.019     0.853   1.400   0.855 

更多详细信息最近出现在 The 1D fftshift in CUDA by chessboard multiplication并在 GitHub page .

关于cuda - CUDA 中的一维 fftshift,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/14187481/

相关文章:

c - 你如何在cuda中创建一个二维数组

c++ - 如何在 Cuda 中使用 struct inside struct

c++ - kissfft - 逆实数 FFT 给出 NaN

python - 如何理解音频分析中的傅里叶变换结果

matlab图像处理错误

MATLAB FFT -> 均衡器 -> iFFT

cuda - 运行多 GPU CUDA 示例时 P2P 内存访问失败 (simpleP2P)

cuda - 有效的最小 GPU 线程数

c - FFTW 库 : verifying fourier transform derivative property

multidimensional-array - 在CUDA中的设备内存上分配2D阵列