cuda - CUDA 内存副本和 cuFFT 的异步执行

我有一个 CUDA 程序用于计算大小为 50000 的 FFT。目前，我将整个数组复制到 GPU 并执行 cuFFT。现在，我正在尝试优化程序，NVIDIA Visual Profiler 告诉我通过并行计算并发隐藏 memcopy。我的问题是:

是否可以，例如，复制前 5000 个元素，然后开始计算，然后并行复制下一堆数据进行计算等？

由于 DFT 基本上是时间值乘以复指数函数的和，我认为应该可以“按 block ”计算 FFT。

cufft 支持这个吗？这通常是一个好的计算想法吗？

编辑

更清楚地说，我不想在不同阵列上并行计算不同的 FFT。假设我在时域中有大量正弦信号，我想知道信号中有哪些频率。例如，我的想法是将三分之一的信号长度复制到 GPU，然后是下一个三分之一，并使用已复制的输入值的前三分之一并行计算 FFT。然后复制最后三分之一并更新输出值，直到处理完所有时间值。所以最后应该有一个峰值在正弦波频率的输出阵列。

最佳答案

请考虑上述意见，尤其是:

如果您计算 Npartial 个元素的 FFT，您将得到 Npartial 个元素的输出；
(跟随 Robert Crovella)在 cuFFT 调用启动之前，cuFFT 所需的所有数据都必须驻留在设备上，这样您就无法为单个 cuFFT 操作将数据分解成多个部分，然后开始在所有部分都在 GPU 上之前的那个操作；此外，cuFFT 调用是不透明的；

考虑到以上两点，我认为只有按照下面代码所示的方式正确使用零填充，您才能“模拟”您想要实现的目标。正如您将看到的，让 N 成为数据大小，通过将数据分成 NUM_STREAMS 个 block ，代码执行 NUM_STREAMS 零填充 和流式 cuFFT 调用大小N。在 cuFFT 之后，您必须合并(求和)部分结果。

#include <stdio.h>

#include <cufft.h>

#define BLOCKSIZE 32
#define NUM_STREAMS 3

/**********/
/* iDivUp */
/*********/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/******************/
/* SUMMING KERNEL */
/******************/
__global__ void kernel(float2 *vec1, float2 *vec2, float2 *vec3, float2 *out, int N) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N) {
        out[tid].x = vec1[tid].x + vec2[tid].x + vec3[tid].x;
        out[tid].y = vec1[tid].y + vec2[tid].y + vec3[tid].y;
    }

}


/********/
/* MAIN */
/********/
int main()
{
    const int N = 600000;
    const int Npartial = N / NUM_STREAMS;

    // --- Host input data initialization
    float2 *h_in1 = new float2[Npartial];
    float2 *h_in2 = new float2[Npartial];
    float2 *h_in3 = new float2[Npartial];
    for (int i = 0; i < Npartial; i++) {
        h_in1[i].x = 1.f;
        h_in1[i].y = 0.f;
        h_in2[i].x = 1.f;
        h_in2[i].y = 0.f;
        h_in3[i].x = 1.f;
        h_in3[i].y = 0.f;
    }

    // --- Host output data initialization
    float2 *h_out = new float2[N];

    // --- Registers host memory as page-locked (required for asynch cudaMemcpyAsync)
    gpuErrchk(cudaHostRegister(h_in1, Npartial*sizeof(float2), cudaHostRegisterPortable));
    gpuErrchk(cudaHostRegister(h_in2, Npartial*sizeof(float2), cudaHostRegisterPortable));
    gpuErrchk(cudaHostRegister(h_in3, Npartial*sizeof(float2), cudaHostRegisterPortable));

    // --- Device input data allocation
    float2 *d_in1;          gpuErrchk(cudaMalloc((void**)&d_in1, N*sizeof(float2)));
    float2 *d_in2;          gpuErrchk(cudaMalloc((void**)&d_in2, N*sizeof(float2)));
    float2 *d_in3;          gpuErrchk(cudaMalloc((void**)&d_in3, N*sizeof(float2)));
    float2 *d_out1;         gpuErrchk(cudaMalloc((void**)&d_out1, N*sizeof(float2)));
    float2 *d_out2;         gpuErrchk(cudaMalloc((void**)&d_out2, N*sizeof(float2)));
    float2 *d_out3;         gpuErrchk(cudaMalloc((void**)&d_out3, N*sizeof(float2)));
    float2 *d_out;          gpuErrchk(cudaMalloc((void**)&d_out, N*sizeof(float2)));

    // --- Zero padding
    gpuErrchk(cudaMemset(d_in1, 0, N*sizeof(float2)));
    gpuErrchk(cudaMemset(d_in2, 0, N*sizeof(float2)));
    gpuErrchk(cudaMemset(d_in3, 0, N*sizeof(float2)));

    // --- Creates CUDA streams
    cudaStream_t streams[NUM_STREAMS];
    for (int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamCreate(&streams[i]));

    // --- Creates cuFFT plans and sets them in streams
    cufftHandle* plans = (cufftHandle*) malloc(sizeof(cufftHandle)*NUM_STREAMS);
    for (int i = 0; i < NUM_STREAMS; i++) {
        cufftPlan1d(&plans[i], N, CUFFT_C2C, 1);
        cufftSetStream(plans[i], streams[i]);
    }

    // --- Async memcopyes and computations
    gpuErrchk(cudaMemcpyAsync(d_in1, h_in1, Npartial*sizeof(float2), cudaMemcpyHostToDevice, streams[0]));
    gpuErrchk(cudaMemcpyAsync(&d_in2[Npartial], h_in2, Npartial*sizeof(float2), cudaMemcpyHostToDevice, streams[1]));
    gpuErrchk(cudaMemcpyAsync(&d_in3[2*Npartial], h_in3, Npartial*sizeof(float2), cudaMemcpyHostToDevice, streams[2]));
    cufftExecC2C(plans[0], (cufftComplex*)d_in1, (cufftComplex*)d_out1, CUFFT_FORWARD);
    cufftExecC2C(plans[1], (cufftComplex*)d_in2, (cufftComplex*)d_out2, CUFFT_FORWARD);
    cufftExecC2C(plans[2], (cufftComplex*)d_in3, (cufftComplex*)d_out3, CUFFT_FORWARD);

    for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamSynchronize(streams[i]));

    kernel<<<iDivUp(BLOCKSIZE,N), BLOCKSIZE>>>(d_out1, d_out2, d_out3, d_out, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(h_out, d_out, N*sizeof(float2), cudaMemcpyDeviceToHost));

    for (int i=0; i<N; i++) printf("i = %i; real(h_out) = %f; imag(h_out) = %f\n", i, h_out[i].x, h_out[i].y);

    // --- Releases resources
    gpuErrchk(cudaHostUnregister(h_in1));
    gpuErrchk(cudaHostUnregister(h_in2));
    gpuErrchk(cudaHostUnregister(h_in3));
    gpuErrchk(cudaFree(d_in1));
    gpuErrchk(cudaFree(d_in2));
    gpuErrchk(cudaFree(d_in3));
    gpuErrchk(cudaFree(d_out1));
    gpuErrchk(cudaFree(d_out2));
    gpuErrchk(cudaFree(d_out3));
    gpuErrchk(cudaFree(d_out));

    for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamDestroy(streams[i]));

    delete[] h_in1;
    delete[] h_in2;
    delete[] h_in3;
    delete[] h_out;

    cudaDeviceReset();  

    return 0;
}

这是在 Kepler K20c 卡上运行时上述代码的时间线。如您所见，计算与异步内存传输重叠。

enter image description here

关于cuda - CUDA 内存副本和 cuFFT 的异步执行，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/25093958/

cuda - CUDA 内存副本和 cuFFT 的异步执行

上一篇：windows-phone-8 - 当应用程序关闭时，WP8 在后台任务中使用 http 请求保存文件

下一篇：wpf 工具包颜色选择器