c++ - 为什么虚拟 cudaMalloc 会加速现代GPU中的interval_gather？

我正在使用moderngpu CUDA库( https://github.com/moderngpu/moderngpu )

如果在调用 cudaMalloc 之前分配了一些字节，moderngpu 中的 interval_gather 函数会出现奇怪的加速。

我调用一个方法 mgpu::interval_gather(m0.data(), 1, m1.data(), 1, m2.data(), m3.data(), context); m0、m1、m2、m3 中的每一个都是一个 mgpu::mem_t 数组，大小为 1 000 000，并用 0 填充。

我做了 10 000 次，大约需要 3 秒。

但是，如果我在它之前分配一些虚拟内存:

cudaMalloc((void **)(&tmpPtr), sizeof(int));

速度提高了约 10 倍，耗时 0.3 秒。为什么会这样呢？

我尝试在分配虚拟字节之前或之后分配 mem_t 内存，结果相同。我尝试了分配不同大小的虚拟内存、多次调用函数或禁用 -O2 编译标志 - 似乎没有任何变化。

但是，当我尝试减少传递给interval_gather的数组的大小时，效果停止了。然后无论是否有虚拟分配都需要 0.3 秒。

我在 GTX 980 上运行了所有这些。

这是完整的代码片段。如果magic变量设置为true，它会加速。

#include <iostream>
#include <chrono>
#include <string>

#include <moderngpu/kernel_intervalmove.hxx>

void print_ms(std::clock_t start, std::clock_t end, std::string desc)
{
    double ms = 1000.0 * (end - start) / CLOCKS_PER_SEC;
    std::cout << desc << ": " << ms << " ms." << std::endl;
}

void fun(bool magic, mgpu::context_t &context)
{
    int n = 10000000;
    mgpu::mem_t<int> m0(n, context);
    mgpu::mem_t<int> m1(n, context);
    mgpu::mem_t<int> m2(n, context);
    mgpu::mem_t<int> m3(n, context);

    void *tmpPtr;
    cudaMalloc((void **)(&tmpPtr), sizeof(int));
    if (!magic)
        cudaFree(tmpPtr);

    for (int aa = 0; aa < 10000; aa++)
        mgpu::interval_gather(m0.data(), 1,
                              m1.data(), 1,
                              m2.data(), 
                              m3.data(),
                              context);

    if (magic)
        cudaFree(tmpPtr);
}

int main(int argc, char *argv[])
{
    mgpu::standard_context_t context(false);
    std::clock_t c0 = std::clock();

    fun(false, context);
    context.synchronize();
    std::clock_t c1 = std::clock();
    print_ms(c0, c1, "1st");

    fun(false, context);
    context.synchronize();
    std::clock_t c2 = std::clock();
    print_ms(c1, c2, "2nd");

    fun(true, context);
    context.synchronize();
    std::clock_t c3 = std::clock();
    print_ms(c2, c3, "3rd");

    return 0;
}

最佳答案

前言:我可以告诉你发生了什么，但我无法解释为什么。

TLDR:mgpu::interval_gather 之前的 cudaMalloc 调用正在改变内部 cudaMalloc 和 cudaFree 的性能> interval_gather 操作 中的调用大约增加了 10 倍，这就是您看到的性能差异。关键是在 mgpu 调用之前分配的内存必须在调用之后才能释放。

在代码中一次调用 mgpu::interval_gather 的典型 API 计时示例，无需使用神奇的 cudaMalloc 调用:

API 跟踪:

262.99ms  80.334us  cudaMalloc
263.07ms  11.912us  cudaLaunchKernel (_ZN4mgpu16launch_box_cta_kINS_15launch_params_tILi128ELi1ELi1ELi0EEENS_6detail11transform_fIS2_EEJZNS_21merge_path_partitionsILNS_8bounds_tE1ENS_19counting_iterator_tIiiEEPiNS_6less_tIiEEEENS_5mem_tIiEET0_lT1_llT2_RNS_9context_tEEUliE_mE
263.08ms  7.3310us  cudaLaunchKernel (_ZN4mgpu16launch_box_cta_kINS_12launch_box_tIJNS_7arch_20INS_12launch_cta_tILi128ELi11ELi9ELi0EEENS_7empty_tEEENS_7arch_35INS3_ILi128ELi7ELi5ELi0EEES5_EENS_7arch_52IS4_S5_EEEEEZNS_13transform_lbsIS5_ZNS_15interval_gatherIS5_PiSF_SF_SF_EE
263.09ms  51.357us  cudaFree

GPU 跟踪:

273.40ms  1.4400us              (1 1 1)       (128 1 1)        11        0B        0B  GeForce GTX 970         1         7  _ZN4mgpu16launch_box_cta_kINS_15launch_params_tILi128ELi1ELi1ELi0EEENS_6detail11transform_fIS2_EEJZNS_21merge_path_partitionsILNS_8bounds_tE1ENS_19counting_iterator_tIiiEEPiNS_6less_tIiEEEENS_5mem_tIiEET0_lT1_llT2_RNS_9context_tEEUliE_mEEEvSF_iDpSG_ [126]
273.40ms  5.7600us              (1 1 1)       (128 1 1)        56  5.5156KB        0B  GeForce GTX 970         1         7  _ZN4mgpu16launch_box_cta_kINS_12launch_box_tIJNS_7arch_20INS_12launch_cta_tILi128ELi11ELi9ELi0EEENS_7empty_tEEENS_7arch_35INS3_ILi128ELi7ELi5ELi0EEES5_EENS_7arch_52IS4_S5_EEEEEZNS_13transform_lbsIS5_ZNS_15interval_gatherIS5_PiSF_SF_SF_EEvT0_iT1_iT2_T3_RNS_9context_tEEUliiiNS_5tupleIJiEEESF_SF_E_SF_NSM_IJSF_EEEJSF_SF_EEEvSG_iSH_iSI_SL_DpSJ_EUliiSF_SF_E_JSF_SF_EEEvSG_iDpSH_ [127]

与使用神奇的cudaMalloc相比:

API 跟踪:

2.32306s  4.7240us  cudaMalloc
2.32307s  7.9970us  cudaLaunchKernel (_ZN4mgpu16launch_box_cta_kINS_15launch_params_tILi128ELi1ELi1ELi0EEENS_6detail11transform_fIS2_EEJZNS_21merge_path_partitionsILNS_8bounds_tE1ENS_19counting_iterator_tIiiEEPiNS_6less_tIiEEEENS_5mem_tIiEET0_lT1_llT2_RNS_9context_tEEUliE_mE
2.32308s  6.7660us  cudaLaunchKernel (_ZN4mgpu16launch_box_cta_kINS_12launch_box_tIJNS_7arch_20INS_12launch_cta_tILi128ELi11ELi9ELi0EEENS_7empty_tEEENS_7arch_35INS3_ILi128ELi7ELi5ELi0EEES5_EENS_7arch_52IS4_S5_EEEEEZNS_13transform_lbsIS5_ZNS_15interval_gatherIS5_PiSF_SF_SF_EE
2.32308s  8.2070us  cudaFree

GPU 跟踪:

2.37275s  1.0240us              (1 1 1)       (128 1 1)        11        0B        0B  GeForce GTX 970         1         7  _ZN4mgpu16launch_box_cta_kINS_15launch_params_tILi128ELi1ELi1ELi0EEENS_6detail11transform_fIS2_EEJZNS_21merge_path_partitionsILNS_8bounds_tE1ENS_19counting_iterator_tIiiEEPiNS_6less_tIiEEEENS_5mem_tIiEET0_lT1_llT2_RNS_9context_tEEUliE_mEEEvSF_iDpSG_ [120129]
2.37276s  4.0000us              (1 1 1)       (128 1 1)        56  5.5156KB        0B  GeForce GTX 970         1         7  _ZN4mgpu16launch_box_cta_kINS_12launch_box_tIJNS_7arch_20INS_12launch_cta_tILi128ELi11ELi9ELi0EEENS_7empty_tEEENS_7arch_35INS3_ILi128ELi7ELi5ELi0EEES5_EENS_7arch_52IS4_S5_EEEEEZNS_13transform_lbsIS5_ZNS_15interval_gatherIS5_PiSF_SF_SF_EEvT0_iT1_iT2_T3_RNS_9context_tEEUliiiNS_5tupleIJiEEESF_SF_E_SF_NSM_IJSF_EEEJSF_SF_EEEvSG_iSH_iSI_SL_DpSJ_EUliiSF_SF_E_JSF_SF_EEEvSG_iDpSH_ [120130]

您可以清楚地看到 cudaMalloc 和 cudaFree 性能变化很大，但没有其他变化:

有趣的是，如果您保留分配的魔术内存，则在调用测试函数之间性能变化将持续存在，例如:

void fun(bool magic, mgpu::context_t &context)
{
    int n = 10000000;
    mgpu::mem_t<int> m0(n, context);
    mgpu::mem_t<int> m1(n, context);
    mgpu::mem_t<int> m2(n, context);
    mgpu::mem_t<int> m3(n, context);

    void *tmpPtr = 0;
    if (magic) cudaMalloc((void **)(&tmpPtr), sizeof(int));

    for (int aa = 0; aa < 10000; aa++)
        mgpu::interval_gather(m0.data(), 1,
                              m1.data(), 1,
                              m2.data(), 
                              m3.data(),
                              context);

}

int main(int argc, char *argv[])
{
    {
    mgpu::standard_context_t context(false);
    std::clock_t c0 = std::clock();

    fun(false, context);
    context.synchronize();
    std::clock_t c1 = std::clock();
    print_ms(c0, c1, "1st");

    fun(true, context);
    context.synchronize();
    std::clock_t c2 = std::clock();
    print_ms(c1, c2, "2nd");

    fun(false, context);
    context.synchronize();
    std::clock_t c3 = std::clock();
    print_ms(c2, c3, "3rd");
    }

    return 0;
}

这样做:

$ nvcc -arch=sm_52 -std=c++11 --expt-extended-lambda -I ~/mgpu/moderngpu/src -o mgpuspeed mgpuspeed.cu 
$ ./mgpuspeed 
1st: 1287.37 ms.
2nd: 201.205 ms.
3rd: 202.275 ms.

[GTX970、CUDA 10.1、Ubuntu 18.04 LTS、440.59 驱动程序上的所有计时]

尽管事实上每个 mem_t 构造函数调用都会调用 cudaMalloc 调用，并在对象掉落时调用相应的 cudaFree 调用超出范围。因此，它与小型“神奇”cudaMalloc 调用具体相关，而不是与一般的 cudaMalloc 调用相关。这让我认为性能变化实际上是在内存管理器中，而不是调度或异步程序流中的某种微妙变化。它可能很简单，例如将平板分配器移动到不同的对齐方式或页面大小，或者改变其内部 cudaMalloc 调用的执行方式。

我会向 NVIDIA 提交错误报告，看看他们怎么说(警告，这在 MGPU 中可能有点奇怪，但我探索该代码库的兴趣非常有限)。

关于c++ - 为什么虚拟 cudaMalloc 会加速现代GPU中的interval_gather？，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/61075640/

c++ - 为什么虚拟 cudaMalloc 会加速现代GPU中的interval_gather？

上一篇：c++ - 为什么不能在不创建节点作为指针的情况下创建链表？

下一篇：c++ - C++中程序执行时带cout和不带cout的时间差