c++ - CUDA 并发内核启动不起作用

我正在编写一个用于图像处理的 CUDA 程序。R、G、B 三个通道将分别启动同一个内核 “processOneChannel”。

下面我尝试为这三次内核启动分别指定流，以便它们可以并发执行。但是 nvprof 显示它们仍然是依次（串行）执行的……

这三个内核前后还有另外两个内核，我不想让它们同时运行。

基本上我想要以下流程: separateChannels --> processOneChannel(x3) --> recombineChannels

请指教我哪里做错了..

// Runs the three-stage image pipeline on the device:
//
//     separateChannels --> processOneChannel (x3) --> recombineChannels
//
// The three processOneChannel launches are issued into three separate
// non-blocking streams so the hardware is *allowed* to overlap them. Streams
// only permit overlap, they do not guarantee it: if a single launch already
// occupies every SM, blocks of the next kernel cannot start until blocks of
// the previous one finish, and the kernels will still appear back-to-back in
// a profiler trace.
//
// Parameters:
//   h_inputImageRGBA  - unused here; kept so existing callers still compile.
//   d_inputImageRGBA  - input image passed to separateChannels.
//   d_outputImageRGBA - output image written by recombineChannels.
//   numRows, numCols  - image dimensions; they size the launch grid.
//   d_redProcessed, d_greenProcessed, d_blueProcessed
//                     - per-channel outputs of processOneChannel, consumed by
//                       recombineChannels.
//   d_prand           - forwarded unchanged to every processOneChannel launch.
//
// d_red, d_green, d_blue and d_filter are not declared in this function;
// presumably they are file-scope device buffers -- verify against the rest
// of the file.
//
// Every CUDA runtime call and every launch is passed through checkCudaErrors
// so that a failure is reported at the stage that caused it.
void kernelLauncher(const ushort4 * const h_inputImageRGBA, ushort4 * const d_inputImageRGBA,
                        ushort4* const d_outputImageRGBA, const size_t numRows, const size_t numCols,
                        unsigned short *d_redProcessed, 
                        unsigned short *d_greenProcessed, 
                        unsigned short *d_blueProcessed,
                        unsigned short *d_prand)
{
  (void)h_inputImageRGBA;  // not needed by this function

  // An empty image would yield a zero-sized grid, which is an invalid launch
  // configuration; there is nothing to process in that case anyway.
  if (numRows == 0 || numCols == 0)
  {
    return;
  }

  const unsigned int threadsX = 512;
  const unsigned int threadsY = 1;
  const dim3 blockSize(threadsX, threadsY, 1);

  // Ceiling division: just enough blocks to cover the image, without the
  // extra block row/column that "n / threads + 1" adds when n is an exact
  // multiple of the block dimension.
  const dim3 gridSize((unsigned int)((numCols + threadsX - 1) / threadsX),
                      (unsigned int)((numRows + threadsY - 1) / threadsY),
                      1);

  // One stream per colour channel. cudaStreamNonBlocking means these streams
  // do not implicitly synchronize with the legacy default stream, so the
  // stage boundaries below are enforced with explicit device-wide syncs.
  const int numChannels = 3;
  cudaStream_t streams[numChannels];
  for (int i = 0; i < numChannels; i++)
  {
    checkCudaErrors(cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking));
  }

  // Stage 1: split the interleaved image into per-channel buffers.
  separateChannels<<<gridSize, blockSize>>>(d_inputImageRGBA,
                                            (int)numRows,
                                            (int)numCols,
                                            d_red,
                                            d_green,
                                            d_blue);
  checkCudaErrors(cudaGetLastError());       // launch-configuration errors
  checkCudaErrors(cudaDeviceSynchronize());  // execution errors; stage barrier

  // Stage 2: process each channel in its own stream.
  processOneChannel<<<gridSize, blockSize, 0, streams[0]>>>(d_red,
                                                            d_redProcessed,
                                                            (int)numRows, (int)numCols,
                                                            d_filter, d_prand);
  checkCudaErrors(cudaGetLastError());

  processOneChannel<<<gridSize, blockSize, 0, streams[1]>>>(d_green,
                                                            d_greenProcessed,
                                                            (int)numRows, (int)numCols,
                                                            d_filter, d_prand);
  checkCudaErrors(cudaGetLastError());

  processOneChannel<<<gridSize, blockSize, 0, streams[2]>>>(d_blue,
                                                            d_blueProcessed,
                                                            (int)numRows, (int)numCols,
                                                            d_filter, d_prand);
  checkCudaErrors(cudaGetLastError());

  // All three channels must be finished before they are recombined.
  checkCudaErrors(cudaDeviceSynchronize());

  // Stage 3: merge the processed channels into the output image.
  recombineChannels<<<gridSize, blockSize>>>(d_redProcessed,
                                             d_greenProcessed,
                                             d_blueProcessed,
                                             d_outputImageRGBA,
                                             numRows,
                                             numCols);
  checkCudaErrors(cudaGetLastError());

  // The per-channel streams are idle after the sync above; release them.
  for (int i = 0; i < numChannels; i++)
  {
    checkCudaErrors(cudaStreamDestroy(streams[i]));
  }

  // Wait for recombineChannels so the output is complete on return.
  checkCudaErrors(cudaDeviceSynchronize());
}

这是 nvprof 的 GPU 跟踪输出。请注意，内核启动之前的 memcpy 用于传输处理所需的滤波器数据，因此它们不能与内核并发运行。

==10001== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput           Device   Context    Stream  Name
1.02428s  2.2400us                    -               -         -         -         -  28.125MB   1e+04GB/s  GeForce GT 750M         1        13  [CUDA memset]
1.02855s  18.501ms                    -               -         -         -         -  28.125MB  1.4846GB/s  GeForce GT 750M         1        13  [CUDA memcpy HtoD]
1.21959s  1.1371ms                    -               -         -         -         -  1.7580MB  1.5098GB/s  GeForce GT 750M         1        13  [CUDA memcpy HtoD]
1.22083s  1.3440us                    -               -         -         -         -  7.0313MB   5e+03GB/s  GeForce GT 750M         1        13  [CUDA memset]
1.22164s  1.3440us                    -               -         -         -         -  7.0313MB   5e+03GB/s  GeForce GT 750M         1        13  [CUDA memset]
1.22243s  3.6480us                    -               -         -         -         -  7.0313MB   2e+03GB/s  GeForce GT 750M         1        13  [CUDA memset]
1.22349s  10.240us                    -               -         -         -         -  8.0000KB  762.94MB/s  GeForce GT 750M         1        13  [CUDA memcpy HtoD]
1.22351s  6.6021ms           (6 1441 1)       (512 1 1)        12        0B        0B         -           -  GeForce GT 750M         1        13  separateChannels(...) [123]
1.23019s  10.661ms           (6 1441 1)       (512 1 1)        36      192B        0B         -           -  GeForce GT 750M         1        14  processOneChannel(...) [133]
1.24085s  10.518ms           (6 1441 1)       (512 1 1)        36      192B        0B         -           -  GeForce GT 750M         1        15  processOneChannel(...) [141]
1.25137s  10.779ms           (6 1441 1)       (512 1 1)        36      192B        0B         -           -  GeForce GT 750M         1        16  processOneChannel(...) [149]
1.26372s  5.7810ms           (6 1441 1)       (512 1 1)        15        0B        0B         -           -  GeForce GT 750M         1        13  recombineChannels(...) [159]
1.26970s  19.859ms                    -               -         -         -         -  28.125MB  1.3831GB/s  GeForce GT 750M         1        13  [CUDA memcpy DtoH]

这是我的 CMakeLists.txt，我在其中将 -default-stream per-thread 传递给 nvcc

# Build script for the "My" executable: host sources plus every CUDA source
# in this directory, linked against OpenCV.
cmake_minimum_required(VERSION 2.6 FATAL_ERROR)

find_package(OpenCV REQUIRED)
find_package(CUDA REQUIRED)

# Keep any existing nvcc flags and append the per-thread default-stream option.
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -default-stream per-thread)

# Headers and CUDA sources are picked up by globbing the current directory.
file(GLOB hdr *.hpp *.h)
file(GLOB cu *.cu)

# Host-side sources.
set(My_files main.cpp)

# Project executable
cuda_add_executable(My ${My_files} ${hdr} ${cu})
target_link_libraries(My ${OpenCV_LIBS})

最佳答案

每次内核启动的网格为 6*1441，即超过 8000 个 block，每个 block 有 512 个线程。这已经占满了整个 GPU，后续启动的内核的 block 因此无法开始执行。

GPU 的容量是有限的。以 block 为单位的最大瞬时容量等于 GPU 中的 SM 数量乘以每个 SM 可同时驻留的最大 block 数，这两项规格都可以通过 deviceQuery 示例程序查到。当容量被占满后，在已经运行的 block 退出之前，GPU 无法再处理更多的 block。对第一个内核而言，这一过程会一直持续到它的大多数 block 都执行完毕，然后第二个内核才会开始执行。

关于c++ - CUDA 并发内核启动不起作用，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/36523179/

c++ - CUDA 并发内核启动不起作用

上一篇：c++ - 当我在 C++/SFML 中按下鼠标按钮时如何绘制球

下一篇：c++ - Composite in C++ 错误 [没有匹配的成员函数来调用 'push_back']