c - 用CUDA提取矩阵列？

使用nvprof，我发现以下内核是我的CUDA应用程序的瓶颈

__global__ void extractColumn_kernel(real_t *tgt, real_t *src, int *indices, int numRows, int len) {
  int stride = gridDim.x * blockDim.x;
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  for (int j = tid; j < len; j += stride) {
    int colId = j / numRows;
    int rowId = j % numRows;
    tgt[j] = src[indices[colId]*numRows + rowId];
  }
}

它打算将src中列出的矩阵indices的列提取到矩阵tgt中。请注意，矩阵src和tgt都有numRows行，并且以列主维度存储。此外，len = length(indices)*numRows是矩阵tgt的条目总数。
我的问题是：有没有更有效的方法来做到这一点？我们也很感激提到老问题。我很惊讶以前没有问过这个问题，因为这是在MATLAB中使用的非常常见的操作。
非常感谢！

最佳答案

作为一个拷贝内核，最好的性能可能会受到可用内存带宽的限制。可以通过运行bandwidthTestcuda sample code并参考设备到设备传输报告的数量（将根据GPU而变化）来获得对此的粗略估计。
您的内核已经写得相当好了，因为加载和存储操作应该很好地结合在一起。通过检查代码可以明显看出这一点，但是您可以通过使用nvprof运行--metrics gld_efficiency和--metrics gst_efficiency来证明这一点。这两个数字应该接近100%。（根据我的测试。）
所以在我的例子中，当我在Quadro5000 GPU上运行内核时，取传输大小除以内核执行时间，我得到一个约为可用带宽60%的数字。您的内核中没有太多其他内容，因此我们只需关注循环中的这两行：

int colId = j / numRows;
int rowId = j % numRows;

事实证明，整数除法和模运算在GPU上都相当昂贵；它们是由编译器生成的指令序列创建的——没有本机除法或模运算机器指令。因此，如果我们能在主循环中找到一种方法来消除这些问题，我们就可以更接近我们的目标，即100%的带宽由bandwidthTest（设备到设备）报告。
由于循环中的增量是固定的（stride），我相信我们可以（主要）为循环的每个迭代预先计算需要添加到colId和rowId中的增量，并在循环中使用加减法，而不是除法。修改后的内核如下所示：

__global__ void my_extractColumn_kernel(real_t *tgt, real_t *src, int *indices, int numRows, int len) {
  int stride = gridDim.x * blockDim.x;
  int div = stride/numRows;
  int rem = stride%numRows;
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  int colId = tid / numRows;
  int rowId = tid % numRows;
  for (int j = tid; j < len; j += stride) {
    tgt[j] = src[indices[colId]*numRows + rowId];
    colId += div;
    rowId += rem;
    if (rowId >= numRows) {rowId-=numRows; colId++;}
  }
}

因此，通过预先计算循环每次迭代的增量，我们可以避免主循环中“昂贵的”除法类型操作。那么表演呢？这个内核更接近100%带宽的目标。下面是完整的测试代码：

#include <stdio.h>

#define NUMR 1000
#define NUMC 20000
#define DSIZE (NUMR*NUMC)
#define EXTC 10000
#define EXSZ (NUMR*EXTC)

#define nTPB 256
#define nBLK 64
typedef float real_t;

__global__ void extractColumn_kernel(real_t *tgt, real_t *src, int *indices, int numRows, int len) {
  int stride = gridDim.x * blockDim.x;
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  for (int j = tid; j < len; j += stride) {
    int colId = j / numRows;
    int rowId = j % numRows;
    tgt[j] = src[indices[colId]*numRows + rowId];
  }
}

__global__ void my_extractColumn_kernel(real_t *tgt, real_t *src, int *indices, int numRows, int len) {
  int stride = gridDim.x * blockDim.x;
  int div = stride/numRows;
  int rem = stride%numRows;
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  int colId = tid / numRows;
  int rowId = tid % numRows;
  for (int j = tid; j < len; j += stride) {
    tgt[j] = src[indices[colId]*numRows + rowId];
    colId += div;
    rowId += rem;
    if (rowId >= numRows) {rowId-=numRows; colId++;}
  }
}


__global__ void copy_kernel(real_t *tgt, real_t *src, int len){
  int tid = threadIdx.x+blockDim.x*blockIdx.x;
  while (tid < len){
    tgt[tid] = src[tid];
    tid+=blockDim.x*gridDim.x;
  }
}


int main(){

  real_t *h_a, *d_a, *h_b, *d_b, *h_bi;
  h_a = (real_t *) malloc(DSIZE*sizeof(real_t));
  cudaMalloc(&d_a, DSIZE*sizeof(real_t));
  h_b = (real_t *) malloc(EXSZ*sizeof(real_t));
  cudaMalloc(&d_b, EXSZ*sizeof(real_t));
  h_bi = (real_t *) malloc(EXSZ*sizeof(real_t));
  int *h_ind, *d_ind;
  h_ind = (int *) malloc(EXTC*sizeof(int));
  cudaMalloc(&d_ind, EXTC*sizeof(int));

  for (int i = 0; i < EXTC; i++) h_ind[i] = i;
  for (int i = 0; i < DSIZE; i++) h_a[i] = i;

  cudaMemcpy(d_a, h_a, DSIZE*sizeof(real_t), cudaMemcpyHostToDevice);
  cudaMemcpy(d_ind, h_ind, EXTC*sizeof(int), cudaMemcpyHostToDevice);
  extractColumn_kernel<<<nBLK, nTPB>>>(d_b, d_a, d_ind, NUMR, NUMR*EXTC);
  cudaMemcpy(h_b, d_b, EXSZ*sizeof(real_t), cudaMemcpyDeviceToHost);
  copy_kernel<<<nBLK, nTPB>>>(d_b, d_a, NUMR*EXTC);
  cudaDeviceSynchronize();
  my_extractColumn_kernel<<<nBLK, nTPB>>>(d_b, d_a, d_ind, NUMR, NUMR*EXTC);
  cudaMemcpy(h_bi, d_b, EXSZ*sizeof(real_t), cudaMemcpyDeviceToHost);
  for (int i = 0; i < EXSZ; i++)
    if (h_bi[i] != h_b[i]) {printf("mismatch at %d, was: %f, should be: %f\n", i, h_bi[i], h_b[i]); return 1;}
  printf("Success!\n");
  return 0;
}

我包括了对你的内核，我的内核的测试，以及对“复制内核”的测试，它只是对相同数量的数据进行纯拷贝。这有助于我们确认可用带宽上限的概念（见下文）。
现在是性能数据。bandwidthTest在这个GPU上告诉我们：

$ /usr/local/cuda/samples/bin/x86_64/linux/release/bandwidthTest
[CUDA Bandwidth Test] - Starting...
Running on...

 Device 0: Quadro 5000
 Quick Mode

 Host to Device Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)        Bandwidth(MB/s)
   33554432                     5855.8

 Device to Host Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)        Bandwidth(MB/s)
   33554432                     6334.8

 Device to Device Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)        Bandwidth(MB/s)
   33554432                     101535.4

Result = PASS

NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
$

所以我们有大约100GB/s的可用带宽。现在运行nvprof --print-gpu-trace我们看到：

$ nvprof --print-gpu-trace ./t822
==17985== NVPROF is profiling process 17985, command: ./t822
Success!
==17985== Profiling application: ./t822
==17985== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput           Device   Context    Stream  Name
781.98ms  29.400ms                    -               -         -         -         -  80.000MB  2.7211GB/s  Quadro 5000 (0)         1         7  [CUDA memcpy HtoD]
811.40ms  9.0560us                    -               -         -         -         -  40.000KB  4.4170GB/s  Quadro 5000 (0)         1         7  [CUDA memcpy HtoD]
811.44ms  1.3377ms             (64 1 1)       (256 1 1)        15        0B        0B         -           -  Quadro 5000 (0)         1         7  extractColumn_kernel(float*, float*, int*, int, int) [188]
812.78ms  21.614ms                    -               -         -         -         -  40.000MB  1.8507GB/s  Quadro 5000 (0)         1         7  [CUDA memcpy DtoH]
834.94ms  816.10us             (64 1 1)       (256 1 1)         9        0B        0B         -           -  Quadro 5000 (0)         1         7  copy_kernel(float*, float*, int) [194]
835.77ms  911.39us             (64 1 1)       (256 1 1)        18        0B        0B         -           -  Quadro 5000 (0)         1         7  my_extractColumn_kernel(float*, float*, int*, int, int) [202]
836.69ms  20.661ms                    -               -         -         -         -  40.000MB  1.9360GB/s  Quadro 5000 (0)         1         7  [CUDA memcpy DtoH]

Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
$

这里的传输大小是1000行*10000列*4字节/元素*每个元素2次传输（一读一写）=8000000字节。您的原始内核在1.34ms内传输这些数据，平均带宽约为60GB/s。“纯拷贝”内核在0.816ms内传输相同的数据，平均带宽为98GB/s-非常接近我们的100GB/s目标。我修改的“列拷贝”内核需要0.911ms，因此它可以传输约88GB/s。
寻找更多的表现？因为我的计算变量rem和div对于每个线程都是相同的，所以您可以在主机代码中预计算这些数量，并将它们作为内核参数传递。我不确定这会有什么不同，但你可以试试。现在您有了一个评估性能效果的路线图（如果有的话）。
笔记：
请注意，我相信我更新修改后的内核中的索引的逻辑是正确的，但是我没有对它进行详尽的测试。它通过了我在这里介绍的简单测试用例。
我漏掉了proper cuda error checking。但是，如果遇到问题，应该使用它，并/或使用cuda-memcheck运行代码。
正如我所提到的，从合并的角度来看，您的内核已经写得相当好了，所以它已经达到了“最佳案例”的60%。所以如果你想在这里获得2倍、5倍或10倍的加速，你就找不到它，而且期望它是不合理的。
编辑：进一步改进
在这种情况下，我们不能更接近纯拷贝内核的一个可能原因是这种间接性：

    tgt[j] = src[indices[colId]*numRows + rowId];
                 ^^^^^^^^^^^^^^

read fromindices（全局变量）表示纯拷贝内核不必进行的“额外”数据访问。可能还有一些聪明的方法来优化这种访问的处理。由于它是重复的（不同于对src的读取和对tgt的写入），这表明可能有一些缓存或专用内存的使用可能会有帮助。
如果我们仔细检查访问的性质，我们可以观察到（对于相当大的矩阵），在大多数情况下，对indices的访问将在一个扭曲的线程中是一致的。这意味着，通常在给定的扭曲中，所有线程的值都是相同的colId，因此将从indices请求相同的元素。这种类型的模式建议使用CUDA__constant__内存进行优化。这里需要的更改并不广泛；我们基本上需要将indices数据移动到__constant__数组。
下面是修改后的代码：

$ cat t822.cu
#include <stdio.h>

#define NUMR 1000
#define NUMC 20000
#define DSIZE (NUMR*NUMC)
#define EXTC 10000
#define EXSZ (NUMR*EXTC)

#define nTPB 256
#define nBLK 64



typedef float real_t;

__constant__ int my_indices[EXTC];

__global__ void extractColumn_kernel(real_t *tgt, real_t *src, int *indices, int numRows, int len) {
  int stride = gridDim.x * blockDim.x;
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  for (int j = tid; j < len; j += stride) {
    int colId = j / numRows;
    int rowId = j % numRows;
    tgt[j] = src[indices[colId]*numRows + rowId];
  }
}

__global__ void my_extractColumn_kernel(real_t *tgt, real_t *src, int numRows, int len, int div, int rem) {
  int stride = gridDim.x * blockDim.x;
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  int colId = tid / numRows;
  int rowId = tid % numRows;
  for (int j = tid; j < len; j += stride) {
    tgt[j] = src[my_indices[colId]*numRows + rowId];
    colId += div;
    rowId += rem;
    if (rowId >= numRows) {rowId-=numRows; colId++;}
  }
}


__global__ void copy_kernel(real_t *tgt, real_t *src, int len){
  int tid = threadIdx.x+blockDim.x*blockIdx.x;
  while (tid < len){
    tgt[tid] = src[tid];
    tid+=blockDim.x*gridDim.x;
  }
}


int main(){

  real_t *h_a, *d_a, *h_b, *d_b, *h_bi;
  h_a = (real_t *) malloc(DSIZE*sizeof(real_t));
  cudaMalloc(&d_a, DSIZE*sizeof(real_t));
  h_b = (real_t *) malloc(EXSZ*sizeof(real_t));
  cudaMalloc(&d_b, EXSZ*sizeof(real_t));
  h_bi = (real_t *) malloc(EXSZ*sizeof(real_t));
  int *h_ind, *d_ind;
  h_ind = (int *) malloc(EXTC*sizeof(int));
  cudaMalloc(&d_ind, EXTC*sizeof(int));

  for (int i = 0; i < EXTC; i++) h_ind[i] = i;
  for (int i = 0; i < DSIZE; i++) h_a[i] = i;

  cudaMemcpy(d_a, h_a, DSIZE*sizeof(real_t), cudaMemcpyHostToDevice);
  cudaMemcpy(d_ind, h_ind, EXTC*sizeof(int), cudaMemcpyHostToDevice);
  extractColumn_kernel<<<nBLK, nTPB>>>(d_b, d_a, d_ind, NUMR, NUMR*EXTC);
  cudaMemcpy(h_b, d_b, EXSZ*sizeof(real_t), cudaMemcpyDeviceToHost);
  copy_kernel<<<nBLK, nTPB>>>(d_b, d_a, NUMR*EXTC);
  cudaDeviceSynchronize();
  cudaMemcpyToSymbol(my_indices, h_ind, EXTC*sizeof(int));
  int mydiv = (nBLK*nTPB)/NUMR;
  int myrem = (nBLK*nTPB)%NUMR;
  my_extractColumn_kernel<<<nBLK, nTPB>>>(d_b, d_a, NUMR, NUMR*EXTC, mydiv, myrem);
  cudaMemcpy(h_bi, d_b, EXSZ*sizeof(real_t), cudaMemcpyDeviceToHost);
  for (int i = 0; i < EXSZ; i++)
    if (h_bi[i] != h_b[i]) {printf("mismatch at %d, was: %f, should be: %f\n", i, h_bi[i], h_b[i]); return 1;}
  printf("Success!\n");
  return 0;
}


$

从性能结果来看：

$ nvprof --print-gpu-trace ./t822
==18998== NVPROF is profiling process 18998, command: ./t822
Success!
==18998== Profiling application: ./t822
==18998== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput           Device   Context    Stream  Name
773.01ms  28.300ms                    -               -         -         -         -  80.000MB  2.8269GB/s  Quadro 5000 (0)         1         7  [CUDA memcpy HtoD]
801.33ms  9.0240us                    -               -         -         -         -  40.000KB  4.4326GB/s  Quadro 5000 (0)         1         7  [CUDA memcpy HtoD]
801.38ms  1.3001ms             (64 1 1)       (256 1 1)        15        0B        0B         -           -  Quadro 5000 (0)         1         7  extractColumn_kernel(float*, float*, int*, int, int) [188]
802.68ms  20.773ms                    -               -         -         -         -  40.000MB  1.9256GB/s  Quadro 5000 (0)         1         7  [CUDA memcpy DtoH]
823.98ms  811.75us             (64 1 1)       (256 1 1)         9        0B        0B         -           -  Quadro 5000 (0)         1         7  copy_kernel(float*, float*, int) [194]
824.82ms  8.9920us                    -               -         -         -         -  40.000KB  4.4484GB/s  Quadro 5000 (0)         1         7  [CUDA memcpy HtoD]
824.83ms  824.65us             (64 1 1)       (256 1 1)        13        0B        0B         -           -  Quadro 5000 (0)         1         7  my_extractColumn_kernel(float*, float*, int, int, int, int) [204]
825.66ms  21.023ms                    -               -         -         -         -  40.000MB  1.9027GB/s  Quadro 5000 (0)         1         7  [CUDA memcpy DtoH]

Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.

我们看到，我们现在几乎达到了100%利用可用带宽的目标（824us与811us的拷贝内核时间相比）。__constant__内存限制为64KB，因此这意味着只有当索引（实际上需要复制的列数）小于16000时，才可用这种方式。

关于c - 用CUDA提取矩阵列？，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/31127484/

c - 用CUDA提取矩阵列？

上一篇：c - 如何将带小数点的有符号数解释为 C 代码中的实际值(256 -> 4)

下一篇：c - 带有 %d 和 %c 的 char scanf 的不同答案