c++ - 使用 CUDA 查找数组中未知大小区域的最大值

假设我有一个数组 A[4000]包含所有不同数字的值 [45,21,764,234,7,0,12,55,...]

然后我有另一个数组B[4000]表示区域在数组 A 中的位置编号 1如果它是一个区域的一部分，并且0如果不是。如果1's彼此相邻，这意味着它们属于同一区域，如果它们彼此不相邻(0 之间有一个 1's)，则它们属于不同区域。

例如。 B = [1,1,1,0,1,1,0,0...]意味着我想在 first three numbers in array A 的区域中找到最大值, 以及 5th and 6th numbers in array A, etc. 中的最大数量这样我就可以生成一个数组 C[4000]拥有 A 的最大值在 B 表示的每个区域中, 和一个 0在不属于地区的地区。

所以在这种情况下 C = [764,764,764,0,7,7,0,0...]

可以是 0 to 2,000 regions 的任何地方, 并且区域的长度范围可以是 2 to 4,000 numbers long .我事先不知道有多少区域或区域的大小。

我一直在尝试在 CUDA 中想出一个内核来实现这个结果。它需要尽可能快地完成，因为它实际上将用于图像，这只是一个简化的例子。我所有的想法，比如使用缩减，只有在只有一个区域跨越所有 4000 的情况下才有效。数组的数量 A .但是，我不认为我可以在这里使用缩减，因为数组中可以有多个区域由 1 分隔。至 3996空格 ( 0's ) 和减少会让我失去对分离区域的跟踪。或者，内核有太多的循环，其中的 if 语句速度不够快，例如

int intR = 0;
 while(B[blockIdx.x * blockDim.x + threadIdx.x + intR] > 0){
     intMaxR = intMaxR < A[blockIdx.x * blockDim.x + threadIdx.x + intR] ? A[blockIdx.x * blockDim.x + threadIdx.x + intR] : intMaxR;
     intR++;
 }

 int intL = 0;
 while(B[blockIdx.x * blockDim.x + threadIdx.x - intL] > 0){
     intMaxL = intMaxL < A[blockIdx.x * blockDim.x + threadIdx.x - intL] ? A[blockIdx.x * blockDim.x + threadIdx.x + intL] : intMaxL;
     intL++;
 }

 intMax =  intMaxR > intMaxL ? intMaxR : intMaxL;

 for(int i = 0; i < intR; i++){
     C[blockIdx.x * blockDim.x + threadIdx.x + i] = intMax;
 }
 for(int i = 0; i < intL; i++){
     C[blockIdx.x * blockDim.x + threadIdx.x - i] = intMax;
 }

很明显，即使使用共享内存，代码也很慢，并且没有真正利用 CUDA 的并行特性。有没有人知道如何或是否可以在 CUDA 中有效地完成这项工作？

提前致谢。

最佳答案

一种可能的方法是使用 thrust .

一个可能的顺序是这样的:

使用 thrust::reduce_by_key 为每个范围生成最大值。
使用 thrust::adjacent_difference 来描绘每个范围的开始
对第 2 步的结果使用包容性扫描来生成收集索引，即用于选择输出 vector 每个位置的缩减值(第 1 步的结果)的索引。<
使用thrust::gather_if使用在步骤 3 中生成的收集索引，有选择地将减少的值放入输出 vector 中的适当位置(B vector 中有 1)。

这是一个完整的代码来演示这一点，使用 A 和 B vector 就像你的例子一样:

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/adjacent_difference.h>
#include <thrust/reduce.h>
#include <thrust/copy.h>
#include <thrust/transform_scan.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/functional.h>

#define DSIZE 8

template <typename T>
struct abs_val : public thrust::unary_function<T, T>
{
  __host__ __device__
  T operator()(const T& x) const
  {
    if (x<0) return -x;
    else return x;
  }
};

template <typename T>
struct subtr : public thrust::unary_function<T, T>
{
  const T val;
  subtr(T _val): val(_val) {}
  __host__ __device__
  T operator()(const T& x) const
  {
    return  x-val;
  }
};

int main(){

  int A[DSIZE] = {45,21,764,234,7,0,12,55};
  int B[DSIZE] = {1,1,1,0,1,1,0,0};
  thrust::device_vector<int> dA(A, A+DSIZE);
  thrust::device_vector<int> dB(B, B+DSIZE);
  thrust::device_vector<int> dRed(DSIZE);
  thrust::device_vector<int> diffB(DSIZE);
  thrust::device_vector<int> dRes(DSIZE);

  thrust::reduce_by_key(dB.begin(), dB.end(), dA.begin(), thrust::make_discard_iterator(), dRed.begin(), thrust::equal_to<int>(), thrust::maximum<int>());
  thrust::adjacent_difference(dB.begin(), dB.end(), diffB.begin());
  thrust::transform_inclusive_scan(diffB.begin(), diffB.end(), diffB.begin(), abs_val<int>(), thrust::plus<int>());
  thrust::gather_if(thrust::make_transform_iterator(diffB.begin(), subtr<int>(B[0])), thrust::make_transform_iterator(diffB.end(), subtr<int>(B[0])), dB.begin(), dRed.begin(), dRes.begin());
  thrust::copy(dRes.begin(), dRes.end(), std::ostream_iterator<int>(std::cout, " "));
  std::cout  << std::endl;
  return 0;
}

例子注释:

reduce_by_key 正在为每个生成减少的值(最大值) B 中连续的 0 序列或 1 序列。你只需要 1 个序列的最大值。我们将丢弃 0 序列通过 gather_if 函数的最大值。
我考虑到 B vector 可能以任何一个开头的可能性 1 序列或 0 序列，通过使用 transform_iterator 处理步骤 2 的 vector 结果，减去第一个来自每个集合索引的 B vector 的值。
adjacent_difference 操作将产生 1 或 -1 到划定一个新序列的开始。我用具有 abs_val 仿函数的 transform_inclusive_scan 变体可以平等对待这些，用于扫描目的(即生成收集索引)。
上面的代码应该会产生与您所需的 C 输出 vector 相匹配的结果，如下所示:
```
$ nvcc -arch=sm_20 -o t53 t53.cu
$ ./t53
764 764 764 0 7 7 0 0
$
```

我们可以使用thrust::placeholders进一步简化上面的代码，消除额外的仿函数定义的需要:

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/adjacent_difference.h>
#include <thrust/reduce.h>
#include <thrust/copy.h>
#include <thrust/transform_scan.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/functional.h>

#define DSIZE 2000000
using namespace thrust::placeholders;

typedef int mytype;

int main(){

  mytype *A = (mytype *)malloc(DSIZE*sizeof(mytype));
  int *B = (int *)malloc(DSIZE*sizeof(int));
  for (int i = 0; i < DSIZE; i++){
    A[i] = (rand()/(float)RAND_MAX)*10.0f;
    B[i] = rand()%2;}
  thrust::device_vector<mytype> dA(A, A+DSIZE);
  thrust::device_vector<int> dB(B, B+DSIZE);
  thrust::device_vector<mytype> dRed(DSIZE);
  thrust::device_vector<int> diffB(DSIZE);
  thrust::device_vector<mytype> dRes(DSIZE);

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start);
  thrust::reduce_by_key(dB.begin(), dB.end(), dA.begin(), thrust::make_discard_iterator(), dRed.begin(), thrust::equal_to<mytype>(), thrust::maximum<mytype>());
  thrust::adjacent_difference(dB.begin(), dB.end(), diffB.begin());
  thrust::transform_inclusive_scan(diffB.begin(), diffB.end(), diffB.begin(), _1*_1, thrust::plus<int>());
  thrust::gather_if(thrust::make_transform_iterator(diffB.begin(), _1 - B[0]), thrust::make_transform_iterator(diffB.end(), _1 - B[0]), dB.begin(), dRed.begin(), dRes.begin());
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);
  float et;
  cudaEventElapsedTime(&et, start, stop);
  std::cout<< "elapsed time: " << et << "ms " << std::endl;
  thrust::copy(dRes.begin(), dRes.begin()+10, std::ostream_iterator<mytype>(std::cout, " "));
  std::cout  << std::endl;
  return 0;
}

(我修改了上面的占位符代码，还包括生成更大尺寸的数据集，以及一些基本的计时装置。)

关于c++ - 使用 CUDA 查找数组中未知大小区域的最大值，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/25605361/

c++ - 使用 CUDA 查找数组中未知大小区域的最大值

上一篇：c - 如何在参数中传递结构类型？

下一篇：c - 如何使用 MPI_Scatterv 将矩阵的行发送到所有进程？