我正在使用 opencv 3.1 cv::cuda 模板匹配,但是 cv::cuda::minMaxLoc()
函数对我来说太慢了。我的匹配结果的最小尺寸为 128x128
,最大尺寸为 512x512
。对于 128x128
,minMaxLoc()
平均需要 1.65 毫秒,对于类似 350x350
的东西,平均需要 25 毫秒,这太长了,因为这已经完成数百次。
我知道我的匹配尺寸对于您通常在 GPU 中使用的尺寸来说可能太小了。但我想按照 Robert Crovella 在 thrust::max_element slow in comparison cublasIsamax - More efficient implementation? 所做的那样进行测试看看我能否获得更好的表现。
我的问题是,所有这些缩减数据都是使用线性索引读取的,而 cv::cuda::PtrStepSzf
不允许这样做(至少我没有找到方法)。我尝试对匹配结果做 reshape,但由于数据不连续而无法实现。我是否需要改用 cudaMallocPitch
和 cudaMemcpy2D
?如果是这样,又该如何把 cv::cuda::GpuMat
当作 cv::cuda::PtrStepSzf
对象来读取?
// Min-reduction kernel sketch (question version), operating directly on a
// cv::cuda::PtrStepSzf. PtrStepSzf has no linear operator(), only the 2D
// form data(y, x), so the flat grid-stride index is decomposed into
// (row, col) before each access; operator()(y, x) applies the row pitch.
// minVal / minValLoc are single-element device outputs.
// NOTE(review): the original compared with '>' starting from FLOAT_MIN
// (a max sweep) although everything else here computes a minimum; fixed
// to a min sweep to match the kernel's name and the host-side init value.
__global__ void minLoc(const cv::cuda::PtrStepSzf data,
                       float* minVal,
                       int* minValLoc   // flat index output; host allocates int
                       )
{
    const int dsize = data.cols * data.rows;   // total logical element count
    __shared__ volatile float vals[nTPB];      // block-reduction scratch
    __shared__ volatile int   idxs[nTPB];
    __shared__ volatile int   last_block;      // used by the elided final reduction
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    last_block = 0;
    float my_val = FLOAT_MAX;                  // running minimum for this thread
    int my_idx = -1;
    // Grid-stride sweep from global memory.
    while (idx < dsize)
    {
        // data(idx) is illegal on PtrStepSzf; convert the flat index into
        // (row, col) and use the legal 2D accessor data(y, x).
        const int row = idx / data.cols;
        const int col = idx % data.cols;
        if (data(row, col) < my_val)
        {
            my_val = data(row, col);
            my_idx = idx;                      // remember the flat logical index
        }
        idx += blockDim.x * gridDim.x;
    }
    // ... rest of the kernel (shared-memory + cross-block reduction)
}
// Host wrapper (question version): launches the minLoc kernel on `_stream`
// and returns the minimum value and its (x, y) location.
// Fixes relative to the original post:
//  - minValLoc / minVal were passed by value, so the results were discarded;
//    they are now output references (call sites still compile).
//  - `input1` was undefined (should be `input`); `data.cols` likewise.
//  - the kernel uses a flat grid-stride index, so a 1D launch replaces the
//    mismatched 32x32 2D configuration.
//  - d_min / d_minLoc were leaked; unbalanced parentheses and a missing
//    semicolon are repaired; cv::point -> cv::Point with (x=col, y=row).
void callMinLocKernel(cv::InputArray _input,
                      cv::Point& minValLoc,
                      float& minVal,
                      cv::cuda::Stream _stream)
{
    const cv::cuda::GpuMat input = _input.getGpuMat();
    // One thread per element, grid-stride loop covers any remainder.
    const int threads = 1024;
    const int total = input.rows * input.cols;
    dim3 cthreads(threads);
    dim3 cblocks(
        static_cast<int>(std::ceil(total / static_cast<double>(threads))));
    // Create and initialize the device-side result slots.
    float h_min = 9999;
    int h_minLoc = -1;
    float* d_min = 0;
    int* d_minLoc = 0;
    // gpuErrchk is defined elsewhere.
    gpuErrchk( cudaMalloc((void**)&d_min, sizeof(h_min)) );
    gpuErrchk( cudaMalloc((void**)&d_minLoc, sizeof(h_minLoc)) );
    gpuErrchk( cudaMemcpy(d_min, &h_min, sizeof(h_min), cudaMemcpyHostToDevice) );
    gpuErrchk( cudaMemcpy(d_minLoc, &h_minLoc, sizeof(h_minLoc), cudaMemcpyHostToDevice) );
    cudaStream_t stream = cv::cuda::StreamAccessor::getStream(_stream);
    minLoc<<<cblocks, cthreads, 0, stream>>>(input, d_min, d_minLoc);
    gpuErrchk(cudaGetLastError());
    // The launch is asynchronous on `stream`; wait before reading back.
    gpuErrchk(cudaStreamSynchronize(stream));
    gpuErrchk( cudaMemcpy(&h_min, d_min, sizeof(h_min), cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaMemcpy(&h_minLoc, d_minLoc, sizeof(h_minLoc), cudaMemcpyDeviceToHost) );
    // Release the temporaries (the original leaked both).
    gpuErrchk(cudaFree(d_min));
    gpuErrchk(cudaFree(d_minLoc));
    // h_minLoc is a flat row-major index; cv::Point is (x=col, y=row).
    minValLoc = cv::Point(h_minLoc % input.cols, h_minLoc / input.cols);
    minVal = h_min;
}
// Demo driver: CUDA template matching followed by the custom min-location
// kernel (cv::cuda::minMaxLoc was too slow for small match sizes).
// Fixes relative to the original post: missing semicolons; the template was
// never converted (`input.convertTo(float_templ, ...)` converted the
// background twice); cv::cuda::Normalize -> cv::cuda::normalize with its
// required arguments; callMinLocKernel was called with the arguments in the
// wrong order and without the stream parameter.
int main()
{
    // Read background and template as grayscale.
    cv::Mat input = cv::imread("cat.jpg", 0);
    cv::Mat templ = cv::imread("catNose.jpg", 0);
    // Convert both to 32-bit float for the CUDA template matcher.
    cv::Mat float_input, float_templ;
    input.convertTo(float_input, CV_32FC1);
    templ.convertTo(float_templ, CV_32FC1);   // was converting `input` twice
    // Upload background and template to the GPU.
    cv::cuda::GpuMat d_src, d_templ, d_match;
    d_src.upload(float_input);
    d_templ.upload(float_templ);
    cv::Ptr<cv::cuda::TemplateMatching> alg =
        cv::cuda::createTemplateMatching(d_src.type(), CV_TM_SQDIFF);
    alg->match(d_src, d_templ, d_match);
    // Normalize the match surface to [0, 1].
    cv::cuda::normalize(d_match, d_match, 0, 1, cv::NORM_MINMAX, -1);
    // Too slow for small match sizes:
    // cv::cuda::minMaxLoc(d_match, &min_val, &max_val, &min_loc, &max_loc);
    cv::Point min_loc;
    float min_val = 0.0f;
    callMinLocKernel(d_match, min_loc, min_val, cv::cuda::Stream::Null());
    return 0;
}
最佳答案
我没有找到在 cv::cuda::PtrStepSzf
中实际使用线性索引的方法。我不确定有没有。看起来当使用这种格式时它只能使用 2 个下标。相反,我在内核包装器中的 cv::cuda::GpuMat input
变量上使用指针 ptr
如下:
#define nTPB 1024
#define FLOAT_MAX 9999.0f
// Host wrapper (answer version). cv::cuda::PtrStepSzf has no linear
// operator(), so instead of passing the GpuMat we hand the kernel the raw
// device pointer, the row pitch in bytes, and the matrix size, letting the
// kernel do its own pitched linear indexing.
// Fixes relative to the original post:
//  - linSrc/step were computed but never used: the GpuMat was passed to a
//    kernel expecting (const float*, size_t, cv::Size, ...); the launch now
//    matches the kernel signature.
//  - minValLoc / minVal were by-value outputs (results discarded) -> refs.
//  - `input1` -> `input`; `data.cols` -> `input.cols`; cv::point -> cv::Point
//    with (x=col, y=row); unbalanced parens and missing semicolon repaired.
//  - d_min / d_minLoc were leaked; a stream sync precedes the readback.
void callMinLocKernel(cv::InputArray _input,
                      cv::Point& minValLoc,
                      float& minVal,
                      cv::cuda::Stream _stream)
{
    const cv::cuda::GpuMat input = _input.getGpuMat();
    const float* linSrc = input.ptr<float>();   // device pointer to row 0
    size_t step = input.step;                   // row pitch in BYTES
    dim3 cthreads(nTPB);
    dim3 cblocks(
        static_cast<int>(std::ceil(input.size().width * input.size().height /
                                   static_cast<double>(nTPB))));
    // Create and initialize the device-side result slots.
    float h_min = FLOAT_MAX;
    int h_minLoc = -1;
    float* d_min = 0;
    int* d_minLoc = 0;
    // gpuErrchk is defined elsewhere.
    gpuErrchk( cudaMalloc((void**)&d_min, sizeof(h_min)) );
    gpuErrchk( cudaMalloc((void**)&d_minLoc, sizeof(h_minLoc)) );
    gpuErrchk( cudaMemcpy(d_min, &h_min, sizeof(h_min), cudaMemcpyHostToDevice) );
    gpuErrchk( cudaMemcpy(d_minLoc, &h_minLoc, sizeof(h_minLoc), cudaMemcpyHostToDevice) );
    cudaStream_t stream = cv::cuda::StreamAccessor::getStream(_stream);
    // Pointer + pitch + logical size, matching the kernel's parameters.
    minLoc<<<cblocks, cthreads, 0, stream>>>(linSrc, step, input.size(),
                                             d_min, d_minLoc);
    gpuErrchk(cudaGetLastError());
    // The launch is asynchronous on `stream`; wait before reading back.
    gpuErrchk(cudaStreamSynchronize(stream));
    gpuErrchk( cudaMemcpy(&h_min, d_min, sizeof(h_min), cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaMemcpy(&h_minLoc, d_minLoc, sizeof(h_minLoc), cudaMemcpyDeviceToHost) );
    // Release the temporaries (the original leaked both).
    gpuErrchk(cudaFree(d_min));
    gpuErrchk(cudaFree(d_minLoc));
    // h_minLoc is a flat row-major index; cv::Point is (x=col, y=row).
    minValLoc = cv::Point(h_minLoc % input.cols, h_minLoc / input.cols);
    minVal = h_min;
}
在内核内部为:
// Min-reduction kernel over a pitched 2D float matrix, addressed linearly.
//   data      device pointer to element (0,0) (GpuMat::ptr<float>())
//   step      row pitch in BYTES (GpuMat::step); divided by sizeof(float)
//             below to obtain the pitch in elements
//   dataSz    logical matrix size (width = cols, height = rows)
//   minVal    single-element device output: the minimum value
//   minValLoc single-element device output: flat row-major index of the min
//             (int, matching the host-side allocation; the original declared
//             it float*)
// Expects a 1D launch of nTPB threads per block; the grid-stride loop makes
// any grid size correct.
__global__ void minLoc(const float* data,
                       const size_t step,
                       cv::Size dataSz,
                       float* minVal,
                       int* minValLoc
                       )
{
    __shared__ volatile float vals[nTPB];   // block-reduction scratch (used in elided part)
    __shared__ volatile int idxs[nTPB];
    __shared__ volatile int last_block;     // flag for the final cross-block reduction
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    const int dsize = dataSz.height * dataSz.width;
    last_block = 0;
    float my_val = FLOAT_MAX;               // running minimum for this thread
    int my_idx = -1;
    // Grid-stride sweep from global memory.
    while (idx < dsize)
    {
        // Rows are pitched: turn the flat logical index into a physical
        // offset. step is in bytes, hence the division by sizeof(float).
        int row = idx / dataSz.width;
        int id = ( row * step / sizeof( float ) ) + idx % dataSz.width;
        if ( data[id] < my_val )
        {
            my_val = data[id];
            my_idx = idx;                   // store the logical (unpitched) index
        }
        idx += blockDim.x * gridDim.x;
    }
    // ... rest of the kernel (shared-memory + cross-block reduction)
}
step
以字节为单位,因此需要除以 sizeof(typeVariable)
希望对您有所帮助!
关于c++ - 如何在 cv::cuda::PtrStepSzf 数据上使用线性索引,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/41111415/