c++ - 如何在 cv::cuda::PtrStepSzf 数据上使用线性索引

标签 c++ opencv cuda

我正在使用 opencv 3.1 的 cv::cuda 模板匹配,但是 cv::cuda::minMaxLoc() 函数对我来说太慢了。我的匹配结果的最小尺寸为 128x128,最大尺寸为 512x512。对于 128x128,minMaxLoc() 平均需要 1.65 毫秒;对于类似 350x350 的尺寸,平均需要 25 毫秒。这太长了,因为这个操作要执行数百次。

我知道我的匹配尺寸对于您通常在 GPU 中使用的尺寸来说可能太小了。但我想按照 Robert Crovella 在 thrust::max_element slow in comparison cublasIsamax - More efficient implementation? 所做的那样进行测试看看我能否获得更好的表现。

我的问题是,所有这些缩减数据都是使用线性索引读取的,而 cv::cuda::PtrStepSzf 不允许这样做(至少我没有找到方法)。我尝试 reshape 我的匹配结果,但我无法这样做,因为数据不连续。我是否需要使用 cudaMallocPitch 和 cudaMemcpy2D?如果是这样,我如何将 cv::cuda::GpuMat 作为 cv::cuda::PtrStepSzf 对象读取?

    // Kernel: grid-stride sweep that finds the minimum value and its linear
    // index in a cv::cuda::PtrStepSzf matrix. Only the global-memory sweep is
    // shown; the shared-memory / inter-block reduction is omitted ("rest of
    // the kernel"). PtrStepSzf has no linear operator(), only (row, col), so
    // the linear index is decomposed before each access.
    __global__ void minLoc(const cv::cuda::PtrStepSzf data,
                                float* minVal,
                                int* minValLoc   // was float*: the caller passes d_minLoc, an int buffer
                        )
    {
        int dsize = data.cols*data.rows;            // was missing ';'
        __shared__ volatile float vals[nTPB];       // was `T`, an undeclared type
        __shared__ volatile int   idxs[nTPB];
        __shared__ volatile int   last_block;
        int idx = threadIdx.x+blockDim.x*blockIdx.x;
        last_block = 0;
        // Searching for the MINIMUM, so start from the largest float
        // (the original mixed a max-style init/compare into a min search).
        float my_val = 3.402823466e38f;             // FLT_MAX
        int my_idx = -1;
        // Grid-stride sweep from global memory.
        while (idx < dsize)
        {
            // PtrStepSzf only supports 2D indexing; operator()(row, col)
            // accounts for the row pitch internally.
            int row = idx / data.cols;
            int col = idx % data.cols;
            if (data(row, col) < my_val)
            {
                my_val = data(row, col);
                my_idx = idx;
            }
            idx += blockDim.x*gridDim.x;
        }

        // ... rest of the kernel (shared-memory / inter-block reduction)
    }

   // Host wrapper: launches the minLoc kernel on `_input` and returns the
   // minimum value and its 2D location through the reference out-params
   // (the originals were pass-by-value, so results never reached the caller).
   void callMinLocKernel(cv::InputArray _input,
                cv::Point& minValLoc,
                float& minVal,
                cv::cuda::Stream _stream)
    {
        const cv::cuda::GpuMat input = _input.getGpuMat();
        dim3 cthreads(32, 32);
        // Ceil-divide the image extents by the block shape
        // (was `input1`, an undeclared name).
        dim3 cblocks(
            static_cast<int>(std::ceil(input.size().width /
                static_cast<double>(cthreads.x))),
            static_cast<int>(std::ceil(input.size().height /
                static_cast<double>(cthreads.y))));

        // Create and upload the device-side result holders.
        float h_min    = 9999;
        int h_minLoc   = -1;
        float * d_min  = 0;
        int * d_minLoc = 0;
        //gpuErrchk is defined elsewhere
        gpuErrchk( cudaMalloc((void**)&d_min, sizeof(h_min)) );
        gpuErrchk( cudaMalloc((void**)&d_minLoc, sizeof(h_minLoc)) );   // was missing a ')'
        gpuErrchk( cudaMemcpy(d_min, &h_min, sizeof(h_min), cudaMemcpyHostToDevice) );
        gpuErrchk( cudaMemcpy(d_minLoc, &h_minLoc, sizeof(h_minLoc), cudaMemcpyHostToDevice) );

        cudaStream_t stream = cv::cuda::StreamAccessor::getStream(_stream);
        minLoc<<<cblocks, cthreads, 0, stream>>>(input, d_min, d_minLoc);
        gpuErrchk(cudaGetLastError());
        // The kernel runs on a (possibly) non-default stream; synchronize
        // before the blocking copies read the results.
        gpuErrchk( cudaStreamSynchronize(stream) );
        gpuErrchk( cudaMemcpy(&h_min, d_min, sizeof(h_min), cudaMemcpyDeviceToHost) );
        gpuErrchk( cudaMemcpy(&h_minLoc, d_minLoc, sizeof(h_minLoc), cudaMemcpyDeviceToHost) );

        // Linear index -> (x, y). cv::Point is (col, row); the original used
        // the undeclared `data.cols`, the lowercase `cv::point`, and had the
        // coordinates swapped.
        minValLoc = cv::Point(h_minLoc % input.cols, h_minLoc / input.cols);
        minVal = h_min;

        // Release the temporaries (originally leaked).
        gpuErrchk( cudaFree(d_min) );
        gpuErrchk( cudaFree(d_minLoc) );
    }

    // Demo driver: template-match on the GPU, then locate the minimum of the
    // normalized score map with the custom kernel instead of the slow
    // cv::cuda::minMaxLoc.
    int main()
    {
        // Read background image and template as grayscale.
        cv::Mat input = imread("cat.jpg", 0);
        cv::Mat templ = imread("catNose.jpg", 0);   // was missing ';'

        // Convert both to 32-bit float for matching.
        cv::Mat float_input, float_templ;
        input.convertTo(float_input, CV_32FC1);
        templ.convertTo(float_templ, CV_32FC1);     // was input.convertTo: wrong source Mat

        // Upload background and template to the GPU.
        cv::cuda::GpuMat d_src, d_templ, d_match;
        d_src.upload(float_input);
        d_templ.upload(float_templ);

        Ptr<cv::cuda::TemplateMatching> alg =
            cuda::createTemplateMatching(d_src.type(), CV_TM_SQDIFF);
        alg->match(d_src, d_templ, d_match);
        // The function is lowercase `normalize`; scale the score map to [0,1].
        cv::cuda::normalize(d_match, d_match, 0, 1, cv::NORM_MINMAX, -1);

        // Too slow for small match sizes:
        //cv::cuda::minMaxLoc(d_match, &min_val, &max_val, &min_loc, &max_loc);

        // Out-params must match the wrapper's signature:
        // (input, cv::Point& location, float& value, stream).
        cv::Point min_loc;
        float min_val = 0.f;
        callMinLocKernel(d_match, min_loc, min_val, cv::cuda::Stream());

        return 0;
    }

最佳答案

我没有找到在 cv::cuda::PtrStepSzf 中实际使用线性索引的方法。我不确定有没有。看起来当使用这种格式时它只能使用 2 个下标。相反,我在内核包装器中的 cv::cuda::GpuMat input 变量上使用指针 ptr 如下:

#define nTPB 1024
#define FLOAT_MAX 9999.0f
// Host wrapper for the linear-index kernel: the GpuMat's raw float pointer,
// its row step (in BYTES) and its size are passed explicitly so the kernel
// can index the pitched data linearly. Out-params are references so the
// results actually reach the caller (the originals were pass-by-value).
void callMinLocKernel(cv::InputArray _input,
        cv::Point& minValLoc,
        float& minVal,
        cv::cuda::Stream _stream)
{
    const cv::cuda::GpuMat input = _input.getGpuMat();
    const float* linSrc = input.ptr<float>();   // base pointer of the pitched data
    size_t step         = input.step;           // row pitch in bytes
    dim3 cthreads(nTPB);
    // 1D grid covering all width*height elements (was `input1`, undeclared).
    dim3 cblocks(
        static_cast<int>(std::ceil(input.size().width * input.size().height /
            static_cast<double>(nTPB))));

    // Create and upload the device-side result holders.
    float h_min    = FLOAT_MAX;
    int h_minLoc   = -1;
    float * d_min  = 0;
    int * d_minLoc = 0;
    //gpuErrchk is defined elsewhere
    gpuErrchk( cudaMalloc((void**)&d_min, sizeof(h_min)) );
    gpuErrchk( cudaMalloc((void**)&d_minLoc, sizeof(h_minLoc)) );   // was missing a ')'
    gpuErrchk( cudaMemcpy(d_min, &h_min, sizeof(h_min), cudaMemcpyHostToDevice) );
    gpuErrchk( cudaMemcpy(d_minLoc, &h_minLoc, sizeof(h_minLoc), cudaMemcpyHostToDevice) );

    cudaStream_t stream = cv::cuda::StreamAccessor::getStream(_stream);
    // The kernel expects (pointer, step, size, ...), not the GpuMat itself
    // (the original still launched with `input`).
    minLoc<<<cblocks, cthreads, 0, stream>>>(linSrc, step, input.size(), d_min, d_minLoc);
    gpuErrchk(cudaGetLastError());
    // Synchronize the (possibly) non-default stream before the blocking copies.
    gpuErrchk( cudaStreamSynchronize(stream) );
    gpuErrchk( cudaMemcpy(&h_min, d_min, sizeof(h_min), cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaMemcpy(&h_minLoc, d_minLoc, sizeof(h_minLoc), cudaMemcpyDeviceToHost) );

    // Linear index -> (x, y): cv::Point is (col, row). The original used the
    // undeclared `data.cols`, lowercase `cv::point`, and swapped coordinates.
    minValLoc = cv::Point(h_minLoc % input.cols, h_minLoc / input.cols);
    minVal = h_min;

    // Release the temporaries (originally leaked).
    gpuErrchk( cudaFree(d_min) );
    gpuErrchk( cudaFree(d_minLoc) );
}

在内核内部为:

// Kernel: grid-stride sweep that finds the minimum value and its logical
// linear index in pitched float data. `step` is the row pitch in BYTES
// (GpuMat rows may be padded), so it is divided by sizeof(float) before
// being used as an element offset. Only the sweep is shown; the
// shared-memory / inter-block reduction is omitted.
__global__ void minLoc(const float* data,
                       const size_t step,
                       cv::Size dataSz,
                       float* minVal,
                       int* minValLoc   // was float*: receives d_minLoc, an int buffer
                    )
{
    __shared__ volatile float vals[nTPB];   // was `T`, an undeclared type
    __shared__ volatile int   idxs[nTPB];
    __shared__ volatile int   last_block;

    int idx         = threadIdx.x + blockDim.x*blockIdx.x;
    const int dsize = dataSz.height*dataSz.width;
    last_block = 0;
    float my_val = FLOAT_MAX;
    int my_idx = -1;
    // Grid-stride sweep from global memory.
    while (idx < dsize)
    {
        // Map the logical linear index onto the pitched layout: each row
        // starts step/sizeof(float) elements after the previous one.
        int row = idx / dataSz.width;
        int id  = ( row*step / sizeof( float ) ) + idx % dataSz.width;
        if ( data[id] < my_val )
        {
           my_val = data[id];
           my_idx = idx;
        }
        idx += blockDim.x*gridDim.x;
    }

    // ... rest of the kernel (shared-memory / inter-block reduction)
}

step 以字节为单位,因此需要除以 sizeof(typeVariable)。希望对您有所帮助!

关于c++ - 如何在 cv::cuda::PtrStepSzf 数据上使用线性索引,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/41111415/

相关文章:

c++ - 嵌入 LuaJIT - 创建包含文件夹

c++ - CUDA 驱动 CUmodule

c++ - Cuda 错误未定义对 'cufftPlan1d' 的引用?

c++ - CUDA:获取 3D 表面的子集

c++ - C++ 中的 Stackdump 不显示堆栈跟踪

c# - 如何将 .h 文件 +dll 变成某种 .Net 包装器?

c++ - 为什么在返回兼容类型时需要显式 std::move?

c++ - 如何使用 OpenCV 显示 PGM 图像

python - 什么以及为什么 "TypeError: Required argument ' layout' (pos 2) not found“Python 中的错误?

java - sikuli classpath UnsatisfiedLinkError 没有 opencv_core with macosx intellij Junit