android - GPU 与 CPU 编程:处理时间不一致

标签 android c++ opencv opencl

我目前正在研究图像跟踪:借助摄像头,我跟踪与 Android 系统交互的手指触摸。图像处理是使用 OpenCL 在 GPU 上完成的:我将相机输出转换为黑白帧,以便得到白色的斑点(blob)。该方法的处理时间为 65ms。由于我的目标是让程序更流畅,我又使用 OpenCV 方法在 CPU 上执行了相同的操作,得到的处理时间为 115ms。问题在于,使用 OpenCV 方法时程序感觉反应更灵敏、速度更快,我不明白在这种情况下它的处理时间怎么反而更长:这在我看来是矛盾的。对于测量,我是这样进行的:

start= clock();
finish = clock();
double time =((double)finish -start)/CLOCKS_PER_SEC;
std::cout<<"process time : "<< time<<std::endl;

这是我的代码:

// Frame storage: the colour frame read from each camera and the single-channel
// image derived from it. Declared static, so the same cv::Mat objects are reused.
static cv::Mat              original_Right, binary_Right;
static cv::Mat              original_Left, binary_Left;
clock_t                 start, finish;  // per-iteration timestamps
double time = 0.0;                      // duration of the last iteration, in seconds

//--------------------------- Camera 1 ---------------------------------
// Size the right colour frame from the resolution reported by the capture.
int width  = static_cast<int>(this->camera_Right.getCapture().get(cv::CAP_PROP_FRAME_WIDTH));
int height = static_cast<int>(this->camera_Right.getCapture().get(cv::CAP_PROP_FRAME_HEIGHT));
original_Right.create(height, width, CV_8UC3);

//--------------------------- Camera 2 ---------------------------------
// Same for the left camera, which keeps its own dimensions.
int width_2  = static_cast<int>(this->camera_Left.getCapture().get(cv::CAP_PROP_FRAME_WIDTH));
int height_2 = static_cast<int>(this->camera_Left.getCapture().get(cv::CAP_PROP_FRAME_HEIGHT));
original_Left.create(height_2, width_2, CV_8UC3);

// Output images: one float per pixel for the GPU path.
binary_Right.create(height, width, CV_32F);     // FOR GPU
binary_Left.create(height_2, width_2, CV_32F);  // FOR GPU
// The CPU path uses 8-bit single-channel outputs instead:
//binary_Right.create(height, width, CV_8UC1); // FOR CPU
//binary_Left.create(height_2, width_2, CV_8UC1); // FOR CPU

// Mark the core as running before the processing loop starts.
Core::running_ = true;


//------------------------------------ SET UP THE GPU -----------------------------------------
cl_context              context;
cl_context_properties   properties [3];
cl_kernel               kernel;
cl_command_queue        command_queue;
cl_program              program;
cl_int                  err;
cl_uint                 num_of_platforms=0;
cl_platform_id          platform_id;
cl_device_id            device_id;
cl_uint                 num_of_devices=0;
cl_mem                  input, output;

size_t                  global;

int                     data_size =height*width*3;


//load opencl source
FILE *fp;
char fileName[] = "./helloTedKrissV2.cl";
char *source_str;

 //Load the source code containing the kernel
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
global = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);


//retreives a list of platforms available
if(clGetPlatformIDs(1,&platform_id, &num_of_platforms)!=CL_SUCCESS){
    std::cout<<"unable to get a platform_id"<<std::endl;
};

// to get a supported GPU device
if(clGetDeviceIDs(platform_id,CL_DEVICE_TYPE_GPU,1,&device_id, &num_of_devices)!= CL_SUCCESS){
    std::cout<<"unable to get a device_id"<<std::endl;      
};

//context properties list - must be terminated with 0
properties[0]=CL_CONTEXT_PLATFORM;
properties[1]=(cl_context_properties) platform_id;
properties[2]=0;

// create a context with the gpu device
context = clCreateContext(properties,1,&device_id,NULL,NULL,&err);

//create command queue using the context and device
command_queue = clCreateCommandQueue(context,device_id,0,&err);

//create a program from the kernel source code
program= clCreateProgramWithSource(context,1,(const char **) &source_str, NULL,&err);

// compile the program
if(clBuildProgram(program,0,NULL,NULL,NULL,NULL)!=CL_SUCCESS){
    size_t length;
    std::cout<<"Error building program"<<std::endl;
    char buffer[4096];
    clGetProgramBuildInfo(program,device_id,CL_PROGRAM_BUILD_LOG, sizeof(buffer),buffer,&length);
    std::cout<< buffer <<std::endl;
}

//specify which kernel from the program to execute
kernel = clCreateKernel(program,"imageProcessing",&err);




// Main processing loop: grab one frame per camera, run both through the
// OpenCL kernel, track finger contours, send the result over UART and
// print how long the iteration took. Leaves the loop on an empty frame or
// on any OpenCL failure.
while (this->isRunning() == true) {

    // Time the iteration with a monotonic wall clock. clock() reports CPU time
    // consumed by the program, so it does not count time spent blocked on the
    // GPU and can add up CPU time from several threads (on Linux/Android),
    // which makes GPU and CPU timings incomparable.
    struct timespec wall_start;
    clock_gettime(CLOCK_MONOTONIC, &wall_start); //--------------------- START----------------------

    //----------------------FRAME---------------------
    this->camera_Right.readFrame(original_Right);
    if (original_Right.empty() == true ) {
        std::cerr << "[Core/Error] Original  frame is empty." << std::endl;
        break;
    }

    this->camera_Left.readFrame(original_Left);
    if (original_Left.empty() == true ) {
        std::cerr << "[Core/Error] Original 2  frame is empty." << std::endl;
        break;
    }
    //----------------------FRAME---------------------

    //------------------------------------------------IMP GPU ------------------------------------------------------
    // Both frames go through the same upload -> kernel -> readback sequence.
    // Index 0 is the right camera, index 1 the left one. Each side uses its own
    // byte count, because the left Mats are allocated from width_2/height_2.
    const void *frame_data[2] = { original_Right.data, original_Left.data };
    void       *mask_data[2]  = { binary_Right.data, binary_Left.data };
    const int   frame_bytes[2] = { data_size, height_2 * width_2 * 3 };
    const char *side_name[2]  = { "right", "left" };
    bool gpu_ok = true;

    for (int cam = 0; cam < 2 && gpu_ok; ++cam) {
        const size_t in_bytes  = sizeof(unsigned char) * frame_bytes[cam];
        const size_t out_bytes = sizeof(float) * frame_bytes[cam] / 3;  // one float per pixel
        cl_int status = CL_SUCCESS;

        input = NULL;
        output = NULL;

        input = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, in_bytes, NULL, &status);
        if (status == CL_SUCCESS) {
            output = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, out_bytes, NULL, &status);
        }
        if (status == CL_SUCCESS) {
            // Blocking write: the frame is fully uploaded when this returns.
            status = clEnqueueWriteBuffer(command_queue, input, CL_TRUE, 0, in_bytes, frame_data[cam], 0, NULL, NULL);
        }
        //set the argument list for the kernel command
        if (status == CL_SUCCESS) {
            status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
        }
        if (status == CL_SUCCESS) {
            status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
        }
        //enqueue the kernel command for execution
        if (status == CL_SUCCESS) {
            global = frame_bytes[cam];  // one work-item per input byte, as before
            status = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
        }
        if (status == CL_SUCCESS) {
            status = clFinish(command_queue);
        }
        //copy the results from out of the output buffer
        if (status == CL_SUCCESS) {
            status = clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, out_bytes, mask_data[cam], 0, NULL, NULL);
        }

        // Release whatever was created, on success and on failure alike.
        if (input != NULL) {
            clReleaseMemObject(input);
        }
        if (output != NULL) {
            clReleaseMemObject(output);
        }

        if (status != CL_SUCCESS) {
            std::cerr << "[Core/Error] OpenCL processing failed for the " << side_name[cam]
                      << " frame (error " << status << ")." << std::endl;
            gpu_ok = false;
        }
    }
    if (!gpu_ok) {
        break;  // do not track on an image that was never produced
    }
    //------------------------------------------------IMP GPU ------------------------------------------------------

    // CPU METHOD
    // adok::processing::doImageProcessing(original_Right, binary_Right);
    // adok::processing::doImageProcessing(original_Left, binary_Left);

    //-------------------------------------------------------------- TRACKING ------------------------------------------------------

    adok::tracking::doFingerContoursTracking(binary_Right,binary_Left, this->fingerContours, this->perspective_Right,this->perspective_Left, this->distortion_Right,this->distortion_Left, this);

    //------------------------------------------- TRACKING -----------------------------------------

    //------------------------------SEND COORDINATES TO ANDROID BOARD--------------------
    if (getSideRight() && !getSideLeft() ) {
        std::cout<<"RIGHT : "<<std::endl;
        this->uart_.sendAll(this->fingerContours, this->perspective_Right.getPerspectiveMatrix(), RIGHT);
    }else if (!getSideRight() && getSideLeft() ){
        std::cout<<"LEFT : "<<std::endl;
        this->uart_.sendAll(this->fingerContours, this->perspective_Left.getPerspectiveMatrix(), LEFT);
    }else if (getSideRight() && getSideLeft() ){
        std::cout<<"RIGHT & LEFT : "<<std::endl;
        this->uart_.sendAll(this->fingerContours, this->perspective_Right.getPerspectiveMatrix(), this->perspective_Left.getPerspectiveMatrix());
    }

    // Clear both side flags before the next frame.
    this->setSideRight(0);
    this->setSideLeft(0);

    // Elapsed seconds = whole seconds + nanosecond remainder.
    struct timespec wall_finish;
    clock_gettime(CLOCK_MONOTONIC, &wall_finish);
    time = static_cast<double>(wall_finish.tv_sec - wall_start.tv_sec)
         + static_cast<double>(wall_finish.tv_nsec - wall_start.tv_nsec) / 1e9;
    std::cout << "Time: " << time << std::endl; // ------------END-----------

}
// Tear-down after the loop: drop this code's reference to each OpenCL object,
// then stop the core. The kernel is released ahead of the program it came from.
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
this->stop();

还有一个奇怪的地方:使用 CPU 方法时,抓取一帧的时间是 5ms,而使用 GPU 方法时是 15ms,我不知道为什么会增加。

我是在 Android XU4 开发板上进行开发的。

最佳答案

GPU 计算有时可能比 CPU 计算花费更多时间。这是因为在 GPU 计算中,主进程需要先把数据发送到 GPU 内存,GPU 完成数学计算后再把数据发送回 CPU,因此数据的传输和回传都需要时间。如果参与计算的缓冲区较大、传输时间较长,GPU 计算就可能耗时更多。cuDNN 库配合 GPU 处理器可以使计算快很多倍;因此,如果您的程序没有使用 cuDNN,它可能会比较慢。

关于android - GPU 与 CPU 编程 : inconsistencies in processing times,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/40259492/

相关文章:

c++ - 为什么着色器和程序在 OpenGL 中存储为整数?

opencv - 等效于emgu中的层次结构

java - SlidingDrawer 启动时半打开

Android:getSpeed 是否返回用户的速度?

android - 在 Android Transfuse 中,如何使用 startActivityForResult 和 onActivityResult 传回 In​​tent

Android gradle 失败一次,第二次成功

c++ - GCC,clang 不同意 MSVC 缩小转换

c++ - 如何将 QEvents 传递给子部件?

c++ - 如何在 OpenCV 中找到 x 和 y 梯度?

c++ - 使用 OpenCV 在 iOS 应用程序中进行圆检测