android - GPU 与 CPU 编程 : inconsistencies in processing times

我目前正在研究图像跟踪：借助摄像头，我跟踪与 Android 系统交互的手指触摸。图像处理是用 OpenCL 在 GPU 上完成的：我将相机输出转换为黑白帧，以便得到白色的 Blob。该方法的处理时间为 65ms。由于我的目标是让程序更流畅，我又用 OpenCV 方法在 CPU 上执行了相同的操作，处理时间为 115ms。问题是，使用 OpenCV 方法时程序的响应感觉更灵敏、更快，我不明白既然如此它的处理时间怎么反而更长：这在我看来是矛盾的。我是这样测量时间的：

// Timing pattern used for the measurements: the code being measured goes
// between the two clock() calls, and the difference is converted to seconds.
// Note: clock() reports the processor time consumed by the program, not
// elapsed wall-clock time. Time the thread spends blocked (for example while
// waiting for another device to finish) may therefore not be included in the
// printed value; the exact behaviour is platform-dependent.
start= clock();
finish = clock();
double time =((double)finish -start)/CLOCKS_PER_SEC;
std::cout<<"process time : "<< time<<std::endl;

这是我的代码:

// Frame storage for both cameras. The matrices are declared static, so they
// keep their storage between invocations of the enclosing function.
static cv::Mat original_Right;
static cv::Mat binary_Right;
static cv::Mat original_Left;
static cv::Mat binary_Left;

// Timestamps and elapsed seconds used by the per-frame timing below.
clock_t start, finish;
double time = 0.0;

// Query the capture resolution of the right camera ...
int width  = static_cast<int>(this->camera_Right.getCapture().get(cv::CAP_PROP_FRAME_WIDTH));
int height = static_cast<int>(this->camera_Right.getCapture().get(cv::CAP_PROP_FRAME_HEIGHT));

//--------------------------- Camera 2 ---------------------------------
// ... and of the left camera.
int width_2  = static_cast<int>(this->camera_Left.getCapture().get(cv::CAP_PROP_FRAME_WIDTH));
int height_2 = static_cast<int>(this->camera_Left.getCapture().get(cv::CAP_PROP_FRAME_HEIGHT));

// Colour input frames: three 8-bit channels per pixel.
original_Right.create(height, width, CV_8UC3);
original_Left.create(height_2, width_2, CV_8UC3);

// Processed frames. The GPU path stores one 32-bit float per pixel; the
// commented-out alternative allocates one 8-bit channel for the CPU path.
binary_Right.create(height, width, CV_32F); // FOR GPU
binary_Left.create(height_2, width_2, CV_32F); // FOR GPU
//binary_Right.create(height, width, CV_8UC1); // FOR CPU
//binary_Left.create(height_2, width_2, CV_8UC1); // FOR CPU

Core::running_ = true;


//------------------------------------ SET UP THE GPU -----------------------------------------
cl_context              context;
cl_context_properties   properties [3];
cl_kernel               kernel;
cl_command_queue        command_queue;
cl_program              program;
cl_int                  err;
cl_uint                 num_of_platforms=0;
cl_platform_id          platform_id;
cl_device_id            device_id;
cl_uint                 num_of_devices=0;
cl_mem                  input, output;

size_t                  global;

int                     data_size =height*width*3;


//load opencl source
FILE *fp;
char fileName[] = "./helloTedKrissV2.cl";
char *source_str;

 //Load the source code containing the kernel
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
global = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);


//retreives a list of platforms available
if(clGetPlatformIDs(1,&platform_id, &num_of_platforms)!=CL_SUCCESS){
    std::cout<<"unable to get a platform_id"<<std::endl;
};

// to get a supported GPU device
if(clGetDeviceIDs(platform_id,CL_DEVICE_TYPE_GPU,1,&device_id, &num_of_devices)!= CL_SUCCESS){
    std::cout<<"unable to get a device_id"<<std::endl;      
};

//context properties list - must be terminated with 0
properties[0]=CL_CONTEXT_PLATFORM;
properties[1]=(cl_context_properties) platform_id;
properties[2]=0;

// create a context with the gpu device
context = clCreateContext(properties,1,&device_id,NULL,NULL,&err);

//create command queue using the context and device
command_queue = clCreateCommandQueue(context,device_id,0,&err);

//create a program from the kernel source code
program= clCreateProgramWithSource(context,1,(const char **) &source_str, NULL,&err);

// compile the program
if(clBuildProgram(program,0,NULL,NULL,NULL,NULL)!=CL_SUCCESS){
    size_t length;
    std::cout<<"Error building program"<<std::endl;
    char buffer[4096];
    clGetProgramBuildInfo(program,device_id,CL_PROGRAM_BUILD_LOG, sizeof(buffer),buffer,&length);
    std::cout<< buffer <<std::endl;
}

//specify which kernel from the program to execute
kernel = clCreateKernel(program,"imageProcessing",&err);




while (this->isRunning() == true) { 

    start= clock(); //--------------------- START----------------------

    //----------------------FRAME---------------------
    this->camera_Right.readFrame(original_Right);
    if (original_Right.empty() == true ) {
        std::cerr << "[Core/Error] Original  frame is empty." << std::endl;
        break;
    }

    this->camera_Left.readFrame(original_Left);
    if (original_Left.empty() == true ) {
        std::cerr << "[Core/Error] Original 2  frame is empty." << std::endl;
        break;
    }
    //----------------------FRAME---------------------



  //------------------------------------------------IMP GPU ------------------------------------------------------

    input = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR  , sizeof(unsigned char)*data_size,NULL,NULL);
    output =clCreateBuffer(context,CL_MEM_READ_WRITE   | CL_MEM_ALLOC_HOST_PTR, sizeof(float)*data_size/3,NULL,NULL);

   if(clEnqueueWriteBuffer(command_queue,input,CL_TRUE,0,sizeof(unsigned char)*data_size, original_Right.data ,0,NULL,NULL )!= CL_SUCCESS){};

    //set the argument list for the kernel command
    clSetKernelArg(kernel,0,sizeof(cl_mem), &input);
    clSetKernelArg(kernel,1,sizeof(cl_mem), &output);
    global = data_size  ;
    //enqueue the kernel command for execution
    clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, NULL,0,NULL,NULL);
    clFinish(command_queue);
    //copy the results from out of the  output buffer
    if(clEnqueueReadBuffer(command_queue,output,CL_TRUE ,0,sizeof(float)*data_size/3,binary_Right.data,0,NULL,NULL )!= CL_SUCCESS){};

    clReleaseMemObject(input);
    clReleaseMemObject(output);

    //------------------------------------------------IMP GPU ------------------------------------------------------

    input = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR  , sizeof(unsigned char)*data_size,NULL,NULL);
    output =clCreateBuffer(context,CL_MEM_READ_WRITE   | CL_MEM_ALLOC_HOST_PTR, sizeof(float)*data_size/3,NULL,NULL);

   if(clEnqueueWriteBuffer(command_queue,input,CL_TRUE,0,sizeof(unsigned char)*data_size, original_Left.data ,0,NULL,NULL )!= CL_SUCCESS){};

    //set the argument list for the kernel command
    clSetKernelArg(kernel,0,sizeof(cl_mem), &input);
    clSetKernelArg(kernel,1,sizeof(cl_mem), &output);
    global = data_size  ;
    //enqueue the kernel command for execution
    clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, NULL,0,NULL,NULL);
    clFinish(command_queue);
    //copy the results from out of the  output buffer
    if(clEnqueueReadBuffer(command_queue,output,CL_TRUE ,0,sizeof(float)*data_size/3,binary_Left.data,0,NULL,NULL )!= CL_SUCCESS){};

   clReleaseMemObject(input);
   clReleaseMemObject(output);

    //------------------------------------------------IMP GPU ------------------------------------------------------

  // CPU METHOD
  // adok::processing::doImageProcessing(original_Right, binary_Right);
  // adok::processing::doImageProcessing(original_Left, binary_Left);

    //-------------------------------------------------------------- TRACKING ------------------------------------------------------

adok::tracking::doFingerContoursTracking(binary_Right,binary_Left, this->fingerContours, this->perspective_Right,this->perspective_Left, this->distortion_Right,this->distortion_Left, this);

    //------------------------------------------- TRACKING -----------------------------------------

 //------------------------------SEND COORDINATES TO ANDROID BOARD--------------------
if (getSideRight() && !getSideLeft() ) {
        std::cout<<"RIGHT : "<<std::endl;
        this->uart_.sendAll(this->fingerContours, this->perspective_Right.getPerspectiveMatrix(), RIGHT);
    }else if (!getSideRight() && getSideLeft() ){
        std::cout<<"LEFT : "<<std::endl;
        this->uart_.sendAll(this->fingerContours, this->perspective_Left.getPerspectiveMatrix(), LEFT);
    }else if (getSideRight() && getSideLeft() ){
        std::cout<<"RIGHT & LEFT : "<<std::endl;
        this->uart_.sendAll(this->fingerContours, this->perspective_Right.getPerspectiveMatrix(), this->perspective_Left.getPerspectiveMatrix());

    }

this->setSideRight(0);
this->setSideLeft(0);

finish = clock();
time =(double)(finish - start)/CLOCKS_PER_SEC;
std::cout << "Time: " << time << std::endl; // ------------END-----------

}
clReleaseCommandQueue(command_queue);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseContext(context);
this->stop();

还有一个奇怪的地方：使用 CPU 方法时，抓取一帧的时间是 5ms，而使用 GPU 方法时是 15ms，我不知道为什么会增加。

我使用的开发板是 Odroid XU4。

最佳答案

GPU 计算有时可能比 CPU 计算花费更多时间。这是因为进行 GPU 计算时，主进程要先把数据发送到 GPU 内存，GPU 完成数学计算后再把结果发送回 CPU，而数据的发送和回传都需要时间。如果要处理的缓冲区较大、传输时间较长，GPU 计算就可能耗时更多。CUDNN 库配合 GPU 处理器可以使计算快很多倍，因此，如果您的程序没有使用 CUDNN，它可能会比较慢。

关于android - GPU 与 CPU 编程 : inconsistencies in processing times，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/40259492/

android - GPU 与 CPU 编程 : inconsistencies in processing times

上一篇：java - 单元测试时的 RxJava Schedulers.immediate() 行为

下一篇：android - 如何在带有sqlbrite的sqldelight中使用新的编译语句？