c++ - 当明显不相关的代码被更改时,性能差异很大

标签 c++ performance inline compiler-optimization icc

我有一个奇怪的问题。 我使用 OpenMP 库在 C++ 中编写了一个简单的异步优化算法。 我编写的代码运行良好,没有任何错误。

然后我尝试评估某段代码的计算时间。 在我的系统上,该部分大约需要 12 秒。

然后,我注意到如果我注释掉与该部分完全无关的代码行,该部分的计算时间会减少很多!它下降到大约 1 秒。

我不知道如何向您提供显示我的问题的简单代码。 我在下面附加的代码是我的原始代码,我从中删除了所有不会造成时间问题的部分。 不幸的是,我无法从代码中删除其他行,因为我尝试删除的每一行都会更改我感兴趣的部分的执行时间。

我指的是这一节,它在代码的末尾:

double gradientD_time = omp_get_wtime();
compute_function_gradient_D(gradient_D, DX, K, M, N);
double gradientD_total = (omp_get_wtime()- gradientD_time);

如您所见,我在这里测量的是 compute_function_gradient_D() 函数的计算时间。如果我运行这段代码,该部分的执行大约需要 12 秒。如果从代码中删除某些行,该部分的执行时间会下降到 1 秒。以下是您可以尝试删除的行的示例:

    std::string str_1 = folder + "parameters.dat";
    std::string str_2 = folder + "times.dat";
    std::string str_3 = folder + "merits.dat";
    std::string str_4 = folder + "values.dat";
    std::string str_5 = folder + "lipx.dat";
    std::string str_6 = folder + "lipd.dat";

throw std::exception();

merits[iter] = max_br_init;

这些行与我正在为其计算执行时间的部分完全无关...如果我删除其中一行,为什么执行时间会改变?这里发生了什么?

#include <omp.h>
#include <cmath>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <cstdlib>
#include <fstream>
#include <sstream>
#include <iomanip>
#include <iostream>
#include <stdexcept>
#include <algorithm>
#include "mkl.h"
 // Fills gradient_D[j*M+i], for every j in [0,K) and i in [0,M), with the sum
 // over k in [0,N) of DX[i+k*M].
 //
 // gradient_D must provide at least K*M writable doubles and DX at least M*N
 // readable doubles (the largest indices touched are K*M-1 and M*N-1).
 // The accumulated value does not depend on j, so each of the K length-M
 // segments of gradient_D receives the same M sums.
 //
 // Both arguments are plain (non-restrict, non-const) pointers, so nothing in
 // the signature tells the compiler that gradient_D and DX do not overlap.
 void compute_function_gradient_D(double *gradient_D, double *DX, int K, int M, int N) {
    for (int j = 0; j < K; j++){
        for (int i = 0; i < M; i++){
            // Reset the output entry before accumulating into it in place.
            gradient_D[j*M+i] = 0;
            // k is the innermost index, so consecutive iterations read DX
            // at a stride of M elements while writing the same output entry.
            for (int k = 0; k < N; k++)
                gradient_D[i+M*j] += DX[i+k*M];
        }   
    }
 }

  // Computes a scalar constant from the M x K matrix D and returns it.
  //
  // D is passed to cblas_dgemv as a column-major M x K matrix with leading
  // dimension M. The function seeds a length-M vector, then alternates
  // D^T * v and D * w products for at most 20 rounds, and returns the square
  // of the final ratio err1 = ||D * w|| / ||w||.
  // NOTE(review): this looks like a power iteration estimating the squared
  // largest singular value of D -- confirm against the original algorithm.
  //
  // Three temporary buffers are allocated with new[] and released before
  // returning. There is no guard against nrm2, normx or normy being zero.
  double compute_D_const(double *D, int M, int K){
    double L1norm_col = 0.0, err0=0, err1 = 0.0, tol=1e-6, normx = 0.0, normy= 0.0, nrm2= 0.0;
    int count = 0;  
    double *Dt_col = new double[K]();
    double *DDtb = new double[M]();
    double *Dtb = new double[K]();
    // Seed DDtb[i] with the sum of absolute values of K elements of D taken
    // from offset i with stride M.
    for (int i = 0; i < M; i++){
        // NOTE(review): [start:length:stride] array-section syntax is not
        // standard C++; it appears to be Intel Cilk Plus array notation, in
        // which case this copies D[i], D[i+M], ..., (K elements) into Dt_col.
        Dt_col[0:K:1] = D[i:K:M];
        L1norm_col = cblas_dasum(K, Dt_col, 1);
        DDtb[i] = L1norm_col; 
    }
    // Normalise the seed vector to unit Euclidean length.
    nrm2 = cblas_dnrm2(M, DDtb, 1);
    cblas_dscal(M, 1.0/nrm2, DDtb, 1);    
    err1 = nrm2;
    // Iterate until the relative change of err1 drops to tol or 20 rounds ran.
    while(std::abs(err1-err0)>tol*err1 && count<20){
        err0 = err1;
        // Dtb = D^T * DDtb   (length K)
        cblas_dgemv(CblasColMajor, CblasTrans, M, K, 1.0, D, M, DDtb, 1 , 0.0, Dtb, 1);
        // DDtb = D * Dtb     (length M)
        cblas_dgemv(CblasColMajor, CblasNoTrans, M, K, 1.0, D, M, Dtb, 1, 0.0, DDtb, 1);
        normx = cblas_dnrm2(M, DDtb, 1);
        normy = cblas_dnrm2(K, Dtb, 1);
        err1 = normx/normy;
        // Re-normalise DDtb for the next round.
        cblas_dscal(M, 1.0/normx, DDtb, 1);
        count++;        
        // Never taken: the loop condition already stops at count == 20.
        if(count>100) break;
    }
    err1*= err1;
    delete [] Dt_col; delete [] DDtb; delete [] Dtb;
    return err1;
 }

// Fills gradient_X[0 .. kn+over_X) with dot products of length-M segments of
// D and DX.
//
// For output i the flat index fe_X + i is split into col = (fe_X+i) / K and
// row = (fe_X+i) - col*K; gradient_X[i] is then the sum over j in [0,M) of
// D[M*row+j] * DX[M*col+j].
//
// gradient_X must hold at least kn+over_X doubles; D and DX are only read.
void compute_function_gradient_X(double *gradient_X, double *D, double *DX,     int over_X, int fe_X, int K, int M, int kn) {
    int current_index_X = 0, col = 0, row = 0; 
    for (int i = 0; i < (kn+over_X); i++){
        gradient_X[i] = 0.0;
        current_index_X = fe_X + i;
        // current_index_X/K is already an integer division; std::floor is
        // applied to that truncated quotient after conversion to double.
        col = std::floor(current_index_X/K);
        // Remainder of the division above.
        row = current_index_X - col*K;
        // Accumulate the dot product of the two M-element segments in place.
        for(int j = 0; j < M; j++)
            gradient_X[i] +=  D[M*row+j]*DX[M*col+j];
    }   
}

// Stripped-down benchmark driver.
//
// Reads six numeric parameters from ../../data/param.dat, allocates
// zero-initialised work arrays, runs an initial pass over them with MKL
// routines, then times one call each of compute_function_gradient_X and
// compute_function_gradient_D and prints the elapsed time of the latter.
// Throws std::exception (uncaught here) if the parameter file cannot be
// opened; otherwise returns 0. argc and argv are not used.
int main (int argc, char **argv) {
    srand(time(NULL));
    // Stopping limits and the three flags that end the timing loop below.
    int max_time = 15000;
    int max_iter = 1;
    int time_flag = 0;
    int merit_flag = 0;
    int iter_flag = 0;
    int iter = 0;
    int core_count = 0;     
    double merit_limit = 1e-6;
    double tau_0 = 1;
    int number_of_threads = 1;
    // Problem sizes; overwritten from the parameter file below.
    int M = 0;
    int K = 0;
    int N = 0;
    double entry = 0.0;
    // Initialised from a double literal; implicitly converted to int 0.
    int kn = 0.0;
    int uneven_X = 0;
    int uneven_D = 0;
    int k = 0;
    double lambda = 1;
    double constr = 1;
    double warm_up = 10;
    std::string data = "../../data/param.dat";
    FILE *file = fopen(data.c_str(), "r");
    if (file == NULL) { 
        std::cout << "ERROR" << std::endl; 
        throw std::exception(); 
    }
    // Each value is parsed as a double and then assigned; for M, K and N this
    // implicitly converts to int. The fscanf return values are not checked.
    fscanf(file, "%lf", &entry); M = entry; fscanf(file, "\n");     
    fscanf(file, "%lf", &entry); K = entry; fscanf(file, "\n");                                     
    fscanf(file, "%lf", &entry); N = entry; fscanf(file, "\n"); 
    fscanf(file, "%lf", &entry); lambda = entry; fscanf(file, "\n");
    fscanf(file, "%lf", &entry); constr = entry; fscanf(file, "\n");    
    fscanf(file, "%lf", &entry); warm_up = entry;
    fclose(file);
    // All arrays are value-initialised (zero-filled) by the trailing "()".
    double *X = new double[N*K]();    
    double *D = new double[M*K]();  
    double *S = new double[N*M]();  
    double *times = new double[max_iter+2*number_of_threads+1]();
    double *merits = new double[max_iter+2*number_of_threads+1]();
    double *values = new double[max_iter+2*number_of_threads+1]();
    double *Lip_X = new double[max_iter+2*number_of_threads+1]();
    double *Lip_D = new double[max_iter+2*number_of_threads+1]();
    int *actual_iteration_vector = new int[number_of_threads]();
    // Sum of squares of S; S is never written in this function, so it is
    // still all zeros here.
    double f_value = 0.0;
    for (int i = 0; i < M*N; i++)
        f_value += S[i]*S[i];
    double *nabla_X_init = new double[K*N]();
    // Running maximum of absolute differences computed in the two loops below.
    double max_br_init = 0.0;
    double x_hat_init = 0.0, gradient_init = 0.0, parameter_init = 0.0, tauX_init = 0.0, LipD_init = 0.0;
    double m_value = 9999;
    int t_warm_up = warm_up*number_of_threads;
    LipD_init = compute_D_const(D, M, K);
    tauX_init = std::max(LipD_init, tau_0);
    // nabla_X_init (K x N, column-major) = -1.0 * D^T * S.
    cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, K, N, M, -1.0, D, M, S, M, 0.0, nabla_X_init, K);
    for (int i = 0; i < (K*N); i++){
        gradient_init = nabla_X_init[i];
        x_hat_init = X[i] - gradient_init/tauX_init;
        parameter_init = lambda/tauX_init;
        // Move x_hat_init towards zero by parameter_init, or set it to zero
        // when it lies strictly between -parameter_init and parameter_init.
        if (x_hat_init >= parameter_init) 
            x_hat_init -= parameter_init;
        else {
            if (x_hat_init <= -parameter_init) 
            x_hat_init += parameter_init; 
            else 
                 x_hat_init = 0.0; 
        }
        if(std::abs(x_hat_init-X[i]) >= max_br_init)
            max_br_init = std::abs(x_hat_init-X[i]);
    }
    double *D_col_init = new double[M]();
    double *D_hat_init = new double[M*K]();
    double *max_br = new double[number_of_threads]();
    std::fill(max_br, max_br+number_of_threads, -9999);
    // NOTE(review): [start:length:stride] array-section syntax is not standard
    // C++; it appears to be Intel Cilk Plus array notation, in which case this
    // copies all M*K elements of D into D_hat_init.
    D_hat_init[0:M*K:1] = D[0:M*K:1];
    double col_norm_init = 0.0;
    // For each length-M segment i of D: if its Euclidean norm exceeds constr,
    // scale the matching segment of D_hat_init by constr/norm.
    for (int i = 0; i < K; i++){
        D_col_init[0:M:1] = D[(i*M):M:1];
        col_norm_init = cblas_dnrm2(M, D_col_init, 1);
        if(col_norm_init > constr)
            D_hat_init[(i*M):M:1] *= constr/col_norm_init;
    }
    for (int i = 0; i < (M*K); i++){
        if(std::abs(D_hat_init[i]-D[i]) >= max_br_init)
            max_br_init = std::abs(D_hat_init[i]-D[i]);
    }
    // Record the initial state in slot 0, then advance iter to 1.
    values[iter] = 0.5*f_value; 
    merits[iter] = max_br_init;
    times[iter] = 0.0;
    iter++;         
    // Work split per thread; the operands are ints, so the divisions are
    // already truncated before std::floor is applied.
    kn = std::floor((K*N)/number_of_threads);
    uneven_X = (K*N % number_of_threads);       
    k = std::floor(K/number_of_threads);
    uneven_D = (K % number_of_threads); 
    delete [] nabla_X_init; delete [] D_col_init; delete [] D_hat_init;
    double total = omp_get_wtime();
    // Only spans the gap between two consecutive omp_get_wtime() calls;
    // init_time is not read again in this function.
    double init_time = omp_get_wtime() - total; 
    int thread_id = 0;
    // No parallel region is visible in this function.
    // NOTE(review): presumably this returns 0 in sequential code -- confirm.
    thread_id = omp_get_thread_num();
    int over_X = 0;
    int over_D = 0;
    // The last thread picks up the remainder of the uneven split.
    if ((uneven_X != 0) && (thread_id == (number_of_threads-1)))
        over_X = uneven_X; 
    if ((uneven_D != 0) && (thread_id == (number_of_threads-1)))
        over_D = uneven_D;     
    double *gradient_X = new double[kn+over_X]();   
    double *delta_X = new double[kn+over_X]();
    double *delta_D = new double[(k+over_D)*M]();
    double *D_col = new double[M]();
    // First flat index handled by this thread, passed to
    // compute_function_gradient_X below.
    int fe_X = thread_id*kn;
    int fe_D = thread_id*k;         
    double end  = 0.0, LipX = 0.0, LipD = 0.0, tauX = 0.0, tauD = 0.0, X_hat = 0.0, col_norm = 0.0, max_br_local = 0.0;
    double *D_hat = new double[(k+over_D)*M]();
    double *times_local = new double[max_iter+1]();         
    double *merits_local = new double[max_iter+1](); 
    double *values_local = new double[max_iter+1]();            
    int current_index_X = 0, current_index_D = 0;
    int actual_iteration = 1;
    times_local[0] = times[0];
    merits_local[0] = merits[0];
    values_local[0] = values[0];
    actual_iteration_vector[thread_id] = 1;
    double start = omp_get_wtime(); 
    double gradientX_total = 0.0;               
    double *gradient_D = new double[(k+over_D)*M](); 
    double *DX = new double[M*N]();
    // Timing loop. iter is 1 on entry and max_iter is 1, so iter_flag is set
    // at the end of the first pass and the body runs exactly once.
    while (iter_flag == 0 && merit_flag == 0 && time_flag == 0){
        double gradientX_time = omp_get_wtime();
        compute_function_gradient_X(gradient_X, D, DX, over_X, fe_X, K, M, kn);
        gradientX_total += (omp_get_wtime()-gradientX_time);
        // The measurement of interest: wall-clock time of one call.
        double gradientD_time = omp_get_wtime();
        compute_function_gradient_D(gradient_D, DX, K, M, N);
        double gradientD_total = (omp_get_wtime()- gradientD_time);
        printf("Gradient D total = %f \n", gradientD_total); 
        iter++;
        if ((omp_get_wtime() - total) >= max_time)
            time_flag = 1; 
        if (m_value <= merit_limit)
            merit_flag = 1; 
        if (iter >= max_iter)
            iter_flag = 1;
        // This brace closes the while loop; the statements that follow keep
        // the deeper indentation but belong directly to main.
        }
        end = omp_get_wtime();
        // No enclosing parallel region is visible in this function.
        #pragma omp barrier
        // Offset into the shared arrays: iterations recorded by lower-numbered
        // threads.
        int value = 0;
        for(int i = 0; i < thread_id; i++)
            value +=  (actual_iteration_vector[i]-1);
        // actual_iteration_vector[thread_id] was set to 1 above and is not
        // changed afterwards, so this copy loop has zero iterations.
        for (int i = 0; i < (actual_iteration_vector[thread_id]-1); i++){
            times[value+1+i] = times_local[i+1];
            merits[value+1+i] = merits_local[i+1];
            values[value+1+i] = values_local[i+1];
        }    
        // Release every array still owned by main.
        delete [] X; delete [] D; delete [] S; delete [] times; delete [] merits; delete [] values; delete [] Lip_X;
        delete [] Lip_D; delete [] actual_iteration_vector; delete [] max_br; delete [] gradient_D; delete [] DX;
        delete [] gradient_X; delete [] delta_X; delete [] delta_D; delete [] D_col; delete [] D_hat;
        delete [] times_local; delete [] merits_local; delete [] values_local;
        // Output paths are built here but not used before returning.
        std::string folder = "../results/";
        std::string str_1 = folder + "parameters.dat";
        std::string str_2 = folder + "times.dat";
        std::string str_3 = folder + "merits.dat";
        std::string str_4 = folder + "values.dat";
        std::string str_5 = folder + "lipx.dat";
        std::string str_6 = folder + "lipd.dat";
        return 0;
}

代码的含义真的不重要。实际上,由于我删除了很多行,代码已经没有意义了。 一开始读取了一个名为“param”的文件:它只包含六个不同于零的输入值:

64 
64
255025
0.125
1
1000000

为了运行代码,我使用以下 cmake 文件:

project(example)
cmake_minimum_required(VERSION 2.8)
set(CMAKE_CXX_COMPILER "icc")
set(CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS)
set(CMAKE_CXX_FLAGS "-qopenmp -mkl=sequential")
add_executable(example main.cpp)

其中,通过命令 cmake 创建 makefile。然后我执行 make,最后运行二进制文件。

最佳答案

幸运的是我能够重现这个问题。英特尔编译器有一个非常有用的标志:-qopt-report=5。它会生成一个文件 main.optrpt,其中记录了关于编译器优化的各种有趣信息,特别是对于慢速版本:

  -> (225,5) compute_function_gradient_D(double *, double *, int, int, int) (isz = 56) (sz = 69)
     [[ Inlining inhibited by overrideable criterion  <2>]]

快速版本的报告只是缺少第二行。

基于这些:

INLINING OPTION VALUES:
  -inline-factor: 100
  -inline-min-size: 30
  -inline-max-size: 230
  -inline-max-total-size: 2000
  -inline-max-per-routine: 10000
  -inline-max-per-compile: 500000

我猜 criterion <2> 指的是 -inline-max-size。确实,添加 -inline-max-size=999 之后,慢速版本的代码被加速到了相同的水平!所以,是 main 函数的大小(它会被看似不相关的语句改变)阻止了内联。

您可能仍然想知道内联和非内联之间的巨大差异从何而来。函数调用本身当然不相关。但是让我们看看各自的输出:

对于函数本身:

Begin optimization report for: compute_function_gradient_D(double *, double *, int, int, int)

    Report from: Interprocedural optimizations [ipo]

INLINE REPORT: (compute_function_gradient_D(double *, double *, int, int, int)) [10/60=16.7%] main.cpp(17,41)


    Report from: Loop nest, Vector & Auto-parallelization optimizations [loop, vec, par]


LOOP BEGIN at main.cpp(18,3)
   remark #15344: loop was not vectorized: vector dependence prevents vectorization
   remark #15346: vector dependence: assumed FLOW dependence between gradient_D[j*M+i] (20:7) and DX[i+k*M] (22:9)
   remark #15346: vector dependence: assumed ANTI dependence between DX[i+k*M] (22:9) and gradient_D[j*M+i] (20:7)

   LOOP BEGIN at main.cpp(19,5)
      remark #15344: loop was not vectorized: vector dependence prevents vectorization
      remark #15346: vector dependence: assumed FLOW dependence between gradient_D[j*M+i] (20:7) and DX[i+k*M] (22:9)
      remark #15346: vector dependence: assumed ANTI dependence between DX[i+k*M] (22:9) and gradient_D[j*M+i] (20:7)

      LOOP BEGIN at main.cpp(21,7)
         remark #15344: loop was not vectorized: vector dependence prevents vectorization
         remark #15346: vector dependence: assumed FLOW dependence between gradient_D[i+M*j] (22:9) and gradient_D[i+M*j] (22:9)
         remark #15346: vector dependence: assumed ANTI dependence between gradient_D[i+M*j] (22:9) and gradient_D[i+M*j] (22:9)
         remark #25439: unrolled with remainder by 2  
      LOOP END

      LOOP BEGIN at main.cpp(21,7)
      <Remainder>
      LOOP END
   LOOP END
LOOP END

内联版本:

LOOP BEGIN at main.cpp(18,3) inlined into main.cpp(225,5)
<Distributed chunk1>
   remark #25426: Loop Distributed (2 way) 
   remark #15542: loop was not vectorized: inner loop was already vectorized

   LOOP BEGIN at main.cpp(19,5) inlined into main.cpp(225,5)
   <Distributed chunk1>
      remark #25426: Loop Distributed (2 way) 
      remark #25408: memset generated
      remark #15542: loop was not vectorized: inner loop was already vectorized

      LOOP BEGIN at main.cpp(19,5) inlined into main.cpp(225,5)
      <Distributed chunk1>
         remark #15389: vectorization support: reference U55_V[j*M+i] has unaligned access   [ main.cpp(20,7) ]
         remark #15381: vectorization support: unaligned access used inside loop body
         remark #15305: vectorization support: vector length 2
         remark #15399: vectorization support: unroll factor set to 2
         remark #15309: vectorization support: normalized vectorization overhead 0.300
         remark #15301: PARTIAL LOOP WAS VECTORIZED
         remark #15451: unmasked unaligned unit stride stores: 1 
         remark #15475: --- begin vector cost summary ---
         remark #15476: scalar cost: 4 
         remark #15477: vector cost: 2.500 
         remark #15478: estimated potential speedup: 1.450 
         remark #15488: --- end vector cost summary ---
         remark #25015: Estimate of max trip count of loop=3
      LOOP END

      LOOP BEGIN at main.cpp(19,5) inlined into main.cpp(225,5)
      <Remainder loop for vectorization, Distributed chunk1>
         remark #25015: Estimate of max trip count of loop=12
      LOOP END
   LOOP END
LOOP END

LOOP BEGIN at main.cpp(18,3) inlined into main.cpp(225,5)
<Distributed chunk2>
   remark #25444: Loopnest Interchanged: ( 1 2 3 ) --> ( 1 3 2 )
   remark #15542: loop was not vectorized: inner loop was already vectorized

   LOOP BEGIN at main.cpp(21,7) inlined into main.cpp(225,5)
   <Distributed chunk2>
      remark #15542: loop was not vectorized: inner loop was already vectorized

      LOOP BEGIN at main.cpp(19,5) inlined into main.cpp(225,5)
      <Peeled loop for vectorization>
         remark #25015: Estimate of max trip count of loop=1
      LOOP END

      LOOP BEGIN at main.cpp(19,5) inlined into main.cpp(225,5)
         remark #15388: vectorization support: reference U55_V[i+M*j] has aligned access   [ main.cpp(22,9) ]
         remark #15388: vectorization support: reference U55_V[i+M*j] has aligned access   [ main.cpp(22,9) ]
         remark #15388: vectorization support: reference U58_V[i+k*M] has aligned access   [ main.cpp(22,34) ]
         remark #15305: vectorization support: vector length 2
         remark #15399: vectorization support: unroll factor set to 4
         remark #15309: vectorization support: normalized vectorization overhead 0.700
         remark #15301: PERMUTED LOOP WAS VECTORIZED
         remark #15442: entire loop may be executed in remainder
         remark #15448: unmasked aligned unit stride loads: 2 
         remark #15449: unmasked aligned unit stride stores: 1 
         remark #15475: --- begin vector cost summary ---
         remark #15476: scalar cost: 8 
         remark #15477: vector cost: 2.500 
         remark #15478: estimated potential speedup: 3.050 
         remark #15488: --- end vector cost summary ---
      LOOP END

      LOOP BEGIN at main.cpp(19,5) inlined into main.cpp(225,5)
      <Alternate Alignment Vectorized Loop>
      LOOP END

      LOOP BEGIN at main.cpp(19,5) inlined into main.cpp(225,5)
      <Remainder loop for vectorization>
         remark #15388: vectorization support: reference U55_V[i+M*j] has aligned access   [ main.cpp(22,9) ]
         remark #15388: vectorization support: reference U55_V[i+M*j] has aligned access   [ main.cpp(22,9) ]
         remark #15389: vectorization support: reference U58_V[i+k*M] has unaligned access   [ main.cpp(22,34) ]
         remark #15381: vectorization support: unaligned access used inside loop body
         remark #15335: remainder loop was not vectorized: vectorization possible but seems inefficient. Use vector always directive or -vec-threshold0 to override 
         remark #15305: vectorization support: vector length 2
         remark #15309: vectorization support: normalized vectorization overhead 1.083
      LOOP END
   LOOP END
LOOP END

在内联版本中,编译器对参数了解得更多,而独立的函数本身必须能够处理任意参数。不过,该报告揭示了一种通用的优化:将循环拆分成两部分,并把第二部分的循环顺序改为线性遍历内存的更优形式。这种优化也可以应用于 C 代码本身:

// Sets gradient_D[j*M + i], for every j in [0, K) and i in [0, M), to the sum
// over k in [0, N) of DX[i + k*M].
//
// The work is done in two separate passes: the first clears the K*M outputs,
// the second accumulates into them with i as the innermost index, so both
// gradient_D and DX are walked one element at a time in the inner loop.
//
// gradient_D must hold at least K*M doubles and DX at least M*N doubles.
void compute_function_gradient_D(double *gradient_D, double *DX, int K, int M,
                                 int N) {
  // Pass 1: zero every output entry, one length-M segment at a time.
  for (int seg = 0; seg < K; ++seg) {
    const int out_base = seg * M;
    for (int row = 0; row < M; ++row) {
      gradient_D[out_base + row] = 0;
    }
  }

  // Pass 2: add each length-M slice of DX onto each output segment.
  for (int seg = 0; seg < K; ++seg) {
    const int out_base = M * seg;
    for (int slice = 0; slice < N; ++slice) {
      const int in_base = slice * M;
      for (int row = 0; row < M; ++row) {
        gradient_D[row + out_base] += DX[row + in_base];
      }
    }
  }
}

对于这段代码,编译器还会对第二个循环进行向量化以获得类似的性能,即使该函数未内联也是如此。

如您所见,整件事与 OpenMP 毫无关系。

以上所有结果均使用 icpc 17.0.1 得到,编译选项为 -fopenmp -mkl=sequential -Wall -g -O3。

关于c++ - 当明显不相关的代码被更改时,性能差异很大,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/43006468/

相关文章:

c++ - 如何使用 "Modern CMake"设置编译器标志?

java - Hystrix - 如何计算线程池大小

php - 如何使用@import 连接 CSS 文件?

c++ - 使用内联链接在定义冲突时生成错误

c++ - 为 iOS 构建 ICU

c++ - C++ 中 float_max + 1 是如何定义的?

c++ - 内联 asm 跳转后抛出 C++ 异常

c++ - C/C++ 编译器会优化这个 if 语句吗?

css - XHTML换行空格问题

c++ - 解释一下C++中的链接(外部/内部)?