c++ - OpenMP 实现比串行实现慢

标签 c++ parallel-processing openmp greedy

<分区>

我目前正在尝试熟悉 OpenMP。为了练习,我用 OpenMP 实现了一个贪心“学习”算法,然后用下面的命令测量了运行时间:
time ./a.out

我把它与我的串行实现进行了比较:无论程序执行多少次迭代,OpenMP 版本总是慢得多。

这是我的代码,其中的注释应该能解释一切:

#include <omp.h>
#include <stdio.h>

#include <cmath>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <random>
#include <vector>

#define THREADS 4

using namespace std;

// One training sample: an input value paired with the output expected for it.
struct TrainData {
    double input;  // x value at which the polynomial is evaluated
    double output; // target value the fitted polynomial is scored against
};

// Long Term Memory: the coefficients of the quadratic a*x^2 + b*x + c
// together with the score assigned to that coefficient set.
struct LTM {
    double a;     // coefficient of x^2
    double b;     // coefficient of x
    double c;     // constant term
    double score; // score to be minimized (lower is better)

    // All coefficients and the score start at zero.
    LTM() : a(0), b(0), c(0), score(0) {}

    // Random LTM whose coefficients are integers drawn with rand() from the
    // closed range [low, high]; the score starts at zero. Bounds passed in
    // the wrong order are swapped instead of causing a division by zero or a
    // negative modulus.
    // NOTE: rand() relies on hidden global state and is not required to be
    // thread-safe, so calling this constructor from several threads at once
    // serializes on (or races over) that state.
    LTM(int low, int high) : a(0), b(0), c(0), score(0)
    {
        if (high < low)
        {
            const int tmp = low;
            low = high;
            high = tmp;
        }

        // Number of integers in [low, high]. The former `rand() % high + low`
        // produced [low, low + high - 1], which only matches the documented
        // range when low == 1.
        const int span = high - low + 1;

        a = low + rand() % span;
        b = low + rand() % span;
        c = low + rand() % span;
    }

    // LTM with explicitly given coefficients. The score starts at zero so
    // that copying or printing the object never reads an indeterminate value.
    LTM(double _a, double _b, double _c) : a(_a), b(_b), c(_c), score(0) {}

    // Writes the score and the three coefficients to standard output.
    void print() const
    {
        std::cout << "Score: " << score << '\n';
        std::cout << "a: " << a << " b: " << b << " c: " << c << '\n';
    }
};

//the acutal polynom function evaluating with passed LTM
inline double evaluate(LTM &ltm, const double &x)
{
    double ret;
    ret = ltm.a*x*x + ltm.b*x + ltm.c;

    return ret; 
}


//scoring function calculates the Root Mean Square error (RMS)
inline double score_function(LTM &ltmnew, vector<TrainData> &td)
{
    double score;
    double val;
    int tdsize=td.size();
    score=0;

    for(int i=0; i< tdsize; i++)
    {
        val = (td.at(i)).output -  evaluate(ltmnew, (td.at(i)).input);
        val *=  val;
        score += val;
    }

    score /= (double)tdsize;

    score = sqrt(score);

    return score;
}

// Random search for the LTM that best fits the training data.
//
// Starts from one random LTM (printed as a side effect), then scores
// `iterations` further random candidates spread over up to THREADS OpenMP
// threads and returns the candidate with the lowest score.
//
//   iterations  number of random candidates to try
//   td          training samples every candidate is scored against
//   low, high   inclusive bounds of the integer coefficients drawn in the loop
LTM iterate(int iterations, std::vector<TrainData> td, int low, int high)
{
    LTM fav = LTM(low, high);
    fav.score = score_function(fav, td);
    fav.print();

    // Overall winner. Every thread merges its own favourite into this, so the
    // result is correct no matter how many threads the runtime really starts.
    // The former fixed-size array left unused slots at the default score 0,
    // which then beat every genuine candidate in the final scan.
    LTM best = fav;

    // Drawn serially from rand() so the caller's srand() still decides the run.
    const unsigned int base_seed = static_cast<unsigned int>(rand());

    // uniform_int_distribution requires lower bound <= upper bound.
    const int lo = low < high ? low : high;
    const int hi = low < high ? high : low;

    // `td` is only read inside the region, so one shared copy is enough.
    #pragma omp parallel num_threads(THREADS) firstprivate(fav) shared(best, td)
    {
        #pragma omp master
        printf("Threads: %d\n", omp_get_num_threads());

        // One generator per thread: rand() keeps a single hidden state that all
        // threads would fight over, which made the parallel loop slower than
        // the serial one.
        std::mt19937 engine(base_seed + static_cast<unsigned int>(omp_get_thread_num()));
        std::uniform_int_distribution<int> draw(lo, hi);

        #pragma omp for
        for (int i = 0; i < iterations; i++)
        {
            // Separate statements keep the draw order well defined.
            const double cand_a = draw(engine);
            const double cand_b = draw(engine);
            const double cand_c = draw(engine);

            LTM cand(cand_a, cand_b, cand_c);
            cand.score = score_function(cand, td);

            if (cand.score < fav.score)
                fav = cand;
        }

        // Merge this thread's favourite into the shared result.
        #pragma omp critical
        {
            if (fav.score < best.score)
                best = fav;
        }
    }

    return best;
}

//generate training data from -50 up to 50 with the train LTM
void generateTrainData(vector<TrainData> *td, LTM train)
{
    #pragma omp parallel for schedule(dynamic, 25) 
    for(int i=-50; i< 50; i++)
    {
        struct TrainData d;
        d.input = i;
        d.output = evaluate(train, (double)i);
        #pragma omp critical
        td->push_back(d);

        //cout<<"input: "<<d.input<<" -> "<<d.output<<endl;
    }

}

int main(int argc, char *argv[])
{

    int its= 10000000; //number of iterations 
    int a=2;
    int b=4;
    int c=6;

    srand(time(NULL));
    LTM pol = LTM(a,b,c); //original polynom parameters
    vector<TrainData> td;

    //first genarte some training data and save it to td
    generateTrainData(&td, pol); 

    //try to find the best solution
    LTM fav = iterate( its, td, 1, 6);


    printf("Final: a=%f b=%f c=%f score: %f\n", fav.a, fav.b, fav.c, fav.score);

    return 0;
}

在我的家用 PC 上,这个 OpenMP 实现花费了 12 秒,而串行版本只需要 6 秒。如果我将迭代次数增加 10 倍,则大约为 2 分钟/1 分钟(omp/serial)。

谁能帮帮我?

最佳答案

好的,感谢我最初问题的评论,我可以解决性能问题。

正如评论中所说,问题出在我使用的 rand() 函数上。我把这些调用替换成了线程安全的 drand48_r()。

例如:

...
// Random LTM whose three coefficients are drawn through drand48_r from the
// generator state in `buff`, which the caller supplies. Each draw is scaled
// from the unit interval onto the range starting at `low` with width
// (high - low). The score starts at zero.
LTM(double low, double high, struct drand48_data *buff)
{
    const double width = high - low;
    double sample;

    drand48_r(buff, &sample);
    a = low + sample * width;

    drand48_r(buff, &sample);
    b = low + sample * width;

    drand48_r(buff, &sample);
    c = low + sample * width;

    score = 0;
}
...

现在我得到的时间不到一秒! 谢谢! :)

关于c++ - OpenMP 实现比串行实现慢,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/44068468/

相关文章:

c - 在 C 中使用 openMP 的简单 for 循环

c - 是否可以评估 OpenMP 开销成本?

c++ - 为什么类型变量;不调用默认的构造函数?

c++ - 如何检查包含 utf8 文本的 std::string 在 Windows 中是否以大写字母开头?

c++ - 未调用重载赋值运算符

c++ - 英特尔 TBB 和微软 PPL 有什么区别?

parallel-processing - MPI_Send 在数据量大的环形通信中阻塞

c++ - 使非 const 对象成员函数在 openMP 中可用

c++ - 所有的公共(public)方法都应该对应业务逻辑吗?

amazon-web-services - 如何简化 Step Functions 的复杂并行分支相互依赖关系