c - openmp串行比并行快

标签 c openmp

代码已经并行化了,但我不知道为什么它比我的串行代码还慢;当我把线程数增加到 7 到 10 个时,程序反而变得更慢。

我一直在努力找出问题所在,但对我来说一直很困难

我使 for 循环并行,但它似乎不起作用。我在运行代码时没有收到任何错误。

#include <stdio.h>
#include <math.h>
#include <omp.h>
#include <stdlib.h>


int main(int argc, char *argv[])
{

    int m; 
    int n;
    double tol;// = 0.0001;
    double tstart, tstop;

    int i, j, iter, nthreads;



    m = atoi(argv[1]);
    n = atoi(argv[2]);
    tol = atof(argv[3]);

    double t[m+2][n+2], tnew[m+1][n+1], diff, difmax,priv_difmax;

    printf("%d %d %lf\n",m,n,tol);
    printf("Enter the number of threads (max 10) ");
    scanf("%d",&nthreads);

    omp_set_num_threads(nthreads);
    tstart = omp_get_wtime ();

    //** initialise temperature array*


    #pragma omp parallel for schedule(static)\
    default(shared) private(i,j)
    for (i=0; i <= m+1; i++) {
        for (j=0; j <= n+1; j++) {
            t[i][j] = 30.0;
        }
    }

    //*** fix boundary conditions***


    for (i=1; i <= m; i++) {
        t[i][0] = 20.0;
        t[i][n+1] = 100.0;
    }
    for (j=1; j <= n; j++) {
        t[0][j] = 10.0;
        t[m+1][j] = 140.0;
    }


    //** main loop**


    iter = 0;
    difmax = 1000000.0;
    while (difmax > tol) {
        iter++;

        // **update temperature for next iteration**


        #pragma omp parallel for schedule(static) \
        default(shared) private(i,j)
        for (i=1; i <= m; i++) {
            for (j=1; j <= n; j++) {
                tnew[i][j] = (t[i-1][j]+t[i+1][j]+t[i][j-1]+t[i][j+1])/4.0;
            }
        }

        // **work out maximum difference between old and new temperatures**

        difmax = 0.0;

        #pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
        {
            priv_difmax = 0.0;
            #pragma omp for schedule(static)
            for (i=1; i <= m; i++) {
                for (j=1; j <= n; j++) {
                    diff = fabs(tnew[i][j]-t[i][j]);

                    if (diff > priv_difmax) {
                        priv_difmax = diff;
                    }

                    //** copy new to old temperatures**
                    t[i][j] = tnew[i][j];
                }
                #pragma omp critical 
                if (priv_difmax > difmax){
                    difmax = priv_difmax;
                }
            }
        }

    }
    tstop = omp_get_wtime ();

    // print results

    printf("iter = %d  difmax = %9.11lf", iter, difmax);

    for (i=0; i <= m+1; i++) {
        printf("\n");
        for (j=0; j <= n+1; j++) {
            printf("%3.5lf ", t[i][j]);
        }
    }

    printf("\n");
    tstop = omp_get_wtime ();

    printf("time taken is %4.3lf\n", (tstop-tstart));
    printf("\n");
}

最佳答案

除了可能在以下代码中,我没有看到明显的问题:

// Max-difference / copy-back region exactly as it appears in the question.
// Each thread keeps its own running maximum in priv_difmax and merges it
// into the shared difmax under a critical section.
#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
{
    priv_difmax = 0.0;
    #pragma omp for schedule(static)
    for (i=1; i <= m; i++) {
        for (j=1; j <= n; j++) {
            diff = fabs(tnew[i][j]-t[i][j]);
                if (diff > priv_difmax) {
                priv_difmax = diff;
            }
                // copy new to old temperatures
            t[i][j] = tnew[i][j];
        }
        // NOTE(review): still inside the outer i-loop, so the critical
        // section is entered on every outer iteration a thread executes.
        #pragma omp critical 
        if (priv_difmax > difmax){
            difmax = priv_difmax;
        }
    }
}

归约部分(即把 priv_difmax 合并到 difmax 的那一步)应该移到循环之外,这样每个线程只需进入 critical 区一次,而不是在外层循环的每次迭代中都进入一次。

// Max-difference / copy-back region with the merge into the shared difmax
// done once per thread, after that thread has finished all of its rows.
#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
{
    priv_difmax = 0.0;

    // nowait: a thread that has finished its rows may go straight on to
    // the critical section without waiting for the rest of the team.
    #pragma omp for schedule(static) nowait
    for (i = 1; i <= m; i++) {
        for (j = 1; j <= n; j++) {
            diff = fabs(tnew[i][j] - t[i][j]);

            // copy new to old temperatures
            t[i][j] = tnew[i][j];

            if (diff > priv_difmax) {
                priv_difmax = diff;
            }
        }
    }

    // Fold this thread's maximum into the shared one, one thread at a time.
    #pragma omp critical
    {
        if (priv_difmax > difmax) {
            difmax = priv_difmax;
        }
    }
} // implicit barrier at the end of the parallel region

另外,并行化本身会带来开销,只有当 m 和 n 足够大时才有可能获得加速;您测试的问题规模可能太小了。减少开销的一种方法是把两个 parallel 构造合并,这样线程组就不必创建两次。更好的做法是把整个 while 循环放进 parallel 构造内部,这样每次迭代只需同步已有的线程,而不必反复创建和销毁它们:

// Whole relaxation loop inside ONE parallel region: the thread team is
// created once and only synchronised on each pass of the while loop.
// Shared: t, tnew, difmax, iter, tol, m, n.  Private: i, j, diff, priv_difmax.
difmax=1000000.0;
#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
while (difmax > tol) {

    // Every thread must have evaluated the while-condition above before
    // difmax is reset below. Without this barrier a late thread could read
    // the freshly reset 0.0, leave the loop on its own, and the remaining
    // threads would wait forever on a barrier it never reaches.
    #pragma omp barrier

    // have one thread reset difmax and increment iter; nowait is safe
    // because nobody touches difmax again until after the barrier that
    // ends the first worksharing loop
    #pragma omp single nowait
    {
        iter++;
        difmax = 0.0;
    }

    // loop to update tnew - distributed among the threads of the enclosing
    // team. This has to be a plain worksharing "omp for": "omp parallel for"
    // here would open a nested region per thread, each of which would
    // execute the entire loop instead of a share of it.
    #pragma omp for schedule(static)
    for (i=1; i <= m; i++) {
        for (j=1; j <= n; j++) {
            tnew[i][j] = (t[i-1][j]+t[i+1][j]+t[i][j-1]+t[i][j+1])/4.0;
        }
    } // implicit barrier here: tnew is complete and difmax has been reset

    // each thread resets its private difmax
    priv_difmax=0.0;

    // loop to compute difmax - distributed among threads; nowait lets a
    // thread move on to the critical section as soon as its rows are done
    #pragma omp for schedule(static) nowait
    for (i=1; i <= m; i++) {
        for (j=1; j <= n; j++) {
            diff = fabs(tnew[i][j]-t[i][j]);

            if (diff > priv_difmax) {
                priv_difmax = diff;
            }

            // copy new to old temperatures
            t[i][j] = tnew[i][j];
        }
    }

    // each thread now updates difmax if needed, one at a time
    #pragma omp critical
    {
        if (priv_difmax > difmax){
            difmax = priv_difmax;
        }
    }

    // barrier: difmax must be final, and every row of t copied back,
    // before any thread tests the while-condition for the next pass
    #pragma omp barrier
}

比较代码串行和并行运行方式的最佳方法是在支持和不支持 OpenMP 的情况下对其进行编译(例如,使用 gcc,使用和不使用 -fopenmp 编译器和链接器标志进行编译)。这将有助于指出问题实际上是与并行化有关,还是与原始串行代码和“并行就绪”版本之间的其他修改有关。

这样做的思路是按以下顺序逐步比较,找出问题出在哪一步:原始串行代码 → 并行代码(在不启用 OpenMP 支持的情况下编译)→ 并行代码(启用 OpenMP 编译)。

这需要用到一些预处理指令,因为在没有 OpenMP 支持的情况下,编译器无法识别 omp_get_thread_num() 这样的函数。omp_get_wtime() 也不应使用:由于所有计时都是在并行区域之外完成的,没有必要使用这个特定函数,调用 time() 就足够准确(这需要 #include <time.h>)。

// Compatibility shim so the same source builds with or without OpenMP.
// When the compiler defines _OPENMP the real <omp.h> is used. Otherwise the
// runtime queries below are replaced by the values of a one-thread run.
// Only these three queries are stubbed.
#ifdef _OPENMP
   #include <omp.h>
#else
   # ifndef _ESCAPE_OMPENMP
      #define omp_get_num_threads() 1
      #define omp_get_thread_num() 0
      // One thread is always available without OpenMP. Reporting 0 here
      // would break callers that size per-thread buffers from this value.
      #define omp_get_max_threads() 1
      // Guard name kept as posted so existing checks for it keep working.
      #define _ESCAPE_OMPENMP
   #endif
#endif

关于c - openmp串行比并行快,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/53675903/

相关文章:

c - 使用 C 读取文件并删除字符

c - 编译为 LLVM 的问题

c++ - 增加线程数,但程序无法更快地运行 C++ OpenMP 选择排序

c - 并行执行时间

c++ - 在cpp中读取Fortran指数表示法

c - 如何在 C 中传递指向多维数组的指针?

c++ - 对 OpenMP 中静态调度开销的影响

c++ - OpenMP 中 omp_set_max_active_levels 的最佳值是多少?

c - 如何检查特定字符是否按相同顺序出现?

multithreading - 缺少OpenMP功能: Thread Priority