c - 使用 OpenMP 优化循环后没有性能提升

标签 c performance parallel-processing openmp

我正在尝试优化一个迭代 100 万次的 for 循环,循环体中使用 pow 对两个数组的元素加权后相加。我的系统有 48 个内核。数组通过 malloc 分配,OpenMP 编译指示(pragma)是在主进程创建的 pthread 中使用的。不幸的是,代码的并行版本花费的时间几乎是串行版本(在同一系统中)的 20 倍。我使用 gettimeofday 测量执行时间。我的 gcc 版本是 4.3.4。请帮助我理解并解决这个问题。

我的代码:

/* Number of array elements processed by the loops below. */
#define N 1000000
/* Chunk size for the dynamic schedule; 20833 == N / 48 in integer arithmetic
 * (presumably one chunk per thread on the 48-core machine -- confirm). */
#define CHUNKSIZE 20833
:
:
double                          *a, *b, *c;
/* NOTE: st and et are declared outside the parallel region and are not listed
 * in the private() clause below, so every thread writes the same two objects. */
struct timeval                  st, et;
long double                     time_used[48], tot_time;
:
:
/* NOTE(review): the malloc results are used without a NULL check. */
a = malloc(sizeof(double) * N);
b = malloc(sizeof(double) * N);
c = malloc(sizeof(double) * N);
/* Fill both inputs with the element index as a double. */
for (i=0; i<N; i++)
     a[i] = b[i] = i * 1.0;
chunk = CHUNKSIZE;
:
:
/* Each thread records the time between entering the region and finishing its
 * share of the loop, in microseconds, into time_used[tid]. */
#pragma omp parallel shared(a,b,c,chunk,time_used) private(i)
    {
        int tid = omp_get_thread_num();
        /* Writes the shared st: another thread may overwrite it before the
         * next line reads it (data race). */
        gettimeofday(&st, NULL);
        long double st_in_micro = (st.tv_sec)*1000000 + (st.tv_usec); 
        /* Iterations are handed out dynamically in blocks of `chunk`;
         * nowait removes the barrier at the end of the loop. */
        #pragma omp for schedule (dynamic,chunk) nowait
        for (i=0; i<N; i++)
            c[i] = a[i]*pow(2,2) + b[i]*pow(3,2);

        /* Same race as above, this time on the shared et. */
        gettimeofday(&et, NULL);
        long double et_in_micro = (et.tv_sec)*1000000 + (et.tv_usec);
        time_used[tid] = et_in_micro - st_in_micro;
        printf ("time taken by thread %d = %Lf\n", tid, time_used[tid]);
    }

    /* Adds up the per-thread times, skipping negative entries. This is the
     * sum over threads, not the elapsed wall-clock time of the region.
     * NOTE(review): the bound 48 assumes one time_used slot per thread. */
    tot_time = 0;
    for (i=0; i<48; i++)
    {
        if (time_used[i] < 0)
            continue;
        tot_time += time_used[i];
    }
    printf("Total time taken by all the threads = %Lf\n", tot_time);

并行版本的输出:

time taken by thread 20 = 936.000000
time taken by thread 35 = 1826.000000
time taken by thread 17 = 2.000000
time taken by thread 38 = 603.000000
time taken by thread 22 = 2009.000000
time taken by thread 43 = 0.000000
time taken by thread 13 = 1703.000000
time taken by thread 14 = 1750.000000
time taken by thread 31 = 2128.000000
time taken by thread 1 = 2298.000000
time taken by thread 47 = 602.000000
time taken by thread 34 = 1749.000000
time taken by thread 7 = 1642.000000
time taken by thread 15 = 2542.000000
time taken by thread 9 = 2628.000000
time taken by thread 42 = 3294.000000
time taken by thread 12 = 3446.000000
time taken by thread 30 = 2290.000000
time taken by thread 23 = 3711.000000
time taken by thread 5 = 0.000000
time taken by thread 4 = 2457.000000
time taken by thread 16 = 2573.000000
time taken by thread 6 = 2715.000000
time taken by thread 41 = 2456.000000
time taken by thread 2 = 2877.000000
time taken by thread 0 = 2721.000000
time taken by thread 26 = 4209.000000
time taken by thread 37 = 2796.000000
time taken by thread 24 = 2846.000000
time taken by thread 46 = 2999.000000
time taken by thread 39 = 2569.000000
time taken by thread 45 = 2128.000000
time taken by thread 29 = 2855.000000
time taken by thread 44 = 3075.000000
time taken by thread 36 = 1.000000
time taken by thread 32 = 3035.000000
time taken by thread 3 = 1544.000000
time taken by thread 27 = 3132.000000
time taken by thread 25 = 3076.000000
time taken by thread 33 = 1.000000
time taken by thread 28 = 3042.000000
time taken by thread 21 = 3237.000000
time taken by thread 19 = 1594.000000
time taken by thread 18 = 2202.000000
time taken by thread 10 = 1655.000000
time taken by thread 8 = 3931.000000
time taken by thread 40 = 2726.000000
time taken by thread 11 = 2060.000000
Total time taken by all the threads = 105671.000000

串行版本的输出:

Total time taken by all the threads = 5574.000000

请帮助我理解这段代码有什么问题。

最佳答案

您将每个线程使用的时间添加到 tot_time并将其与仅使用一个线程的时间进行比较。

这样计算出的 tot_time(所有线程的时间之和)在大多数情况下(超线性加速的情况除外)将大于或等于仅使用一个线程时的时间。理想情况是二者相等,这意味着工作在所有线程之间平均分配。

所以你对 tot_time 的定义是一个有趣的指标,用于测试负载的分布情况,但我认为这不是您要找的。

相反,您可以报告耗时最长的那个线程的时间。但如果只报告并行区域内部所用的时间,就会忽略 OpenMP 实现工作共享本身的开销。因此,我会报告整个工作共享区域所用的时间,如以下代码所示。

#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <omp.h>
#include <stdio.h>

void foo(double * restrict a, double * restrict b, double * restrict c, int N) {
    double tot_time = -omp_get_wtime();
    #pragma omp parallel
    {
        double dtime = -omp_get_wtime();
        #pragma omp for nowait
        for (int i=0; i<N; i++) c[i] = a[i]*pow(2,2) + b[i]*pow(3,2);
        dtime += omp_get_wtime();
        #pragma omp critical
        printf ("time taken by thread %d = %.2f seconds\n", omp_get_thread_num(), dtime);
    }
    tot_time += omp_get_wtime();
    printf("Total time taken by all the threads = %.2f seconds\n", tot_time);
}

/*
 * Allocates three arrays of 2^28 doubles, writes a non-zero byte pattern to
 * each of them, and runs the timed kernel foo() on them.
 *
 * Returns 0 on success and EXIT_FAILURE if any allocation fails.
 */
int main(void) {
    int N = 1<<28;
    size_t bytes = sizeof(double) * (size_t)N;   /* 2 GiB per array */

    double *a = malloc(bytes);
    double *b = malloc(bytes);
    double *c = malloc(bytes);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "failed to allocate three arrays of %zu bytes each\n", bytes);
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    /* Write a non-zero byte to every element so the memory is touched
     * before the timed region in foo() runs. */
    memset(a,1,bytes);
    memset(b,1,bytes);
    memset(c,1,bytes);

    foo(a,b,c,N);

    free(a);
    free(b);
    free(c);
    return 0;
}

您的代码中还存在一些竞争条件,我已在上面的代码中将其修复。

最后,新分配的内存(通常)要到首次写入时才会真正分配物理页面。有趣的是,calloc 也不会分配页面,而只是把所有页面指向同一个零页面。更复杂的是,GCC 会把 malloc 后跟 memset(0) 的写法转换为 calloc。因此,为了真正分配页面,您需要向数组写入非零值(例如 memset(a,1,sizeof *a * N))。

这是我的 4 核/8 硬件线程系统的计时结果。

time taken by thread 1 = 0.33 seconds
time taken by thread 5 = 0.33 seconds
time taken by thread 7 = 0.33 seconds
time taken by thread 6 = 0.34 seconds
time taken by thread 3 = 0.34 seconds
time taken by thread 4 = 0.34 seconds
time taken by thread 0 = 0.34 seconds
time taken by thread 2 = 0.33 seconds
Total time taken by all the threads = 0.36 seconds

然后export OMP_NUM_THREADS=2

time taken by thread 0 = 0.31 seconds
time taken by thread 1 = 0.33 seconds
Total time taken by all the threads = 0.33 seconds

然后export OMP_NUM_THREADS=1

time taken by thread 0 = 0.53 seconds
Total time taken by all the threads = 0.53 seconds

您的操作受内存带宽限制,因此在我的双通道 DDR4 系统上,使用超过两个线程后就看不到太多好处了。


对于 NUMA 系统,内存局部性可以产生很大的影响。由于页面仅在第一次被触摸时分配,因此为了进行基准测试,首先并行写入数组可能是有意义的。以下代码将执行此操作,但它仅适用于静态调度。

确保禁用线程数的动态调整(不要与动态调度混淆),例如export OMP_DYNAMIC=false .

#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <omp.h>
#include <stdio.h>

void foo(double * restrict a, double * restrict b, double * restrict c, int N) {

    #pragma omp parallel for schedule(static)
    for (int i=0; i<N; i++) c[i] = b[i] = a[i] = 1;

    double tot_time = -omp_get_wtime();
    #pragma omp parallel
    {
        double dtime = -omp_get_wtime();
        #pragma omp parallel schedule(static) nowait
        for (int i=0; i<N; i++) c[i] = a[i]*pow(2,2) + b[i]*pow(3,2);
        dtime += omp_get_wtime();
        #pragma omp critical
        printf ("time taken by thread %d = %.2f seconds\n", omp_get_thread_num(), dtime);
    }
    tot_time += omp_get_wtime();
    printf("Total time taken by all the threads = %.2f seconds\n", tot_time);
}

/*
 * Allocates three arrays of 2^28 doubles and runs foo() on them.
 *
 * Returns 0 on success and EXIT_FAILURE if any allocation fails.
 */
int main(void) {
    int N = 1<<28;
    size_t bytes = sizeof(double) * (size_t)N;   /* 2 GiB per array */

    double *a = malloc(bytes);
    double *b = malloc(bytes);
    double *c = malloc(bytes);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "failed to allocate three arrays of %zu bytes each\n", bytes);
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    /* The arrays are deliberately not written here: foo() performs the
     * first write to them itself, in parallel. */
    foo(a,b,c,N);

    free(a);
    free(b);
    free(c);
    return 0;
}

关于c - 使用 OpenMP 优化循环后没有性能提升,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/43679323/

相关文章:

sbrk(0) 会失败吗?

sql - 在数据库中存储现实世界 "events"的最佳方式?

java - 循环内声明的变量是否受单次迭代的范围限制?

c# - 为什么 View 在 Prism 复合 wpf 应用程序中第一次加载非常慢

计算程序执行时间

c - 我应该使用什么方法来知道在进程 fork 然后执行后打开了哪些 fds

javascript - 如何在 Node.js 中使用传入的函数初始化子进程

python - Python 中的多处理 : execute two functions at the exact same time

c - 将大整数txt文件读入二维数组

java - 使用蒙特卡洛方法并行计算 Pi