c - OpenMP omp fork 2 个线程比 fork 4 个线程快得多，为什么？

我正在学习并行计算。我写了下面的代码

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
/* Prints a usage message to stderr and terminates the program. */
void Usage(char* prog_name);
double f(double x);    /* Function we're integrating */
/* Trapezoid-rule estimate for the calling thread's share of [a, b],
 * where the whole interval is divided into n trapezoids. */
double Local_trap(double a, double b, int n);
/*
 * Benchmark driver for the OpenMP trapezoid rule.
 *
 * Reads a, b, n, thread_count and repeat_times from stdin, then runs the
 * parallel integration repeat_times times.  For every thread it accumulates
 *   - fork time:    from just before the parallel region to the thread's start,
 *   - elapsed time: the time spent inside Local_trap,
 *   - join time:    from the thread's finish to just after the region ends,
 * and prints the per-repetition averages together with the integral estimate.
 *
 * Returns 0 on success and EXIT_FAILURE if the timing buffers cannot be
 * allocated; invalid input is reported through Usage(), which terminates
 * the program.
 */
int main(int argc, char* argv[])
{
    double  global_result = 0.0;  /* Estimate produced by the latest repetition */
    double  a, b;                 /* Left and right endpoints      */
    int     n;                    /* Total number of trapezoids    */
    int     thread_count, repeat_times;
    double *time_fork, *time_elapsed, *time_join, *time_end;
    double global_start, global_finish, global_time = 0.0;

    printf("Enter a, b, n, thread_count and repeat times, n mod thread_count should = 0.\n");
    /* All five values are required; anything less leaves variables unset. */
    if (scanf("%lf %lf %d %d %d", &a, &b, &n, &thread_count, &repeat_times) != 5)
    {
        fprintf(stderr, "expected five values: a b n thread_count repeat_times\n");
        Usage(argv[0]);
    }

    /* Non-positive counts would divide by zero below (n % thread_count,
     * the averages) or request a negative allocation size. */
    if (n <= 0 || thread_count <= 0 || repeat_times <= 0)
    {
        fprintf(stderr, "n, thread_count and repeat times must all be positive\n");
        Usage(argv[0]);
    }

    if (n % thread_count != 0)
        Usage(argv[0]);

    time_fork    = malloc((size_t)thread_count * sizeof *time_fork);
    time_elapsed = malloc((size_t)thread_count * sizeof *time_elapsed);
    time_join    = malloc((size_t)thread_count * sizeof *time_join);
    time_end     = malloc((size_t)thread_count * sizeof *time_end);

    if ((NULL == time_elapsed) || (NULL == time_fork) || (NULL == time_join) || (NULL == time_end))
    {
        fprintf(stderr, "failed to allocate the timing buffers\n");
        /* free(NULL) is a no-op, so every pointer can be released blindly. */
        free(time_fork);
        free(time_elapsed);
        free(time_join);
        free(time_end);
        return EXIT_FAILURE;
    }

    /* Zero every accumulator.  time_end is cleared as well so the join loop
     * never reads an indeterminate value for a slot no thread wrote to.
     * NOTE(review): this region presumably also lets the runtime start its
     * worker threads before the timed section - implementation dependent. */
    #pragma omp parallel for num_threads(thread_count)
    for (int i = 0; i < thread_count; i++)
    {
        time_fork[i] = 0.0;
        time_elapsed[i] = 0.0;
        time_join[i] = 0.0;
        time_end[i] = 0.0;
    }

    for (int i = 0; i < repeat_times; i++)
    {
        /* The reduction adds the partial sums to the variable's current
         * value, so it must be reset or the estimate grows with every
         * repetition. */
        global_result = 0.0;

        global_start = omp_get_wtime();
        #pragma omp parallel num_threads(thread_count) reduction(+:global_result)
        {
            /* new code to calculate time elapsed */
            #pragma omp barrier
            double my_start, my_finish, my_elapsed;
            int my_rank = omp_get_thread_num();
            my_start = omp_get_wtime();
            time_fork[my_rank] += (my_start - global_start);

            /* original code to calculate trap */
            global_result += Local_trap(a, b, n);

            /* new code to calculate time elapsed */
            my_finish  = omp_get_wtime();
            my_elapsed = my_finish - my_start;
            time_elapsed[my_rank] += my_elapsed;
            time_end[my_rank] = my_finish;
        }
        global_finish = omp_get_wtime();

        #pragma omp parallel for num_threads(thread_count)
        for (int j = 0; j < thread_count; j++)
        {
            time_join[j] += (global_finish - time_end[j]);
        }

        global_time += (global_finish - global_start);
    }

    printf("The global run time is %.14f seconds.\n", global_time/repeat_times);
    for (int i = 0; i < thread_count; i++)
    {
        printf("The thread %d runs  %.14f seconds.\n", i, time_elapsed[i]/repeat_times);
        printf("The thread %d forks %.14f seconds.\n", i, time_fork[i]/repeat_times);
        printf("The thread %d joins %.14f seconds.\n", i, time_join[i]/repeat_times);
    }

    printf("With n = %d trapezoids, our estimate\n", n);
    printf("of the integral from %f to %f = %.14e\n", a, b, global_result);

    free(time_fork);
    free(time_elapsed);
    free(time_join);
    free(time_end);
    return 0;
}  /* main */
/*
 * Prints a short description of the expected input to stderr and terminates
 * the program with a failure status.  Does not return.
 *
 * prog_name: name shown in the usage line (normally argv[0]); a NULL value
 *            is replaced by a placeholder instead of being passed to %s.
 */
void Usage(char* prog_name)
{
    const char *shown_name = (prog_name != NULL) ? prog_name : "program";

    /* The program takes no command-line arguments: all values come from stdin. */
    fprintf(stderr, "usage: %s\n", shown_name);
    fprintf(stderr, "   then enter on stdin: a b n thread_count repeat_times\n");
    fprintf(stderr, "   number of trapezoids must be evenly divisible by\n");
    fprintf(stderr, "   number of threads\n");
    /* This is an error path, so report failure to the calling shell. */
    exit(EXIT_FAILURE);
}
/* Integrand used by the trapezoid rule: returns the square of x. */
double f(double x)
{
    return x * x;
}  /* f */

/*
 * Trapezoid-rule estimate of the integral of f over the calling thread's
 * share of [a, b].  Must be called from inside a parallel region: the share
 * is derived from omp_get_thread_num() and omp_get_num_threads().
 *
 * a, b: endpoints of the whole interval
 * n:    total number of trapezoids across all threads
 *
 * The n trapezoids are split as evenly as possible: when the team size does
 * not divide n, the first (n % team size) threads take one extra trapezoid,
 * so none are dropped.  When it divides n exactly, the split and the result
 * are the same as with a plain n / team-size partition.
 *
 * Returns the thread's partial estimate; 0.0 when n <= 0 or when the thread
 * has no trapezoids assigned.
 */
double Local_trap(double a, double b, int n)
{
    int my_rank = omp_get_thread_num();
    int thread_count = omp_get_num_threads();

    /* Without trapezoids there is nothing to integrate, and (b-a)/n would
     * divide by zero. */
    if (n <= 0)
        return 0.0;

    double h = (b-a)/n;
    int base_n = n/thread_count;     /* trapezoids every thread gets        */
    int extra  = n%thread_count;     /* leftovers, one each to low ranks    */
    int local_n = base_n + ((my_rank < extra) ? 1 : 0);
    /* Index of this thread's first trapezoid within the whole interval. */
    int first = my_rank*base_n + ((my_rank < extra) ? my_rank : extra);

    /* An empty share must contribute nothing; the endpoint formula below
     * would otherwise still add f(local_a)*h. */
    if (local_n == 0)
        return 0.0;

    double local_a = a + first*h;
    double local_b = local_a + local_n*h;
    double my_result = (f(local_a) + f(local_b))/2.0;
    for (int i = 1; i <= local_n-1; i++)
    {
        my_result += f(local_a + i*h);
    }

    return my_result*h;
}

我在 ubuntu 14.04 上编译，我的笔记本是 i3，4 线程，命令是 gcc -g3 -Wall -fopenmp -std=c99 -o Assignment2 Assignment2.c

1 2 12000 2 10 的输出

The global run time is 0.00013472399987 seconds.
The thread 0 runs  0.00013350439967 seconds.
The thread 0 forks 0.00000079790025 seconds.
The thread 0 joins 0.00000042169995 seconds.
The thread 1 runs  0.00013322920022 seconds.
The thread 1 forks 0.00000062119998 seconds.
The thread 1 joins 0.00000087359967 seconds.
With n = 12000 trapezoids, our estimate
of the integral from 1.000000 to 2.000000 = 2.33333333449074e+01

1 2 12000 4 10 的输出

The global run time is 0.00781751800023 seconds.
The thread 0 runs  0.00006403259995 seconds.
The thread 0 forks 0.00621278830040 seconds.
The thread 0 joins 0.00154069709988 seconds.
The thread 1 runs  0.00006628699975 seconds.
The thread 1 forks 0.00575844590039 seconds.
The thread 1 joins 0.00199278510008 seconds.
The thread 2 runs  0.00006636039980 seconds.
The thread 2 forks 0.00551087460044 seconds.
The thread 2 joins 0.00224028299999 seconds.
The thread 3 runs  0.00006544990010 seconds.
The thread 3 forks 0.00564311910020 seconds.
The thread 3 joins 0.00210894899992 seconds.
With n = 12000 trapezoids, our estimate
of the integral from 1.000000 to 2.000000 = 2.33333333449074e+01

我不明白为什么 fork 4 个线程的开销比 fork 2 个线程的开销大这么多。是不是我的计时方式有问题？

最佳答案

你的计时方式没有问题。在不开启任何优化的情况下，线程数量增加会带来更多的管理/同步开销。但是，使用 -O3 优化后，您会看到明显的加速。我的机器有 8 个线程，使用 2、4、6、8 个线程时的运行时间分别是:

The global run time is 0.00013457789901 seconds.
The global run time is 0.00006983749627 seconds.
The global run time is 0.00004531119484 seconds.
The global run time is 0.00032387300453 seconds.

注意，当线程数达到 8 个时，运行时间又增加了。这是并行编程中的常见问题之一: 计算带来的潜在加速必须足够大，才能抵消使用更多处理器所增加的通信成本。在这个例子中，8 个线程甚至比单线程程序还慢(单线程的全局运行时间为 0.00025965359819 秒)。

编辑: 物理核心数和线程数确实不一样。一种检查方法是运行 cat /proc/cpuinfo，输出会列出您的线程，我猜是 4 个。这种技术叫做超线程(hyper-threading)，用来提高并行度。但是，在这两种情况下，您的程序都是使用线程而不是内核进行计算的。

关于c - OpenMP omp fork 2 个线程比 fork 4 个线程快得多，为什么？，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/34848452/

c - OpenMP omp fork 2 个线程比 fork 4 个线程快得多，为什么？

上一篇：linux - Clearcase 客户端安装问题 - Linux RHEL5

下一篇：linux -/proc/self/maps 使用 fwrite 坏地址错误将内存写入文件