我正在学习并行计算,写了下面这段代码:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
void Usage(char* prog_name);
double f(double x); /* Function we're integrating */
double Local_trap(double a, double b, int n);
/*
 * Driver for the timed trapezoidal-rule experiment.
 *
 * Reads a, b, n, thread_count and repeat_times from stdin, integrates f over
 * [a, b] with n trapezoids on an OpenMP team of thread_count threads, and
 * repeats the computation repeat_times times.  It then prints the average
 * wall-clock time of one pass and, per thread, the average time spent before
 * the work started ("forks"), doing the work ("runs"), and between finishing
 * the work and the end of the parallel region ("joins").
 *
 * Returns 0 on success and EXIT_FAILURE on malformed input or allocation
 * failure.  When n is not divisible by thread_count, Usage() is called.
 */
int main(int argc, char* argv[])
{
    double global_result = 0.0; /* Integral estimate from the latest pass */
    double a, b;                /* Left and right endpoints */
    int n;                      /* Total number of trapezoids */
    int thread_count, repeat_times;
    double *time_fork, *time_elapsed, *time_join, *time_end;
    double global_start, global_finish, global_time = 0.0;

    printf("Enter a, b, n, thread_count and repeat times, n mod thread_count should = 0.\n");
    /* scanf reports how many items it converted; anything but 5 leaves some
     * of the variables above uninitialised. */
    if (scanf("%lf %lf %d %d %d", &a, &b, &n, &thread_count, &repeat_times) != 5) {
        fprintf(stderr, "error: expected five values: a b n thread_count repeat_times\n");
        return EXIT_FAILURE;
    }
    /* thread_count == 0 would make the modulo below undefined, and
     * repeat_times == 0 would make every printed average meaningless. */
    if (n <= 0 || thread_count <= 0 || repeat_times <= 0) {
        fprintf(stderr, "error: n, thread_count and repeat_times must all be positive\n");
        return EXIT_FAILURE;
    }
    if (n % thread_count != 0) {
        Usage(argv[0]);
    }

    /* calloc zero-fills, so no separate initialisation pass is needed and
     * time_end never holds indeterminate values. */
    time_fork = calloc((size_t)thread_count, sizeof *time_fork);
    time_elapsed = calloc((size_t)thread_count, sizeof *time_elapsed);
    time_join = calloc((size_t)thread_count, sizeof *time_join);
    time_end = calloc((size_t)thread_count, sizeof *time_end);
    if ((NULL == time_elapsed) || (NULL == time_fork) || (NULL == time_join) || (NULL == time_end)) {
        fprintf(stderr, "error: out of memory\n");
        free(time_fork);
        free(time_elapsed);
        free(time_join);
        free(time_end);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < repeat_times; i++) {
        /* Reduce into a per-pass variable: accumulating into one variable
         * across all passes made the printed estimate repeat_times times
         * larger than the integral. */
        double pass_result = 0.0;
        /* Threads actually delivered; num_threads() is only a request. */
        int team_size = 0;

        global_start = omp_get_wtime();
        #pragma omp parallel num_threads(thread_count) reduction(+:pass_result)
        {
            /* Line the team up so "fork" time is measured from a common point. */
            #pragma omp barrier
            int my_rank = omp_get_thread_num();
            double my_start = omp_get_wtime();
            double my_finish;

            if (my_rank == 0) {
                team_size = omp_get_num_threads();
            }
            time_fork[my_rank] += (my_start - global_start);

            pass_result += Local_trap(a, b, n);

            my_finish = omp_get_wtime();
            time_elapsed[my_rank] += (my_finish - my_start);
            time_end[my_rank] = my_finish;
        }
        global_finish = omp_get_wtime();
        global_result = pass_result;

        /* A plain loop is enough here; only ranks that really ran have a
         * valid time_end entry for this pass. */
        for (int j = 0; j < team_size; j++) {
            time_join[j] += (global_finish - time_end[j]);
        }
        global_time += (global_finish - global_start);
    }

    printf("The global run time is %.14f seconds.\n", global_time/repeat_times);
    for (int i = 0; i < thread_count; i++) {
        printf("The thread %d runs %.14f seconds.\n", i, time_elapsed[i]/repeat_times);
        printf("The thread %d forks %.14f seconds.\n", i, time_fork[i]/repeat_times);
        printf("The thread %d joins %.14f seconds.\n", i, time_join[i]/repeat_times);
    }
    printf("With n = %d trapezoids, our estimate\n", n);
    printf("of the integral from %f to %f = %.14e\n", a, b, global_result);

    free(time_fork);
    free(time_elapsed);
    free(time_join);
    free(time_end);
    return 0;
} /* main */
/*
 * Prints a usage message to stderr and terminates the program.
 *
 * prog_name: program name inserted into the first line of the message.
 *
 * Does not return.  Exits with EXIT_FAILURE, because this is only reached
 * when the input was rejected; exiting with 0 would report success to the
 * calling shell or script.
 *
 * NOTE(review): the first line advertises a "<number of threads>" argument,
 * while the visible caller reads all of its input from stdin - confirm
 * whether the wording should be updated.
 */
void Usage(char* prog_name)
{
    fprintf(stderr, "usage: %s <number of threads>\n", prog_name);
    fprintf(stderr, " number of trapezoids must be evenly divisible by\n");
    fprintf(stderr, " number of threads\n");
    exit(EXIT_FAILURE);
}
/*
 * Integrand used by the trapezoidal rule.
 *
 * x: point at which to evaluate the function.
 *
 * Returns x squared.
 */
double f(double x)
{
    return x * x;
} /* f */
/*
 * Computes the calling thread's share of the trapezoidal-rule estimate of
 * the integral of f over [a, b].
 *
 * a, b: left and right endpoints of the whole interval.
 * n:    total number of trapezoids across all threads.
 *
 * The n trapezoids are split into contiguous runs, one per thread, using
 * omp_get_thread_num() and omp_get_num_threads().  If n is not a multiple of
 * the team size, the first (n % team size) threads take one extra trapezoid,
 * so no trapezoid is dropped even when the runtime supplies a different
 * number of threads than the caller asked for.  When n is a multiple of the
 * team size the arithmetic is the same as an even n / team-size split.
 *
 * Returns this thread's partial sum; the caller adds the partial sums of all
 * threads to obtain the estimate.  Returns 0.0 when n <= 0 or when this
 * thread has no trapezoids to process.
 */
double Local_trap(double a, double b, int n)
{
    int my_rank = omp_get_thread_num();
    int thread_count = omp_get_num_threads();
    int base_n, extra, local_n, first, i;
    double h, local_a, local_b, my_result;

    if (n <= 0) {
        return 0.0;
    }

    h = (b - a) / n;
    base_n = n / thread_count;
    extra = n % thread_count;

    /* Ranks below 'extra' take one additional trapezoid each. */
    local_n = base_n + (my_rank < extra ? 1 : 0);
    if (local_n == 0) {
        /* More threads than trapezoids: nothing to integrate here. */
        return 0.0;
    }

    /* Index of this thread's first trapezoid, counting the extras handed to
     * the lower-ranked threads. */
    first = my_rank * base_n + (my_rank < extra ? my_rank : extra);
    local_a = a + first * h;
    local_b = local_a + local_n * h;

    /* The two endpoints carry weight 1/2, interior points weight 1. */
    my_result = (f(local_a) + f(local_b)) / 2.0;
    for (i = 1; i <= local_n - 1; i++) {
        my_result += f(local_a + i * h);
    }
    return my_result * h;
}
我在 Ubuntu 14.04 上编译,笔记本的 CPU 是 i3(4 线程),编译命令是:gcc -g3 -Wall -fopenmp -std=c99 -o Assignment2 Assignment2.c
输入为 1 2 12000 2 10 时的输出:
The global run time is 0.00013472399987 seconds.
The thread 0 runs 0.00013350439967 seconds.
The thread 0 forks 0.00000079790025 seconds.
The thread 0 joins 0.00000042169995 seconds.
The thread 1 runs 0.00013322920022 seconds.
The thread 1 forks 0.00000062119998 seconds.
The thread 1 joins 0.00000087359967 seconds.
With n = 12000 trapezoids, our estimate
of the integral from 1.000000 to 2.000000 = 2.33333333449074e+01
输入为 1 2 12000 4 10 时的输出:
The global run time is 0.00781751800023 seconds.
The thread 0 runs 0.00006403259995 seconds.
The thread 0 forks 0.00621278830040 seconds.
The thread 0 joins 0.00154069709988 seconds.
The thread 1 runs 0.00006628699975 seconds.
The thread 1 forks 0.00575844590039 seconds.
The thread 1 joins 0.00199278510008 seconds.
The thread 2 runs 0.00006636039980 seconds.
The thread 2 forks 0.00551087460044 seconds.
The thread 2 joins 0.00224028299999 seconds.
The thread 3 runs 0.00006544990010 seconds.
The thread 3 forks 0.00564311910020 seconds.
The thread 3 joins 0.00210894899992 seconds.
With n = 12000 trapezoids, our estimate
of the integral from 1.000000 to 2.000000 = 2.33333333449074e+01
我不明白为什么 fork 4 个线程的开销比 fork 2 个线程高这么多。是不是我的计时方式有问题?
最佳答案
你的计时方式没有问题。在不开启任何优化的情况下,线程数量增加会带来更多的管理/同步开销。但是,使用 -O3 优化后,你会看到明显的加速。我的机器有 8 个线程,使用 2、4、6、8 个线程时的运行时间分别是:
The global run time is 0.00013457789901 seconds.
The global run time is 0.00006983749627 seconds.
The global run time is 0.00004531119484 seconds.
The global run time is 0.00032387300453 seconds.
注意当线程数达到 8 时,运行时间又增加了。这是并行编程中的常见问题之一:计算中可能获得的加速必须足够大,才能抵消使用更多处理器所增加的通信成本。在这个例子中,8 线程甚至比单线程程序还慢(单线程的全局运行时间为 0.00025965359819 秒)。
编辑:
物理核心数和线程数确实不一样。一种检查方法是执行 cat /proc/cpuinfo,其输出会列出你的(硬件)线程。我猜你的机器有 4 个。这种技术叫做超线程(hyper-threading),用来提高并行度。但是,无论哪种情况,你的程序都是使用线程而不是物理核心进行计算的。
关于c - OpenMP omp fork 2 个线程比 fork 4 个线程快得多,为什么?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/34848452/