我不明白为什么这种矩阵乘法的实现在并行运行时,C++ 版本比相应的 Fortran 代码慢约 3 倍,而两者的串行版本速度大致相同。
program scheduling
  ! Tiled (blocked) matrix multiply C = A * B, timed with OpenMP wall clock.
  ! B is pre-transposed into d so the innermost loop reads both operands
  ! with unit stride.
  !$ use omp_lib
  implicit none
  integer :: i, j, k, x, y, z
  integer, parameter :: tile = 8, N = 1000
  ! Matrices must live on the heap: with OpenMP, large stack-allocated
  ! (automatic) arrays would overflow the thread stack for big N.
  double precision, dimension(:), allocatable :: a, b, c, d
  double precision :: E, S

  allocate(a(N*N))
  allocate(b(N*N))
  allocate(c(N*N))
  allocate(d(N*N))

  call random_seed()
  call random_number(b)
  call random_number(a)

  ! BUG FIX: c is an accumulator (c = c + ...) but was never initialized.
  ! allocate() does not zero memory, so both the result and the timing
  ! (via possible denormals/NaNs in the sum) were undefined.
  c = 0.0d0

  ! Store transpose(b) in d: the product loop then accesses d contiguously.
  do i = 1, N
     do j = 1, N
        d((i-1)*N + j) = b((j-1)*N + i)
     end do
  end do

  S = omp_get_wtime()
  ! Static schedule over i-tiles: each thread writes a disjoint set of
  ! rows of c, so no synchronization is needed on the accumulation.
  !$OMP PARALLEL DO SHARED(a,d,c) PRIVATE(i,j,k,x,y,z) SCHEDULE(static)
  do i = 1, N, tile
     do j = 1, N, tile
        do k = 1, N, tile
           ! min(...) guards the partial tiles at the matrix edge.
           do x = i, min(i+tile-1, N)
              do y = j, min(j+tile-1, N)
                 do z = k, min(k+tile-1, N)
                    c((x-1)*N + y) = c((x-1)*N + y) + a((x-1)*N + z) * d(z + (y-1)*N)
                 end do
              end do
           end do
        end do
     end do
  end do
  !$OMP END PARALLEL DO
  E = omp_get_wtime()

  print *, (E - S)

  ! Deallocation of memory
  deallocate(a)
  deallocate(b)
  deallocate(c)
  deallocate(d)
end program scheduling
并编译:
$ gfortran -O3 -fopenmp scheduling.f08 -o scheduling
$ ./scheduling
0.901.... !for the parallel version and
1.3496... !for the serial version
(顺便说一句,比 a(i,j) 等索引版本慢)
和C++代码:
#include <iostream>
#include <cmath>
#include <omp.h>
#include <cstdlib>
// Tiled (blocked) matrix multiply c = a * b, mirroring the Fortran version.
// b is pre-transposed into d so the innermost loop reads both operands
// with unit stride.
int main(int argc, char *argv[])
{
    // Loop indices declared at function scope so the (optional) OpenMP
    // pragma below can list them in private(...).
    int i,j,k,x,y,z;
    const int N = 1000;
    const int tile = 8;

    double* a = new double[N*N];
    double* b = new double[N*N];
    // BUG FIX: c is an accumulator (c[..] = c[..] + ...), but a plain
    // `new double[N*N]` leaves the elements uninitialized — reading them
    // is undefined behavior, and garbage bit patterns (denormals/NaNs)
    // can also wreck the timing. The trailing `()` value-initializes
    // every element to 0.0.
    double* c = new double[N*N]();
    double* d = new double[N*N];

    for(int i = 0; i < N; i++){
        for(int j = 0; j < N; j++){
            a[i*N+j] = rand()%1000;
            b[i*N+j] = rand()%1000;
        }
    }

    // Store transpose(b) in d: the product loop then accesses d contiguously.
    for(int i = 0; i < N; i++){
        for(int j = 0; j < N; j++){
            d[i*N+j] = b[i+j*N];
        }
    }

    double start = omp_get_wtime();
    // Uncomment to parallelize. A static schedule over i-tiles gives each
    // thread a disjoint set of rows of c, so no synchronization is needed.
    //#pragma omp parallel for shared(a,c,d) private(i,j,k,x,y,z) schedule(static)
    for( i = 0; i < N; i+=tile){
        for( j = 0; j < N; j+=tile){
            for( k = 0; k < N; k+=tile){
                // std::min(...) guards the partial tiles at the matrix edge.
                for( x = i; x < std::min(i+tile,N); x++){
                    for( y = j; y < std::min(j+tile,N); y++){
                        for( z = k; z < std::min(k+tile,N); z++){
                            c[x*N+y] = c[x*N+y] + a[x*N+z] * d[z+y*N];
                        }
                    }
                }
            }
        }
    }
    double end = omp_get_wtime();

    std::cout << (end-start) << std::endl;

    delete[] a;
    delete[] b;
    delete[] c;
    delete[] d;
    return 0;
}
$g++ -O3 -fopenmp parallel.cpp -o parallel
$./parallel
2.347... //for the parallel version and
1.47... //for the serial one
我真的看不出这两段代码之间的区别。它们的运行速度应该大致相同,但事实并非如此,我不知道为什么。从串行版本来看,代码似乎是正确的(至少符合我的预期),但并行版本的表现却大不相同。
最佳答案
有必要明确指定线程数。这是我更改的内容:
... ceteris paribus ...
#pragma omp parallel for num_threads(2) shared(a,c,d) private(i,j,k,x,y,z)
(注意:omp_set_num_threads() 是运行时库函数,不能写在 pragma 内部;在 pragma 上应使用 num_threads(2) 子句。)
...
$ ./MM
0.98...
比串行版本快一点。但至少不会慢很多。
希望对您有所帮助。
关于C++ 与 Fortran 并行 MM 速度差异循环平铺,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/27187457/