我希望得到像别担心,编译器会处理这个问题的答案,但我不确定。
当我在 Fortran 中的某些自定义类型/类中创建某些方法时,是否会因引用/取消引用对象的字段而导致性能下降,例如 this%a(i) = this%b(i) + this %c(i)
与仅使用像 a(i) = b(i) + c(i)
更复杂的示例:
例如,我有一个函数,它应该在 3D 网格上插入一个值,这确实对性能至关重要(它将在其他 3D 数组上的三重循环内调用)。所以我在想是否更好(为了性能)使用类的方法,或者更确切地说,创建一个以数组作为参数的普通子例程。
type grid3D ! 3D grid maps of observables
real, dimension (3) :: Rmin, Rmax, Rspan, step ! grid size and spacing (x,y,z)
integer, dimension (3) :: N ! dimension in x,y,z
real, dimension (3,:, :, :), allocatable :: f ! array storing values of othe observable
contains
procedure :: interpolate => grid3D_interpolate
end type grid3D
function grid3D_interpolate(this, R ) result(ff)
implicit none
! variables
class (grid3D) :: this
real, dimension (3), intent (in) :: R
real :: ff
integer ix0,iy0,iz0
integer ix1,iy1,iz1
real dx,dy,dz
real mx,my,mz
! function body
ix0 = int( (R(1)/this%step(1)) + fastFloorOffset ) - fastFloorOffset
iy0 = int( (R(2)/this%step(2)) + fastFloorOffset ) - fastFloorOffset
iz0 = int( (R(3)/this%step(3)) + fastFloorOffset ) - fastFloorOffset
dx = R(1) - x0*this%step(1)
dy = R(2) - y0*this%step(2)
dz = R(3) - z0*this%step(3)
ix0 = modulo( x0 , this%N(1) )+1
iy0 = modulo( y0 , this%N(2) )+1
iz0 = modulo( z0 , this%N(3) )+1
ix1 = modulo( x0+1 , this%N(1) )+1
iy1 = modulo( y0+1 , this%N(2) )+1
iz1 = modulo( z0+1 , this%N(3) )+1
mx=1.0-dx
my=1.0-dy
mz=1.0-dz
ff = mz*(my*(mx*this%f(ix0,iy0,iz0) &
+dx*this%f(ix1,iy0,iz0)) &
+dy*(mx*this%f(ix0,iy1,iz0) &
+dx*this%f(ix1,iy1,iz0))) &
+dz*(my*(mx*this%f(ix0,iy0,iz1) &
+dx*this%f(ix1,iy0,iz1)) &
+dy*(mx*this%f(ix0,iy1,iz1) &
+dx*this%f(ix1,iy1,iz1)))
end if
end function grid3D_interpolate
end module T_grid3Dvec
最佳答案
并非如此。
- 只要你的代码结构(对于编译器来说)非常清晰,它就可以很容易地对其进行优化。
- 一旦您的 OOP 结构变得过于复杂,或者取消引用的级别变得太大,您可能会通过手动取消引用方案获得一些改进。 (我经常使用它,尽管通常是为了让我的代码保持可读性。但我曾经在这里有过一点改进,但代码使用了 >5 个级别的取消引用。)
这是一些示例:
module vec_mod
implicit none
type t_vector
real :: x = 0.
real :: y = 0.
real :: z = 0.
end type
type t_group
type(t_vector),allocatable :: vecs(:)
end type
contains
subroutine sum_vec( vecs, res )
implicit none
type(t_vector),intent(in) :: vecs(:)
type(t_vector),intent(out) :: res
integer :: i
res%x = 0. ; res%y = 0. ; res%z = 0.
do i=1,size(vecs)
res%x = res%x + vecs(i)%x
res%y = res%y + vecs(i)%y
res%z = res%z + vecs(i)%z
enddo
end subroutine
subroutine sum_vec_ptr( vecs, res )
implicit none
type(t_vector),intent(in),target :: vecs(:)
type(t_vector),intent(out) :: res
integer :: i
type(t_vector),pointer :: curVec
res%x = 0. ; res%y = 0. ; res%z = 0.
do i=1,size(vecs)
curVec => vecs(i)
res%x = res%x + curVec%x
res%y = res%y + curVec%y
res%z = res%z + curVec%z
enddo
end subroutine
subroutine sum_vecGrp( vecGrp, res )
implicit none
type(t_group),intent(in) :: vecGrp
type(t_vector),intent(out) :: res
integer :: i
res%x = 0. ; res%y = 0. ; res%z = 0.
do i=1,size(vecGrp%vecs)
res%x = res%x + vecGrp%vecs(i)%x
res%y = res%y + vecGrp%vecs(i)%y
res%z = res%z + vecGrp%vecs(i)%z
enddo
end subroutine
subroutine sum_vecGrp_ptr( vecGrp, res )
implicit none
type(t_group),intent(in),target :: vecGrp
type(t_vector),intent(out) :: res
integer :: i
type(t_vector),pointer :: curVec, vecs(:)
res%x = 0. ; res%y = 0. ; res%z = 0.
vecs => vecGrp%vecs
do i=1,size(vecs)
curVec => vecs(i)
res%x = res%x + curVec%x
res%y = res%y + curVec%y
res%z = res%z + curVec%z
enddo
end subroutine
end module
program test
use omp_lib
use vec_mod
use,intrinsic :: ISO_Fortran_env
implicit none
type(t_vector),allocatable :: vecs(:)
type(t_vector) :: res
type(t_group) :: vecGrp
integer,parameter :: N=100000000
integer :: i, stat
real(REAL64) :: t1, t2
allocate( vecs(N), vecGrp%vecs(N), stat=stat )
if (stat /= 0) stop 'Cannot allocate memory'
do i=1,N
call random_number(vecs(i)%x)
call random_number(vecs(i)%y)
call random_number(vecs(i)%z)
enddo
print *,''
print *,'1 Level'
t1 = omp_get_wtime()
call sum_vec( vecs, res )
print *,res
t2 = omp_get_wtime()
print *,'Normal [s]:', t2-t1
t1 = omp_get_wtime()
call sum_vec_ptr( vecs, res )
print *,res
t2 = omp_get_wtime()
print *,'Pointer [s]:', t2-t1
print *,''
print *,'2 Levels'
vecGrp%vecs = vecs
t1 = omp_get_wtime()
call sum_vecGrp( vecGrp, res )
print *,res
t2 = omp_get_wtime()
print *,'Normal [s]:', t2-t1
t1 = omp_get_wtime()
call sum_vecGrp_ptr( vecGrp, res )
print *,res
t2 = omp_get_wtime()
print *,'Pointer [s]:', t2-t1
end program
使用默认选项(gfortran test.F90 -fopenmp
)编译,手动取消引用对三有一点好处,特别是对于两级取消引用:
OMP_NUM_THREADS=1 ./a.out
1 Level
16777216.0 16777216.0 16777216.0
Normal [s]: 0.69216769299237058
16777216.0 16777216.0 16777216.0
Pointer [s]: 0.67321390099823475
2 Levels
16777216.0 16777216.0 16777216.0
Normal [s]: 0.84902219301147852
16777216.0 16777216.0 16777216.0
Pointer [s]: 0.71247501399193425
一旦打开优化(gfortran test.F90 -fopenmp -O3
),您可以看到编译器实际上自动做得更好:
OMP_NUM_THREADS=1 ./a.out
1 Level
16777216.0 16777216.0 16777216.0
Normal [s]: 0.13888958499592263
16777216.0 16777216.0 16777216.0
Pointer [s]: 0.19099253200693056
2 Levels
16777216.0 16777216.0 16777216.0
Normal [s]: 0.13436777899914887
16777216.0 16777216.0 16777216.0
Pointer [s]: 0.21104205500159878
关于performance - Fortran 中取消引用类属性的速度,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/24326157/