fortran - OpenMP with BLAS

Tags: fortran openmp gfortran blas

Related question

I tried to extend the code from the answer in the link above to include cross-checks and OpenMP.

Program reshape_for_blas

  Use, Intrinsic :: iso_fortran_env, Only :  wp => real64, li => int64

  Implicit None

  Real( wp ), Dimension( :, :    ), Allocatable :: a
  Real( wp ), Dimension( :, :, : ), Allocatable :: b
  Real( wp ), Dimension( :, :, : ), Allocatable :: c1, c2, c3, c4, c5
  Real( wp ), Dimension( :, :    ), Allocatable :: d
  Real( wp ), Dimension( :, :    ), Allocatable :: e

  Integer :: na, nb, nc, nd, ne
  Integer :: la, lb, lc, ld  
  Integer( li ) :: start, finish, rate, numthreads

  numthreads = 2

  call omp_set_num_threads(numthreads)  
  
  
  Write( *, * ) 'na, nb, nc, nd ?'
  Read( *, * ) na, nb, nc, nd
  ne = nc * nd
  Allocate( a ( 1:na, 1:nb ) ) 
  Allocate( b ( 1:nb, 1:nc, 1:nd ) ) 
  Allocate( c1( 1:na, 1:nc, 1:nd ) ) 
  Allocate( c2( 1:na, 1:nc, 1:nd ) ) 
  Allocate( c3( 1:na, 1:nc, 1:nd ) )
  Allocate( c4( 1:na, 1:nc, 1:nd ) )
  Allocate( c5( 1:na, 1:nc, 1:nd ) )  
  Allocate( d ( 1:nb, 1:ne ) ) 
  Allocate( e ( 1:na, 1:ne ) ) 

  ! Set up some data
  Call Random_number( a )
  Call Random_number( b )

  ! With reshapes
  Call System_clock( start, rate )
  
  !write (*,*) 'clock', start, rate
  
  
  d = Reshape( b, Shape( d ) )
  Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a, Size( a, Dim = 1 ), &
                                            d, Size( d, Dim = 1 ), &
                                    0.0_wp, e, Size( e, Dim = 1 ) )
  c1 = Reshape( e, Shape( c1 ) )
  Call System_clock( finish, rate )
  
  !write (*,*) 'clock', finish, rate
  
  
  
  Write( *, * ) 'Time for reshaping method ', Real( finish - start, wp ) / rate
  Write( *, * ) 'Difference between result matrices ', Maxval( Abs( c1 - c2 ) )
  
  
  ! Direct
  Call System_clock( start, rate )
  Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a , Size( a , Dim = 1 ), &
                                            b , Size( b , Dim = 1 ), &
                                            0.0_wp, c2, Size( c2, Dim = 1 ) )
  Call System_clock( finish, rate )
  Write( *, * ) 'Time for straight  method ', Real( finish - start, wp ) / rate
    
    
  
  Call System_clock( start, rate )
  
  !$omp parallel
  ! Direct
  Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a , Size( a , Dim = 1 ), &
                                            b , Size( b , Dim = 1 ), &
                                            0.0_wp, c4, Size( c4, Dim = 1 ) )
  !$omp end parallel
  Call System_clock( finish, rate )  
  Write( *, * ) 'Time for straight  method omp', Real( finish - start, wp ) / rate
    
  
 
  !naive
  Call System_clock( start, rate )

  do la = 1, na 
    do lc = 1, nc
      do ld = 1, nd
         c3(la,lc,ld) = 0.0_wp
      enddo
    enddo
  enddo
  
  do la = 1, na 
    do lb = 1, nb
      do lc = 1, nc
        do ld = 1, nd
          c3(la,lc,ld) = c3(la,lc,ld)  + a(la,lb) * b(lb, lc, ld)
        enddo  
      enddo
    enddo
  enddo  
 
  
  Call System_clock( finish, rate )
  Write( *, * ) 'Time for loop', Real( finish - start, wp ) / rate  
 
   

  !naive omp
  Call System_clock( start, rate )  
  !$omp parallel

  do la = 1, na 
    do lc = 1, nc
      do ld = 1, nd
         c5(la,lc,ld) = 0.0_wp
      enddo
    enddo
  enddo
  
  !$omp do private(la, lb, lc, ld) schedule(dynamic) reduction(+: c5)    
  do la = 1, na 
    do lb = 1, nb
      do lc = 1, nc
        do ld = 1, nd
          c5(la,lc,ld) = c5(la,lc,ld)  + a(la,lb) * b(lb, lc, ld)
        enddo  
      enddo
    enddo
  enddo  
  !$omp end do  
  !$omp end parallel
 
  
  Call System_clock( finish, rate )
  Write( *, * ) 'Time for loop omp', Real( finish - start, wp ) / rate  
 
  
  
  
  
  do la = 1, na 
    do lc = 1, nc
      do ld = 1, nd
         
         if ( dabs(c3(la,lc,ld) - c1(la,lc,ld))  > 1.e-6 ) then 
           write (*,*) '!!! c1', c3(la,lc,ld) - c1(la,lc,ld)
         endif         
         
         
         if ( dabs(c3(la,lc,ld) - c2(la,lc,ld))  > 1.e-6 ) then 
           write (*,*) '!!! c2', c3(la,lc,ld) - c2(la,lc,ld)
         endif
         
         if ( dabs(c3(la,lc,ld) - c4(la,lc,ld))  > 1.e-6 ) then 
           write (*,*) '!!! c4', la,lc,ld, c3(la,lc,ld) - c4(la,lc,ld)
         endif
         
         if ( dabs(c3(la,lc,ld) - c5(la,lc,ld))  > 1.e-6 ) then 
           write (*,*) '!!! c5', la,lc,ld, c3(la,lc,ld) - c5(la,lc,ld)
         endif         
      enddo
    enddo
  enddo  
  
End Program reshape_for_blas

I have two questions:

  1. Neither the BLAS call nor the naive loops show any significant speed-up. For example, compiling with gfortran -std=f2008 -Wall -Wextra -fcheck=all reshape.f90 -lblas -fopenmp and then entering 30 30 30 60, I get
30 30 30 60
 Time for reshaping method    2.9443999999999998E-003
 Difference between result matrices    12.380937791257775
 Time for straight  method    1.0016000000000001E-003
 Time for straight  method omp   2.4878000000000001E-003
 Time for loop   6.6072500000000006E-002
 Time for loop omp  0.100242600000000002
  2. When the dimensions get larger, e.g., 60 60 60 60 in the input, the OpenMP BLAS result can get values different from the naive loop; it seems I am missing some control option. What would be the problems with OpenMP here?

    Edit: I added the omp lines around the initialization of c5 and commented out the two print lines,

    
    Program reshape_for_blas
    
      Use, Intrinsic :: iso_fortran_env, Only :  wp => real64, li => int64
    
      Implicit None
    
      Real( wp ), Dimension( :, :    ), Allocatable :: a
      Real( wp ), Dimension( :, :, : ), Allocatable :: b
      Real( wp ), Dimension( :, :, : ), Allocatable :: c1, c2, c3, c4, c5
      Real( wp ), Dimension( :, :    ), Allocatable :: d
      Real( wp ), Dimension( :, :    ), Allocatable :: e
    
      Integer :: na, nb, nc, nd, ne
      Integer :: la, lb, lc, ld  
      Integer( li ) :: start, finish, rate, numthreads
    
      numthreads = 2
    
      call omp_set_num_threads(numthreads)  
      
      
      Write( *, * ) 'na, nb, nc, nd ?'
      Read( *, * ) na, nb, nc, nd
      ne = nc * nd
      Allocate( a ( 1:na, 1:nb ) ) 
      Allocate( b ( 1:nb, 1:nc, 1:nd ) ) 
      Allocate( c1( 1:na, 1:nc, 1:nd ) ) 
      Allocate( c2( 1:na, 1:nc, 1:nd ) ) 
      Allocate( c3( 1:na, 1:nc, 1:nd ) )
      Allocate( c4( 1:na, 1:nc, 1:nd ) )
      Allocate( c5( 1:na, 1:nc, 1:nd ) )  
      Allocate( d ( 1:nb, 1:ne ) ) 
      Allocate( e ( 1:na, 1:ne ) ) 
    
      ! Set up some data
      Call Random_number( a )
      Call Random_number( b )
    
      ! With reshapes
      Call System_clock( start, rate )
      
      !write (*,*) 'clock', start, rate
      
      
      d = Reshape( b, Shape( d ) )
      Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a, Size( a, Dim = 1 ), &
                                                d, Size( d, Dim = 1 ), &
                                        0.0_wp, e, Size( e, Dim = 1 ) )
      c1 = Reshape( e, Shape( c1 ) )
      Call System_clock( finish, rate )
      
      !write (*,*) 'clock', finish, rate
      
      
      
      Write( *, * ) 'Time for reshaping method ', Real( finish - start, wp ) / rate
      Write( *, * ) 'Difference between result matrices ', Maxval( Abs( c1 - c2 ) )
      
      
      ! Direct
      Call System_clock( start, rate )
      Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a , Size( a , Dim = 1 ), &
                                                b , Size( b , Dim = 1 ), &
                                                0.0_wp, c2, Size( c2, Dim = 1 ) )
      Call System_clock( finish, rate )
      Write( *, * ) 'Time for straight  method ', Real( finish - start, wp ) / rate
        
        
    
      !naive loop
      Call System_clock( start, rate )
    
      do la = 1, na 
        do lc = 1, nc
          do ld = 1, nd
             c3(la,lc,ld) = 0.0_wp
          enddo
        enddo
      enddo
      
      do la = 1, na 
        do lb = 1, nb
          do lc = 1, nc
            do ld = 1, nd
              c3(la,lc,ld) = c3(la,lc,ld)  + a(la,lb) * b(lb, lc, ld)
            enddo  
          enddo
        enddo
      enddo  
     
      
      Call System_clock( finish, rate )
      Write( *, * ) 'Time for loop', Real( finish - start, wp ) / rate  
       
    
    
    
      !dgemm omp 
      Call System_clock( start, rate )
      
      !$omp parallel
      ! Direct
      Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a , Size( a , Dim = 1 ), &
                                                b , Size( b , Dim = 1 ), &
                                                0.0_wp, c4, Size( c4, Dim = 1 ) )
      !$omp end parallel
      Call System_clock( finish, rate )  
      Write( *, * ) 'Time for straight  method omp', Real( finish - start, wp ) / rate
        
      
     
    
      !loop omp
      Call System_clock( start, rate )  
      !$omp parallel
    
      do la = 1, na 
        do lc = 1, nc
          do ld = 1, nd
             c5(la,lc,ld) = 0.0_wp
          enddo
        enddo
      enddo
      
      !$omp do private(la, lb, lc, ld) schedule(dynamic) reduction(+: c5)    
      do la = 1, na 
        do lb = 1, nb
          do lc = 1, nc
            do ld = 1, nd
              c5(la,lc,ld) = c5(la,lc,ld)  + a(la,lb) * b(lb, lc, ld)
            enddo  
          enddo
        enddo
      enddo  
      !$omp end do  
      !$omp end parallel
     
      
      Call System_clock( finish, rate )
      Write( *, * ) 'Time for loop omp', Real( finish - start, wp ) / rate  
     
      
    
    
    !single core: c1 c2 c3
    ! c1 reshape blas
    ! c2 blas
    ! c3 naive loop (reference)
    ! parallel: c4 c5
    ! c4 dgemm parallel
    ! c5 naive loop parallel
    
    
      do la = 1, na 
        do lc = 1, nc
          do ld = 1, nd
             
             if ( dabs(c3(la,lc,ld) - c1(la,lc,ld))  > 1.e-6 ) then 
               write (*,*) '!!! c1', c3(la,lc,ld) - c1(la,lc,ld)
             endif         
             
             
             if ( dabs(c3(la,lc,ld) - c2(la,lc,ld))  > 1.e-6 ) then 
               write (*,*) '!!! c2', c3(la,lc,ld) - c2(la,lc,ld)
             endif
             
             if ( dabs(c3(la,lc,ld) - c4(la,lc,ld))  > 1.e-6 ) then 
               write (*,*) '!!! c4', la,lc,ld, c3(la,lc,ld) - c4(la,lc,ld)
             endif
             
             if ( dabs(c3(la,lc,ld) - c5(la,lc,ld))  > 1.e-6 ) then 
               write (*,*) '!!! c5', la,lc,ld, c3(la,lc,ld) - c5(la,lc,ld)
             endif         
          enddo
        enddo
      enddo  
      
    End Program reshape_for_blas
    
    

    Then gfortran reshape.f90 -lblas -fopenmp with the input 30 30 30 30 gives

     Time for reshaping method    1.3519000000000001E-003
     Difference between result matrices    12.380937791257775
     Time for straight  method    6.2549999999999997E-004
     Time for straight  method omp   1.2600000000000001E-003
     Time for naive loop   1.0008599999999999E-002
     Time for naive loop omp   1.6678999999999999E-002
    

    though the speed-up is still not great.

    Best Answer

    You are calling DGEMM in parallel using the same set of variables (because variables in parallel regions are shared by default in Fortran). This doesn't work and produces weird results due to data races. You have two options:

    • Find a parallel BLAS implementation where DGEMM is already threaded. Intel MKL and OpenBLAS are the prime candidates. Intel MKL uses OpenMP, and more specifically it is built with the Intel OpenMP runtime, so it may not play very nicely with OpenMP code compiled with GCC, but it works perfectly with non-threaded code. A minimal sketch of driving a threaded OpenBLAS this way follows this list.

    • Call DGEMM in parallel, but not with the same set of arguments. Instead, perform a block decomposition of one or both tensors and have each thread do the contraction of a separate block. Since Fortran uses column-major storage, it may be appropriate to decompose the second tensor:

      C[i,k,l=1..L] = A[i,j] * B[j,k,l=1..L]
      

      becomes, with two threads:

      thread 0: C[i,k,l=1..L/2] = A[i,j] * B[j,k,l=1..L/2]
      thread 1: C[i,k,l=L/2+1..L] = A[i,j] * B[j,k,l=L/2+1..L]
      

      For an arbitrary number of threads it boils down to computing the start and end values of the l index in each thread and adjusting the arguments of DGEMM accordingly.
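
    As for the first option, here is a minimal sketch of driving a threaded OpenBLAS build from the unmodified serial call path. openblas_set_num_threads is an OpenBLAS extension (a C function, not part of the reference BLAS interface), so treat the binding below as an assumption to verify against your installation:

      ! ADD: kind parameter for the C binding
      Use, Intrinsic :: iso_c_binding, Only : c_int

      ! ADD: bind the OpenBLAS extension openblas_set_num_threads so the
      ! library's internal thread count can be set at run time
      Interface
        Subroutine openblas_set_num_threads( n ) Bind( C, Name = 'openblas_set_num_threads' )
          Import :: c_int
          Integer( c_int ), Value :: n
        End Subroutine openblas_set_num_threads
      End Interface

      ! The DGEMM call itself stays exactly as in the serial version;
      ! the library threads the multiplication internally
      Call openblas_set_num_threads( 2_c_int )
      Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a, Size( a, Dim = 1 ), &
                                                b, Size( b, Dim = 1 ), &
                                        0.0_wp, c2, Size( c2, Dim = 1 ) )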

    Personally, I would go with a parallel BLAS implementation. With Intel MKL, you just link against the parallel driver and it automagically uses all available CPUs.
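
    For reference, a GNU Fortran link line for the threaded MKL layer typically looks something like the one below; the exact library names depend on the MKL version and interface layer, so check them against Intel's link-line advisor:

      gfortran reshape.f90 -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl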

    Here is a sample implementation of the block decomposition. Only the additions and changes to the original code are shown:

      ! ADD: Use the OpenMP module
      Use :: omp_lib
    
      ! ADD: Variables used for the decomposition
      Integer :: ithr, istart, iend
    
      ! CHANGE: OpenMP with block decomposition
      !$omp parallel private(ithr, istart, iend)
        ithr = omp_get_thread_num()
    
        ! First index (plane) in B for the current thread
        istart = ithr * nd / omp_get_num_threads()
        ! First index (plane) in B for the next thread
        iend = (ithr + 1) * nd / omp_get_num_threads()
    
        Call dgemm('N', 'N', na, nc * (iend - istart), nb, 1.0_wp, a, Size(a, Dim = 1), &
                   b(1, 1, 1 + istart), Size(b, Dim = 1), &
                   0.0_wp, c4(1, 1, 1 + istart), Size(c4, Dim = 1))
      !$omp end parallel
    

    istart is the index of the first plane of B that each individual thread works on. iend is the first plane of the next thread, so iend - istart is the number of planes for the current thread. b(1, 1, 1 + istart) is where the block of planes starts in B. c4(1, 1, 1 + istart) is where the block starts in the result tensor.
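
    For example, with the input 60 60 60 60 (nd = 60) and two threads, thread 0 computes istart = 0 * 60 / 2 = 0 and iend = 1 * 60 / 2 = 30, so it contracts planes 1..30 of B, while thread 1 gets istart = 30 and iend = 60, i.e., planes 31..60. Each thread's DGEMM call then produces an na x (nc * 30) slab of the result tensor.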

    Make sure you do one of the two, but not both. I.e., if your BLAS implementation is threaded but you decide to do the block decomposition, disable the threading in the BLAS library. Conversely, if you rely on the threading in the BLAS implementation, do not perform the block decomposition in your code.
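
    As a concrete illustration (the variable names are library-specific conventions, so verify them for your BLAS build), the BLAS-internal threading can typically be switched off from the shell before running the block-decomposed version:

      export OPENBLAS_NUM_THREADS=1   # OpenBLAS
      export MKL_NUM_THREADS=1        # Intel MKL
      ./a.out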

    Regarding fortran - OpenMP with BLAS, we found a similar question on Stack Overflow: https://stackoverflow.com/questions/66296334/
