python - 为什么 Fortran 中的单变量 Horner 比 NumPy 对应物更快,而双变量 Horner 却不是

标签 python arrays numpy fortran f2py

我想在 Python 中执行多项式微积分。 numpy 中的 polynomial 包对我来说不够快。因此,我决定用 Fortran 重写几个函数,并使用 f2py 创建共享库,这些库可以轻松导入 Python。目前,我正在将我的单变量和双变量多项式评估例程与它们的 numpy 对应例程进行基准测试。

在我使用的单变量例程中 Horner's methodnumpy.polynomial.polynomial.polyval 一样。我观察到 Fortran 程序比 numpy 对应程序更快的因素随着多项式阶数的增加而增加。

在双变量例程中,我两次使用霍纳的方法。首先在 y 中,然后在 x 中。不幸的是,我观察到对于递增的多项式阶数,numpy 对应物 catch 并最终超过了我的 Fortran 例程。由于 numpy.polynomial.polynomial.polyval2d 使用了与我类似的方法,我认为第二个观察结果很奇怪。

我希望这个结果源于我对 Fortran 和 f2py 缺乏经验。有人可能知道为什么单变量例程总是表现更好,而双变量例程只对低阶多项式更好?

编辑 这是我最新更新的代码、基准脚本和性能图:

多项式.f95

subroutine polyval(p, x, pval, nx)

    implicit none

    real(8), dimension(nx), intent(in) :: p
    real(8), intent(in) :: x
    real(8), intent(out) :: pval
    integer, intent(in) :: nx
    integer :: i

    pval = 0.0d0
    do i = nx, 1, -1
        pval = pval*x + p(i)
    end do

end subroutine polyval

subroutine polyval2(p, x, y, pval, nx, ny)

    implicit none

    real(8), dimension(nx, ny), intent(in) :: p
    real(8), intent(in) :: x, y
    real(8), intent(out) :: pval
    integer, intent(in) :: nx, ny
    real(8) :: tmp
    integer :: i, j

    pval = 0.0d0
    do j = ny, 1, -1
        tmp = 0.0d0
        do i = nx, 1, -1
            tmp = tmp*x + p(i, j)
        end do
        pval = pval*y + tmp
    end do

end subroutine polyval2

subroutine polyval3(p, x, y, z, pval, nx, ny, nz)

    implicit none

    real(8), dimension(nx, ny, nz), intent(in) :: p
    real(8), intent(in) :: x, y, z
    real(8), intent(out) :: pval
    integer, intent(in) :: nx, ny, nz
    real(8) :: tmp, tmp2
    integer :: i, j, k

    pval = 0.0d0
    do k = nz, 1, -1
        tmp2 = 0.0d0
        do j = ny, 1, -1
            tmp = 0.0d0
            do i = nx, 1, -1
                tmp = tmp*x + p(i, j, k)
            end do
            tmp2 = tmp2*y + tmp
        end do
        pval = pval*z + tmp2
    end do

end subroutine polyval3

benchmark.py(使用此脚本生成绘图)

import time
import os

import numpy as np
import matplotlib.pyplot as plt

# Compile and import Fortran module
os.system('f2py -c polynomial.f95 --opt="-O3 -ffast-math" \
--f90exec="gfortran-4.8" -m polynomial')
import polynomial

# Create random x and y value
x = np.random.rand()
y = np.random.rand()
z = np.random.rand()

# Number of repetition
repetition = 10

# Number of times to loop over a function
run = 100

# Number of data points
points = 26

# Max number of coefficients for univariate case
n_uni_min = 4
n_uni_max = 100

# Max number of coefficients for bivariate case
n_bi_min = 4
n_bi_max = 100

# Max number of coefficients for trivariate case
n_tri_min = 4
n_tri_max = 100

# Case on/off switch
case_on = [1, 1, 1]

case_1_done = 0
case_2_done = 0
case_3_done = 0

#=================#
# UNIVARIATE CASE #
#=================#

if case_on[0]:

    # Array containing the polynomial order + 1 for several univariate polynomials
    n_uni = np.array([int(x) for x in np.linspace(n_uni_min, n_uni_max, points)])

    # Initialise arrays for storing timing results
    time_uni_numpy = np.zeros(n_uni.size)
    time_uni_fortran = np.zeros(n_uni.size)

    for i in xrange(len(n_uni)):
        # Create random univariate polynomial of order n - 1
        p = np.random.rand(n_uni[i])

        # Time evaluation of polynomial using NumPy
        dt = []
        for j in xrange(repetition):
            t1 = time.time()
            for r in xrange(run): np.polynomial.polynomial.polyval(x, p)
            t2 = time.time()
            dt.append(t2 - t1)
        time_uni_numpy[i] = np.average(dt[2::])

        # Time evaluation of polynomial using Fortran
        dt = []
        for j in xrange(repetition):
            t1 = time.time()
            for r in xrange(run): polynomial.polyval(p, x)
            t2 = time.time()
            dt.append(t2 - t1)
        time_uni_fortran[i] = np.average(dt[2::])

    # Speed-up factor
    factor_uni = time_uni_numpy / time_uni_fortran

    results_uni = np.zeros([len(n_uni), 4])
    results_uni[:, 0] = n_uni
    results_uni[:, 1] = factor_uni
    results_uni[:, 2] = time_uni_numpy
    results_uni[:, 3] = time_uni_fortran
    print results_uni, '\n'

    plt.figure()
    plt.plot(n_uni, factor_uni)
    plt.title('Univariate comparison')
    plt.xlabel('# coefficients')
    plt.ylabel('Speed-up factor')
    plt.xlim(n_uni[0], n_uni[-1])
    plt.ylim(0, max(factor_uni))
    plt.grid(aa=True)

case_1_done = 1

#================#
# BIVARIATE CASE #
#================#

if case_on[1]:

    # Array containing the polynomial order + 1 for several bivariate polynomials
    n_bi = np.array([int(x) for x in np.linspace(n_bi_min, n_bi_max, points)])

    # Initialise arrays for storing timing results
    time_bi_numpy = np.zeros(n_bi.size)
    time_bi_fortran = np.zeros(n_bi.size)

    for i in xrange(len(n_bi)):
        # Create random bivariate polynomial of order n - 1 in x and in y
        p = np.random.rand(n_bi[i], n_bi[i])

        # Time evaluation of polynomial using NumPy
        dt = []
        for j in xrange(repetition):
            t1 = time.time()
            for r in xrange(run): np.polynomial.polynomial.polyval2d(x, y, p)
            t2 = time.time()
            dt.append(t2 - t1)
        time_bi_numpy[i] = np.average(dt[2::])

        # Time evaluation of polynomial using Fortran
        p = np.asfortranarray(p)
        dt = []
        for j in xrange(repetition):
            t1 = time.time()
            for r in xrange(run): polynomial.polyval2(p, x, y)
            t2 = time.time()
            dt.append(t2 - t1)
        time_bi_fortran[i] = np.average(dt[2::])

    # Speed-up factor
    factor_bi = time_bi_numpy / time_bi_fortran

    results_bi = np.zeros([len(n_bi), 4])
    results_bi[:, 0] = n_bi
    results_bi[:, 1] = factor_bi
    results_bi[:, 2] = time_bi_numpy
    results_bi[:, 3] = time_bi_fortran
    print results_bi, '\n'

    plt.figure()
    plt.plot(n_bi, factor_bi)
    plt.title('Bivariate comparison')
    plt.xlabel('# coefficients')
    plt.ylabel('Speed-up factor')
    plt.xlim(n_bi[0], n_bi[-1])
    plt.ylim(0, max(factor_bi))
    plt.grid(aa=True)

case_2_done = 1

#=================#
# TRIVARIATE CASE #
#=================#

if case_on[2]:

    # Array containing the polynomial order + 1 for several bivariate polynomials
    n_tri = np.array([int(x) for x in np.linspace(n_tri_min, n_tri_max, points)])

    # Initialise arrays for storing timing results
    time_tri_numpy = np.zeros(n_tri.size)
    time_tri_fortran = np.zeros(n_tri.size)

    for i in xrange(len(n_tri)):
        # Create random bivariate polynomial of order n - 1 in x and in y
        p = np.random.rand(n_tri[i], n_tri[i])

        # Time evaluation of polynomial using NumPy
        dt = []
        for j in xrange(repetition):
            t1 = time.time()
            for r in xrange(run): np.polynomial.polynomial.polyval3d(x, y, z, p)
            t2 = time.time()
            dt.append(t2 - t1)
        time_tri_numpy[i] = np.average(dt[2::])

        # Time evaluation of polynomial using Fortran
        p = np.asfortranarray(p)
        dt = []
        for j in xrange(repetition):
            t1 = time.time()
            for r in xrange(run): polynomial.polyval3(p, x, y, z)
            t2 = time.time()
            dt.append(t2 - t1)
        time_tri_fortran[i] = np.average(dt[2::])

    # Speed-up factor
    factor_tri = time_tri_numpy / time_tri_fortran

    results_tri = np.zeros([len(n_tri), 4])
    results_tri[:, 0] = n_tri
    results_tri[:, 1] = factor_tri
    results_tri[:, 2] = time_tri_numpy
    results_tri[:, 3] = time_tri_fortran
    print results_tri

    plt.figure()
    plt.plot(n_bi, factor_bi)
    plt.title('Trivariate comparison')
    plt.xlabel('# coefficients')
    plt.ylabel('Speed-up factor')
    plt.xlim(n_tri[0], n_tri[-1])
    plt.ylim(0, max(factor_tri))
    plt.grid(aa=True)
    print '\n'

case_3_done = 1

#==============================================================================

plt.show()

结果 enter image description here enter image description here enter image description here

EDIT 对 steabert 提案的更正

subroutine polyval(p, x, pval, nx)

    implicit none

    real*8, dimension(nx), intent(in) :: p
    real*8, intent(in) :: x
    real*8, intent(out) :: pval
    integer, intent(in) :: nx

    integer, parameter :: simd = 8
    real*8 :: tmp(simd), xpower(simd), maxpower
    integer :: i, j, k

    xpower(1) = x
    do i = 2, simd
        xpower(i) = xpower(i-1)*x
    end do
    maxpower = xpower(simd)

    tmp = 0.0d0
    do i = nx+1, simd+2, -simd
        do j = 1, simd
            tmp(j) = tmp(j)*maxpower + p(i-j)*xpower(simd-j+1)
        end do
    end do

    k = mod(nx-1, simd)
    if (k == 0) then
        pval = sum(tmp) + p(1)
    else
        pval = sum(tmp) + p(k+1)
        do i = k, 1, -1
            pval = pval*x + p(i)
        end do
    end if

end subroutine polyval

EDIT 测试代码以验证上面的代码是否为 x > 1 给出了糟糕的结果

import polynomial as P
import numpy.polynomial.polynomial as PP

import numpy as np

for n in xrange(2,100):
    poly1n = np.random.rand(n)
    poly1f = np.asfortranarray(poly1n)

    x = 2

    print np.linalg.norm(P.polyval(poly1f, x) - PP.polyval(x, poly1n)), '\n'

最佳答案

在双变量情况下,p 是一个二维数组。这意味着 C 与 Fortran 的数组排序不同。默认情况下,numpy 函数给出 C 排序,显然 fortran 例程使用 fortran 排序。

f2py 足够聪明来处理这个问题,并自动在 C 和 Fortran 格式数组之间进行转换。但是,这会导致一些开销,这是性能降低的可能原因之一。您可以通过在计时例程之外使用 numpy.asfortranarray 手动将 p 转换为 fortran 类型来检查这是否是原因。当然,为了使其有意义,在您的实际用例中,您需要确保输入数组采用 Fortran 顺序。

f2py 有一个选项 -DF2PY_REPORT_ON_ARRAY_COPY 可以在复制数组时随时警告您。

如果这不是原因,那么您需要考虑更深入的细节,例如您使用的是哪个 Fortran 编译器,以及它应用了哪种优化。可能会减慢速度的示例包括在堆而不是堆栈上分配数组(对 malloc 的昂贵调用),尽管我希望这种影响对于更大的数组会变得不那么重要。

最后,您应该考虑这样一种可能性,即对于双变量拟合,对于较大的 N,numpy 例程基本上已经处于最佳效率。在这种情况下,numpy 例程可能大部分时间都在运行优化的 C 例程,相比之下,python 代码的开销可以忽略不计。在这种情况下,您不会期望您的 Fortran 代码显示出任何显着的加速。

关于python - 为什么 Fortran 中的单变量 Horner 比 NumPy 对应物更快,而双变量 Horner 却不是,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/18707984/

相关文章:

python - eclipse 中未定义的常量

python - TypeError at/products/object of type 'ModelBase' 没有 len()

arrays - 对象类型的变量作为数组行为不当

java - 有什么方法可以知道数组列表是否包含一段文本?

python - 是否有一个 NumPy 函数来返回数组中某物的第一个索引?

python - Python显示用于查找轮廓的OpenCV断言错误

c++ - 使用 SSE 内在函数将 boolean 数组(8 字节 boolean )转换为 int 或 char

python - 返回 numpy 数组别名内部数组时如何处理引用计数?

python - 使用分隔符python合并列表中的项目

python - RPY2:进口商因 .Renviron 失败