python - 优化里德-所罗门编码器(多项式除法)

我正在尝试优化 Reed-Solomon 编码器，它实际上只是对伽罗瓦域 2^8 的多项式除法运算(这仅意味着值环绕超过 255)。该代码实际上与 Go 的代码非常相似:http://research.swtch.com/field

这里使用的多项式除法算法是 synthetic division (也称为霍纳法)。

我什么都试过了:numpy、pypy、cython。我获得的最佳性能是使用 pypy 和这个简单的嵌套循环:

def rsenc(msg_in, nsym, gen):
    '''Reed-Solomon encoding using polynomial division, better explained at http://research.swtch.com/field'''
    msg_out = bytearray(msg_in) + bytearray(len(gen)-1)
    lgen = bytearray([gf_log[gen[j]] for j in xrange(len(gen))])

    for i in xrange(len(msg_in)):
        coef = msg_out[i]
        # coef = gf_mul(msg_out[i], gf_inverse(gen[0]))  // for general polynomial division (when polynomials are non-monic), we need to compute: coef = msg_out[i] / gen[0]
        if coef != 0: # coef 0 is normally undefined so we manage it manually here (and it also serves as an optimization btw)
            lcoef = gf_log[coef] # precaching

            for j in xrange(1, len(gen)): # optimization: can skip g0 because the first coefficient of the generator is always 1! (that's why we start at position 1)
                msg_out[i + j] ^= gf_exp[lcoef + lgen[j]] # equivalent (in Galois Field 2^8) to msg_out[i+j] += msg_out[i] * gen[j]

    # Recopy the original message bytes
    msg_out[:len(msg_in)] = msg_in
    return msg_out

Python 优化向导能否指导我找到一些关于如何获得加速的线索？我的目标是至少获得 3 倍的加速，但更多会很棒。任何方法或工具都可以接受，只要它是跨平台的(至少适用于 Linux 和 Windows)。

这是一个小测试脚本，其中包含我尝试过的其他一些替代方案(不包括 cython 尝试，因为它比原生 python 慢!):

import random
from operator import xor

numpy_enabled = False
try:
    import numpy as np
    numpy_enabled = True
except ImportError:
    pass

# Exponent table for 3, a generator for GF(256)
gf_exp = bytearray([1, 3, 5, 15, 17, 51, 85, 255, 26, 46, 114, 150, 161, 248, 19,
    53, 95, 225, 56, 72, 216, 115, 149, 164, 247, 2, 6, 10, 30, 34,
    102, 170, 229, 52, 92, 228, 55, 89, 235, 38, 106, 190, 217, 112,
    144, 171, 230, 49, 83, 245, 4, 12, 20, 60, 68, 204, 79, 209, 104,
    184, 211, 110, 178, 205, 76, 212, 103, 169, 224, 59, 77, 215, 98,
    166, 241, 8, 24, 40, 120, 136, 131, 158, 185, 208, 107, 189, 220,
    127, 129, 152, 179, 206, 73, 219, 118, 154, 181, 196, 87, 249, 16,
    48, 80, 240, 11, 29, 39, 105, 187, 214, 97, 163, 254, 25, 43, 125,
    135, 146, 173, 236, 47, 113, 147, 174, 233, 32, 96, 160, 251, 22,
    58, 78, 210, 109, 183, 194, 93, 231, 50, 86, 250, 21, 63, 65, 195,
    94, 226, 61, 71, 201, 64, 192, 91, 237, 44, 116, 156, 191, 218,
    117, 159, 186, 213, 100, 172, 239, 42, 126, 130, 157, 188, 223,
    122, 142, 137, 128, 155, 182, 193, 88, 232, 35, 101, 175, 234, 37,
    111, 177, 200, 67, 197, 84, 252, 31, 33, 99, 165, 244, 7, 9, 27,
    45, 119, 153, 176, 203, 70, 202, 69, 207, 74, 222, 121, 139, 134,
    145, 168, 227, 62, 66, 198, 81, 243, 14, 18, 54, 90, 238, 41, 123,
    141, 140, 143, 138, 133, 148, 167, 242, 13, 23, 57, 75, 221, 124,
    132, 151, 162, 253, 28, 36, 108, 180, 199, 82, 246] * 2 + [1])

# Logarithm table, base 3
gf_log = bytearray([0, 0, 25, 1, 50, 2, 26, 198, 75, 199, 27, 104, 51, 238, 223, # BEWARE: the first entry should be None instead of 0 because it's undefined, but for a bytearray we can't set such a value
        3, 100, 4, 224, 14, 52, 141, 129, 239, 76, 113, 8, 200, 248, 105,
        28, 193, 125, 194, 29, 181, 249, 185, 39, 106, 77, 228, 166, 114,
        154, 201, 9, 120, 101, 47, 138, 5, 33, 15, 225, 36, 18, 240, 130,
        69, 53, 147, 218, 142, 150, 143, 219, 189, 54, 208, 206, 148, 19,
        92, 210, 241, 64, 70, 131, 56, 102, 221, 253, 48, 191, 6, 139, 98,
        179, 37, 226, 152, 34, 136, 145, 16, 126, 110, 72, 195, 163, 182,
        30, 66, 58, 107, 40, 84, 250, 133, 61, 186, 43, 121, 10, 21, 155,
        159, 94, 202, 78, 212, 172, 229, 243, 115, 167, 87, 175, 88, 168,
        80, 244, 234, 214, 116, 79, 174, 233, 213, 231, 230, 173, 232, 44,
        215, 117, 122, 235, 22, 11, 245, 89, 203, 95, 176, 156, 169, 81,
        160, 127, 12, 246, 111, 23, 196, 73, 236, 216, 67, 31, 45, 164,
        118, 123, 183, 204, 187, 62, 90, 251, 96, 177, 134, 59, 82, 161,
        108, 170, 85, 41, 157, 151, 178, 135, 144, 97, 190, 220, 252, 188,
        149, 207, 205, 55, 63, 91, 209, 83, 57, 132, 60, 65, 162, 109, 71,
        20, 42, 158, 93, 86, 242, 211, 171, 68, 17, 146, 217, 35, 32, 46,
        137, 180, 124, 184, 38, 119, 153, 227, 165, 103, 74, 237, 222, 197,
        49, 254, 24, 13, 99, 140, 128, 192, 247, 112, 7])

if numpy_enabled:
    np_gf_exp = np.array(gf_exp)
    np_gf_log = np.array(gf_log)

def gf_pow(x, power):
    return gf_exp[(gf_log[x] * power) % 255]

def gf_poly_mul(p, q):
    r = [0] * (len(p) + len(q) - 1)
    lp = [gf_log[p[i]] for i in xrange(len(p))]
    for j in range(len(q)):
        lq = gf_log[q[j]]
        for i in range(len(p)):
            r[i + j] ^= gf_exp[lp[i] + lq]
    return r

def rs_generator_poly_base3(nsize, fcr=0):
    g_all = {}
    g = [1]
    g_all[0] = g_all[1] = g
    for i in range(fcr+1, fcr+nsize+1):
        g = gf_poly_mul(g, [1, gf_pow(3, i)])
        g_all[nsize-i] = g
    return g_all

# Fastest way with pypy
def rsenc(msg_in, nsym, gen):
    '''Reed-Solomon encoding using polynomial division, better explained at http://research.swtch.com/field'''
    msg_out = bytearray(msg_in) + bytearray(len(gen)-1)
    lgen = bytearray([gf_log[gen[j]] for j in xrange(len(gen))])

    for i in xrange(len(msg_in)):
        coef = msg_out[i]
        # coef = gf_mul(msg_out[i], gf_inverse(gen[0]))  # for general polynomial division (when polynomials are non-monic), the usual way of using synthetic division is to divide the divisor g(x) with its leading coefficient (call it a). In this implementation, this means:we need to compute: coef = msg_out[i] / gen[0]
        if coef != 0: # coef 0 is normally undefined so we manage it manually here (and it also serves as an optimization btw)
            lcoef = gf_log[coef] # precaching

            for j in xrange(1, len(gen)): # optimization: can skip g0 because the first coefficient of the generator is always 1! (that's why we start at position 1)
                msg_out[i + j] ^= gf_exp[lcoef + lgen[j]] # equivalent (in Galois Field 2^8) to msg_out[i+j] += msg_out[i] * gen[j]

    # Recopy the original message bytes
    msg_out[:len(msg_in)] = msg_in
    return msg_out

# Alternative 1: the loops were completely changed, instead of fixing msg_out[i] and updating all subsequent i+j items, we now fixate msg_out[i+j]  and compute it at once using all couples msg_out[i] * gen[j] - msg_out[i+1] * gen[j-1] - ... since when we fixate msg_out[i+j], all previous msg_out[k] with k < i+j are already fully computed.
def rsenc_alt1(msg_in, nsym, gen):
    msg_in = bytearray(msg_in)
    msg_out = bytearray(msg_in) + bytearray(len(gen)-1)
    lgen = bytearray([gf_log[gen[j]] for j in xrange(len(gen))])

    # Alternative 1
    jlist = range(1, len(gen))
    for k in xrange(1, len(msg_out)):
        for x in xrange(max(k-len(msg_in),0), len(gen)-1):
            if k-x-1 < 0: break
            msg_out[k] ^=  gf_exp[msg_out[k-x-1] + lgen[jlist[x]]]

    # Recopy the original message bytes
    msg_out[:len(msg_in)] = msg_in
    return msg_out

# Alternative 2: a rewrite of alternative 1 with generators and reduce
def rsenc_alt2(msg_in, nsym, gen):
    msg_in = bytearray(msg_in)
    msg_out = bytearray(msg_in) + bytearray(len(gen)-1)
    lgen = bytearray([gf_log[gen[j]] for j in xrange(len(gen))])

    # Alternative 1
    jlist = range(1, len(gen))
    for k in xrange(1, len(msg_out)):
        items_gen = ( gf_exp[msg_out[k-x-1] + lgen[jlist[x]]] if k-x-1 >= 0 else next(iter(())) for x in xrange(max(k-len(msg_in),0), len(gen)-1) )
        msg_out[k] ^= reduce(xor, items_gen)

    # Recopy the original message bytes
    msg_out[:len(msg_in)] = msg_in
    return msg_out

# Alternative with Numpy
def rsenc_numpy(msg_in, nsym, gen):
    msg_in = np.array(bytearray(msg_in))
    msg_out = np.pad(msg_in, (0, nsym), 'constant')
    lgen = np_gf_log[gen]
    for i in xrange(msg_in.size):
        msg_out[i+1:i+lgen.size] ^= np_gf_exp[np.add(lgen[1:], msg_out[i])]

    msg_out[:len(msg_in)] = msg_in
    return msg_out

gf_mul_arr = [bytearray(256) for _ in xrange(256)]
gf_add_arr = [bytearray(256) for _ in xrange(256)]

# Precompute multiplication and addition tables
def gf_precomp_tables(gf_exp=gf_exp, gf_log=gf_log):
    global gf_mul_arr, gf_add_arr
    for i in xrange(256):
        for j in xrange(256):
            gf_mul_arr[i][j] = gf_exp[gf_log[i] + gf_log[j]]
            gf_add_arr[i][j] = i ^ j
    return gf_mul_arr, gf_add_arr

# Alternative with precomputation of multiplication and addition tables, inspired by zfec: https://hackage.haskell.org/package/fec-0.1.1/src/zfec/fec.c
def rsenc_precomp(msg_in, nsym, gen=None):
    msg_in = bytearray(msg_in)
    msg_out = bytearray(msg_in) + bytearray(len(gen)-1)

    for i in xrange(len(msg_in)): # [i for i in xrange(len(msg_in)) if msg_in[i] != 0]
        coef = msg_out[i]
        if coef != 0:  # coef 0 is normally undefined so we manage it manually here (and it also serves as an optimization btw)
            mula = gf_mul_arr[coef]
            for j in xrange(1, len(gen)): # optimization: can skip g0 because the first coefficient of the generator is always 1! (that's why we start at position 1)
                #msg_out[i + j] = gf_add_arr[msg_out[i+j]][gf_mul_arr[coef][gen[j]]] # slower...
                #msg_out[i + j] ^= gf_mul_arr[coef][gen[j]] # faster
                msg_out[i + j] ^= mula[gen[j]] # fastest

    # Recopy the original message bytes
    msg_out[:len(msg_in)] = msg_in # equivalent to c = mprime - b, where mprime is msg_in padded with [0]*nsym
    return msg_out

def randstr(n, size):
    '''Generate very fastly a random hexadecimal string. Kudos to jcdryer http://stackoverflow.com/users/131084/jcdyer'''
    hexstr = '%0'+str(size)+'x'
    for _ in xrange(n):
        yield hexstr % random.randrange(16**size)

# Simple test case
if __name__ == "__main__":

    # Setup functions to test
    funcs = [rsenc, rsenc_precomp, rsenc_alt1, rsenc_alt2]
    if numpy_enabled: funcs.append(rsenc_numpy)
    gf_precomp_tables()

    # Setup RS vars
    n = 255
    k = 213

    import time

    # Init the generator polynomial
    g = rs_generator_poly_base3(n)

    # Init the ground truth
    mes = 'hello world'
    mesecc_correct = rsenc(mes, n-11, g[k])

    # Test the functions
    for func in funcs:
        # Sanity check
        if func(mes, n-11, g[k]) != mesecc_correct: print func.__name__, ": output is incorrect!"
        # Time the function
        total_time = 0
        for m in randstr(1000, n):
            start = time.clock()
            func(m, n-k, g[k])
            total_time += time.clock() - start
        print func.__name__, ": total time elapsed %f seconds." % total_time

这是我机器上的结果:

With PyPy:
rsenc : total time elapsed 0.108183 seconds.
rsenc_alt1 : output is incorrect!
rsenc_alt1 : total time elapsed 0.164084 seconds.
rsenc_alt2 : output is incorrect!
rsenc_alt2 : total time elapsed 0.557697 seconds.

Without PyPy:
rsenc : total time elapsed 3.518857 seconds.
rsenc_alt1 : output is incorrect!
rsenc_alt1 : total time elapsed 5.630897 seconds.
rsenc_alt2 : output is incorrect!
rsenc_alt2 : total time elapsed 6.100434 seconds.
rsenc_numpy : output is incorrect!
rsenc_numpy : total time elapsed 1.631373 seconds

(注意:替代方案应该是正确的，某些索引必须有点偏离，但由于它们的速度较慢，所以我没有尝试修复它们)

/UPDATE 和赏金目标:我发现了一个非常有趣的优化技巧，它有望大大加快计算速度:to precompute the multiplication table .我用新函数 rsenc_precomp() 更新了上面的代码。但是，在我的实现中根本没有任何收获，它甚至有点慢:

rsenc : total time elapsed 0.107170 seconds.
rsenc_precomp : total time elapsed 0.108788 seconds.

数组查找的成本怎么会比加法或异或之类的操作成本高？为什么它在 ZFEC 中有效，而在 Python 中无效？

我会将赏金归功于谁能向我展示如何使这种乘法/加法查找表优化工作(比异或和加法运算更快)，或者谁能通过引用或分析向我解释为什么这种优化在这里不起作用(使用 Python/PyPy/Cython/Numpy 等。我都试过了)。

最佳答案

以下在我的机器上比 pypy 快 3 倍(0.04 秒对 0.15 秒)。使用 Cython:

ctypedef unsigned char uint8_t # does not work with Microsoft's C Compiler: from libc.stdint cimport uint8_t
cimport cpython.array as array

cdef uint8_t[::1] gf_exp = bytearray([1, 3, 5, 15, 17, 51, 85, 255, 26, 46, 114, 150, 161, 248, 19,
   lots of numbers omitted for space reasons
   ...])

cdef uint8_t[::1] gf_log = bytearray([0, 0, 25, 1, 50, 2, 26, 198, 75, 199, 27, 104, 
    more numbers omitted for space reasons
    ...])

import cython

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.initializedcheck(False)
def rsenc(msg_in_r, nsym, gen_t):
    '''Reed-Solomon encoding using polynomial division, better explained at http://research.swtch.com/field'''

    cdef uint8_t[::1] msg_in = bytearray(msg_in_r) # have to copy, unfortunately - can't make a memory view from a read only object
    cdef int[::1] gen = array.array('i',gen_t) # convert list to array

    cdef uint8_t[::1] msg_out = bytearray(msg_in) + bytearray(len(gen)-1)
    cdef int j
    cdef uint8_t[::1] lgen = bytearray(gen.shape[0])
    for j in xrange(gen.shape[0]):
        lgen[j] = gf_log[gen[j]]

    cdef uint8_t coef,lcoef

    cdef int i
    for i in xrange(msg_in.shape[0]):
        coef = msg_out[i]
        if coef != 0: # coef 0 is normally undefined so we manage it manually here (and it also serves as an optimization btw)
            lcoef = gf_log[coef] # precaching

            for j in xrange(1, gen.shape[0]): # optimization: can skip g0 because the first coefficient of the generator is always 1! (that's why we start at position 1)
                msg_out[i + j] ^= gf_exp[lcoef + lgen[j]] # equivalent (in Galois Field 2^8) to msg_out[i+j] -= msg_out[i] * gen[j]

    # Recopy the original message bytes
    msg_out[:msg_in.shape[0]] = msg_in
    return msg_out

这只是您最快的静态类型版本(并从 cython -a 检查 html，直到循环未以黄色突出显示)。

一些简短的说明:

Cython 更喜欢 x.shape[0] 而不是 len(shape)
将内存 View 定义为 [::1] 保证它们在内存中是连续的，这有帮助
initializedcheck(False) 是避免对全局定义的 gf_exp 和 gf_log 进行大量存在性检查所必需的。 (你可能会发现你可以通过为这些创建一个局部变量引用并使用它来加速你的基本 Python/PyPy 代码)
我不得不复制几个输入参数。 Cython 无法从只读对象(在本例中为 msg_in，一个字符串。我可能只是将其设为 char*)。 gen(一个列表)也需要有快速元素访问的东西。

除此之外，一切都相当简单。 (我没有尝试过它的任何变体以使其更快)。 PyPy 的表现给我留下了深刻的印象。

关于python - 优化里德-所罗门编码器(多项式除法)，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/30363903/

python - 优化里德-所罗门编码器(多项式除法)

上一篇：python - "InvalidRequirement: Invalid requirement, parse error"更新python包后出错

下一篇：python - 在多个程序中正确使用 Scikit 的 LabelEncoder