我刚开始使用 Cython。我尝试把下面这个 Python 函数移植到 Cython 以提高速度,结果现在反而更慢了。最初的移植版本确实快了一点,但由于字符串连接和内存分配占了约 80% 的处理时间,我试图优化这两部分,结果情况变得更糟 :)。显然,我选了最慢的字符串连接和内存分配方案。
我应该使用 malloc 吗?在 Cython 中,多个 strcat 是一个非常糟糕的选择吗?什么是更好的选择?
PS:我没有沿用我在 Python 版本中的优化(先把各个字符串片段放进一个中间列表,最后再一次性连接成字符串),因为我找不到在 Cython 中于末尾进行这种连接的正确方法。
新的 Cython 代码(更新:用 sprintf 替换 strcat,跟踪字符串大小以避免重新计算):
#
# Input: takes an array of strings (typically extracted from a CSV row), plus
# various options that control how the output is formatted.
#
# It then builds a string made of the different part of this array. Those data are used for
# classification (target class, tag, different float/int data, different categorical data).
#
# Output: it returns a string compliant with the Vowpal Wabbit input format.
#
# Sample input:
# array_to_vw(["No", "10990", "32", "64", "28", "dog", "food", "jedi"], False, 0, [], [2, 3, 4], 1, [0, 1],
# re.compile('Y(es)?|T(rue)?|\+?1'), re.compile('No?|F(alse)?|0|-1'), re.compile('^(\-)?[0-9.]*$'))
# output:
# "-1 '10990 |i f2:32 f3:64 f4:28 | dog food jedi"
#
# note: the regexes are normally compiled once and stored, rather than being recompiled on every call to this function
#
def array_to_vw(data, train=False, int category_index=0, categorical_colnums=[], numerical_colnums=[], int tag_index=-1, skipped_idx=[0], cpregex=None, cnregex=None, cfloatregex=None):
    """Format one record as a Vowpal Wabbit input line using fixed C buffers.

    Args:
        data: sequence of strings for one record; each used item must be
            coercible to ``char*``.
        train: when true, a category matching neither regex defaults to '1'.
        category_index: index in ``data`` of the class label.
        categorical_colnums: column indexes always emitted as bare tokens.
        numerical_colnums: column indexes always emitted as ``f<idx>:<value>``.
        tag_index: index in ``data`` of the example tag, or -1 for no tag.
        skipped_idx: column indexes left out of the feature list.
        cpregex: compiled regex recognising a positive label. Required:
            ``.match`` is called on it unconditionally.
        cnregex: compiled regex recognising a negative label. Required once
            ``cpregex`` fails to match.
        cfloatregex: compiled regex recognising a numeric value. Required for
            any column listed in neither ``categorical_colnums`` nor
            ``numerical_colnums``.

    Returns:
        The formatted line, terminated by a newline, copied out of the
        internal ``char`` buffer.

    Raises:
        ValueError: if a value or the assembled line does not fit in the
            fixed-size buffers (19 characters per value, 499 for the
            categorical section, 999 for the whole line).
        SystemExit: if the label matches neither regex and ``train`` is false.
    """
    cdef char[20] category
    cdef char[1000] outline_array
    cdef char[500] categorical_array
    cdef char[20] col
    cdef char[20] tag
    # Scratch buffer: each feature is formatted here first so its exact length
    # is known before it is appended to one of the larger buffers.
    cdef char[48] piece
    cdef int piece_len = 0
    cdef int outline_array_len = 0
    cdef int categorical_array_len = 0
    cdef int colnum = 0
    cdef bint is_numeric = False

    label = data[category_index]
    # strcpy() does no bounds checking, so reject anything that cannot fit
    # (the buffer needs one extra byte for the terminating NUL).
    if len(label) >= <int>sizeof(category):
        raise ValueError("category value too long for buffer: %r" % (label,))
    strcpy(category, label)
    categorical_array_len = sprintf(categorical_array, "| ")

    if cpregex.match(category):  # regex for positive category
        strcpy(category, '1')
    elif cnregex.match(category):  # regex for negative category
        strcpy(category, '-1')
    elif train:  # if no valid class but in train mode, set default positive value
        strcpy(category, '1')
    else:
        sys.exit("Category's regex did not match a record:\n" + category)

    # format the beginning of the string output (change if a tag is specified)
    if tag_index > -1:
        tag_value = data[tag_index]
        if len(tag_value) >= <int>sizeof(tag):
            raise ValueError("tag value too long for buffer: %r" % (tag_value,))
        strcpy(tag, tag_value)
        outline_array_len = sprintf(outline_array, "%s '%s |i ", category, tag)
    else:
        outline_array_len = sprintf(outline_array, "%s |i ", category)

    for colnum in range(len(data)):
        field = data[colnum]
        if not field or colnum in skipped_idx:
            continue
        if len(field) >= <int>sizeof(col):
            raise ValueError("column %d value too long for buffer: %r" % (colnum, field))
        # Copy the value verbatim. It must never be used as a printf format
        # string, otherwise a '%' in the data is undefined behaviour.
        strcpy(col, field)

        if colnum in categorical_colnums:
            is_numeric = False
        elif colnum in numerical_colnums:
            is_numeric = True
        else:
            # Unlisted column: a value that looks like a number gets a label,
            # anything else is passed through for vw to handle directly.
            is_numeric = cfloatregex.match(field) is not None

        if is_numeric:
            piece_len = sprintf(piece, "f%d:%s ", colnum, col)
            if outline_array_len + piece_len >= <int>sizeof(outline_array):
                raise ValueError("output line exceeds the outline buffer")
            strcpy(outline_array + outline_array_len, piece)
            outline_array_len += piece_len
        else:
            piece_len = sprintf(piece, "%s ", col)
            if categorical_array_len + piece_len >= <int>sizeof(categorical_array):
                raise ValueError("categorical features exceed the categorical buffer")
            strcpy(categorical_array + categorical_array_len, piece)
            categorical_array_len += piece_len

    # The categorical section starts as "| " (length 2); append it only when
    # at least one categorical feature was added.
    if categorical_array_len > 2:
        if outline_array_len + categorical_array_len + 1 >= <int>sizeof(outline_array):
            raise ValueError("output line exceeds the outline buffer")
        sprintf(outline_array + outline_array_len, "%s\n", categorical_array)
    else:
        if outline_array_len + 1 >= <int>sizeof(outline_array):
            raise ValueError("output line exceeds the outline buffer")
        strcpy(outline_array + outline_array_len, "\n")
    return outline_array
初始Python代码:
def array_to_vw(data, train=False, category_index=0, categorical_colnums=None, numerical_colnums=None, tag_index=-1, skipped_idx=None, cpregex=None, cnregex=None, cfloatregex=None):
    """Format one record as a Vowpal Wabbit input line.

    Args:
        data: sequence of strings for one record.
        train: when true, a category matching neither regex defaults to '1'.
        category_index: index in ``data`` of the class label.
        categorical_colnums: column indexes always emitted as bare tokens.
            A list passed by the caller is appended to whenever an unlisted
            column is detected as non-numeric, so reusing the same list
            across calls keeps that column categorical for later records.
            Defaults to a fresh empty list on every call.
        numerical_colnums: column indexes always emitted as ``f<idx>:<value>``.
            Defaults to an empty list.
        tag_index: index in ``data`` of the example tag, or -1 for no tag.
        skipped_idx: column indexes left out of the feature list.
            Defaults to ``[0]``.
        cpregex: compiled regex recognising a positive label.
        cnregex: compiled regex recognising a negative label.
        cfloatregex: compiled regex recognising a numeric value.

    Returns:
        The formatted line, terminated by a newline.

    Raises:
        SystemExit: if the label matches neither regex and ``train`` is false.
    """
    # Fresh containers per call: a mutable default would be shared between
    # calls, and categorical_colnums is appended to below.
    if categorical_colnums is None:
        categorical_colnums = []
    if numerical_colnums is None:
        numerical_colnums = []
    if skipped_idx is None:
        skipped_idx = [0]

    # providing pre-compiled regex to array_to_vw() really improves
    # performance if array_to_vw() is called many times
    if cfloatregex is None:
        cfloatregex = re.compile(r'^(\-)?([0-9.]+e(\+|-))?[0-9.]+$')
    if cpregex is None:
        cpregex = re.compile(r'Y(es)?|T(rue)?|\+?1')
    if cnregex is None:
        cnregex = re.compile(r'No?|F(alse)?|0|-1')

    category = data[category_index]
    if cpregex.match(category):  # regex for positive category
        category = '1'
    elif cnregex.match(category):  # regex for negative category
        category = '-1'
    elif train:  # if no valid class but in train mode, set default positive value
        category = '1'
    else:
        sys.exit("Regex did not match a record, exiting.\nPostive Regex: " + str(cpregex.pattern) + "\nNegative Regex: " + str(cnregex.pattern) + "\nRecord:\n" + str(data))

    if tag_index > -1:
        tag = data[tag_index]
        outline_array = [category, " '", tag, " |i "]
    else:
        outline_array = [category, "| "]

    categorical_array = ['| ']
    for colnum, col in enumerate(data):
        if not col or colnum in skipped_idx:
            continue
        if colnum in categorical_colnums:
            categorical_array.extend([col, ' '])
        elif colnum in numerical_colnums:
            outline_array.extend(['f', str(colnum), ':', col, ' '])
        elif cfloatregex.match(col):
            # If the feature is a number, then give it a label
            outline_array.extend(['f', str(colnum), ':', col, ' '])
        else:
            # If the feature is a string, then let vw handle it directly
            categorical_array.extend([col, ' '])
            # once a column is detected as a string/categorical it shouldn't
            # be processed differently later on
            categorical_colnums.append(colnum)

    outline_array.extend(categorical_array)
    outline_array.append("\n")
    return "".join(outline_array)
最佳答案
如果使用 NumPy 对操作进行矢量化,则可以大大加快速度。例如,以下是如何获取输入数据列表/数组并确定哪些索引是数字,而不使用(相对较慢的)正则表达式:
>>> data = ["No", "10990", "32", "64", "28", "dog", "food", "jeddy"]
>>> np.genfromtxt(np.array(data))
array([nan, 10990., 32., 64., 28., nan, nan, nan])
>>> np.isnan(np.genfromtxt(np.array(data)))
array([True, False, False, False, False, True, True, True], dtype=bool)
希望这能让您了解可以怎么做。这样至少可以完全省去正则表达式匹配——它很可能是目前代码中较慢的部分之一——而且不需要 Cython。
关于c++ - Cython 字符串管理和内存,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/25006280/