python - 在 .so 文件中使用 C 模块时出现段错误

标签 python c segmentation-fault

在 Linux 下的命令行中运行 python 脚本时,我遇到了段错误(可能是由长循环引起的)。我确切地知道问题出在哪里,但不知道为什么。我已经尝试了网上搜索的一些方法,包括这个网站,但仍然无法解决。所以,请帮助我——提前谢谢你。下面是一些代码:

0

analyzer.py,程序开始的位置:

from classify import BayesClassifier
class Analyzer:
    """Entry-point helper that owns a BayesClassifier instance."""

    def __init__(self):
        """Create the classifier from the hard-coded training table path."""
        training_table = '/home/user/yakamoz/srcanalyzer/classification/training.tab'
        self.classify = BayesClassifier(training_table)

# Script entry point: build the analyzer (its constructor loads the training
# table and trains the classifier), then classify one sample paragraph.
if __name__ == '__main__':
    a = Analyzer()
    # The sample below is a paragraph of Chinese text.  According to the
    # author its content is not what triggers the segmentation fault; it can
    # be thought of as any ordinary paragraph of prose.

    text = "市委常委、纪委书记杨娟高度评价我县基层党风廉政建设:\
       务实创新成效显著作者:县纪委办公室发布时间:11月27日下午,\
       市委常委、纪委书记杨娟率领市纪委副书记蒋玉平、王友富,\
       市纪委常委、秘书长任斌,市纪委机关党委书记刘林建一行来我\
       县调研基层党风廉政建设。调研中,杨娟高度评价我县基层党风廉政建设,\
       认为工作务实创新,成效显著。县委书记陈朝先,县委副书记季代双,县委常委、\
       纪委书记韩忠明陪同调研。杨娟一行先后来到两河镇、西部花都、两江广场、\
       工业园区等地实地调研我县基层党风廉政建设,检阅我县“两化”互动、“三化”\
       联动发展成果。查阅相关资料在两河镇,杨娟认真听取了两河片区纪工委\
       日常工作开展情况的汇报,仔细翻阅了巡查工作日记和接访记录。杨娟指出,\
       设置乡镇片区纪工委是加强基层纪检组织建设的创新举措。\
       盐亭在全市率先设置、运行纪工委以来,在化解农村信访矛盾,理顺群众情绪,\
       强化基层办案工作等方面取得了明显成效。她要求,要总结提炼片区纪工委的经验,\
       进一步明确职能职责,在机构设置、人员配备、制度建设等方面进行探索实践,\
       为全市基层纪检组织建设提供有益经验借鉴。杨娟还饶有兴趣地参观了两河镇\
       的机关廉政文化建设"

    # classify_text returns a (class, score) pair; only the class is printed.
    print str(a.classify.classify_text(text)[0])

1

classify.py;该文件由上面介绍的 analyzer.py 使用:

# -*- coding:utf-8 -*-
from match import WordMatch
import cPickle
import math

class BayesClassifier:
    """Text classifier trained from a pickled table of word-count vectors.

    The training table is a sized iterable of ``(vector, cls)`` pairs, where
    ``vector`` maps a word to its count in one document and ``cls`` is that
    document's class label.
    """

    def __init__(self, table_name):
        """Load the pickled training table and train on every entry.

        :param table_name: path of the pickled training table.
        """
        # Per-instance accumulators.  They used to be class attributes, which
        # made every classifier in the process share (and double-count into)
        # the same dictionaries.
        self.__trainingdata = {}         # cls -> {word: summed count}
        self.__classifywordscount = {}   # cls -> total word count
        self.__classifydoccount = {}     # cls -> number of training documents

        # NOTE: unpickling can execute arbitrary code -- only load tables
        # from a trusted source.  ``with`` guarantees the file is closed.
        with open(table_name, 'r') as table_file:
            self.trainingtable = cPickle.load(table_file)
        for vector, cls in ((x[0], x[1]) for x in self.trainingtable):
            self.train(cls, vector)
        print('training finished')
        self.matrix = self.get_matrix()
        self.vector_count = len(self.matrix)         # distinct training words
        self.doc_count = len(self.trainingtable)     # training documents
        self.match = WordMatch(self.matrix)

    def get_matrix(self):
        """Return ``{word: 0}`` for every word occurring in the training table."""
        matrix = {}
        for x in self.trainingtable:
            for k in x[0]:
                matrix[k] = 0
        return matrix

    def doc_to_vector(self, content):
        """Return a ``{keyword: occurrences}`` dict for the text *content*."""
        return dict(self.match.find(content))

    def train(self, cls, vector):
        """Add one document's word counts to the statistics of class *cls*.

        :param cls: class label of the document.
        :param vector: mapping of word -> count for the document.
        """
        words_of_class = self.__trainingdata.setdefault(cls, {})
        self.__classifywordscount.setdefault(cls, 0)
        self.__classifydoccount[cls] = self.__classifydoccount.get(cls, 0) + 1

        for word, count in vector.items():
            self.__classifywordscount[cls] += count
            words_of_class[word] = words_of_class.get(word, 0) + count

    def classify_text(self, content):
        """Return ``(cls, score)`` for the best-scoring class of *content*.

        The score of each class comes from ``self.__count`` (not shown in
        this excerpt).  If no class scores above the ``-1 << 32`` floor the
        result is ``("unknown classification", -1 << 32)``.
        """
        t = -1 << 32
        res = "unknown classification"
        for cls in self.__trainingdata.keys():
            # NOTE(review): doc_to_vector(content) does not depend on cls and
            # could be computed once before the loop, provided __count does
            # not mutate the vector it receives -- confirm before hoisting.
            prob = self.__count(cls, self.doc_to_vector(content))
            if prob > t:
                res = cls
                t = prob
        return res, t

2

match.py;该文件由上面的 classify.py 引用:
# -*- coding:utf-8 -*-
import os
import re
import util.ahocorasick.x64 as ahocorasick
# util.ahocorasick.x64 is a folder where .so file locates

class WordMatch(object):
    """Keyword matcher built on top of ``ahocorasick.KeywordTree``."""

    def __init__(self, arg):
        """Fill the keyword tree from *arg* and finalize it with ``make()``.

        :param arg: either a list/dict whose non-empty items are the keywords
            (iterating a dict yields its keys), or the path of a text file
            holding one keyword per line.

        For a missing file or an unsupported argument type a message is
        printed and the constructor returns without calling ``make()``.
        """
        self.__tree = ahocorasick.KeywordTree()
        if isinstance(arg, (list, dict)):
            for item in arg:
                if item:
                    self.__tree.add(item)
        elif isinstance(arg, basestring):
            if not os.path.isfile(arg):
                # NOTE(review): the tree is left un-made on this path; later
                # searches on it may misbehave -- consider raising instead.
                print('the path of the input file does not exist')
                return
            # ``with`` closes the file even if adding a keyword raises.
            with open(arg) as fp:
                for line in fp:
                    line = line.strip()
                    if line:
                        self.__tree.add(line)
        else:
            # NOTE(review): same un-made tree caveat as above.
            print('parameter fault')
            return
        self.__tree.make()

    def _findall(self, content):
        """Return the keywords found in *content*, in the order matched.

        Non-string input prints a message and produces an empty list.
        """
        if not isinstance(content, basestring):
            print('AC automation requires string ')
            return []
        hit_list = []
        for start, end in self.__tree.findall(content):
            hit = content[start:end]   # slice once, skip empty matches
            if hit:
                hit_list.append(hit)
        return hit_list

    def find(self, content):
        """Return a ``{keyword: occurrence count}`` dict for *content*."""
        counts = {}
        for item in self._findall(content):
            counts[item] = counts.get(item, 0) + 1
        return counts

3

__init__.py,在util.ahocorasick.x64文件夹下:

import _ahocorasick

__all__ = ['KeywordTree']


# A high level version of the keyword tree.  Most of the methods here
# are just delegated over to the underlying C KeywordTree 
#(in the .so file, which is not shown here).


class KeywordTree(object):
    """Thin Python wrapper around the C-level ``_ahocorasick.KeywordTree``."""

    def __init__(self):
        """Create the underlying C keyword tree."""
        self.__tree = _ahocorasick.KeywordTree();


    def add(self, s):
        """Delegate to the C tree's ``add`` with keyword *s*."""
        return self.__tree.add(s)


    def make(self):
        """Delegate to the C tree's ``make``."""
        return self.__tree.make()


    def zerostate(self):
        """Return the C tree's ``zerostate()`` (used below as the start state)."""
        return self.__tree.zerostate()

    ##### !! I found this is where the segmentation fault occurs

    def __findall_helper(self, sourceBlock, allow_overlaps, search_function):
        """Generator holding the common logic behind the findall methods.

        Repeatedly calls ``search_function(sourceBlock, startpos, startstate)``
        and yields ``match[0:2]`` for each truthy result; a falsy result ends
        the iteration.  The next search starts at position ``match[1]``.  With
        *allow_overlaps* it continues from state ``match[2]``; otherwise the
        state is reset to ``self.zerostate()``.
        """
        startpos = 0
        startstate = self.zerostate()
        loop_times = 0            

        while True:
            #print spot_1
            match = search_function(sourceBlock, startpos, startstate)
            #print spot_2
            if not match:
                break
            yield match[0:2]
            startpos = match[1]
            if allow_overlaps: #which in my case is always false
                startstate = match[2]
            else:
                # debug counter only; it is not read anywhere but the
                # commented-out print below
                loop_times = loop_times + 1
                #print spot_3
                startstate = self.zerostate()
                #print spot_4
                #print loop_times

    def findall(self, sourceBlock, allow_overlaps=0):
        """Return a generator over the matches in *sourceBlock*, searching
        with the C tree's ``search`` method."""
        return self.__findall_helper(sourceBlock, allow_overlaps,self.__tree.search)

我对得到的不同结果感到困惑:我发现问题出在第 3 部分的 __init__.py 中,更确切地说,出在 __findall_helper(self, sourceBlock, allow_overlaps, search_function) 里。

通过取消注释以下注释之一:

#print spot_1
#print spot_2
#print spot_4

就可以消除段错误,并且循环是有限的(match 可以为 None);但如果取消注释的是 #print spot_3,则不行(看起来像是无限循环)。我的问题来了:

print 语句在 Python 中是否有副作用?我只发现,在上面提到的三个位置(spot_1、spot_2、spot_4)之一加上 print 语句就可以消除段错误。顺便说一下,这是我偶然发现的,一开始代码里并没有 print。

4

这是使用 gdb回溯

(gdb) r analyzer.py

Starting program: /usr/local/bin/python analyzer.py
[Thread debugging using libthread_db enabled]
Detaching after fork from child process 11499.
training finished

Program received signal SIGSEGV, Segmentation fault.
0x00007ffff178956d in ahocorasick_KeywordTree_search_helper (state=0x85c730, 
string=0x8967d4 "【中国环保在线 市场行情】“我国将在2016年启动全国碳市场。全国碳交    易市场的首批行业企业将由电力、冶金、有色、建材、化工5个传统制造业和航", <incomplete     sequence \347\251>..., n=140733193395828, startpos=118366835, 
out_start=0x7fffffffd468, out_end=0x7fffffffd460,       out_last_state=0x7fffffffd458) at aho-corasick.c:216
216 aho-corasick.c: No such file or directory.
in aho-corasick.c
Missing separate debuginfos, use: debuginfo-install glibc-2.12-    1.149.el6.x86_64

(gdb) bt

#0  0x00007ffff178956d in ahocorasick_KeywordTree_search_helper     (state=0x85c730, string=0x8967d4 "【中国环保在线 市场行情】“我国将在2016年启动全国碳市场。全国碳交     易市场的首批行业企业将由电力、冶金、有色、建材、化工5个传统制造业和航", <incomplete sequence \347\251>..., n=140733193395828, startpos=118366835, out_start=0x7fffffffd468, out_end=0x7fffffffd460,   out_last_state=0x7fffffffd458) at aho-corasick.c:216
#1  0x00007ffff178a2b1 in ahocorasick_KeywordTree_basesearch     (self=0x7ffff7f6c230, args=0x7ffff0ca1a50, kwargs=0x0, helper=0x7ffff1789525<ahocorasick_KeywordTree_search_helper>) at     py_wrapper.c:190
#2  0x00007ffff178a358 in ahocorasick_KeywordTree_search  (self=0x7ffff7f6c230, args=0x7ffff0ca1a50, kwargs=0x0) at py_wrapper.c:212
#3  0x00000000004a7103 in call_function (f=<value optimized out>, throwflag=<value optimized out>) at Python/ceval.c:4013
#4  PyEval_EvalFrameEx (f=<value optimized out>, throwflag=<value optimized out>) at Python/ceval.c:2666
#5  0x0000000000507e8d in gen_send_ex (gen=0x7904640, arg=0x0, exc=<value optimized out>) at Objects/genobject.c:84
#6  0x00000000004a25da in PyEval_EvalFrameEx (f=<value optimized out>, throwflag=<value optimized out>) at Python/ceval.c:2497
#7  0x00000000004a805b in fast_function (f=<value optimized out>, throwflag=<value optimized out>) at Python/ceval.c:4099
#8  call_function (f=<value optimized out>, throwflag=<value optimized out>) at Python/ceval.c:4034
#9  PyEval_EvalFrameEx (f=<value optimized out>, throwflag=<value optimized out>) at Python/ceval.c:2666
#10 0x00000000004a8bd7 in PyEval_EvalCodeEx (co=0x7ffff1ff54b0, globals=<value optimized out>, locals=<value optimized out>, args=<value optimized out>, argcount=3, kws=0x9984520, kwcount=0, defs=0x7ffff2016968, defcount=1, closure=0x0) at Python/ceval.c:3253
#11 0x00000000004a6dce in fast_function (f=<value optimized out>, throwflag=<value optimized out>) at Python/ceval.c:4109
#12 call_function (f=<value optimized out>, throwflag=<value optimized out>) at Python/ceval.c:4034
#13 PyEval_EvalFrameEx (f=<value optimized out>, throwflag=<value optimized out>) at Python/ceval.c:2666
#14 0x00000000004a805b in fast_function (f=<value optimized out>, throwflag=<value optimized out>) at Python/ceval.c:4099
#15 call_function (f=<value optimized out>, throwflag=<value optimized out>) at Python/ceval.c:4034
#16 PyEval_EvalFrameEx (f=<value optimized out>, throwflag=<value optimized out>) at Python/ceval.c:2666
#17 0x00000000004a8bd7 in PyEval_EvalCodeEx (co=0x7ffff7ec4130, globals=<value optimized out>, locals=<value optimized out>, args=<value optimized out>, argcount=0, kws=0x0, kwcount=0, defs=0x0, defcount=0, closure=0x0) at Python/ceval.c:3253
#18 0x00000000004a8ce2 in PyEval_EvalCode (co=<value optimized out>, globals=<value optimized out>, locals=<value optimized out>) at Python/ceval.c:667
#19 0x00000000004c91fe in run_mod (fp=0x880ee0, filename=0x7fffffffe30c "analyzer.py", start=<value optimized out>, globals=0x7fc140, locals=0x7fc140, closeit=1, flags=0x7fffffffdea0) at Python/pythonrun.c:1346
#20 PyRun_FileExFlags (fp=0x880ee0, filename=0x7fffffffe30c "analyzer.py", start=<value optimized out>, globals=0x7fc140, locals=0x7fc140, closeit=1,flags=0x7fffffffdea0) at Python/pythonrun.c:1332
#21 0x00000000004c9414 in PyRun_SimpleFileExFlags (fp=0x880ee0, filename=0x7fffffffe30c "analyzer.py", closeit=1, flags=0x7fffffffdea0)at Python/pythonrun.c:936
#22 0x0000000000414a4f in Py_Main (argc=<value optimized out>, argv=<value optimized out>) at Modules/main.c:599
#23 0x0000003fd281ed5d in __libc_start_main () from /lib64/libc.so.6
#24 0x0000000000413bc9 in _start ()

最佳答案

我看到:

  self.__tree = _ahocorasick.KeywordTree();

然后

self.__tree.zerostate()

最后

return self.__findall_helper(sourceBlock, allow_overlaps,self.__tree.search_long)

所以我的猜测是,当您执行 __tree.zerostate() 时,函数 search_long 会失效,因此您会得到未定义的行为,在某些情况下会导致段错误。它有很多代码,并且有一个不透明的库,因此很难确定。最好的办法是查看文档并确保您正确使用该库。

print 只是一个干扰项(red herring):它因为分配了一些内存,只是让崩溃更早发生而已。

希望有帮助。

关于python - 在 .so 文件中使用 C 模块时出现段错误,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/30098630/

相关文章:

python - 从命令行python获取参数

python - 如何在脚本中列出与导入对应的 PyPI 包的名称?

python - 在 Python 中使用 Yandex 翻译 API 时出现问题

c - Printf 不是 c 语言的一部分。从 stdio.h 执行 PRINTF 的代码时实际发生了什么?

c++ - std::ifstream 读取大数的错误大小

python - 将浮点范围放入容器中

c++ - 如何获取自动(局部)变量相对于堆栈帧的偏移量( __builtin_frame_address )

c - 总线错误: 10 in data generation in C

c - pthread_create 之后的段错误 (

c++ - 为什么我的代码在 Windows 7 上不会出现段错误?