Python PDFMiner - PDF 转 CSV

标签 python csv pdf pdfminer

我希望能够将 PDF 转换为 CSV 文件并找到了几个有用的脚本,但是作为 Python 的新手,我有一个问题:

应该在哪里指定 PDF 的文件路径,以及要输出到的 CSV 的文件路径?

我正在使用 Python 2.7.11 和 PDFMiner 20140328。

import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO

def pdfparser(data):
    """Extract the text of a PDF file and print it to standard output.

    Args:
        data: Path of the PDF file to read.

    Raises:
        IOError: If the file at ``data`` cannot be opened.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    # The converter is handed the in-memory buffer ``retstr`` as its output.
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The with-block releases the PDF handle even if a page fails to parse.
    with open(data, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)

    # Read the buffer once, after every page has been processed, and keep it
    # in its own name so the ``data`` path argument is not overwritten.
    text = retstr.getvalue()

    # Single-argument print(...) behaves the same on Python 2.7 and 3.
    print(text)

if __name__ == '__main__':
    # The PDF path is taken from the first command-line argument,
    # e.g. ``python script.py input.pdf``.
    if len(sys.argv) < 2:
        sys.exit('usage: %s <file.pdf>' % sys.argv[0])
    pdfparser(sys.argv[1])

最佳答案

以下代码修改自 tgray 撰写的这个 Stack Overflow 回答:

def pdf_to_csv(filename, separator, threshold):
    """Convert the text of a PDF file into separator-delimited lines.

    Characters on each page are grouped into rows by their vertical
    position; within a row, ``separator`` is inserted wherever the gap
    between two neighbouring characters is larger than ``threshold`` times
    the row's average gap.

    Args:
        filename: Path of the PDF file to read.
        separator: String inserted between detected columns.
        threshold: Multiplier applied to the average character distance of a
            row; larger gaps start a new column.

    Returns:
        The converted text. Every page is wrapped in ``START PAGE n`` /
        ``END PAGE n`` marker lines, where ``n`` is the zero-based page index.

    Raises:
        IOError: If ``filename`` cannot be opened.
    """
    from cStringIO import StringIO
    from collections import defaultdict
    from pdfminer.converter import LTChar, TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage

    class CsvConverter(TextConverter):
        """TextConverter that writes one separator-delimited line per row."""

        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)
            # Both values come from the enclosing pdf_to_csv() call.
            self.separator = separator
            self.threshold = threshold

        def end_page(self, i):
            """Write the characters of the current page, one row per line."""
            # Maps a row key to {x position: encoded character}.
            lines = defaultdict(dict)
            for child in self.cur_item._objs:
                if isinstance(child, LTChar):
                    # Only the last two bbox values are used (presumably the
                    # right and top edge of the glyph -- confirm against the
                    # pdfminer layout docs).
                    (_, _, x, y) = child.bbox
                    # Negating y makes the ascending sort below visit larger
                    # y values first; int() merges characters whose y values
                    # truncate to the same integer into one row.
                    lines[int(-y)][x] = child._text.encode(self.codec)
            for y in sorted(lines):
                # Build each line exactly once and write it out.
                self.outfp.write(self.line_creator(lines[y]))
                self.outfp.write("\n")

        def line_creator(self, line):
            """Join the characters of one row, inserting column separators.

            Args:
                line: Non-empty dict mapping x position to encoded character.

            Returns:
                The row as a single string.
            """
            keys = sorted(line.keys())
            # Distances between each pair of neighbouring characters.
            gaps = [right - left for left, right in zip(keys, keys[1:])]
            # Calculate the average distance between the characters on this row.
            # NOTE(review): this divides by the number of characters, not the
            # number of gaps; kept as-is so existing threshold values keep
            # producing the same output.
            average_distance = sum(gaps) / len(keys)
            # Start the result with the first character.
            result = [line[keys[0]]]
            for key, gap in zip(keys[1:], gaps):
                # A gap larger than average * threshold starts a new column.
                if gap > average_distance * self.threshold:
                    result.append(self.separator)
                result.append(line[key])
            return ''.join(result)

    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    # because my test documents are utf-8 (note: utf-8 is the default codec)
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrc, device)

    try:
        # The with-block closes the PDF even when a page fails to parse.
        with open(filename, 'rb') as fp:
            for i, page in enumerate(PDFPage.get_pages(fp)):
                outfp.write("START PAGE %d\n" % i)
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
    finally:
        device.close()

    return outfp.getvalue()


if __name__ == '__main__':
    import sys

    # the separator to use with the CSV
    separator = ';'
    # the distance multiplier after which a character is considered part of a
    # new word/column/block. Usually 1.5 works quite well
    threshold = 1.5

    # Usage: python script.py [input.pdf [output.csv]]
    # Without arguments the original defaults apply: read 'myLovelyFile.pdf'
    # and print the result to standard output.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else 'myLovelyFile.pdf'
    csv_text = pdf_to_csv(pdf_path, separator, threshold)

    if len(sys.argv) > 2:
        # Binary mode: the converter already encoded the characters.
        with open(sys.argv[2], 'wb') as csv_file:
            csv_file.write(csv_text)
    else:
        print(csv_text)

链接中的答案与这个答案之间的主要区别是 line_creator 方法,它试图从 PDF 中提取一些结构。

应该适用于 PDFMiner 20140328。

关于 Python PDFMiner - PDF 转 CSV,我们在 Stack Overflow 上找到一个类似的问题: https://stackoverflow.com/questions/36902496/

相关文章:

python - Anaconda 3 Python 命令似乎已损坏

python - 动态创建的类方法在运行时可以知道它们的 'created' 名称吗?

python - 你能写一个适用于生成器函数和普通函数并匹配它们的返回样式的 python 装饰器吗?

Java:CSV 文件易于读/写

VBA 在 PDF XChange Viewer 中打开文件导出的 Excel 文件

python - 在 python,numpy 中保存两列

mysql - 如何将 MySQL 的查询结果以格式化的 CSV 格式导出到标准输出?

python - 从CSV移除最后一行

java - 如何将 PDF 转换为 JSON/EXCEL/WORD 文件?

java - 使用 pdfbox 2.0.8 时 block 类型无效