你好,我有一个包含 2 种语言(英语、希腊语)的 PDF 文件,我想用 Python 通过 OCR 提取其中的文本。到目前为止,我写了下面这段代码,但它只适用于一种语言(希腊语)。
如何对包含 2 种语言的 PDF 文件运行 OCR 提取?
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
from wand.image import Image
from PIL import Image as PI
import pyocr
import pyocr.builders
import io
def pdfparser(data, lang='eng+ell'):
    """OCR every page of a PDF, print the recognised text and return it.

    The PDF is opened with Wand at ``resolution=600``, converted to JPEG,
    and each page image is passed to the first OCR tool that pyocr reports
    as available.

    Args:
        data: Path of the PDF file (passed to Wand as ``filename``).
        lang: Language specification forwarded to the OCR tool. Tesseract
            accepts several languages joined with ``+``; the default
            ``'eng+ell'`` recognises English and Greek together and needs
            both ``eng.traineddata`` and ``ell.traineddata`` in the
            ``tessdata`` folder.

    Returns:
        A list with one recognised-text string per page, in page order.

    Raises:
        IndexError: If pyocr finds no OCR tool.
    """
    tools = pyocr.get_available_tools()
    if not tools:
        # Same exception type the bare ``[0]`` lookup used to raise, but
        # with a message that says what is actually wrong.
        raise IndexError('no OCR tool found by pyocr (is Tesseract installed?)')
    tool = tools[0]

    # Diagnostic listing of the languages the OCR tool has installed.
    for i in enumerate(tool.get_available_languages()):
        print(i)

    # Render each PDF page to an in-memory JPEG blob. The ``with`` blocks
    # release the Wand image resources as soon as the blobs are extracted.
    req_image = []
    with Image(filename=data, resolution=600) as image_pdf:
        with image_pdf.convert('jpeg') as image_jpeg:
            for img in image_jpeg.sequence:
                with Image(image=img) as img_page:
                    req_image.append(img_page.make_blob('jpeg'))

    # OCR the pages one by one with the requested language(s).
    final_text = []
    for img in req_image:
        txt = tool.image_to_string(
            PI.open(io.BytesIO(img)),
            lang=lang,
            builder=pyocr.builders.TextBuilder()
        )
        final_text.append(txt)

    for x in final_text:
        try:
            # Encode explicitly so the text is emitted as UTF-8 bytes
            # instead of relying on the console's default encoding.
            word = x.encode('utf8')
            print(word)
        except UnicodeEncodeError as e:
            # Report the page that could not be encoded and carry on.
            print(e)

    return final_text
if __name__ == '__main__':
    # The PDF path is the first command-line argument; fail with a usage
    # message instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: {0} <file.pdf>'.format(sys.argv[0]))
    pdfparser(sys.argv[1])
最佳答案
我在这里冒险回答:试试 `lang = 'eng+ell'`,并确保 `eng.traineddata` 和 `ell.traineddata` 文件都在您的 `tessdata` 文件夹中。
关于Python ocr pdf提取多种语言,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/39166423/