我想在 Windows 环境中检查 PDF 文件是否损坏,并写出了下面的 Python 代码。
想确认一下:这是检查 PDF 文件是否损坏的最佳方法吗,还是有其他更简单的方法?
备注:C:\Temp\python\sample-map (1).pdf 是一个已损坏的 PDF 文件。
这是示例代码
import os
import subprocess
import re
from subprocess import Popen, PIPE
def checkFile(fullfile):
    """Run the external ``file`` utility on *fullfile* and capture its result.

    Args:
        fullfile: Path of the file to identify.

    Returns:
        A ``(exitcode, out, err)`` tuple: the process return code and the raw
        bytes the process wrote to stdout and stderr. If the ``file``
        executable cannot be started at all, ``(127, b"", message)`` is
        returned, so the non-zero code and non-empty stderr still signal a
        failure to the caller instead of raising.
    """
    try:
        # Pass the argument list directly rather than through a shell: with
        # shell=True and a list, POSIX runs only "file" and hands the other
        # items to the shell itself, so the path was never inspected. Without
        # a shell, paths containing spaces or parentheses need no quoting.
        proc = subprocess.run(
            # -b, --brief : do not prepend filenames to output lines
            ["file", "-b", fullfile],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except OSError as exc:
        # Raised when the executable is missing or not runnable.
        return 127, b"", str(exc).encode("utf-8", "replace")
    return proc.returncode, proc.stdout, proc.stderr
def searchFiles(dirpath):
    """Check every entry of *dirpath* with ``checkFile`` and print a verdict.

    For each readable entry, ``checkFile`` is called and the entry is reported
    as ``OK`` only when the exit code is zero, stderr is empty and the
    description on stdout starts with ``PDF`` followed by whitespace.
    Otherwise it is reported as ``ERROR``. Entries that are not readable get a
    "File not readable" line, and an unreadable *dirpath* prints
    "Path is not valid".

    Args:
        dirpath: Directory whose entries should be checked.

    Returns:
        None. All results are written to stdout.
    """
    pwdpath = os.path.dirname(os.path.realpath(__file__))
    print("running path : %s" % pwdpath)

    if not os.access(dirpath, os.R_OK):
        print("Path is not valid")
        return

    print("Path %s validation OK \n" % dirpath)
    for name in os.listdir(dirpath):
        fullfile = os.path.join(dirpath, name)
        if not os.access(fullfile, os.R_OK):
            # The original format string used "$s", which made the "%"
            # operator raise TypeError instead of printing the message.
            print("%s : File not readable" % fullfile)
            continue

        code, out, error = checkFile(fullfile)
        # Decode leniently: the tool's output is not guaranteed to be valid
        # UTF-8, and a decoding failure must not abort the whole scan.
        description = out.decode("utf-8", "replace")
        is_pdf = re.match(r"PDF\s", description) is not None
        if code != 0 or error or not is_pdf:
            print("ERROR " + fullfile + "\n################")
        else:
            print("OK " + fullfile + "\n################")
if __name__ == "__main__":
    # Raw string: "\T" and "\p" are invalid escape sequences in a normal
    # literal and trigger a warning on current Python versions. The runtime
    # value of the path is unchanged.
    searchFiles(r'C:\Temp\python')
样本输出:
$ "C:/Program Files (x86)/Python37-32/python.exe" c:/Users/myuser/python/check_pdf_file.py
running path : c:\Users\myuser\python
Path C:\Temp\python validation OK
OK C:\Temp\python\Induction Guide.pdf
################
ERROR C:\Temp\python\sample-map (1).pdf
################
OK C:\Temp\python\sample-map.pdf
################
最佳答案
我认为您可以使用 PyPDF2 模块,可通过 pip install pypdf2 安装。
代码如下。
from PyPDF2 import PdfFileReader
import os
def checkFile(fullfile):
    """Return True when *fullfile* can be opened and parsed as a PDF.

    The file is opened in binary mode and handed to ``PdfFileReader``. The
    document information and the page count are then requested so that the
    reader has to resolve objects beyond the trailer.

    Args:
        fullfile: Path of the file to check.

    Returns:
        True if every step succeeds; False if the path cannot be opened
        (missing file, directory, permission problem) or the reader raises
        while parsing.
    """
    try:
        with open(fullfile, 'rb') as f:
            pdf = PdfFileReader(f)
            # The Info dictionary is optional, so a missing or empty result is
            # not treated as corruption; only an exception is.
            pdf.getDocumentInfo()
            pdf.getNumPages()
    except Exception:
        # Deliberately broad: damaged files can make the parser fail with
        # many different exception types. Unlike a bare "except:", this still
        # lets KeyboardInterrupt and SystemExit propagate.
        return False
    return True
def searchFiles(dirpath):
    """Run ``checkFile`` on every entry of *dirpath* and print the outcome.

    Each entry is printed as ``OK`` when ``checkFile`` returns a truthy value
    and as ``ERROR`` otherwise. An entry that cannot be opened at all (for
    example a subdirectory) is also reported as ``ERROR`` rather than
    aborting the scan. An unreadable *dirpath* prints "Path is not valid".

    Args:
        dirpath: Directory whose entries should be checked.

    Returns:
        None. All results are written to stdout.
    """
    pwdpath = os.path.dirname(os.path.realpath(__file__))
    print("running path : %s" % pwdpath)

    if not os.access(dirpath, os.R_OK):
        print("Path is not valid")
        return

    print("Path %s validation OK \n" % dirpath)
    for name in os.listdir(dirpath):
        fullfile = os.path.join(dirpath, name)
        try:
            valid = checkFile(fullfile)
        except OSError:
            # os.listdir also yields subdirectories and files that cannot be
            # opened; one such entry must not stop the remaining checks.
            valid = False
        if valid:
            print("OK " + fullfile + "\n################")
        else:
            print("ERROR " + fullfile + "\n################")
if __name__ == "__main__":
    # Raw string: "\T" and "\p" are invalid escape sequences in a normal
    # literal and trigger a warning on current Python versions. The runtime
    # value of the path is unchanged.
    searchFiles(r'C:\Temp\python')
我尽量保持了与你相同的编码风格。这段代码应该也可以在 macOS 或 Linux 上运行。
关于python-3.x - 使用python检查PDF文件是否损坏的最佳方法,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/58807673/