123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081 |
import io
import os
import sys

import pyocr.builders
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from PIL import Image as PIL_Image
from wand.image import Image
# OCR backend used by spdf2text: the first tool pyocr reports as available.
# When no backend is installed the list is empty, so fall back to None
# instead of raising IndexError at import time -- the pdfminer-based helpers
# in this module do not need OCR and should stay importable.
_available_tools = pyocr.get_available_tools()
tool = _available_tools[0] if _available_tools else None
def spdf2text(filepath, pages=None, *, lang="chi_sim", resolution=300):
    """Extract text from a scanned PDF by running OCR on its pages.

    The PDF is rasterized with Wand, each selected page is re-encoded as a
    JPEG blob, and the blob is passed to the module-level pyocr ``tool``.

    Args:
        filepath: Path of the PDF file to read.
        pages: Container of zero-based page indexes to OCR. ``None`` (the
            default) processes every page.
        lang: Language code handed to the OCR tool. Defaults to
            ``"chi_sim"``.
        resolution: Resolution passed to Wand when loading the PDF.
            Defaults to 300.

    Returns:
        The recognized text of the selected pages, in page order, joined
        with newlines. An empty string when no page is selected.

    Raises:
        ValueError: If ``filepath`` is not an existing file.
    """
    if not os.path.isfile(filepath):
        raise ValueError("can not find the file {}".format(filepath))

    page_texts = []
    # Wand images wrap native ImageMagick resources; the ``with`` blocks
    # release them deterministically, even when OCR raises.
    with Image(filename=filepath, resolution=resolution) as image_pdf:
        with image_pdf.convert('jpeg') as image_jpeg:
            for i, img in enumerate(image_jpeg.sequence):
                if pages is not None and i not in pages:
                    continue
                with Image(image=img) as img_page:
                    blob = img_page.make_blob('jpeg')
                txt = tool.image_to_string(
                    PIL_Image.open(io.BytesIO(blob)),
                    lang=lang,
                    builder=pyocr.builders.TextBuilder(),
                )
                page_texts.append(txt)
    return "\n".join(page_texts)
def pdf2text(filepath, pages=None):
    """Extract the embedded text of a PDF with pdfminer.

    Args:
        filepath: Path of the PDF file to read.
        pages: Container of zero-based page indexes to extract. ``None``
            (the default) extracts every page.

    Returns:
        The text of the selected pages as one string, as written by
        pdfminer's ``TextConverter``.

    Raises:
        OSError: If ``filepath`` cannot be opened (for example
            ``FileNotFoundError`` when it does not exist).
    """
    with open(filepath, "rb") as fp:
        rsrcmgr = PDFResourceManager()
        retstr = io.StringIO()
        device = TextConverter(
            rsrcmgr, retstr, codec='utf-8', laparams=LAParams()
        )
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for i, page in enumerate(PDFPage.get_pages(fp)):
                if pages is None or i in pages:
                    interpreter.process_page(page)
            # Read the buffer before the converter and buffer are closed.
            return retstr.getvalue()
        finally:
            device.close()
            retstr.close()
def get_pdf_page_num(filepath):
    """Return the number of pages in the PDF at ``filepath``.

    The file is opened in binary mode and the pages yielded by
    ``PDFPage.get_pages`` are counted one by one.
    """
    with open(filepath, "rb") as pdf_file:
        return sum(1 for _ in PDFPage.get_pages(pdf_file))
def route_pdf(filepath, min_chars=50):
    """Extract a PDF's text, choosing between direct extraction and OCR.

    The first page is extracted with ``pdf2text``. If that yields at least
    ``min_chars`` characters, the whole file is extracted with ``pdf2text``.
    Otherwise the first page is also run through ``spdf2text`` and the
    extractor that produced the longer first-page text is applied to the
    whole file; ``pdf2text`` wins ties.

    Args:
        filepath: Path of the PDF file to read.
        min_chars: Minimum length of the first page's ``pdf2text`` output
            for OCR to be skipped entirely. Defaults to 50.

    Returns:
        The text of the whole document from the selected extractor.
    """
    page0 = pdf2text(filepath, [0])
    if len(page0) >= min_chars:
        return pdf2text(filepath)

    # Too little directly extractable text: compare against OCR of page 0.
    spage0 = spdf2text(filepath, [0])
    if len(page0) >= len(spage0):
        return pdf2text(filepath)
    return spdf2text(filepath)
if __name__ == '__main__':
    # Take the PDF path from the first command-line argument; fall back to
    # the original sample file so running the script with no arguments
    # behaves as before.
    filepath = sys.argv[1] if len(sys.argv) > 1 else r"D:\yx\scan_pdf\2.pdf"
    print(route_pdf(filepath))
|