"""Extract text from PDF files, falling back to OCR for scanned documents."""
import io
import os

import pyocr.builders
from PIL import Image as PIL_Image
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from wand.image import Image

# First OCR backend reported by pyocr. Indexing with [0] raises IndexError at
# import time when pyocr finds no installed backend.
tool = pyocr.get_available_tools()[0]


def spdf2text(filepath, pages=None, lang="chi_sim", resolution=300):
    """Run OCR over a (scanned) PDF and return the recognised text.

    Each selected page is rasterised to JPEG with wand and passed to the
    module-level pyocr ``tool``. The per-page results are joined with a
    newline.

    Args:
        filepath: Path to the PDF file.
        pages: Container of zero-based page indices to process, or ``None``
            to process every page.
        lang: Language code handed to the OCR tool.
        resolution: Resolution passed to wand when reading the PDF.

    Returns:
        The OCR text of the selected pages, separated by ``"\\n"``.

    Raises:
        ValueError: If ``filepath`` is not an existing file.
    """
    if not os.path.isfile(filepath):
        raise ValueError("can not find the file {}".format(filepath))

    page_texts = []
    # wand images wrap native resources; the ``with`` blocks release them
    # deterministically instead of waiting for garbage collection.
    with Image(filename=filepath, resolution=resolution) as image_pdf:
        with image_pdf.convert('jpeg') as image_jpeg:
            for i, frame in enumerate(image_jpeg.sequence):
                if pages is not None and i not in pages:
                    continue
                with Image(image=frame) as img_page:
                    blob = img_page.make_blob('jpeg')
                with PIL_Image.open(io.BytesIO(blob)) as pil_img:
                    txt = tool.image_to_string(
                        pil_img,
                        lang=lang,
                        builder=pyocr.builders.TextBuilder(),
                    )
                page_texts.append(txt)
    return "\n".join(page_texts)


def pdf2text(filepath, pages=None):
    """Extract the embedded text layer of a PDF with pdfminer.

    Args:
        filepath: Path to the PDF file.
        pages: Container of zero-based page indices to process, or ``None``
            to process every page.

    Returns:
        The text pdfminer's ``TextConverter`` wrote for the selected pages.
    """
    with open(filepath, "rb") as fp:
        rsrcmgr = PDFResourceManager()
        retstr = io.StringIO()
        device = TextConverter(
            rsrcmgr, retstr, codec='utf-8', laparams=LAParams()
        )
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for i, page in enumerate(PDFPage.get_pages(fp)):
                if pages is None or i in pages:
                    interpreter.process_page(page)
            # Read the buffer before the ``finally`` clause closes it.
            return retstr.getvalue()
        finally:
            device.close()
            retstr.close()


def get_pdf_page_num(filepath):
    """Return the number of pages pdfminer finds in the PDF at ``filepath``."""
    with open(filepath, "rb") as fp:
        return sum(1 for _ in PDFPage.get_pages(fp))


def route_pdf(filepath):
    """Return the text of a PDF, choosing between text-layer and OCR extraction.

    The first page is used as a probe. If its text layer has at least 50
    characters, or is at least as long as the OCR result for that page, the
    whole file is read with ``pdf2text``; otherwise the whole file is OCR'd
    with ``spdf2text``.

    Args:
        filepath: Path to the PDF file.

    Returns:
        The extracted text of the whole document.
    """
    page0 = pdf2text(filepath, [0])
    if len(page0) >= 50:
        return pdf2text(filepath)

    # Sparse text layer: only pay for OCR of the first page when needed.
    spage0 = spdf2text(filepath, [0])
    if len(page0) >= len(spage0):
        return pdf2text(filepath)
    return spdf2text(filepath)


if __name__ == '__main__':
    # Other sample invocations:
    #   filepath = r"D:\yx\1.pdf"
    #   print(pdf2text(filepath, pages=[0, 1, 2]))
    #   print(get_pdf_page_num(filepath))
    filepath = r"D:\yx\scan_pdf\2.pdf"
    print(route_pdf(filepath))