# pdf2text.py
  1. import io
  2. import os
  3. import pyocr.builders
  4. from PIL import Image as PIL_Image
  5. from pdfminer.converter import TextConverter
  6. from pdfminer.layout import LAParams
  7. from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
  8. from pdfminer.pdfpage import PDFPage
  9. from wand.image import Image
  10. tool = pyocr.get_available_tools()[0]
  11. def spdf2text(filepath, pages=None):
  12. ''''scan pdf to text'''
  13. if not os.path.isfile(filepath):
  14. raise ValueError("can not find the file {}".format(filepath))
  15. final_text = []
  16. image_pdf = Image(filename=filepath, resolution=300)
  17. image_jpeg = image_pdf.convert('jpeg')
  18. for i, img in enumerate(image_jpeg.sequence):
  19. if pages is None or (pages is not None and i in pages):
  20. img_page = Image(image=img)
  21. img = img_page.make_blob('jpeg')
  22. txt = tool.image_to_string(
  23. PIL_Image.open(io.BytesIO(img)),
  24. lang="chi_sim",
  25. builder=pyocr.builders.TextBuilder(),
  26. )
  27. final_text.append(txt)
  28. final_text = "\n".join(final_text)
  29. return final_text
  30. def pdf2text(filepath, pages=None):
  31. with open(filepath, "rb") as fp:
  32. rsrcmgr = PDFResourceManager()
  33. retstr = io.StringIO()
  34. codec = 'utf-8'
  35. laparams = LAParams()
  36. device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
  37. interpreter = PDFPageInterpreter(rsrcmgr, device)
  38. for i, page in enumerate(PDFPage.get_pages(fp)):
  39. if pages is None or (pages is not None and i in pages):
  40. interpreter.process_page(page)
  41. data = retstr.getvalue()
  42. return data
  43. def get_pdf_page_num(filepath):
  44. cnt = 0
  45. with open(filepath, "rb") as fp:
  46. for _ in PDFPage.get_pages(fp):
  47. cnt += 1
  48. return cnt
  49. def route_pdf(filepath):
  50. page0 = pdf2text(filepath, [0])
  51. if len(page0) >= 50:
  52. return pdf2text(filepath)
  53. else:
  54. spage0 = spdf2text(filepath, [0])
  55. if len(page0) >= len(spage0):
  56. return pdf2text(filepath)
  57. else:
  58. return spdf2text(filepath)
  59. if __name__ == '__main__':
  60. # filepath = r"D:\yx\1.pdf"
  61. filepath = r"D:\yx\scan_pdf\2.pdf"
  62. # print(pdf2text(filepath, pages=[0, 1, 2]))
  63. # print(get_pdf_page_num(filepath))
  64. print(route_pdf(filepath))