filepath2text.py 2.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192
  1. import os
  2. import re
  3. import textract
  4. from img2text import img2text
  5. from pdf2text import route_pdf
  6. from ppt2text import ppt2text
  7. from word2text import word2text
  8. from wand.image import Image
  9. page_num1 = re.compile("第\s*\d+\s*页")
  10. page_num2 = re.compile("共\s*\d+\s*页")
  11. def post_preprocess(text):
  12. if not text:
  13. print('\n+++++++该文件无法解析,请仔细核对文件,如果是PDF和图片请走图像组+++++++\n')
  14. return None
  15. res = []
  16. lines = text.replace("\r", "").split("\n")
  17. for line in lines:
  18. if not line.strip():
  19. continue
  20. if page_num1.search(line):
  21. continue
  22. if page_num2.search(line):
  23. continue
  24. res.append(line)
  25. return "\n".join(res)
  26. def route_filename_old(filename):
  27. basename, ext = os.path.splitext(filename)
  28. ext = ext.lower()
  29. if ext in [".gif", ".jpg", ".jpeg", ".png", ".tiff", ".tif"]:
  30. try:
  31. text = img2text(filename)
  32. except:
  33. # 有些tif 格式会出现bug, 进行格式转换
  34. new_filename = 'result.jpg'
  35. with Image(filename=filename) as img:
  36. # img.resize(200, 200)
  37. img.save(filename=new_filename)
  38. text = img2text(new_filename)
  39. elif ext in [".doc", ".docx"]:
  40. text = word2text(filename)
  41. elif ext in [".pdf"]:
  42. text = route_pdf(filename)
  43. elif ext in [".ppt"]:
  44. text = ppt2text(filename)
  45. elif ext in [".rtf"]:
  46. text = textract.process(filename)
  47. text = text.decode("utf-8")
  48. text = "\n".join(text.replace("\r", "").split("\n")[7:])
  49. else:
  50. text = textract.process(filename)
  51. text = text.decode("utf-8")
  52. text = post_preprocess(text)
  53. return text
  54. def route_filename(filename):
  55. basename, ext = os.path.splitext(filename)
  56. ext = ext.lower()
  57. if ext in [".doc", ".docx"]:
  58. text = word2text(filename)
  59. # elif ext in [".ppt"]:
  60. # text = ppt2text(filename)
  61. #
  62. # elif ext in [".rtf"]:
  63. # text = textract.process(filename)
  64. # text = text.decode("utf-8")
  65. # text = "\n".join(text.replace("\r", "").split("\n")[7:])
  66. else:
  67. text = None
  68. text = post_preprocess(text)
  69. return text
  70. if __name__ == "__main__":
  71. filepath = r'result.jpg'
  72. text = route_filename(filepath)
  73. print(text)