1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192 |
- import os
- import re
- import textract
- from img2text import img2text
- from pdf2text import route_pdf
- from ppt2text import ppt2text
- from word2text import word2text
- from wand.image import Image
# Page markers of the form "第 <digits> 页" ("page N"); whitespace around the
# digits is optional.  Raw strings keep ``\s`` / ``\d`` from being treated as
# (invalid) string escape sequences.
page_num1 = re.compile(r"第\s*\d+\s*页")
# Page-count markers of the form "共 <digits> 页" ("N pages in total").
page_num2 = re.compile(r"共\s*\d+\s*页")
def post_preprocess(text):
    """Strip blank lines and page-number lines from extracted text.

    Carriage returns are removed and the text is split on newlines.  A line
    is discarded when it is empty / whitespace-only or when it matches
    ``page_num1`` or ``page_num2``.  The surviving lines are joined back
    together with newlines.

    When ``text`` is falsy (``None``, empty string) a notice is printed and
    ``None`` is returned.
    """
    if not text:
        print('\n+++++++该文件无法解析,请仔细核对文件,如果是PDF和图片请走图像组+++++++\n')
        return None

    def is_noise(row):
        # The blank check comes first, then each page-marker pattern in turn.
        return (
            not row.strip()
            or page_num1.search(row) is not None
            or page_num2.search(row) is not None
        )

    rows = text.replace("\r", "").split("\n")
    return "\n".join(row for row in rows if not is_noise(row))
def route_filename_old(filename):
    """Extract text from ``filename``, picking the extractor by extension.

    The extension is compared case-insensitively:

    * ``.gif`` / ``.jpg`` / ``.jpeg`` / ``.png`` / ``.tiff`` / ``.tif``:
      ``img2text``; if that raises, the image is re-saved as ``result.jpg``
      through wand and ``img2text`` is retried on the converted copy.
    * ``.doc`` / ``.docx``: ``word2text``
    * ``.pdf``: ``route_pdf``
    * ``.ppt``: ``ppt2text``
    * ``.rtf``: ``textract.process``, decoded as UTF-8, with the first seven
      lines dropped.
    * anything else: ``textract.process``, decoded as UTF-8.

    The extracted text is passed through ``post_preprocess`` and that result
    is returned.  Exceptions from the extractors (other than the first image
    attempt) propagate to the caller.
    """
    _, ext = os.path.splitext(filename)
    ext = ext.lower()

    if ext in (".gif", ".jpg", ".jpeg", ".png", ".tiff", ".tif"):
        try:
            text = img2text(filename)
        except Exception:
            # Some TIFF files trigger a bug in the image extractor, so convert
            # the image to another format and retry.  ``Exception`` (rather
            # than a bare ``except``) lets KeyboardInterrupt / SystemExit
            # propagate instead of being swallowed here.
            # NOTE(review): the converted copy always goes to a fixed name in
            # the working directory, so it is overwritten on every fallback
            # and is never removed -- confirm nothing relies on that file.
            new_filename = 'result.jpg'
            with Image(filename=filename) as img:
                img.save(filename=new_filename)
            text = img2text(new_filename)
    elif ext in (".doc", ".docx"):
        text = word2text(filename)
    elif ext == ".pdf":
        text = route_pdf(filename)
    elif ext == ".ppt":
        text = ppt2text(filename)
    elif ext == ".rtf":
        text = textract.process(filename).decode("utf-8")
        # Drop the first seven lines of the extracted text
        # (presumably RTF header residue -- TODO confirm).
        text = "\n".join(text.replace("\r", "").split("\n")[7:])
    else:
        text = textract.process(filename).decode("utf-8")

    return post_preprocess(text)
def route_filename(filename):
    """Extract text from a Word document and clean it with ``post_preprocess``.

    Only ``.doc`` and ``.docx`` (case-insensitive) are dispatched, to
    ``word2text``.  For every other extension ``None`` is handed to
    ``post_preprocess``.  The value returned by ``post_preprocess`` is
    returned unchanged.
    """
    extension = os.path.splitext(filename)[1].lower()
    # The .ppt and .rtf extractors are intentionally not wired in here; only
    # Word documents are routed.
    raw_text = word2text(filename) if extension in (".doc", ".docx") else None
    return post_preprocess(raw_text)
if __name__ == "__main__":
    # Manual smoke run: route one sample path and print whatever comes back.
    sample_path = r'result.jpg'
    extracted = route_filename(sample_path)
    print(extracted)
|