123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104 |
- import os
- import re
- import shutil
- from pprint import pprint
- import requests
- from pypinyin import lazy_pinyin
- import config
# Matches a run of one or more characters in the range U+4E00..U+9FA5.
chinese_pat = re.compile(r"[\u4e00-\u9fa5]+")

# Matches a run of one or more ASCII letters (either case).
english_pat = re.compile(r"[a-zA-Z]+")
def to_pinyin_camel(s):
    """Return ``s`` converted by ``lazy_pinyin`` and joined in capitalized form.

    Each piece returned by ``lazy_pinyin(s)`` is passed through
    ``str.capitalize`` (first character upper-cased, the rest lower-cased)
    and the pieces are concatenated without a separator.
    """
    return "".join(piece.capitalize() for piece in lazy_pinyin(s))
def start_word2html_app(kill_mathtype=True):
    """Force-kill leftover processes and launch the word2html executable.

    WINWORD.EXE and ConsoleApplication1.exe are always killed; MathType.exe
    is killed first when ``kill_mathtype`` is true. Afterwards the program
    named by ``config.word2html_exe`` is started.
    """
    kill_commands = (
        "taskkill /f /im WINWORD.EXE",
        "taskkill /f /im ConsoleApplication1.exe",
    )
    if kill_mathtype:
        kill_commands = ("taskkill /f /im MathType.exe",) + kill_commands

    for kill_command in kill_commands:
        os.system(kill_command)

    # `start` opens the program in a new window.
    launch_command = "start {}".format(config.word2html_exe)
    os.system(launch_command)
def check_pid():
    """Restart the converter if ``tasklist`` still reports WINWORD.EXE.

    Runs ``tasklist`` filtered on the WINWORD.EXE image name and compares the
    stripped output with a fixed "no matching task" message. Any other
    output is treated as "Word is still running" and triggers
    ``start_word2html_app()``, which kills Word and relaunches the converter.

    NOTE(review): the expected message is the Chinese-language text, so on a
    system whose ``tasklist`` prints a different message this presumably
    always takes the restart branch -- confirm against the deployment host.
    """
    command = 'tasklist /fi "imagename eq WINWORD.EXE"'
    # Read inside a ``with`` block so the pipe is closed once the output has
    # been consumed instead of being left open.
    with os.popen(command) as pipe:
        info = pipe.read()  # whole command output as a single string

    if info.strip() != "信息: 没有运行的任务匹配指定标准。":
        print("++++再补一刀++++")
        start_word2html_app()
    else:
        print("----word is closed----")
def call_c_shape(doc_file):
    """Ask the local conversion service to process ``doc_file``.

    Sends a GET request to ``http://localhost:9001/word/`` with ``doc_file``
    interpolated into the ``name`` query parameter, using a 90 second
    timeout. If the request raises, the converter application is restarted
    via ``start_word2html_app()``. ``check_pid()`` is called afterwards in
    either case.

    Args:
        doc_file: Value placed after ``name=`` in the request URL.

    Returns:
        The (already closed) response object, or ``None`` when the request
        raised an exception.
    """
    try:
        r = requests.get(r"http://localhost:9001/word/?name={}".format(doc_file), timeout=90)
        r.close()
    except Exception:
        # Best-effort recovery: any request failure restarts the converter.
        # ``Exception`` (rather than a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate instead of being turned into a restart.
        r = None
        start_word2html_app()
    check_pid()
    return r
def _word2html(filename):
    """Convert ``filename`` through the service and return the cleaned HTML.

    Calls ``call_c_shape(filename)`` and then looks for a sibling file named
    ``<filename without extension>_clean.html``.

    Args:
        filename: Path of the document passed to the conversion service.

    Returns:
        The UTF-8 decoded contents of the ``_clean.html`` file, or ``None``
        when the request failed, the service answered ``4``, or the file
        does not exist.
    """
    response = call_c_shape(filename)
    clean_html = os.path.splitext(filename)[0] + "_clean.html"

    if response is None:
        return None
    # ``response.text`` is a str, so it must be compared with the string "4";
    # comparing with the integer 4 can never be true.
    # NOTE(review): "4" is presumably the service's failure code -- confirm.
    if response.text.strip() == "4":
        return None
    if not os.path.isfile(clean_html):
        return None

    with open(clean_html, "r", encoding="utf-8") as f:
        return f.read()
def word2html(filename_in):
    """Convert a Word document to HTML, working on a pinyin-named copy.

    The input path is made absolute, and its base name is passed through
    ``to_pinyin_camel``. When the resulting path differs from the input, the
    file is copied there first. The copy (or the original, if the name is
    unchanged) is then handed to ``_word2html``.

    Returns:
        Whatever ``_word2html`` returns for the target path.
    """
    source_path = os.path.abspath(filename_in)
    directory = os.path.dirname(source_path)
    base_name = os.path.basename(source_path)

    target_path = os.path.join(directory, to_pinyin_camel(base_name))
    if target_path != source_path:
        shutil.copy(source_path, target_path)

    return _word2html(target_path)
def c_shape_html2text(text):
    """Strip a fixed set of bare HTML tags and all ``<img ...>`` tags.

    Only the exact attribute-less forms (``<p>``, ``</p>``, ...) are removed;
    a tag written with attributes is left in place. Tags are removed in a
    fixed order: html, head, body, h1-h6, p, div, then ``<img ...>``.

    Args:
        text: HTML string to clean.

    Returns:
        The string with those tags removed.
    """
    tag_names = (
        "html", "head", "body",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "p", "div",
    )
    for tag in tag_names:
        # Opening tag first, then the closing tag, one tag name at a time.
        for marker in ("<" + tag + ">", "</" + tag + ">"):
            text = text.replace(marker, "")

    # Non-greedy match; `.` does not cross newlines here.
    return re.sub(r"<img.*?>", "", str(text))
def word2text(word):
    """Convert a Word document to text with the known HTML tags stripped.

    Runs ``word2html`` on ``word``; a falsy result (``None`` or an empty
    string) is returned unchanged, otherwise the HTML is passed through
    ``c_shape_html2text``.
    """
    html = word2html(word)
    if not html:
        return html
    return c_shape_html2text(html)
if __name__ == "__main__":
    # Raw string: the backslashes in this Windows path must stay literal, and
    # sequences such as "\y" and "\o" are invalid escapes in a normal string.
    print(word2text(r"D:\yx\ocr\英语试卷.doc"))
|