# xh / word_parse_of_eng

import os
import re
import shutil
from pprint import pprint

import requests
from pypinyin import lazy_pinyin

import config

# Matches a run of one or more characters in the range U+4E00..U+9FA5
# (CJK ideographs).
chinese_pat = re.compile(r"[\u4e00-\u9fa5]+")
# Matches a run of one or more ASCII letters.
english_pat = re.compile(r"[a-zA-Z]+")


def to_pinyin_camel(s):
    """Return the ``lazy_pinyin`` segments of *s* capitalized and concatenated.

    Every segment produced by ``lazy_pinyin`` is passed through
    ``str.capitalize`` (first character upper-cased, the remainder
    lower-cased) and the results are joined without a separator.
    """
    return "".join(segment.capitalize() for segment in lazy_pinyin(s))


def start_word2html_app(kill_mathtype=True):
    """Launch the executable configured as ``config.word2html_exe``.

    Args:
        kill_mathtype: When true (the default), first force-terminate
            MathType, Word and ConsoleApplication1 via ``taskkill``.
    """
    if kill_mathtype:
        for image_name in ("MathType.exe", "WINWORD.EXE", "ConsoleApplication1.exe"):
            os.system("taskkill /f /im {}".format(image_name))
    # ``start`` opens the program in a new window.
    launch_command = "start {}".format(config.word2html_exe)
    os.system(launch_command)


def check_pid():
    """Restart the converter if a WINWORD.EXE process is still listed.

    Runs ``tasklist`` filtered on the WINWORD.EXE image name.  When the image
    name shows up in the captured output, ``start_word2html_app()`` is called
    (which, with its default argument, kills the related processes before
    launching the converter again).  Otherwise a short status line is printed.

    The check looks for the image name itself rather than comparing the
    output with a localized "no matching tasks" message, so it does not
    depend on the wording or language of that message.
    """
    command = 'tasklist /fi  "imagename eq WINWORD.EXE"'
    # Read the whole command output as one string and close the pipe.
    with os.popen(command) as pipe:
        info = pipe.read()
    if "WINWORD.EXE" in info.upper():
        print("++++再补一刀++++")
        start_word2html_app()
    else:
        print("----word is closed----")


def call_c_shape(doc_file):
    """Ask the local converter service to process *doc_file*.

    Sends a GET request to ``http://localhost:9001/word/`` with *doc_file* as
    the ``name`` query value (90 second timeout).  If the request fails, the
    converter application is restarted via ``start_word2html_app()``.
    ``check_pid()`` is always run afterwards.

    Args:
        doc_file: Value placed verbatim after ``name=`` in the request URL.

    Returns:
        The ``requests`` response object, or ``None`` when the request raised
        a ``requests.RequestException`` (connection error, timeout, ...).
    """
    try:
        r = requests.get(r"http://localhost:9001/word/?name={}".format(doc_file), timeout=90)
    except requests.RequestException:
        # Only request failures are treated as "service down"; things like
        # KeyboardInterrupt are no longer swallowed.
        r = None
        start_word2html_app()
    else:
        # The body has already been read at this point, so closing only
        # releases the connection.
        r.close()

    check_pid()
    return r


def _word2html(filename):
    """Convert *filename* through the converter service and return the HTML.

    The result is read from ``<filename without extension>_clean.html``.

    Args:
        filename: Path of the document handed to ``call_c_shape``.

    Returns:
        The UTF-8 decoded content of the ``_clean.html`` file, or ``None``
        when the request failed, the response body is ``"4"``, or the file
        does not exist.
    """
    r = call_c_shape(filename)
    clean_html = os.path.splitext(filename)[0] + "_clean.html"
    # ``r.text`` is a str, so it must be compared with the string "4"; the
    # previous comparison with the integer 4 could never be true.
    # NOTE(review): "4" is presumably the service's failure code - confirm.
    if r is None or r.text.strip() == "4" or not os.path.isfile(clean_html):
        return None
    with open(clean_html, "r", encoding="utf-8") as f:
        return f.read()


def word2html(filename_in):
    """Convert a Word document to HTML via a pinyin-named copy of the file.

    The file name (not the directory) is rewritten with ``to_pinyin_camel``;
    if that yields a different path, the document is copied there and the
    copy is converted.

    Args:
        filename_in: Path to the source document; made absolute first.

    Returns:
        Whatever ``_word2html`` returns for the (possibly copied) file.
    """
    filename_in = os.path.abspath(filename_in)
    filepath, filename = os.path.split(filename_in)
    filepath2 = os.path.join(filepath, to_pinyin_camel(filename))
    if filename_in != filepath2:
        try:
            shutil.copy(filename_in, filepath2)
        except shutil.SameFileError:
            # The two paths can differ only in letter case (capitalize()
            # lower-cases the tail of each segment); on a case-insensitive
            # filesystem they are the same file and no copy is needed.
            pass
    return _word2html(filepath2)


def c_shape_html2text(text):
    """Strip a fixed set of bare HTML tags and all single-line ``<img>`` tags.

    Only attribute-less opening tags and their closing tags are removed for
    html, head, body, h1-h6, p and div, in that order.  ``&nbsp;`` entities
    and any other markup are left untouched.

    Args:
        text: HTML string to clean.

    Returns:
        The cleaned string.
    """
    bare_tags = (
        "html", "head", "body",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "p", "div",
    )
    for tag in bare_tags:
        opening = "<" + tag + ">"
        closing = "</" + tag + ">"
        text = text.replace(opening, "")
        text = text.replace(closing, "")
    # Non-greedy match up to the first ">" on the same line.
    return re.sub(r"<img.*?>", "", str(text))


def word2text(word):
    """Convert the Word document at *word* to tag-stripped text.

    Returns the output of ``c_shape_html2text`` applied to the converted
    HTML; a falsy conversion result (``None`` or an empty string) is
    returned unchanged.
    """
    html = word2html(word)
    if not html:
        return html
    return c_shape_html2text(html)


if __name__ == "__main__":
    # Raw string: the Windows path contains backslashes that would otherwise
    # be read as (invalid) escape sequences.
    print(word2text(r"D:\yx\ocr\英语试卷.doc"))
    # Alternative sample input: print(word2text("upload/0.docx"))