import glob
import os
import shutil

import requests
from pypinyin import lazy_pinyin

# URL template of the local conversion endpoint; "{}" receives the document path.
# NOTE(review): the path is interpolated into the query string as-is, so a path
# containing "&" or "#" would be cut short -- confirm how the service parses it
# before switching to encoded query parameters.
_CONVERT_URL = r"http://localhost:9001/word/?name={}"


def to_pinyin_camel(s):
    """Return *s* converted to pinyin, with every piece capitalized and joined.

    The pieces are whatever ``lazy_pinyin`` yields for *s*. ``str.capitalize``
    upper-cases the first character of each piece and lower-cases the rest.
    """
    return "".join(piece.capitalize() for piece in lazy_pinyin(s))


def demo1():
    """Copy every .docx/.doc sample under a pinyin name and request its conversion.

    Each file found in ``topic_type_word/topic_type_word_zh`` is copied into its
    own directory below ``topic_type_word/topic_type_word_en`` (directory and
    file are both named after the pinyin form of the original base name), and
    the absolute path of the copy is sent to the conversion endpoint.
    """
    file_paths = (glob.glob("topic_type_word/topic_type_word_zh/*.docx")
                  + glob.glob("topic_type_word/topic_type_word_zh/*.doc"))
    for file_path in file_paths:
        base_name, ext = os.path.splitext(os.path.basename(file_path))
        pinyin_name = to_pinyin_camel(base_name)
        new_dir = "topic_type_word/topic_type_word_en/" + pinyin_name
        # makedirs also creates the parent directory when it does not exist yet.
        os.makedirs(new_dir, exist_ok=True)
        new_file = os.path.join(new_dir, pinyin_name + ext)
        shutil.copy(file_path, new_file)
        # timeout=None: wait for the response without a time limit.
        call_c_shape(os.path.abspath(new_file), timeout=None)


def call_c_shape(doc_file, timeout=30):
    """Send a GET request for *doc_file* to the conversion endpoint.

    Args:
        doc_file: Path of the document, placed in the ``name`` query field.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        The ``requests.Response`` of the call.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    return requests.get(_CONVERT_URL.format(doc_file), timeout=timeout)


def start_word2html_app(kill_mathtype=True):
    """Kill leftover processes and launch ConsoleApplication1.exe again.

    Uses the Windows ``taskkill`` and ``start`` commands. WINWORD.EXE and any
    running ConsoleApplication1.exe are always killed; MathType.exe is killed
    only when *kill_mathtype* is true.
    """
    if kill_mathtype:
        os.system("taskkill /f /im MathType.exe")
    os.system("taskkill /f /im WINWORD.EXE")
    os.system("taskkill /f /im ConsoleApplication1.exe")
    # `start` opens the program in a new window, so os.system returns right away.
    os.system(
        r"start C:\Users\HJ\Desktop\code\extract_math_test\extract_math_test_online\word_bin\ConsoleApplication1.exe")


def get_call_c_shape_response(doc_file):
    """Call the conversion endpoint, restarting the application when a call fails.

    At most two attempts are made. After every failed attempt the application
    is restarted through ``start_word2html_app``.

    Returns:
        The ``requests.Response`` of the first successful attempt, or ``None``
        when both attempts failed.
    """
    for _ in range(2):
        try:
            return call_c_shape(doc_file)
        except requests.RequestException:
            # Only request failures trigger a restart; anything else
            # (KeyboardInterrupt, programming errors) propagates.
            start_word2html_app()
    return None


def get_call_c_shape_text(doc_file):
    """Convert *doc_file* through the endpoint and return the resulting HTML.

    The HTML is read from ``<doc_file without extension>_clean.html``, which is
    expected to exist once the request has been answered.

    Raises:
        ValueError: If no response was obtained, the response text is ``"4"``,
            or the ``_clean.html`` file does not exist.
    """
    response = get_call_c_shape_response(doc_file)
    clean_html_file = os.path.splitext(doc_file)[0] + "_clean.html"
    if (response is None or response.text == "4"
            or not os.path.isfile(clean_html_file)):
        raise ValueError("can not parse {}".format(doc_file))
    with open(clean_html_file, "r", encoding="utf-8") as f:
        return f.read()


def word2html(src_filename):
    """Copy *src_filename* under a pinyin file name and return its HTML.

    The copy is placed next to the source file; its name is the pinyin form of
    the source file name.

    Raises:
        ValueError: If the document could not be converted.
    """
    filepath, filename = os.path.split(os.path.abspath(src_filename))
    des_filename = os.path.join(filepath, to_pinyin_camel(filename))
    # shutil.copy raises SameFileError when source and destination are the same
    # file (name already in pinyin form, or a case-insensitive file system).
    same_file = (os.path.exists(des_filename)
                 and os.path.samefile(src_filename, des_filename))
    if not same_file:
        shutil.copy(src_filename, des_filename)
    return get_call_c_shape_text(des_filename)


if __name__ == "__main__":
    print(word2html("../upload/哈哈.doc"))