12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182 |
- import glob
- import os
- import shutil
- import requests
- from pypinyin import lazy_pinyin
def to_pinyin_camel(s):
    """Romanize ``s`` with ``lazy_pinyin`` and glue the pieces together.

    Every element returned by ``lazy_pinyin(s)`` is passed through
    ``str.capitalize`` (first character upper-cased, the remainder
    lower-cased) and the results are concatenated without a separator.
    """
    return "".join(piece.capitalize() for piece in lazy_pinyin(s))
def demo1():
    """Copy each Word file of the ``zh`` folder to a pinyin-named folder and convert it.

    For every ``*.docx`` / ``*.doc`` file in
    ``topic_type_word/topic_type_word_zh`` a folder named after the
    romanized base name is created under
    ``topic_type_word/topic_type_word_en``, the file is copied into it under
    the romanized name (extension kept), and the local conversion service is
    asked to process the copy.
    """
    word_files = (glob.glob("topic_type_word/topic_type_word_zh/*.docx") +
                  glob.glob("topic_type_word/topic_type_word_zh/*.doc"))
    for file_path in word_files:
        base_name, ext = os.path.splitext(os.path.basename(file_path))
        # Romanize once; the same name is used for the folder and the file.
        pinyin_name = to_pinyin_camel(base_name)
        new_dir = "topic_type_word/topic_type_word_en/" + pinyin_name
        # makedirs also creates a missing parent folder and does not fail
        # when the target folder already exists.
        os.makedirs(new_dir, exist_ok=True)
        new_file = os.path.join(new_dir, pinyin_name + ext)
        shutil.copy(file_path, new_file)
        # The service receives an absolute path; going through call_c_shape
        # applies its request timeout instead of waiting indefinitely.
        call_c_shape(os.path.abspath(new_file))
def call_c_shape(doc_file, timeout=30):
    """Ask the local conversion service to process ``doc_file``.

    Sends ``GET http://localhost:9001/word/`` with ``doc_file`` as the
    ``name`` query parameter.

    Args:
        doc_file: Path of the Word document, passed to the service as-is.
        timeout: Seconds handed to ``requests.get`` as its timeout.

    Returns:
        The ``requests.Response`` returned by the service.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Let requests build the query string so characters such as '&', '#',
    # '+' or '%' in the path are percent-encoded instead of corrupting the
    # ``name`` parameter.
    # NOTE(review): assumes the service URL-decodes the query string in the
    # standard way - confirm against the service implementation.
    return requests.get(
        "http://localhost:9001/word/",
        params={"name": doc_file},
        timeout=timeout,
    )
def start_word2html_app(kill_mathtype=True):
    """Force-kill leftover processes and launch ConsoleApplication1.exe again.

    Args:
        kill_mathtype: When true, MathType.exe is force-killed as well.

    NOTE(review): the original indentation was lost in extraction; only the
    MathType kill is assumed to be conditional on ``kill_mathtype``.
    """
    commands = []
    if kill_mathtype:
        commands.append("taskkill /f /im MathType.exe")
    commands.append("taskkill /f /im WINWORD.EXE")
    commands.append("taskkill /f /im ConsoleApplication1.exe")
    # `start` opens the program in a new window.
    commands.append(
        r"start C:\Users\HJ\Desktop\code\extract_math_test\extract_math_test_online\word_bin\ConsoleApplication1.exe")
    for command in commands:
        os.system(command)
def get_call_c_shape_response(doc_file):
    """Call the conversion service, restarting it after a failed request.

    The request is attempted at most twice. After each failed attempt the
    conversion application is restarted through ``start_word2html_app``.

    Args:
        doc_file: Path of the Word document to convert.

    Returns:
        The response of the first successful request, or ``None`` when both
        attempts failed.
    """
    for _ in range(2):
        try:
            return call_c_shape(doc_file)
        except requests.RequestException:
            # Only request failures (connection refused, timeout, ...) mean
            # the service needs a restart; KeyboardInterrupt, SystemExit and
            # programming errors are no longer swallowed.
            start_word2html_app()
    return None
def get_call_c_shape_text(doc_file):
    """Convert ``doc_file`` through the service and return the cleaned HTML.

    The HTML is read from a file next to ``doc_file`` whose name is the
    document's name with the extension replaced by ``_clean.html``.

    Args:
        doc_file: Path of the Word document to convert.

    Returns:
        The content of the ``*_clean.html`` file, decoded as UTF-8.

    Raises:
        ValueError: If the service could not be reached, answered ``"4"``,
            or the ``*_clean.html`` file does not exist afterwards.
    """
    r = get_call_c_shape_response(doc_file)
    clean_html_file = os.path.splitext(doc_file)[0] + "_clean.html"
    # ``r`` is None when every request attempt failed; report that the same
    # way as any other parse failure instead of crashing on ``r.text``.
    # A body of "4" is presumably the service's failure code - TODO confirm.
    if r is None or r.text == "4" or not os.path.isfile(clean_html_file):
        raise ValueError("can not parse {}".format(doc_file))
    with open(clean_html_file, "r", encoding="utf-8") as f:
        return f.read()
def word2html(src_filename):
    """Convert a Word document to HTML via the local conversion service.

    The document is first copied, inside its own directory, to a file whose
    name is the romanized (pinyin, CamelCase) form of the original file
    name; the copy is then handed to ``get_call_c_shape_text``.

    Args:
        src_filename: Path of the Word document, relative or absolute.

    Returns:
        The cleaned HTML produced for the document.

    Raises:
        ValueError: Propagated from ``get_call_c_shape_text`` when the
            document cannot be parsed.
    """
    filepath, filename = os.path.split(os.path.abspath(src_filename))
    # The whole file name, extension included, goes through the romanizer.
    des_filename = os.path.join(filepath, to_pinyin_camel(filename))
    try:
        shutil.copy(src_filename, des_filename)
    except shutil.SameFileError:
        # The name is already in its romanized form (or differs only by case
        # on a case-insensitive filesystem): nothing to copy.
        pass
    return get_call_c_shape_text(des_filename)
if __name__ == "__main__":
    # Manual run: convert one sample document and print the resulting HTML.
    sample_html = word2html("../upload/哈哈.doc")
    print(sample_html)
|