# word2html.py
import glob
import logging
import os
import shutil

import requests
from pypinyin import lazy_pinyin
  6. def to_pinyin_camel(s):
  7. py_ls = lazy_pinyin(s)
  8. py_camel = [py.capitalize() for py in py_ls]
  9. return "".join(py_camel)
  10. def demo1():
  11. for i, file_path in enumerate(glob.glob("topic_type_word/topic_type_word_zh/*.docx") +
  12. glob.glob("topic_type_word/topic_type_word_zh/*.doc")):
  13. file_dir, file_name = os.path.split(file_path)
  14. base_name, ext = os.path.splitext(file_name)
  15. new_dir = "topic_type_word/topic_type_word_en/" + to_pinyin_camel(base_name)
  16. if not os.path.isdir(new_dir):
  17. os.mkdir(new_dir)
  18. new_file = os.path.join(new_dir, to_pinyin_camel(base_name) + ext)
  19. shutil.copy(file_path, new_file)
  20. new_file = os.path.abspath(new_file)
  21. requests.get(r"http://localhost:9001/word/?name={}".format(new_file))
  22. def call_c_shape(doc_file, timeout=30):
  23. r = requests.get(r"http://localhost:9001/word/?name={}".format(doc_file), timeout=timeout)
  24. return r
  25. def start_word2html_app(kill_mathtype=True):
  26. if kill_mathtype:
  27. os.system("taskkill /f /im MathType.exe")
  28. os.system("taskkill /f /im WINWORD.EXE")
  29. os.system("taskkill /f /im ConsoleApplication1.exe")
  30. os.system(
  31. r"start C:\Users\HJ\Desktop\code\extract_math_test\extract_math_test_online\word_bin\ConsoleApplication1.exe") # start 在新窗口中打开
  32. def get_call_c_shape_response(doc_file):
  33. r = None
  34. for _ in range(2):
  35. try:
  36. r = call_c_shape(doc_file)
  37. except:
  38. pass
  39. if r is not None:
  40. break
  41. else:
  42. start_word2html_app()
  43. return r
  44. def get_call_c_shape_text(doc_file):
  45. r = get_call_c_shape_response(doc_file)
  46. clean_html_file = "_clean".join([os.path.splitext(doc_file)[0], ".html"])
  47. if r.text == "4" or not os.path.isfile(clean_html_file):
  48. raise ValueError("can not parse {}".format(doc_file))
  49. else:
  50. html = ''
  51. if os.path.exists(clean_html_file):
  52. with open(clean_html_file, "r", encoding="utf-8") as f:
  53. html = f.read()
  54. return html
  55. def word2html(src_filename):
  56. filename = os.path.abspath(src_filename)
  57. filepath, filename = os.path.split(filename)
  58. filename = to_pinyin_camel(filename)
  59. des_filename = os.path.join(filepath, filename)
  60. shutil.copy(src_filename, des_filename)
  61. html = get_call_c_shape_text(des_filename)
  62. return html
  63. if __name__ == "__main__":
  64. print(word2html("../upload/哈哈.doc"))