word2text.py 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104
  1. import os
  2. import re
  3. import shutil
  4. from pprint import pprint
  5. import requests
  6. from pypinyin import lazy_pinyin
  7. import config
# Matches one or more consecutive characters in the range U+4E00..U+9FA5.
chinese_pat = re.compile(r"[\u4e00-\u9fa5]+")
# Matches one or more consecutive ASCII letters (either case).
english_pat = re.compile(r"[a-zA-Z]+")
  10. def to_pinyin_camel(s):
  11. py_ls = lazy_pinyin(s)
  12. py_camel = [py.capitalize() for py in py_ls]
  13. return "".join(py_camel)
  14. def start_word2html_app(kill_mathtype=True):
  15. if kill_mathtype:
  16. os.system("taskkill /f /im MathType.exe")
  17. os.system("taskkill /f /im WINWORD.EXE")
  18. os.system("taskkill /f /im ConsoleApplication1.exe")
  19. os.system("start {}".format(config.word2html_exe)) # start 在新窗口中打开
  20. def check_pid():
  21. command = 'tasklist /fi "imagename eq WINWORD.EXE"'
  22. r = os.popen(command)
  23. info = r.read() # 读取命令行的输出到一个list
  24. if str(info).strip() != "信息: 没有运行的任务匹配指定标准。":
  25. print("++++再补一刀++++")
  26. start_word2html_app()
  27. else:
  28. print("----word is closed----")
  29. def call_c_shape(doc_file):
  30. # r = None
  31. # for i in range(1):
  32. try:
  33. r = requests.get(r"http://localhost:9001/word/?name={}".format(doc_file), timeout=90)
  34. r.close()
  35. except:
  36. r = None
  37. start_word2html_app()
  38. # break
  39. # if r is not None:
  40. # break
  41. # else:
  42. # start_word2html_app()
  43. check_pid()
  44. return r
  45. def _word2html(filename):
  46. r = call_c_shape(filename)
  47. clean_html = os.path.splitext(filename)[0] + "_clean.html"
  48. if r is None or r.text == 4 or not os.path.isfile(clean_html):
  49. return None
  50. else:
  51. with open(clean_html, "r", encoding="utf-8") as f:
  52. return f.read()
  53. def word2html(filename_in):
  54. filename_in = os.path.abspath(filename_in)
  55. filepath, filename = os.path.split(filename_in)
  56. filepath2 = os.path.join(filepath, to_pinyin_camel(filename))
  57. if filename_in != filepath2:
  58. shutil.copy(filename_in, filepath2)
  59. return _word2html(filepath2)
  60. def c_shape_html2text(text):
  61. text = text.replace("<html>", "").replace("</html>", "")
  62. text = text.replace("<head>", "").replace("</head>", "")
  63. text = text.replace("<body>", "").replace("</body>", "")
  64. text = text.replace("<h1>", "").replace("</h1>", "")
  65. text = text.replace("<h2>", "").replace("</h2>", "")
  66. text = text.replace("<h3>", "").replace("</h3>", "")
  67. text = text.replace("<h4>", "").replace("</h4>", "")
  68. text = text.replace("<h5>", "").replace("</h5>", "")
  69. text = text.replace("<h6>", "").replace("</h6>", "")
  70. text = text.replace("<p>", "").replace("</p>", "")
  71. text = text.replace("<div>", "").replace("</div>", "")
  72. # text = text.replace("&nbsp;", " ")
  73. text = re.sub(r"<img.*?>", "", str(text))
  74. return text
  75. def word2text(word):
  76. text = word2html(word)
  77. if text:
  78. text = c_shape_html2text(text)
  79. return text
  80. if __name__ == "__main__":
  81. print(word2text("D:\yx\ocr\英语试卷.doc"))
  82. # print(word2text("upload/0.docx"))