server_tools2.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255
#!/usr/bin/env python
  2. # -*- coding:utf-8 -*-
  3. import os
  4. import re
  5. import json
  6. import datetime
  7. import time
  8. import random
  9. import hashlib
  10. # import traceback
  11. import requests
  12. from retrying import retry
  13. from PIL import Image
  14. from flask import make_response
  15. from func_timeout import func_set_timeout
  16. from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
  17. # from photo_upload import upload_replace_image, upload_img_to_ucloud
  18. from photo_upload_qcloud import upload_replace_image, upload_img_to_ucloud
  19. from structure.structure_mian import WordParseStructure
  20. RES_FOLDER = r'D:\zwj\word_non-temp_paper_structure\res_folder'
  21. server_ip = "49.233.23.58"
  22. server_file_port = "11088"
  23. wordbin_path = r"D:\word_bin\ConsoleApplication1.exe"
  24. def pin(dirpath):
  25. from xpinyin import Pinyin
  26. pinyin_converter = Pinyin()
  27. need_modify = os.path.basename(dirpath)
  28. res = os.path.dirname(dirpath)
  29. aft_modify = pinyin_converter.get_pinyin(need_modify, '_')
  30. aft_modify = re.sub(r"[(())+\-]", "", str(aft_modify))
  31. aft_modify = re.sub(r"\s", "", str(aft_modify))
  32. b, h = str(aft_modify).split(".")
  33. now_time = datetime.datetime.now()
  34. time_str = datetime.datetime.strftime(now_time, '%Y_%m_%d_%H_%M_%S')
  35. aft_modify = str(b) + '__' + str(random.random())
  36. if isinstance(aft_modify, str):
  37. aft_modify = aft_modify.encode("utf-8")
  38. md = hashlib.md5()
  39. md.update(aft_modify)
  40. aft_name = str(md.hexdigest() + '__' + time_str + '.' + h)
  41. print("aft_name-----", aft_name)
  42. bef = os.path.join(res, need_modify)
  43. aft = os.path.join(res, aft_name)
  44. os.rename(bef, aft)
  45. return aft
  46. # 生成html工具 wordbin
  47. @retry(stop_max_attempt_number=2, wait_fixed=1) # 最大重试2次,2次全部报错,才会报错
  48. def call_c_shape(doc_file):
  49. res = requests.get(r"http://localhost:9001/word/?name={}".format(doc_file), timeout=70)
  50. return res
  51. def start_word2html_app(kill_mathtype=True):
  52. if kill_mathtype:
  53. os.system("taskkill /f /im MathType.exe")
  54. os.system("taskkill /f /im WINWORD.EXE")
  55. os.system("taskkill /f /im ConsoleApplication1.exe")
  56. os.system("start {}".format(wordbin_path)) # start 在新窗口中打开
  57. def check_fault_pid():
  58. command1 = 'tasklist /fi "imagename eq WINWORD.EXE"'
  59. command2 = 'tasklist /fi "imagename eq WerFault.exe"'
  60. r1 = os.popen(command1)
  61. info1 = r1.read() # 读取命令行的输出到一个list
  62. if str(info1).strip() != "信息: 没有运行的任务匹配指定标准。":
  63. print("++++出现office word 宏提醒,开始kill ++++")
  64. start_word2html_app()
  65. return 1
  66. else:
  67. print("-----没有word问题报告弹窗------")
  68. r2 = os.popen(command2)
  69. info2 = r2.read()
  70. if str(info2).strip() != "信息: 没有运行的任务匹配指定标准。":
  71. print("++++出现《问题报告》弹窗,开始kill ++++")
  72. os.system("taskkill /f /im WerFault.exe")
  73. start_word2html_app()
  74. return 1
  75. else:
  76. print("-----没有wordbin问题报告弹窗------")
  77. return 0
  78. def get_html(doc_file): # doc_file:文件绝对路径名
  79. """调wordbin获取html文件"""
  80. stime2 = time.time()
  81. try:
  82. r = call_c_shape(doc_file)
  83. # executor1 = ProcessPoolExecutor(3) # wordbin不支持多线程
  84. # word2html = executor1.submit(call_c_shape, doc_file)
  85. # word2html = word2html.result()
  86. # r = word2html.text
  87. # executor1.shutdown(wait=True)
  88. if r.text == 4:
  89. check_fault_pid()
  90. return "html文件生成失败", 0
  91. except:
  92. is_kill = check_fault_pid()
  93. if not is_kill:
  94. start_word2html_app(kill_mathtype=True)
  95. return "试卷格式有问题", 0
  96. etime2 = time.time()
  97. try:
  98. html = open(doc_file.replace(".docx", "_clean.html").replace(".doc", "_clean.html"), 'r', encoding="utf-8")
  99. return html.read(), etime2 - stime2
  100. except:
  101. return "试卷格式有问题", etime2 - stime2 # 也可能超时
  102. @func_set_timeout(120)
  103. def parse_word(doc_file, filename_root, flag=0, sid=0):
  104. print("-------解析的文件名为: {} -------".format(doc_file))
  105. html, wordbin_time = get_html(doc_file)
  106. print('解析中wordbin服务时间:', wordbin_time)
  107. img_upload_time = 0
  108. if html in ["html文件生成失败", "试卷格式有问题"]:
  109. res = {"errcode": 1,
  110. "errmsgs": "word读取失败。1.请确定试卷版面格式是否为常见的试卷排版格式,删除不规范格式\n2.请尝试将内容粘贴复制到一份空白word再上传\n"
  111. "3.若上传文档题目太多,请分两次上传\n4.重点检查图片(设为嵌入式)和换行"}
  112. paper_type = "wordbin报错,未知"
  113. # log_f.write("\n----{}解析中wordbin服务时间:{}".format(file_name, wordbin_time))
  114. else:
  115. num = int(os.path.split(os.path.split(doc_file)[0])[1]) # word文件所在文件夹
  116. # todo if flag==1 --> upload then upload image and replaced online address; else static
  117. # http://zxhx-test + .cn-bj.ufileos.com + / + teacher/uploadfiles/wording + / 52 / 2020/04/21 + / + 5e9ea1ec2e28f.png"""
  118. # flag存在時,sid必存在,上傳圖片到綫上
  119. # print('flag:', flag)
  120. images_url = ""
  121. put_key_list, localfile_list = [], []
  122. if str(flag) == "1":
  123. # 图片替换
  124. data = upload_replace_image(filename_root, sid, html)
  125. if isinstance(data, dict): # 图片替换失败
  126. print("图片替换线上地址失败")
  127. res, paper_type = data, "图片替换线上地址失败,未知"
  128. return res, wordbin_time, paper_type, img_upload_time
  129. else:
  130. images_url, put_key_list, localfile_list = data
  131. else:
  132. # html = re.sub(r'<img src="files/', '<img src="' + str(images_url), html)
  133. images_url = "http://{}:{}/{}/{}/files/".format(server_ip, server_file_port, "ser_static", num)
  134. print("不用上传图片到cloud")
  135. try:
  136. # 开始结构化解析
  137. res, paper_type = WordParseStructure(html, images_url).structure()
  138. # 解析成功后再上传图片比较好,节约空间
  139. print(res["errcode"], str(flag))
  140. if not res["errcode"] and str(flag) == "1":
  141. print("开始上传图片到Ucloud,并替换成线上地址")
  142. stime3 = time.time()
  143. if not localfile_list: # 直接解析没有图片上传
  144. pass
  145. else:
  146. # TODO 一个进程解析,一个进程上传
  147. executor1 = ProcessPoolExecutor(5)
  148. executor1.map(upload_img_to_ucloud, zip(put_key_list, localfile_list))
  149. executor1.shutdown(wait=True) # 进程池内部的进程都执行完毕,才会关闭,然后执行后续代码
  150. etime3 = time.time()
  151. img_upload_time = etime3 - stime3
  152. print("---img_upload_time==>:{}".format(etime3 - stime3))
  153. except:
  154. res = {"errcode": 1,
  155. "errmsgs": "解析失败。1.请确定试卷版面格式是否为常见的试卷排版格式,删除不规范格式\n"
  156. "2.若上传文档题目太多,请分两次上传\n3.重点检查图片(设为嵌入式)和换行"}
  157. paper_type = "解析报错,未知"
  158. print("\n+++++++++解析结果结束++++++++++\n")
  159. return res, wordbin_time, paper_type, img_upload_time
  160. def save_post_file(parse_res, fname, id):
  161. """保存回调的数据解析结果"""
  162. b, h = str(os.path.basename(fname)).split(".")
  163. now_time = datetime.datetime.now()
  164. time_str = datetime.datetime.strftime(now_time, '%Y_%m_%d_%H_%M_%S')
  165. aft_modify = (str(b) + '__' + str(random.random())).encode("utf-8")
  166. aft_name = hashlib.md5(aft_modify).hexdigest() + '__' + time_str + '__' + str(id) + '__' + b +'.json'
  167. file1 = RES_FOLDER
  168. if not os.path.isdir(file1):
  169. os.makedirs(file1)
  170. new_fpath = os.path.join(file1, aft_name)
  171. re_f = open(new_fpath, 'w', encoding='utf-8')
  172. json.dump(parse_res, re_f)
  173. return new_fpath
  174. def Response_headers(content):
  175. resp = make_response(content)
  176. resp.headers['Access-Control-Allow-Headers'] = "x-requested-with,content-type,Authorization"
  177. resp.headers['Access-Control-Allow-Methods'] = "POST,GET,OPTIONS"
  178. resp.headers['Access-Control-Allow-Origin'] = '*'
  179. return resp
  180. # 修改图片大小
  181. def resize_img(img1, img2):
  182. """
  183. 将图片进行压缩,两个地址相同,均为图片的绝对地址,目前该函数弃用,图片不再进行压缩
  184. :param img1: D://upload/23.png
  185. :param img2: D://upload/23.png
  186. :return:
  187. """
  188. im = Image.open(img1)
  189. if im.size[0] > 4000: # 2020/3/20+
  190. ratio = 4000 / im.size[0]
  191. thumb = im.resize((int(im.size[0] * ratio), int(im.size[1] * ratio)), Image.ANTIALIAS)
  192. thumb.save(img2)
  193. def img_rep(cont):
  194. """
  195. 获取文本中含有的本地图片流
  196. :param cont:文本
  197. :return: 图片流
  198. """
  199. import base64
  200. img_stream = ''
  201. all_photo = re.findall("<img src=\"([^\"]+?)\".*?/>", cont)
  202. if all_photo:
  203. for src in all_photo:
  204. img_path = src.split(" ")[0]
  205. img_local_path = "F:/word_uploads/" + img_path.split("ser_static/")[1].replace("\"", "")
  206. print(img_local_path)
  207. with open(img_local_path, 'rb') as img_f:
  208. img_stream = img_f.read()
  209. img_stream = base64.b64encode(img_stream)
  210. cont = cont.replace(src, "")
  211. requests.post(img_path)
  212. return cont
  213. def return_img_stream(img_local_path):
  214. """
  215. 工具函数:
  216. 获取本地图片流
  217. :param img_local_path:文件单张图片的本地绝对路径
  218. :return: 图片流
  219. """
  220. import base64
  221. img_stream = ''
  222. with open(img_local_path, 'rb') as img_f:
  223. img_stream = img_f.read()
  224. img_stream = base64.b64encode(img_stream).decode()
  225. return img_stream