server_tools.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257
  1. #!/usr/bin/env/python
  2. # -*- coding:utf-8 -*-
  3. import os
  4. import re
  5. import json
  6. import datetime
  7. import time
  8. import random
  9. import hashlib
  10. import requests
  11. from retrying import retry
  12. from PIL import Image
  13. from flask import make_response
  14. from func_timeout import func_set_timeout
  15. from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
  16. # from photo_upload import upload_replace_image, upload_img_to_ucloud
  17. from photo_upload_qcloud import upload_replace_image, upload_img_to_ucloud
  18. from structure_mian import WordParseStructure
  19. import ps_configs as config
  20. from ps_configs import myLog
  21. logger = myLog(__name__).getlog()
  22. def pin(dirpath):
  23. from xpinyin import Pinyin
  24. pinyin_converter = Pinyin()
  25. need_modify = os.path.basename(dirpath)
  26. res = os.path.dirname(dirpath)
  27. aft_modify = pinyin_converter.get_pinyin(need_modify, '_')
  28. aft_modify = re.sub(r"[(())+\-]", "", str(aft_modify))
  29. aft_modify = re.sub(r"\s", "", str(aft_modify))
  30. b, h = str(aft_modify).split(".")
  31. now_time = datetime.datetime.now()
  32. time_str = datetime.datetime.strftime(now_time, '%Y_%m_%d_%H_%M_%S')
  33. aft_modify = str(b) + '__' + str(random.random())
  34. if isinstance(aft_modify, str):
  35. aft_modify = aft_modify.encode("utf-8")
  36. md = hashlib.md5()
  37. md.update(aft_modify)
  38. aft_name = str(md.hexdigest() + '__' + time_str + '.' + h)
  39. print("aft_name-----", aft_name)
  40. bef = os.path.join(res, need_modify)
  41. aft = os.path.join(res, aft_name)
  42. os.rename(bef, aft)
  43. return aft
  44. # 生成html工具 wordbin
  45. @retry(stop_max_attempt_number=2, wait_fixed=1) # 最大重试2次,2次全部报错,才会报错
  46. def call_c_shape(doc_file):
  47. res = requests.get(r"http://localhost:9001/word/?name={}".format(doc_file), timeout=70)
  48. return res
  49. def start_word2html_app(kill_mathtype=True):
  50. if kill_mathtype:
  51. os.system("taskkill /f /im MathType.exe")
  52. os.system("taskkill /f /im WINWORD.EXE")
  53. os.system("taskkill /f /im ConsoleApplication1.exe")
  54. # os.system('"C:\Program Files (x86)\MathType\MathType.exe" -server')
  55. os.system("start {}".format(config.wordbin_exe)) # start 在新窗口中打开
  56. def check_fault_pid():
  57. command1 = 'tasklist /fi "imagename eq WINWORD.EXE"'
  58. command2 = 'tasklist /fi "imagename eq WerFault.exe"'
  59. r1 = os.popen(command1)
  60. info1 = r1.read() # 读取命令行的输出到一个list
  61. if str(info1).strip() != "信息: 没有运行的任务匹配指定标准。":
  62. print("++++出现office word 宏提醒,开始kill ++++")
  63. start_word2html_app()
  64. return 1
  65. else:
  66. print("-----没有word问题报告弹窗------")
  67. r2 = os.popen(command2)
  68. info2 = r2.read()
  69. if str(info2).strip() != "信息: 没有运行的任务匹配指定标准。":
  70. print("++++出现《问题报告》弹窗,开始kill ++++")
  71. os.system("taskkill /f /im WerFault.exe")
  72. start_word2html_app()
  73. return 1
  74. else:
  75. print("-----没有wordbin问题报告弹窗------")
  76. return 0
  77. def get_html(doc_file): # doc_file:文件绝对路径名
  78. """调wordbin获取html文件"""
  79. stime2 = time.time()
  80. try:
  81. r = call_c_shape(doc_file)
  82. # executor1 = ProcessPoolExecutor(3) # wordbin不支持多线程
  83. # word2html = executor1.submit(call_c_shape, doc_file)
  84. # word2html = word2html.result()
  85. # r = word2html.text
  86. # executor1.shutdown(wait=True)
  87. if r.text == 4:
  88. check_fault_pid()
  89. return "html文件生成失败", 0
  90. except:
  91. is_kill = check_fault_pid()
  92. if not is_kill:
  93. start_word2html_app(kill_mathtype=True)
  94. return "试卷格式有问题", 0
  95. etime2 = time.time()
  96. try:
  97. html = open(doc_file.replace(".docx", "_clean.html").replace(".doc", "_clean.html"), 'r', encoding="utf-8")
  98. return html.read(), etime2 - stime2
  99. except:
  100. return "试卷格式有问题", etime2 - stime2 # 也可能超时
  101. @func_set_timeout(120)
  102. def parse_word(doc_file, filename_root, flag=0, sid=0, upload_id='0000', consumer='phy'):
  103. # if consumer=='phy':
  104. # from parse_v1.non_template_word_parse_phy import WordParseStructure
  105. # else:
  106. # from parse_v2.non_template_word_parse_new import WordParseStructure
  107. logger.info("----【upload_id:{}】开始解析文件: {} -------".format(upload_id, doc_file))
  108. html, wordbin_time = get_html(doc_file)
  109. logger.info('----【upload_id:{}】解析中wordbin服务时间:{}'.format(upload_id, wordbin_time))
  110. if html in ["html文件生成失败", "试卷格式有问题"]:
  111. res = {"errcode": 1,
  112. "errmsgs": "word读取失败。1.请确定试卷版面格式是否为常见的试卷排版格式,删除不规范格式\n2.请尝试将内容粘贴复制到一份空白word再上传\n"
  113. "3.若上传文档题目太多,请分两次上传\n4.重点检查图片(设为嵌入式)和换行"}
  114. logger.info('----【upload_id:{}】wordbin报错----'.format(upload_id))
  115. return res, "wordbin报错,未知"
  116. # log_f.write("\n----{}解析中wordbin服务时间:{}".format(file_name, wordbin_time))
  117. else:
  118. num = int(os.path.split(os.path.split(doc_file)[0])[1]) # word文件所在文件夹
  119. # todo if flag==1 --> upload then upload image and replaced online address; else static
  120. # http://zxhx-test + .cn-bj.ufileos.com/teacher/uploadfiles/wording + /52/2020/04/21 + /5e9ea1ec2e28f.png
  121. # flag存在時,sid必存在,上傳圖片到綫上
  122. # print('flag:', flag)
  123. images_url = "http://{}:{}/{}/{}/files/".format(config.server_ip, config.server_port, "ser_static", num)
  124. put_key_list, localfile_list = [], []
  125. if str(flag) == "1":
  126. # 图片替换
  127. data = upload_replace_image(filename_root, sid, html)
  128. if isinstance(data, dict): # 图片替换失败
  129. # print("图片替换线上地址失败")
  130. logger.info('----【upload_id:{}】图片替换线上地址失败----'.format(upload_id))
  131. return data, "图片替换线上地址失败,未知"
  132. else:
  133. images_url, put_key_list, localfile_list = data
  134. try:
  135. # 开始结构化解析
  136. res, paper_type = WordParseStructure(html, images_url).structure()
  137. # pprint.pprint(res)
  138. # 解析成功后再上传图片比较好,节约空间
  139. if not res["errcode"] and str(flag) == "1":
  140. # print("开始上传图片到cloud,并替换成线上地址")
  141. logger.info('----【upload_id:{}】开始上传图片到cloud,并替换成线上地址----'.format(upload_id))
  142. stime3 = time.time()
  143. if not localfile_list: # 直接解析没有图片上传
  144. pass
  145. else:
  146. # TODO 一个进程解析,一个进程上传
  147. executor1 = ProcessPoolExecutor(5)
  148. executor1.map(upload_img_to_ucloud, zip(put_key_list, localfile_list))
  149. executor1.shutdown(wait=True) # 进程池内部的进程都执行完毕,才会关闭,然后执行后续代码
  150. etime3 = time.time()
  151. img_upload_time = etime3 - stime3
  152. logger.info("----【upload_id:{}】,图片上传时间img_upload_time:{}".format(upload_id, img_upload_time))
  153. except:
  154. logger.info('----【upload_id:{}】试题结构化或图片上传cloud报错----'.format(upload_id))
  155. res = {"errcode": 1,
  156. "errmsgs": "解析失败。1.请确定试卷版面格式是否为常见的试卷排版格式,删除不规范格式\n"
  157. "2.若上传文档题目太多,请分两次上传\n3.重点检查图片(设为嵌入式)和换行"}
  158. paper_type = "解析报错,未知"
  159. print("\n+++++++++解析结果结束++++++++++\n")
  160. return res, paper_type
  161. def save_post_file(parse_res, fname, id):
  162. """保存回调的数据解析结果"""
  163. b, h = str(os.path.basename(fname)).split(".")
  164. now_time = datetime.datetime.now()
  165. time_str = datetime.datetime.strftime(now_time, '%Y_%m_%d_%H_%M_%S')
  166. aft_modify = (str(b) + '__' + str(random.random())).encode("utf-8")
  167. aft_name = hashlib.md5(aft_modify).hexdigest() + '__' + time_str + '__' + str(id) + '__' + b +'.json'
  168. file1 = config.RES_FOLDER
  169. if not os.path.isdir(file1):
  170. os.makedirs(file1)
  171. new_fpath = os.path.join(file1, aft_name)
  172. re_f = open(new_fpath, 'w', encoding='utf-8')
  173. json.dump(parse_res, re_f)
  174. return new_fpath
  175. def Response_headers(content):
  176. resp = make_response(content)
  177. resp.headers['Access-Control-Allow-Headers'] = "x-requested-with,content-type,Authorization"
  178. resp.headers['Access-Control-Allow-Methods'] = "POST,GET,OPTIONS"
  179. resp.headers['Access-Control-Allow-Origin'] = '*'
  180. return resp
  181. # 修改图片大小
  182. def resize_img(img1, img2):
  183. """
  184. 将图片进行压缩,两个地址相同,均为图片的绝对地址,目前该函数弃用,图片不再进行压缩
  185. :param img1: D://upload/23.png
  186. :param img2: D://upload/23.png
  187. :return:
  188. """
  189. im = Image.open(img1)
  190. if im.size[0] > 4000: # 2020/3/20+
  191. ratio = 4000 / im.size[0]
  192. thumb = im.resize((int(im.size[0] * ratio), int(im.size[1] * ratio)), Image.ANTIALIAS)
  193. thumb.save(img2)
  194. def img_rep(cont):
  195. """
  196. 获取文本中含有的本地图片流
  197. :param cont:文本
  198. :return: 图片流
  199. """
  200. import base64
  201. img_stream = ''
  202. all_photo = re.findall("<img src=\"([^\"]+?)\".*?/>", cont)
  203. if all_photo:
  204. for src in all_photo:
  205. img_path = src.split(" ")[0]
  206. img_local_path = "F:/word_uploads/" + img_path.split("ser_static/")[1].replace("\"", "")
  207. print(img_local_path)
  208. with open(img_local_path, 'rb') as img_f:
  209. img_stream = img_f.read()
  210. img_stream = base64.b64encode(img_stream)
  211. cont = cont.replace(src, "")
  212. requests.post(img_path)
  213. return cont
  214. def return_img_stream(img_local_path):
  215. """
  216. 工具函数:
  217. 获取本地图片流
  218. :param img_local_path:文件单张图片的本地绝对路径
  219. :return: 图片流
  220. """
  221. import base64
  222. img_stream = ''
  223. with open(img_local_path, 'rb') as img_f:
  224. img_stream = img_f.read()
  225. img_stream = base64.b64encode(img_stream).decode()
  226. return img_stream