server.py 11 KB


#!/usr/bin/env python
  2. # -*- coding:utf-8 -*-
  3. # from gevent import monkey
  4. # monkey.patch_all() # 打上猴子补丁
  5. import pickle
  6. from flask import Flask, render_template, send_from_directory
  7. from flask import request, redirect, Response
  8. from flask_cors import *
  9. import re
  10. import traceback
  11. from flask.views import MethodView
  12. from multiprocessing import Process, Queue
  13. import configs
  14. from structure.danti_structure import single_parse
  15. from structure.structure_main import WordParseStructure
  16. import os, datetime, hashlib
  17. import time, json, random
  18. import pprint
  19. from utils.ruku_opera import Ruku
  20. from pprint import pprint
# Module-level logger obtained from the project's configs.myLog helper,
# using the "reparse_ruku_log" log category.
logger = configs.myLog(__name__, log_cate="reparse_ruku_log").getlog()
# The Flask application that every route in this module is registered on.
app = Flask(__name__)
# NOTE(review): debug mode is switched on unconditionally here - confirm this
# module is never served in production with this setting.
app.debug = True
# Enable CORS on the application, with credentialed requests allowed.
CORS(app, supports_credentials=True)
  25. @app.route('/word_structure', methods=["GET", "POST"])
  26. def word_structure():
  27. """
  28. word 批量结构化
  29. :return:
  30. """
  31. mydata = request.json.get("sci_html_data", "")
  32. is_reparse = request.json.get("is_reparse", "0")
  33. word_id = request.json.get("paper_id", 0)
  34. source = request.json.get("source", "zxhx")
  35. subject = request.json.get("subject", "")
  36. must_latex = request.json.get("must_latex", 1) # 非必传
  37. print("【再解析】==request.POST.dict==>is_reparse:{}, word_id:{}".format(is_reparse, word_id))
  38. # print(mydata)
  39. loginfo = {"log_level": "info",
  40. "request_ip": request.remote_addr,
  41. "receive_data": {"paper_id": word_id,
  42. "is_reparse": is_reparse,
  43. "source": source,
  44. "subject": subject},
  45. "task_name": "批量文本结构化解析"}
  46. # 接收的文件记录一下,按wordid命名
  47. # time_str = datetime.datetime.strftime(datetime.datetime.now(), '%Y_%m_%d_%H_%M_%S')
  48. # new_fpath = os.path.join(configs.FAIL_FOLDER, str(time_str)+".html")
  49. if not word_id:
  50. word_id = get_wordid(mydata)
  51. loginfo["receive_data"]["is_auto_id"] = 1
  52. loginfo["receive_data"]["paper_id"] = word_id
  53. logger.info(json.dumps(loginfo, ensure_ascii=False))
  54. if word_id:
  55. getfile_savepath = os.path.dirname(os.getcwd()) + '\\accept_files\\' + str(word_id) + ".html"
  56. if os.path.exists(getfile_savepath):
  57. print("同一份wordid文件发送多次:{}".format(word_id))
  58. re_f = open(getfile_savepath, 'w', encoding='utf-8')
  59. re_f.write(mydata)
  60. re_f.close()
  61. result = {"errcode": 0, "errmsgs": "", "data": {}}
  62. st1 = time.time()
  63. try:
  64. if int(is_reparse) and word_id: # 再解析
  65. res, paper_type = WordParseStructure(mydata, str(word_id), int(is_reparse), must_latex, source, subject)()
  66. # print(res)
  67. if "errcode" not in res:
  68. result["data"] = res
  69. else:
  70. result = res
  71. print("【再解析】==解析结束==> word_id:{}".format(word_id))
  72. elif not int(is_reparse) and mydata: # 不是再解析
  73. res, paper_type = WordParseStructure(mydata, "", source=source, subject=subject)()
  74. # print(res)
  75. if "errcode" not in res:
  76. result["data"] = res
  77. else:
  78. result = res
  79. print("【再解析】==解析结束==> word_id:{}".format(word_id))
  80. else:
  81. result["errmsgs"] = "无data或paper_id"
  82. result["errcode"] = 1
  83. if result["errcode"]:
  84. logger.info(json.dumps({"log_level": "info",
  85. "paper_id": word_id,
  86. "status": "解析失败",
  87. "errmsg": result["errmsgs"],
  88. "task_time": time.time() - st1,
  89. "task_name": "批量文本结构化解析"}, ensure_ascii=False))
  90. else:
  91. logger.info(json.dumps({"log_level": "info",
  92. "paper_id": word_id,
  93. "status": "解析成功",
  94. "task_time": time.time() - st1,
  95. "task_name": "批量文本结构化解析"}, ensure_ascii=False))
  96. except Exception as e:
  97. # 先保存文件
  98. # now_time = datetime.datetime.now()
  99. # time_str = datetime.datetime.strftime(now_time, '%Y_%m_%d_%H_%M_%S')
  100. # aft_modify = (str(random.random())).encode("utf-8")
  101. # aft_name = hashlib.md5(aft_modify).hexdigest() + '__' + time_str + '.json'
  102. print("【再解析】==解析失败==> word_id:{}, 报错信息:{}".format(word_id, traceback.format_exc()))
  103. new_fpath = configs.FAIL_FOLDER + '/' + str(word_id) + '.json'
  104. re_f = open(new_fpath, 'w', encoding='utf-8')
  105. json.dump(mydata, re_f, ensure_ascii=False)
  106. result["errmsgs"] = "解析失败"
  107. result["errcode"] = 1
  108. logger.info(json.dumps({"log_level": "warn",
  109. "paper_id": word_id,
  110. "status": "解析失败",
  111. "errmsg": str(e),
  112. "task_time": time.time() - st1,
  113. "task_name": "批量文本结构化解析"}, ensure_ascii=False))
  114. return json.dumps(result, ensure_ascii=False)
  115. @app.route('/danti_structure', methods=["GET", "POST"])
  116. def danti_structure():
  117. """
  118. 单题再解析、结构化
  119. :return:
  120. """
  121. word_id = request.json.get("paper_id", 0)
  122. one_item = request.json.get("single_item_data", "")
  123. item_type = request.json.get("item_type", "")
  124. source = request.json.get("source", "zxhx")
  125. subject = request.json.get("subject", "")
  126. print("【单题解析】==request.POST.dict==>word_id:{}, item_type:{}".format(word_id, item_type))
  127. # logger.info("【单题解析】==request.POST.single_item_data==>\n{}\n".format(one_item))
  128. print(word_id, item_type)
  129. loginfo = {"log_level": "info",
  130. "request_ip": request.remote_addr,
  131. "receive_data": {"paper_id": word_id,
  132. "item_type": item_type,
  133. "source": source,
  134. "subject": subject},
  135. "task_name": "单题解析"}
  136. if not word_id:
  137. word_id = get_wordid(one_item)
  138. loginfo["receive_data"]["is_auto_id"] = 1
  139. loginfo["receive_data"]["paper_id"] = word_id
  140. logger.info(json.dumps(loginfo, ensure_ascii=False))
  141. res = {"errcode": 0, "errmsgs":"", "data": {}}
  142. if item_type:
  143. one_res = single_parse(one_item, item_type, word_id, source, subject)
  144. # pprint(one_res)
  145. if type(one_res) == str:
  146. res["errcode"] = 1
  147. res["errmsgs"] = one_res
  148. else:
  149. res["data"] = one_res
  150. else:
  151. res["errcode"] = 1
  152. res["errmsgs"] = "没有选定题型"
  153. return json.dumps(res, ensure_ascii=False)
  154. # class rukuAPI(MethodView):
  155. # def post(self):
  156. # self.wordid = request.json.get("paper_id", "")
  157. # subject = request.json.get("subject", "")
  158. # items_list = request.json.get("structured_items", "") # 结构化试题
  159. # ocr_html_data = request.json.get("html_data", "") # 文本原始内容
  160. # svg_data = request.json.get("svgs", {"svg_html_data": "", "svg_path": ""}) # mathjax的相关文本
  161. # callback_info = request.json.get("user", {"callback_url": "", "source": ""})
  162. # def upload_img(self):
  163. # return
  164. @app.route('/ruku', methods=["POST"])
  165. def ruku():
  166. wordid = request.json.get("paper_id", "")
  167. subject = request.json.get("subject", "") # 实际传的subject_id(int型)
  168. items_list = request.json.get("structured_items", "") # 结构化试题
  169. ocr_html_data = request.json.get("html_data", "") # 文本原始内容
  170. svg_data = request.json.get("svgs", {"svg_html_data": "", "svg_path": ""}) # mathjax的相关文本
  171. callback_info = request.json.get("user", {"callback_url": "", "source": ""})
  172. print("【入库】==request.POST.dict==>word_id:{}, callback_info:{},subject:{}"
  173. .format(wordid, callback_info, subject))
  174. loginfo = {"log_level": "info",
  175. "request_ip": request.remote_addr,
  176. "receive_data": {"paper_id": wordid,
  177. "subject": subject,
  178. "callback_info": callback_info},
  179. "task_name": "保存入库"}
  180. # print(svg_data["svg_html_data"])
  181. # pickle.dump(items_list, open("./struct_items622.pickle", 'wb'))
  182. # pickle.dump(svg_data, open("./svg_data622.pickle", 'wb'))
  183. # logger.info("【入库】word_id:{}==公式数据==>svg_data:{}\n".format(wordid, str(svg_data).encode("utf8", "ignore")))
  184. if not wordid:
  185. wordid = get_wordid(str(items_list))
  186. loginfo["receive_data"]["is_auto_id"] = 1
  187. loginfo["receive_data"]["paper_id"] = wordid
  188. logger.info(json.dumps(loginfo, ensure_ascii=False))
  189. if wordid and items_list:
  190. st1 = time.time()
  191. # try:
  192. res = Ruku(items_list, ocr_html_data, svg_data, str(wordid), callback_info, subject).save()
  193. logger.info("【入库】==结束==> word_id:{}".format(wordid))
  194. # pprint(res)
  195. logger.info(json.dumps({"log_level": "info",
  196. "paper_id": wordid,
  197. "status": "任务结束",
  198. "task_time": time.time() - st1,
  199. "task_name": "保存入库"}, ensure_ascii=False))
  200. return json.dumps(res, ensure_ascii=False)
  201. # except Exception as e:
  202. # # print
  203. # logger.info("【入库】==失败==> word_id:{}\n{}".format(wordid, e))
  204. # return json.dumps({"errcode":1, "errmsgs": "入库失败!", "data":{}}, ensure_ascii=False)
  205. else:
  206. return "需要paperid"
  207. @app.route('/ser_static/<path:file_path>', methods=["GET"])
  208. def ser_static(file_path): # endpoint的位置是函数接口名,不能用static,与flask内部变量重名
  209. """
  210. :param file_path: 图片的本地绝对路径
  211. :return:
  212. """
  213. return send_from_directory(configs.IMG_FOLDER, file_path)
  214. def get_wordid(mydata):
  215. """
  216. 针对传参中没有wordid的情况,提取或生成wordid
  217. :return:
  218. """
  219. wordid_info1 = re.search("/(zyk/uploadfiles/wording|ser_static)/(\d+)/", mydata)
  220. wordid_info2 = re.search("/imgpaper/lqy_upload/(\d+)/", mydata)
  221. if wordid_info1:
  222. word_id = wordid_info1.group(2)
  223. elif wordid_info2:
  224. word_id = wordid_info2.group(1)
  225. else:
  226. name_list = random.sample(range(100000, 999999), 1)
  227. word_id = str(int(time.time())) + str(name_list[0])
  228. # md = hashlib.md5()
  229. # md.update(word_id_temp.encode("utf-8"))
  230. # word_id = str(md.hexdigest())
  231. return word_id
if __name__ == "__main__":
    # Run the application with Flask's built-in server on the host and port
    # taken from configs, with threaded request handling and debug enabled.
    app.run(host=configs.server_ip, port=configs.server_port, threaded=True, debug=True)  # threaded=True
    # app.run(processes=4)  # multi-process and multi-thread are mutually exclusive; enable only one
    # 5fc64a0a4994183dda7e74b9
    # Disabled alternative: serve the app through gevent's WSGI server.
    # from gevent import pywsgi
    # # app.debug = True
    # server = pywsgi.WSGIServer((configs.server_ip, configs.server_port), app)
    # server.serve_forever()