#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Flask service exposing the paper structuring / re-parsing / storage endpoints.

Routes defined here:

* ``/word_structure``   - batch structuring of a whole paper (HTML text).
* ``/danti_structure``  - structuring / re-parsing of a single item.
* ``/ruku``             - hand structured items to ``Ruku(...).save()``.
* ``/ser_static/<path>`` - serve files from ``configs.IMG_FOLDER``.
"""
# Optional gevent mode (disabled):
# from gevent import monkey
# monkey.patch_all()  # apply the monkey patch
import pickle
from flask import Flask, render_template, send_from_directory
from flask import request, redirect, Response
from flask_cors import *
import re
import traceback
from flask.views import MethodView
from multiprocessing import Process, Queue
import configs
from structure.danti_structure import single_parse
from structure.structure_main import WordParseStructure
import os
import datetime
import hashlib
import time
import json
import random
import pprint
from utils.ruku_opera import Ruku
from pprint import pprint

logger = configs.myLog(__name__, log_cate="reparse_ruku_log").getlog()

app = Flask(__name__)
app.debug = True
CORS(app, supports_credentials=True)

# (compiled pattern, index of the capture group holding the id), tried in order.
_WORDID_PATTERNS = (
    (re.compile(r"/(zyk/uploadfiles/wording|ser_static)/(\d+)/"), 2),
    (re.compile(r"/imgpaper/lqy_upload/(\d+)/"), 1),
)


def _dumps(obj):
    """Serialize *obj* to JSON, keeping non-ASCII characters unescaped."""
    return json.dumps(obj, ensure_ascii=False)


def _resolve_word_id(word_id, fallback_text, loginfo):
    """Return *word_id*, or derive one from *fallback_text* when it is falsy.

    When an id has to be derived, ``loginfo["receive_data"]`` is updated in
    place with ``is_auto_id = 1`` and the derived ``paper_id``.
    """
    if word_id:
        return word_id
    word_id = get_wordid(fallback_text)
    loginfo["receive_data"]["is_auto_id"] = 1
    loginfo["receive_data"]["paper_id"] = word_id
    return word_id


@app.route('/word_structure', methods=["GET", "POST"])
def word_structure():
    """Batch-structure a whole paper.

    Reads from the JSON request body: ``sci_html_data`` (text to parse),
    ``is_reparse``, ``paper_id``, ``source``, ``subject`` and the optional
    ``must_latex``.

    :return: JSON string with keys ``errcode``, ``errmsgs`` and ``data``
        (or whatever dict the parser returned when it contains ``errcode``).
    """
    payload = request.json
    mydata = payload.get("sci_html_data", "")
    is_reparse = payload.get("is_reparse", "0")
    word_id = payload.get("paper_id", 0)
    source = payload.get("source", "zxhx")
    subject = payload.get("subject", "")
    must_latex = payload.get("must_latex", 1)  # optional field
    print("【再解析】==request.POST.dict==>is_reparse:{}, word_id:{}".format(is_reparse, word_id))

    loginfo = {"log_level": "info",
               "request_ip": request.remote_addr,
               "receive_data": {"paper_id": word_id,
                                "is_reparse": is_reparse,
                                "source": source,
                                "subject": subject},
               "task_name": "批量文本结构化解析"}
    word_id = _resolve_word_id(word_id, mydata, loginfo)
    logger.info(_dumps(loginfo))

    # Keep a copy of the received text, named after the paper id.
    if word_id:
        # NOTE(review): the separator is hard-coded Windows style, and word_id
        # comes straight from the request (not sanitised) - confirm both are
        # acceptable for the deployment target.
        getfile_savepath = os.path.dirname(os.getcwd()) + '\\accept_files\\' + str(word_id) + ".html"
        if os.path.exists(getfile_savepath):
            print("同一份wordid文件发送多次:{}".format(word_id))
        # The context manager closes the handle even if the write fails.
        with open(getfile_savepath, 'w', encoding='utf-8') as accept_file:
            accept_file.write(mydata)

    result = {"errcode": 0, "errmsgs": "", "data": {}}
    st1 = time.time()
    try:
        reparse_flag = int(is_reparse)
        if reparse_flag and word_id:  # re-parse of a known paper
            parser = WordParseStructure(mydata, str(word_id), reparse_flag,
                                        must_latex, source, subject)
        elif not reparse_flag and mydata:  # first-time parse
            parser = WordParseStructure(mydata, "", source=source, subject=subject)
        else:
            parser = None

        if parser is None:
            result["errmsgs"] = "无data或paper_id"
            result["errcode"] = 1
        else:
            res, _paper_type = parser()
            if "errcode" not in res:
                result["data"] = res
            else:
                # The parser reported its own error payload; return it as is.
                result = res
            print("【再解析】==解析结束==> word_id:{}".format(word_id))

        if result["errcode"]:
            logger.info(_dumps({"log_level": "info",
                                "paper_id": word_id,
                                "status": "解析失败",
                                "errmsg": result["errmsgs"],
                                "task_time": time.time() - st1,
                                "task_name": "批量文本结构化解析"}))
        else:
            logger.info(_dumps({"log_level": "info",
                                "paper_id": word_id,
                                "status": "解析成功",
                                "task_time": time.time() - st1,
                                "task_name": "批量文本结构化解析"}))
    except Exception as e:
        # Top-level boundary: dump the offending input first, then report.
        print("【再解析】==解析失败==> word_id:{}, 报错信息:{}".format(word_id, traceback.format_exc()))
        new_fpath = configs.FAIL_FOLDER + '/' + str(word_id) + '.json'
        with open(new_fpath, 'w', encoding='utf-8') as fail_file:
            json.dump(mydata, fail_file, ensure_ascii=False)
        result["errmsgs"] = "解析失败"
        result["errcode"] = 1
        logger.info(_dumps({"log_level": "warn",
                            "paper_id": word_id,
                            "status": "解析失败",
                            "errmsg": str(e),
                            "task_time": time.time() - st1,
                            "task_name": "批量文本结构化解析"}))
    return _dumps(result)


@app.route('/danti_structure', methods=["GET", "POST"])
def danti_structure():
    """Re-parse / structure a single item.

    Reads from the JSON request body: ``paper_id``, ``single_item_data``,
    ``item_type``, ``source`` and ``subject``.

    :return: JSON string with keys ``errcode``, ``errmsgs`` and ``data``.
        ``errcode`` is 1 when no ``item_type`` was given or when
        ``single_parse`` returned a string (treated as the error message).
    """
    payload = request.json
    word_id = payload.get("paper_id", 0)
    one_item = payload.get("single_item_data", "")
    item_type = payload.get("item_type", "")
    source = payload.get("source", "zxhx")
    subject = payload.get("subject", "")
    print("【单题解析】==request.POST.dict==>word_id:{}, item_type:{}".format(word_id, item_type))
    print(word_id, item_type)

    loginfo = {"log_level": "info",
               "request_ip": request.remote_addr,
               "receive_data": {"paper_id": word_id,
                                "item_type": item_type,
                                "source": source,
                                "subject": subject},
               "task_name": "单题解析"}
    word_id = _resolve_word_id(word_id, one_item, loginfo)
    logger.info(_dumps(loginfo))

    res = {"errcode": 0, "errmsgs": "", "data": {}}
    if not item_type:
        res["errcode"] = 1
        res["errmsgs"] = "没有选定题型"
        return _dumps(res)

    one_res = single_parse(one_item, item_type, word_id, source, subject)
    if isinstance(one_res, str):
        # A string result carries the error message.
        res["errcode"] = 1
        res["errmsgs"] = one_res
    else:
        res["data"] = one_res
    return _dumps(res)


@app.route('/ruku', methods=["POST"])
def ruku():
    """Pass structured items to ``Ruku(...).save()`` and return its result.

    Reads from the JSON request body: ``paper_id``, ``subject`` (per the
    original author's note this is actually an integer subject id),
    ``structured_items``, ``html_data`` (original text), ``svgs``
    (mathjax-related text) and ``user`` (callback information).

    :return: JSON string of the ``save()`` result, or the plain string
        ``"需要paperid"`` when the id or the items are missing.
    """
    payload = request.json
    wordid = payload.get("paper_id", "")
    subject = payload.get("subject", "")
    items_list = payload.get("structured_items", "")  # structured items
    ocr_html_data = payload.get("html_data", "")  # original text content
    svg_data = payload.get("svgs", {"svg_html_data": "", "svg_path": ""})  # mathjax-related text
    callback_info = payload.get("user", {"callback_url": "", "source": ""})
    print("【入库】==request.POST.dict==>word_id:{}, callback_info:{},subject:{}"
          .format(wordid, callback_info, subject))

    loginfo = {"log_level": "info",
               "request_ip": request.remote_addr,
               "receive_data": {"paper_id": wordid,
                                "subject": subject,
                                "callback_info": callback_info},
               "task_name": "保存入库"}
    wordid = _resolve_word_id(wordid, str(items_list), loginfo)
    logger.info(_dumps(loginfo))

    if not (wordid and items_list):
        return "需要paperid"

    st1 = time.time()
    # NOTE: error handling around save() was disabled by the original author,
    # so any exception raised here propagates to Flask.
    res = Ruku(items_list, ocr_html_data, svg_data, str(wordid), callback_info, subject).save()
    logger.info("【入库】==结束==> word_id:{}".format(wordid))
    logger.info(_dumps({"log_level": "info",
                        "paper_id": wordid,
                        "status": "任务结束",
                        "task_time": time.time() - st1,
                        "task_name": "保存入库"}))
    return _dumps(res)


# The route must declare the ``file_path`` variable the view expects; without
# it every request fails with a missing-argument TypeError. The ``path``
# converter is used because the URLs matched in get_wordid contain slashes
# (``/ser_static/<digits>/...``).
@app.route('/ser_static/<path:file_path>', methods=["GET"])
def ser_static(file_path):  # the endpoint name is the function name; "static" would clash with Flask's own
    """Serve a file from ``configs.IMG_FOLDER``.

    :param file_path: path of the requested file, resolved by
        ``send_from_directory`` against ``configs.IMG_FOLDER``
    :return: the response built by ``send_from_directory``
    """
    return send_from_directory(configs.IMG_FOLDER, file_path)


def get_wordid(mydata):
    """Extract a paper id from *mydata*, or generate one when none is found.

    The id is taken from the first URL fragment matching
    ``/zyk/uploadfiles/wording/<digits>/`` or ``/ser_static/<digits>/``,
    otherwise from ``/imgpaper/lqy_upload/<digits>/``. When neither matches,
    the id is the current Unix time (seconds) followed by a random 6-digit
    number.

    :param mydata: text to search; non-string values are converted with
        ``str()`` first
    :return: the paper id as a string
    """
    text = mydata if isinstance(mydata, str) else str(mydata)
    for pattern, group_index in _WORDID_PATTERNS:
        found = pattern.search(text)
        if found:
            return found.group(group_index)
    return str(int(time.time())) + str(random.randrange(100000, 999999))


if __name__ == "__main__":
    # Multi-process and multi-thread modes are mutually exclusive;
    # e.g. app.run(processes=4) would be the multi-process variant.
    app.run(host=configs.server_ip, port=configs.server_port, threaded=True, debug=True)

    # Alternative gevent server (disabled):
    # from gevent import pywsgi
    # server = pywsgi.WSGIServer((configs.server_ip, configs.server_port), app)
    # server.serve_forever()