123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267 |
- #!/usr/bin/env python
- # -*- coding:utf-8 -*-
- # from gevent import monkey
- # monkey.patch_all() # 打上猴子补丁
- import pickle
- from flask import Flask, render_template, send_from_directory
- from flask import request, redirect, Response
- from flask_cors import *
- import re
- import traceback
- from flask.views import MethodView
- from multiprocessing import Process, Queue
- import configs
- from structure.danti_structure import single_parse
- from structure.structure_main import WordParseStructure
- import os, datetime, hashlib
- import time, json, random
- import pprint
- from utils.ruku_opera import Ruku
- from pprint import pprint
# Module-wide logger, created through the project's logging helper with the
# "reparse_ruku_log" log category.
logger = configs.myLog(__name__, log_cate="reparse_ruku_log").getlog()

# The Flask application object; debug mode is switched on explicitly.
app = Flask(__name__)
app.debug = True

# Enable CORS on the application, with credentialed requests allowed.
CORS(app, supports_credentials=True)
def _save_received_html(word_id, html_text):
    """Keep a copy of the received document as ``../accept_files/<word_id>.html``.

    The directory is resolved relative to the parent of the current working
    directory and is created when it does not exist yet.
    """
    # os.path.join instead of hard-coded "\\" separators so the path is also
    # correct on non-Windows hosts.
    accept_dir = os.path.join(os.path.dirname(os.getcwd()), "accept_files")
    os.makedirs(accept_dir, exist_ok=True)
    # NOTE(review): word_id may come straight from the request body and is used
    # as a file name without sanitising -- confirm callers are trusted.
    save_path = os.path.join(accept_dir, str(word_id) + ".html")
    if os.path.exists(save_path):
        print("同一份wordid文件发送多次:{}".format(word_id))
    with open(save_path, 'w', encoding='utf-8') as out_f:
        out_f.write(html_text)


def _word_structure_status_log(level, paper_id, status, started, **extra):
    """Build the JSON status line logged when a batch-structuring task ends.

    ``extra`` entries (e.g. ``errmsg``) are placed between ``status`` and
    ``task_time``.
    """
    record = {"log_level": level, "paper_id": paper_id, "status": status}
    record.update(extra)
    record["task_time"] = time.time() - started
    record["task_name"] = "批量文本结构化解析"
    return json.dumps(record, ensure_ascii=False)


@app.route('/word_structure', methods=["GET", "POST"])
def word_structure():
    """
    Batch-structure a whole document.

    Reads these keys from the JSON request body:
    ``sci_html_data`` (document text), ``is_reparse`` (default ``"0"``),
    ``paper_id`` (default ``0``), ``source`` (default ``"zxhx"``),
    ``subject`` and the optional ``must_latex`` (default ``1``).

    When no ``paper_id`` is supplied, one is extracted from the document text
    or generated by ``get_wordid``.  The received text is saved to disk before
    parsing; when parsing raises, the text is additionally dumped as JSON into
    ``configs.FAIL_FOLDER``.

    :return: JSON string of the form
             ``{"errcode": ..., "errmsgs": ..., "data": ...}``; ``errcode`` is
             1 when the input is insufficient or parsing raised.
    """
    # request.json may be None (e.g. a GET without a JSON body); fall back to
    # an empty payload so the normal "missing data" error is returned instead
    # of crashing on None.get.
    payload = request.get_json(silent=True) or {}
    mydata = payload.get("sci_html_data", "")
    is_reparse = payload.get("is_reparse", "0")
    word_id = payload.get("paper_id", 0)
    source = payload.get("source", "zxhx")
    subject = payload.get("subject", "")
    must_latex = payload.get("must_latex", 1)  # optional parameter
    print("【再解析】==request.POST.dict==>is_reparse:{}, word_id:{}".format(is_reparse, word_id))

    loginfo = {"log_level": "info",
               "request_ip": request.remote_addr,
               "receive_data": {"paper_id": word_id,
                                "is_reparse": is_reparse,
                                "source": source,
                                "subject": subject},
               "task_name": "批量文本结构化解析"}
    if not word_id:
        # No id supplied: derive one and flag it in the log record.
        word_id = get_wordid(mydata)
        loginfo["receive_data"]["is_auto_id"] = 1
        loginfo["receive_data"]["paper_id"] = word_id
    logger.info(json.dumps(loginfo, ensure_ascii=False))

    # Record the received document, named after the word id.
    if word_id:
        _save_received_html(word_id, mydata)

    result = {"errcode": 0, "errmsgs": "", "data": {}}
    st1 = time.time()
    try:
        reparse = int(is_reparse)
        if reparse and word_id:  # re-parse of a known paper
            parser = WordParseStructure(mydata, str(word_id), reparse, must_latex, source, subject)
        elif not reparse and mydata:  # first-time parse
            parser = WordParseStructure(mydata, "", source=source, subject=subject)
        else:
            parser = None

        if parser is None:
            result["errmsgs"] = "无data或paper_id"
            result["errcode"] = 1
        else:
            res, _ = parser()  # second element (paper type) is not used here
            # A result carrying "errcode" is already a full response envelope.
            if "errcode" not in res:
                result["data"] = res
            else:
                result = res
            print("【再解析】==解析结束==> word_id:{}".format(word_id))

        if result["errcode"]:
            logger.info(_word_structure_status_log(
                "info", word_id, "解析失败", st1, errmsg=result["errmsgs"]))
        else:
            logger.info(_word_structure_status_log("info", word_id, "解析成功", st1))
    except Exception as e:
        print("【再解析】==解析失败==> word_id:{}, 报错信息:{}".format(word_id, traceback.format_exc()))
        # Save the offending input first so the failure can be reproduced.
        new_fpath = configs.FAIL_FOLDER + '/' + str(word_id) + '.json'
        with open(new_fpath, 'w', encoding='utf-8') as fail_f:
            json.dump(mydata, fail_f, ensure_ascii=False)
        result["errmsgs"] = "解析失败"
        result["errcode"] = 1
        logger.info(_word_structure_status_log(
            "warn", word_id, "解析失败", st1, errmsg=str(e)))
    return json.dumps(result, ensure_ascii=False)
@app.route('/danti_structure', methods=["GET", "POST"])
def danti_structure():
    """
    Re-parse and structure a single question.

    Reads these keys from the JSON request body: ``paper_id`` (default ``0``),
    ``single_item_data`` (question text), ``item_type``, ``source``
    (default ``"zxhx"``) and ``subject``.  When no ``paper_id`` is supplied,
    one is derived from the question text by ``get_wordid``.

    :return: JSON string ``{"errcode": ..., "errmsgs": ..., "data": ...}``.
             ``errcode`` is 1 when ``item_type`` is missing or when
             ``single_parse`` returns a string (used as the error message).
    """
    # request.json may be None (e.g. a GET without a JSON body); fall back to
    # an empty payload instead of crashing on None.get.
    payload = request.get_json(silent=True) or {}
    word_id = payload.get("paper_id", 0)
    one_item = payload.get("single_item_data", "")
    item_type = payload.get("item_type", "")
    source = payload.get("source", "zxhx")
    subject = payload.get("subject", "")
    print("【单题解析】==request.POST.dict==>word_id:{}, item_type:{}".format(word_id, item_type))
    print(word_id, item_type)

    loginfo = {"log_level": "info",
               "request_ip": request.remote_addr,
               "receive_data": {"paper_id": word_id,
                                "item_type": item_type,
                                "source": source,
                                "subject": subject},
               "task_name": "单题解析"}
    if not word_id:
        # No id supplied: derive one and flag it in the log record.
        word_id = get_wordid(one_item)
        loginfo["receive_data"]["is_auto_id"] = 1
        loginfo["receive_data"]["paper_id"] = word_id
    logger.info(json.dumps(loginfo, ensure_ascii=False))

    res = {"errcode": 0, "errmsgs": "", "data": {}}
    if not item_type:
        # The question type is mandatory for single-question parsing.
        res["errcode"] = 1
        res["errmsgs"] = "没有选定题型"
        return json.dumps(res, ensure_ascii=False)

    one_res = single_parse(one_item, item_type, word_id, source, subject)
    if isinstance(one_res, str):
        # A string result is treated as an error message.
        res["errcode"] = 1
        res["errmsgs"] = one_res
    else:
        res["data"] = one_res
    return json.dumps(res, ensure_ascii=False)
- # class rukuAPI(MethodView):
- # def post(self):
- # self.wordid = request.json.get("paper_id", "")
- # subject = request.json.get("subject", "")
- # items_list = request.json.get("structured_items", "") # 结构化试题
- # ocr_html_data = request.json.get("html_data", "") # 文本原始内容
- # svg_data = request.json.get("svgs", {"svg_html_data": "", "svg_path": ""}) # mathjax的相关文本
- # callback_info = request.json.get("user", {"callback_url": "", "source": ""})
- # def upload_img(self):
- # return
@app.route('/ruku', methods=["POST"])
def ruku():
    """
    Save structured questions through ``Ruku(...).save()``.

    Reads these keys from the JSON request body: ``paper_id``, ``subject``,
    ``structured_items``, ``html_data``, ``svgs`` (default
    ``{"svg_html_data": "", "svg_path": ""}``) and ``user`` (default
    ``{"callback_url": "", "source": ""}``).  When no ``paper_id`` is supplied,
    one is derived from the stringified items by ``get_wordid``.

    Exceptions raised by ``Ruku(...).save()`` are not caught here.

    :return: JSON string of whatever ``Ruku(...).save()`` returns, or the plain
             text ``"需要paperid"`` when the id or the items are empty.
    """
    # request.json may be None when the body is not JSON; fall back to an
    # empty payload instead of crashing on None.get.
    payload = request.get_json(silent=True) or {}
    wordid = payload.get("paper_id", "")
    subject = payload.get("subject", "")  # actually a subject_id (int) per the original note
    items_list = payload.get("structured_items", "")  # structured questions
    ocr_html_data = payload.get("html_data", "")  # original text content
    svg_data = payload.get("svgs", {"svg_html_data": "", "svg_path": ""})  # mathjax-related text
    callback_info = payload.get("user", {"callback_url": "", "source": ""})
    print("【入库】==request.POST.dict==>word_id:{}, callback_info:{},subject:{}"
          .format(wordid, callback_info, subject))

    loginfo = {"log_level": "info",
               "request_ip": request.remote_addr,
               "receive_data": {"paper_id": wordid,
                                "subject": subject,
                                "callback_info": callback_info},
               "task_name": "保存入库"}
    if not wordid:
        # No id supplied: derive one and flag it in the log record.
        wordid = get_wordid(str(items_list))
        loginfo["receive_data"]["is_auto_id"] = 1
        loginfo["receive_data"]["paper_id"] = wordid
    logger.info(json.dumps(loginfo, ensure_ascii=False))

    if not (wordid and items_list):
        # NOTE(review): an id is auto-generated above, so this branch is reached
        # when structured_items is empty; the message text is kept unchanged
        # because callers may match on it.
        return "需要paperid"

    st1 = time.time()
    res = Ruku(items_list, ocr_html_data, svg_data, str(wordid), callback_info, subject).save()
    logger.info("【入库】==结束==> word_id:{}".format(wordid))
    logger.info(json.dumps({"log_level": "info",
                            "paper_id": wordid,
                            "status": "任务结束",
                            "task_time": time.time() - st1,
                            "task_name": "保存入库"}, ensure_ascii=False))
    return json.dumps(res, ensure_ascii=False)
@app.route('/ser_static/<path:file_path>', methods=["GET"])
def ser_static(file_path):
    """
    Serve a file from ``configs.IMG_FOLDER``.

    The view is not named ``static`` because, per the original author's note,
    the endpoint name is the function name and ``static`` clashes with a
    Flask-internal name.

    :param file_path: path portion of the URL after ``/ser_static/``; it is
                      passed to ``send_from_directory`` together with the
                      image folder
    :return: the response produced by ``send_from_directory``
    """
    image_root = configs.IMG_FOLDER
    return send_from_directory(image_root, file_path)
def get_wordid(mydata):
    """
    Extract a word id from the given text, or generate one.

    Used when the request carries no word id.  The text is searched for a
    numeric path segment following ``/zyk/uploadfiles/wording/`` or
    ``/ser_static/`` first, then following ``/imgpaper/lqy_upload/``.  When
    neither is found, an id is built from the current Unix timestamp (whole
    seconds) followed by a random six-digit number.

    :param mydata: text to search; ``None`` is treated as empty text and any
                   other non-string value is converted with ``str``
    :return: the word id as a string of digits
    """
    if mydata is None:
        mydata = ""
    elif not isinstance(mydata, str):
        mydata = str(mydata)

    # Raw strings keep "\d" a regex escape instead of an invalid string escape.
    primary = re.search(r"/(zyk/uploadfiles/wording|ser_static)/(\d+)/", mydata)
    if primary:
        return primary.group(2)

    secondary = re.search(r"/imgpaper/lqy_upload/(\d+)/", mydata)
    if secondary:
        return secondary.group(1)

    # Nothing to extract: timestamp + random suffix in [100000, 999999).
    suffix = random.randrange(100000, 999999)
    return str(int(time.time())) + str(suffix)
if __name__ == "__main__":
    # Run Flask's built-in server on the configured address, handling
    # requests in threads and with debug mode on.
    app.run(
        host=configs.server_ip,
        port=configs.server_port,
        threaded=True,
        debug=True,
    )
- # app.run(processes=4) # 多进程或多线程只能选择一个,不能同时开启
- # 5fc64a0a4994183dda7e74b9
- # from gevent import pywsgi
- # # app.debug = True
- # server = pywsgi.WSGIServer((configs.server_ip, configs.server_port), app)
- # server.serve_forever()
|