#!/usr/bin/env python
# -*- coding:utf-8 -*-
# import sys
# sys.path.append(r"F:\zwj\Text_Structure\new_tiku_structure_v3_sci")
from pprint import pprint
from typing import Any
# from utils.exam_type import get_exam_type
# from utils.get_data import Mongo
from structure.final_structure import one_item_structure
from utils.stem_ans_split import get_split_pos
from utils.washutil import *
from utils.washutil_for_DL_way import HtmlWash_2
from structure.three_parse_structure import *
from utils.pic_pos_judge import img_regroup
from func_timeout import func_set_timeout
import requests
import time
from structure.ans_structure import get_ans_match
from utils.xuanzuoti2slave import toslave_bef, toslave_aft
# Module-level logger, created through the project's log factory with the
# "reparse_ruku_log" log category.
logger = configs.myLog(__name__, log_cate="reparse_ruku_log").getlog()

# Labels for the paper layouts this module distinguishes:
#   [0] format 3 - questions and answers are in separate parts
#   [1] format 2 - the {答案}/{解析} keywords are not both present (or both absent)
#   [2] format 1 - teacher's copy containing the answer and analysis keywords
paper_types = [
    "第三种试卷格式:题目与答案分开",
    "第二种试卷格式: 不同时含有或都不含有{答案}和{解析}关键字",
    "第一种试卷格式:教师用卷,含答案和解析关键字",
]
class WordParseStructure:
"""
基于wordbin出来的html结果进一步做 试卷类型 非模板结构化
"""
def __init__(self, html, wordid, is_reparse=0, must_latex=0, source="zxhx", subject="数学"):
self.html = html
self.is_reparse = is_reparse
self.wordid = wordid
self.must_latex = must_latex
self.source = source
self.subject = subject
def __call__(self):
if self.source in ["school", "qtk"] and re.search("物理|数学", self.subject): # "school" "xue_guan", "teacher"
t1 = time.time()
res = self.structure_combine_DL()
if not res[0]:
logger.info("----【paper_id:{}】模型切题没切出来".format(self.wordid))
return self.structure()
logger.info("----【paper_id:{}】采用切题服务花费时间:{}".format(self.wordid, time.time()-t1))
return res
else:
return self.structure()
def structure_combine_DL(self):
    """
    Structure the paper with the help of the remote segmentation service.

    Pipeline:
      1. wash the HTML with ``HtmlWash_2``;
      2. split the washed rows into a question part and an answer part with
         ``get_split_pos``;
      3. round 1 (question part without 【答案】/【解析】 markers): the service
         cuts the question block and the answer block separately, then the
         answers are attached to the items;
      4. round 2 (markers found inside the question part, or the question
         part is empty): the service cuts all rows as one block and each item
         is split into stem/answer/analysis locally;
      5. missing ``key``/``parse`` fields are filled with "", every item goes
         through ``one_item_structure`` and the list through ``wash_after``.

    Any exception raised inside round 1 or round 2 is logged and swallowed;
    processing continues with whatever ``item_res`` holds at that point.

    :return: tuple ``(result, paper_type)`` where ``result`` is
        * ``{"errcode": 1, "errmsgs": ..., "data": {}}`` when washing leaves
          no rows (``paper_type`` is "") or when ``get_split_pos`` returns a
          message string (``paper_type`` is ``paper_types[0]``);
        * ``{"items": [...]}``, plus ``"html"`` when ``self.is_reparse`` is
          truthy, when items were produced;
        * ``{}`` when no item was produced.
    """

    def segment(rows, text_type):
        """POST the joined rows to the segmentation service; return its "res" payload."""
        # NOTE(review): in the file the join separator is a raw line break
        # inside the quotes; it is written as "\n" here. Some markup may have
        # been lost from that literal - confirm against the service contract.
        # NOTE(review): no timeout is passed, so a stalled service blocks here.
        resp = requests.post(url=configs.topic_segment_ip,
                             json={"content": "\n".join(rows), "subject": self.subject,
                                   "paper_id": self.wordid, "text_type": text_type})
        return resp.json()["res"]

    # Step 1: wash the raw HTML.
    _, row_list, new_html = HtmlWash_2(self.html, self.wordid, self.is_reparse,
                                       must_latex=self.must_latex).html_cleal()
    if not row_list:
        return {"errcode": 1, "errmsgs": "题文没有有效信息", "data": {}}, ""

    # Step 2: find the split point between questions and answers
    # (an "答案" keyword is required). A string result is an error message.
    split_res = get_split_pos(row_list)
    if isinstance(split_res, str):
        return {"errcode": 1, "errmsgs": split_res, "data": {}}, paper_types[0]
    row_list, items_list, ans_list, _ = split_res

    rd1_may_fail = 0
    paper_type = ""
    item_res = {}
    joined_items = "".join(items_list)
    if "【答案】" in joined_items or "【解析】" in joined_items:
        # Answer/analysis markers inside the question part: skip round 1.
        rd1_may_fail = 1
    elif items_list:
        paper_type = paper_types[0]  # "questions and answers are separate"
        try:
            item_res = segment(items_list, "stem_block")
            # The head of the paper is easily mis-cut into an item: drop a
            # first "item" that looks like a title (this check is optional).
            if len(item_res) > 1:
                first_stem = item_res[0]['stem']
                # NOTE(review): the first pattern is empty in the file and so
                # always matches; it is kept as-is - possibly lost markup.
                # The last re.match used to be called without its string
                # argument (TypeError swallowed by the except below, which
                # skipped answer matching); it now checks the first stem.
                if re.match('', first_stem) and "$" not in first_stem and (
                        re.search("试[题卷]", first_stem)
                        or re.match(r"\s*[\u4e00-\u9fa5\d]{,20}$", first_stem)):
                    item_res = item_res[1:]

            all_ans, ans_no = segment(ans_list, "answer_block")
            # Use ans_no to repair mis-cut answers, e.g.
            # [2, 6, 4, None, 7, None, 5, None, 1]: a chunk without a number
            # is appended to the closest preceding numbered answer.
            numbered = [no for no in ans_no if no]
            if abs(len(numbered) - len(item_res)) <= 2:
                last_idx = None
                new_ans_no = ans_no.copy()
                for i, no in enumerate(ans_no):
                    if no is not None:
                        last_idx = i
                    elif last_idx is not None:
                        all_ans[last_idx] += "\n" + all_ans[i]
                        all_ans[i] = ""
                        new_ans_no[i] = "del"
                all_ans = [ans for ans in all_ans if ans]
                ans_no = [no for no in new_ans_no if no != 'del']

            if abs(len(ans_no) - len(item_res)) > 2:
                # Counts still differ too much: re-split the answer block.
                item_res = ans_block_split(ans_list, item_res)
            else:
                item_res = get_ans_match(item_res, all_ans, ans_no, {}, 'model_split')
        except Exception as e:
            logger.info("----【paper_id:{}】切题服务异常:{}".format(self.wordid, e))
    else:
        # No question rows at all: go straight to round 2.
        rd1_may_fail = 1

    if rd1_may_fail:
        try:
            item_res = segment(row_list, "stem_block")
            # Teacher's-copy check: items that carry answer/analysis markers
            # are split further into stem / answer / analysis.
            for one_res in item_res:
                if re.search('\n【(答案|[解分][析答]|详解|点[评睛]|考点|专题)】', one_res["stem"]):
                    case = "case1"  # default: an "答案" keyword is present
                    if re.search(r'\n【答案】|[\n】]\s*答案\s*[::]', one_res["stem"]) is None:
                        case = "case0"  # no "答案" keyword
                    dd1 = stem_ans_split(one_res, case)
                    one_res["stem"] = dd1["stem"]
                    del dd1["stem"]
                    one_res.update(dd1)
                else:  # no answer/analysis section in this item
                    one_res.update({"key": "", "parse": ""})
        except Exception as e:
            logger.info("----【paper_id:{}】切题服务异常:{}".format(self.wordid, e))

    if not item_res:
        return {}, paper_type

    # ---------- per-item structuring ----------
    # Make sure every item carries the answer and analysis fields.
    for one_item in item_res:
        one_item.setdefault('key', "")
        one_item.setdefault('parse', "")

    # one_item_structure takes one (item, consumer, no_type) tuple; every item
    # is passed with the constants 'noslave' and 1. The original author notes
    # that this sequential run is about as fast as multiprocessing.
    consumer = ['noslave'] * len(item_res)
    items_no_type = [1] * len(item_res)
    res = [one_item_structure(args) for args in zip(item_res, consumer, items_no_type)]

    # ---------- final wash ----------
    # (Correction of model mis-cuts now happens in the segmentation model's
    # prediction step, per the original author's note.)
    res = wash_after(res, self.wordid, self.subject)

    if self.is_reparse:
        return {"html": new_html, "items": res}, paper_type
    return {"items": res}, paper_type
def img_repl(self, one_dict):
"""
初步拆分题目后,图片信息的替换
:return:
"""
imgs = {s: re.findall("
"+"
\n".join(html.split("\n"))+"
" # with open(r"F:\zwj\Text_Structure\fail_files3\c5e222c5fbded2a2264ae002907fc92c__2021_04_16_18_43_23.json", 'r') as load_f: # html = json.load(load_f) # print(load_dict) # path2 = r"C:\Users\Python\Desktop\bug\5-9\663c90361ec1003b58557474.html" path2 = r"F:\zwj\Text_Structure\accept_files\667cb9c0c3c4da9e7009b8c4.html" path2 = r"F:\zwj\Text_Structure\accept_files\668f4d57c3c4da9e7009bcd8.html" # path2 = r"C:\Users\Python\Desktop\bug\6419746d11a1cdad550f5502.html" # path2 = r"F:\zwj\Text_Structure\new_tiku_structure_v3_sci\data\620bbf7aa7d375f4518b98e1.html" # path2 = r"F:\zwj\new_word_text_extract_v2\data\地理\2\2020-2021学年广东省揭阳市揭西县五校九年级(下)第二次联考地理试卷-普通用卷.html" # path2 = r"F:\zwj\new_word_parse_2021\data\huaxue\huexue2.html" # path2 = r"C:\Users\Python\Desktop\bug\6258cc7af84c0e279ac64301.html" # 正则卡死 # path2 = r"C:\Users\Python\Desktop\bug\629073b9f84c0e279ac64811.html" # 正则卡死 # 62650d5cf84c0e279ac643f1 6258cc7af84c0e279ac64301 62660fa2f84c0e279ac643f5 # path2 = r"C:\Users\Python\Desktop\123\666fcb5bc3c4da9e7009b607_2.html" html = open(path2, "r", encoding="utf-8").read() # html = """ # \n1.下列化学符号中的数字“”表示的意义不正确的是
\nA.:“”表示两个氧原子
\nB.:“”表示一个二氧化氮分子含有两个氧原子
\nC.:“”表示两个氢氧根离子
\nD.:“”表示氧化镁中镁元素的化合价为价
\n【答案】
\nA
\n【解析】
\n根据元素符号前面的数字表示原子的个数,元素符号右下角的数字表示一个分子中的原子个数,离子符号前面的数字表示离子的个数,元素符号正上方的数字表示元素的化合价。A.:“”表示一个氧分子由两个氧原子组成,故选项表示的意义不正确;B.元素符号右下角的数字表示一个分子中的原子个数,故:“”表示一个二氧化氮分子含有两个氧原子,故表示的意义正确;C.离子符号前面的数字表示离子的个数,故:“”表示两个氢氧根离子,故表示的意义正确;D.元素符号正上方的数字表示元素的化合价,故中的“”表示镁元素的化合价为价,故表示的意义正确。故选:A。
\n\n
2.亚油酸具有降低人体血液中胆固醇及血脂的作用,它的化学式为,下列说法中正确的是
\nA.亚油酸是由三个元素构成的化合物
\nB.每个亚油酸分子中含有个原子
\nC.亚油酸中碳.氧元素的质量比为
\nD.每个亚油酸分子中含有个碳原子、个氢原子、个氧分子
\n【答案】
\nC
\n【解析】
\nA.由化学式可知,亚油酸是由、、三种元素组成的化合物,A错误。B.每个亚油酸分子中含有个原子,B错误。C.亚油酸中碳.氧元素的质量比为,C正确。D.每个亚油酸分子中含有个碳原子、个氢原子、个氧原子,D错误。故选:C。
\n # """ # print(html) # html = "\n1、已知集合M满足{1,2}≤M≤{1,2,5,6,7},则\n符合条件的集合M有__个." # html = html.replace('