5 月之前 · 52bc8fe44d
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
--- a/__pycache__/configs.cpython-36.pyc
+++ b/__pycache__/configs.cpython-36.pyc
--- a/ceshi.py
+++ b/ceshi.py
@@ -232,3 +232,6 @@ repeat_ip = "http://82.156.68.22:8888/repeat/subject"  # 全学科查重
 
				 # print(repeat_r.content)
			
 
				 import os
			
 
				 print(os.environ.get('APPDATA'))
			
 
				+# repeat_r = requests.post(url=configs.repeat_ip, json=[chachong_item_dict]).json()
			
 
				+# print(repeat_r)
			
 
				+
			
--- a/configs.py
+++ b/configs.py
@@ -89,7 +89,9 @@ class TestingCfg:  # testing
 
				     kps_phy_ip = "http://192.168.1.86:11088/phy_mark_and_connect"
			
 
				     kps_Hmath_ip = "http://192.168.1.192:13356/auto_labels"
			
 
				     repeat_ip = "http://192.168.1.192:8866/api/repeat/subject"
			
 
				-    # repeat_ip = "http://82.156.68.22:8888/repeat/subject"  # 全学科查重
			
 
				+    topic_segment_ip = "http://192.168.1.204:10622/math_phy_TopicSegment_predict"
			
 
				+    phy_topicType_ip = "http://192.168.1.204:10611/phy_topicType_predict"
			
 
				+    # repeaty_ip = "http://82.156.68.22:8888/repeat/subject"  # 全学科查重
			
 
				     # repeat_ip = "http://82.156.68.22:8888/api/repeat/subject"  # 保存入库查重
			
 
				     # callback_url_taskcheck = "http://zsytk3api.dev.xueping.com/v1/interior-api/record"
			
 
				     callback_url_taskcheck = "http://zsytk3api.testing.xueping.com/v1/interior-api/record"
			
@@ -112,6 +114,8 @@ class ProductionCfg:  # production
 
				     kps_phy_ip = "http:/49.232.72.198:11088/phy_mark_and_connect"
			
 
				     kps_Hmath_ip = "http://172.16.2.5:13356/auto_labels"
			
 
				     repeat_ip = "http://10.19.1.18:8866/api/repeat/subject"
			
 
				+    topic_segment_ip = "http://10.19.1.14:10622/math_phy_TopicSegment_predict"
			
 
				+    phy_topicType_ip = "http://10.19.1.6:10611/phy_topicType_predict"
			
 
				     callback_url_taskcheck = "http://api.tk.zhixinhuixue.com/v1/interior-api/record"
			
 
				 
			
 
				 
			
@@ -162,6 +166,8 @@ kps_phy_ip = config_class.kps_phy_ip
 
				 kps_Hmath_ip = config_class.kps_Hmath_ip
			
 
				 callback_url_taskcheck = config_class.callback_url_taskcheck
			
 
				 repeat_ip = config_class.repeat_ip
			
 
				+topic_segment_ip = config_class.topic_segment_ip
			
 
				+phy_topicType_ip = config_class.phy_topicType_ip
			
 
				 
			
 
				 # 注意：
			
 
				 # 单题解析中，线上css_conflict_deal与线下不一样
			
--- a/server.py
+++ b/server.py
@@ -38,13 +38,17 @@ def word_structure():
 
				     mydata = request.json.get("sci_html_data", "")
			
 
				     is_reparse = request.json.get("is_reparse", "0")
			
 
				     word_id = request.json.get("paper_id", 0)
			
 
				-    must_latex = request.form.get("must_latex", 1)
			
 
				+    source = request.json.get("source", "zxhx")
			
 
				+    subject = request.json.get("subject", "")
			
 
				+    must_latex = request.json.get("must_latex", 1)  # 非必传
			
 
				     print("【再解析】==request.POST.dict==>is_reparse:{}, word_id:{}".format(is_reparse, word_id))
			
 
				     # print(mydata)
			
 
				     loginfo = {"log_level": "info",
			
 
				                "request_ip": request.remote_addr,
			
 
				                "receive_data": {"paper_id": word_id,
			
 
				-                                "is_reparse": is_reparse},
			
 
				+                                "is_reparse": is_reparse,
			
 
				+                                "source": source,
			
 
				+                                "subject": subject},
			
 
				                "task_name": "批量文本结构化解析"}
			
 
				 
			
 
				     # 接收的文件记录一下,按wordid命名
			
@@ -68,7 +72,7 @@ def word_structure():
 
				     st1 = time.time()
			
 
				     try:
			
 
				         if int(is_reparse) and word_id:  # 再解析
			
 
				-            res, paper_type = WordParseStructure(mydata, str(word_id), int(is_reparse), must_latex).structure()
			
 
				+            res, paper_type = WordParseStructure(mydata, str(word_id), int(is_reparse), must_latex, source, subject)()
			
 
				             # print(res)
			
 
				             if "errcode" not in res:
			
 
				                 result["data"] = res
			
@@ -76,7 +80,7 @@ def word_structure():
 
				                 result = res
			
 
				             print("【再解析】==解析结束==> word_id:{}".format(word_id))
			
 
				         elif not int(is_reparse) and mydata:  # 不是再解析
			
 
				-            res, paper_type = WordParseStructure(mydata, "").structure()
			
 
				+            res, paper_type = WordParseStructure(mydata, "", source=source, subject=subject)()
			
 
				             # print(res)
			
 
				             if "errcode" not in res:
			
 
				                 result["data"] = res
			
@@ -129,13 +133,17 @@ def danti_structure():
 
				     word_id = request.json.get("paper_id", 0)
			
 
				     one_item = request.json.get("single_item_data", "")
			
 
				     item_type = request.json.get("item_type", "")
			
 
				+    source = request.json.get("source", "zxhx")
			
 
				+    subject = request.json.get("subject", "")
			
 
				     print("【单题解析】==request.POST.dict==>word_id:{}, item_type:{}".format(word_id, item_type))
			
 
				     # logger.info("【单题解析】==request.POST.single_item_data==>\n{}\n".format(one_item))
			
 
				     print(word_id, item_type)
			
 
				     loginfo = {"log_level": "info",
			
 
				                "request_ip": request.remote_addr,
			
 
				                "receive_data": {"paper_id": word_id,
			
 
				-                                "item_type": item_type},
			
 
				+                                "item_type": item_type,
			
 
				+                                "source": source,
			
 
				+                                "subject": subject},
			
 
				                "task_name": "单题解析"}
			
 
				 
			
 
				     if not word_id:
			
@@ -146,7 +154,7 @@ def danti_structure():
 
				 
			
 
				     res = {"errcode": 0, "errmsgs":"", "data": {}}
			
 
				     if item_type:
			
 
				-        one_res = single_parse(one_item, item_type, word_id)
			
 
				+        one_res = single_parse(one_item, item_type, word_id, source, subject)
			
 
				         # pprint(one_res)
			
 
				         if type(one_res) == str:
			
 
				             res["errcode"] = 1
			
--- a/structure/__pycache__/ans_structure.cpython-36.pyc
+++ b/structure/__pycache__/ans_structure.cpython-36.pyc
--- a/structure/__pycache__/danti_structure.cpython-36.pyc
+++ b/structure/__pycache__/danti_structure.cpython-36.pyc
--- a/structure/__pycache__/dati2slave.cpython-36.pyc
+++ b/structure/__pycache__/dati2slave.cpython-36.pyc
--- a/structure/__pycache__/final_structure.cpython-36.pyc
+++ b/structure/__pycache__/final_structure.cpython-36.pyc
--- a/structure/__pycache__/option.cpython-36.pyc
+++ b/structure/__pycache__/option.cpython-36.pyc
--- a/structure/__pycache__/stems_structure.cpython-36.pyc
+++ b/structure/__pycache__/stems_structure.cpython-36.pyc
--- a/structure/__pycache__/structure_main.cpython-36.pyc
+++ b/structure/__pycache__/structure_main.cpython-36.pyc
--- a/structure/__pycache__/three_parse_structure.cpython-36.pyc
+++ b/structure/__pycache__/three_parse_structure.cpython-36.pyc
--- a/structure/ans_structure.py
+++ b/structure/ans_structure.py
@@ -314,6 +314,7 @@ def only_parse_split(one_item_ans, item_type, res_con, reparse_n=1):
 
				     ：reparse_n == 1：表示再解析
			
 
				     :return:{'key': ,"parse": }
			
 
				     """
			
 
				+    one_item_ans = re.sub(r"(\[.*?\])?\(.*?\d+分\)", "", one_item_ans[:20]) + one_item_ans[20:]
			
 
				     one_item_ans = re.sub("\n\s*(化学|物理|生物|和|与)+\s*【答案】\s*$", '', one_item_ans)
			
 
				     dd = {'parse': one_item_ans, 'key': ""}
			
 
				     if "选修" in one_item_ans.replace(" ", "")[:10] or \
			
@@ -345,13 +346,27 @@ def only_parse_split(one_item_ans, item_type, res_con, reparse_n=1):
 
				                            re.split(r"(解)\s*[：:]", one_item_ans, maxsplit=1)))
			
 
				         if "【答案】" in temp_ans:
			
 
				             dd["key"] = dd1["key"]
			
 
				+            if not dd["key"] and dd1["parse"].strip():
			
 
				+                dd["key"] = "见解析"
			
 
				             if len(dd1) >= 3:
			
 
				                 dd["parse"] = "【" + dd1["parse_title"] + "】" + dd1["parse"]
			
 
				                 del dd1["parse_title"]
			
 
				             return dd
			
 
				         if len(dd1) >= 3:
			
 
				-            dd["key"] = dd1["key"]
			
 
				-            dd["parse"] = "【" + dd1["parse_title"] + "】" + dd1["parse"]
			
 
				+            dd["key"] = dd1["key"].strip()
			
 
				+            rest_parse = ""
			
 
				+            # 细节处理2024.5.7
			
 
				+            if re.search("^<img .+?/>$", dd["key"]):
			
 
				+                dd["key"] = "见解析"
			
 
				+                rest_parse = dd1["key"].strip()
			
 
				+            if dd1["parse_title"] == "解":
			
 
				+                dd["parse"] = "解:" + dd1["parse"]
			
 
				+            else:
			
 
				+                dd["parse"] = "【" + dd1["parse_title"] + "】" + dd1["parse"]
			
 
				+            if rest_parse:
			
 
				+                dd["parse"] = rest_parse + "\n" + dd["parse"]
			
 
				+            if not dd["key"] and (dd1["parse"].strip() or rest_parse):
			
 
				+                dd["key"] = "见解析"
			
 
				             del dd1["parse_title"]
			
 
				             return dd
			
 
				         sim_parse = re.split("【点评】|【点睛】", dd["parse"])[0].strip()
			
@@ -684,8 +699,8 @@ def ans_structure_step2(anss, item_type_classify, item_res, *group):
 
				             print("ans_no:::",ans_no)   # ans_no只记录表格答案和排列型答案
			
 
				             print("::::", ans_no0, ans_no_idx0)
			
 
				             pre_split_ansinfo_list = all_item_ans, ans_no, ans_no0, ans_no_idx0, anss_str, is_from_0, ans_item_no_type
			
 
				-            all_item_ans, ans_no = ans_select(item_res, item_type_num, rest_item_split, pre_split_ansinfo_list)
			
 
				-            item_res = get_ans_match(item_res, all_item_ans, ans_no, group)
			
 
				+            all_item_ans, ans_no, repet_ans= ans_select(item_res, item_type_num, rest_item_split, pre_split_ansinfo_list)
			
 
				+            item_res = get_ans_match(item_res, all_item_ans, ans_no, repet_ans, group)
			
 
				 
			
 
				     return item_res
			
 
				 
			
@@ -798,7 +813,17 @@ def ans_select(item_res, item_type_num, rest_item_split, pre_split_ansinfo_list)
 
				     """
			
 
				     all_item_ans, ans_no, ans_no0, ans_no_idx0, anss_str, is_from_0, ans_item_no_type = pre_split_ansinfo_list
			
 
				     ans_no1 = ans_no.copy()
			
 
				-    ans_no1.extend(ans_no0)
			
 
				+    # 开头的答案是选择题时，存在后面答案仍然存在选择题的详解
			
 
				+    repet_ans = {}
			
 
				+    if len(ans_no0) == len(item_res):
			
 
				+        ans_no1 = ans_no0.copy()
			
 
				+        repet_ans = dict(zip(ans_no, all_item_ans))
			
 
				+        all_item_ans = []
			
 
				+    else:
			
 
				+        # ans_no0 = [i for i in ans_no0 if i not in ans_no1]
			
 
				+        # del_no = [i for i,v in enumerate(ans_no0) if v not in ans_no1]
			
 
				+        # rest_item_split = [v for i,v in enumerate(rest_item_split) if i not in del_no]
			
 
				+        ans_no1.extend(ans_no0)
			
 
				     # 先按换行格式获取答案（没有一行多个答案的情况）
			
 
				     print("ans_no1:",ans_no1)
			
 
				     print("item_type_num:",item_type_num)
			
@@ -911,10 +936,10 @@ def ans_select(item_res, item_type_num, rest_item_split, pre_split_ansinfo_list)
 
				         all_item_ans.extend(rest_item_split)
			
 
				         ans_no = ans_no1
			
 
				 
			
 
				-    return all_item_ans, ans_no
			
 
				+    return all_item_ans, ans_no, repet_ans
			
 
				 
			
 
				 
			
 
				-def get_ans_match(item_res, all_ans, ans_no, *group):
			
 
				+def get_ans_match(item_res, all_ans, ans_no, repet_ans=None, *group):
			
 
				     """
			
 
				     根据切分后的答案及其题号，与前面试题进行匹配更新,all_ans和ans_no的个数应该是相同的
			
 
				     :param item_res:
			
@@ -938,6 +963,8 @@ def get_ans_match(item_res, all_ans, ans_no, *group):
 
				                 if k<= len(item_res)-1:
			
 
				                     simp_res = only_parse_split(one_ans, item_res[temp_id]["type"], res_con)
			
 
				                     item_res[temp_id].update(simp_res)
			
 
				+                    if repet_ans and item_res[temp_id]["item_id"] in repet_ans:
			
 
				+                        item_res[temp_id]["key"] = repet_ans[item_res[temp_id]["item_id"]]
			
 
				                 else:
			
 
				                     item_res[temp_id].update({'key': "", 'parse': ""})
			
 
				             else:
			
@@ -948,6 +975,9 @@ def get_ans_match(item_res, all_ans, ans_no, *group):
 
				                     item_res[temp_id]['parse'] = one_ans
			
 
				                     if not item_res[temp_id]['key']:
			
 
				                         item_res[temp_id]['key'] = '见解析'
			
 
				+                if group[0] == 'model_split':
			
 
				+                    simp_res = only_parse_split(one_ans, item_res[temp_id]["type"], res_con)
			
 
				+                    item_res[temp_id].update(simp_res)
			
 
				     return item_res
			
 
				 
			
 
				 
			
@@ -970,9 +1000,6 @@ def anss_split_contain_slave(subject, ans_str):
 
				             temp_ans_no1, ans_no_idx1 = get_right_no((ans_no_idx1, temp_ans_no1), 1)  # 筛选
			
 
				 
			
 
				 
			
 
				-
			
 
				-
			
 
				-
			
 
				 if __name__ == '__main__':
			
 
				 #     one_item_ans = """
			
 
				 #     (12分)
			
--- a/structure/danti_structure.py
+++ b/structure/danti_structure.py
@@ -9,12 +9,12 @@ import re
 
				 from structure.option import option_structure
			
 
				 from utils.equation_extract import get_simpstr2eqn, get_equation_instr
			
 
				 from utils.html_again_parse import css_label_wash
			
 
				-from utils.washutil import base642img, css_conflict_deal, convert_huanhang
			
 
				+from utils.washutil import base642img, css_conflict_deal, convert_huanhang, wash_after
			
 
				 from utils.field_eq2latex import latex_wash
			
 
				 from structure.dati2slave import get_slave
			
 
				 
			
 
				 
			
 
				-def single_parse(one_item, item_type, wordid):
			
 
				+def single_parse(one_item, item_type, wordid, source="zxhx", subject="数学"):
			
 
				     """
			
 
				     rtype:题型
			
 
				     :return:
			
@@ -51,7 +51,7 @@ def single_parse(one_item, item_type, wordid):
 
				         # elif len(res_list) < 5:
			
 
				         #     return "编辑后的文本掉了【答案】或【解析】字段，请添加，每个字段保留唯一"
			
 
				 
			
 
				-    new_item_struct = {}
			
 
				+    new_item_struct = {"type": item_type}
			
 
				     new_item_struct["key"] = ""
			
 
				     new_item_struct["parse"] = ""
			
 
				     new_item_struct["stem"] = res_list[0]
			
@@ -71,7 +71,7 @@ def single_parse(one_item, item_type, wordid):
 
				             new_item_struct["parse"] = res_list[2]
			
 
				 
			
 
				     item_ids = re.findall("^([1-9][0-9]|[1-9])\s*[.．、､]", new_item_struct["stem"].strip())
			
 
				-    new_item_struct["topic_num"] = int(item_ids[0]) if item_ids else 0
			
 
				+    new_item_struct["item_id"] = int(item_ids[0]) if item_ids else 0
			
 
				 
			
 
				     new_item_struct["stem"] = re.sub("^([1-9][0-9]|[1-9])\s*[.．、､]", "", new_item_struct["stem"].strip())
			
 
				     if len(new_item_struct["stem"].strip()) < 3:
			
@@ -87,13 +87,12 @@ def single_parse(one_item, item_type, wordid):
 
				                                                  if not i.replace("：", "").strip()]):  # 空选项中：被当成了内容
			
 
				             return "存在选项为空，请补充完整"
			
 
				         new_item_struct["answer_type"] = "选择题"
			
 
				-    else:
			
 
				-        new_item_struct["type"] = item_type
			
 
				+    elif source in ["xue_guan", "teacher"] and subject not in ["数学", "物理"]:  # 拆小题
			
 
				         new_item_struct = get_slave(new_item_struct, new_item_struct["stem"], new_item_struct["parse"], new_item_struct["key"])
			
 
				-        if "item_id" in new_item_struct:
			
 
				-            del new_item_struct["item_id"]
			
 
				+
			
 
				+    new_item_struct = wash_after([new_item_struct], subject)[0]
			
 
				     # 换行符替换
			
 
				-    convert_huanhang(new_item_struct)
			
 
				+    # convert_huanhang(new_item_struct)
			
 
				     # new_item_struct["stem"] = new_item_struct["stem"].strip().replace("\n\n", "\n").replace("\n", "<br/>")  # 2020/4/10 gai
			
 
				     # new_item_struct["key"] = new_item_struct["key"].strip().replace("\n\n", "\n").replace("\n", "<br/>")
			
 
				     # new_item_struct["parse"] = new_item_struct["parse"].strip().replace("\n\n", "\n").replace("\n", "<br/>")
			
--- a/structure/structure_main.py
+++ b/structure/structure_main.py
@@ -3,16 +3,21 @@
 
				 
			
 
				 
			
 
				 from pprint import pprint
			
 
				+from typing import Any
			
 
				 # from utils.exam_type import get_exam_type
			
 
				 # from utils.get_data import Mongo
			
 
				 from structure.final_structure import one_item_structure
			
 
				 from utils.stem_ans_split import get_split_pos
			
 
				 from utils.washutil import *
			
 
				+from utils.washutil_for_DL_way import HtmlWash_2
			
 
				 from structure.three_parse_structure import *
			
 
				 from utils.pic_pos_judge import img_regroup
			
 
				 from func_timeout import func_set_timeout
			
 
				+import requests
			
 
				+from structure.ans_structure import get_ans_match
			
 
				 
			
 
				 from utils.xuanzuoti2slave import toslave_bef, toslave_aft
			
 
				+logger = configs.myLog(__name__, log_cate="reparse_ruku_log").getlog()
			
 
				 
			
 
				 paper_types = ["第三种试卷格式：题目与答案分开",
			
 
				                "第二种试卷格式: 不同时含有或都不含有{答案}和{解析}关键字",
			
@@ -23,11 +28,135 @@ class WordParseStructure:
 
				         基于wordbin出来的html结果进一步做 试卷类型 非模板结构化
			
 
				     """
			
 
				 
			
 
				-    def __init__(self, html, wordid, is_reparse=0, must_latex=0):
			
 
				+    def __init__(self, html, wordid, is_reparse=0, must_latex=0, source="zxhx", subject="数学"):
			
 
				         self.html = html
			
 
				         self.is_reparse = is_reparse
			
 
				         self.wordid = wordid
			
 
				         self.must_latex = must_latex
			
 
				+        self.source = source
			
 
				+        self.subject = subject
			
 
				+
			
 
    def __call__(self):
        """Parse the paper, trying the segmentation-service path first where allowed.

        For every ``source`` other than ``"school"`` the result of
        ``structure_combine_DL()`` is used when its first element is truthy;
        otherwise the rule-based ``structure()`` result is returned.
        ``"school"`` papers always go straight to ``structure()``.

        :return: the return value of whichever parse method was used.
        """
        # "school" never uses the segmentation service.
        if self.source in ["school"]:
            return self.structure()

        dl_result = self.structure_combine_DL()
        if dl_result[0]:
            logger.info("----【paper_id:{}】采用切题服务".format(self.wordid))
            return dl_result

        # Empty first element: the service path produced nothing usable.
        return self.structure()
			
 
				+
			
 
				+    
			
 
    def structure_combine_DL(self):
        """Structure a paper with the help of the remote topic-segmentation service.

        Steps:
          1. Clean the HTML with ``HtmlWash_2``.
          2. Split the rows into a question block and an answer block with
             ``get_split_pos``.
          3. If the question block carries no 【答案】/【解析】 marker, send the
             question block and the answer block separately to
             ``configs.topic_segment_ip`` and attach the answers to the items.
             Otherwise (or when there is no question block) send the whole
             cleaned text and split stem/answer inside each returned item.
          4. Run ``one_item_structure`` on every item, then ``wash_after``.

        Any exception raised while talking to the service or post-processing
        its reply is logged and swallowed; ``item_res`` then keeps whatever
        value it had at that point.

        :return: ``(result, paper_type)``. ``result`` is an error dict with
            ``errcode``/``errmsgs``/``data``, or ``{"items": [...]}`` (plus
            ``"html"`` when ``self.is_reparse`` is truthy), or ``{}`` when no
            item was produced.
        """
        # Step 1: clean the raw HTML.
        htmltext, row_list, new_html = HtmlWash_2(self.html, self.wordid, self.is_reparse,
                                                  must_latex=self.must_latex).html_cleal()
        if not row_list:
            return {"errcode": 1, "errmsgs": "题文没有有效信息", "data": {}}, ""

        # Step 2: find the split point between questions and answers.
        # get_split_pos returns an error message (str) when it fails.
        split_res = get_split_pos(row_list)
        if isinstance(split_res, str):
            return {"errcode": 1, "errmsgs": split_res, "data": {}}, paper_types[0]
        row_list, items_list, ans_list, _ = split_res

        rd1_may_fail = 0
        paper_type = ""
        item_res = {}
        items_text = "".join(items_list)
        if "【答案】" in items_text or "【解析】" in items_text:
            # Answers are interleaved with the questions; segment the whole text below.
            rd1_may_fail = 1
        elif items_list:
            paper_type = "第三种试卷格式：题目与答案分开"
            # NOTE(review): no timeout is passed to requests.post, so a stalled
            # service blocks this call — consider adding one.
            # NOTE(review): if the answer request fails after the stem request
            # succeeded, item_res keeps the stems without answers and is still
            # returned as a success — confirm this best-effort result is intended.
            try:
                r1 = requests.post(url=configs.topic_segment_ip,
                                   json={"content": "<br>".join(items_list), "subject": self.subject,
                                         "paper_id": self.wordid, "text_type": "stem_block"})
                item_res = r1.json()["res"]
                r2 = requests.post(url=configs.topic_segment_ip,
                                   json={"content": "<br>".join(ans_list), "subject": self.subject,
                                         "paper_id": self.wordid, "text_type": "answer_block"})
                all_ans, ans_no = r2.json()["res"]
                print(ans_no)
                # Repair over-split answers using ans_no,
                # e.g. [2, 6, 4, None, 7, None, 5, None, 1]: a fragment without
                # a number is appended to the closest preceding numbered answer.
                if abs(len([i for i in ans_no if i]) - len(item_res)) <= 2:
                    last_idx = None
                    merged = set()  # indexes of fragments folded into an earlier answer
                    for i, no in enumerate(ans_no):
                        if no is not None:
                            last_idx = i
                        elif last_idx is not None:
                            all_ans[last_idx] += "\n" + all_ans[i]
                            merged.add(i)
                    # Drop exactly the merged positions from BOTH lists so that
                    # answers and numbers stay aligned (an empty answer with a
                    # valid number must keep its slot).
                    all_ans = [ans for i, ans in enumerate(all_ans) if i not in merged]
                    ans_no = [no for i, no in enumerate(ans_no) if i not in merged]

                if abs(len(ans_no) - len(item_res)) > 2:
                    # Counts differ too much: use the rule-based answer splitter.
                    item_res = ans_block_split(ans_list, item_res)
                else:
                    item_res = get_ans_match(item_res, all_ans, ans_no, {}, 'model_split')
            except Exception as e:
                logger.info("----【paper_id:{}】切题服务异常：{}".format(self.wordid, e))
        else:
            rd1_may_fail = 1

        if rd1_may_fail:
            try:
                r3 = requests.post(url=configs.topic_segment_ip,
                                   json={"content": htmltext, "subject": self.subject,
                                         "paper_id": self.wordid, "text_type": "stem_block"})
                item_res = r3.json()["res"]
                # Teacher-style papers: each item may still contain its answer/analysis.
                for one_res in item_res:
                    if re.search('\n【(答案|[解分][析答]|详解|点[评睛]|考点|专题)】', one_res["stem"]):
                        case = "case1"  # default: an "answer" keyword is present
                        if re.search(r'\n【答案】|[\n】]\s*答案\s*[：:]', one_res["stem"]) is None:
                            case = "case0"  # no "answer" keyword
                        dd1 = stem_ans_split(one_res, case)  # split the item into stem / answer parts
                        one_res["stem"] = dd1["stem"]
                        del dd1["stem"]
                        one_res.update(dd1)
                    else:  # no answer/analysis section in this item
                        one_res.update({"key": "", "parse": ""})
            except Exception as e:
                logger.info("----【paper_id:{}】切题服务异常：{}".format(self.wordid, e))

        if not item_res:
            return {}, paper_type

        # Make sure every item has the answer and analysis fields.
        for one_item in item_res:
            one_item.setdefault('key', "")
            one_item.setdefault('parse', "")

        # Per-item structuring (a plain map is fast enough here).
        consumer = ['noslave'] * len(item_res)
        items_no_type = [1] * len(item_res)
        res = list(map(one_item_structure, zip(item_res, consumer, items_no_type)))

        # Final cleaning. Correction of model mis-splits is done inside the
        # segmentation service, not here.
        res = wash_after(res, self.subject)

        if self.is_reparse:
            return {"html": new_html, "items": res}, paper_type
        return {"items": res}, paper_type
			
 
				+
			
 
				 
			
 
				     def img_repl(self, one_dict):
			
 
				         """
			
@@ -80,7 +209,7 @@ class WordParseStructure:
 
				                     if len(reform_res)==2:
			
 
				                         item_res = reform_res
			
 
				                     else:
			
 
				-                        item_res, item_no_type, rd2_is_fail= reform_res
			
 
				+                        item_res, item_no_type, rd2_is_fail = reform_res
			
 
				 
			
 
				         if not items_list or rd1_may_fail or (is_may_ans and rd2_is_fail):
			
 
				             ans_n = re.findall("【答案】", "\n".join(row_list))
			
@@ -202,7 +331,8 @@ if __name__ == '__main__':
 
				 
			
 
				         # print(load_dict)
			
 
				 
			
 
				-    path2 = r"F:\zwj\Text_Structure\accept_files\64994dc4a3693ef35281fbc5.html"
			
 
				+    # path2 = r"C:\Users\Python\Desktop\bug\5-9\663c90361ec1003b58557474.html"
			
 
				+    path2 = r"F:\zwj\Text_Structure\accept_files\664597dd71453ba19c20977f.html"
			
 
				     # path2 = r"C:\Users\Python\Desktop\bug\6419746d11a1cdad550f5502.html"
			
 
				     # path2 = r"F:\zwj\Text_Structure\new_tiku_structure_v3_sci\data\620bbf7aa7d375f4518b98e1.html"
			
 
				     # path2 = r"F:\zwj\new_word_text_extract_v2\data\地理\2\2020-2021学年广东省揭阳市揭西县五校九年级（下）第二次联考地理试卷-普通用卷.html"
			
@@ -216,13 +346,16 @@ if __name__ == '__main__':
 
				     # """
			
 
				 
			
 
				     # print(html)
			
 
				-    res1 = WordParseStructure(html, "", 1).structure()
			
 
				+    # html = "\n1、已知集合M满足{1，2}≤M≤{1，2，5，6，7}，则\n符合条件的集合M有__个."
			
 
				+    res1 = WordParseStructure(html, "664597dd71453ba19c20977f",
			
 
				+                              is_reparse=0, must_latex=0,
			
 
				+                              source="ai", subject="物理")()
			
 
				     # new_fpath = os.path.join(r"F:\zwj\Text_Structure\fail_files", "res1.html")
			
 
				     # re_f = open(new_fpath, 'a+', encoding='utf-8')
			
 
				     # for i in res1[0]["items"]:
			
 
				     #     re_f.write(str(i))
			
 
				     # pprint(res1)
			
 
				-    # pprint(res1[0]['items'])
			
 
				+    pprint(res1[0]['items'])
			
 
				     print('题目数量：', len(res1[0]["items"]))
			
 
				 
			
 
				     # new_fpath = r"F:\zwj\Text_Structure\new_tiku_structure_2021\res_folder\10-28.json"
			
--- a/structure/three_parse_structure.py
+++ b/structure/three_parse_structure.py
@@ -130,6 +130,48 @@ def items_ans_reform(items_list, ans_list):
 
				     return item_res, item_no_type, rd2_is_fail
			
 
				 
			
 
				 
			
 
				+def ans_block_split(ans_list, item_res):
			
 
				+    anss1 = list(filter(lambda x: x.strip() != "", ans_list))
			
 
				+    if re.match(".+?省.+?试[卷题]|.*?答题?[卷卡页]", anss1[0]):
			
 
				+        anss1 = anss1[1:]
			
 
				+
			
 
				+    rd1_is_fail = 0
			
 
				+    have_type_line = re.search(r"[一二三四五六七八九十]\s*[、.．､]\s*[^必考基础综合中等(（\[]{2,5}题", "\n".join(anss1))
			
 
				+    if have_type_line:
			
 
				+        # 这里的anss1的清洗不应该影响rd2_is_fail中的原始文本！！先不修改看看再说
			
 
				+        anss1_cy = anss1.copy()  # 复制一份，保证不能影响后面
			
 
				+        while re.search(r"<td><p>[A-F]</p></td>|</td><td>[A-F]</td><td>|([A-F]\s*){3,}", anss1_cy[0]) is None and \
			
 
				+                (re.search(r"[\u4e00-\u9fa5]", anss1_cy[0]) is None
			
 
				+                 or re.search(r"[一二三四五六七八九十]\s*[、.．､]\s*(<imgsrc.*?/>)?\s*.{2,5}题", anss1_cy[0]) is None):
			
 
				+            del anss1_cy[0]
			
 
				+        # 答案中的题型
			
 
				+        all_type2 = re.findall(r"\n\s*[一二三四五六七八九十]\s*[、.．､:：]\s*([^必考基础综合中共等:：(（\[]{2,5}题)|"
			
 
				+                               r"\n\s*[、.．､:：]?\s*(单选题|非?选择题|不定选择题|多选题|填空题|计算题|[解简]答题|实验题|作图题|论述题|探究题)",
			
 
				+                               "\n" + "\n".join(anss1_cy))
			
 
				+        all_type2 = ["".join(a) for a in all_type2]
			
 
				+        # '本大题' 后面处理
			
 
				+        print("答案中的题型:", all_type2)
			
 
				+        ans_str = "\n" + "\n".join(anss1_cy)
			
 
				+        item_res, rd1_is_fail = anss_structure_with_type(item_res, ans_str, [], all_type2, [], {})
			
 
				+    # 没有题型行或第一次解析失败
			
 
				+    rd2_is_fail = 0
			
 
				+    if not have_type_line or rd1_is_fail:  # 答案中没有题型行 或题型行名称不规范
			
 
				+        print('没有题型行或题目和答案的题型个数不一致或第一次解析失败')
			
 
				+        anss1 = list(
			
 
				+            map(lambda x: re.sub(r"(\n|^)\s*[一二三四五六七八九十]\s*[、.．､：:]?\s*(<p>)?"
			
 
				+                                 r"(\s*.{2,5}题.+?分\s*[.。]?\s*$|.*?[(（].+?[得共]\d+分.*?[)）].*?$"
			
 
				+                                 r"|\s*.{2,5}题\s*([(（].+?[)）])?).*?$|(\n|^)\s*[^\d]{2,5}题(.+?分\s*[)）])?\s*$", "", x),
			
 
				+                anss1))
			
 
				+        # print("anss1:", anss1)
			
 
				+        raw_item_res = item_res
			
 
				+        # try:
			
 
				+        item_res = ans_structure_step1(anss1, {}, item_res)  # 答案整体结构化
			
 
				+        if str(raw_item_res) != str(item_res):
			
 
				+            rd2_is_fail = 1
			
 
				+
			
 
				+    return item_res
			
 
				+
			
 
				+
			
 
				 def split_by_keywords(con_list):
			
 
				     """
			
 
				     第一种试卷格式：教师用卷，含答案和解析关键字
			
--- a/utils/__pycache__/equation_extract.cpython-36.pyc
+++ b/utils/__pycache__/equation_extract.cpython-36.pyc
--- a/utils/__pycache__/field_eq2latex.cpython-36.pyc
+++ b/utils/__pycache__/field_eq2latex.cpython-36.pyc
--- a/utils/__pycache__/html_again_parse.cpython-36.pyc
+++ b/utils/__pycache__/html_again_parse.cpython-36.pyc
--- a/utils/__pycache__/image_convert.cpython-36.pyc
+++ b/utils/__pycache__/image_convert.cpython-36.pyc
--- a/utils/__pycache__/insert_keywords.cpython-36.pyc
+++ b/utils/__pycache__/insert_keywords.cpython-36.pyc
--- a/utils/__pycache__/item_type_line.cpython-36.pyc
+++ b/utils/__pycache__/item_type_line.cpython-36.pyc
--- a/utils/__pycache__/pic_pos_judge.cpython-36.pyc
+++ b/utils/__pycache__/pic_pos_judge.cpython-36.pyc
--- a/utils/__pycache__/qcloud_bucket.cpython-36.pyc
+++ b/utils/__pycache__/qcloud_bucket.cpython-36.pyc
--- a/utils/__pycache__/ruku_opera.cpython-36.pyc
+++ b/utils/__pycache__/ruku_opera.cpython-36.pyc
--- a/utils/__pycache__/stem_ans_split.cpython-36.pyc
+++ b/utils/__pycache__/stem_ans_split.cpython-36.pyc
--- a/utils/__pycache__/topic_no.cpython-36.pyc
+++ b/utils/__pycache__/topic_no.cpython-36.pyc
--- a/utils/__pycache__/washutil.cpython-36.pyc
+++ b/utils/__pycache__/washutil.cpython-36.pyc
--- a/utils/__pycache__/xuanzuoti2slave.cpython-36.pyc
+++ b/utils/__pycache__/xuanzuoti2slave.cpython-36.pyc
--- a/utils/html_again_parse.py
+++ b/utils/html_again_parse.py
@@ -160,6 +160,8 @@ def css_label_wash(content):
 
				             a.append(content.strip())
			
 
				 
			
 
				         new_a = "\n".join(list(map(lambda x: str(x).strip(), a)))
			
 
				+        new_a = re.sub("(\n\s*)+", "\n", new_a)
			
 
				+        # print("newa:::", new_a)
			
 
				         if subs2img:
			
 
				             new_a = re.sub("|".join(subs2img.keys()), lambda x: subs2img[x.group()], new_a)
			
 
				         new_a = "<p>" + new_a.replace("\n\n", "\n").replace("\n", "</p>\n<p>") + "</p>"
			
@@ -187,13 +189,117 @@ if __name__ == '__main__':
 
				     cons1 = '''
			
 
				     9 . 中国古代的政治权力由“传贤”转变为“传子”，“家天下”制度开始形成于<table name=\"optionsTable\" style=\"width:100%;table-layout:fixed;\" cols=\"4\"><tr><td>A．夏朝</td><td>B．商朝</td><td>C．周朝</td><td>D．秦朝</td></tr></table>
			
 
				     '''
			
 
				-
			
 
				+    cons2 = '''
			
 
				+    <p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">1.下列对这首诗的赏析，不正确的一项是（ &nbsp;&nbsp;）</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><strong><span style="mso-spacerun: 'yes'; font-family: 宋体; mso-ansi-font-weight: bold; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">小寒食舟中作</span></strong></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">杜甫</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">佳辰强饮食犹寒，隐几萧条戴鹖冠。春水船如天上坐，老年花似雾中看。</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">娟娟戏蝶过闲幔，片片轻鸥下急湍。云白山青万余里，愁看直北是长安。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">[注]这首诗写于大历五年春诗人淹留潭州时，即诗人去世前半年多。鹖（hé）冠：传为楚隐者鹖冠子所戴的帽子。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">A.首联中“强饮”一词是痛快豪饮的意思，表明诗人晚年要纵酒人生。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">B.颔联写诗人在船上所见所感，春来水涨，江流浩瀚，自己老眼昏花。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">C.颈联运用叠词，具有韵律美，写景由近及远，由蝴蝶而鸥鸟，层次分明。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">D.尾联总收全诗，诗人北望长安，思朝廷，忧愁顿生，有沉郁苍茫之美。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【答案】</span><span style="font-family: 宋体;">A</span></span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【解析】</span><span style="font-family: 宋体;">“强饮”理解有误。应是“勉强吃一点饭”的意思。故选A。</span></span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">2.下列对这首诗的赏析，不正确的一项是（ &nbsp;&nbsp;）</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><strong><span style="mso-spacerun: 'yes'; font-family: 宋体; mso-ansi-font-weight: bold; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">送客归江州</span></strong></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">韩翃</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">东归复得采真</span><sup><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt; vertical-align: super;"><span style="font-family: 宋体;">①</span></span></sup><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">游，江水迎君日夜流。</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">客舍不离青雀舫，人家旧在白鸥洲</span><sup><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt; vertical-align: super;"><span style="font-family: 宋体;">②</span></span></sup><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">。</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">风吹山带遥知雨，露湿荷裳已报秋。</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">闻道泉明</span><span style="font-family: 宋体;">③居止近，篮舆相访为淹留。</span></span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【注】</span><span style="font-family: 宋体;">①采真：道教语，指顺乎天性，放任自然。②白鸥洲：指白鸥翔集的沙洲。此处借指客之家乡。③泉明：指晋陶渊明，此称其为泉明，乃避唐高祖李渊之讳。</span></span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">A.这首诗写诗人送客人归江州隐居，但并无送别时的伤感，更多的是一种美好的祝福。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">B.“江水迎君”采用拟人手法，客人归心似箭、归程片刻不能迟的心态跃然纸上。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">C.“青雀舫”“白鸥洲”写出了诗人对客人旅舟华美，家乡景色宜人的赞美与羡慕。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">D.尾联写诗人听说陶渊明居所离客人很近，定会借探访陶渊明居所之机去拜访客人。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【答案】</span><span style="font-family: 宋体;">D</span></span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【解析】</span><span style="font-family: 宋体;">D项，“定会借探访陶渊明居所之机去拜访客人”错误。尾联的意思是听说陶渊明居住的地方就在附近，你可以常常乘着竹轿，前往拜访。表达了诗人对客人隐逸情怀的赞美与羡慕。故选D。</span></span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">3.对下面这首词的赏析，不恰当的一项是（ &nbsp;&nbsp;）</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><strong><span style="mso-spacerun: 'yes'; font-family: 宋体; mso-ansi-font-weight: bold; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">渔家傲</span></strong></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">范仲淹</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">塞下秋来风景异，衡阳雁去无留意。四面边声连角起，千嶂里，长烟落日孤城闭。</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">浊酒一杯家万里，燕然未勒归无计。羌管悠悠霜满地，人不寐，将军白发征夫泪。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">A.这首词写出了我国北方秋季的景物特点，从词中的“塞下”“霜”等词语可以看出。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">B.“衡阳雁去”是说“大雁向衡阳飞去”而不是“大雁从衡阳飞走了”。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">C.这首词既表达了将士的爱国之心，又流露出思念亲人和家乡的感情。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">D.这首词感情悲观而消极，表达了鲜明的反战、厌战情绪。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【答案】</span><span style="font-family: 宋体;">D</span></span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【解析】这首《渔家傲》为范仲淹创作，以描写北方秋季景物为背景，表达了作者对家国、亲人的思念以及将士们的英勇豪情。</span><span style="font-family: 宋体;">A项正确，词中的“塞下”“霜”等词语揭示了北方的秋季特点；B项正确，作者借衡阳雁南飞的景象暗示将士们向往家乡的渴望；C项正确，通过浓烈的爱国情感以及思念亲人的情绪表现，展现了作者的家国情怀和将士们的壮志豪情。不过，D项表述错误，词中并未明显表达反战、厌战情绪，其主要表达了将士们为国家和民族拼搏的精神。</span></span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">4.对下面这首唐诗，赏析不恰当的一项是（ &nbsp;&nbsp;）</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><strong><span style="mso-spacerun: 'yes'; font-family: 宋体; mso-ansi-font-weight: bold; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">早梅</span></strong></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">万木冻欲折，孤根暖独回。前村深雪里，昨夜一枝开。</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">风递幽香出，禽窥素艳来。明年如应律，先发望春台。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">A.首联把梅花与万木进行对比，万木的干枯摧折既有力地衬托了梅花的迎风斗雪，又好地照应了诗题中的“早”。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">B.颔联用华丽的语言为读者描绘出了一幅浓艳、高贵的雪中梅花图。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">C.颈联写梅花的风韵和姿色，尾联寄寓诗人深深的情思。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">D.这首咏梅诗，语言清丽，笔墨含蓄，有着强烈的艺术感染力。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【答案】</span><span style="font-family: 宋体;">B</span></span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【解析】选项</span><span style="font-family: 宋体;">B不太恰当。颔联并没有用华丽的语言来描绘梅花，只是表达了梅花在寒雪中展现出的独立、高洁的风韵。这里并没有像选项B所说的“浓艳、高贵”。其余选项都能恰当地反映这首诗的特点和内容，因此答案选B。</span></span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">5.对下面这首宋诗理解与赏析，不恰当的一项是（ &nbsp;&nbsp;）</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><strong><span style="mso-spacerun: 'yes'; font-family: 宋体; mso-ansi-font-weight: bold; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">村行</span></strong></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">王禹偁</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">马穿山径菊初黄，信马悠悠野兴长。</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">万壑有声含晚籁，数峰无语立斜阳。</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">棠梨叶落胭脂色，荞麦花开白雪香。</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">何事吟余忽惆怅？村桥原树似吾乡。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">A.首联照应题目，点明地点和时令，写出了诗人信马徐行、观赏山野景色的悠然兴致。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">B.第二联上下句构成对比，生动地表现出山中有时喧响有时静穆的景象。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">C.第三联以“胭脂”和“白雪”为喻，形象地描绘出山村绚丽多彩的秋景。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">D.最后两句设为问答，抒发了诗人由外界景物所触发的浓浓的思乡之情。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【答案】</span><span style="font-family: 宋体;">B</span></span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【解析】本题考查了对宋诗《村行》的理解与赏析。首先，</span><span style="font-family: 宋体;">A选项指出首联照应题目，点明了诗人信马行走在山间小路，看到菊花初黄，意境开阔。B选项提到第二联表现了山中有时喧响有时静穆的景象，但该联实际上并没有对比色彩，而是展示出千山万壑中奔涌着生机勃勃的晚响，无言的数峰沐浴在斜阳中。C选项正确地概括了第三联的内容，诗人通过赞美胭脂色的棠梨叶和白雪般芬芳的荞麦花存在世上，描绘出色彩斑斓的美景。D选项陈述了诗末以问答形式流露出的诗人对故乡情感的深刻思索。因此，答案为B选项，不恰当地解读了第二联。</span></span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">6.下列对这首诗的赏析，不正确的一项是（ &nbsp;&nbsp;）</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><strong><span style="mso-spacerun: 'yes'; font-family: 宋体; mso-ansi-font-weight: bold; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">酬元九侍御赠璧竹鞭长句</span></strong><strong><sup><span style="mso-spacerun: 'yes'; font-family: 宋体; mso-ansi-font-weight: bold; font-size: 10.5000pt; mso-font-kerning: 1.0000pt; vertical-align: super;"><span style="font-family: 宋体;">①</span></span></sup></strong></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">刘禹锡</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">碧玉孤根生在林，美人相赠比双金。</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">初开郢客缄封后，想见巴山冰雪深。</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">多节本怀端直性，露青犹有岁寒心。</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">何时策马同归去，关树扶疏</span><sup><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt; vertical-align: super;"><span style="font-family: 宋体;">②</span></span></sup><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">敲镫吟。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">[注]①此诗写于“永贞革新”失败后，作者被贬为朗州（今湖南）司马之时。元九，即诗人元稹，当时被贬为江陵（今湖北荆州）府士曹参军。②关树：关中之树。扶疏：枝叶繁茂。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">A.首联运用比兴手法，以碧玉般竹鞭的名贵，暗示赠鞭者的高尚，赞扬之情跃然纸上</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">B.颔联写诗人看到朋友赠礼后内心非常欣喜，很想去观赏生长碧竹的巴山冰雪美景。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">C.颈联通过“节”字，将“竹节”与“节操”相关联，把咏鞭与赞人联系在了一起。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">D.尾联由竹鞭引发联想，表达了诗人愿与友人“策马同去”“敲镫吟诗”的美好愿望。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【答案】</span><span style="font-family: 宋体;">B</span></span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【解析】</span><span style="font-family: 宋体;">“很想去观赏生长碧竹的巴山冰雪美景”赏析有误。领联表达的意思是，我一打开郢客的缄封之后，立刻想到冰冻巴山雪深深。目睹竹鞭而展开联想，写出了制鞭之竹在“巴山冰雪”中傲然挺立的景象。这是对元稹不畏权势、宁折不弯的形象写照。是以竹喻人，表达对友人的赞美。译文：绿如碧玉的孤竹生在深林，用它制的璧竹鞭名贵万分；贤稳之人将竹鞭赠送给我，这份厚礼胜过了万两黄金。我一打开郢客的绒封之后，立刻想到冰冻巴山雪深深。鞭上多节，节节怀着端直性，遍体露青犹有岁寒后凋心。我们何时才能策马同归去，在扶疏的关树下敲镫高吟？</span></span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">7.下列对这首诗的赏析，不正确的一项是（ &nbsp;&nbsp;）</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><strong><span style="mso-spacerun: 'yes'; font-family: 宋体; mso-ansi-font-weight: bold; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">见别离者因赠之</span></strong><strong><sup><span style="mso-spacerun: 'yes'; font-family: 宋体; mso-ansi-font-weight: bold; font-size: 10.5000pt; mso-font-kerning: 1.0000pt; vertical-align: super;"><span style="font-family: 宋体;">①</span></span></sup></strong></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">韩偓</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">征人草草尽戎装，征马萧萧立路傍。</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">尊酒阑珊将远别，秋山迤逦更斜阳。</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">白髭兄弟中年后，瘴海程途万里长。</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">曾向天涯怀此恨，见君呜咽更凄凉。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">[注]①诗人生活在唐末战乱之际，当时自北而南，沿路所见，皆发于诗。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">A.标题点明本诗写作的原由，“别离”一词陡生无限伤感情绪，奠定全诗情感基调。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">B.首联紧扣“征人”与“征马”两个形象，真切地描绘了出征时的情景，画面感很强</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">C.颔联描写的是别后想象的虚景，诗人想象征人在离别亲人后沿着秋山远行的景象。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">D.尾联直抒胸臆，眼前的别离勾起了诗人对自身的感叹，抒发了心中的无奈和感慨。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【答案】</span><span style="font-family: 宋体;">C</span></span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【解析】</span><span style="font-family: 宋体;">“颔联描写的是别后想象的虚景”说法错误，“尊酒阑珊将远别”是眼前实景。</span></span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">8.下列对这首诗的赏析，不正确的一项是（ &nbsp;&nbsp;）</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><strong><span style="mso-spacerun: 'yes'; font-family: 宋体; mso-ansi-font-weight: bold; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">大热五首（其一）</span></strong></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">戴复古</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">天地一大窑，阳炭烹六月。</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">万物此陶镕，人何怨炎热。</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">君看百谷秋，亦自暑中结。</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">田水沸如汤，背汗湿如泼。</span></p>
			
 
				+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">农夫方夏耘，安坐吾敢食！</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">【注】陶镕：陶铸熔炼，比喻培育、造就。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">A.诗人把六月的天地比作一个大窑，太阳像炭火一样熔炼着其中的一切。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">B.暑天虽极炎热，诗人却认为不应抱怨，因为秋天的谷物均赖此而结实。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">C.田中的水被晒得似乎要沸腾，诗人的背上汗水流得就像刚刚用水泼过。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">D.这首诗描写暑热多用比喻和夸张修辞，语言平易浅近，风格质朴自然。</span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【答案】</span><span style="font-family: 宋体;">C</span></span></p>
			
 
				+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【解析】</span><span style="font-family: 宋体;">“背汗湿如泼”描写的是</span></span><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">农夫在暑热中辛苦劳作的情景</span><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">，</span><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">而不是指</span><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">诗人自己。</span></p>
			
 
				+    '''
			
 
				     # pprint(cons)
			
 
				     # print(again_parse(cons))
			
 
				     # print(again_parse(cons))
			
 
				     # print(list(map(lambda x: str(x).replace("     ", " "), again_parse(cons))))
			
 
				     # con1 = r'<p>解：A．研究跨栏动作时，刘翔的大小和形状不能忽略，不能看作质点，故A错误；<br/>B．选取不同的参考系，物体的运动状态是不相同的，故B错误；<br/>C．出租车收费是按路程收费的，故C错误；<br/>D．第<img src="http://192.168.1.145:10811/static/physical_formulas_imgs/16184553930794225.png" data-latex="${4 \rm{s} }$" width="13",height="11" />是指<img src="http://192.168.1.145:10811/static/physical_formulas_imgs/16184553931930702.png" data-latex="${1 \rm{s} }$" width="12",height="11" />的时间，是指从<img src="http://192.168.1.145:10811/static/physical_formulas_imgs/16184553930220437.png" data-latex="${3 \rm{s} }$" width="13",height="11" />末到<img src="http://192.168.1.145:10811/static/physical_formulas_imgs/16184553930794225.png" data-latex="${4 \rm{s} }$" width="13",height="11" />末这一段时间，故D正确；<br/>故选：D．</p>'
			
 
				 
			
 
				-    cons = css_label_wash(cons1)
			
 
				+    cons = css_label_wash(cons2)
			
 
				 
			
 
				     print(cons)
			
--- a/utils/ruku_opera.py
+++ b/utils/ruku_opera.py
@@ -47,7 +47,7 @@ class Ruku():
 
				         self.wordid = wordid
			
 
				         self.callback_url = callback_info["callback_url"]
			
 
				         # self.callback_url = "123456"
			
 
				-        self.source = callback_info["source"]
			
 
				+        self.source = callback_info["source"]  # {"xue_guan": "1", "teacher": "2", "ai": "3", "qtk": 4,"school":5}
			
 
				         self.subject = subject  # items_list[0]["period"] + items_list[0]["subject"]
			
 
				         self.callback_code = 0
			
 
				         self.callback_err = ""
			
@@ -340,6 +340,8 @@ class Ruku():
 
				                 if s:
			
 
				                     s = re.sub(r'(<img src="[^"]*?[a-z\d])\\(?!\\)([^"]*?")', r"\1/\2", str(s))  # 将路径中的\改为/
			
 
				                     s = s.replace(new_img_local, new_img_online)
			
 
				+                    # 将latex的标红标签去掉
			
 
				+                    s = re.sub(r'<span style=\"color: red\">([^"]+?)</span>', r"\1", s)
			
 
				                     if old_img_local:
			
 
				                         return s.replace(old_img_local, new_img_online)
			
 
				                 return s
			
--- a/utils/stem_ans_split.py
+++ b/utils/stem_ans_split.py
@@ -142,9 +142,11 @@ def get_split_pos(row_list):
 
				     """
			
 
				     # 寻找题目和答案的切分点，一定要有“答案”关键字
			
 
				     split_p1 = [k for k, v in enumerate(row_list)
			
 
				-                if re.match(r'(参考|试[题卷]|考试|物理|理综|数学|化学|生物)答案.{,5}$|答案[和与及]?解析([(（].*?[)）])?$'  # |答\s*案$
			
 
				+                if re.match(r'(参考|试[题卷]|考试|物理|理综|数学|化学|生物)答案.{,5}$'
			
 
				+                            r'|答案[和与及]?解析([(（].*?[)）])?$'  # |答\s*案$
			
 
				                             r'|.{,15}(参考|考试|(考?试|检测)[题卷]|物理|理综|数学|化学|生物)(答案|解析|答案[及与和]评分(标准|意见|细则|参考))\s*$'
			
 
				-                            r'|.{,15}评分(标准|参考)|((参考|(考?试|检测)[题卷]|考试|物理|理综|数学|化学|生物)答案|答案[和与及]解析)[\dA-E\s.．、､]+$'
			
 
				+                            r'|.{,15}评分(标准|意见|细则|参考)$'
			
 
				+                            r'|((参考|(考?试|检测)[题卷]|考试|物理|理综|数学|化学|生物)答案|答案[和与及]解析)[\dA-E\s.．、､]+$'
			
 
				                             r'|.{,15}(参考|考试|(考?试|检测)[题卷])(答案|解析|答案[及与和]评分(标准|意见|细则|参考))\s*(物理|理综|数学|化学|生物)?\s*$'
			
 
				                             r'|.{,15}解析[和与及]答案$',
			
 
				                             re.sub(r"[上下]?学[年期]|[\d—【】.．·、､：:(（）)年\s]|[中大]学|模拟|[中高]考|年级|[学期][末中]"
			
--- a/utils/washutil.py
+++ b/utils/washutil.py
@@ -13,6 +13,7 @@ import shutil
 
				 # from PIL import Image
			
 
				 import base64, os, random
			
 
				 import time
			
 
				+import requests
			
 
				 import hashlib
			
 
				 from pprint import pprint
			
 
				 # from bs4 import BeautifulSoup
			
@@ -365,6 +366,7 @@ class HtmlWash():
 
				             # kk = re.search('(<img src=".*?image\d+\.(png|gif|jpg|jpeg))', src)
			
 
				             # new_src = src.replace(kk.group(1), self.img_url[kk.group(1)]) if type(self.img_url) == dict and kk else src
			
 
				             # 图片信息简化替换
			
 
				+            print(src)
			
 
				             new_src = re.sub(r'( data-latex)="\s*\\\[(.*?)\\\]\s*"', r'\1="$\2$"', src)
			
 
				             new_src = re.sub(r'( data-latex="\$[^"]+?\$")',
			
 
				                              lambda x: x.group(1).replace("<", " \lt ").replace("  ", " "), new_src)
			
@@ -375,7 +377,13 @@ class HtmlWash():
 
				             w_h_info = re.search(r'<img src=".*?/(new_)?image([\da-z]+)\..*?width="([\d.]+)[pxt]*?"\s*height="([\d.]+)[pxt]*?"', src)
			
 
				             w_h = " w_h=" + w_h_info.group(3).split('.')[0] + "*" + w_h_info.group(4).split('.')[0] \
			
 
				                 if w_h_info and not mathpix else ""  # w_h 和 mathpix只存在一个
			
 
				-            image_id = re.search(r'<img src=".*?/(new_)?image([\da-z]+)\.', src).group(2)
			
 
				+            # image_id = re.search(r'<img src=".*?/(new_)?image([\da-z]+)\.', src).group(2)
			
 
				+            image_info = re.search(r'<img src=".*?/([^/]+?)/(new_)?image([\da-z]+)\.', src)  # 2023.12.1
			
 
				+
			
 
				+            print(image_info.groups())
			
 
				+            image_id = image_info.group(1) + image_info.group(3)
			
 
				+            if len(image_id) > 10:
			
 
				+                image_id = image_id[-10:]
			
 
				             src2subs[src] = '<imgsrc' + image_id + w_h + mathpix + "/>"
			
 
				             subs2src['<imgsrc' + image_id + w_h + mathpix + "/>"] = new_src
			
 
				         for k, v in src2subs.items():
			
@@ -481,7 +489,7 @@ def get_md5(image_id):
 
				     return str(md.hexdigest())
			
 
				 
			
 
				 
			
 
				-def wash_after(res_dict):
			
 
				+def wash_after(res_dict, subject="数学"):
			
 
				     """
			
 
				     1.处理最终结果多余的换行符;2.对题文中已给答案的选择填空进行替换；3.选择题的细分
			
 
				     :param res_dict:
			
@@ -496,6 +504,8 @@ def wash_after(res_dict):
 
				     is_optional = False
			
 
				     option_score = 0
			
 
				     select_type_id = []
			
 
				+    all_content_str_list = []
			
 
				+    topic_type_list = []
			
 
				 
			
 
				     for num, sr in enumerate(res_dict):
			
 
				         sr["stem"] = re.sub(r"\n[_\-\s]*密[…O•.\s]*封[….O•\s]*装?[…O•.\s]*订?[….O•\s]*线?"
			
@@ -602,8 +612,8 @@ def wash_after(res_dict):
 
				                 sr["answer_type"] = configs.answer_type[sr["answer_type"]]
			
 
				 
			
 
				             if not sr["parse"] and not sr["key"]:  # 答案和解析都没有
			
 
				-                sr["parse"] = "略"
			
 
				-                sr["key"] = "略"
			
 
				+                # sr["parse"] = "略"
			
 
				+                # sr["key"] = "略"
			
 
				                 sr['errmsgs'].append("本题缺少答案和解析")
			
 
				             elif not sr["key"] and sr["parse"]:
			
 
				                 sr["key"] = ""  # 见解析
			
@@ -643,6 +653,7 @@ def wash_after(res_dict):
 
				         # if "type1" in sr:
			
 
				         #     del sr["type1"]
			
 
				 
			
 
				+        # 题型纠正
			
 
				         # 将选择题改为单选或多选,"is_multiple_choice"
			
 
				         sr['type'] = re.sub("([单多])项选择题?", r"\1选题", sr['type'])
			
 
				         sr['type'] = sr['type'].replace("题题", "题")  # .replace("简答", "解答")
			
@@ -653,6 +664,8 @@ def wash_after(res_dict):
 
				                 sr['type'] = '多选题'
			
 
				             elif len(re.findall("[A-Z]", sr["key"])) == 1:
			
 
				                 sr['type'] = '单选题'
			
 
				+            elif "数学" in subject or "物理" in subject:
			
 
				+                sr['type'] = '单选题'
			
 
				             info_x = re.search("^[（(](多)选题?[）)]", sr["stem"].replace(" ", ""))
			
 
				             if info_x:
			
 
				                 sr['type'] = '{}选题'.format(info_x.group(1))
			
@@ -672,15 +685,51 @@ def wash_after(res_dict):
 
				                 sr['type'] = '多选题'
			
 
				             elif len(re.findall("[A-Z]", sr["key"])) == 1:
			
 
				                 sr['type'] = '单选题'
			
 
				+            elif "数学" in subject or "物理" in subject:
			
 
				+                sr['type'] = '单选题'
			
 
				             else:
			
 
				                 sr['type'] = '选择题'
			
 
				                 if "缺少答案" not in "".join(sr['errmsgs']):
			
 
				                     sr['errmsgs'].append("本题缺少答案")
			
 
				+        elif "数学" in subject:
			
 
				+            if sr['type'].replace("题", "") == "填空":
			
 
				+                if sr['blank_num'] > 1:
			
 
				+                    sr['type'] = "多空题"
			
 
				+                else:
			
 
				+                    sr['type'] = "单空题"
			
 
				+            elif sr['type'].replace("题", "") not in ["单空", "多空"]:
			
 
				+                sr['type'] = "解答题"
			
 
				+        # elif "物理" in subject:
			
 
				+        #     # 用第一版模型预测
			
 
				+        #     content = sr['stem']
			
 
				+        #     if "options" in sr and sr["options"]:
			
 
				+        #         content+= "\n" + "\n".join(["{}、{}".format(chr(ord('@') + idm + 1), option)
			
 
				+        #                                     for idm, option in enumerate(sr["options"])])
			
 
				+        #     try:
			
 
				+        #         r = requests.post(url=configs.phy_topicType_ip,
			
 
				+        #                            json={"content": content, "period": "高中",
			
 
				+        #                                 "topic_type": sr['type']})
			
 
				+        #         sr['type'] = r.json()["res"]
			
 
				+        #         if sr['type'] == "简答题":
			
 
				+        #             sr['type'] = "解答题"
			
 
				+        #     except Exception as e:
			
 
				+        #         print(e)
			
 
				+        #         if sr['type'].replace("题", "") in ["单空", "多空", "填空"]:
			
 
				+        #             sr['type'] = "填空题"
			
 
				+        #         else:
			
 
				+        #             sr['type'] = "解答题"
			
 
				         elif sr['type'].replace("题", "") in ["单空", "多空", "填空"]:
			
 
				             sr['type'] = "填空题"
			
 
				         elif sr['type'] not in ["选择", "选择题"]:
			
 
				             sr['type'] = "解答题"
			
 
				 
			
 
				+        content = sr['stem']
			
 
				+        if "options" in sr and sr["options"]:
			
 
				+            content += "\n" + "\n".join(["{}、{}".format(chr(ord('@') + idm + 1), option)
			
 
				+                                         for idm, option in enumerate(sr["options"])])
			
 
				+        all_content_str_list.append(content)
			
 
				+        topic_type_list.append(sr['type'])
			
 
				+
			
 
				         # """按照原先高中数学解析的最后输出格式整理输出"""
			
 
				         # sr["type"] = sr['type'].replace("非选择", "解答").replace("题题", "题")  #
			
 
				         sr["topic_num"] = sr['item_id']
			
@@ -694,7 +743,9 @@ def wash_after(res_dict):
 
				             del sr['is_optional']
			
 
				         if 'spliterr_point' in sr:
			
 
				             del sr['spliterr_point']
			
 
				-        del sr['score'], sr['item_id']
			
 
				+        if 'score' in sr:
			
 
				+            del sr['score']
			
 
				+        del sr['item_id']
			
 
				 
			
 
				         # ---------------------字符串公式处理--------------------------------
			
 
				         # sr["stem"] = get_equation_instr(sr["stem"])
			
@@ -703,6 +754,43 @@ def wash_after(res_dict):
 
				         # if "options" in sr:
			
 
				         #     sr["options"] = list(map(get_equation_instr, sr["options"]))
			
 
				         # ----------------------------------------------------------------
			
 
				+    # 物理题型批量调接口:节约时间
			
 
				+    if "物理" in subject:
			
 
				+        epoches = int(len(all_content_str_list) / 10)
			
 
				+        pred_topic_types = []
			
 
				+        if epoches > 0:
			
 
				+            last = 0
			
 
				+            for epoch in range(epoches):
			
 
				+                input_data = {"content": all_content_str_list[last:(epoch+1)*10], "period": "高中",
			
 
				+                              "topic_type": topic_type_list[last:(epoch+1)*10]}
			
 
				+                last = (epoch+1)*10
			
 
				+                try:
			
 
				+                    r = requests.post(url=configs.phy_topicType_ip, json=input_data)
			
 
				+                    pred_topic_types.extend(r.json()["res"])
			
 
				+                except Exception as e:
			
 
				+                    print(e)
			
 
				+                    pred_topic_types.extend([""]*10)
			
 
				+            rest_con = all_content_str_list[last:]
			
 
				+            rest_topic_type = topic_type_list[last:]
			
 
				+        else:
			
 
				+            rest_con = all_content_str_list
			
 
				+            rest_topic_type = topic_type_list
			
 
				+        if rest_con:
			
 
				+            input_data = {"content": rest_con, "period": "高中", "topic_type": rest_topic_type}
			
 
				+            try:
			
 
				+                r = requests.post(url=configs.phy_topicType_ip, json=input_data)
			
 
				+                pred_topic_types.extend(r.json()["res"])
			
 
				+            except Exception as e:
			
 
				+                print(e)
			
 
				+                pred_topic_types.extend([""] * len(rest_con))
			
 
				+        # 将预测题型替换到res_dict中
			
 
				+        if any([True for i in pred_topic_types if i]) and len(pred_topic_types) == len(res_dict):
			
 
				+            for idx, pred_type in enumerate(pred_topic_types):
			
 
				+                if pred_type and res_dict[idx]['type'] in ["填空题", "解答题"]:
			
 
				+                    if pred_type == "简答题":
			
 
				+                        pred_type = "解答题"
			
 
				+                    res_dict[idx]['type'] = pred_type
			
 
				+    # --------------------------------------------------------------
			
 
				     # 换行符替换
			
 
				     convert_huanhang(res_dict)
			
 
				     # ------------------------------------------------------------------------
			
--- a/utils/washutil_for_DL_way.py
+++ b/utils/washutil_for_DL_way.py
@@ -0,0 +1,496 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding:utf-8 -*-
			
 
				+
			
 
				+import re
			
 
				+import base64, os, random
			
 
				+import time
			
 
				+from pprint import pprint
			
 
				+import configs
			
 
				+from utils.field_eq2latex import get_latex
			
 
				+from utils.html_again_parse import css_label_wash
			
 
				+
			
 
				+
			
 
				+def table_label_cleal(con):
			
 
				+    """
			
 
				+    去掉表格中的【换行符】
			
 
				+    """
			
 
				+    # print(con)
			
 
				+    # print('------------------------------------------')
			
 
				+    con = re.sub(r"\n(\s|\n|\t)+", "\n", con)
			
 
				+    count = 1
			
 
				+    while re.search(r"</?[a-z]+>\n(</?[a-z]+>|<td\s+\n*[a-z=\"\d]+>)", con, re.S) and count <= 10:
			
 
				+        con = re.sub("(</?t[dr]>|</?table>|</?tbody>|</?div>)\n(</?t[dr]>|</div>|</?table>|</?tbody>|<p>)",
			
 
				+                     r"\1\2", con, flags=re.S)
			
 
				+        con = re.sub(r'(</?t[rd]>)\n(<td\s.+?>)', r'\1\2', con, flags=re.S)
			
 
				+        count += 1
			
 
				+    # if re.search(r"<table>(.|\n)+?</table>", con, re.S|re.M):
			
 
				+    #     aa = re.search(r"(<table>(.|\n)+?</table>)", con, re.S|re.M)
			
 
				+    #     con = con.replace(aa.group(1),aa.group(1).replace("\n",""))
			
 
				+
			
 
				+    # 将空表格的情况去掉
			
 
				+    con = re.sub(r'<table>[\s\n\t]*?<tbody>[\s\n\t]*?(<tr>[\s\n\t]*?<td[^<>]*?>[\s\n\t]*?<p>[\s\n\t]*?</p>'
			
 
				+                 r'[\s\n\t]*?</td>[\s\n\t]*?</tr>[\s\n\t]*?)+</tbody>[\s\n\t]*?</table>[\s\n\t]*?<p>', "", con,
			
 
				+                 flags=re.S)
			
 
				+    con = re.sub(r'(</table><p>)\s*([(（]\s*\d\s*[)）])', r'\1\n\2', con)
			
 
				+    return con
			
 
				+
			
 
				+
			
 
				+def base642img(html_data, wordid):
			
 
				+    """
			
 
				+    【基于mathjax渲染输出是css-html格式】
			
 
				+    将base64编码的图片保存到本地
			
 
				+    :return:
			
 
				+    """
			
 
				+    # 二进制图片进行转化， 按“word_id”建立文件夹
			
 
				+    # time_str = datetime.datetime.strftime(datetime.datetime.now(), '%Y_%m_%d')
			
 
				+    # file_path = configs.IMG_FOLDER + '/' + str(self.wordid)
			
 
				+    # if not os.path.exists(file_path):
			
 
				+    #     os.makedirs(file_path)
			
 
				+    # else:
			
 
				+    # 思路1：删除图片,重建文件夹，【所有的新图片都是以base64格式传过来的】
			
 
				+    # shutil.rmtree(file_path)
			
 
				+    # os.makedirs(file_path)
			
 
				+    # 思路2：每一次再解析都将base64图片保存到本地再以路径形式返回
			
 
				+    # st = len(os.listdir(file_path))  # 不要以序号索引的形式命名
			
 
				+
			
 
				+    # 统计所有base64编码
			
 
				+    all_base64_image = re.findall(r'(<img ([a-z]+="[^"]*?" )?src="(data:image[^>"]+?)"(.*?)\s*/?>)', str(html_data),
			
 
				+                                  flags=re.S)
			
 
				+    if all_base64_image:
			
 
				+        file_path = configs.IMG_FOLDER + '/' + str(wordid)
			
 
				+        if not os.path.exists(file_path):
			
 
				+            os.makedirs(file_path)
			
 
				+        # 新图片命名
			
 
				+        name_list = random.sample(range(100000, 999999), len(all_base64_image))
			
 
				+        for n, img in enumerate(all_base64_image):
			
 
				+            img1 = img[2].split(",", maxsplit=1)
			
 
				+            img_type_info = re.search("data:image/(.+?);base64", img1[0])
			
 
				+            img_type = img_type_info.group(1) if img_type_info else ""
			
 
				+            # 可能还有alt和style的属性，暂时先不要
			
 
				+            w_info = re.search('( width="\d+")', img[3])
			
 
				+            h_info = re.search('( height="\d+")', img[3])
			
 
				+            img_data = base64.b64decode(str(img1[-1]))
			
 
				+            if img_type:
			
 
				+                # save_path = os.path.join(configs.new_img_ip, get_md5(n)+"."+img_tape)
			
 
				+                img_name = "new_image" + str(int(time.time())) + str(name_list[n]) + "." + img_type
			
 
				+                save_path = os.path.join(file_path, img_name)
			
 
				+                with open(save_path, 'wb') as f:
			
 
				+                    f.write(img_data)
			
 
				+                # self.localnewpic_list.append(save_path)
			
 
				+                # put_key = "/zyk/uploadfiles/wording/" + str(self.wordid) + "/{}".format(img_name)
			
 
				+                # self.put_key_list.append(save_path)
			
 
				+                flag_behind = '" />'
			
 
				+                if w_info and h_info:
			
 
				+                    flag_behind = '"' + w_info.group(1) + h_info.group(1) + ' />'
			
 
				+                temp_img = '<img src="' + configs.new_img_ip + '/' + str(wordid) + '/' + img_name + flag_behind
			
 
				+                # new_img = '<img src="http://' + configs.public_bucket_addr + put_key + '" />'
			
 
				+                html_data = html_data.replace(img[0], temp_img)
			
 
				+    return html_data
			
 
				+
			
 
				+
			
 
				+class HtmlWash_2():
			
 
				+    def __init__(self, html, wordid, is_reparse=0, img_url="", must_latex=0):
			
 
				+        """
			
 
				+        html文本清洗
			
 
				+        批量再解析中，新增图片信息替换的文本返回作为ocr保存文本，
			
 
				+        继续往下清洗的文本，则进入结构化解析逻辑中
			
 
				+        """
			
 
				+        # super().__init__(html, wordid, is_reparse, must_latex)
			
 
				+        self.html = html
			
 
				+        self.img_url = img_url
			
 
				+        self.wordid = wordid
			
 
				+        self.is_reparse = is_reparse
			
 
				+        self.must_latex = must_latex
			
 
				+        # self.put_key_list = []
			
 
				+        # self.localnewpic_list =[]
			
 
				+        self.sub_list = ["</?div>", "</?b>", "</?caption>", "</?center>", "</?cite>", "</?code>", "</?colgroup>",
			
 
				+                         "</?menu>", "</?dd>", "</?dir>", "</?li>", "</?em>", "</?article>", "</?header>", "</?ruby>",
			
 
				+                         "</?summary>", "</?details>", "</?strong>", "</?strike>", "</?small>", "</?select>",
			
 
				+                         "</?section>", "</?script>", "</?[su]>", "</?var>", "</?ul>", "</?tt>", "</?title>",
			
 
				+                         "</?thead>",
			
 
				+                         "</?tfoot>", "<hr />", "<hr>", ""]
			
 
				+        self.sub_dd = {'&times;': '×',
			
 
				+                       '&divide;': '÷',
			
 
				+                       '&deg;': '°',
			
 
				+                       '&middot;': '·',
			
 
				+                       '&plusmn;': '±',
			
 
				+                       '&ordm;': 'º',
			
 
				+                       '&sup1;': '¹',
			
 
				+                       '&sup2;': '²',
			
 
				+                       '&sup3;': '³',
			
 
				+                       '&frac12;': '1/2',
			
 
				+                       '&frac14;': '¼',
			
 
				+                       '&frac34;': '¾',
			
 
				+                       '&yen;': '¥',
			
 
				+                       'm&sup3;': 'm³',
			
 
				+                       # '&lt;': '<',
			
 
				+                       '&pound;': '£',
			
 
				+                       # '∠&lt;': '&lt;',
			
 
				+                       '&gt;': '>',
			
 
				+                       "Ａ": "A",
			
 
				+                       "А": "A",
			
 
				+                       "Α": "A",
			
 
				+                       "Ｂ": "B",
			
 
				+                       "В": "B",
			
 
				+                       "в": "B",
			
 
				+                       "Β": "B",
			
 
				+                       "Ｃ": "C",
			
 
				+                       "С": "C",
			
 
				+                       "ｃ": "c",
			
 
				+                       "с": "c",
			
 
				+                       "Ｄ": "D",
			
 
				+                       "Ε": "E",
			
 
				+                       "Ｅ": "E",
			
 
				+                       "Ｆ": "F",
			
 
				+                       "Ｇ": "G",
			
 
				+                       "ｇ": "g",
			
 
				+                       "ｍ": "m",
			
 
				+                       "Ｎ": "N",
			
 
				+                       "ｓ": "s",
			
 
				+                       "ｔ": "t",
			
 
				+                       "／": "/",
			
 
				+                       "＝": "=",
			
 
				+                       "－": "-",
			
 
				+                       "２": "2", "３": "3", "４": "4", "５": "5", "６": "6",
			
 
				+                       "７": "7", "８": "8", "９": "9", "１": "1", "０": "0",
			
 
				+                       '&nbsp;&nbsp;': ' ',
			
 
				+                       '&nbsp;': ' ',
			
 
				+                       "〖": '【',
			
 
				+                       "〗": '】',
			
 
				+                       "題": '题',
			
 
				+                       "单项选择": '单选',
			
 
				+                       "多项选择": '多选',
			
 
				+                       # "不定项选择": '选择',
			
 
				+                       "双项选择": '多选',
			
 
				+                       "实验与探究题": '实验',
			
 
				+                       "原理综合题": '原理题',
			
 
				+                       }
			
 
				+
			
 
				+    def new_pic_sub(self):
			
 
				+        """
			
 
				+        针对base64图片先保存到本地，入库时再换成腾讯云线上地址
			
 
				+        # 第一版：再解析中，将二进制图片进行转化,图片怎么保存比较好，先再“天数”建立文件夹
			
 
				+        第一版：再解析中，根据“word_id”建立文件夹
			
 
				+        :return:
			
 
				+        """
			
 
				+        if self.is_reparse:
			
 
				+            # css 标签清洗
			
 
				+            self.html = css_label_wash(self.html)
			
 
				+            # 保存base64编码的图片
			
 
				+            self.html = base642img(self.html, self.wordid)
			
 
				+        self.new_html = self.html
			
 
				+
			
 
				+    def html_cleal(self):
			
 
				+        # =======清洗mathjax标签========
			
 
				+        if "MathJax" in self.html:  # 再解析中存在mathjax公式渲染的标签
			
 
				+            all_mathjax = re.findall('(<span class="MathJax_Preview".*?</script>(</span>)*)', self.html)
			
 
				+            for jax in all_mathjax:
			
 
				+                latex = re.findall('<script .+?">(((?!(</)).)*?)</script>(</span>)*', jax[0])
			
 
				+                if latex:
			
 
				+                    latex = "${}$".format(latex[0][0])
			
 
				+                    self.html = self.html.replace(jax[0], latex)
			
 
				+                else:
			
 
				+                    self.html = self.html.replace(jax[0], "")
			
 
				+
			
 
				+        # ======再解析中的新图片处理=====
			
 
				+        self.new_pic_sub()
			
 
				+
			
 
				+        # =====特殊符号处理=====
			
 
				+        html2txt = re.sub(r"|".join(self.sub_list), "", str(self.html))  # ("", " ")   #2020/4/7
			
 
				+        html2txt = re.sub("|".join(self.sub_dd.keys()), lambda x: self.sub_dd[x.group()], html2txt)  # 2020/4/1,4/7,4/20
			
 
				+        html2txt = re.sub("[不非]定[向项]选择", "不定选择", html2txt)
			
 
				+        html2txt = html2txt.replace(r"\\[{\\text{V}}V\]", "Ⓥ").replace(r"\\[{\\text{A}}A\]", "Ⓐ") \
			
 
				+            .replace(r"\\[{\\text{W}}W\]", "Ⓦ").replace(r"\\[{\\text{X}}X\]", "Ⓧ").replace(r"\\[{\\text{G}}G\]", "Ⓖ") \
			
 
				+            .replace("\uf067", "γ").replace('', "γ").replace('\uf020', "").replace("\u3000", " ") \
			
 
				+            .replace("\u2003", " ").replace("\x7f", " ").replace("\xa0", "")
			
 
				+        html2txt = re.sub(r"(<p>\s*)【例题(\d+)】", r"\1\2、", html2txt)
			
 
				+        html2txt = re.sub(r"\\\(|\\\)", "$", html2txt)
			
 
				+
			
 
				+        # 域公式的转化处理；<sub>\<sup>可以在前端显示，不需要用latex渲染
			
 
				+        try:
			
 
				+            html2txt, newhml = get_latex(html2txt, self.is_reparse, self.wordid, self.must_latex)
			
 
				+            if newhml:  # 存在域公式转图片时，需要将原文本的域公式也转为图片信息
			
 
				+                self.new_html = newhml
			
 
				+            html2txt = html2txt.replace("【omml-latex】", "")
			
 
				+        except:
			
 
				+            html2txt = html2txt.replace("【omml-latex】", "")
			
 
				+
			
 
				+        # 字符串公式的处理：如Fe<sub>2</sub>O<sub>3</sub>, 在结构化之后处理比较好
			
 
				+        # <br/>处理
			
 
				+        html2txt = re.sub(r"<br\s*/?>", "\n", html2txt)
			
 
				+        html2txt = re.sub(r"[（(]\s*(\d)\s*\$分\s*[)）]", r"$(\1分)", html2txt)
			
 
				+
			
 
				+        # =====题型行的统一处理=====
			
 
				+        # ---->>>>>题型行可能放在表格中
			
 
				+        if len(re.findall("</table>", html2txt)) >= 8:  # 这个限制还不太严谨
			
 
				+            for tt in re.finditer('<tr>(((?!(</?tr>)).)*)</tr>', html2txt, re.S):
			
 
				+                tt_list = re.split(r'^\s*<td[^<>]*?>|</p></td>|</td>[\n\s]*?<td[^<>]*?>'
			
 
				+                                   r'|</td>\s*\n|</td>\s*$|\n\s*<td[^<>]*?>|<td[^<>]*?><p>',
			
 
				+                                   tt.group(1).strip())  # </td>\s*[$\n]这样无效
			
 
				+                tt_list = [col for col in tt_list if col.strip()]
			
 
				+                if " ".join(tt_list).replace(" ", "") in ['得分评卷人', '评卷人得分']:
			
 
				+                    html2txt = html2txt.replace(tt.group(0), "")
			
 
				+                else:
			
 
				+                    pass
			
 
				+                    # html2txt = html2txt.replace(tt.group(0), "<p>" + " ".join(tt_list) + "</p>")
			
 
				+                # html2txt = re.sub(r"</?tbody>|</?table>|</?div>", "", html2txt)
			
 
				+        # ---->>>>>end
			
 
				+        html2txt = re.sub(r"(</table>)\s*([一二三四五六七八九十]\s*[、.．､：:]?.{2,6}题)", r"\1</p>\2", html2txt)
			
 
				+        html2txt = re.sub(r'([一二三四五六七八九十])\s*[、.．､，,：:]\s*(论述|填空|探究)题?[与和､、，,\s]*?(计算题|实验题)', r"\1､\3", html2txt)
			
 
				+        html2txt = re.sub(r'<td[^<>]*?><p>(([一二三四五六七八九十])\s*[、.．､，,：:]\s*(.{2,4}题)\s*</p>)</td>[^p]*?<p>', r"\1",
			
 
				+                          str(html2txt), flags=re.S)
			
 
				+        html2txt = re.sub(r"<p>\s*([一二三四五六七八九十])\s*[、.．､,，：:]?\s*(计算|[解简]答|实验|作图)题?[与和、､，,\s]*?(计算|[解简]答|实验|作图)",
			
 
				+                          r"<p>\1､\2题", html2txt)
			
 
				+        html2txt = re.sub(r'<p>\s*[(（]\s*[一二三四五六]\s*[)）]\s*必考题\s*(.?|.+?分\s*[.。．]?)\s*</p>', "", html2txt)
			
 
				+        html2txt = re.sub(r'<p>\s*[(（]\s*[一二三四五六]\s*[)）]\s*选考题\s*.?\s*.{,4}(?<!\d)(\d+分)\s*[,，。].{,50}</p>',
			
 
				+                          r"<p>【选做题】:'\1'</p>", html2txt)
			
 
				+        html2txt = re.sub(r'<p>\s*[(（]\s*[一二三四五六]\s*[)）]\s*选考题\s*(.?|.+?分\s*[.。．]?)\s*</p>', "<p>【选做题】</p>", html2txt)
			
 
				+        html2txt = re.sub(r'<p>\s*([一二三四五六七八九十])\s*[、.．､，,：:]?\s*(单项?选择?|非?选择|多项?选择?|不定选择|填空|计算|[解简]答|实验|作图)题?\s*</p>',
			
 
				+                          r"<p>\1､\2题</p>", html2txt)
			
 
				+        html2txt = re.sub(r'([一二三四五六七八九十])\s*[、.．､，,：:]?\s*(单选|单项选择|选择|不定选择|多选|多项选择|填空|计算|[解简]答|实验|作图)\s*(?!题)'
			
 
				+                          r'([(（]\s*本题|.*?\d分)', r"\1" + "､" + r'\2' + "题" + r"\3", html2txt)
			
 
				+        html2txt = re.sub(r'([一二三四五六])\s*[、.．､，,:：]?\s*(单选|单项选择|非?选择|不定选择|多选|多项选择|填空|计算|[解简]答|实验|作图)题',
			
 
				+                          r"\1" + "､" + r'\2' + "题", html2txt)
			
 
				+        # html2txt = re.sub(r'([一二三四五六七八九十])\s*[、.．､，,：:]?\s*[(（]\s*本大题(.*?选项中)', r"\1" + "､" + "选择题", html2txt)  # + r"\2"
			
 
				+        # html2txt = re.sub(r'<p>\s*([一二三四五六七八九十])\s*[、.．､，,]?\s*[(（本大题]*?(.*?选项中)', r"\1" + "､" + "选择题", html2txt)
			
 
				+        html2txt = re.sub(r'([一二三四五六七八九十])\s*[、.．､，,：:]?\s*([(（]\s*(每小题|本大?题)((?!(选项)).)+?[）)]|综合题)',
			
 
				+                          r"\1" + "､" + "解答题", html2txt)
			
 
				+        html2txt = re.sub(r'(?<!<p>)\s*([一二三四五六七八九十]\s*[、.．､，,：:]?\s*(单项?选择?|选择|不定选择|多项?选择?|填空|计算|[解简]答|实验|作图)题)',
			
 
				+                          r'</p>\n<p>\1', html2txt)
			
 
				+        html2txt = re.sub(r'<p>\s*([一二三四五六七八九十])\s*[、.．､，,：:]?\s*[(（]?本?大?题((?!(选项)).)+?[)）]?\s*</p>', r"<p>\1､本大题</p>",
			
 
				+                          html2txt)
			
 
				+
			
 
				+        # html2txt = re.sub(r'<p>\s*[^一二三四五六七八九十]{,3}\s*[、.．､]\s*(选择|不定选择|单选|多选|计算|[解简]答|实验|作图)题', r"<p>一､\1题", html2txt)
			
 
				+
			
 
				+        # =====图片的处理=====
			
 
				+        # 1>>根据图片宽高的异常值判断删除隐藏图片
			
 
				+        def sub1(ss):
			
 
				+            if float(ss.group(1)) <= 3 and float(ss.group(2)) <= 3:
			
 
				+                return ""
			
 
				+            else:
			
 
				+                return ss.group(0)
			
 
				+
			
 
				+        html2txt = re.sub(r'<img src=.*? width="([\d.]+)p[xt]" height="([\d.]+)p[xt]"\s*/?>', sub1, html2txt)
			
 
				+
			
 
				+        # 2>>将图片中带有的汉字去掉
			
 
				+        html2txt = re.sub(r'(<img src=.*?) alt=".+?"', r"\1", html2txt)
			
 
				+        # html2txt = re.sub(r'(<img src=.+?(?<!\\)\")>', r"\1 />", html2txt)  # 将">换为" />
			
 
				+        html2txt = re.sub(r'(<img src=(?!\sstyle=)+?(?<!\\)\")>', r"\1 />", html2txt)  # 将">换为" />
			
 
				+
			
 
				+        # =====答案解析关键字的统一处理=====
			
 
				+        html2txt = re.sub(r'【\s*(<img src=((?!/>).)+?/>\s*)*?([解答])\s*(<img src=((?!/>).)+?/>\s*)*?([析案])\s*'
			
 
				+                          r'(<img src=((?!/>).)+?/>\s*)*?】', r"【\3\6】", str(html2txt))  # 2022/4/28
			
 
				+        html2txt = re.sub(r'<p>\s*(解\s*[：:])', r"<p>【解答】", str(html2txt))
			
 
				+        html2txt = re.sub(r'【[^【】]*?(答案|[解分][析答]|详解|点[评睛])[^【】]*?】', r"【\1】", str(html2txt))
			
 
				+        # html2txt = re.sub(r'【[^【】]*?(答案|[解分][析答])[^【】]*?】', r"【\1】", str(html2txt))
			
 
				+        html2txt = re.sub(r'(\n\s*|<p>\s*|\s{2,}|\n\s*\d{,2}\s*[、.．､]\s*)(答案|解析|解答|详解|点评|点睛|考点|专题)\s*[：:]', r"\1【\2】",
			
 
				+                          str(html2txt))
			
 
				+        html2txt = re.sub(r'(\n|^|<p>)\s*(([1-9]|[1-9][0-9])\s*[.．、､])?\s*\[\s*(答案|解析|解答|详解|点评|点睛|考点|专题)\s*\]',
			
 
				+                          r"\1\2【\4】", str(html2txt))
			
 
				+        html2txt = re.sub(r'([A-D])\s*\[\s*(解析|解答|详解|点评|点睛|考点|专题)\s*\]', r"\1\n【\2】", str(html2txt))
			
 
				+        html2txt = re.sub(r'(\n|^|<p>)\s*(分析)\s*[：:]', r"【\2】", str(html2txt))
			
 
				+        if "【解析】" not in html2txt and "【解答】" in html2txt and "【分析】" not in html2txt:
			
 
				+            html2txt = re.sub(r'【解答】', "【解析】", str(html2txt))
			
 
				+
			
 
				+        # =====其他关键字的处理=====
			
 
				+        html2txt = re.sub(r'<p>\s*(类型|知识点|考查角度|拔尖角度)[一二三四五六七八九十\d+][^p]*?</p>', "", str(html2txt))
			
 
				+        html2txt = re.sub(r'<p>\s*(选修[\d-]*?[：:].{2,15})\s*</p>', r"<p>【章节】\1</p>", html2txt)
			
 
				+        html2txt = re.sub(r'<p>\s*([一二三四五六]\s*[、.．､]?)?\s*(\[.{2}-*?选修[\d-]*?.*?\])\s*([(（]\d+分[)）])?\s*</p>',
			
 
				+                          r"<p>【章节】\2</p>", html2txt)
			
 
				+        html2txt = re.sub(r'<p>\s*(基础|中档|综合)题[^p题]*?</p>|<p>\s*【(考点|专题)】[^p]*?</p>', "", str(html2txt))
			
 
				+        html2txt = re.sub(r'<p>\s*(基础训练|提升训练|探究培优)</p>', "", str(html2txt))
			
 
				+        html2txt = re.sub(r'<p>注意事项[:：]\s*</p>(\n+\s*<p>\s*\d\s*[、.．､][^/]+?</p>){1,}', "", html2txt, flags=re.S)
			
 
				+        html2txt = re.sub(r'<p>注意事项[:：]\s*\d\s*[、.．､][^/]+?</p>(\n+\s*<p>\s*\d\s*[、.．､][^/]+?</p>){1,}', "", html2txt,
			
 
				+                          flags=re.S)
			
 
				+        html2txt = re.sub(r'[(（]\s*([A-Z\dⅠⅡⅢⅣⅤ]+|IV)\s*[)）]', r"(\1)".replace(" ", "").replace("(IV)", "Ⅳ"), html2txt)
			
 
				+        html2txt = re.sub(r'[(（](\s*\d\s*\d?\s*分?\s*)[)）]', "(" + r'\1'.replace(" ", "") + ")", html2txt)
			
 
				+        html2txt = re.sub(r'\[来源:.*?\]', "", html2txt)
			
 
				+        html2txt = re.sub('<p>欢迎访问.*?</p>', '', html2txt)
			
 
				+        html2txt = re.sub('w\s*w\s*w\..*?(\.\s*c\s*o\s*m|\.cn)+|(?<!["“=\'])http:.*?\.(com|cn|org)', "",
			
 
				+                          html2txt)  # ww w.gkstk.c om
			
 
				+        html2txt = re.sub(r'<(table|tr) [a-z]+="\d+">', r'<\1>', html2txt)  # <td rowspan="2">保留
			
 
				+        html2txt = re.sub(r'<(table)( [a-z]+=".*?")+>', r'<\1>', html2txt)
			
 
				+        html2txt = re.sub(r'<p>\s*第\s*[二三四ⅡⅢⅣ]\s*(卷|部分)\s*([(（].*?[)）]|非?选择题.{,8})?\s*</p>', "<p>【非选择题】</p>", html2txt)
			
 
				+        # == == =对可能的题型行的处理 == ==
			
 
				+        html2txt = re.sub("<p>【非选择题】</p>((\s|\n|<p>|</p>)*\d{1,2}\s*[.．、､].+?)", r"<p>二、解答题</p>\1", html2txt) \
			
 
				+            .replace("【非选择题】", "")
			
 
				+
			
 
				+        # =====选项的处理=====
			
 
				+        html2txt = re.sub(r'(<p>\s*([1-9]|[1-9][0-9])\s*[.．、､].+?[(（]\s*[）)])\s*(A\s*[.．、､][^/]*?</p>)',
			
 
				+                          r"\1</p>\n<p>\3", str(html2txt))
			
 
				+
			
 
				+        # =====题号的处理=====
			
 
				+        html2txt = re.sub(r'([ED]\s*[、.．､].*?((?<![:：])\s+|</su[pb]>\s*))(([1-9]|[1-9][0-9])\s*[、.．､])',
			
 
				+                          r"\1</p>\n<p>\3", html2txt)
			
 
				+        html2txt = re.sub(r'((</?p>|\n)\s*(<img src=.*?"\s*/?>\s*)?([1-9]|[1-9][0-9]))\s*'
			
 
				+                          r'([（(]\s*(\d{1,2}[.\s\d]*?分|.{2,3}题?)\s*[)）]|解析?\s*[:：]|【解析】)', r"</p>\1､\5", html2txt)
			
 
				+        html2txt = re.sub(r"<p>\s*([1-9]|[1-9][0-9])\s*([(（]20\d{2}\s*[\u4e00-\u9fa5、､]{2,9}[)）])", r"<p>\1､\2",
			
 
				+                          html2txt)
			
 
				+        html2txt = re.sub(r"<p>\s*([1-9]|[1-9][0-9])\s*(【(解析?|答案?)】|(解析?|答案?)\s*[:：]|\[(答案|解析)\])", r"<p>\1､\2",
			
 
				+                          html2txt)
			
 
				+        html2txt = re.sub(r"<p>\s*([1-9]|[1-9][0-9])\s*([(（]\s*\d+\s*分?\s*[)）])?(【(解析?|答案?)】|(解析?|答案?)\s*[:：]"
			
 
				+                          r"|\[(答案|解析)\])", r"<p>\1､\2\3", html2txt)
			
 
				+        # 图片和题号相连情况
			
 
				+        html2txt = re.sub(r"<p>\s*(<img src=((?!/>).)+?/>)\s*([1-9]|[1-9][0-9])\s*"
			
 
				+                          r"([(（]20\d{2}\s*[\u4e00-\u9fa5、､]{2,9}[)）])", r"<p>\3､\1\4", html2txt)  # 2024.5.6
			
 
				+        html2txt = re.sub(r'<p>((\s*<su[bp]>\s*)?<img src=.*? height="[\d.]+p[tx]"\s*/?>(\s*</su[bp]>)?\s*)'
			
 
				+                          r'(([1-9]|[1-9][0-9])\s*[、.．､])', r"<p>\4\1", html2txt)  # 2024.5.6
			
 
				+        html2txt = re.sub(r"(</p>|\n)\s*(<img src=((?!/>).)+?/>)\s*([1-9]|[1-9][0-9])\s*"
			
 
				+                          r"([(（]20\d{2}\s*[\u4e00-\u9fa5、､]{2,9}[)）])", r"<p>\2</p>" + "\n" + r"<p>\4､\5",
			
 
				+                          html2txt)  # 【susp_img】
			
 
				+        html2txt = re.sub(r'(</p>|\n)((\s*<su[bp]>\s*)?<img src=.*? height="[\d.]+p[tx]"\s*/?>(\s*</su[bp]>)?\s*)'
			
 
				+                          r'(([1-9]|[1-9][0-9])\s*[、.．､])', r"</p>\2</p>" + "\n" + r"\5", html2txt)
			
 
				+        html2txt = re.sub(r"(<p>((?!<p>).)+?(\s|[/\"]>))(([1-9]|[1-9][0-9])\s*[、.．､].{,20}本[大小]?题\d+分)",
			
 
				+                          r"\1</p>" + "\n<p>" + r"\4", html2txt)
			
 
				+        # 多张图片和题号相连情况
			
 
				+        html2txt = re.sub(r"</?p>((\s*<su[bp]>\s*)?<img src=.*?/>(\s*</su[bp]>)?"
			
 
				+                          r"((\s*<su[bp]>\s*)?<img src=((?!/>).)+?/>(\s*</su[bp]>)?)*?\s*)\s*(([1-9]|[1-9][0-9])\s*[、.．､])",
			
 
				+                          r"</p>\1</p>" + "\n<p>" + r"\8", html2txt, flags=re.S)
			
 
				+        html2txt = re.sub(r'(<p>\s*[一二三四五六七八九十].*?题\s*\(.+?分.*?\))\s*(([1-9]|[1-9][0-9])\s*[、.．､].*?)</p>',
			
 
				+                          r"\1</p>\n<p>\2</p>", html2txt)
			
 
				+        html2txt = re.sub(r'(<p>\s*[一二三四五六七八九十].*?题\s*\(.+?分.*?\))\s*(([1-9]|[1-9][0-9])\s*[、.．､].*?)</p>',
			
 
				+                          r"\1</p>\n<p>\2</p>", html2txt)
			
 
				+        html2txt = re.sub(r'(<p>.*?[.．]{6,}\s*\d+分)\s*(([1-9]|[1-9][0-9])\s*[、.．､].*?)</p>', r"\1</p>\n<p>\2</p>",
			
 
				+                          html2txt)
			
 
				+        html2txt = re.sub(r'([1-9]|[1-9][0-9])\s*([（(]\s*\d{1,2}[.\s\d]*?分\s*[)）])\s*[、.．､]', r"\1" + "､" + r"\2",
			
 
				+                          html2txt)
			
 
				+
			
 
				+        # 3>>建立图片id字典,对原图片信息第一次替换
			
 
				+        html2txt = re.sub(r'( src=".*?files)\\image', r"\1/image", html2txt)
			
 
				+        # all_image = re.findall(r'<img src=".*?image[\da-z]+\..*?[/\"]>', html2txt)
			
 
				+        # src2subs = {}
			
 
				+        # subs2src = {}
			
 
				+        # for src in all_image:
			
 
				+        #     # 校本题库上传的图片名称是随机数,故设置映射
			
 
				+        #     # kk = re.search('(<img src=".*?image\d+\.(png|gif|jpg|jpeg))', src)
			
 
				+        #     # new_src = src.replace(kk.group(1), self.img_url[kk.group(1)]) if type(self.img_url) == dict and kk else src
			
 
				+        #     # 图片信息简化替换
			
 
				+        #     print(src)
			
 
				+        #     new_src = re.sub(r'( data-latex)="\s*\\\[(.*?)\\\]\s*"', r'\1="$\2$"', src)
			
 
				+        #     new_src = re.sub(r'( data-latex="\$[^"]+?\$")',
			
 
				+        #                      lambda x: x.group(1).replace("<", " \lt ").replace("  ", " "), new_src)
			
 
				+        #     latex_info = re.search(r'<img src=".*?/(new_)?image([\da-z]+)\..*?(data-latex=".*?")', src)
			
 
				+        #     mathpix = " " + latex_info.group(3).replace("\n", "").strip().replace("  ", " ") if latex_info else ""
			
 
				+        #     if mathpix and len(mathpix) > 20:
			
 
				+        #         mathpix = ""
			
 
				+        #     w_h_info = re.search(r'<img src=".*?/(new_)?image([\da-z]+)\..*?width="([\d.]+)[pxt]*?"\s*height="([\d.]+)[pxt]*?"', src)
			
 
				+        #     w_h = " w_h=" + w_h_info.group(3).split('.')[0] + "*" + w_h_info.group(4).split('.')[0] \
			
 
				+        #         if w_h_info and not mathpix else ""  # w_h 和 mathpix只存在一个
			
 
				+        #     # image_id = re.search(r'<img src=".*?/(new_)?image([\da-z]+)\.', src).group(2)
			
 
				+        #     image_info = re.search(r'<img src=".*?/([^/]+?)/(new_)?image([\da-z]+)\.', src)  # 2023.12.1
			
 
				+
			
 
				+        #     print(image_info.groups())
			
 
				+        #     image_id = image_info.group(1) + image_info.group(3)
			
 
				+        #     if len(image_id) > 10:
			
 
				+        #         image_id = image_id[-10:]
			
 
				+        #     src2subs[src] = '<imgsrc' + image_id + w_h + mathpix + "/>"
			
 
				+        #     subs2src['<imgsrc' + image_id + w_h + mathpix + "/>"] = new_src
			
 
				+        # for k, v in src2subs.items():
			
 
				+        #     html2txt = html2txt.replace(k, v)
			
 
				+        # ------------------------------------------------------------------------
			
 
				+
			
 
				+        # ========html 转 list=========
			
 
				+        html2txt = re.sub(r'(</?div>|</table>|</?body>)(\n\s*)*?<p>', r"\1</p>" + "\n<p>", html2txt, flags=re.S)
			
 
				+        # >>>>>> <table>先替换后再切割
			
 
				+        # 不能简单按 \n 切割，表格里面也可能有换行，应该先替换后再切割
			
 
				+        subs2table = {}
			
 
				+        all_table = re.findall(r'<table>.*?</table>', html2txt, flags=re.S)
			
 
				+        for k, v in enumerate(all_table):
			
 
				+            html2txt = html2txt.replace(v, "<t{}b>".format(k))
			
 
				+            # 将表格中的换行去掉
			
 
				+            v = re.sub(r'<p>\s*(</?t[drh]( .*?")?>|</?table>|</?tbody>)\s*</p>', r"\1", v)
			
 
				+            v = re.sub(r'</td></p>[\n\s]*<p><td>', "</td><td>", v)
			
 
				+            v = re.sub(r'<td>(<p>|\s|</p>|\n)*</td>', "<td> </td>", v)
			
 
				+            v = re.sub(r'</tbody></?p></table>', "</tbody></table>", v)
			
 
				+            v = re.sub(r'(</?t[drh]( .*?")?>|</?table>|</?tbody>)(\s*<p>\s*</p>)[\s\n]*?(<br\s*/?>|\n)+', r"\1", v,
			
 
				+                       flags=re.S)
			
 
				+            v = re.sub(r'(</?t[drh]( .*?")?>|</?table>|</?tbody>)(<br\s*/?>|\n|</p>|\s)+', r"\1", v, flags=re.S)
			
 
				+            v = re.sub(r'(</t[drh]( .*?")?>|</table>|</tbody>)(<br\s*/?>|\n|<p>|\s)+', r"\1", v, flags=re.S)
			
 
				+
			
 
				+            # 暂时还有table标签首尾的换行没去掉
			
 
				+            subs2table["<t{}b>".format(str(k))] = v
			
 
				+
			
 
				+        # <造成的css标签冲突处理  2021-10-13
			
 
				+        def sub2(ss):
			
 
				+            # Replacement callback for re.sub: decide whether a "<" opens a known tag.
			
 
				+            # ss: match object; group(1) is the text captured right after the "<"
			
 
				+            #     (both return values put "<" or "&lt;" back in front of it).
			
 
				+            # Returns "<" + group(1) when group(1) starts like a recognised tag,
			
 
				+            # otherwise "&lt;" + group(1) so the stray "<" is HTML-escaped.
			
 
				+            # Recognised prefixes: img, h1-h6, sub/sup, the t<N>b> table placeholders,
			
 
				+            # br, the listed structural tags (bare or with a style attribute), and any
			
 
				+            # lowercase tag name followed by ' style=..."'.
			
 
				+            if re.search(r'^(img|/?h[123456]|/?su[bp]>|t\d+b>|br\s*/?>'
			
 
				+                         r'|/?(p|span|font|article|ul|ol|div|table|t?body|html|head|t[drh])(\s*|\s+style=.*?")>'
			
 
				+                         r'|/?[a-z]+ style=.*?">)', ss.group(1)) is None:
			
 
				+                # Not a recognised tag: escape the "<" so it is treated as text.
			
 
				+                return "&lt;{}".format(ss.group(1))
			
 
				+            else:
			
 
				+                # Recognised tag: put the "<" back unchanged.
			
 
				+                return "<{}".format(ss.group(1))
			
 
				+
			
 
				+        html2txt = re.sub("<([^<]{1,30})", sub2, html2txt)
			
 
				+        if subs2table:
			
 
				+            html2txt = re.sub(r"|".join(subs2table.keys()), lambda x: subs2table[x.group()], html2txt)
			
 
				+        # print(html2txt)
			
 
				+        # >>>>>> html 切割
			
 
				+        con_list = sum([re.split('<p>|<h[12345]>', i) if len(re.findall("<p>|<h[12345]>", i)) > 1 else [i] for i in
			
 
				+                        re.split(r"\n+|</p>(?!</td>)|</h[12345]>", html2txt)], [])  # html2txt)[:-1]
			
 
				+        con_list = [re.sub(r"^\n*\s*(<p>|<h[12345]>)+", "", ii) for ii in con_list]
			
 
				+        # 剩余个别标签处理
			
 
				+        con_list = [re.sub(r"^<([a-z]+)>[\s\t\n]*</\1>$", "", i.strip()) for i in con_list]  # 2020/4/7,14
			
 
				+        con_list = [re.sub(r"^(<table>|</td>|<td[^<>]*?>|</?tr>)+?(.|\n)+?([一二三四五六七八九十])\s*[、.．､]\s*(.{2,4}题)(.|\n)+?</table>",
			
 
				+                           r"\3､\4", i.strip())
			
 
				+                    for i in con_list]
			
 
				+        # 把最后可能还存在的</?p>或考号信息去掉
			
 
				+        con_list = [re.sub("</?p>|[…O•.\s]*?密[…O•.\s]*?封[….O•\s]*?装?[…O•.\s]*?订?[….O•\s]*?线?[….O•\s]*?$"
			
 
				+                           "|((学校|班级|姓名|座位号|准考号|[学考]号)[\s：:_]*?){2,}$", "", i.strip()) for i in con_list]
			
 
				+        # =====答案行格式处理====
			
 
				+        temp_list = [re.split(r"^((\s*<imgsrcw_h=[^/\"]*?(data-latex=.*?)?\s*[/\"]>\s*)+)", v.strip(), maxsplit=1)[1::3]
			
 
				+                     if re.match(r'(\s*<imgsrcw_h=[^/\"]*?(data-latex=.*?)?\s*[/\"]>\s*)+?(参考|考试|试[题卷]|物理|理综|数学|化学|生物)(答案|解析|答案[及与和]评分(标准|意见|细则))\s*$'
			
 
				+                                 r'|(\s*<imgsrcw_h=[^/\"]*?(data-latex=.*?)?\s*[/\"]>\s*)+?评分标准'
			
 
				+                                 r'|(\s*<imgsrcw_h=[^/\"]*?(data-latex=.*?)?\s*[/\"]>\s*)+?(参考|考试|试[题卷])(答案|解析|答案[及与和]评分(标准|意见|细则))\s*(物理|理综|数学|化学|生物)?\s*$',
			
 
				+                                 re.sub(r"[上下]?学[年期]|[\d—【】.．、､：:(（）)年\s]|[中大]学|模拟|[中高]考|年级|[学期][末中]|[高初][一二三]", "",
			
 
				+                                        v.strip())) else [v] for v in con_list]
			
 
				+        con_list = sum(temp_list, [])
			
 
				+        # =====对可能的题号的处理====  如2、3、4、5、      加了【fei】  # 重新修改！！！！！！！！！！
			
 
				+        con_list = [re.sub(r"^\s*([1-9][0-9]?\s*[.．、､])", r"【fei】\1", i.strip())
			
 
				+                    if (len(re.findall(r"(^|\s*[.．、､])\s*[1-9][0-9]?\s*[.．、､]", i)) >= 3
			
 
				+                        and len(re.sub(r"[\d.．、､\s]", "", i)) < 2) else i for i in con_list]
			
 
				+
			
 
				+        # =====头尾清除没用的信息=====
			
 
				+        if con_list and re.search(r"[\u4e00-\u9fa5]|<img ", con_list[0]) is None:
			
 
				+            con_list = con_list[1:]
			
 
				+        while con_list and re.search(r"声明[：:].*?著作权属.*?所有|(邮箱|用户|日期|QQ)\s*[：:].+?", con_list[-1]):
			
 
				+            con_list = con_list[:-1]
			
 
				+        return html2txt, con_list, self.new_html  # subs2table
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    # Manual smoke run: clean one sample HTML file and print the resulting rows.
			
 
				+    # ------------- Generate requirements.txt ---------------
			
 
				+    # pip freeze > requirements.txt
			
 
				+    # import os, sys
			
 
				+    #
			
 
				+    # project_root = os.path.dirname(os.path.realpath(__file__))  # locate the current directory
			
 
				+    # print(project_root)
			
 
				+    #
			
 
				+    # # Locate the interpreter / virtual-environment directory
			
 
				+    # python_root = sys.exec_prefix
			
 
				+    # print(python_root)
			
 
				+    #
			
 
				+    # # Build the command that generates requirements.txt
			
 
				+    # command = python_root + '\Scripts\pip freeze > ' + project_root + '\\requirements.txt'
			
 
				+    # print(command)
			
 
				+    #
			
 
				+    # # Run the command.
			
 
				+    # os.system(command)
			
 
				+
			
 
				+    # ---------------- One-step install from requirements.txt ------------
			
 
				+    # pip install -r requirements.txt
			
 
				+    # python_root + '\Scripts\' + pip install -r requirements.txt
			
 
				+
			
 
				+    # import os
			
 
				+    # rrr=os.path.basename(r"http:/pstatic.dev.xueping.com/data/word/2020/08/12/5f338d18e2cce.docx")
			
 
				+    # print(rrr)
			
 
				+    # item = "<a 我没发你的接口 $2366<a$   <a 我没发你的接口 $2366<a$  <img 我没发你的接口 $2366<a$ <a 我没发你的接口 $2366<a$   <a 我没发你的接口 $2366<a$"
			
 
				+    # item = r"2.下列选项中,使不等式\( x<\frac{1}{x}< x_{2} \)"
			
 
				+    # ww = css_conflict_deal(item)
			
 
				+    # print(ww)
			
 
				+    p1 = r"/home/cv/workspace/tujintao/document_segmentation/Data/samples/真实样例/6264fa25f84c0e279ac643ef.html"
			
 
				+    # Read through a context manager so the file handle is closed deterministically.
			
 
				+    with open(p1, 'r', encoding="utf8") as sample_file:
			
 
				+        t1 = sample_file.read()
			
 
				+    # Presumably the HtmlWash_2 argument order: html, wordid, is_reparse=0, img_url="", must_latex=0
			
 
				+    # NOTE(review): the method that ends just above this block returns three values
			
 
				+    # (html2txt, con_list, new_html). If that method is html_cleal, a two-name unpack
			
 
				+    # raises ValueError; the starred target keeps the last two items and also still
			
 
				+    # accepts a two-item return. Confirm against the class definition.
			
 
				+    *_, row_list, new_html = HtmlWash_2(t1, '11111111', is_reparse=1, must_latex=1).html_cleal()
			
 
				+    # Drop rows that are empty or whitespace-only before printing.
			
 
				+    row_list = [row for row in row_list if row.strip() != ""]
			
 
				+    pprint(row_list)
			
 
				+    print(len(row_list))
			
 
				+