浏览代码

add topicSegment by model

莺声燕语 5 月之前
父节点
当前提交
52bc8fe44d
共有 35 个文件被更改,包括 1280 次插入和 315 次删除
  1. 327 274
      .idea/workspace.xml
  2. 二进制
      __pycache__/configs.cpython-36.pyc
  3. 3 0
      ceshi.py
  4. 7 1
      configs.py
  5. 14 6
      server.py
  6. 二进制
      structure/__pycache__/ans_structure.cpython-36.pyc
  7. 二进制
      structure/__pycache__/danti_structure.cpython-36.pyc
  8. 二进制
      structure/__pycache__/dati2slave.cpython-36.pyc
  9. 二进制
      structure/__pycache__/final_structure.cpython-36.pyc
  10. 二进制
      structure/__pycache__/option.cpython-36.pyc
  11. 二进制
      structure/__pycache__/stems_structure.cpython-36.pyc
  12. 二进制
      structure/__pycache__/structure_main.cpython-36.pyc
  13. 二进制
      structure/__pycache__/three_parse_structure.cpython-36.pyc
  14. 37 10
      structure/ans_structure.py
  15. 8 9
      structure/danti_structure.py
  16. 138 5
      structure/structure_main.py
  17. 42 0
      structure/three_parse_structure.py
  18. 二进制
      utils/__pycache__/equation_extract.cpython-36.pyc
  19. 二进制
      utils/__pycache__/field_eq2latex.cpython-36.pyc
  20. 二进制
      utils/__pycache__/html_again_parse.cpython-36.pyc
  21. 二进制
      utils/__pycache__/image_convert.cpython-36.pyc
  22. 二进制
      utils/__pycache__/insert_keywords.cpython-36.pyc
  23. 二进制
      utils/__pycache__/item_type_line.cpython-36.pyc
  24. 二进制
      utils/__pycache__/pic_pos_judge.cpython-36.pyc
  25. 二进制
      utils/__pycache__/qcloud_bucket.cpython-36.pyc
  26. 二进制
      utils/__pycache__/ruku_opera.cpython-36.pyc
  27. 二进制
      utils/__pycache__/stem_ans_split.cpython-36.pyc
  28. 二进制
      utils/__pycache__/topic_no.cpython-36.pyc
  29. 二进制
      utils/__pycache__/washutil.cpython-36.pyc
  30. 二进制
      utils/__pycache__/xuanzuoti2slave.cpython-36.pyc
  31. 108 2
      utils/html_again_parse.py
  32. 3 1
      utils/ruku_opera.py
  33. 4 2
      utils/stem_ans_split.py
  34. 93 5
      utils/washutil.py
  35. 496 0
      utils/washutil_for_DL_way.py

文件差异内容过多而无法显示
+ 327 - 274
.idea/workspace.xml


二进制
__pycache__/configs.cpython-36.pyc


+ 3 - 0
ceshi.py

@@ -232,3 +232,6 @@ repeat_ip = "http://82.156.68.22:8888/repeat/subject"  # 全学科查重
 # print(repeat_r.content)
 import os
 print(os.environ.get('APPDATA'))
+# repeat_r = requests.post(url=configs.repeat_ip, json=[chachong_item_dict]).json()
+# print(repeat_r)
+

+ 7 - 1
configs.py

@@ -89,7 +89,9 @@ class TestingCfg:  # testing
     kps_phy_ip = "http://192.168.1.86:11088/phy_mark_and_connect"
     kps_Hmath_ip = "http://192.168.1.192:13356/auto_labels"
     repeat_ip = "http://192.168.1.192:8866/api/repeat/subject"
-    # repeat_ip = "http://82.156.68.22:8888/repeat/subject"  # 全学科查重
+    topic_segment_ip = "http://192.168.1.204:10622/math_phy_TopicSegment_predict"
+    phy_topicType_ip = "http://192.168.1.204:10611/phy_topicType_predict"
+    # repeaty_ip = "http://82.156.68.22:8888/repeat/subject"  # 全学科查重
     # repeat_ip = "http://82.156.68.22:8888/api/repeat/subject"  # 保存入库查重
     # callback_url_taskcheck = "http://zsytk3api.dev.xueping.com/v1/interior-api/record"
     callback_url_taskcheck = "http://zsytk3api.testing.xueping.com/v1/interior-api/record"
@@ -112,6 +114,8 @@ class ProductionCfg:  # production
     kps_phy_ip = "http:/49.232.72.198:11088/phy_mark_and_connect"
     kps_Hmath_ip = "http://172.16.2.5:13356/auto_labels"
     repeat_ip = "http://10.19.1.18:8866/api/repeat/subject"
+    topic_segment_ip = "http://10.19.1.14:10622/math_phy_TopicSegment_predict"
+    phy_topicType_ip = "http://10.19.1.6:10611/phy_topicType_predict"
     callback_url_taskcheck = "http://api.tk.zhixinhuixue.com/v1/interior-api/record"
 
 
@@ -162,6 +166,8 @@ kps_phy_ip = config_class.kps_phy_ip
 kps_Hmath_ip = config_class.kps_Hmath_ip
 callback_url_taskcheck = config_class.callback_url_taskcheck
 repeat_ip = config_class.repeat_ip
+topic_segment_ip = config_class.topic_segment_ip
+phy_topicType_ip = config_class.phy_topicType_ip
 
 # 注意:
 # 单题解析中,线上css_conflict_deal与线下不一样

+ 14 - 6
server.py

@@ -38,13 +38,17 @@ def word_structure():
     mydata = request.json.get("sci_html_data", "")
     is_reparse = request.json.get("is_reparse", "0")
     word_id = request.json.get("paper_id", 0)
-    must_latex = request.form.get("must_latex", 1)
+    source = request.json.get("source", "zxhx")
+    subject = request.json.get("subject", "")
+    must_latex = request.json.get("must_latex", 1)  # 非必传
     print("【再解析】==request.POST.dict==>is_reparse:{}, word_id:{}".format(is_reparse, word_id))
     # print(mydata)
     loginfo = {"log_level": "info",
                "request_ip": request.remote_addr,
                "receive_data": {"paper_id": word_id,
-                                "is_reparse": is_reparse},
+                                "is_reparse": is_reparse,
+                                "source": source,
+                                "subject": subject},
                "task_name": "批量文本结构化解析"}
 
     # 接收的文件记录一下,按wordid命名
@@ -68,7 +72,7 @@ def word_structure():
     st1 = time.time()
     try:
         if int(is_reparse) and word_id:  # 再解析
-            res, paper_type = WordParseStructure(mydata, str(word_id), int(is_reparse), must_latex).structure()
+            res, paper_type = WordParseStructure(mydata, str(word_id), int(is_reparse), must_latex, source, subject)()
             # print(res)
             if "errcode" not in res:
                 result["data"] = res
@@ -76,7 +80,7 @@ def word_structure():
                 result = res
             print("【再解析】==解析结束==> word_id:{}".format(word_id))
         elif not int(is_reparse) and mydata:  # 不是再解析
-            res, paper_type = WordParseStructure(mydata, "").structure()
+            res, paper_type = WordParseStructure(mydata, "", source=source, subject=subject)()
             # print(res)
             if "errcode" not in res:
                 result["data"] = res
@@ -129,13 +133,17 @@ def danti_structure():
     word_id = request.json.get("paper_id", 0)
     one_item = request.json.get("single_item_data", "")
     item_type = request.json.get("item_type", "")
+    source = request.json.get("source", "zxhx")
+    subject = request.json.get("subject", "")
     print("【单题解析】==request.POST.dict==>word_id:{}, item_type:{}".format(word_id, item_type))
     # logger.info("【单题解析】==request.POST.single_item_data==>\n{}\n".format(one_item))
     print(word_id, item_type)
     loginfo = {"log_level": "info",
                "request_ip": request.remote_addr,
                "receive_data": {"paper_id": word_id,
-                                "item_type": item_type},
+                                "item_type": item_type,
+                                "source": source,
+                                "subject": subject},
                "task_name": "单题解析"}
 
     if not word_id:
@@ -146,7 +154,7 @@ def danti_structure():
 
     res = {"errcode": 0, "errmsgs":"", "data": {}}
     if item_type:
-        one_res = single_parse(one_item, item_type, word_id)
+        one_res = single_parse(one_item, item_type, word_id, source, subject)
         # pprint(one_res)
         if type(one_res) == str:
             res["errcode"] = 1

二进制
structure/__pycache__/ans_structure.cpython-36.pyc


二进制
structure/__pycache__/danti_structure.cpython-36.pyc


二进制
structure/__pycache__/dati2slave.cpython-36.pyc


二进制
structure/__pycache__/final_structure.cpython-36.pyc


二进制
structure/__pycache__/option.cpython-36.pyc


二进制
structure/__pycache__/stems_structure.cpython-36.pyc


二进制
structure/__pycache__/structure_main.cpython-36.pyc


二进制
structure/__pycache__/three_parse_structure.cpython-36.pyc


+ 37 - 10
structure/ans_structure.py

@@ -314,6 +314,7 @@ def only_parse_split(one_item_ans, item_type, res_con, reparse_n=1):
     :reparse_n == 1:表示再解析
     :return:{'key': ,"parse": }
     """
+    one_item_ans = re.sub(r"(\[.*?\])?\(.*?\d+分\)", "", one_item_ans[:20]) + one_item_ans[20:]
     one_item_ans = re.sub("\n\s*(化学|物理|生物|和|与)+\s*【答案】\s*$", '', one_item_ans)
     dd = {'parse': one_item_ans, 'key': ""}
     if "选修" in one_item_ans.replace(" ", "")[:10] or \
@@ -345,13 +346,27 @@ def only_parse_split(one_item_ans, item_type, res_con, reparse_n=1):
                            re.split(r"(解)\s*[::]", one_item_ans, maxsplit=1)))
         if "【答案】" in temp_ans:
             dd["key"] = dd1["key"]
+            if not dd["key"] and dd1["parse"].strip():
+                dd["key"] = "见解析"
             if len(dd1) >= 3:
                 dd["parse"] = "【" + dd1["parse_title"] + "】" + dd1["parse"]
                 del dd1["parse_title"]
             return dd
         if len(dd1) >= 3:
-            dd["key"] = dd1["key"]
-            dd["parse"] = "【" + dd1["parse_title"] + "】" + dd1["parse"]
+            dd["key"] = dd1["key"].strip()
+            rest_parse = ""
+            # 细节处理2024.5.7
+            if re.search("^<img .+?/>$", dd["key"]):
+                dd["key"] = "见解析"
+                rest_parse = dd1["key"].strip()
+            if dd1["parse_title"] == "解":
+                dd["parse"] = "解:" + dd1["parse"]
+            else:
+                dd["parse"] = "【" + dd1["parse_title"] + "】" + dd1["parse"]
+            if rest_parse:
+                dd["parse"] = rest_parse + "\n" + dd["parse"]
+            if not dd["key"] and (dd1["parse"].strip() or rest_parse):
+                dd["key"] = "见解析"
             del dd1["parse_title"]
             return dd
         sim_parse = re.split("【点评】|【点睛】", dd["parse"])[0].strip()
@@ -684,8 +699,8 @@ def ans_structure_step2(anss, item_type_classify, item_res, *group):
             print("ans_no:::",ans_no)   # ans_no只记录表格答案和排列型答案
             print("::::", ans_no0, ans_no_idx0)
             pre_split_ansinfo_list = all_item_ans, ans_no, ans_no0, ans_no_idx0, anss_str, is_from_0, ans_item_no_type
-            all_item_ans, ans_no = ans_select(item_res, item_type_num, rest_item_split, pre_split_ansinfo_list)
-            item_res = get_ans_match(item_res, all_item_ans, ans_no, group)
+            all_item_ans, ans_no, repet_ans= ans_select(item_res, item_type_num, rest_item_split, pre_split_ansinfo_list)
+            item_res = get_ans_match(item_res, all_item_ans, ans_no, repet_ans, group)
 
     return item_res
 
@@ -798,7 +813,17 @@ def ans_select(item_res, item_type_num, rest_item_split, pre_split_ansinfo_list)
     """
     all_item_ans, ans_no, ans_no0, ans_no_idx0, anss_str, is_from_0, ans_item_no_type = pre_split_ansinfo_list
     ans_no1 = ans_no.copy()
-    ans_no1.extend(ans_no0)
+    # 开头的答案是选择题时,存在后面答案仍然存在选择题的详解
+    repet_ans = {}
+    if len(ans_no0) == len(item_res):
+        ans_no1 = ans_no0.copy()
+        repet_ans = dict(zip(ans_no, all_item_ans))
+        all_item_ans = []
+    else:
+        # ans_no0 = [i for i in ans_no0 if i not in ans_no1]
+        # del_no = [i for i,v in enumerate(ans_no0) if v not in ans_no1]
+        # rest_item_split = [v for i,v in enumerate(rest_item_split) if i not in del_no]
+        ans_no1.extend(ans_no0)
     # 先按换行格式获取答案(没有一行多个答案的情况)
     print("ans_no1:",ans_no1)
     print("item_type_num:",item_type_num)
@@ -911,10 +936,10 @@ def ans_select(item_res, item_type_num, rest_item_split, pre_split_ansinfo_list)
         all_item_ans.extend(rest_item_split)
         ans_no = ans_no1
 
-    return all_item_ans, ans_no
+    return all_item_ans, ans_no, repet_ans
 
 
-def get_ans_match(item_res, all_ans, ans_no, *group):
+def get_ans_match(item_res, all_ans, ans_no, repet_ans=None, *group):
     """
     根据切分后的答案及其题号,与前面试题进行匹配更新,all_ans和ans_no的个数应该是相同的
     :param item_res:
@@ -938,6 +963,8 @@ def get_ans_match(item_res, all_ans, ans_no, *group):
                 if k<= len(item_res)-1:
                     simp_res = only_parse_split(one_ans, item_res[temp_id]["type"], res_con)
                     item_res[temp_id].update(simp_res)
+                    if repet_ans and item_res[temp_id]["item_id"] in repet_ans:
+                        item_res[temp_id]["key"] = repet_ans[item_res[temp_id]["item_id"]]
                 else:
                     item_res[temp_id].update({'key': "", 'parse': ""})
             else:
@@ -948,6 +975,9 @@ def get_ans_match(item_res, all_ans, ans_no, *group):
                     item_res[temp_id]['parse'] = one_ans
                     if not item_res[temp_id]['key']:
                         item_res[temp_id]['key'] = '见解析'
+                if group[0] == 'model_split':
+                    simp_res = only_parse_split(one_ans, item_res[temp_id]["type"], res_con)
+                    item_res[temp_id].update(simp_res)
     return item_res
 
 
@@ -970,9 +1000,6 @@ def anss_split_contain_slave(subject, ans_str):
             temp_ans_no1, ans_no_idx1 = get_right_no((ans_no_idx1, temp_ans_no1), 1)  # 筛选
 
 
-
-
-
 if __name__ == '__main__':
 #     one_item_ans = """
 #     (12分)

+ 8 - 9
structure/danti_structure.py

@@ -9,12 +9,12 @@ import re
 from structure.option import option_structure
 from utils.equation_extract import get_simpstr2eqn, get_equation_instr
 from utils.html_again_parse import css_label_wash
-from utils.washutil import base642img, css_conflict_deal, convert_huanhang
+from utils.washutil import base642img, css_conflict_deal, convert_huanhang, wash_after
 from utils.field_eq2latex import latex_wash
 from structure.dati2slave import get_slave
 
 
-def single_parse(one_item, item_type, wordid):
+def single_parse(one_item, item_type, wordid, source="zxhx", subject="数学"):
     """
     rtype:题型
     :return:
@@ -51,7 +51,7 @@ def single_parse(one_item, item_type, wordid):
         # elif len(res_list) < 5:
         #     return "编辑后的文本掉了【答案】或【解析】字段,请添加,每个字段保留唯一"
 
-    new_item_struct = {}
+    new_item_struct = {"type": item_type}
     new_item_struct["key"] = ""
     new_item_struct["parse"] = ""
     new_item_struct["stem"] = res_list[0]
@@ -71,7 +71,7 @@ def single_parse(one_item, item_type, wordid):
             new_item_struct["parse"] = res_list[2]
 
     item_ids = re.findall("^([1-9][0-9]|[1-9])\s*[..、、]", new_item_struct["stem"].strip())
-    new_item_struct["topic_num"] = int(item_ids[0]) if item_ids else 0
+    new_item_struct["item_id"] = int(item_ids[0]) if item_ids else 0
 
     new_item_struct["stem"] = re.sub("^([1-9][0-9]|[1-9])\s*[..、、]", "", new_item_struct["stem"].strip())
     if len(new_item_struct["stem"].strip()) < 3:
@@ -87,13 +87,12 @@ def single_parse(one_item, item_type, wordid):
                                                  if not i.replace(":", "").strip()]):  # 空选项中:被当成了内容
             return "存在选项为空,请补充完整"
         new_item_struct["answer_type"] = "选择题"
-    else:
-        new_item_struct["type"] = item_type
+    elif source in ["xue_guan", "teacher"] and subject not in ["数学", "物理"]:  # 拆小题
         new_item_struct = get_slave(new_item_struct, new_item_struct["stem"], new_item_struct["parse"], new_item_struct["key"])
-        if "item_id" in new_item_struct:
-            del new_item_struct["item_id"]
+
+    new_item_struct = wash_after([new_item_struct], subject)[0]
     # 换行符替换
-    convert_huanhang(new_item_struct)
+    # convert_huanhang(new_item_struct)
     # new_item_struct["stem"] = new_item_struct["stem"].strip().replace("\n\n", "\n").replace("\n", "<br/>")  # 2020/4/10 gai
     # new_item_struct["key"] = new_item_struct["key"].strip().replace("\n\n", "\n").replace("\n", "<br/>")
     # new_item_struct["parse"] = new_item_struct["parse"].strip().replace("\n\n", "\n").replace("\n", "<br/>")

+ 138 - 5
structure/structure_main.py

@@ -3,16 +3,21 @@
 
 
 from pprint import pprint
+from typing import Any
 # from utils.exam_type import get_exam_type
 # from utils.get_data import Mongo
 from structure.final_structure import one_item_structure
 from utils.stem_ans_split import get_split_pos
 from utils.washutil import *
+from utils.washutil_for_DL_way import HtmlWash_2
 from structure.three_parse_structure import *
 from utils.pic_pos_judge import img_regroup
 from func_timeout import func_set_timeout
+import requests
+from structure.ans_structure import get_ans_match
 
 from utils.xuanzuoti2slave import toslave_bef, toslave_aft
+logger = configs.myLog(__name__, log_cate="reparse_ruku_log").getlog()
 
 paper_types = ["第三种试卷格式:题目与答案分开",
                "第二种试卷格式: 不同时含有或都不含有{答案}和{解析}关键字",
@@ -23,11 +28,135 @@ class WordParseStructure:
         基于wordbin出来的html结果进一步做 试卷类型 非模板结构化
     """
 
-    def __init__(self, html, wordid, is_reparse=0, must_latex=0):
+    def __init__(self, html, wordid, is_reparse=0, must_latex=0, source="zxhx", subject="数学"):
         self.html = html
         self.is_reparse = is_reparse
         self.wordid = wordid
         self.must_latex = must_latex
+        self.source = source
+        self.subject = subject
+
+    def __call__(self):
+        if self.source not in ["school"]: # == "school" "xue_guan", "teacher":
+            res = self.structure_combine_DL()
+            if not res[0]:
+                return self.structure()
+            logger.info("----【paper_id:{}】采用切题服务".format(self.wordid))
+            return res
+        else:
+            return self.structure()
+
+    
+    def structure_combine_DL(self):
+        # 第一步:清洗
+        htmltext, row_list, new_html = HtmlWash_2(self.html, self.wordid, self.is_reparse,
+                                                     must_latex=self.must_latex).html_cleal()
+        if not row_list:
+            return {"errcode": 1, "errmsgs": "题文没有有效信息", "data": {}}, ""
+        # 第二步:寻找题目和答案的切分点,一定要有“答案”关键字
+        split_res = get_split_pos(row_list)
+        if type(split_res) == str:
+            return {"errcode": 1, "errmsgs": split_res, "data": {}}, paper_types[0]
+        row_list, items_list, ans_list, _ = split_res
+        rd1_may_fail = 0
+        paper_type = ""
+        item_res = {}
+        if "【答案】" in "".join(items_list) or "【解析】" in "".join(items_list):
+            rd1_may_fail = 1
+        elif items_list:
+            paper_type = "第三种试卷格式:题目与答案分开"
+            try:
+                r1 = requests.post(url=configs.topic_segment_ip,
+                                  json={"content": "<br>".join(items_list), "subject": self.subject,
+                                        "paper_id": self.wordid, "text_type": "stem_block"})
+                item_res = r1.json()["res"]
+                # print(item_res)
+                r2 = requests.post(url=configs.topic_segment_ip,
+                                   json={"content": "<br>".join(ans_list), "subject": self.subject,
+                                         "paper_id": self.wordid, "text_type": "answer_block"})
+                all_ans, ans_no = r2.json()["res"]
+                # print(1111111111111,all_ans)
+                print(ans_no)
+                # 根据ans_no纠正切错的all_ans,如[2, 6, 4, None, 7, None, 5, None, 1]
+                if abs(len([i for i in ans_no if i]) - len(item_res)) <= 2:
+                    last_idx = None
+                    new_ans_no = ans_no.copy()
+                    for i, no in enumerate(ans_no):
+                        if no is not None:
+                            last_idx = i
+                        if i > 0 and no is None and last_idx is not None:
+                            all_ans[last_idx] += "\n"+all_ans[i]
+                            all_ans[i] = ""
+                            new_ans_no[i] = "del"
+                    all_ans = [j for j in all_ans if j]
+                    ans_no = [i for i in new_ans_no if i != 'del']
+
+                if abs(len(ans_no) - len(item_res)) > 2:
+                    item_res = ans_block_split(ans_list, item_res)
+                else:
+                    item_res = get_ans_match(item_res, all_ans, ans_no, {}, 'model_split')
+            except Exception as e:
+                logger.info("----【paper_id:{}】切题服务异常:{}".format(self.wordid, e))
+        else:
+            rd1_may_fail = 1
+        
+        if rd1_may_fail:
+            try:
+                r3 = requests.post(url=configs.topic_segment_ip,
+                                         json={"content": htmltext, "subject": self.subject,
+                                               "paper_id": self.wordid, "text_type": "stem_block"})
+                item_res = r3.json()["res"]
+                # 还需判断下教师卷
+                for k, one_res in enumerate(item_res):
+                    if re.search('\n【(答案|[解分][析答]|详解|点[评睛]|考点|专题)】', one_res["stem"]):
+                        case = "case1"  # 默认有“答案”关键字
+                        if re.search(r'\n【答案】|[\n】]\s*答案\s*[::]', one_res["stem"]) is None:
+                            # 没“答案”关键字
+                            case = "case0"
+                        dd1 = stem_ans_split(one_res, case)  # 对切分后的每道题再细分
+                        one_res["stem"] = dd1["stem"]
+                        del dd1["stem"]
+                        one_res.update(dd1)
+                    else:  # 没有解析的情况
+                        one_res.update({"key": "", "parse": ""})
+            except Exception as e:
+                logger.info("----【paper_id:{}】切题服务异常:{}".format(self.wordid, e))
+        
+        # ==========小题结构化========
+        if item_res:
+            # 答案解析字段完善
+            for i, one_item in enumerate(item_res):
+                if 'key' not in one_item:
+                    item_res[i]['key'] = ""
+                if 'parse' not in one_item:
+                    item_res[i]['parse'] = ""
+            # 单题结构化
+            consumer = ['noslave'] * len(item_res)
+            items_no_type = [1] * len(item_res)
+            xyz = zip(item_res, consumer, items_no_type)
+            res = list(map(one_item_structure, xyz))  # 和多进程相比,这样速度也很快
+            # pprint(res)
+            # ==========最后的清洗=========
+            res = wash_after(res, self.subject)
+            # 针对模型可能切错的地方纠正,放在切割模型预测中纠正了
+            # for i, one_item in enumerate(res):
+            #     if i>0 and one_item['topic_num'] is None and res[i-1]['topic_num'] is not None and res[i+1]['topic_num'] is not None \
+            #         and res[i+1]['topic_num'] - res[i-1]['topic_num'] == 1 and not one_item['key'] and not one_item['parse']:
+            #         if res[i-1]["parse"]:
+            #             res[i - 1]["parse"] += one_item['stem']
+            #             del res[i]
+            #         elif res[i-1]["key"]:
+            #             res[i - 1]["key"] += one_item['stem']
+            #             del res[i]
+            # pprint(res)
+            # 结果返回
+            if self.is_reparse:
+                return {"html":new_html, "items": res}, paper_type
+            else:
+                return {"items": res}, paper_type
+        else:
+            return {}, paper_type
+
 
     def img_repl(self, one_dict):
         """
@@ -80,7 +209,7 @@ class WordParseStructure:
                     if len(reform_res)==2:
                         item_res = reform_res
                     else:
-                        item_res, item_no_type, rd2_is_fail= reform_res
+                        item_res, item_no_type, rd2_is_fail = reform_res
 
         if not items_list or rd1_may_fail or (is_may_ans and rd2_is_fail):
             ans_n = re.findall("【答案】", "\n".join(row_list))
@@ -202,7 +331,8 @@ if __name__ == '__main__':
 
         # print(load_dict)
 
-    path2 = r"F:\zwj\Text_Structure\accept_files\64994dc4a3693ef35281fbc5.html"
+    # path2 = r"C:\Users\Python\Desktop\bug\5-9\663c90361ec1003b58557474.html"
+    path2 = r"F:\zwj\Text_Structure\accept_files\664597dd71453ba19c20977f.html"
     # path2 = r"C:\Users\Python\Desktop\bug\6419746d11a1cdad550f5502.html"
     # path2 = r"F:\zwj\Text_Structure\new_tiku_structure_v3_sci\data\620bbf7aa7d375f4518b98e1.html"
     # path2 = r"F:\zwj\new_word_text_extract_v2\data\地理\2\2020-2021学年广东省揭阳市揭西县五校九年级(下)第二次联考地理试卷-普通用卷.html"
@@ -216,13 +346,16 @@ if __name__ == '__main__':
     # """
 
     # print(html)
-    res1 = WordParseStructure(html, "", 1).structure()
+    # html = "\n1、已知集合M满足{1,2}≤M≤{1,2,5,6,7},则\n符合条件的集合M有__个."
+    res1 = WordParseStructure(html, "664597dd71453ba19c20977f",
+                              is_reparse=0, must_latex=0,
+                              source="ai", subject="物理")()
     # new_fpath = os.path.join(r"F:\zwj\Text_Structure\fail_files", "res1.html")
     # re_f = open(new_fpath, 'a+', encoding='utf-8')
     # for i in res1[0]["items"]:
     #     re_f.write(str(i))
     # pprint(res1)
-    # pprint(res1[0]['items'])
+    pprint(res1[0]['items'])
     print('题目数量:', len(res1[0]["items"]))
 
     # new_fpath = r"F:\zwj\Text_Structure\new_tiku_structure_2021\res_folder\10-28.json"

+ 42 - 0
structure/three_parse_structure.py

@@ -130,6 +130,48 @@ def items_ans_reform(items_list, ans_list):
     return item_res, item_no_type, rd2_is_fail
 
 
+def ans_block_split(ans_list, item_res):
+    anss1 = list(filter(lambda x: x.strip() != "", ans_list))
+    if re.match(".+?省.+?试[卷题]|.*?答题?[卷卡页]", anss1[0]):
+        anss1 = anss1[1:]
+
+    rd1_is_fail = 0
+    have_type_line = re.search(r"[一二三四五六七八九十]\s*[、..、]\s*[^必考基础综合中等((\[]{2,5}题", "\n".join(anss1))
+    if have_type_line:
+        # 这里的anss1的清洗不应该影响rd2_is_fail中的原始文本!!先不修改看看再说
+        anss1_cy = anss1.copy()  # 复制一份,保证不能影响后面
+        while re.search(r"<td><p>[A-F]</p></td>|</td><td>[A-F]</td><td>|([A-F]\s*){3,}", anss1_cy[0]) is None and \
+                (re.search(r"[\u4e00-\u9fa5]", anss1_cy[0]) is None
+                 or re.search(r"[一二三四五六七八九十]\s*[、..、]\s*(<imgsrc.*?/>)?\s*.{2,5}题", anss1_cy[0]) is None):
+            del anss1_cy[0]
+        # 答案中的题型
+        all_type2 = re.findall(r"\n\s*[一二三四五六七八九十]\s*[、..、::]\s*([^必考基础综合中共等::((\[]{2,5}题)|"
+                               r"\n\s*[、..、::]?\s*(单选题|非?选择题|不定选择题|多选题|填空题|计算题|[解简]答题|实验题|作图题|论述题|探究题)",
+                               "\n" + "\n".join(anss1_cy))
+        all_type2 = ["".join(a) for a in all_type2]
+        # '本大题' 后面处理
+        print("答案中的题型:", all_type2)
+        ans_str = "\n" + "\n".join(anss1_cy)
+        item_res, rd1_is_fail = anss_structure_with_type(item_res, ans_str, [], all_type2, [], {})
+    # 没有题型行或第一次解析失败
+    rd2_is_fail = 0
+    if not have_type_line or rd1_is_fail:  # 答案中没有题型行 或题型行名称不规范
+        print('没有题型行或题目和答案的题型个数不一致或第一次解析失败')
+        anss1 = list(
+            map(lambda x: re.sub(r"(\n|^)\s*[一二三四五六七八九十]\s*[、..、::]?\s*(<p>)?"
+                                 r"(\s*.{2,5}题.+?分\s*[.。]?\s*$|.*?[((].+?[得共]\d+分.*?[))].*?$"
+                                 r"|\s*.{2,5}题\s*([((].+?[))])?).*?$|(\n|^)\s*[^\d]{2,5}题(.+?分\s*[))])?\s*$", "", x),
+                anss1))
+        # print("anss1:", anss1)
+        raw_item_res = item_res
+        # try:
+        item_res = ans_structure_step1(anss1, {}, item_res)  # 答案整体结构化
+        if str(raw_item_res) != str(item_res):
+            rd2_is_fail = 1
+
+    return item_res
+
+
 def split_by_keywords(con_list):
     """
     第一种试卷格式:教师用卷,含答案和解析关键字

二进制
utils/__pycache__/equation_extract.cpython-36.pyc


二进制
utils/__pycache__/field_eq2latex.cpython-36.pyc


二进制
utils/__pycache__/html_again_parse.cpython-36.pyc


二进制
utils/__pycache__/image_convert.cpython-36.pyc


二进制
utils/__pycache__/insert_keywords.cpython-36.pyc


二进制
utils/__pycache__/item_type_line.cpython-36.pyc


二进制
utils/__pycache__/pic_pos_judge.cpython-36.pyc


二进制
utils/__pycache__/qcloud_bucket.cpython-36.pyc


二进制
utils/__pycache__/ruku_opera.cpython-36.pyc


二进制
utils/__pycache__/stem_ans_split.cpython-36.pyc


二进制
utils/__pycache__/topic_no.cpython-36.pyc


二进制
utils/__pycache__/washutil.cpython-36.pyc


二进制
utils/__pycache__/xuanzuoti2slave.cpython-36.pyc


+ 108 - 2
utils/html_again_parse.py

@@ -160,6 +160,8 @@ def css_label_wash(content):
             a.append(content.strip())
 
         new_a = "\n".join(list(map(lambda x: str(x).strip(), a)))
+        new_a = re.sub("(\n\s*)+", "\n", new_a)
+        # print("newa:::", new_a)
         if subs2img:
             new_a = re.sub("|".join(subs2img.keys()), lambda x: subs2img[x.group()], new_a)
         new_a = "<p>" + new_a.replace("\n\n", "\n").replace("\n", "</p>\n<p>") + "</p>"
@@ -187,13 +189,117 @@ if __name__ == '__main__':
     cons1 = '''
     9 . 中国古代的政治权力由“传贤”转变为“传子”,“家天下”制度开始形成于<table name=\"optionsTable\" style=\"width:100%;table-layout:fixed;\" cols=\"4\"><tr><td>A.夏朝</td><td>B.商朝</td><td>C.周朝</td><td>D.秦朝</td></tr></table>
     '''
-
+    cons2 = '''
+    <p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">1.下列对这首诗的赏析,不正确的一项是( &nbsp;&nbsp;)</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><strong><span style="mso-spacerun: 'yes'; font-family: 宋体; mso-ansi-font-weight: bold; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">小寒食舟中作</span></strong></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">杜甫</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">佳辰强饮食犹寒,隐几萧条戴鹖冠。春水船如天上坐,老年花似雾中看。</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">娟娟戏蝶过闲幔,片片轻鸥下急湍。云白山青万余里,愁看直北是长安。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">[注]这首诗写于大历五年春诗人淹留潭州时,即诗人去世前半年多。鹖(hé)冠:传为楚隐者鹖冠子所戴的帽子。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">A.首联中“强饮”一词是痛快豪饮的意思,表明诗人晚年要纵酒人生。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">B.颔联写诗人在船上所见所感,春来水涨,江流浩瀚,自己老眼昏花。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">C.颈联运用叠词,具有韵律美,写景由近及远,由蝴蝶而鸥鸟,层次分明。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">D.尾联总收全诗,诗人北望长安,思朝廷,忧愁顿生,有沉郁苍茫之美。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【答案】</span><span style="font-family: 宋体;">A</span></span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【解析】</span><span style="font-family: 宋体;">“强饮”理解有误。应是“勉强吃一点饭”的意思。故选A。</span></span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">2.下列对这首诗的赏析,不正确的一项是( &nbsp;&nbsp;)</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><strong><span style="mso-spacerun: 'yes'; font-family: 宋体; mso-ansi-font-weight: bold; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">送客归江州</span></strong></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">韩翃</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">东归复得采真</span><sup><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt; vertical-align: super;"><span style="font-family: 宋体;">①</span></span></sup><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">游,江水迎君日夜流。</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">客舍不离青雀舫,人家旧在白鸥洲</span><sup><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt; vertical-align: super;"><span style="font-family: 宋体;">②</span></span></sup><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">。</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">风吹山带遥知雨,露湿荷裳已报秋。</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">闻道泉明</span><span style="font-family: 宋体;">③居止近,篮舆相访为淹留。</span></span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【注】</span><span style="font-family: 宋体;">①采真:道教语,指顺乎天性,放任自然。②白鸥洲:指白鸥翔集的沙洲。此处借指客之家乡。③泉明:指晋陶渊明,此称其为泉明,乃避唐高祖李渊之讳。</span></span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">A.这首诗写诗人送客人归江州隐居,但并无送别时的伤感,更多的是一种美好的祝福。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">B.“江水迎君”采用拟人手法,客人归心似箭、归程片刻不能迟的心态跃然纸上。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">C.“青雀舫”“白鸥洲”写出了诗人对客人旅舟华美,家乡景色宜人的赞美与羡慕。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">D.尾联写诗人听说陶渊明居所离客人很近,定会借探访陶渊明居所之机去拜访客人。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【答案】</span><span style="font-family: 宋体;">D</span></span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【解析】</span><span style="font-family: 宋体;">D项,“定会借探访陶渊明居所之机去拜访客人”错误。尾联的意思是听说陶渊明居住的地方就在附近,你可以常常乘着竹轿,前往拜访。表达了诗人对客人隐逸情怀的赞美与羡慕。故选D。</span></span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">3.对下面这首词的赏析,不恰当的一项是( &nbsp;&nbsp;)</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><strong><span style="mso-spacerun: 'yes'; font-family: 宋体; mso-ansi-font-weight: bold; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">渔家傲</span></strong></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">范仲淹</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">塞下秋来风景异,衡阳雁去无留意。四面边声连角起,千嶂里,长烟落日孤城闭。</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">浊酒一杯家万里,燕然未勒归无计。羌管悠悠霜满地,人不寐,将军白发征夫泪。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">A.这首词写出了我国北方秋季的景物特点,从词中的“塞下”“霜”等词语可以看出。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">B.“衡阳雁去”是说“大雁向衡阳飞去”而不是“大雁从衡阳飞走了”。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">C.这首词既表达了将士的爱国之心,又流露出思念亲人和家乡的感情。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">D.这首词感情悲观而消极,表达了鲜明的反战、厌战情绪。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【答案】</span><span style="font-family: 宋体;">D</span></span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【解析】这首《渔家傲》为范仲淹创作,以描写北方秋季景物为背景,表达了作者对家国、亲人的思念以及将士们的英勇豪情。</span><span style="font-family: 宋体;">A项正确,词中的“塞下”“霜”等词语揭示了北方的秋季特点;B项正确,作者借衡阳雁南飞的景象暗示将士们向往家乡的渴望;C项正确,通过浓烈的爱国情感以及思念亲人的情绪表现,展现了作者的家国情怀和将士们的壮志豪情。不过,D项表述错误,词中并未明显表达反战、厌战情绪,其主要表达了将士们为国家和民族拼搏的精神。</span></span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">4.对下面这首唐诗,赏析不恰当的一项是( &nbsp;&nbsp;)</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><strong><span style="mso-spacerun: 'yes'; font-family: 宋体; mso-ansi-font-weight: bold; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">早梅</span></strong></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">万木冻欲折,孤根暖独回。前村深雪里,昨夜一枝开。</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">风递幽香出,禽窥素艳来。明年如应律,先发望春台。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">A.首联把梅花与万木进行对比,万木的干枯摧折既有力地衬托了梅花的迎风斗雪,又好地照应了诗题中的“早”。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">B.颔联用华丽的语言为读者描绘出了一幅浓艳、高贵的雪中梅花图。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">C.颈联写梅花的风韵和姿色,尾联寄寓诗人深深的情思。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">D.这首咏梅诗,语言清丽,笔墨含蓄,有着强烈的艺术感染力。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【答案】</span><span style="font-family: 宋体;">B</span></span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【解析】选项</span><span style="font-family: 宋体;">B不太恰当。颔联并没有用华丽的语言来描绘梅花,只是表达了梅花在寒雪中展现出的独立、高洁的风韵。这里并没有像选项B所说的“浓艳、高贵”。其余选项都能恰当地反映这首诗的特点和内容,因此答案选B。</span></span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">5.对下面这首宋诗理解与赏析,不恰当的一项是( &nbsp;&nbsp;)</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><strong><span style="mso-spacerun: 'yes'; font-family: 宋体; mso-ansi-font-weight: bold; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">村行</span></strong></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">王禹偁</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">马穿山径菊初黄,信马悠悠野兴长。</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">万壑有声含晚籁,数峰无语立斜阳。</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">棠梨叶落胭脂色,荞麦花开白雪香。</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">何事吟余忽惆怅?村桥原树似吾乡。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">A.首联照应题目,点明地点和时令,写出了诗人信马徐行、观赏山野景色的悠然兴致。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">B.第二联上下句构成对比,生动地表现出山中有时喧响有时静穆的景象。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">C.第三联以“胭脂”和“白雪”为喻,形象地描绘出山村绚丽多彩的秋景。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">D.最后两句设为问答,抒发了诗人由外界景物所触发的浓浓的思乡之情。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【答案】</span><span style="font-family: 宋体;">B</span></span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【解析】本题考查了对宋诗《村行》的理解与赏析。首先,</span><span style="font-family: 宋体;">A选项指出首联照应题目,点明了诗人信马行走在山间小路,看到菊花初黄,意境开阔。B选项提到第二联表现了山中有时喧响有时静穆的景象,但该联实际上并没有对比色彩,而是展示出千山万壑中奔涌着生机勃勃的晚响,无言的数峰沐浴在斜阳中。C选项正确地概括了第三联的内容,诗人通过赞美胭脂色的棠梨叶和白雪般芬芳的荞麦花存在世上,描绘出色彩斑斓的美景。D选项陈述了诗末以问答形式流露出的诗人对故乡情感的深刻思索。因此,答案为B选项,不恰当地解读了第二联。</span></span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">6.下列对这首诗的赏析,不正确的一项是( &nbsp;&nbsp;)</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><strong><span style="mso-spacerun: 'yes'; font-family: 宋体; mso-ansi-font-weight: bold; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">酬元九侍御赠璧竹鞭长句</span></strong><strong><sup><span style="mso-spacerun: 'yes'; font-family: 宋体; mso-ansi-font-weight: bold; font-size: 10.5000pt; mso-font-kerning: 1.0000pt; vertical-align: super;"><span style="font-family: 宋体;">①</span></span></sup></strong></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">刘禹锡</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">碧玉孤根生在林,美人相赠比双金。</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">初开郢客缄封后,想见巴山冰雪深。</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">多节本怀端直性,露青犹有岁寒心。</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">何时策马同归去,关树扶疏</span><sup><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt; vertical-align: super;"><span style="font-family: 宋体;">②</span></span></sup><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">敲镫吟。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">[注]①此诗写于“永贞革新”失败后,作者被贬为朗州(今湖南)司马之时。元九,即诗人元稹,当时被贬为江陵(今湖北荆州)府士曹参军。②关树:关中之树。扶疏:枝叶繁茂。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">A.首联运用比兴手法,以碧玉般竹鞭的名贵,暗示赠鞭者的高尚,赞扬之情跃然纸上</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">B.颔联写诗人看到朋友赠礼后内心非常欣喜,很想去观赏生长碧竹的巴山冰雪美景。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">C.颈联通过“节”字,将“竹节”与“节操”相关联,把咏鞭与赞人联系在了一起。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">D.尾联由竹鞭引发联想,表达了诗人愿与友人“策马同去”“敲镫吟诗”的美好愿望。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【答案】</span><span style="font-family: 宋体;">B</span></span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【解析】</span><span style="font-family: 宋体;">“很想去观赏生长碧竹的巴山冰雪美景”赏析有误。领联表达的意思是,我一打开郢客的缄封之后,立刻想到冰冻巴山雪深深。目睹竹鞭而展开联想,写出了制鞭之竹在“巴山冰雪”中傲然挺立的景象。这是对元稹不畏权势、宁折不弯的形象写照。是以竹喻人,表达对友人的赞美。译文:绿如碧玉的孤竹生在深林,用它制的璧竹鞭名贵万分;贤稳之人将竹鞭赠送给我,这份厚礼胜过了万两黄金。我一打开郢客的绒封之后,立刻想到冰冻巴山雪深深。鞭上多节,节节怀着端直性,遍体露青犹有岁寒后凋心。我们何时才能策马同归去,在扶疏的关树下敲镫高吟?</span></span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">7.下列对这首诗的赏析,不正确的一项是( &nbsp;&nbsp;)</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><strong><span style="mso-spacerun: 'yes'; font-family: 宋体; mso-ansi-font-weight: bold; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">见别离者因赠之</span></strong><strong><sup><span style="mso-spacerun: 'yes'; font-family: 宋体; mso-ansi-font-weight: bold; font-size: 10.5000pt; mso-font-kerning: 1.0000pt; vertical-align: super;"><span style="font-family: 宋体;">①</span></span></sup></strong></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">韩偓</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">征人草草尽戎装,征马萧萧立路傍。</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">尊酒阑珊将远别,秋山迤逦更斜阳。</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">白髭兄弟中年后,瘴海程途万里长。</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">曾向天涯怀此恨,见君呜咽更凄凉。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">[注]①诗人生活在唐末战乱之际,当时自北而南,沿路所见,皆发于诗。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">A.标题点明本诗写作的原由,“别离”一词陡生无限伤感情绪,奠定全诗情感基调。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">B.首联紧扣“征人”与“征马”两个形象,真切地描绘了出征时的情景,画面感很强</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">C.颔联描写的是别后想象的虚景,诗人想象征人在离别亲人后沿着秋山远行的景象。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">D.尾联直抒胸臆,眼前的别离勾起了诗人对自身的感叹,抒发了心中的无奈和感慨。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【答案】</span><span style="font-family: 宋体;">C</span></span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【解析】</span><span style="font-family: 宋体;">“颔联描写的是别后想象的虚景”说法错误,“尊酒阑珊将远别”是眼前实景。</span></span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">8.下列对这首诗的赏析,不正确的一项是( &nbsp;&nbsp;)</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><strong><span style="mso-spacerun: 'yes'; font-family: 宋体; mso-ansi-font-weight: bold; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">大热五首(其一)</span></strong></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">戴复古</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">天地一大窑,阳炭烹六月。</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">万物此陶镕,人何怨炎热。</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">君看百谷秋,亦自暑中结。</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">田水沸如汤,背汗湿如泼。</span></p>
+<p class="MsoNormal" style="text-align: center; vertical-align: middle; line-height: 150%;" align="center"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">农夫方夏耘,安坐吾敢食!</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">【注】陶镕:陶铸熔炼,比喻培育、造就。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">A.诗人把六月的天地比作一个大窑,太阳像炭火一样熔炼着其中的一切。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">B.暑天虽极炎热,诗人却认为不应抱怨,因为秋天的谷物均赖此而结实。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">C.田中的水被晒得似乎要沸腾,诗人的背上汗水流得就像刚刚用水泼过。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">D.这首诗描写暑热多用比喻和夸张修辞,语言平易浅近,风格质朴自然。</span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【答案】</span><span style="font-family: 宋体;">C</span></span></p>
+<p class="MsoNormal" style="vertical-align: middle; line-height: 150%;"><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;"><span style="font-family: 宋体;">【解析】</span><span style="font-family: 宋体;">“背汗湿如泼”描写的是</span></span><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">农夫在暑热中辛苦劳作的情景</span><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">,</span><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">而不是指</span><span style="mso-spacerun: 'yes'; font-family: 宋体; font-size: 10.5000pt; mso-font-kerning: 1.0000pt;">诗人自己。</span></p>
+    '''
     # pprint(cons)
     # print(again_parse(cons))
     # print(again_parse(cons))
     # print(list(map(lambda x: str(x).replace("     ", " "), again_parse(cons))))
     # con1 = r'<p>解:A.研究跨栏动作时,刘翔的大小和形状不能忽略,不能看作质点,故A错误;<br/>B.选取不同的参考系,物体的运动状态是不相同的,故B错误;<br/>C.出租车收费是按路程收费的,故C错误;<br/>D.第<img src="http://192.168.1.145:10811/static/physical_formulas_imgs/16184553930794225.png" data-latex="${4 \rm{s} }$" width="13",height="11" />是指<img src="http://192.168.1.145:10811/static/physical_formulas_imgs/16184553931930702.png" data-latex="${1 \rm{s} }$" width="12",height="11" />的时间,是指从<img src="http://192.168.1.145:10811/static/physical_formulas_imgs/16184553930220437.png" data-latex="${3 \rm{s} }$" width="13",height="11" />末到<img src="http://192.168.1.145:10811/static/physical_formulas_imgs/16184553930794225.png" data-latex="${4 \rm{s} }$" width="13",height="11" />末这一段时间,故D正确;<br/>故选:D.</p>'
 
-    cons = css_label_wash(cons1)
+    cons = css_label_wash(cons2)
 
     print(cons)

+ 3 - 1
utils/ruku_opera.py

@@ -47,7 +47,7 @@ class Ruku():
         self.wordid = wordid
         self.callback_url = callback_info["callback_url"]
         # self.callback_url = "123456"
-        self.source = callback_info["source"]
+        self.source = callback_info["source"]  # {"xue_guan": "1", "teacher": "2", "ai": "3", "qtk": 4,"school":5}
         self.subject = subject  # items_list[0]["period"] + items_list[0]["subject"]
         self.callback_code = 0
         self.callback_err = ""
@@ -340,6 +340,8 @@ class Ruku():
                 if s:
                     s = re.sub(r'(<img src="[^"]*?[a-z\d])\\(?!\\)([^"]*?")', r"\1/\2", str(s))  # 将路径中的\改为/
                     s = s.replace(new_img_local, new_img_online)
+                    # 将latex的标红标签去掉
+                    s = re.sub(r'<span style=\"color: red\">([^"]+?)</span>', r"\1", s)
                     if old_img_local:
                         return s.replace(old_img_local, new_img_online)
                 return s

+ 4 - 2
utils/stem_ans_split.py

@@ -142,9 +142,11 @@ def get_split_pos(row_list):
     """
     # 寻找题目和答案的切分点,一定要有“答案”关键字
     split_p1 = [k for k, v in enumerate(row_list)
-                if re.match(r'(参考|试[题卷]|考试|物理|理综|数学|化学|生物)答案.{,5}$|答案[和与及]?解析([((].*?[))])?$'  # |答\s*案$
+                if re.match(r'(参考|试[题卷]|考试|物理|理综|数学|化学|生物)答案.{,5}$'
+                            r'|答案[和与及]?解析([((].*?[))])?$'  # |答\s*案$
                             r'|.{,15}(参考|考试|(考?试|检测)[题卷]|物理|理综|数学|化学|生物)(答案|解析|答案[及与和]评分(标准|意见|细则|参考))\s*$'
-                            r'|.{,15}评分(标准|参考)|((参考|(考?试|检测)[题卷]|考试|物理|理综|数学|化学|生物)答案|答案[和与及]解析)[\dA-E\s..、、]+$'
+                            r'|.{,15}评分(标准|意见|细则|参考)$'
+                            r'|((参考|(考?试|检测)[题卷]|考试|物理|理综|数学|化学|生物)答案|答案[和与及]解析)[\dA-E\s..、、]+$'
                             r'|.{,15}(参考|考试|(考?试|检测)[题卷])(答案|解析|答案[及与和]评分(标准|意见|细则|参考))\s*(物理|理综|数学|化学|生物)?\s*$'
                             r'|.{,15}解析[和与及]答案$',
                             re.sub(r"[上下]?学[年期]|[\d—【】..·、、::(())年\s]|[中大]学|模拟|[中高]考|年级|[学期][末中]"

+ 93 - 5
utils/washutil.py

@@ -13,6 +13,7 @@ import shutil
 # from PIL import Image
 import base64, os, random
 import time
+import requests
 import hashlib
 from pprint import pprint
 # from bs4 import BeautifulSoup
@@ -365,6 +366,7 @@ class HtmlWash():
             # kk = re.search('(<img src=".*?image\d+\.(png|gif|jpg|jpeg))', src)
             # new_src = src.replace(kk.group(1), self.img_url[kk.group(1)]) if type(self.img_url) == dict and kk else src
             # 图片信息简化替换
+            print(src)
             new_src = re.sub(r'( data-latex)="\s*\\\[(.*?)\\\]\s*"', r'\1="$\2$"', src)
             new_src = re.sub(r'( data-latex="\$[^"]+?\$")',
                              lambda x: x.group(1).replace("<", " \lt ").replace("  ", " "), new_src)
@@ -375,7 +377,13 @@ class HtmlWash():
             w_h_info = re.search(r'<img src=".*?/(new_)?image([\da-z]+)\..*?width="([\d.]+)[pxt]*?"\s*height="([\d.]+)[pxt]*?"', src)
             w_h = " w_h=" + w_h_info.group(3).split('.')[0] + "*" + w_h_info.group(4).split('.')[0] \
                 if w_h_info and not mathpix else ""  # w_h 和 mathpix只存在一个
-            image_id = re.search(r'<img src=".*?/(new_)?image([\da-z]+)\.', src).group(2)
+            # image_id = re.search(r'<img src=".*?/(new_)?image([\da-z]+)\.', src).group(2)
+            image_info = re.search(r'<img src=".*?/([^/]+?)/(new_)?image([\da-z]+)\.', src)  # 2023.12.1
+
+            print(image_info.groups())
+            image_id = image_info.group(1) + image_info.group(3)
+            if len(image_id) > 10:
+                image_id = image_id[-10:]
             src2subs[src] = '<imgsrc' + image_id + w_h + mathpix + "/>"
             subs2src['<imgsrc' + image_id + w_h + mathpix + "/>"] = new_src
         for k, v in src2subs.items():
@@ -481,7 +489,7 @@ def get_md5(image_id):
     return str(md.hexdigest())
 
 
-def wash_after(res_dict):
+def wash_after(res_dict, subject="数学"):
     """
     1.处理最终结果多余的换行符;2.对题文中已给答案的选择填空进行替换;3.选择题的细分
     :param res_dict:
@@ -496,6 +504,8 @@ def wash_after(res_dict):
     is_optional = False
     option_score = 0
     select_type_id = []
+    all_content_str_list = []
+    topic_type_list = []
 
     for num, sr in enumerate(res_dict):
         sr["stem"] = re.sub(r"\n[_\-\s]*密[…O•.\s]*封[….O•\s]*装?[…O•.\s]*订?[….O•\s]*线?"
@@ -602,8 +612,8 @@ def wash_after(res_dict):
                 sr["answer_type"] = configs.answer_type[sr["answer_type"]]
 
             if not sr["parse"] and not sr["key"]:  # 答案和解析都没有
-                sr["parse"] = "略"
-                sr["key"] = "略"
+                # sr["parse"] = "略"
+                # sr["key"] = "略"
                 sr['errmsgs'].append("本题缺少答案和解析")
             elif not sr["key"] and sr["parse"]:
                 sr["key"] = ""  # 见解析
@@ -643,6 +653,7 @@ def wash_after(res_dict):
         # if "type1" in sr:
         #     del sr["type1"]
 
+        # 题型纠正
         # 将选择题改为单选或多选,"is_multiple_choice"
         sr['type'] = re.sub("([单多])项选择题?", r"\1选题", sr['type'])
         sr['type'] = sr['type'].replace("题题", "题")  # .replace("简答", "解答")
@@ -653,6 +664,8 @@ def wash_after(res_dict):
                 sr['type'] = '多选题'
             elif len(re.findall("[A-Z]", sr["key"])) == 1:
                 sr['type'] = '单选题'
+            elif "数学" in subject or "物理" in subject:
+                sr['type'] = '单选题'
             info_x = re.search("^[((](多)选题?[))]", sr["stem"].replace(" ", ""))
             if info_x:
                 sr['type'] = '{}选题'.format(info_x.group(1))
@@ -672,15 +685,51 @@ def wash_after(res_dict):
                 sr['type'] = '多选题'
             elif len(re.findall("[A-Z]", sr["key"])) == 1:
                 sr['type'] = '单选题'
+            elif "数学" in subject or "物理" in subject:
+                sr['type'] = '单选题'
             else:
                 sr['type'] = '选择题'
                 if "缺少答案" not in "".join(sr['errmsgs']):
                     sr['errmsgs'].append("本题缺少答案")
+        elif "数学" in subject:
+            if sr['type'].replace("题", "") == "填空":
+                if sr['blank_num'] > 1:
+                    sr['type'] = "多空题"
+                else:
+                    sr['type'] = "单空题"
+            elif sr['type'].replace("题", "") not in ["单空", "多空"]:
+                sr['type'] = "解答题"
+        # elif "物理" in subject:
+        #     # 用第一版模型预测
+        #     content = sr['stem']
+        #     if "options" in sr and sr["options"]:
+        #         content+= "\n" + "\n".join(["{}、{}".format(chr(ord('@') + idm + 1), option)
+        #                                     for idm, option in enumerate(sr["options"])])
+        #     try:
+        #         r = requests.post(url=configs.phy_topicType_ip,
+        #                            json={"content": content, "period": "高中",
+        #                                 "topic_type": sr['type']})
+        #         sr['type'] = r.json()["res"]
+        #         if sr['type'] == "简答题":
+        #             sr['type'] = "解答题"
+        #     except Exception as e:
+        #         print(e)
+        #         if sr['type'].replace("题", "") in ["单空", "多空", "填空"]:
+        #             sr['type'] = "填空题"
+        #         else:
+        #             sr['type'] = "解答题"
         elif sr['type'].replace("题", "") in ["单空", "多空", "填空"]:
             sr['type'] = "填空题"
         elif sr['type'] not in ["选择", "选择题"]:
             sr['type'] = "解答题"
 
+        content = sr['stem']
+        if "options" in sr and sr["options"]:
+            content += "\n" + "\n".join(["{}、{}".format(chr(ord('@') + idm + 1), option)
+                                         for idm, option in enumerate(sr["options"])])
+        all_content_str_list.append(content)
+        topic_type_list.append(sr['type'])
+
         # """按照原先高中数学解析的最后输出格式整理输出"""
         # sr["type"] = sr['type'].replace("非选择", "解答").replace("题题", "题")  #
         sr["topic_num"] = sr['item_id']
@@ -694,7 +743,9 @@ def wash_after(res_dict):
             del sr['is_optional']
         if 'spliterr_point' in sr:
             del sr['spliterr_point']
-        del sr['score'], sr['item_id']
+        if 'score' in sr:
+            del sr['score']
+        del sr['item_id']
 
         # ---------------------字符串公式处理--------------------------------
         # sr["stem"] = get_equation_instr(sr["stem"])
@@ -703,6 +754,43 @@ def wash_after(res_dict):
         # if "options" in sr:
         #     sr["options"] = list(map(get_equation_instr, sr["options"]))
         # ----------------------------------------------------------------
+    # 物理题型批量调接口:节约时间
+    if "物理" in subject:
+        epoches = int(len(all_content_str_list) / 10)
+        pred_topic_types = []
+        if epoches > 0:
+            last = 0
+            for epoch in range(epoches):
+                input_data = {"content": all_content_str_list[last:(epoch+1)*10], "period": "高中",
+                              "topic_type": topic_type_list[last:(epoch+1)*10]}
+                last = (epoch+1)*10
+                try:
+                    r = requests.post(url=configs.phy_topicType_ip, json=input_data)
+                    pred_topic_types.extend(r.json()["res"])
+                except Exception as e:
+                    print(e)
+                    pred_topic_types.extend([""]*10)
+            rest_con = all_content_str_list[last:]
+            rest_topic_type = topic_type_list[last:]
+        else:
+            rest_con = all_content_str_list
+            rest_topic_type = topic_type_list
+        if rest_con:
+            input_data = {"content": rest_con, "period": "高中", "topic_type": rest_topic_type}
+            try:
+                r = requests.post(url=configs.phy_topicType_ip, json=input_data)
+                pred_topic_types.extend(r.json()["res"])
+            except Exception as e:
+                print(e)
+                pred_topic_types.extend([""] * len(rest_con))
+        # 将预测题型替换到res_dict中
+        if any([True for i in pred_topic_types if i]) and len(pred_topic_types) == len(res_dict):
+            for idx, pred_type in enumerate(pred_topic_types):
+                if pred_type and res_dict[idx]['type'] in ["填空题", "解答题"]:
+                    if pred_type == "简答题":
+                        pred_type = "解答题"
+                    res_dict[idx]['type'] = pred_type
+    # --------------------------------------------------------------
     # 换行符替换
     convert_huanhang(res_dict)
     # ------------------------------------------------------------------------

+ 496 - 0
utils/washutil_for_DL_way.py

@@ -0,0 +1,496 @@
+#!/usr/bin/env/python
+# -*- coding:utf-8 -*-
+
+import re
+import base64, os, random
+import time
+from pprint import pprint
+import configs
+from utils.field_eq2latex import get_latex
+from utils.html_again_parse import css_label_wash
+
+
+def table_label_cleal(con):
+    """
+    Remove the line breaks that separate table-related tags.
+
+    Steps, in order:
+      1. collapse a newline followed by any run of whitespace into one newline;
+      2. up to 10 times, while a ``<tag>\\n<tag>`` pair is still present, glue
+         adjacent table/div tags (and ``<td ...>`` cells) together;
+      3. drop tables whose every cell holds only an empty ``<p></p>``
+         (the ``<p>`` that follows such a table is removed with it);
+      4. put a newline between ``</table><p>`` and a following "(digit)" marker.
+
+    :param con: html text
+    :return: the cleaned html text
+    """
+    text = re.sub(r"\n(\s|\n|\t)+", "\n", con)
+
+    # The pass count is capped because the detection pattern is broader than
+    # the two substitutions: it may keep matching text they never change.
+    for _ in range(10):
+        if not re.search(r"</?[a-z]+>\n(</?[a-z]+>|<td\s+\n*[a-z=\"\d]+>)", text, re.S):
+            break
+        text = re.sub("(</?t[dr]>|</?table>|</?tbody>|</?div>)\n(</?t[dr]>|</div>|</?table>|</?tbody>|<p>)",
+                      r"\1\2", text, flags=re.S)
+        text = re.sub(r'(</?t[rd]>)\n(<td\s.+?>)', r'\1\2', text, flags=re.S)
+
+    # Remove tables that contain nothing but empty cells.
+    text = re.sub(r'<table>[\s\n\t]*?<tbody>[\s\n\t]*?(<tr>[\s\n\t]*?<td[^<>]*?>[\s\n\t]*?<p>[\s\n\t]*?</p>'
+                  r'[\s\n\t]*?</td>[\s\n\t]*?</tr>[\s\n\t]*?)+</tbody>[\s\n\t]*?</table>[\s\n\t]*?<p>', "", text,
+                  flags=re.S)
+    return re.sub(r'(</table><p>)\s*([((]\s*\d\s*[))])', r'\1\n\2', text)
+
+
+def base642img(html_data, wordid):
+    """
+    Save inline base64 images to disk and point their <img> tags at the saved files.
+    (Written for html that MathJax rendered in css-html form.)
+
+    Every ``<img ... src="data:image/...">`` tag found in ``html_data`` is decoded and
+    written to ``<configs.IMG_FOLDER>/<wordid>/new_image<epoch><6 random digits>.<type>``;
+    the tag is then replaced by ``<img src="<configs.new_img_ip>/<wordid>/<name>" />``.
+    ``width``/``height`` are carried over only when both are present as plain integers;
+    other attributes (alt, style, ...) are dropped.
+
+    A tag is left untouched when its data URI is unusable: no ``data:image/<type>;base64``
+    header, no comma before the payload, a type containing a path separator, or a payload
+    that is not valid base64.
+
+    :param html_data: html text that may contain base64-encoded images
+    :param wordid: document id, used as the sub-folder name
+    :return: html_data with the handled tags rewritten (returned as-is when there are none)
+    """
+    # Collect every base64-encoded <img> tag:
+    # (whole tag, optional leading attribute, data URI, trailing attributes)
+    all_base64_image = re.findall(r'(<img ([a-z]+="[^"]*?" )?src="(data:image[^>"]+?)"(.*?)\s*/?>)', str(html_data),
+                                  flags=re.S)
+    if not all_base64_image:
+        return html_data
+
+    file_path = configs.IMG_FOLDER + '/' + str(wordid)
+    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
+    os.makedirs(file_path, exist_ok=True)
+    # Distinct random suffixes so images saved within the same second do not collide
+    name_list = random.sample(range(100000, 999999), len(all_base64_image))
+    for n, img in enumerate(all_base64_image):
+        header, comma, payload = img[2].partition(",")
+        if not comma:
+            # malformed data URI: there is no payload to decode
+            continue
+        img_type_info = re.search(r"data:image/(.+?);base64", header)
+        img_type = img_type_info.group(1) if img_type_info else ""
+        # img_type becomes the file extension; never let it carry a path separator
+        if not img_type or "/" in img_type or "\\" in img_type:
+            continue
+        try:
+            img_data = base64.b64decode(payload)
+        except ValueError:
+            # binascii.Error is a ValueError; one corrupt image must not abort the document
+            continue
+        w_info = re.search(r'( width="\d+")', img[3])
+        h_info = re.search(r'( height="\d+")', img[3])
+
+        img_name = "new_image" + str(int(time.time())) + str(name_list[n]) + "." + img_type
+        save_path = os.path.join(file_path, img_name)
+        with open(save_path, 'wb') as f:
+            f.write(img_data)
+
+        flag_behind = '" />'
+        if w_info and h_info:
+            flag_behind = '"' + w_info.group(1) + h_info.group(1) + ' />'
+        temp_img = '<img src="' + configs.new_img_ip + '/' + str(wordid) + '/' + img_name + flag_behind
+        html_data = html_data.replace(img[0], temp_img)
+    return html_data
+
+
+class HtmlWash_2():
+    """
+    Regex-based html cleaner; this file is the variant kept for the "DL way" pipeline.
+
+    ``html_cleal()`` normalises symbols, headings, answer keywords, question numbers and
+    tables, then splits the text into a list of lines.
+    """
+
+    def __init__(self, html, wordid, is_reparse=0, img_url="", must_latex=0):
+        """
+        Html text cleaning.
+        During batch re-parsing, the text whose new images have been substituted is
+        returned as the text to store, while the text that keeps being cleaned goes on
+        to the structured-parsing logic.
+
+        :param html: html text to clean
+        :param wordid: document id (used for the image folder and passed to get_latex)
+        :param is_reparse: truthy when the html comes from a re-parse request
+        :param img_url: stored on the instance; not read inside this class
+        :param must_latex: passed through to get_latex
+        """
+        self.html = html
+        # Defined up front so the attribute exists even before new_pic_sub() runs.
+        self.new_html = html
+        self.img_url = img_url
+        self.wordid = wordid
+        self.is_reparse = is_reparse
+        self.must_latex = must_latex
+        # Tag patterns that are simply deleted from the text (joined with "|").
+        self.sub_list = ["</?div>", "</?b>", "</?caption>", "</?center>", "</?cite>", "</?code>", "</?colgroup>",
+                         "</?menu>", "</?dd>", "</?dir>", "</?li>", "</?em>", "</?article>", "</?header>", "</?ruby>",
+                         "</?summary>", "</?details>", "</?strong>", "</?strike>", "</?small>", "</?select>",
+                         "</?section>", "</?script>", "</?[su]>", "</?var>", "</?ul>", "</?tt>", "</?title>",
+                         "</?thead>",
+                         "</?tfoot>", "<hr />", "<hr>"]
+        # Literal replacements (html entities, look-alike letters, heading synonyms).
+        # The keys are joined into one alternation, so their order is significant.
+        # The '&lt;' mapping is disabled, so '&lt;' stays escaped.
+        self.sub_dd = {'&times;': '×',
+                       '&divide;': '÷',
+                       '&deg;': '°',
+                       '&middot;': '·',
+                       '&plusmn;': '±',
+                       '&ordm;': 'º',
+                       '&sup1;': '¹',
+                       '&sup2;': '²',
+                       '&sup3;': '³',
+                       '&frac12;': '1/2',
+                       '&frac14;': '¼',
+                       '&frac34;': '¾',
+                       '&yen;': '¥',
+                       'm&sup3;': 'm³',
+                       '&pound;': '£',
+                       '&gt;': '>',
+                       "A": "A",
+                       "А": "A",
+                       "Α": "A",
+                       "B": "B",
+                       "В": "B",
+                       "в": "B",
+                       "Β": "B",
+                       "C": "C",
+                       "С": "C",
+                       "c": "c",
+                       "с": "c",
+                       "D": "D",
+                       "Ε": "E",
+                       "E": "E",
+                       "F": "F",
+                       "G": "G",
+                       "g": "g",
+                       "m": "m",
+                       "N": "N",
+                       "s": "s",
+                       "t": "t",
+                       "/": "/",
+                       "=": "=",
+                       "-": "-",
+                       "2": "2", "3": "3", "4": "4", "5": "5", "6": "6",
+                       "7": "7", "8": "8", "9": "9", "1": "1", "0": "0",
+                       '&nbsp;&nbsp;': ' ',
+                       '&nbsp;': ' ',
+                       "〖": '【',
+                       "〗": '】',
+                       "題": '题',
+                       "单项选择": '单选',
+                       "多项选择": '多选',
+                       "双项选择": '多选',
+                       "实验与探究题": '实验',
+                       "原理综合题": '原理题',
+                       }
+
+    def new_pic_sub(self):
+        """
+        Base64 images are first saved locally; they are swapped for the online
+        (Tencent Cloud) address later, at ingestion time.
+        In re-parse mode the images go into a folder named after the word_id.
+        Afterwards ``self.new_html`` holds the (possibly rewritten) html.
+        :return:
+        """
+        if self.is_reparse:
+            # clean css tags
+            self.html = css_label_wash(self.html)
+            # save the base64-encoded images
+            self.html = base642img(self.html, self.wordid)
+        self.new_html = self.html
+
+    def html_cleal(self):
+        """
+        Clean ``self.html`` and cut it into lines.
+
+        The substitutions below are order-dependent; later ones rely on the
+        normalisation done by earlier ones.
+
+        :return: (cleaned html text, list of lines split from it, self.new_html)
+        """
+        # ======= strip MathJax tags ========
+        if "MathJax" in self.html:  # re-parse input may carry MathJax-rendered formula tags
+            all_mathjax = re.findall('(<span class="MathJax_Preview".*?</script>(</span>)*)', self.html)
+            for jax in all_mathjax:
+                latex = re.findall('<script .+?">(((?!(</)).)*?)</script>(</span>)*', jax[0])
+                if latex:
+                    latex = "${}$".format(latex[0][0])
+                    self.html = self.html.replace(jax[0], latex)
+                else:
+                    self.html = self.html.replace(jax[0], "")
+
+        # ====== new images coming from a re-parse =====
+        self.new_pic_sub()
+
+        # ===== special symbols =====
+        html2txt = re.sub(r"|".join(self.sub_list), "", str(self.html))  # 2020/4/7
+        html2txt = re.sub("|".join(self.sub_dd.keys()), lambda x: self.sub_dd[x.group()], html2txt)  # 2020/4/1,4/7,4/20
+        html2txt = re.sub("[不非]定[向项]选择", "不定选择", html2txt)
+        # NOTE(review): the first five needles are raw strings holding doubled backslashes and
+        # str.replace matches them literally - confirm that this exact text occurs in the input.
+        # An empty-string needle must never be used here: str.replace('', x) inserts x between
+        # every character. The private-use gamma glyph is therefore matched only through its
+        # explicit escape.
+        html2txt = html2txt.replace(r"\\[{\\text{V}}V\]", "Ⓥ").replace(r"\\[{\\text{A}}A\]", "Ⓐ") \
+            .replace(r"\\[{\\text{W}}W\]", "Ⓦ").replace(r"\\[{\\text{X}}X\]", "Ⓧ").replace(r"\\[{\\text{G}}G\]", "Ⓖ") \
+            .replace("\uf067", "γ").replace('\uf020', "").replace("\u3000", " ") \
+            .replace("\u2003", " ").replace("\x7f", " ").replace("\xa0", "")
+        html2txt = re.sub(r"(<p>\s*)【例题(\d+)】", r"\1\2、", html2txt)
+        html2txt = re.sub(r"\\\(|\\\)", "$", html2txt)
+
+        # Field-formula conversion; <sub>/<sup> can be shown by the front end, no latex needed
+        try:
+            html2txt, newhml = get_latex(html2txt, self.is_reparse, self.wordid, self.must_latex)
+            if newhml:  # field formulas were turned into images: the stored html must carry them too
+                self.new_html = newhml
+            html2txt = html2txt.replace("【omml-latex】", "")
+        except Exception:
+            # Best effort: if the conversion fails, keep the text and only drop the marker.
+            html2txt = html2txt.replace("【omml-latex】", "")
+
+        # String formulas such as Fe<sub>2</sub>O<sub>3</sub> are better handled after structuring
+        # <br/> handling
+        html2txt = re.sub(r"<br\s*/?>", "\n", html2txt)
+        html2txt = re.sub(r"[((]\s*(\d)\s*\$分\s*[))]", r"$(\1分)", html2txt)
+
+        # ===== question-type heading lines =====
+        # ---->>>>> the heading line may sit inside a table
+        if len(re.findall("</table>", html2txt)) >= 8:  # this threshold is not very rigorous yet
+            for tt in re.finditer('<tr>(((?!(</?tr>)).)*)</tr>', html2txt, re.S):
+                tt_list = re.split(r'^\s*<td[^<>]*?>|</p></td>|</td>[\n\s]*?<td[^<>]*?>'
+                                   r'|</td>\s*\n|</td>\s*$|\n\s*<td[^<>]*?>|<td[^<>]*?><p>',
+                                   tt.group(1).strip())  # "</td>\s*[$\n]" does not work here
+                tt_list = [col for col in tt_list if col.strip()]
+                # drop the "score / grader" rows
+                if " ".join(tt_list).replace(" ", "") in ['得分评卷人', '评卷人得分']:
+                    html2txt = html2txt.replace(tt.group(0), "")
+        # ---->>>>> end
+        html2txt = re.sub(r"(</table>)\s*([一二三四五六七八九十]\s*[、..、::]?.{2,6}题)", r"\1</p>\2", html2txt)
+        html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,::]\s*(论述|填空|探究)题?[与和、、,,\s]*?(计算题|实验题)', r"\1、\3", html2txt)
+        html2txt = re.sub(r'<td[^<>]*?><p>(([一二三四五六七八九十])\s*[、..、,,::]\s*(.{2,4}题)\s*</p>)</td>[^p]*?<p>', r"\1",
+                          str(html2txt), flags=re.S)
+        html2txt = re.sub(r"<p>\s*([一二三四五六七八九十])\s*[、..、,,::]?\s*(计算|[解简]答|实验|作图)题?[与和、、,,\s]*?(计算|[解简]答|实验|作图)",
+                          r"<p>\1、\2题", html2txt)
+        html2txt = re.sub(r'<p>\s*[((]\s*[一二三四五六]\s*[))]\s*必考题\s*(.?|.+?分\s*[.。.]?)\s*</p>', "", html2txt)
+        html2txt = re.sub(r'<p>\s*[((]\s*[一二三四五六]\s*[))]\s*选考题\s*.?\s*.{,4}(?<!\d)(\d+分)\s*[,,。].{,50}</p>',
+                          r"<p>【选做题】:'\1'</p>", html2txt)
+        html2txt = re.sub(r'<p>\s*[((]\s*[一二三四五六]\s*[))]\s*选考题\s*(.?|.+?分\s*[.。.]?)\s*</p>', "<p>【选做题】</p>", html2txt)
+        html2txt = re.sub(r'<p>\s*([一二三四五六七八九十])\s*[、..、,,::]?\s*(单项?选择?|非?选择|多项?选择?|不定选择|填空|计算|[解简]答|实验|作图)题?\s*</p>',
+                          r"<p>\1、\2题</p>", html2txt)
+        html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,::]?\s*(单选|单项选择|选择|不定选择|多选|多项选择|填空|计算|[解简]答|实验|作图)\s*(?!题)'
+                          r'([((]\s*本题|.*?\d分)', r"\1" + "、" + r'\2' + "题" + r"\3", html2txt)
+        html2txt = re.sub(r'([一二三四五六])\s*[、..、,,::]?\s*(单选|单项选择|非?选择|不定选择|多选|多项选择|填空|计算|[解简]答|实验|作图)题',
+                          r"\1" + "、" + r'\2' + "题", html2txt)
+        html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,::]?\s*([((]\s*(每小题|本大?题)((?!(选项)).)+?[))]|综合题)',
+                          r"\1" + "、" + "解答题", html2txt)
+        html2txt = re.sub(r'(?<!<p>)\s*([一二三四五六七八九十]\s*[、..、,,::]?\s*(单项?选择?|选择|不定选择|多项?选择?|填空|计算|[解简]答|实验|作图)题)',
+                          r'</p>\n<p>\1', html2txt)
+        html2txt = re.sub(r'<p>\s*([一二三四五六七八九十])\s*[、..、,,::]?\s*[((]?本?大?题((?!(选项)).)+?[))]?\s*</p>', r"<p>\1、本大题</p>",
+                          html2txt)
+
+        # ===== images =====
+        # 1>> delete hidden images, recognised by an abnormally small width and height
+        def sub1(ss):
+            if float(ss.group(1)) <= 3 and float(ss.group(2)) <= 3:
+                return ""
+            else:
+                return ss.group(0)
+
+        html2txt = re.sub(r'<img src=.*? width="([\d.]+)p[xt]" height="([\d.]+)p[xt]"\s*/?>', sub1, html2txt)
+
+        # 2>> drop the alt text carried by images
+        html2txt = re.sub(r'(<img src=.*?) alt=".+?"', r"\1", html2txt)
+        html2txt = re.sub(r'(<img src=(?!\sstyle=)+?(?<!\\)\")>', r"\1 />", html2txt)  # turn '">' into '" />'
+
+        # ===== answer / analysis keywords =====
+        html2txt = re.sub(r'【\s*(<img src=((?!/>).)+?/>\s*)*?([解答])\s*(<img src=((?!/>).)+?/>\s*)*?([析案])\s*'
+                          r'(<img src=((?!/>).)+?/>\s*)*?】', r"【\3\6】", str(html2txt))  # 2022/4/28
+        html2txt = re.sub(r'<p>\s*(解\s*[::])', r"<p>【解答】", str(html2txt))
+        html2txt = re.sub(r'【[^【】]*?(答案|[解分][析答]|详解|点[评睛])[^【】]*?】', r"【\1】", str(html2txt))
+        html2txt = re.sub(r'(\n\s*|<p>\s*|\s{2,}|\n\s*\d{,2}\s*[、..、]\s*)(答案|解析|解答|详解|点评|点睛|考点|专题)\s*[::]', r"\1【\2】",
+                          str(html2txt))
+        html2txt = re.sub(r'(\n|^|<p>)\s*(([1-9]|[1-9][0-9])\s*[..、、])?\s*\[\s*(答案|解析|解答|详解|点评|点睛|考点|专题)\s*\]',
+                          r"\1\2【\4】", str(html2txt))
+        html2txt = re.sub(r'([A-D])\s*\[\s*(解析|解答|详解|点评|点睛|考点|专题)\s*\]', r"\1\n【\2】", str(html2txt))
+        html2txt = re.sub(r'(\n|^|<p>)\s*(分析)\s*[::]', r"【\2】", str(html2txt))
+        if "【解析】" not in html2txt and "【解答】" in html2txt and "【分析】" not in html2txt:
+            html2txt = re.sub(r'【解答】', "【解析】", str(html2txt))
+
+        # ===== other keywords =====
+        html2txt = re.sub(r'<p>\s*(类型|知识点|考查角度|拔尖角度)[一二三四五六七八九十\d+][^p]*?</p>', "", str(html2txt))
+        html2txt = re.sub(r'<p>\s*(选修[\d-]*?[::].{2,15})\s*</p>', r"<p>【章节】\1</p>", html2txt)
+        html2txt = re.sub(r'<p>\s*([一二三四五六]\s*[、..、]?)?\s*(\[.{2}-*?选修[\d-]*?.*?\])\s*([((]\d+分[))])?\s*</p>',
+                          r"<p>【章节】\2</p>", html2txt)
+        html2txt = re.sub(r'<p>\s*(基础|中档|综合)题[^p题]*?</p>|<p>\s*【(考点|专题)】[^p]*?</p>', "", str(html2txt))
+        html2txt = re.sub(r'<p>\s*(基础训练|提升训练|探究培优)</p>', "", str(html2txt))
+        html2txt = re.sub(r'<p>注意事项[::]\s*</p>(\n+\s*<p>\s*\d\s*[、..、][^/]+?</p>){1,}', "", html2txt, flags=re.S)
+        html2txt = re.sub(r'<p>注意事项[::]\s*\d\s*[、..、][^/]+?</p>(\n+\s*<p>\s*\d\s*[、..、][^/]+?</p>){1,}', "", html2txt,
+                          flags=re.S)
+        # NOTE(review): in the next two calls .replace() runs on the replacement *template*
+        # (before substitution), so it has no effect on the matched text - confirm the intent.
+        html2txt = re.sub(r'[((]\s*([A-Z\dⅠⅡⅢⅣⅤ]+|IV)\s*[))]', r"(\1)".replace(" ", "").replace("(IV)", "Ⅳ"), html2txt)
+        html2txt = re.sub(r'[((](\s*\d\s*\d?\s*分?\s*)[))]', "(" + r'\1'.replace(" ", "") + ")", html2txt)
+        html2txt = re.sub(r'\[来源:.*?\]', "", html2txt)
+        html2txt = re.sub('<p>欢迎访问.*?</p>', '', html2txt)
+        html2txt = re.sub(r'w\s*w\s*w\..*?(\.\s*c\s*o\s*m|\.cn)+|(?<!["“=\'])http:.*?\.(com|cn|org)', "",
+                          html2txt)  # e.g. "ww w.gkstk.c om"
+        html2txt = re.sub(r'<(table|tr) [a-z]+="\d+">', r'<\1>', html2txt)  # <td rowspan="2"> is kept
+        html2txt = re.sub(r'<(table)( [a-z]+=".*?")+>', r'<\1>', html2txt)
+        html2txt = re.sub(r'<p>\s*第\s*[二三四ⅡⅢⅣ]\s*(卷|部分)\s*([((].*?[))]|非?选择题.{,8})?\s*</p>', "<p>【非选择题】</p>", html2txt)
+        # ===== lines that are probably a question-type heading =====
+        html2txt = re.sub(r"<p>【非选择题】</p>((\s|\n|<p>|</p>)*\d{1,2}\s*[..、、].+?)", r"<p>二、解答题</p>\1", html2txt) \
+            .replace("【非选择题】", "")
+
+        # ===== options =====
+        html2txt = re.sub(r'(<p>\s*([1-9]|[1-9][0-9])\s*[..、、].+?[((]\s*[))])\s*(A\s*[..、、][^/]*?</p>)',
+                          r"\1</p>\n<p>\3", str(html2txt))
+
+        # ===== question numbers =====
+        html2txt = re.sub(r'([ED]\s*[、..、].*?((?<![::])\s+|</su[pb]>\s*))(([1-9]|[1-9][0-9])\s*[、..、])',
+                          r"\1</p>\n<p>\3", html2txt)
+        html2txt = re.sub(r'((</?p>|\n)\s*(<img src=.*?"\s*/?>\s*)?([1-9]|[1-9][0-9]))\s*'
+                          r'([((]\s*(\d{1,2}[.\s\d]*?分|.{2,3}题?)\s*[))]|解析?\s*[::]|【解析】)', r"</p>\1、\5", html2txt)
+        html2txt = re.sub(r"<p>\s*([1-9]|[1-9][0-9])\s*([((]20\d{2}\s*[\u4e00-\u9fa5、、]{2,9}[))])", r"<p>\1、\2",
+                          html2txt)
+        html2txt = re.sub(r"<p>\s*([1-9]|[1-9][0-9])\s*(【(解析?|答案?)】|(解析?|答案?)\s*[::]|\[(答案|解析)\])", r"<p>\1、\2",
+                          html2txt)
+        html2txt = re.sub(r"<p>\s*([1-9]|[1-9][0-9])\s*([((]\s*\d+\s*分?\s*[))])?(【(解析?|答案?)】|(解析?|答案?)\s*[::]"
+                          r"|\[(答案|解析)\])", r"<p>\1、\2\3", html2txt)
+        # an image directly adjacent to a question number
+        html2txt = re.sub(r"<p>\s*(<img src=((?!/>).)+?/>)\s*([1-9]|[1-9][0-9])\s*"
+                          r"([((]20\d{2}\s*[\u4e00-\u9fa5、、]{2,9}[))])", r"<p>\3、\1\4", html2txt)  # 2024.5.6
+        html2txt = re.sub(r'<p>((\s*<su[bp]>\s*)?<img src=.*? height="[\d.]+p[tx]"\s*/?>(\s*</su[bp]>)?\s*)'
+                          r'(([1-9]|[1-9][0-9])\s*[、..、])', r"<p>\4\1", html2txt)  # 2024.5.6
+        html2txt = re.sub(r"(</p>|\n)\s*(<img src=((?!/>).)+?/>)\s*([1-9]|[1-9][0-9])\s*"
+                          r"([((]20\d{2}\s*[\u4e00-\u9fa5、、]{2,9}[))])", r"<p>\2</p>" + "\n" + r"<p>\4、\5",
+                          html2txt)  # 【susp_img】
+        html2txt = re.sub(r'(</p>|\n)((\s*<su[bp]>\s*)?<img src=.*? height="[\d.]+p[tx]"\s*/?>(\s*</su[bp]>)?\s*)'
+                          r'(([1-9]|[1-9][0-9])\s*[、..、])', r"</p>\2</p>" + "\n" + r"\5", html2txt)
+        html2txt = re.sub(r"(<p>((?!<p>).)+?(\s|[/\"]>))(([1-9]|[1-9][0-9])\s*[、..、].{,20}本[大小]?题\d+分)",
+                          r"\1</p>" + "\n<p>" + r"\4", html2txt)
+        # several images directly adjacent to a question number
+        html2txt = re.sub(r"</?p>((\s*<su[bp]>\s*)?<img src=.*?/>(\s*</su[bp]>)?"
+                          r"((\s*<su[bp]>\s*)?<img src=((?!/>).)+?/>(\s*</su[bp]>)?)*?\s*)\s*(([1-9]|[1-9][0-9])\s*[、..、])",
+                          r"</p>\1</p>" + "\n<p>" + r"\8", html2txt, flags=re.S)
+        # applied twice on purpose in the original code
+        html2txt = re.sub(r'(<p>\s*[一二三四五六七八九十].*?题\s*\(.+?分.*?\))\s*(([1-9]|[1-9][0-9])\s*[、..、].*?)</p>',
+                          r"\1</p>\n<p>\2</p>", html2txt)
+        html2txt = re.sub(r'(<p>\s*[一二三四五六七八九十].*?题\s*\(.+?分.*?\))\s*(([1-9]|[1-9][0-9])\s*[、..、].*?)</p>',
+                          r"\1</p>\n<p>\2</p>", html2txt)
+        html2txt = re.sub(r'(<p>.*?[..]{6,}\s*\d+分)\s*(([1-9]|[1-9][0-9])\s*[、..、].*?)</p>', r"\1</p>\n<p>\2</p>",
+                          html2txt)
+        html2txt = re.sub(r'([1-9]|[1-9][0-9])\s*([((]\s*\d{1,2}[.\s\d]*?分\s*[))])\s*[、..、]', r"\1" + "、" + r"\2",
+                          html2txt)
+
+        # 3>> normalise the image path separator.
+        # In this variant the <img> tags are NOT swapped for "<imgsrc.../>" placeholders
+        # (that id-mapping step is disabled), so the original tags stay in the text.
+        html2txt = re.sub(r'( src=".*?files)\\image', r"\1/image", html2txt)
+        # ------------------------------------------------------------------------
+
+        # ======== html -> list =========
+        html2txt = re.sub(r'(</?div>|</table>|</?body>)(\n\s*)*?<p>', r"\1</p>" + "\n<p>", html2txt, flags=re.S)
+        # >>>>>> tables are swapped for placeholders before cutting:
+        # cutting on "\n" alone is wrong because a table may contain line breaks itself
+        subs2table = {}
+        all_table = re.findall(r'<table>.*?</table>', html2txt, flags=re.S)
+        for k, v in enumerate(all_table):
+            html2txt = html2txt.replace(v, "<t{}b>".format(k))
+            # remove the line breaks inside the table
+            v = re.sub(r'<p>\s*(</?t[drh]( .*?")?>|</?table>|</?tbody>)\s*</p>', r"\1", v)
+            v = re.sub(r'</td></p>[\n\s]*<p><td>', "</td><td>", v)
+            v = re.sub(r'<td>(<p>|\s|</p>|\n)*</td>', "<td> </td>", v)
+            v = re.sub(r'</tbody></?p></table>', "</tbody></table>", v)
+            v = re.sub(r'(</?t[drh]( .*?")?>|</?table>|</?tbody>)(\s*<p>\s*</p>)[\s\n]*?(<br\s*/?>|\n)+', r"\1", v,
+                       flags=re.S)
+            v = re.sub(r'(</?t[drh]( .*?")?>|</?table>|</?tbody>)(<br\s*/?>|\n|</p>|\s)+', r"\1", v, flags=re.S)
+            v = re.sub(r'(</t[drh]( .*?")?>|</table>|</tbody>)(<br\s*/?>|\n|<p>|\s)+', r"\1", v, flags=re.S)
+
+            # line breaks right before/after the table tag itself are not removed yet
+            subs2table["<t{}b>".format(str(k))] = v
+
+        # a bare "<" that is not the start of a known tag is escaped to "&lt;"  2021-10-13
+        def sub2(ss):
+            if re.search(r'^(img|/?h[123456]|/?su[bp]>|t\d+b>|br\s*/?>'
+                         r'|/?(p|span|font|article|ul|ol|div|table|t?body|html|head|t[drh])(\s*|\s+style=.*?")>'
+                         r'|/?[a-z]+ style=.*?">)', ss.group(1)) is None:
+                return "&lt;{}".format(ss.group(1))
+            else:
+                return "<{}".format(ss.group(1))
+
+        html2txt = re.sub("<([^<]{1,30})", sub2, html2txt)
+        if subs2table:
+            html2txt = re.sub(r"|".join(subs2table.keys()), lambda x: subs2table[x.group()], html2txt)
+        # >>>>>> cut the html into lines
+        con_list = sum([re.split('<p>|<h[12345]>', i) if len(re.findall("<p>|<h[12345]>", i)) > 1 else [i] for i in
+                        re.split(r"\n+|</p>(?!</td>)|</h[12345]>", html2txt)], [])
+        con_list = [re.sub(r"^\n*\s*(<p>|<h[12345]>)+", "", ii) for ii in con_list]
+        # leftover individual tags
+        con_list = [re.sub(r"^<([a-z]+)>[\s\t\n]*</\1>$", "", i.strip()) for i in con_list]  # 2020/4/7,14
+        con_list = [re.sub(r"^(<table>|</td>|<td[^<>]*?>|</?tr>)+?(.|\n)+?([一二三四五六七八九十])\s*[、..、]\s*(.{2,4}题)(.|\n)+?</table>",
+                           r"\3、\4", i.strip())
+                    for i in con_list]
+        # remove any </?p> that is still left, plus trailing seal-line / candidate-info text
+        con_list = [re.sub(r"</?p>|[…O•.\s]*?密[…O•.\s]*?封[….O•\s]*?装?[…O•.\s]*?订?[….O•\s]*?线?[….O•\s]*?$"
+                           r"|((学校|班级|姓名|座位号|准考号|[学考]号)[\s::_]*?){2,}$", "", i.strip()) for i in con_list]
+        # ===== format of the "answers" heading line ====
+        temp_list = [re.split(r"^((\s*<imgsrcw_h=[^/\"]*?(data-latex=.*?)?\s*[/\"]>\s*)+)", v.strip(), maxsplit=1)[1::3]
+                     if re.match(r'(\s*<imgsrcw_h=[^/\"]*?(data-latex=.*?)?\s*[/\"]>\s*)+?(参考|考试|试[题卷]|物理|理综|数学|化学|生物)(答案|解析|答案[及与和]评分(标准|意见|细则))\s*$'
+                                 r'|(\s*<imgsrcw_h=[^/\"]*?(data-latex=.*?)?\s*[/\"]>\s*)+?评分标准'
+                                 r'|(\s*<imgsrcw_h=[^/\"]*?(data-latex=.*?)?\s*[/\"]>\s*)+?(参考|考试|试[题卷])(答案|解析|答案[及与和]评分(标准|意见|细则))\s*(物理|理综|数学|化学|生物)?\s*$',
+                                 re.sub(r"[上下]?学[年期]|[\d—【】..、、::(())年\s]|[中大]学|模拟|[中高]考|年级|[学期][末中]|[高初][一二三]", "",
+                                        v.strip())) else [v] for v in con_list]
+        con_list = sum(temp_list, [])
+        # ===== lines that only look like question numbers, e.g. "2、3、4、5、": prefix with 【fei】 ====
+        con_list = [re.sub(r"^\s*([1-9][0-9]?\s*[..、、])", r"【fei】\1", i.strip())
+                    if (len(re.findall(r"(^|\s*[..、、])\s*[1-9][0-9]?\s*[..、、]", i)) >= 3
+                        and len(re.sub(r"[\d..、、\s]", "", i)) < 2) else i for i in con_list]
+
+        # ===== drop useless information at the head and the tail =====
+        if con_list and re.search(r"[\u4e00-\u9fa5]|<img ", con_list[0]) is None:
+            con_list = con_list[1:]
+        while con_list and re.search(r"声明[::].*?著作权属.*?所有|(邮箱|用户|日期|QQ)\s*[::].+?", con_list[-1]):
+            con_list = con_list[:-1]
+        return html2txt, con_list, self.new_html
+
+
+if __name__ == '__main__':
+    # Ad-hoc manual run: wash one sample HTML file and print its non-empty rows.
+    #
+    # ------------- generate requirements.txt ---------------
+    # pip freeze > requirements.txt
+    # import os, sys
+    #
+    # project_root = os.path.dirname(os.path.realpath(__file__))  # locate the current directory
+    # print(project_root)
+    #
+    # # locate the interpreter / virtual-environment directory
+    # python_root = sys.exec_prefix
+    # print(python_root)
+    #
+    # # build the command that generates requirements.txt
+    # command = python_root + '\Scripts\pip freeze > ' + project_root + '\\requirements.txt'
+    # print(command)
+    #
+    # # run the command
+    # os.system(command)
+
+    # ---------------- one-step install from requirements.txt ------------
+    # pip install -r requirement.txt
+    # python_root + '\Scripts\' + pip install -r requirements.txt
+
+    # import os
+    # rrr=os.path.basename(r"http:/pstatic.dev.xueping.com/data/word/2020/08/12/5f338d18e2cce.docx")
+    # print(rrr)
+    # item = "<a 我没发你的接口 $2366<a$   <a 我没发你的接口 $2366<a$  <img 我没发你的接口 $2366<a$ <a 我没发你的接口 $2366<a$   <a 我没发你的接口 $2366<a$"
+    # item = r"2.下列选项中,使不等式\( x<\frac{1}{x}< x_{2} \)"
+    # ww = css_conflict_deal(item)
+    # print(ww)
+    p1 = r"/home/cv/workspace/tujintao/document_segmentation/Data/samples/真实样例/6264fa25f84c0e279ac643ef.html"
+    # Read through a context manager so the file handle is closed as soon as
+    # the content has been loaded (the bare open(...).read() never closed it).
+    with open(p1, 'r', encoding="utf8") as sample_file:
+        t1 = sample_file.read()
+    # NOTE(review): the method that ends directly above this block returns three
+    # values; if that method is html_cleal, this two-name unpacking raises
+    # ValueError -- confirm against the class definition.
+    row_list, new_html = HtmlWash_2(t1, '11111111', is_reparse=1, must_latex=1).html_cleal()
+    # Drop rows that are empty or whitespace-only before printing.
+    row_list = [row for row in row_list if row.strip() != ""]
+    pprint(row_list)
+    print(len(row_list))
+    # presumably the HtmlWash_2 constructor parameters, kept for reference:
+    # html, wordid, is_reparse=0, img_url="", must_latex=0)
+

部分文件因为文件数量过多而无法显示