Explorar el Código

局部信息优化

cdZWj hace 4 meses
padre
commit
e3c541d90e

+ 6 - 5
structure/option.py

@@ -44,15 +44,15 @@ def option2block(option_con, item_no_type):
             while re.search(r"(\n\s*<img\s*src=.+?)([A-H][..、、])(.+?)", con.replace(" ", "")):
                 con = re.sub(r"(\n\s*<img\s*src=.+?)(?<!【)([A-H]\s*[..、、::])(.+?)", r"\1" + "\n" + r"【【\2】】\3", con)
             while re.search(r"(\n【【[A-H][..、、]】】.+?)(?<!【)([A-Hc][..、、])\n+(.+?)(?<!【)([A-H][..、、])(.+?)",
-                            con.replace(" ", ""), re.S):
+                            con.replace(" ", ""), flags=re.S):
                 con = re.sub(r"(\n\s*【【[A-H]\s*[..、、]】】.+?)(?<!【)([A-H]\s*[..、、::])\s*\n+(.+?)"
                              r"(?<!【)([A-H]\s*[..、、::])(.+?)", r"\1【【\2】】\3【【\4】】\5", con, flags=re.S)
-            while re.search(r"(\n【【[A-H][..、、]】】.+?)(?<!【)([A-H][..、、])\n+(.+?)", con.replace(" ", ""), re.S):
+            while re.search(r"(\n【【[A-H][..、、]】】.+?)(?<!【)([A-H][..、、])\n+(.+?)", con.replace(" ", ""), flags=re.S):
                 con = re.sub(r"(\n\s*【【[A-H]\s*[..、、]】】.+?)(?<!【)([A-H]\s*[..、、::])\s*\n+(.+?)",
                              r"\1【【\2】】\3", con, flags=re.S)
-            while re.search(r"(\n【【[A-H][..、、]】】.+?)(?<!【)([A-H][..、、])(.+?)", con.replace(" ", "")):
-                con = re.sub(r"(\n\s*【【[A-H]\s*[..、、]】】.+?)(?<!【)([A-H]\s*[..、、::])(.+?)", r"\1【【\2】】\3", con)
-            while re.search(r"(\n【【[A-H][..、、]】】[^【]+?/>\s+)(?<!【)([B-H][..、、])(.+?)", con.replace(" ", ""), re.S):
+            while re.search(r"(\n【【[A-H][..、、]】】.+?)(?<!【)([A-H][..、、])(.+?)", con.replace(" ", ""), flags=re.S):
+                con = re.sub(r"(\n\s*【【[A-H]\s*[..、、]】】.+?)(?<!【)([A-H]\s*[..、、::])(.+?)", r"\1【【\2】】\3", con, flags=re.S)
+            while re.search(r"(\n【【[A-H][..、、]】】[^【]+?/>\s+)(?<!【)([B-H][..、、])(.+?)", con.replace(" ", ""), flags=re.S):
                 con = re.sub(r"(\n\s*【【[A-H]\s*[..、、]】】[^【]+?/>\s+)(?<!【)([B-H]\s*[..、、::])\s*(.+?)",
                              r"\1【【\2】】\3", con, flags=re.S)  # 选项子母前面是图片 9/8
     if item_no_type == 2:
@@ -100,6 +100,7 @@ def option_structure(one_item, con, ans, item_no_type, is_danti=0):
         return one_item
 
     ans = re.sub("[;;.]+", "", ans)
+    ans = re.sub("<[a-z]+ (style|rowspan|colspan|class)=[^<>]*?\">|</[a-z]+>", "", ans)
     ans2 = []
     for a in ans.split("#"):
         if 0 < len(a.replace(" ", "")) < 8:

+ 4 - 3
structure/stems_structure.py

@@ -18,7 +18,7 @@ def stems_structure_byno(stem_con, subject="", is_danti=0):
             while stem_con and ((re.search(r"[\u4e00-\u9fa5]", stem_con[0]) is None) or
                                 (((subject != "语文" and re.search(r"[一二三四五]\s*[、..、]\s*[^必考基础综合中等]{2,4}题", stem_con[0]) is None)
                                  or (subject == "语文" and
-                                     re.search(r"[一二三四五]\s*[、..、]\s*.{,4}(运用|阅读|写作|选择|单选|默写|语言表达|作文|综合|诗歌鉴赏)", stem_con[0]) is None))
+                                     re.search(r"[一二三四五]\s*[、..、]\s*.{,4}(运用|阅读|写作|选择|单选|默写|语言表达|作文|综合|诗歌鉴赏)|^([\[【]题文[\]】]|阅读)", stem_con[0]) is None))
                                  and re.search(r"(^|\n)\s*[1-9][0-9]?\s*[..、、]((?!(答题卡|涂黑|本卷|2B铅笔|签字笔|密封线)).)*?$",
                                                stem_con[0]) is None)):
                 head_cons.append(stem_con[0])
@@ -69,8 +69,9 @@ def stems_structure_byno(stem_con, subject="", is_danti=0):
         # 获取正确题号的位置,进行切分
         new_item_no, items_no_idx = get_right_no(item_no_info)
         one_item_split = [stem_str[i:j] for i, j in zip(items_no_idx, items_no_idx[1:] + [None])]
-        if is_danti and items_no_idx and stem_str[:items_no_idx[0]]:
-            head_cons = stem_str[:items_no_idx[0]]
+        if items_no_idx and stem_str[:items_no_idx[0]]:
+            if is_danti or "【题文】" in stem_str[:items_no_idx[0]]:
+                head_cons = stem_str[:items_no_idx[0]]
         dd = {}
         for n, one_item in enumerate(one_item_split):
             if subject in ["地理", "语文"]:

+ 19 - 5
structure/stems_to_groups.py

@@ -68,9 +68,10 @@ def regroup(res_list, item_groups, ans_groups):
                 added_nos.append(group_no+1)
                 fei_no[n] = group_no
                 continue
-            # 其他情况
-            new_res_dict.append(res_list[group_no])
-            added_nos.append(group_no+1)
+            # 其他情况,须是独立的题号
+            if not "#" + str(group_no + 1) + "-" in "#" + "#".join(ans_groups.keys()):
+                new_res_dict.append(res_list[group_no])
+                added_nos.append(group_no+1)
             # continue
         else:
             # 遇到带"com_stem"的试题
@@ -103,8 +104,21 @@ def regroup(res_list, item_groups, ans_groups):
                         added_nos.append(int(end))
                     else:  # end值出错
                         if str(group_no+1) + "-" in "#".join(ans_groups.keys()):
-                            end = re.search("[^#]{}-(\d+)[$#]".format(group_no), "#".join(ans_groups.keys())).group(1)
-                            one_group["slave"] = res_list[group_no: int(end) + 1]
+                            # 修改20240621
+                            # end = re.search("[^#]{}-(\d+)[$#]".format(group_no), "#".join(ans_groups.keys())).group(1)
+                            # one_group["slave"] = res_list[group_no: int(end) + 1]
+                            end_info1 = re.search(r"#{}-(\d+)($|#)".format(group_no+1), "#".join(ans_groups.keys()))
+                            end_info2 = re.search(r"[^#]{}-(\d+)($|#)".format(group_no), "#".join(ans_groups.keys()))
+                            if end_info1:
+                                end = end_info1.group(1)
+                                one_group["slave"] = res_list[group_no: int(end)]
+                                added_nos.append(int(end))
+                            elif end_info2:  #????
+                                end = end_info2.group(1)
+                                one_group["slave"] = res_list[group_no: int(end) + 1]
+                                added_nos.append(int(end)+1)
+                            else:
+                                one_group["slave"] = []
                         else:
                             endp = [m for m, j in enumerate(res_list[added_nos[-1]:])
                                     if j["type"] != res_list[added_nos[-1]]["type"]]

+ 4 - 4
structure/structure_main.py

@@ -221,8 +221,8 @@ if __name__ == '__main__':
     # 627b64b0814132f0d7b12589    627b622981b582c0470d020e
     # 6294326cf84c0e279ac6484e.html   62903acaf84c0e279ac647fb
     path2 = r"C:\Users\Administrator\Desktop\666a67fec3c4da9e7009b531.html"
-    path2 = r"C:\Users\Administrator\Desktop\6667f5a8c3c4da9e7009b44b.html"
-    # path2 = r"F:\zwj\Text_Structure\accept_files\62aae86a765759d85567a475.html"
+    path2 = r"C:\Users\Administrator\Desktop\66753958c3c4da9e7009b7ae.html"
+    path2 = r"F:\zwj\Text_Structure\accept_files\66799166c3c4da9e7009b84f_2.html"
     html = open(path2, "r", encoding="utf-8").read()
     # html = json.loads(html)  621845626ca622396925f55c
     html2 = """
@@ -239,14 +239,14 @@ if __name__ == '__main__':
 11. He has worked for nearly 20 years, so he is senior ___________ most of his workmates.
 12. Although he is three years junior ___________ me, he has more work experience.
     """
-    res1 = StructureExporter(html, "202406131725", "语文", 1).export()
+    res1 = StructureExporter(html, "202406251733", "语文", 0).export()
     # new_fpath = os.path.join(r"G:\zwj\WL\Text_Structure\fail_files", "res_政治.json")
     # re_f = open(new_fpath, 'w', encoding='utf-8')
     # json.dump(res1[0]["items"], re_f, ensure_ascii=False)
     # for i in res1[0]["items"]:
     #     re_f.write(str(i))
     pprint(res1[0]["items"])
-    pprint(res1[0]["html"])
+    # pprint(res1[0]["html"])
     print('题目数量:', len(res1[0]["items"]))
 
     # new_fpath = r"F:\zwj\Text_Structure\new_tiku_structure_2021\res_folder\10-28.json"

+ 2 - 2
utils/washutil.py

@@ -689,8 +689,8 @@ def wash_after(res_dict, item_groups, ans_groups, subject):
     # -----------------------------------------------------------------------
     # pprint(res_dict)
     if item_groups and item_groups["is_groups"]:
-        print("item_groups:", item_groups)
-        print("ans_groups:", ans_groups)
+        # print("item_groups:", item_groups)
+        # print("ans_groups:", ans_groups.keys())
         res_dict = stems_to_groups.regroup(res_dict, item_groups, ans_groups)
     else:
         for one_res in res_dict: