hace 4 meses · e3c541d90e
--- a/structure/option.py
+++ b/structure/option.py
@@ -44,15 +44,15 @@ def option2block(option_con, item_no_type):
 
				             while re.search(r"(\n\s*<img\s*src=.+?)([A-H][.．、､])(.+?)", con.replace(" ", "")):
			
 
				                 con = re.sub(r"(\n\s*<img\s*src=.+?)(?<!【)([A-H]\s*[.．、､：:])(.+?)", r"\1" + "\n" + r"【【\2】】\3", con)
			
 
				             while re.search(r"(\n【【[A-H][.．、､]】】.+?)(?<!【)([A-Hc][.．、､])\n+(.+?)(?<!【)([A-H][.．、､])(.+?)",
			
 
				-                            con.replace(" ", ""), re.S):
			
 
				+                            con.replace(" ", ""), flags=re.S):
			
 
				                 con = re.sub(r"(\n\s*【【[A-H]\s*[.．、､]】】.+?)(?<!【)([A-H]\s*[.．、､：:])\s*\n+(.+?)"
			
 
				                              r"(?<!【)([A-H]\s*[.．、､：:])(.+?)", r"\1【【\2】】\3【【\4】】\5", con, flags=re.S)
			
 
				-            while re.search(r"(\n【【[A-H][.．、､]】】.+?)(?<!【)([A-H][.．、､])\n+(.+?)", con.replace(" ", ""), re.S):
			
 
				+            while re.search(r"(\n【【[A-H][.．、､]】】.+?)(?<!【)([A-H][.．、､])\n+(.+?)", con.replace(" ", ""), flags=re.S):
			
 
				                 con = re.sub(r"(\n\s*【【[A-H]\s*[.．、､]】】.+?)(?<!【)([A-H]\s*[.．、､：:])\s*\n+(.+?)",
			
 
				                              r"\1【【\2】】\3", con, flags=re.S)
			
 
				-            while re.search(r"(\n【【[A-H][.．、､]】】.+?)(?<!【)([A-H][.．、､])(.+?)", con.replace(" ", "")):
			
 
				-                con = re.sub(r"(\n\s*【【[A-H]\s*[.．、､]】】.+?)(?<!【)([A-H]\s*[.．、､：:])(.+?)", r"\1【【\2】】\3", con)
			
 
				-            while re.search(r"(\n【【[A-H][.．、､]】】[^【]+?/>\s+)(?<!【)([B-H][.．、､])(.+?)", con.replace(" ", ""), re.S):
			
 
				+            while re.search(r"(\n【【[A-H][.．、､]】】.+?)(?<!【)([A-H][.．、､])(.+?)", con.replace(" ", ""), flags=re.S):
			
 
				+                con = re.sub(r"(\n\s*【【[A-H]\s*[.．、､]】】.+?)(?<!【)([A-H]\s*[.．、､：:])(.+?)", r"\1【【\2】】\3", con, flags=re.S)
			
 
				+            while re.search(r"(\n【【[A-H][.．、､]】】[^【]+?/>\s+)(?<!【)([B-H][.．、､])(.+?)", con.replace(" ", ""), flags=re.S):
			
 
				                 con = re.sub(r"(\n\s*【【[A-H]\s*[.．、､]】】[^【]+?/>\s+)(?<!【)([B-H]\s*[.．、､：:])\s*(.+?)",
			
 
				                              r"\1【【\2】】\3", con, flags=re.S)  # 选项子母前面是图片 9/8
			
 
				     if item_no_type == 2:
			
@@ -100,6 +100,7 @@ def option_structure(one_item, con, ans, item_no_type, is_danti=0):
 
				         return one_item
			
 
				 
			
 
				     ans = re.sub("[;；.]+", "", ans)
			
 
				+    ans = re.sub("<[a-z]+ (style|rowspan|colspan|class)=[^<>]*?\">|</[a-z]+>", "", ans)
			
 
				     ans2 = []
			
 
				     for a in ans.split("#"):
			
 
				         if 0 < len(a.replace(" ", "")) < 8:
			
--- a/structure/stems_structure.py
+++ b/structure/stems_structure.py
@@ -18,7 +18,7 @@ def stems_structure_byno(stem_con, subject="", is_danti=0):
 
				             while stem_con and ((re.search(r"[\u4e00-\u9fa5]", stem_con[0]) is None) or
			
 
				                                 (((subject != "语文" and re.search(r"[一二三四五]\s*[、.．､]\s*[^必考基础综合中等]{2,4}题", stem_con[0]) is None)
			
 
				                                  or (subject == "语文" and
			
 
				-                                     re.search(r"[一二三四五]\s*[、.．､]\s*.{,4}(运用|阅读|写作|选择|单选|默写|语言表达|作文|综合|诗歌鉴赏)", stem_con[0]) is None))
			
 
				+                                     re.search(r"[一二三四五]\s*[、.．､]\s*.{,4}(运用|阅读|写作|选择|单选|默写|语言表达|作文|综合|诗歌鉴赏)|^([\[【]题文[\]】]|阅读)", stem_con[0]) is None))
			
 
				                                  and re.search(r"(^|\n)\s*[1-9][0-9]?\s*[.．、､]((?!(答题卡|涂黑|本卷|2B铅笔|签字笔|密封线)).)*?$",
			
 
				                                                stem_con[0]) is None)):
			
 
				                 head_cons.append(stem_con[0])
			
@@ -69,8 +69,9 @@ def stems_structure_byno(stem_con, subject="", is_danti=0):
 
				         # 获取正确题号的位置，进行切分
			
 
				         new_item_no, items_no_idx = get_right_no(item_no_info)
			
 
				         one_item_split = [stem_str[i:j] for i, j in zip(items_no_idx, items_no_idx[1:] + [None])]
			
 
				-        if is_danti and items_no_idx and stem_str[:items_no_idx[0]]:
			
 
				-            head_cons = stem_str[:items_no_idx[0]]
			
 
				+        if items_no_idx and stem_str[:items_no_idx[0]]:
			
 
				+            if is_danti or "【题文】" in stem_str[:items_no_idx[0]]:
			
 
				+                head_cons = stem_str[:items_no_idx[0]]
			
 
				         dd = {}
			
 
				         for n, one_item in enumerate(one_item_split):
			
 
				             if subject in ["地理", "语文"]:
			
--- a/structure/stems_to_groups.py
+++ b/structure/stems_to_groups.py
@@ -68,9 +68,10 @@ def regroup(res_list, item_groups, ans_groups):
 
				                 added_nos.append(group_no+1)
			
 
				                 fei_no[n] = group_no
			
 
				                 continue
			
 
				-            # 其他情况
			
 
				-            new_res_dict.append(res_list[group_no])
			
 
				-            added_nos.append(group_no+1)
			
 
				+            # 其他情况,须是独立的题号
			
 
				+            if not "#" + str(group_no + 1) + "-" in "#" + "#".join(ans_groups.keys()):
			
 
				+                new_res_dict.append(res_list[group_no])
			
 
				+                added_nos.append(group_no+1)
			
 
				             # continue
			
 
				         else:
			
 
				             # 遇到带"com_stem"的试题
			
@@ -103,8 +104,21 @@ def regroup(res_list, item_groups, ans_groups):
 
				                         added_nos.append(int(end))
			
 
				                     else:  # end值出错
			
 
				                         if str(group_no+1) + "-" in "#".join(ans_groups.keys()):
			
 
				-                            end = re.search("[^#]{}-(\d+)[$#]".format(group_no), "#".join(ans_groups.keys())).group(1)
			
 
				-                            one_group["slave"] = res_list[group_no: int(end) + 1]
			
 
				+                            # 修改20240621
			
 
				+                            # end = re.search("[^#]{}-(\d+)[$#]".format(group_no), "#".join(ans_groups.keys())).group(1)
			
 
				+                            # one_group["slave"] = res_list[group_no: int(end) + 1]
			
 
				+                            end_info1 = re.search(r"#{}-(\d+)($|#)".format(group_no+1), "#".join(ans_groups.keys()))
			
 
				+                            end_info2 = re.search(r"[^#]{}-(\d+)($|#)".format(group_no), "#".join(ans_groups.keys()))
			
 
				+                            if end_info1:
			
 
				+                                end = end_info1.group(1)
			
 
				+                                one_group["slave"] = res_list[group_no: int(end)]
			
 
				+                                added_nos.append(int(end))
			
 
				+                            elif end_info2:  #????
			
 
				+                                end = end_info2.group(1)
			
 
				+                                one_group["slave"] = res_list[group_no: int(end) + 1]
			
 
				+                                added_nos.append(int(end)+1)
			
 
				+                            else:
			
 
				+                                one_group["slave"] = []
			
 
				                         else:
			
 
				                             endp = [m for m, j in enumerate(res_list[added_nos[-1]:])
			
 
				                                     if j["type"] != res_list[added_nos[-1]]["type"]]
			
--- a/structure/structure_main.py
+++ b/structure/structure_main.py
@@ -221,8 +221,8 @@ if __name__ == '__main__':
 
				     # 627b64b0814132f0d7b12589    627b622981b582c0470d020e
			
 
				     # 6294326cf84c0e279ac6484e.html   62903acaf84c0e279ac647fb
			
 
				     path2 = r"C:\Users\Administrator\Desktop\666a67fec3c4da9e7009b531.html"
			
 
				-    path2 = r"C:\Users\Administrator\Desktop\6667f5a8c3c4da9e7009b44b.html"
			
 
				-    # path2 = r"F:\zwj\Text_Structure\accept_files\62aae86a765759d85567a475.html"
			
 
				+    path2 = r"C:\Users\Administrator\Desktop\66753958c3c4da9e7009b7ae.html"
			
 
				+    path2 = r"F:\zwj\Text_Structure\accept_files\66799166c3c4da9e7009b84f_2.html"
			
 
				     html = open(path2, "r", encoding="utf-8").read()
			
 
				     # html = json.loads(html)  621845626ca622396925f55c
			
 
				     html2 = """
			
@@ -239,14 +239,14 @@ if __name__ == '__main__':
 
				 11. He has worked for nearly 20 years, so he is senior ___________ most of his workmates.
			
 
				 12. Although he is three years junior ___________ me, he has more work experience.
			
 
				     """
			
 
				-    res1 = StructureExporter(html, "202406131725", "语文", 1).export()
			
 
				+    res1 = StructureExporter(html, "202406251733", "语文", 0).export()
			
 
				     # new_fpath = os.path.join(r"G:\zwj\WL\Text_Structure\fail_files", "res_政治.json")
			
 
				     # re_f = open(new_fpath, 'w', encoding='utf-8')
			
 
				     # json.dump(res1[0]["items"], re_f, ensure_ascii=False)
			
 
				     # for i in res1[0]["items"]:
			
 
				     #     re_f.write(str(i))
			
 
				     pprint(res1[0]["items"])
			
 
				-    pprint(res1[0]["html"])
			
 
				+    # pprint(res1[0]["html"])
			
 
				     print('题目数量：', len(res1[0]["items"]))
			
 
				 
			
 
				     # new_fpath = r"F:\zwj\Text_Structure\new_tiku_structure_2021\res_folder\10-28.json"
			
--- a/utils/washutil.py
+++ b/utils/washutil.py
@@ -689,8 +689,8 @@ def wash_after(res_dict, item_groups, ans_groups, subject):
 
				     # -----------------------------------------------------------------------
			
 
				     # pprint(res_dict)
			
 
				     if item_groups and item_groups["is_groups"]:
			
 
				-        print("item_groups:", item_groups)
			
 
				-        print("ans_groups:", ans_groups)
			
 
				+        # print("item_groups:", item_groups)
			
 
				+        # print("ans_groups:", ans_groups.keys())
			
 
				         res_dict = stems_to_groups.regroup(res_dict, item_groups, ans_groups)
			
 
				     else:
			
 
				         for one_res in res_dict: