", mix_con, flags=re.S)] # add 2024-9-12 com_stem_idx.extend(com_stem_idx_2) com_stem_idx.sort() # print("common_stem_info3:", common_stem_info3) if subject == "地理" and common_stem_info3: item_groups["groups_data"][com_id[-1]] = "{}-{}".format(com_id[-1] + 1, len(bef_res)-1) elif com_stem_idx and subject == "地理": com_stem = mix_con[com_stem_idx[-1]:] bef_con = mix_con[:com_stem_idx[-1]] # print("com_stem:", com_stem) # print("bef_con:", bef_con) else: if not pc_idx: com_stem = mix_con if len(com_stem) < 25 and re.search("任选一题", com_stem): com_stem = "" if not com_stem: item_groups["groups_data"][len(bef_res)] = "fei" else: if subject == "语文": mix_con_list = re.split(r"(\n\s*阅读.*?按要求[作回]答.{,2}\n" r"|\n\s*阅读下[面列].*?[完成回答]+各题.{,2}\n" r"|[((][一二三四五][))]\s*.{,6}\n|[((][一二三四五][))]\s*.{,6}$)", # r"|\n+\s*[((][一二三四五][))])", # 与上一条都满足时,匹配短的 re.sub("[((][本题共\d小\s]*?\d{1,2}分\s*[))].?", "", mix_con)) if len(mix_con_list) == 1: mix_con_list = re.split(r"([((][一二三四五][))].*?[完成回答]+下?[面列]?的?.*?[\d小]\s*?题.{,2})\n", re.sub("[((]\s*\d{1,2}\s*分\s*[))].?", "", mix_con)) if len(mix_con_list) == 1: mix_con_list = re.split(r"\n+\s*[((][一二三四五][))]", re.sub("[((][本题共\d小\s]*?\d{1,2}分\s*[))].?", "", mix_con)) if len(mix_con_list) >= 3: com_stem = "\n".join(mix_con_list[-2:]) # 有([一二三四五]),但不一定有公共题文 if re.search("[((]\s*[一二三四五]\s*[))](语言文字运用|微?写作|选择|单选|语言表达|作文" r"|.{,4}(文本阅读|诗歌阅读|文言文阅读|名著阅读|默写)题?).{,10}$", com_stem.strip()): com_stem = "" bef_con = "".join(mix_con_list[:-2]) if com_id[-1] not in item_groups["groups_data"] or not item_groups["groups_data"][com_id[-1]]: item_groups["groups_data"][com_id[-1]] = "{}-{}".format(com_id[-1]+1, len(bef_res)) common_stem_info4 = re.search("(完成|回答)下?[面列]?的?第?(\d{1,2})[-到至第~~-]+?(\d{1,2})小?题", mix_con_list[-2].replace(" ", "").replace("\n", "")) if common_stem_info4: item_groups["groups_data"][len(bef_res)] = "{}-{}".format( common_stem_info1.group(2), common_stem_info1.group(3)) elif common_stem_info2 and re.search("\n\s*[\[【]\s*(答案|解析)", mix_con[st_2:]) is None: item_groups["groups_data"][len(bef_res)] = "" common_stem_list = re.split("([完成回答]下?[面列]?的?.*?[\d小各]题.{,2}\n" "|\n\s*阅读.*?按要求[作回]答.{,2}\n)", mix_con) if len(common_stem_list) > 3: com_stem = common_stem_list[-1] mix_con = "".join(common_stem_list[:-1]) else: com_stem = mix_con[st_2:] mix_con = mix_con[:st_2] # 没切全,还需进一步 # print("mix_con:",mix_con) paras = mix_con.split("\n") paras = [para for para in paras if para.strip()] # 去掉空行 if len(paras) > 1: com_stem = paras[-1] + com_stem bef_con = "\n".join(paras[:-1]) # print(bef_con) else: bef_con = mix_con else: mix_con_list = re.split(r"(详解】|解析】|答案】)", mix_con) if len(mix_con_list) > 2: # 含解析: bef_con = "".join(mix_con_list[:-1]) mix_con = mix_con_list[-1] else: mix_con = mix_con_list[0] paras = mix_con.split("\n") # 将混淆的部分换行拆分 paras = [para for para in paras if para.strip()] # 去掉空行 if len(paras) > 1: if len(paras) == 2 or len(paras[-1]) >= 20: # 2段or段长 com_stem = paras[-1] if re.search("^\s*[((]\s*\d\s*[))]", com_stem): com_stem = "" else: bef_con = bef_con + "\n".join(paras[:-1]) else: com_stem = "\n".join(paras[-2:]) bef_con = bef_con + "\n".join(paras[:-2]) else: bef_con = bef_con + "\n".join(paras) # 此时 com_stem 为空 if subject == "地理": if com_stem: one_stem["com_stem"] = com_stem if bef_res and pc_idx: # 不包括第一题 bef_res[-1]["stem"] = bef_con else: if re.sub("?tbody>|?table>|\n", "", com_stem): one_stem["com_stem"] = com_stem item_groups["is_groups"] = 1 if len(bef_res) not in item_groups["groups_data"]: item_groups["groups_data"][len(bef_res)] = "" if bef_res and bef_con: bef_res[-1]["stem"] = bef_con # elif not pc_idx: # item_groups["groups_data"][len(bef_res)] = "" return bef_res, one_stem, item_groups