#!/usr/bin/env/python # -*- coding:utf-8 -*- import re from utils.item_type_line import get_item_head_info, get_item_head_info_cn from utils.topic_no import judge_item_no_type, get_right_no, pre_get_item_no def stems_structure_byno(stem_con, subject="", is_danti=0): """ 按题号进行切分; 针对无解析的试卷中所有题目的拆分; :return:{"stem": , "item_id": , "errmsgs": [],"type":,} """ head_cons = [] if not is_danti: try: while stem_con and ((re.search(r"[\u4e00-\u9fa5]", stem_con[0]) is None) or (((subject != "语文" and re.search(r"[一二三四五]\s*[、..、]\s*[^必考基础综合中等]{2,4}题", stem_con[0]) is None) or (subject == "语文" and re.search(r"[一二三四五]\s*[、..、]\s*.{,4}(运用|阅读|写作|选择|单选|默写|语言表达|作文|综合|诗歌鉴赏)|^([\[【]题文[\]】]|阅读)", stem_con[0]) is None)) and re.search(r"(^|\n)\s*[1-9][0-9]?\s*[..、、]((?!(答题卡|涂黑|本卷|2B铅笔|签字笔|密封线)).)*?$", stem_con[0]) is None)): head_cons.append(stem_con[0]) del stem_con[0] except: return "本份试卷开头格式有问题,请按试卷格式来!" # else: head_cons = "\n".join(head_cons).strip() stem_str = "\n" + "\n".join(stem_con) # 题号格式有条件清洗 # def sub1(ss): # if int(ss.group(5)) - int(ss.group(2)) in [1, 2]: # return ss.group(1)+"\n"+ss.group(5)+"、"+ss.group(6) # else: # return ss.group(0) # stem_str = re.sub(r"(\n\s*([1-9][0-9]|[1-9])\s*[..、、]((?!\n\s*([1-9][0-9]|[1-9])\s*[..、、]).)+?)\n+\s*([1-9][0-9]|[1-9])\s*(?![..、、])([^\s\d]+)", sub1, stem_str, flags=re.S) # 也可以用下面方法,但比较啰嗦 pattern_3 = re.compile(r"(\n\s*([1-9][0-9]|[1-9])\s*[..、、]((?!\n\s*([1-9][0-9]|[1-9])\s*[..、、]).)+?)" r"\n+\s*([1-9][0-9]|[1-9])\s*(?![..、、])([^\s\d]+)", re.S) while re.search(pattern_3, stem_str): wrong_id_info = re.search(pattern_3, stem_str) stem_str = stem_str.replace(wrong_id_info.group(0), wrong_id_info.group(1)+"\n"+wrong_id_info.group(5)+"、"+wrong_id_info.group(6)) # print(stem_str) # 1、获取题型行信息、按题型行切分 if subject == "语文": con11, title_info_dict, choice_class = get_item_head_info_cn(stem_str) else: con11, title_info_dict, choice_class = get_item_head_info(stem_str) all_type = title_info_dict["all_type"] title_type_num = title_info_dict["title_type_num"] select_type_id = title_info_dict["select_type_id"] each_item_score, each_item_score2 = title_info_dict["each_item_score"], title_info_dict["each_item_score2"] # ------------------------------------------------------- res = [] item_type_classify = {} # 记录每类题型中含有的题目个数,含合并的情况 item_type_num = [] # 题型不合并 pic_no = {} # 记录每个题的图 new_item_no = [] # 将纠错后的题号再记录一份 item_groups = {"is_groups": 0, "groups_data": {}} # 公共题干的位置,从哪个题开始,比如地理选择题 if not all_type: print("不存在大题题型行或题型行格式或名称有问题") # 初步获取题号,题号类型 stem_str, item_no_info, item_no_type = judge_item_no_type(stem_str) # 获取正确题号的位置,进行切分 new_item_no, items_no_idx = get_right_no(item_no_info) one_item_split = [stem_str[i:j] for i, j in zip(items_no_idx, items_no_idx[1:] + [None])] if items_no_idx and stem_str[:items_no_idx[0]]: if is_danti or "【题文】" in stem_str[:items_no_idx[0]]: head_cons = stem_str[:items_no_idx[0]] dd = {} for n, one_item in enumerate(one_item_split): if subject in ["地理", "语文"]: res, com_stem, item_groups = split_with_comstem(head_cons, one_item_split, res, n, item_groups, subject) if com_stem: # print("com_stem:", com_stem) dd.update(com_stem) elif is_danti and not n: # 2022/2/24 dd.update({"com_stem": head_cons}) item_groups["is_groups"] = 1 item_groups["groups_data"][0] = "" # if not n: # mix_con = "\n".join(head_cons).strip() # else: # mix_con = one_item_split[n - 1].strip() # pattern_3 = re.search("(完成|回答)下?[面列]?的?.*?[\d小]题.{,2}$", mix_con) # if pattern_3: # com_stem, bef_con = "", "" # if re.search("【题文】", mix_con): # bef_con, com_stem = mix_con.split("【题文】", maxsplit=1) # else: # com_stem_idx = [i.end() for i in re.finditer(r"(所以|故而?).{,3}(选择?[A-Fa-f]选?项?" # r"|选项[A-Fa-f]正确).{,2}\n" # r"|\n[A-E]\s*[..、、::].*?\n", mix_con)] # if com_stem_idx: # com_stem = mix_con[com_stem_idx[-1]+1:] # bef_con = mix_con[:com_stem_idx[-1]+1] # else: # if not n: # com_stem = mix_con # else: # mix_con_list = mix_con.split(r"(详解】|解析】|答案】)") # if len(mix_con_list) > 2: # bef_con = "".join(bef_con[:-1]) # mix_con = bef_con[-1] # else: # mix_con = bef_con[0] # paras = bef_con.split("\n") # paras = [para for para in paras if para.strip()] # if len(paras)>1: # if len(paras) == 2 or len(paras[-1]) >= 20: # com_stem = paras[-1] # bef_con = bef_con + "\n".join(paras[:-1]) # else: # com_stem = paras[-2:] # bef_con = bef_con + "\n".join(paras[:-2]) # else: # bef_con = bef_con + "\n".join(paras) # dd["com_stem"] = com_stem # res[-1]["stem"] = bef_con dd["stem"] = one_item dd["item_id"] = new_item_no[n] # 题目本身的题号 dd["type"] = "" # 先题型不备注,根据答案再看 dd["errmsgs"] = [] # dd["score"] = 0 # if select_type_id and dd["item_id"] in select_type_id: # dd['is_optional'] = 'true' res.append(dd) dd = {} # 先不做拆图处理了 else: # print(all_type, len(con11)) if len(all_type) == len(con11)-1: # 第一部分的题型行掉了 all_type.insert(0, "") each_item_score.insert(0, 0) # 按题型行拿的分数也会掉,先补上默认的0 elif len(all_type) != len(con11): print("第二种试卷格式:存在题型行没有换行") # 可能造成题目和题型行在同一行 # return "存在题型行末尾没有换行或题型行中题型不明确" # else: # # print(all_type) # if "非选择题" in all_type: # error_info1 = "第" + str(all_type.index("非选择题")+1) + "大题的题型不明确" # if any([True for one_type in all_type if re.search("必考基础综合中等面列下各", one_type)]): # error_info1 = "存在题型行中题型不明确" # --------------------------------------------------------------------- # 思路:>>>>先纠正题号,再拆分题目;等切分好后再纠正(删减添加操作)比较费时 # 按题号切分,可再加些细节!!!! 1>>题号不要求连续 # 初步判断题号类型 stem_str, item_no_info, item_no_type = judge_item_no_type(stem_str) # >>>>切分题目 for num, one_type in enumerate(con11): # 初步获取题号 item_no_info = pre_get_item_no("\n"+one_type, item_no_type) # 获取正确题号的位置,进行切分 if res: items_no_temp, items_no_idx = get_right_no(item_no_info, have_type=1, last_id=res[-1]["item_id"]) else: items_no_temp, items_no_idx = get_right_no(item_no_info) if not items_no_temp: res.append({ "stem": one_type, "item_id": res[-1]["item_id"]+1 if res else 1, "type": all_type[num], "errmsgs": [], }) continue is_from_0 = 1 if items_no_temp and items_no_idx[0] != 0 and ((not num and items_no_temp[0] > 1) or (num and items_no_temp[0] > new_item_no[-1]+1)): # 以防出现题号漏了的情况 items_no_idx.insert(0, 0) is_from_0 = 0 one_item_split = [("\n" + one_type)[i:j] for i, j in zip(items_no_idx, items_no_idx[1:] + [None])] # 针对多个题共用一段材料的情况2021-11-3 head_item = one_type[0:items_no_idx[0]] if not head_item.strip() and head_cons.strip() and not num: # 2022.2.24 head_item = head_cons # if head_item and subject == "地理": # res, com_stem, item_group = split_with_comstem(head_item, [], res, 0) # item_groups.update(item_group) # if com_stem: # dd["com_stem"] = com_stem # common_stem_may = re.search("(完成|回答)下?[面列]?的?第?(\d{1,2})[-到至第~~-]+?(\d{1,2})题", head_item.replace(" ", "")) # if common_stem_may: # if not item_groups: # item_groups["pos"].append(1) # else: # item_groups["pos"].append(len(res)+1) # item_groups["{}-{}".format(common_stem_may.group(2), common_stem_may.group(3))] = head_item # ------------------每个大题的第一题前面的图可能有漏的情况------------------------ may_oimt_pic = [] if not is_from_0: may_oimt_pic_info = re.search("\n((\s*).)+?/>)*?)\s*\n?$", one_item_split[0]) if may_oimt_pic_info: if len(one_item_split) > 1 and re.search("如[上下左右]?图", one_item_split[1]): may_oimt_pic.extend(re.findall("", may_oimt_pic_info.group(1))) # print("may_oimt_pic:", may_oimt_pic) # ------------------------------------------------------------------ if len(all_type) == len(con11): if not is_from_0: if all_type[num] and all_type[num].replace("题", "") not in ['选择', '单选', '多选', '不定选择']: one_item_split = one_item_split[1:] else: # 针对选择题第1题或前面几题题号漏了的情况 one_item_split1 = one_item_split[1:] # ------针对分错的细节继续拆分,如上一题选项行与下一题题干没有换行----------------- new_one_item_split = [] pattern_1 = re.compile( r"([CDE]\s*[..、、].+?)(? 0.0: # dd["score"] = title_info_dict["total_score"][num] res.append(dd) # 多图在一起的情况进行拆分 # if len(re.findall(r"第?\(?([1-9]|[1-9][0-9])\)?\s*题图", one_item)) > 1 and \ # re.search(r"", pic_info.group(1)) pic_w = re.findall("([1-9]|[1-9][0-9])[))]?\s*(?=题)", pic_info.group(2)) if len(pic_list) >= len(pic_w): pic_no = {int(p): pic_list[len(pic_list) - len(pic_w) + k] for k, p in enumerate(pic_w)} dd["stem"] = dd["stem"].replace(pic_info.group(2), "") for k, pic in enumerate(pic_list[::-1]): if k < len(pic_w): dd["stem"] = dd["stem"].replace(pic, "") # ----------------------------------------------------- dd = {} if pic_no: for i in list(pic_no.keys()): res[i-1]["stem"] = res[i-1]["stem"].strip() + "\n" + pic_no[i] + "\n" + "第" + str(i) + "题图" # ---------------------------------------------------------------------- # 可能出现选择题类的选做题 # --------最后判断一下题量是否正确----------------------------------------- # 针对拆分后题量特别多的情况,将拆分后题量与已知题量一样的题目保留 new_res = [] right_type = [] # 记录与已知题目个数一样的分块{第几个分块,题型} if all_type and sum(list(item_type_classify.values())) > 40: # 很可能存在大片不是题号的题号 title_type_num_all = sum([t[1] for t in title_type_num]) # 题型行给出的题目个数 if title_type_num_all > 0: for idx, type_num in enumerate(item_type_num): if type_num[1] == title_type_num[idx][1] > 0: # 同一题型是否题量一致 right_type.append((idx, type_num[0])) elif title_type_num[idx][1] == 0 and type_num[1] == 1: title_type_num[idx][1] = 1 right_type.append((idx, type_num[0])) if right_type: for rtype in right_type: for idx, item in enumerate(res): r_st = 0 if rtype[0]==0 else sum([t[1] for t in item_type_num[:rtype[0]]]) r_ed = r_st + item_type_num[rtype[0]][1] if item["type"] == rtype[1] and r_st<=idx[\s\n]*?)*?.{,2}$", mix_con, flags=re.S) if subject == "语文": common_stem_info2 = re.search("(完成|回答)下?[面列]?的?.*?[\d小各]题.{,2}\n|\n\s*阅读.*?按要求[作回]答.{,2}\n", mix_con) bef_con = "" one_stem = {} if common_stem_info2: item_groups["is_groups"] = 1 if common_stem_info1: st = common_stem_info1.group(2) end = common_stem_info1.group(3) if not item_groups["groups_data"]: item_groups["groups_data"][0] = "{}-{}".format(st, end) else: item_groups["groups_data"][len(bef_res)] = "{}-{}".format(st, end) elif subject == "地理": item_groups["groups_data"][len(bef_res)] = "" # item_groups["groups_data"][len(bef_res)] = "" # elif common_stem_info1: # item_groups["is_groups"] = 1 if (item_groups["is_groups"] and common_stem_info2) or "【题文】" in mix_con or subject == "语文": com_stem = "" if re.search("【题文】", mix_con): bef_con, com_stem = mix_con.split("【题文】", maxsplit=1) if len(bef_res) not in item_groups["groups_data"]: item_groups["groups_data"][len(bef_res)] = "" if not item_groups["is_groups"]: item_groups["is_groups"] = 1 else: com_id = sorted(item_groups["groups_data"]) common_stem_info3 = re.search("(完成|回答)下?[面列]?的?.*?问题.{,2}\n", one_item_split[pc_idx]) # 带多问的大题 com_stem_idx = [i.end() for i in re.finditer(r"(所以|故而?|答案).{,3}(选择?[A-Fa-f]选?项?.{,2}" r"|选项[A-Fa-f]+正确|[A-Fa-f]+项正确).{,2}($|\n\s*【点睛】\s*\n.+?\n|\n)" r"|(选?项?[A-Fa-f]+项?错误?|[A-D]正确|排除[A-D]" r"|[A-D][A-D、、]*?不符?合题意).{,2}\s*($|\n\s*【点睛】\s*\n.+?\n|\n)" r"|(\n\s*[A-E]\s*[..、、::][^\n]*?\n?)+(\n|$)", mix_con, flags=re.S)] # add 第1条后部分 2022-2-14 # print("common_stem_info3:", common_stem_info3) if subject == "地理" and common_stem_info3: item_groups["groups_data"][com_id[-1]] = "{}-{}".format(com_id[-1] + 1, len(bef_res)-1) elif com_stem_idx and subject == "地理": com_stem = mix_con[com_stem_idx[-1]:] bef_con = mix_con[:com_stem_idx[-1]] # print("com_stem:", com_stem) # print("bef_con:", bef_con) else: if not pc_idx: com_stem = mix_con if len(com_stem) < 25 and re.search("任选一题", com_stem): com_stem = "" if not com_stem: item_groups["groups_data"][len(bef_res)] = "fei" else: if subject == "语文": mix_con_list = re.split(r"(\n\s*阅读.*?按要求[作回]答.{,2}\n" r"|\n\s*阅读下[面列].*?[完成回答]+各题.{,2}\n" r"|[((][一二三四五][))]\s*.{,6}\n|[((][一二三四五][))]\s*.{,6}$)", # r"|\n+\s*[((][一二三四五][))])", # 与上一条都满足时,匹配短的 re.sub("[((][本题共\d小\s]*?\d{1,2}分\s*[))].?", "", mix_con)) if len(mix_con_list) == 1: mix_con_list = re.split(r"([((][一二三四五][))].*?[完成回答]+下?[面列]?的?.*?[\d小]\s*?题.{,2})\n", re.sub("[((]\s*\d{1,2}\s*分\s*[))].?", "", mix_con)) if len(mix_con_list) == 1: mix_con_list = re.split(r"\n+\s*[((][一二三四五][))]", re.sub("[((][本题共\d小\s]*?\d{1,2}分\s*[))].?", "", mix_con)) if len(mix_con_list) >= 3: com_stem = "\n".join(mix_con_list[-2:]) # 有([一二三四五]),但不一定有公共题文 if re.search("[((]\s*[一二三四五]\s*[))](语言文字运用|微?写作|选择|单选|语言表达|作文" r"|.{,4}(文本阅读|诗歌阅读|文言文阅读|名著阅读|默写)题?).{,10}$", com_stem.strip()): com_stem = "" bef_con = "".join(mix_con_list[:-2]) if com_id[-1] not in item_groups["groups_data"] or not item_groups["groups_data"][com_id[-1]]: item_groups["groups_data"][com_id[-1]] = "{}-{}".format(com_id[-1]+1, len(bef_res)) common_stem_info4 = re.search("(完成|回答)下?[面列]?的?第?(\d{1,2})[-到至第~~-]+?(\d{1,2})小?题", mix_con_list[-2].replace(" ", "").replace("\n", "")) if common_stem_info4: item_groups["groups_data"][len(bef_res)] = "{}-{}".format( common_stem_info1.group(2), common_stem_info1.group(3)) elif common_stem_info2: item_groups["groups_data"][len(bef_res)] = "" common_stem_list = re.split("([完成回答]下?[面列]?的?.*?[\d小各]题.{,2}\n" "|\n\s*阅读.*?按要求[作回]答.{,2}\n)", mix_con) if len(common_stem_list) > 3: com_stem = common_stem_list[-1] mix_con = "".join(common_stem_list[:-1]) else: com_stem = mix_con[common_stem_info2.end():] mix_con = mix_con[:common_stem_info2.end()] # 没切全,还需进一步 # print("mix_con:",mix_con) paras = mix_con.split("\n") paras = [para for para in paras if para.strip()] # 去掉空行 if len(paras) > 1: com_stem = paras[-1] + com_stem bef_con = "\n".join(paras[:-1]) # print(bef_con) else: bef_con = mix_con else: mix_con_list = re.split(r"(详解】|解析】|答案】)", mix_con) if len(mix_con_list) > 2: # 含解析: bef_con = "".join(mix_con_list[:-1]) mix_con = mix_con_list[-1] else: mix_con = mix_con_list[0] paras = mix_con.split("\n") # 将混淆的部分换行拆分 paras = [para for para in paras if para.strip()] # 去掉空行 if len(paras) > 1: if len(paras) == 2 or len(paras[-1]) >= 20: # 2段or段长 com_stem = paras[-1] if re.search("^\s*[((]\s*\d\s*[))]", com_stem): com_stem = "" else: bef_con = bef_con + "\n".join(paras[:-1]) else: com_stem = "\n".join(paras[-2:]) bef_con = bef_con + "\n".join(paras[:-2]) else: bef_con = bef_con + "\n".join(paras) # 此时 com_stem 为空 if subject == "地理": if com_stem: one_stem["com_stem"] = com_stem if bef_res and pc_idx: # 不包括第一题 bef_res[-1]["stem"] = bef_con else: if re.sub("||\n", "", com_stem): one_stem["com_stem"] = com_stem item_groups["is_groups"] = 1 if len(bef_res) not in item_groups["groups_data"]: item_groups["groups_data"][len(bef_res)] = "" if bef_res and bef_con: bef_res[-1]["stem"] = bef_con # elif not pc_idx: # item_groups["groups_data"][len(bef_res)] = "" return bef_res, one_stem, item_groups