#!/usr/bin/env python
# -*- coding:utf-8 -*-
# paper3_process: third Word exam-paper layout, where the questions and the
#                 answers live in separate sections.
# split2one_item: group all text lines by question type, then split every
#                 group into individual questions.
# split2one_item_by_topicno: same grouping, but the questions inside each
#                 group are split by their question numbers.
"""
Three parsing strategies in total:

1. teacher's edition (answer / analysis keywords are inline with the stems);
2. split by question number;
3. separate the questions from the answers, then split by question number.
"""
# NOTE(review): del_no, anss_structure_with_type and ans_structure_step1 are
# not imported by name in this file; presumably they come from the star import
# below - confirm.
from structure.ans_structure import *
from utils.insert_keywords import get_con
from utils.item_resplit import resplit
from utils.washutil import table_label_cleal
from structure.stems_structure import stems_structure_byno
from utils.item_type_line import get_item_head_info, get_item_head_info_cn
from utils.topic_no import judge_item_no_type, get_right_no
from utils.stem_ans_split import stem_ans_split
import re  # explicit: do not rely on the star import above re-exporting it
from collections import Counter
from pprint import pprint


def items_ans_reform(items_list, ans_list, subject):
    """
    Third Word paper layout: questions and answers are in separate sections.

    :param items_list: text lines of the question section
    :param ans_list: text lines of the answer section, one element per line
    :param subject: subject name; "语文" and "地理" take special paths below
    :return: one of three shapes (NOTE(review): callers must cope with all):
        * whatever ``split_by_keywords`` returned, when the answer section
          itself carries enough usable stems;
        * the error string returned by ``stems_structure_byno``;
        * ``(item_res, item_no_type, rd2_is_fail, item_groups)`` otherwise.
    """
    con1 = [line for line in items_list if line.strip() != ""]  # questions
    anss1 = [line for line in ans_list if line.strip() != ""]  # answers
    # Drop a trailing / leading paper-title line. The emptiness guards avoid
    # an IndexError when a section contains only blank lines.
    if con1 and re.match(".+?省.+?试[卷题]", con1[-1]):
        con1 = con1[:-1]
    if anss1 and re.match(".+?省.+?试[卷题]|.*?答题?[卷卡页]", anss1[0]):
        anss1 = anss1[1:]

    # ----- the answer pages may also contain the stems (but maybe not) -----
    ans_joined = "\n".join(anss1)
    ans_n = re.findall("【答案】", ans_joined)
    if subject not in ["地理", "语文"] and ans_n \
            and len(ans_n) == len(re.findall("【解析】", ans_joined)) > 2:
        # same number (> 2) of answer and analysis keywords
        print("答案页中有相同个数的答案和解析,可以答案中也带题干")
        item_res = split_by_keywords(anss1, subject)
        print("item_res:", item_res)
        if not isinstance(item_res, str):
            # accept only when there are enough items and (almost) none of
            # them has an empty stem
            short_stems = [i["item_id"] for i in item_res[0] if len(i["stem"].strip()) < 5]
            if len(item_res[0]) > 10 and len(short_stems) < 2:
                return item_res

    # ----------------------- parse the questions -----------------------
    print('---------------解析 题目-------------------')
    ress = stems_structure_byno(con1, subject)
    if isinstance(ress, str):
        return ress
    # structured stems (without analysis) plus bookkeeping
    item_res, all_type, item_type_classify, item_no_type, \
        item_type_num, _new_item_no, item_groups = ress
    print("item_groups:", item_groups)

    # drop items whose stem is empty and strip the question number
    kept = []
    for sub_res in item_res:
        if sub_res['stem'].strip():
            sub_res['stem'] = del_no(sub_res['stem'])
            kept.append(sub_res)
    # correct the split result of the stems first
    item_res = resplit(kept)
    print("item_type_classify:", item_type_classify)
    print("item_type_num:", item_type_num)

    print('----------解析 答案---------------')
    # ------------------------ parse the answers ------------------------
    # Cases:
    #   1) answers are grouped by type lines, e.g. "一、选择题 1.xx 2.xx";
    #   2) answers have no type keyword and are ordered by number only;
    #   3) neither answers nor questions have type lines
    #      (all_type / item_type_classify are empty).
    rd1_is_fail = 0
    have_type_line = re.search(r"[一二三四五六七八九十]\s*[、..、]\s*[^必考基础综合中等((\[]{2,5}题",
                               "\n".join(anss1))
    anss1_cy = anss1.copy()  # work on a copy so round 2 sees the raw lines
    if have_type_line and subject != "语文":
        # NOTE(review): in the text this file was recovered from, the first
        # pattern contained literal line breaks where markup appears to have
        # been stripped; the line breaks are reproduced as "\n\n" - confirm
        # against the original pattern.
        option_pat = "\n\n" r"[A-F]" "\n\n" r"|[A-F]|([A-F]\s*){3,}"
        type_pat = (r"[一二三四五六七八九十]\s*[、..、]\s*()?\s*(.{2,5}题"
                    r"|.{,4}(运用|阅读|写作|选择|单选|默写|语言表达|作文|综合|诗歌鉴赏))")
        # skip leading lines until one holds option letters, or holds Chinese
        # text that looks like a type line; stop when the list is exhausted
        while anss1_cy and re.search(option_pat, anss1_cy[0]) is None and \
                (re.search(r"[\u4e00-\u9fa5]", anss1_cy[0]) is None
                 or re.search(type_pat, anss1_cy[0]) is None):
            del anss1_cy[0]
        ans_str = "\n" + "\n".join(anss1_cy)
        # question types named in the answer section
        all_type2 = re.findall(
            r"\n\s*[一二三四五六七八九十]\s*[、..、::]\s*([^必考基础综合中共等::((\[]{2,5}题)|"
            r"\n\s*[、..、::]?\s*(单选题|非?选择题|不定选择题|多选题|填空题|计算题|[解简]答题|实验题|作图题|论述题|探究题)",
            ans_str)
        all_type2 = ["".join(a) for a in all_type2]
        print("答案中的题型:", all_type2)
        try:
            item_res, rd1_is_fail = anss_structure_with_type(item_res, ans_str, all_type, all_type2,
                                                             item_type_num, item_type_classify)
        except Exception:
            # best effort: any failure here just triggers the second round
            rd1_is_fail = 1

    # no type line, or the first round failed
    rd2_is_fail = 0
    if not have_type_line or rd1_is_fail or subject == "语文":
        print('没有题型行或题目和答案的题型个数不一致或第一次解析失败')
        # NOTE(review): the "(\n\n)?" group below also held literal line
        # breaks in the recovered text (markup probably stripped) - confirm.
        type_line_pat = re.compile(
            r"(\n|^)\s*[((]?\s*[一二三四五六七八九十]\s*[))]?\s*[、..、::]?\s*(" "\n\n" r")?"
            r"(\s*(.{2,5}题|.{,4}(运用|阅读|写作|选择|单选|默写|语言表达|作文|综合|诗歌鉴赏)).+?分\s*[.。]?\s*$"
            r"|.*?[((].+?[得共]\d+分.*?[))].*?$"
            r"|\s*(.{2,5}题|.{,4}(运用|阅读|写作|选择|单选|默写|语言表达|作文|综合|诗歌鉴赏))\s*([((].+?[))])?).*?$"
            r"|(\n|^)\s*([^\d]{2,5}题|.{,4}(运用|阅读|写作|选择|单选|默写|语言表达|作文|综合|诗歌鉴赏))(.+?分\s*[))])?\s*$")
        # blank out type-line text in every answer line
        anss1 = [type_line_pat.sub("", line) for line in anss1]
        raw_item_res = item_res
        item_res = ans_structure_step1(anss1, item_type_classify, item_res)  # structure all answers
        # NOTE(review): the flag is raised when the printed form of the result
        # DIFFERS from the input; kept exactly as found - confirm intent.
        if str(raw_item_res) != str(item_res):
            rd2_is_fail = 1

    # every item must expose 'key' and 'parse'
    for one_item in item_res:
        one_item.setdefault('key', "")
        one_item.setdefault('parse', "")
    return item_res, item_no_type, rd2_is_fail, item_groups


def _is_redundant_blank(lines, idx):
    """Return True when the blank line at *idx* touches a keyword or type line.

    NOTE(review): the original inline expression was damaged in the text this
    file was recovered from (everything between a "<" and the next ">" was
    lost). The look-ahead half below is a reconstruction: the lost regex
    alternative (it started with "<") is dropped, and the inspected line is
    assumed to be ``lines[idx + 1]`` because of the ``idx < len(lines) - 1``
    guard. Confirm against the original source.
    """
    if lines[idx].strip() != "":
        return False
    if idx < len(lines) - 1 and re.match(r"【(答案|解析)】|(答案|解析)\s*[::]", lines[idx + 1].strip()):
        return True
    if idx > 0:
        prev = lines[idx - 1].strip()
        if re.match(r"【(答案|解析)】$|(答案|解析)\s*[::]", prev) \
                or re.match(r"[a-z<>/\s]*?[一二三四五六七八九十]\s*[、..、]\s*[^必考基础综合中等]{2,4}题", prev):
            return True
    return False


def split_by_keywords(con_list, subject):
    """
    First paper layout: teacher's edition with answer and analysis keywords.

    Approach: cut by big question type, then by the 【答案】/【解析】 keywords,
    and finally locate each next stem between an analysis and the next answer.

    :param con_list: all text lines of the paper
    :param subject: subject name; "语文" uses its own type-line parser
    :return: ``(items, item_no_type)`` where ``items`` is a list of dicts with
        the keys stem / key / parse / type / errmsgs / item_id (and
        ``is_optional`` for optional questions), or an error-message string.
    """
    # drop redundant blank lines (limited effect); a literal "【delete】" line
    # is dropped too, as the original sentinel-based filter did
    con3 = [line for pos, line in enumerate(con_list)
            if line != "【delete】" and not _is_redundant_blank(con_list, pos)]
    while con3 and con3[-1].strip() == "":
        del con3[-1]
    while con3 and con3[0].strip() == "":
        del con3[0]
    con3.append("")  # otherwise the last question would be lost

    # strip useless header lines; put a type line on its own line first
    con3[0] = re.sub(r"([一二三四五六七八九十]\s*[、..、]\s*[^必考基础综合中等]{2,4}题)", r"\n\1", con3[0])
    if subject == "语文":
        heading_pat = r"[一二三四五]\s*[、..、]\s*.{,4}(运用|阅读|写作|选择|单选|默写|语言表达|作文)"
    else:
        heading_pat = r"[一二三四五]\s*[、..、]\s*[^必考基础综合中等]{2,4}题"
    numbered_pat = r"(^|\n)\s*[1-9][0-9]?\s*[..、、]((?!(答题卡|涂黑|本卷|2B铅笔|签字笔|密封线)).)*?$"
    while con3:
        head = con3[0]
        # stop at the first line with Chinese text that is a type line or a
        # numbered line that is not an exam instruction
        if re.search(r"[\u4e00-\u9fa5]", head) is not None and (
                re.search(heading_pat, head) is not None
                or re.search(numbered_pat, head) is not None):
            break
        del con3[0]

    # --------------------------- structuring ---------------------------
    items_con = "\n" + "\n".join(con3)
    # first pass over the question numbers and their style
    items_con, _item_no_info, item_no_type = judge_item_no_type(items_con)
    # 1. read the type lines and cut the text at them
    if subject == "语文":
        con4, title_info_dict, choice_class = get_item_head_info_cn(items_con)
    else:
        con4, title_info_dict, choice_class = get_item_head_info(items_con)
    all_type = title_info_dict["all_type"]
    select_type_id = title_info_dict["select_type_id"]

    keyword_split = r"((?<=\n)\s*【答案】|(?<=\n)\s*【解析】)\n?"
    first_no_pat = re.compile(r"([1-9]|[1-9][0-9])\s*[..、、].+?")
    fields = ["stem", "key", "parse"]

    # 2. two paths depending on whether type lines exist (pictures untouched)
    res = []
    if not all_type:
        print("不存在大题题型行或题型行格式有问题")
        if len(re.findall(r"\n\s*【答案】", items_con)) != len(re.findall(r"\n\s*【解析】", items_con)):
            return "不存在大题题型行或题型行格式有问题"
        subcon = re.split(keyword_split, items_con.strip())
        first_no = first_no_pat.match(subcon[0].strip())
        item_no = [max(int(first_no.group(1)), 1) if first_no else 1]
        if len(subcon) == 5:  # exactly one question
            dd = dict(zip(fields, re.split(r"(?<=\n)\s*【答案】|(?<=\n)\s*【解析】",
                                           table_label_cleal(items_con))))
            dd["type"] = ""
            dd["stem"] = re.sub(r"^\d+\s*[..、、]", "", dd["stem"][:5]) + dd["stem"][5:]
            dd["errmsgs"] = []
            dd["item_id"] = item_no[0]  # the real id, not a positional index
            res.append(dd)
        else:
            # find the next stem between one 【解析】 and the following 【答案】
            all_item, item_no, errmsg_dict, _count = get_con(subcon, item_no_type, item_no, index=0)
            for idk, one_item in enumerate(all_item):
                if not one_item:
                    continue
                dd = dict(zip(fields, re.split(r"(?<=\n)\s*【答案】\n?|(?<=\n)\s*【解析】\n?",
                                               table_label_cleal(one_item))))
                dd["type"] = ""
                # NOTE(review): unanchored here, anchored with "^" in the
                # single-question branch; kept as found
                dd["stem"] = re.sub(r"\d+\s*[..、、]", "", dd["stem"][:5]) + dd["stem"][5:]
                dd["errmsgs"] = [errmsg_dict[idk]] if idk in errmsg_dict else []
                dd["item_id"] = item_no[idk]
                res.append(dd)
    else:
        if len(all_type) != len(con4):
            print("存在题型行没有换行")
            # such papers are meant to be handled by strategy 2
            return "存在题型行末尾没有换行,请在所有题型行末尾重新换行"
        index = 0  # number offset of the first question of each big question
        for num, one_type in enumerate(con4):
            count = 1
            if len(re.findall(r"\n\s*【答案】", one_type)) != len(re.findall(r"\n\s*【解析】", one_type)):
                return "第" + str(num + 1) + "大题《" + all_type[num] + "》中【答案】或【解析】格式有误或其中某道题中出现多个相同关键字或漏关键字"
            subcon = re.split(keyword_split, one_type.strip())
            # correct the offset from the number of the very first question
            item_no = []
            first_no = first_no_pat.match(subcon[0].strip())
            if first_no:
                st_id = int(first_no.group(1))
                if num == 0 and st_id != 1:
                    index = st_id - 1
                item_no.append(st_id)
            else:
                item_no.append(index + 1)
            if len(subcon) == 5:  # exactly one question
                dd = dict(zip(fields, re.split(r"(?<=\n)\s*【答案】|(?<=\n)\s*【解析】",
                                               table_label_cleal(one_type))))
                dd["type"] = all_type[num]
                dd["stem"] = re.sub(r"^\d+\s*[..、、]", "", dd["stem"][:5]) + dd["stem"][5:]
                dd["errmsgs"] = []
                dd["item_id"] = item_no[0]  # the real id, not a positional index
                if select_type_id and dd["item_id"] in select_type_id:
                    dd['is_optional'] = 'true'
                res.append(dd)
            else:
                # find the next stem between one 【解析】 and the following
                # 【答案】, then cut at the three keywords
                all_item, item_no, errmsg_dict, count = get_con(subcon, item_no_type, item_no,
                                                                all_type=all_type, num=num, index=index)
                for idk, one_item in enumerate(all_item):
                    dd = dict(zip(fields, re.split(r"(?<=\n)\s*【答案】\n?|(?<=\n)\s*【解析】\n?",
                                                   table_label_cleal(one_item))))
                    dd["type"] = all_type[num]
                    dd["stem"] = re.sub(r"\d+\s*[..、、]", "", dd["stem"][:5]) + dd["stem"][5:]
                    dd["errmsgs"] = [errmsg_dict[idk]] if idk in errmsg_dict else []
                    dd["item_id"] = item_no[idk]
                    if choice_class:
                        # refine the type from the choice-class mapping
                        for k, v in choice_class.items():
                            if dd["item_id"] in v:
                                dd["type"] = k + "选题"
                    if select_type_id and dd["item_id"] in select_type_id:
                        dd['is_optional'] = 'true'
                    res.append(dd)
            index += count

    # every item must expose 'key' and 'parse'
    for one_item in res:
        one_item.setdefault('key', "")
        one_item.setdefault('parse', "")
    return res, item_no_type


def split_by_topicno(con_list, subject, is_dati=0):
    """
    Second paper layout: answer and analysis keywords are not both present.

    All lines are grouped by question type and each group is split into
    questions by question number.

    :param con_list: list of all text lines
    :param subject: subject name; "地理" and "语文" enable group-answer handling
    :param is_dati: forwarded to ``stems_structure_byno``; when truthy, group
        answer keys are offset by the first item's id instead of by 1
    :return: an error-message string from ``stems_structure_byno``, or
        ``(res, item_no_type, item_groups, ans_groups)``.
        NOTE(review): when many duplicated ids are found, a 2-tuple
        ``(res[:split_idx], item_no_type)`` is returned instead - kept as
        found; callers must cope with both shapes.
    """
    con1 = [line for line in con_list if line.strip() != ""]
    ress = stems_structure_byno(con1, subject, is_dati)  # first structuring by question number
    if isinstance(ress, str):
        return ress
    res, _all_type, _item_type_classify, item_no_type, item_type_num, _new_item_no, item_groups = ress
    print("item_type_num:", item_type_num)

    # Move trailing format lines to the item they belong to.
    # NOTE(review): both patterns contained literal line breaks in the
    # recovered text (markup probably stripped); they are reproduced here as
    # "\n\n" followed by the original "\n+" - confirm.
    tail_search = '\n\n\n+$'
    tail_split = '(\n\n\n+)$'
    for nn, one_i in enumerate(res):
        if "com_stem" in one_i and re.search(tail_search, one_i["com_stem"]):
            one_i["com_stem"], b, _ = re.split(tail_split, one_i["com_stem"])
            one_i["stem"] = b + one_i["stem"]
        if nn > 0 and re.search(tail_search, res[nn - 1]["stem"]):
            res[nn - 1]["stem"], b, _ = re.split(tail_split, res[nn - 1]["stem"])
            # NOTE(review): nesting reconstructed; nested so that `b` is
            # always bound when it is read
            if "com_stem" not in one_i:
                one_i["stem"] = b + one_i["stem"]

    # some questions may carry an analysis while others do not
    last_comstem_id = 0
    ans_groups = {}
    no_ans_n = 0
    for k, one_res in enumerate(res):
        if item_groups["is_groups"] and "com_stem" in one_res:
            last_comstem_id = k
        if re.search('\n【(答案|[解分][析答]|详解|点[评睛]|考点|专题)】', one_res["stem"]):
            # "case1": an answer keyword is present; "case0": it is not
            case = "case1"
            if re.search(r'\n【答案】|[\n】]\s*答案\s*[::]', one_res["stem"]) is None:
                case = "case0"
            dd1 = stem_ans_split(one_res, case)  # refine the already split question
            one_res["stem"] = dd1["stem"]
            del dd1["stem"]
            if not dd1["key"] and not dd1["parse"]:
                no_ans_n += 1
            else:
                if subject in ["地理", "语文"] and no_ans_n == k - last_comstem_id > 0:
                    # every question since the last common stem had no answer
                    # and this one has: treat it as the answer of the group
                    if (k + 1 < len(res) and ("com_stem" in res[k + 1] or k + 1 in item_groups["groups_data"])) \
                            or len(re.findall(r"【\d?题?详解】", dd1["parse"])) > 1 \
                            or len(re.findall(r"(?<=[】\s\n])\d{1,2}\s*[、..、]|^\d{1,2}\s*[、..、]", dd1["key"])) > 1:
                        # assume both neighbours are question groups
                        if is_dati:
                            first_id = res[0]["item_id"]
                            ans_groups["{}-{}".format(last_comstem_id + first_id, k + first_id)] = dd1
                        else:
                            ans_groups["{}-{}".format(last_comstem_id + 1, k + 1)] = dd1
                        dd1 = {"key": "", "parse": ""}
                # NOTE(review): indentation reconstructed - the counter is
                # reset whenever a question carries an answer; confirm
                no_ans_n = 0
            one_res.update(dd1)
        else:  # no analysis at all
            one_res.update({"key": "", "parse": ""})
            no_ans_n += 1
        one_res["stem"] = del_no(one_res["stem"], item_no_type)
        if 'pic' in one_res:
            one_res["stem"] += "\n" + "\n".join(one_res["pic"])
            del one_res["pic"]

    # correct the split result of the stems first
    res = resplit(res)

    # When the answer split point was not recognised, parts of the answers may
    # have been split as stems, so look for duplicated ids.
    all_no = [one_res['item_id'] for one_res in res]
    if len(all_no) - len(set(all_no)) > 2:
        most_common_no, occurrences = Counter(all_no).most_common()[0]
        if occurrences > 1:
            # everything from the 2nd occurrence of the most frequent id on is
            # treated as answer text
            split_idx = [i for i, no in enumerate(all_no) if no == most_common_no][1]
            for one_res in res[split_idx:]:
                tail_text = one_res["stem"] + "\n" + one_res["parse"]
                # text that still looks like a question is not attached
                if re.search(r"[((]\s+[))]|(等于|存在|[是有为])多少|求.*?[??]", tail_text) is None:
                    bef_no = [k for k, j in enumerate(res[:split_idx]) if j["item_id"] == one_res["item_id"]]
                    if bef_no and not res[bef_no[0]]["parse"]:
                        res[bef_no[0]]["parse"] = tail_text
            return res[:split_idx], item_no_type
    return res, item_no_type, item_groups, ans_groups