#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Split exam-paper text into stems, answer keys and explanations.

Three entry points:

* ``stem_ans_split``     - split one question into stem / key / explanation.
* ``get_ans_from_parse`` - guess the answer key from the explanation text.
* ``get_split_pos``      - locate the question / answer boundary in a paper.

Punctuation character classes list both the ASCII and the full-width form
(``:：``, ``.．``, ``(（`` ...) because the input is Chinese text.

NOTE(review): several patterns in this module had lost their ``<img ...>``
fragments (the remaining text no longer compiled or referenced capture groups
that did not exist).  They were rebuilt as generic ``<img[^>]*>`` matches and
are marked individually below - confirm against the upstream history.
"""
import re

from utils.washutil import table_label_cleal

# Explanation headings that are re-wrapped in 【】 when they start a section.
_WRAPPED_TITLES = ("解答", "分析", "解析", "详解", "点评", "点睛")

# ---------------------------------------------------------------------------
# Patterns used by get_ans_from_parse
# ---------------------------------------------------------------------------

# An image on its own trailing line of the explanation.
# NOTE(review): reconstructed - the original comment says "remove images at the
# end of the explanation" but the pattern no longer contained an image tag.
_TRAILING_IMG_RE = re.compile(r"\n\s*<img[^>]*>\s*$")

# "故选: X" where X is either a LaTeX image (data-latex attribute) or plain text.
# NOTE(review): the image alternative is reconstructed; group 1 = image answer,
# group 2 = plain-text answer.
_CHOICE_RE = re.compile(
    r'故选\s*[:：]?\s*<img[^>]+?data-latex="\$?([A-Z;；和与、､\s]+)\$?"[^>]*>'
    r"|故选\s*[:：]?\s*([A-Z;；和与、､\s]+)"
)

# A fill-in blank in the stem.
_BLANK_RE = re.compile(r"_{2,}|_+([^_]*?)_+")

# Sub-question markers such as (1), (i), (Ⅱ), ① inside the explanation.
_SUB_QUESTION_RE = re.compile(
    r"[(（]\d[)）]"
    r"|[\n:：;；。】]([(（](i{1,3}|[ⅰⅱⅲⅳⅠⅡⅢIV①②③④])[)）]|[①②③④]\s*(?![+-]))"
)

# Ordered answer-extraction rules for non-choice questions:
# (pattern, capture group holding the answer, search the LaTeX-stripped text?)
# The first rule that matches wins.
_ANSWER_RULES = (
    # A choice answer at the very end (lab questions may embed a choice item).
    (re.compile(r"故选\s*[:：]?\s*([A-Z;；和与、､\s]+)[.．;；。]?$"), 1, False),
    # A choice answer rendered as a LaTeX image (question type may be wrong).
    # NOTE(review): reconstructed image pattern.
    (
        re.compile(
            r'故选\s*[:：]\s*<img[^>]+?data-latex="([A-Z;；和与、､\s]+)"[^>]*>'
        ),
        1,
        True,
    ),
    # "故答案为 ... <img>" - answer ends with an image on the same line.
    # NOTE(review): reconstructed; the original text for this rule was
    # unbalanced, and two sibling rules (`ans11`, `ans2`) that the old code
    # referenced are not recoverable from this file and were dropped.
    (
        re.compile(
            r"(故|因[而此]|所以)\s*[:：]?\s*(答案分?别?[为是填]?|填)\s*[:：]?\s*"
            r"((?:(?!<img)[^\n])*?<img[^>]*>)[.．]?\s*(\n|$)"
        ),
        3,
        False,
    ),
    # "故答案为 <text>."
    (
        re.compile(
            r"(故|因[而此]|所以)\s*[:：]?\s*(答案分?别?[为是填]?|填)\s*[:：]?\s*"
            r"([^∴∵因所故即【】]+?)([.．]\s*(\n|$)|$)"
        ),
        3,
        False,
    ),
    # "综上所述: <text>."
    (re.compile(r"综上所述\s*[:：]\s*([^∴∵故因所即【】]+?)[.．;；]\s*$"), 1, False),
    # "故 ... 为 <img>".  NOTE(review): the capture was an empty group.
    (
        re.compile(
            r"(故|因[而此]|所以)\s*[:：]?[^当为是填∴∵故因所即【】]+?[为是填]\s*[:：]?\s*"
            r"(<img[^>]*>)[.．]?\s*(\n|$)"
        ),
        2,
        False,
    ),
    # "故 <text>." at the very end.
    (
        re.compile(
            r"(故|因[而此]|所以|∴)\s*([^当为是填∴∵故因所即则【】]+?)[.．;；]\s*$"
        ),
        2,
        False,
    ),
    # "故 ... 为 <text>."
    (
        re.compile(
            r"(故|因[而此]|所以|∴)\s*[:：]?.+?[为是填]\s*[:：]?\s*"
            r"([^∴∵故因所即【】]+?)([.．;；,，]\s*$|[.．]\s*\n)"
        ),
        2,
        False,
    ),
    # Last "= <value>" of the text (an '=' directly before '"' is skipped).
    (
        re.compile(
            r'[=＝](?!")(((?!([故＝∴即]|原式|因[而此]|所以|\n|=[^"])).)+?)[.．]?\s*$'
        ),
        1,
        False,
    ),
    # "原式 = ... = <value>"
    (
        re.compile(
            r'原式\s*[=＝].+?[=＝](?!")(((?!(＝|=[^"])).)+?|\s*)'
            r"([.．]?\s*$|[.．]\s*\n)"
        ),
        1,
        False,
    ),
    # A final line that starts with "= <value>".
    (re.compile(r"\n\s*[=＝]([^=\n]+?)[.．]?\s*$"), 1, False),
)

# ---------------------------------------------------------------------------
# Patterns used by get_split_pos
# ---------------------------------------------------------------------------

# Removed from a row before testing it against the answer-heading pattern.
_HEADING_NOISE_RE = re.compile(
    r"[上下]?学[年期]|[\d—【】.．·、､:：()（）年\s]|[中大]学|模拟|[中高]考|年级|[学期][末中]"
    r"|[高初][一二三]|部分"
)

# A row that is an answer heading (the keyword 答案 / 解析 / 评分 is required).
_ANSWER_HEADING_RE = re.compile(
    r"(参考|试[题卷]|考试|物理|理综|数学|化学|生物)答案.{,5}$"
    r"|答案[和与及]?解析([(（].*?[)）])?$"
    r"|.{,15}(参考|考试|(考?试|检测)[题卷]|物理|理综|数学|化学|生物)"
    r"(答案|解析|答案[及与和]评分(标准|意见|细则|参考))\s*$"
    r"|.{,15}评分(标准|意见|细则|参考)$"
    r"|((参考|(考?试|检测)[题卷]|考试|物理|理综|数学|化学|生物)答案|答案[和与及]解析)"
    r"[\dA-E\s.．、､]+$"
    r"|.{,15}(参考|考试|(考?试|检测)[题卷])(答案|解析|答案[及与和]评分(标准|意见|细则|参考))"
    r"\s*(物理|理综|数学|化学|生物)?\s*$"
    r"|.{,15}解析[和与及]答案$"
)

# Images and whitespace, removed before measuring how much real text exists.
# NOTE(review): reconstructed - the image alternative was empty.
_IMG_OR_SPACE_RE = re.compile(r"<img[^>]*>|\s")

# Removed from a row before testing it against the paper-title pattern.
_TITLE_NOISE_RE = re.compile(
    r"[上下]?学[年期]度?|[\d—【】.．、､:：()（）年第\s]|[中大]学|模拟|[中高]考|年级|[学期][末中]"
    r"|[高初][一二三]"
)

# A row that ends like a paper title.
_PAPER_TITLE_RE = re.compile(
    r"([中高联月]考|单元测试|随堂练|(摸底|模拟|收心)?考试(评估)?|学期|[寒暑]假作业)"
    r"[一二三四五六七八九试题(（卷)）\s]*?$"
)

# "第Ⅰ卷 (选择题)" style heading, the fallback title marker.
_PART_ONE_RE = re.compile(r"^\s*第\s*[一IⅠ]\s*卷\s*([(（]|非?选择题)")

# Title lines, seal lines, name/class form lines and blank rows.
_BOILERPLATE_RE = re.compile(
    r"([中高联月]考|单元测试|随堂练|(摸底|模拟|收心)考试|学期)[试题(（卷)）\s]*?$"
    r"|密[…O•.\s]*?封[….O•\s]*?装?[…O•.\s]*?订?[….O•\s]*?线?"
    r"|((学校|班级|姓名|座位?号|准考号|学号)[\s:：_]*?){2,}|^\n*\s*\n*$"
)

# An answer heading with the keys written on the same row ("参考答案 1.A 2.B").
_INLINE_KEYS_RE = re.compile(
    r"((参考|试[题卷]|考试|物理|理综|数学|化学|生物)答案|答案和解析)([\dA-E\s.．、､]+)$"
)

# Removed from a row before testing it against the answer-sheet pattern.
_SHEET_NOISE_RE = re.compile(
    r"[\d—【】年]|[中大]学|模拟|[中高]考|物理|理综|数学|化学|生物"
)

# A row that is an answer-sheet / blueprint-table heading.
_ANSWER_SHEET_RE = re.compile(
    r".*?(答题?[卷卡页]纸?|试卷细目表)\s*$|\s*本卷.*?答题卡"
)


def stem_ans_split(one_item_dict, case):
    """Split one question into its stem, answer key and explanation.

    :param one_item_dict: preliminary question dict; the keys "stem" and
        "type" are read here.
    :param case: "case0" when the text has no 答案 marker; any other value is
        handled as "has a 答案 marker".
    :return: dict with the keys "stem", "key" and "parse".
    """
    item_type = one_item_dict["type"]
    text = table_label_cleal(one_item_dict["stem"])

    if case == "case0":
        # No answer marker: everything after the first heading is explanation.
        pieces = [
            "【" + piece + "】" if piece.strip() in _WRAPPED_TITLES else piece.strip()
            for piece in re.split(
                r"【(解析|解答|分析|详解|点评|点睛|考点|专题)】\n*?", text
            )
        ]
        stem = pieces[0]
        # Without any heading there is only one piece; fall back to an empty
        # explanation instead of raising KeyError.
        parse_title = pieces[1] if len(pieces) > 1 else ""
        parse = parse_title + "\n".join(pieces[2:]).replace("\n\n", "\n")
        parse = re.sub(r"^\s*【解析】", "", parse)
        key = ""
    else:
        # Answer marker present; the answer is assumed to precede the
        # explanation.
        head = re.split(
            r"【答案】\n?|(?<=[\n】])\s*答案\s*[:：]", text, maxsplit=1
        )
        stem = head[0]
        answer_part = head[1] if len(head) > 1 else ""
        tail = re.split(
            r"【(解析|解答|分析|详解|点评|点睛)】\n?", answer_part, maxsplit=1
        )
        key = tail[0]
        if len(tail) == 3:
            parse = re.sub(r"^\s*【解析】", "", "【" + tail[1] + "】" + tail[2])
        else:
            parse = ""

    # Drop a question number such as "12." or "3、" from the first 5 chars.
    stem = re.sub(r"[1-9][0-9]?\s*[.．、､]", "", stem[:5]) + stem[5:]

    # No explicit key: try to recover one from the explanation.
    if not key:
        key = get_ans_from_parse(parse, item_type, stem)

    return {"stem": stem, "key": key, "parse": parse}


def get_ans_from_parse(item_parse, item_type, res_con):
    """Pick the answer key out of an explanation text.

    :param item_parse: the whole explanation text.
    :param item_type: question type label (e.g. "单选题", "填空题"); an empty
        or missing type yields "".
    :param res_con: the stem text, used to count fill-in blanks.
    :return: the extracted answer, "见解析" when no single answer can be
        isolated, or "" when the question type is empty.
    """
    item_type = item_type or ""
    item_parse = re.split("【点评】|【点睛】", item_parse)[0].strip()
    # Remove images that trail the explanation, one line at a time.
    while _TRAILING_IMG_RE.search(item_parse):
        item_parse = _TRAILING_IMG_RE.sub("", item_parse)

    if item_type.replace("题", "") in ("单选", "多选", "选择", "不定选择"):
        found = _CHOICE_RE.search(item_parse.replace("$", ""))
        if not found:
            return "见解析"
        picked = found.group(1) if found.group(1) is not None else found.group(2)
        return re.sub(r"[.;；．]\s*$", "", picked)

    if not item_type:
        return ""

    # Proofs, and explanations with several sub-questions, have no single
    # answer - unless it is a fill-in question with exactly one blank.
    if "证明" in item_parse:
        return "见解析"
    single_blank = (
        item_type == "填空题" and len(_BLANK_RE.findall(res_con)) == 1
    )
    sub_questions = _SUB_QUESTION_RE.findall(item_parse.replace(" ", ""))
    if not single_blank and len(sub_questions) > 1:
        return "见解析"

    latex_free = (
        item_parse.replace("$", "").replace("\\[", "").replace("\\]", "")
    )
    for pattern, group, strip_latex in _ANSWER_RULES:
        found = pattern.search(latex_free if strip_latex else item_parse)
        if found:
            return found.group(group)
    return "见解析"


def _answer_sheet_rows(rows):
    """Return the indexes of rows that look like an answer-sheet heading."""
    return [
        index
        for index, row in enumerate(rows)
        if _ANSWER_SHEET_RE.match(_SHEET_NOISE_RE.sub("", row.strip()))
    ]


def _rewind_over_boilerplate(rows, index):
    """Move *index* back past boilerplate rows that directly precede it.

    The index is clamped to ``len(rows)`` and never goes below 0, so the scan
    cannot wrap around to the end of the list or run off its start.
    """
    index = min(index, len(rows))
    while index > 0 and _BOILERPLATE_RE.search(rows[index - 1]):
        index -= 1
    return index


def get_split_pos(row_list):
    """Find where the questions end and the answers begin.

    :param row_list: the paper as a list of text rows.  When the answer
        heading row also carries inline keys, those keys are inserted into
        this list as a new row (the list is modified in place).
    :return: ``(row_list, items_list, ans_list, is_may_ans)`` where
        ``is_may_ans`` is 1 when a paper-title row was used as the boundary
        because no answer heading was found; or the message string
        "题文全是图片,本通道无法解析" when the text before an early answer
        heading is almost entirely images.
    """
    # Answer boundary: rows that carry an explicit answer keyword.
    answer_rows = [
        index
        for index, row in enumerate(row_list)
        if _ANSWER_HEADING_RE.match(_HEADING_NOISE_RE.sub("", row.strip()))
    ]
    if answer_rows and answer_rows[0] < 30:
        leading_text = _IMG_OR_SPACE_RE.sub("", "".join(row_list[:answer_rows[0]]))
        if len(leading_text.strip()) < 60:
            return "题文全是图片,本通道无法解析"
    answer_rows = [pos for pos in answer_rows if pos > 30]
    print("答案split_p1:", answer_rows)

    # Fallback boundary when there is no answer keyword: paper-title rows.
    title_rows = [
        index
        for index, row in enumerate(row_list)
        if index > 30
        and _PAPER_TITLE_RE.search(_TITLE_NOISE_RE.sub("", row.strip()))
    ]
    print("试卷标题split_p0:", title_rows)
    if not title_rows:
        title_rows = [
            index
            for index, row in enumerate(row_list)
            if index > 30 and _PART_ONE_RE.search(row.strip())
        ]
        print("试卷标题split_p01:", title_rows)

    items_list, ans_list = [], []
    is_may_ans = 0
    if not answer_rows and title_rows:
        # Prefer to treat the last title as the start of the answers.
        answer_rows = [title_rows[-1]]
        is_may_ans = 1

    if answer_rows:
        split_at = answer_rows[0]
        inline_keys = _INLINE_KEYS_RE.match(row_list[split_at])
        if inline_keys and re.search("[A-E]", inline_keys.group(3)):
            # Keys written on the heading row become the first answer row.
            row_list.insert(split_at + 1, inline_keys.group(3))
        items_list = row_list[:_rewind_over_boilerplate(row_list, split_at)]
        ans_list = row_list[split_at + 1:]

        # An answer sheet may sit before the answers or after them.
        sheet_rows = _answer_sheet_rows(row_list[:split_at])
        if sheet_rows:
            cut = _rewind_over_boilerplate(items_list, sheet_rows[0])
            items_list = items_list[:cut]
        else:
            sheet_rows = _answer_sheet_rows(ans_list)
            if sheet_rows:
                cut = _rewind_over_boilerplate(ans_list, sheet_rows[0])
                ans_list = ans_list[:cut]

    # With or without an answer page there may still be an answer sheet;
    # the returned row list is cut in front of it.
    sheet_rows = _answer_sheet_rows(row_list)
    print("答题卡split_p2:", sheet_rows)
    if sheet_rows:
        row_list = row_list[:_rewind_over_boilerplate(row_list, sheet_rows[0])]
    return row_list, items_list, ans_list, is_may_ans