123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224 |
- #!/usr/bin/env/python
- # -*- coding:utf-8 -*-
- import re
- from utils.washutil import table_label_cleal
def stem_ans_split(one_item_dict, case):
    """Split one already-isolated question into stem, answer key and explanation.

    :param one_item_dict: preliminary dict of a single question; this function
        reads its "stem" (raw question text) and "type" (question type) entries.
    :param case: 'case0' when the text carries no "答案" (answer) keyword; any
        other value is handled as the "answer keyword present" case.
    :return: dict with the keys "stem", "key" and "parse".
    """
    item_type = one_item_dict["type"]
    cleaned = table_label_cleal(one_item_dict["stem"])

    if case == 'case0':  # no "answer" keyword in the text
        pieces = re.split(r"【(解析|解答|分析|详解|点评|点睛|考点|专题)】\n*?", cleaned)
        # Re-wrap the recognised section titles in brackets, trim everything else.
        pieces = ['【' + a + '】' if str(a).strip() in ['解答', '分析', '解析', '详解', '点评', '点睛']
                  else str(a).replace('None', '').strip() for a in pieces]
        # When no section marker exists re.split returns a single piece; treat
        # the explanation as empty instead of failing on a missing title.
        parse_title = pieces[1] if len(pieces) > 1 else ""
        dd = {"stem": pieces[0]}
        dd["parse"] = str(parse_title) + "\n".join(pieces[2:]).replace("\n\n", "\n")
        # A leading generic "【解析】" title carries no information, drop it.
        dd["parse"] = re.sub(r"^\s*【解析】", "", dd["parse"])
        dd["key"] = ""
    else:  # "answer" keyword present; the answer is assumed to precede the explanation
        head = re.split(r"【答案】\n?|(?<=[\n】])\s*答案\s*[::]", cleaned, maxsplit=1)
        # A missing answer marker leaves only the stem; fall back to an empty key.
        dd = {"stem": head[0], "key": head[1] if len(head) > 1 else ""}
        # With maxsplit=1 and one capture group the result has 1 or 3 items.
        tail = re.split(r"【(解析|解答|分析|详解|点评|点睛)】\n?", dd["key"], maxsplit=1)
        dd["key"] = tail[0]
        if len(tail) == 3:
            parse_title, parse_body = tail[1], tail[2]
            dd["parse"] = "【" + parse_title + "】" + parse_body
            if parse_title not in ["解析", "解答", "详解"]:
                # The section after the answer is only a remark/analysis; the
                # real explanation may sit in front of the answer, inside the stem.
                may_parse = re.split(r"【(解析|解答|分析|详解)】\n*?", dd["stem"], maxsplit=1)
                if len(may_parse) == 3:
                    dd["stem"] = may_parse[0]
                    dd["parse"] = "【" + may_parse[1] + "】" + may_parse[2] + dd["parse"]
            # NOTE(review): the source's indentation was lost; this strip is
            # placed at this level to mirror the 'case0' branch - confirm.
            dd["parse"] = re.sub(r"^\s*【解析】", "", dd["parse"])
        else:
            dd["parse"] = ""

    # Remove a question number ("12." / "3、") found within the first five characters.
    dd["stem"] = re.sub(r"[1-9][0-9]?\s*[..、、]", "", dd["stem"][:5]) + dd["stem"][5:]

    # No explicit answer: try to derive one from the explanation.
    if not dd["key"]:
        dd["key"] = get_ans_from_parse(dd["parse"], item_type, dd["stem"])
    return dd
def get_ans_from_parse(item_parse, item_type, res_con):
    """Pick the final answer out of a worked explanation.

    :param item_parse: the full explanation text; anything from the first
        "【点评】"/"【点睛】" marker onwards is ignored. None is treated as "".
    :param item_type: question type label such as "单选题" or "填空题".
        None is treated as "".
    :param res_con: the question stem; only inspected for "填空题" items, to
        count how many blanks the stem contains.
    :return: the extracted answer text, "见解析" when no single answer can be
        isolated, or "" when item_type is empty.
    """
    item_parse = re.split("【点评】|【点睛】", item_parse or "")[0].strip()
    item_type = item_type or ""

    # Drop image placeholders that trail the explanation.
    trailing_img = re.compile(r'\n\s*<imgsrc\d+\sw_h=(\d+\*\d{3})/>\s*$')
    while trailing_img.search(item_parse):
        item_parse = trailing_img.sub("", item_parse)

    # ---- choice questions: look for "故选 X" ---------------------------------
    if item_type.replace("题", "") in ["单选", "多选", "选择", "不定选择"]:
        found = re.search(r'故选\s*[::]?\s*<imgsrc\d+\sdata-latex="([A-Z;;和与、、\s]+)"/>'
                          r'|故选\s*[::]?\s*([A-Z;;和与、、\s]+)', item_parse.replace("$", ""))
        if not found:
            return "见解析"
        letters = found.group(1) if found.group(1) is not None else found.group(2)
        # The letter class also accepts whitespace, so the capture may carry
        # trailing blanks/newlines (or be blank altogether): trim it.
        letters = re.sub(r"[.;;.]\s*$", "", letters).strip()
        return letters or "见解析"

    if not item_type:
        return ""

    # ---- other question types --------------------------------------------------
    # Several numbered sub-questions (unless it is a fill-in item with exactly
    # one blank) or a proof cannot be reduced to one short answer.
    if (not (item_type == '填空题'
             and len(re.findall(r"_{2,}|_+([^_]*?)_+", res_con or "")) == 1)
            and len(re.findall(r"[((]\d[))]|[\n::;;。】]([((](i{1,3}|[ⅰⅱⅲⅳⅠⅡⅢIV①②③④])[))]|[①②③④]\s*(?![+-]))",
                               item_parse.replace(" ", ""))) > 1) or "证明" in item_parse:
        return "见解析"

    latex_free = item_parse.replace("$", "").replace("\\[", "").replace("\\]", "")

    # (pattern, flags, text to search, group holding the answer), in priority
    # order; the first pattern that matches decides the answer.
    rules = (
        # a choice answer inside a non-choice item (e.g. experiment questions)
        (r'故选\s*[::]?\s*([A-Z;;和与、、\s]+)[..;;。]?$', 0, item_parse, 1),
        # choice answer rendered as a formula image (type may have been mislabelled)
        (r'故选\s*[::]\s*<imgsrc\d+\sdata-latex="([A-Z;;和与、、\s]+)"/>', 0, latex_free, 1),
        (r'(故|因[而此]|所以)\s*[::]?\s*(答案分?别?[为是填]?|填)\s*[::]?\s*(((?!(<img)).)+?)[..]?\s*(\n|$)',
         0, item_parse, 3),
        (r'((?<!解)答\s*[::]|整理得\s*[::]?)\s*(.+?)([..;;]?\s*$|[..]\s*\n)', 0, item_parse, 2),
        (r'(故|因[而此]|所以)\s*[::]?\s*(答案分?别?[为是填]?|填)\s*[::]?\s*(<imgsrc.+?/>)[..]?\s*(\n|$)',
         re.S, item_parse, 3),
        (r'(故|因[而此]|所以)\s*[::]?\s*(答案分?别?[为是填]?|填)\s*[::]?\s*([^∴∵因所故即【】]+?)([..]\s*(\n|$)|$)',
         0, item_parse, 3),
        (r'综上所述\s*[::]\s*([^∴∵故因所即【】]+?)[..;;]\s*$', 0, item_parse, 1),
        # conclusion sentences ("therefore ... is X"); most specific form first
        (r'(故|因[而此]|所以)\s*[::]?[^当为是填∴∵故因所即【】]+?[为是填]\s*[::]?\s*(<imgsrc.+?/>)[..]?\s*(\n|$)',
         re.S, item_parse, 2),
        (r'(故|因[而此]|所以|∴)\s*([^当为是填∴∵故因所即则【】]+?)[..;;]\s*$', 0, item_parse, 2),
        (r'(故|因[而此]|所以|∴)\s*[::]?.+?[为是填]\s*[::]?\s*([^∴∵故因所即【】]+?)([..;;,,]\s*$|[..]\s*\n)',
         0, item_parse, 2),
        # last right-hand side of a chain of equalities
        (r'[==](?!")(((?!([故=∴即]|原式|因[而此]|所以|\n|=[^"])).)+?)[..]?\s*$', 0, item_parse, 1),
        (r'原式\s*[==].+?[==](?!")(((?!(=|=[^"])).)+?|\s*<imgsrc.+?/>)([..]?\s*$|[..]\s*\n)',
         0, item_parse, 1),
        (r'\n\s*[==]([^=\n]+?)[..]?\s*$', 0, item_parse, 1),
    )
    for pattern, flags, text, group in rules:
        hit = re.search(pattern, text, flags)
        if hit:
            return hit.group(group)
    return "见解析"
def get_split_pos(row_list):
    """Find where the questions end and the answers begin in a paper.

    :param row_list: the rows (strings) of the whole paper. The list may be
        modified in place: when the answer heading row also carries inline
        answers containing letters A-E, that trailing text is inserted as a
        new row right after the heading.
    :return: the string "题文全是图片,本通道无法解析" when an answer heading is
        found within the first 30 rows and fewer than 60 non-image,
        non-whitespace characters precede it; otherwise the tuple
        (row_list, items_list, ans_list, is_may_ans), where row_list is cut at
        a detected answer-sheet row, items_list/ans_list are the question and
        answer rows (both empty when no split point exists) and is_may_ans is
        1 when the split point came from a paper-title row instead of an
        answer heading.
    """
    # Rows that look like an "answers" heading (the "答案" keyword is required).
    answer_title = re.compile(
        r'(参考|试[题卷]|考试|物理|理综|数学|化学|生物)答案.{,5}$|答案[和与及]?解析([((].*?[))])?$'
        r'|.{,15}(参考|考试|(考?试|检测)[题卷]|物理|理综|数学|化学|生物)(答案|解析|答案[及与和]评分(标准|意见|细则|参考))\s*$'
        r'|.{,15}评分(标准|参考)|((参考|(考?试|检测)[题卷]|考试|物理|理综|数学|化学|生物)答案|答案[和与及]解析)[\dA-E\s..、、]+$'
        r'|.{,15}(参考|考试|(考?试|检测)[题卷])(答案|解析|答案[及与和]评分(标准|意见|细则|参考))\s*(物理|理综|数学|化学|生物)?\s*$'
        r'|.{,15}解析[和与及]答案$')
    answer_noise = re.compile(
        r"[上下]?学[年期]|[\d—【】..·、、::(())年\s]|[中大]学|模拟|[中高]考|年级|[学期][末中]"
        r"|[高初][一二三]|部分")
    split_p1 = [k for k, v in enumerate(row_list)
                if answer_title.match(answer_noise.sub("", v.strip()))]
    if split_p1 and split_p1[0] < 30:
        if len(re.sub(r"<imgsrc.*?/>|\s", "", "".join(row_list[:split_p1[0]])).strip()) < 60:
            return "题文全是图片,本通道无法解析"
        # NOTE(review): indentation was lost in the source; this filter is
        # assumed to belong to the "early hit" branch - confirm.
        split_p1 = [p for p in split_p1 if p > 30]
    print("答案split_p1:", split_p1)

    # Fallback when there is no answer heading: rows that look like a paper title.
    title_noise = re.compile(
        r"[上下]?学[年期]度?|[\d—【】..、、::(())年第\s]|[中大]学|模拟|[中高]考|年级|[学期][末中]|[高初][一二三]")
    split_p0 = [k for k, v in enumerate(row_list)
                if re.search(r"([中高联月]考|单元测试|随堂练|(摸底|模拟|收心)?考试(评估)?|学期|[寒暑]假作业)[一二三四五六七八九试题((卷))\s]*?$",
                             title_noise.sub("", v.strip()))]
    split_p0 = [p for p in split_p0 if p > 30]
    print("试卷标题split_p0:", split_p0)
    if not split_p0:
        split_p0 = [k for k, v in enumerate(row_list)
                    if re.search(r"^\s*第\s*[一IⅠ]\s*卷\s*([((]|非?选择题)", v.strip())]
        split_p0 = [p for p in split_p0 if p > 30]
        print("试卷标题split_p01:", split_p0)

    items_list, ans_list = [], []
    # =================================== split the questions ===================================
    # Header-like rows (paper title, seal line, name/class fields, blank rows)
    # that should not be kept at the tail of a section.
    pattern1 = re.compile(r"([中高联月]考|单元测试|随堂练|(摸底|模拟|收心)考试|学期)[试题((卷))\s]*?$"
                          r"|密[…O•.\s]*?封[….O•\s]*?装?[…O•.\s]*?订?[….O•\s]*?线?"
                          r"|((学校|班级|姓名|座位?号|准考号|学号)[\s::_]*?){2,}|^\n*\s*\n*$")
    is_may_ans = 0
    if not split_p1 and split_p0:
        split_p1 = [split_p0[-1]]  # use the last title row as the probable start of the answers
        is_may_ans = 1
    if split_p1:
        answer_row = split_p1[0]
        may_omit_info = re.match(r"((参考|试[题卷]|考试|物理|理综|数学|化学|生物)答案|答案和解析)([\dA-E\s..、、]+)$",
                                 row_list[answer_row])
        if may_omit_info and re.findall("[A-E]", may_omit_info.group(3)):
            # Answers written on the heading row itself become their own row.
            row_list.insert(answer_row + 1, may_omit_info.group(3))
        items_list = row_list[:_rewind_over_headers(row_list, answer_row, pattern1)]
        ans_list = row_list[answer_row + 1:]
        # Then check for an answer sheet, first in front of the answers ...
        sheet_rows = _answer_sheet_rows(row_list[:answer_row])
        if sheet_rows:
            items_list = items_list[:_rewind_over_headers(items_list, sheet_rows[0], pattern1)]
        else:
            # ... otherwise behind them.
            sheet_rows = _answer_sheet_rows(ans_list)
            if sheet_rows:
                ans_list = ans_list[:_rewind_over_headers(ans_list, sheet_rows[0], pattern1)]

    # Even without an answer section there may still be an answer sheet.
    # NOTE(review): the guarding "else:" is commented out in the source, so
    # this is taken to run unconditionally - confirm.
    split_p2 = _answer_sheet_rows(row_list)
    print("答题卡split_p2:", split_p2)
    if split_p2:
        row_list = row_list[:_rewind_over_headers(row_list, split_p2[0], pattern1)]
    return row_list, items_list, ans_list, is_may_ans


def _answer_sheet_rows(rows):
    """Return the indexes of rows that look like an answer-sheet heading."""
    noise = re.compile(r"[\d—【】年]|[中大]学|模拟|[中高]考|物理|理综|数学|化学|生物")
    sheet = re.compile(r".*?(答题?[卷卡页]纸?|试卷细目表)\s*$|\s*本卷.*?答题卡")
    return [k for k, v in enumerate(rows) if sheet.match(noise.sub("", v.strip()))]


def _rewind_over_headers(rows, idx, header_pattern):
    """Move idx backwards while the row just before it matches header_pattern.

    The index is kept within [0, len(rows)], so the look-behind can never wrap
    around to the end of the list through a negative index.
    """
    idx = min(idx, len(rows))
    while idx > 0 and header_pattern.search(rows[idx - 1]):
        idx -= 1
    return idx
|