# 各题型结构化 import re from structure.option import option_structure, new_options_rank, table_option_struc from structure.dati2slave import get_slave def one_item_structure(xyz): """ 判断解析类型,解析类型为: if: 1.stem不需要再做其他处理<-- 答案没有[;;],且答案不是ABCDEFG 2.选择题类,需要把stem中的ABCD各选项内容提取出来<--答案是ABCDEFG else: 都要看是否含有小题,如果含有小题,需要把小题提取出来,slave 3.填空题类,(1)需要提取stem中下划线的个数 选择题结构化:单选或者多选<--要把各选项是什么提取出来放在slave中 one_item:{"stem":xxxx,"key":xxx,"parse":xxx} consumer: 分“高中数学”还是“全学科”; item_no_type:题号是否以(\d)的形式 :return: """ one_item, consumer, item_no_type, subject, is_danti = xyz # print(one_item) if "【章节】" in one_item["parse"]: # 属于后一个题的,后面须调整 one_item["chapter"] = one_item["parse"].split("【章节】")[1].split("\n")[0] one_item["parse"] = one_item["parse"].replace("【章节】" + one_item["chapter"], "") if "【章节】" in one_item["stem"]: # 属于后一个题的,后面须调整 one_item["chapter"] = one_item["stem"].split("【章节】")[1].split("\n")[0] one_item["stem"] = one_item["stem"].replace("【章节】" + one_item["chapter"], "") if "【选做题】" in one_item["stem"] + one_item["key"] + one_item["parse"]: opt_str = re.search(r"【选做题】:'(\d+)分'", one_item["stem"] + one_item["key"] + one_item["parse"]) one_item["option_st"] = "选做题,"+opt_str.group(1) if opt_str else "选做题" # 选做题开始的位置,后面的题开始是选做题 one_item["stem"] = re.sub("【选做题】(:'(\d+)分')?", "", one_item["stem"]) one_item["key"] = re.sub("【选做题】(:'(\d+)分')?", "", one_item["key"]) one_item["parse"] = re.sub("【选做题】(:'(\d+)分')?", "", one_item["parse"]) ans = one_item["key"] con = one_item["stem"] parse = re.sub(r"((?<=[\n】])|^)\s*解\s*[::]", "", one_item["parse"]) topic_type_cn = "" if not one_item["type"] and (not is_danti or subject == "地理"): # one_item["errmsgs"].append("本题没有给出明确题型!") # return one_item if re.match(r"[A-Z][A-Zc;;和与、、\s]*?$", ans.strip()): one_item["type"] = "单选题" if len(ans.strip()) == 1 else "多选题" elif re.search(r"[((]\s*[))]", one_item["stem"]) or \ len(re.findall(r"[\n\s\u4e00-\u9fa5①②③④]\s*[A-D]\s*[..、、]|\s/>[A-D]\s*[..、、]", one_item["stem"])) >= 4: one_item["type"] = "选择题" elif re.findall(r"_{2,}", one_item["stem"]): one_item["type"] = "填空题" else: one_item["type"] = "解答题" elif subject == "语文": # one_item["type"] == "综合题" and if re.match(r"[A-Z][A-Z;;和与、、\s]*?$", ans.replace("c", "C").strip()): topic_type_cn = "选择题" elif len(re.findall(r"[\n\s\u4e00-\u9fa5①②③④]\s*[A-D]\s*[..、、]", one_item["stem"])) >= 3: # re.search(r"[((]\s*[))]", one_item["stem"]) 这个条件不行 if len(re.findall(r"\n\s*[((]\s*\d\s*[))]\s*[..、、]?", one_item["stem"])) >= 2: topic_type_cn = "小题多问" else: topic_type_cn = "选择题" if one_item["type"].replace("题", "") in ["单选", "多选", "选择", "不定选择"]: if subject == "地理": if len(re.findall(r"\n\s*[((]\s*\d\s*[))]\s*[..、、]?", one_item["stem"])) >= 2: one_item["type"] = "小题多问" if re.match(r"【?(对|正确的?|错误?的?|F|T)】?$", ans.strip()): one_item["type"] = "判断题" topic_type = one_item["type"] # print(topic_type, topic_type_cn) # print(one_item) if topic_type.replace("题", "") in ["单选", "多选", "选择", "不定选择"] or topic_type_cn == "选择题": one_item = option_structure(one_item, con, ans, item_no_type) if 'options' not in one_item: one_item["options"] = [] # 表格类型的选项再解析, one_item["errmsgs"] = [emg for emg in one_item["errmsgs"] if "选项格式不正确" not in emg] if "" in one_item["stem"]: may_options = table_option_struc(one_item["stem"]) if may_options: one_item["options"] = may_options one_item["options_rank"] = 2 else: # 走toslave non_option_structure(one_item, con, parse, ans, topic_type) else:# 走toslave non_option_structure(one_item, con, parse, ans, topic_type) else: # 选择题结构化成功时,对选项排列方式再换思路算 options_rank_2 = new_options_rank(one_item["options"]) if options_rank_2: one_item["options_rank"] = options_rank_2 one_item["blank_num"] = len(re.findall(r"_{2,}", one_item["stem"])) one_item["answer_type"] = "选择题" elif consumer == 'toslave': # 拆小问 non_option_structure(one_item, con, parse, ans, topic_type) else: # 不拆小问,非选择题 if "#" in ans: one_item["key"] = one_item["key"].replace("#", "; ") pattern1 = re.compile(r"([是为点]|等于|=|=|有|存在)\s*_+(().)*?/>|[^_;;。?!\n])+?)_+" r"([cdkm上]?m?\s*.?[。.?]?\s*($|
|<==×÷/()()﹙﹚\[\]﹛﹜{\}∧∨∠▰▱△∆⊙⌒" r"⊆⊂⊇⊃∈∩∉∪⊕∥∣≌∽∞∝⊥∫∬∮∯Φ∅≮≯∁∴∵∷←↑→↓↖↗↘↙‖〒¤○′″¢°℃℉" r"αβγδεζηθικλμνξορστυφχψωϕ%‰℅㎎㎏㎜㎝㎞㎡㎥㏄㏎㏕$£¥º¹²³⁴ⁿ₁₂₃₄·∶½⅓⅔¼¾⅛⅜⅝⅞" r"ΑΒΓΔΕΖΗΘΙΚΜ]", "", ans)): one_item["type"] = "多选题" one_item = option_structure(one_item, con, ans, item_no_type) if 'options' not in one_item: one_item["options"] = [] if one_item["type"] == "填空题" and re.search("_{2,}|填正确答案", one_item['stem']) is None: if re.match(r"[A-Z][A-Z;;和与、、\s]*?$", ans.strip()): one_item["type"] = "单选题" if len(ans.strip()) == 1 else "多选题" one_item = option_structure(one_item, con, ans, item_no_type) if 'options' not in one_item: one_item["options"] = [] elif re.search(r"[((]\s*[))]", one_item["stem"]) or ('步骤' not in one_item["stem"] and len(re.findall(r"[\n\s]\s*[A-D]\s*[..、、]", one_item["stem"])) >= 4): one_item["type"] = "选择题" one_item = option_structure(one_item, con, ans, item_no_type) if 'options' not in one_item: one_item["options"] = [] elif re.findall('(有|存在|[是为==])[ \s]{3,}[a-zA-Z]', one_item["stem"]): one_item["blank_num"] = len(re.findall('(有|存在|[是为==])[ \s]{3,}[a-zA-Z]', one_item["stem"])) elif re.findall('[ \s]{3,}[a-zA-Z]\s*[,;.。;,]', one_item["stem"]): one_item["blank_num"] = len(re.findall('\s{3,}\n*\s*[a-zA-Z]\s*[,;.。;,.]', one_item["stem"])) elif re.search(pattern1, one_item["stem"]) is None and re.search(pattern2, one_item["stem"]) is None: stem = re.sub("|[,,.。.、、]", "", one_item["stem"]) if len(stem) > 2: one_item["type"] = "解答题" # print('------------------------------------------------') if one_item: # if re.match(r"(\[.*?\])?\s*\(.*?(\d+)分\)", one_item["stem"].strip()): # # print(one_item["stem"]) # score_info = re.match(r"(\[.*?\])?\s*\(.*?(\d+)分\)", one_item["stem"].strip()) # one_item["score"] = float(score_info.group(2)) one_item["stem"] = re.sub(r"(\[.*?\])?\(.*?\d+分\)", "", one_item["stem"][:20]) + one_item["stem"][20:] return one_item def non_option_structure(one_item, con, parse, ans, topic_type): """ :return: """ if topic_type in ["作文", "书面表达", "写作"]: one_item["answer_type"] = "语文作文" pass elif re.search("_{2,}|^\s*.{,6}(表格|作文|书面表达).{,7}|写一篇文章|作文题目", con): if re.search("_{2,}|^\s*.{,6}表格", con) is None: one_item["answer_type"] = "语文作文" else: blank_num = len(re.findall(r"_{2,}", con)) if blank_num > 0: one_item["blank_num"] = blank_num one_item["answer_type"] = "填空题" else: one_item["answer_type"] = "解答题" if re.search("_{2,}", con): # 也可以在带小题中 one_item = get_slave(one_item, con, parse, ans) else: one_item = get_slave(one_item, con, parse, ans) # if 'options' not in one_item: # one_item["options"] = [] return one_item