123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159 |
- # 各题型结构化
- import re
- from structure.option import option_structure, new_options_rank, table_option_struc
- from structure.dati2slave import get_slave
# NOTE(review): several character classes below contain the same character
# twice (e.g. "[((]", "[..、、]", ";;").  That usually means full-width
# variants were normalised away somewhere upstream of this listing -- confirm
# against the canonical file before relying on full-width matching.
_CHOICE_TYPES = ("单选", "多选", "选择", "不定选择")

_BLANK_RE = re.compile(r"_{2,}")
_EMPTY_BRACKET_RE = re.compile(r"[((]\s*[))]")
_OPTION_LABEL_RE = re.compile(r"[\n\s]\s*[A-D]\s*[..、、]")
_LETTER_KEY_RE = re.compile(r"[A-Z][A-Z;;和与、、\s]*?$")
_OPTIONAL_MARK_RE = re.compile(r"【选做题】(:'(\d+)分')?")
_SUB_QUESTION_NO_RE = re.compile(r"\n\s*[((]\s*[1-9]\s*[))]")
_BRACKET_THEN_OPTION_RE = re.compile(r"[((]\s*[))][\s\n]*(<img src=|A\s*[..、、])")

# Answers typed directly onto the blank line, e.g. "是 __3__ cm。".
_INLINE_ANSWER_RE_1 = re.compile(
    r"([是为点]|等于|=|=|有|存在)\s*_+((<img src=((?!/>).)*?/>|[^_;;。?!\n])+?)_+"
    r"([cdkm上]?m?\s*.?[。.?]?\s*($|<br/>|<img src|……))")
_INLINE_ANSWER_RE_2 = re.compile(
    r"((有|存在|[是为])[\u4e00-\u9fa5]{0,2})\s*_+(\d+)_+\s*([\u4e00-\u9fa5,,;;。..])")

# Everything that is neither a word character nor one of the listed math /
# unit symbols.  The hyphen after "+" is escaped on purpose: unescaped,
# "+-≈" is a *range* (U+002B..U+2248) that silently whitelists ASCII
# punctuation such as "," "." ";" ":" and defeats the stripping.
_NON_SYMBOL_RE = re.compile(
    r"[^\w><≤≥≡≦≧+\-≈≠﹢﹣±㏒㏑∑∏π><==×÷/()()﹙﹚\[\]﹛﹜{\}∧∨∠▰▱△∆⊙⌒"
    r"⊆⊂⊇⊃∈∩∉∪⊕∥∣≌∽∞∝⊥∫∬∮∯Φ∅≮≯∁∴∵∷←↑→↓↖↗↘↙‖〒¤○′″¢°℃℉"
    r"αβγδεζηθικλμνξορστυφχψωϕ%‰℅㎎㎏㎜㎝㎞㎡㎥㏄㏎㏕$£¥º¹²³⁴ⁿ₁₂₃₄·∶½⅓⅔¼¾⅛⅜⅝⅞"
    r"ΑΒΓΔΕΖΗΘΙΚΜ]")


def _is_choice_type(type_name):
    """Return True when *type_name*, with "题" removed, is a choice-question type."""
    return type_name.replace("题", "") in _CHOICE_TYPES


def _guess_type(ans, stem):
    """Infer a question type from the answer key and stem when none was given."""
    key = ans.strip()
    if _LETTER_KEY_RE.match(key):
        return "单选题" if len(key) == 1 else "多选题"
    if _EMPTY_BRACKET_RE.search(stem) or len(_OPTION_LABEL_RE.findall(stem)) >= 4:
        return "选择题"
    if _BLANK_RE.search(stem):
        return "填空题"
    return "简答题"


def _structure_as_choice(one_item, con, ans, item_no_type):
    """Run ``option_structure`` and guarantee the result has an ``options`` key."""
    one_item = option_structure(one_item, con, ans, item_no_type)
    if "options" not in one_item:
        one_item["options"] = []
    return one_item


def one_item_structure(xyz):
    """Classify one parsed question and attach its type-specific structure.

    The question's ``type`` is inferred when empty, or corrected when a
    declared choice type does not look like one.  The item is then handled
    in one of three ways:

    * choice questions go through ``option_structure`` (with a table-based
      fallback and an alternative option-rank computation);
    * when ``consumer == 'toslave'`` non-choice questions are split into
      sub-questions with ``get_slave``;
    * otherwise blanks are counted and a declared fill-in question without
      blanks is re-examined and possibly re-typed.

    :param xyz: a 3-tuple ``(one_item, consumer, item_no_type)``.
        ``one_item`` is a dict read via the keys ``"stem"``, ``"key"``,
        ``"parse"`` and ``"type"`` (``"errmsgs"`` is also read on the table
        fallback path) and is modified in place.  ``consumer`` is only
        compared against ``'toslave'`` here; the original docstring
        describes it as distinguishing "high-school math" from "all
        subjects".  ``item_no_type`` (per the original docstring: whether
        question numbers have the form ``(\\d)``) is passed through to
        ``option_structure``.
    :return: the structured item (whatever the downstream helper returned
        on the paths that call one, otherwise the same dict).
    """
    one_item, consumer, item_no_type = xyz

    # A "【章节】" marker found here belongs to the *next* question; it is only
    # lifted out into "chapter" now and has to be reassigned later.  "stem" is
    # handled last so that it wins when both fields carry a marker.
    for field in ("parse", "stem"):
        if "【章节】" in one_item[field]:
            one_item["chapter"] = one_item[field].split("【章节】")[1].split("\n")[0]
            one_item[field] = one_item[field].replace("【章节】" + one_item["chapter"], "")

    combined = one_item["stem"] + one_item["key"] + one_item["parse"]
    if "【选做题】" in combined:
        score = re.search(r"【选做题】:'(\d+)分'", combined)
        # Marks where the optional questions start: the questions after this
        # point are optional ones.
        one_item["option_st"] = ("选做题," + score.group(1)) if score else "选做题"
        for field in ("stem", "key", "parse"):
            one_item[field] = _OPTIONAL_MARK_RE.sub("", one_item[field])

    ans = one_item["key"]
    con = one_item["stem"]
    # Drop a leading "解:" label from the analysis handed to get_slave.
    parse = re.sub(r"((?<=[\n】])|^)\s*解\s*[::]", "", one_item["parse"])

    if not one_item["type"]:
        one_item["type"] = _guess_type(ans, con)
    elif _is_choice_type(one_item["type"]):
        # The type given by the paper may be sloppy or plainly wrong: several
        # numbered sub-questions, or no recognisable option layout, means this
        # is not really a choice question.
        has_sub_questions = len(_SUB_QUESTION_NO_RE.findall("\n" + con)) > 1
        lacks_options = (_BRACKET_THEN_OPTION_RE.search(con) is None
                         and len(_OPTION_LABEL_RE.findall(con)) < 2)
        if has_sub_questions or lacks_options:
            one_item["type"] = "填空题" if _BLANK_RE.search(con) else "解答题"

    if _is_choice_type(one_item["type"]):
        one_item = option_structure(one_item, con, ans, item_no_type)
        one_item["stem"] = re.sub(r'(?<![/"]>)(<br\s*/?>|\n)+\s*(?!<img)', "", one_item["stem"])
        # NOTE(review): the nesting below was reconstructed from a listing
        # that had lost its indentation; confirm that the table fallback
        # belongs under the "options missing" branch.
        if "options" not in one_item:
            one_item["options"] = []
            # Second attempt: options laid out in a table.
            if "<table>" in one_item["stem"]:
                may_options = table_option_struc(one_item["stem"])
                if may_options:
                    one_item["options"] = may_options
                    one_item["options_rank"] = 2
                    one_item["errmsgs"] = [emg for emg in one_item["errmsgs"]
                                           if "选项格式不正确" not in emg]
        else:
            # Structuring succeeded: recompute the option arrangement with
            # the alternative approach and prefer it when it yields a value.
            options_rank_2 = new_options_rank(one_item["options"])
            if options_rank_2:
                one_item["options_rank"] = options_rank_2
        one_item["blank_num"] = len(_BLANK_RE.findall(one_item["stem"]))
        one_item["answer_type"] = "选择题"
    elif consumer == 'toslave':
        # Split into sub-questions.
        one_item = get_slave(one_item, con, parse, ans)
    else:
        # Non-choice question that is not split into sub-questions.
        if "#" in ans:
            one_item["key"] = one_item["key"].replace("#", "; ")

        if _BLANK_RE.search(one_item["stem"]):
            one_item["blank_num"] = len(_BLANK_RE.findall(one_item["stem"]))
        elif re.search("^[A-Z]{2,}$", _NON_SYMBOL_RE.sub("", ans)):
            # Key is nothing but capital letters once punctuation is removed.
            # The original author flagged this heuristic as error-prone.
            one_item["type"] = "多选题"
            one_item = _structure_as_choice(one_item, con, ans, item_no_type)

        if one_item["type"] == "填空题" and re.search("_{2,}|填正确答案", one_item['stem']) is None:
            # Declared fill-in question without a visible blank.  (Per the
            # original note, answers typed directly on the blank line are
            # adjusted in huanhang_wash_after, not here.)
            stem_now = one_item["stem"]
            key = ans.strip()
            if _LETTER_KEY_RE.match(key):
                one_item["type"] = "单选题" if len(key) == 1 else "多选题"
                one_item = _structure_as_choice(one_item, con, ans, item_no_type)
            elif _EMPTY_BRACKET_RE.search(stem_now) or (
                    '步骤' not in stem_now and len(_OPTION_LABEL_RE.findall(stem_now)) >= 4):
                one_item["type"] = "选择题"
                one_item = _structure_as_choice(one_item, con, ans, item_no_type)
            elif re.findall(r'(有|存在|[是为==])[ \s]{3,}[a-zA-Z]', stem_now):
                # Blanks rendered as runs of whitespace before a unit/letter.
                one_item["blank_num"] = len(
                    re.findall(r'(有|存在|[是为==])[ \s]{3,}[a-zA-Z]', stem_now))
            elif re.findall(r'[ \s]{3,}[a-zA-Z]\s*[,;.。;,]', stem_now):
                one_item["blank_num"] = len(
                    re.findall(r'\s{3,}\n*\s*[a-zA-Z]\s*[,;.。;,.]', stem_now))
            elif (_INLINE_ANSWER_RE_1.search(stem_now) is None
                  and _INLINE_ANSWER_RE_2.search(stem_now) is None):
                # No blank of any recognised form: treat a non-trivial stem
                # as a worked-answer question.
                stripped = re.sub("<img src=.*?/>|[,,.。.、、]", "", stem_now)
                if len(stripped) > 2:
                    one_item["type"] = "解答题"

    if one_item:
        # Strip a leading "[...](... N分)" score annotation; only the first
        # 20 characters are searched so parentheses later in the stem stay.
        head, tail = one_item["stem"][:20], one_item["stem"][20:]
        one_item["stem"] = re.sub(r"(\[.*?\])?\(.*?\d+分\)", "", head) + tail
    return one_item
|