# final_structure.py (10.0 KB)
# Structuring logic for each question type.
  2. import re
  3. from structure.option import option_structure, new_options_rank, table_option_struc
  4. from structure.dati2slave import get_slave
  5. def one_item_structure(xyz):
  6. """
  7. 判断解析类型,解析类型为:
  8. if:
  9. 1.stem不需要再做其他处理<-- 答案没有[;;],且答案不是ABCDEFG
  10. 2.选择题类,需要把stem中的ABCD各选项内容提取出来<--答案是ABCDEFG
  11. else:
  12. 都要看是否含有小题,如果含有小题,需要把小题提取出来,slave
  13. 3.填空题类,(1)需要提取stem中下划线的个数
  14. 选择题结构化:单选或者多选<--要把各选项是什么提取出来放在slave中
  15. one_item:{"stem":xxxx,"key":xxx,"parse":xxx}
  16. consumer: 分“高中数学”还是“全学科”;
  17. item_no_type:题号是否以(\d)的形式
  18. :return:
  19. """
  20. one_item, consumer, item_no_type, subject, is_danti = xyz
  21. # print(one_item)
  22. if "【章节】" in one_item["parse"]: # 属于后一个题的,后面须调整
  23. one_item["chapter"] = one_item["parse"].split("【章节】")[1].split("\n")[0]
  24. one_item["parse"] = one_item["parse"].replace("【章节】" + one_item["chapter"], "")
  25. if "【章节】" in one_item["stem"]: # 属于后一个题的,后面须调整
  26. one_item["chapter"] = one_item["stem"].split("【章节】")[1].split("\n")[0]
  27. one_item["stem"] = one_item["stem"].replace("【章节】" + one_item["chapter"], "")
  28. if "【选做题】" in one_item["stem"] + one_item["key"] + one_item["parse"]:
  29. opt_str = re.search(r"【选做题】:'(\d+)分'", one_item["stem"] + one_item["key"] + one_item["parse"])
  30. one_item["option_st"] = "选做题,"+opt_str.group(1) if opt_str else "选做题" # 选做题开始的位置,后面的题开始是选做题
  31. one_item["stem"] = re.sub("【选做题】(:'(\d+)分')?", "", one_item["stem"])
  32. one_item["key"] = re.sub("【选做题】(:'(\d+)分')?", "", one_item["key"])
  33. one_item["parse"] = re.sub("【选做题】(:'(\d+)分')?", "", one_item["parse"])
  34. ans = one_item["key"]
  35. con = one_item["stem"]
  36. parse = re.sub(r"((?<=[\n】])|^)\s*解\s*[::]", "", one_item["parse"])
  37. topic_type_cn = ""
  38. if not one_item["type"] and (not is_danti or subject == "地理"):
  39. # one_item["errmsgs"].append("本题没有给出明确题型!")
  40. # return one_item
  41. if re.match(r"[A-Z][A-Zc;;和与、、\s]*?$", ans.strip()):
  42. one_item["type"] = "单选题" if len(ans.strip()) == 1 else "多选题"
  43. elif re.search(r"[((]\s*[))]", one_item["stem"]) or \
  44. len(re.findall(r"[\n\s\u4e00-\u9fa5①②③④]\s*[A-D]\s*[..、、]|\s/>[A-D]\s*[..、、]", one_item["stem"])) >= 4:
  45. one_item["type"] = "选择题"
  46. elif re.findall(r"_{2,}", one_item["stem"]):
  47. one_item["type"] = "填空题"
  48. else:
  49. one_item["type"] = "解答题"
  50. elif subject == "语文": # one_item["type"] == "综合题" and
  51. if re.match(r"[A-Z][A-Z;;和与、、\s]*?$", ans.replace("c", "C").strip()):
  52. topic_type_cn = "选择题"
  53. elif len(re.findall(r"[\n\s\u4e00-\u9fa5①②③④]\s*[A-D]\s*[..、、]", one_item["stem"])) >= 3:
  54. # re.search(r"[((]\s*[))]", one_item["stem"]) 这个条件不行
  55. if len(re.findall(r"\n\s*[((]\s*\d\s*[))]\s*[..、、]?", one_item["stem"])) >= 2:
  56. topic_type_cn = "小题多问"
  57. else:
  58. topic_type_cn = "选择题"
  59. if one_item["type"].replace("题", "") in ["单选", "多选", "选择", "不定选择"]:
  60. if subject == "地理":
  61. if len(re.findall(r"\n\s*[((]\s*\d\s*[))]\s*[..、、]?", one_item["stem"])) >= 2:
  62. one_item["type"] = "小题多问"
  63. if re.match(r"【?(对|正确的?|错误?的?|F|T)】?$", ans.strip()):
  64. one_item["type"] = "判断题"
  65. topic_type = one_item["type"]
  66. # print(topic_type, topic_type_cn)
  67. # print(one_item)
  68. if topic_type.replace("题", "") in ["单选", "多选", "选择", "不定选择"] or topic_type_cn == "选择题":
  69. one_item = option_structure(one_item, con, ans, item_no_type)
  70. if 'options' not in one_item:
  71. one_item["options"] = []
  72. # 表格类型的选项再解析,
  73. one_item["errmsgs"] = [emg for emg in one_item["errmsgs"] if "选项格式不正确" not in emg]
  74. if "<table>" in one_item["stem"]:
  75. may_options = table_option_struc(one_item["stem"])
  76. if may_options:
  77. one_item["options"] = may_options
  78. one_item["options_rank"] = 2
  79. else: # 走toslave
  80. non_option_structure(one_item, con, parse, ans, topic_type)
  81. else:# 走toslave
  82. non_option_structure(one_item, con, parse, ans, topic_type)
  83. else: # 选择题结构化成功时,对选项排列方式再换思路算
  84. options_rank_2 = new_options_rank(one_item["options"])
  85. if options_rank_2:
  86. one_item["options_rank"] = options_rank_2
  87. one_item["blank_num"] = len(re.findall(r"_{2,}", one_item["stem"]))
  88. one_item["answer_type"] = "选择题"
  89. elif consumer == 'toslave': # 拆小问
  90. non_option_structure(one_item, con, parse, ans, topic_type)
  91. else: # 不拆小问,非选择题
  92. if "#" in ans:
  93. one_item["key"] = one_item["key"].replace("#", "; ")
  94. pattern1 = re.compile(r"([是为点]|等于|=|=|有|存在)\s*_+((<img src=((?!/>).)*?/>|[^_;;。?!\n])+?)_+"
  95. r"([cdkm上]?m?\s*.?[。.?]?\s*($|<br/>|<img src|……))")
  96. pattern2 = re.compile(r"((有|存在|[是为])[\u4e00-\u9fa5]{0,2})\s*_+(\d+)_+\s*([\u4e00-\u9fa5,,;;。..])")
  97. if re.findall(r"_{2,}", one_item["stem"]): # re.search("_+([^_]*?)_+", one_item['stem']):
  98. one_item["blank_num"] = len(re.findall(r"_{2,}", one_item["stem"]))
  99. else:
  100. one_item["blank_num"] = 0
  101. # 是否只需将所有标点符号去除即可,这里容易判断错误!!!!
  102. if re.search("^[A-Z]{2,}$",
  103. re.sub(r"[^\w><≤≥≡≦≧+-≈≠﹢﹣±㏒㏑∑∏π><==×÷/()()﹙﹚\[\]﹛﹜{\}∧∨∠▰▱△∆⊙⌒"
  104. r"⊆⊂⊇⊃∈∩∉∪⊕∥∣≌∽∞∝⊥∫∬∮∯Φ∅≮≯∁∴∵∷←↑→↓↖↗↘↙‖〒¤○′″¢°℃℉"
  105. r"αβγδεζηθικλμνξορστυφχψωϕ%‰℅㎎㎏㎜㎝㎞㎡㎥㏄㏎㏕$£¥º¹²³⁴ⁿ₁₂₃₄·∶½⅓⅔¼¾⅛⅜⅝⅞"
  106. r"ΑΒΓΔΕΖΗΘΙΚΜ]", "", ans)):
  107. one_item["type"] = "多选题"
  108. one_item = option_structure(one_item, con, ans, item_no_type)
  109. if 'options' not in one_item:
  110. one_item["options"] = []
  111. if one_item["type"] == "填空题" and re.search("_{2,}|填正确答案", one_item['stem']) is None:
  112. if re.match(r"[A-Z][A-Z;;和与、、\s]*?$", ans.strip()):
  113. one_item["type"] = "单选题" if len(ans.strip()) == 1 else "多选题"
  114. one_item = option_structure(one_item, con, ans, item_no_type)
  115. if 'options' not in one_item:
  116. one_item["options"] = []
  117. elif re.search(r"[((]\s*[))]", one_item["stem"]) or ('步骤' not in one_item["stem"] and
  118. len(re.findall(r"[\n\s]\s*[A-D]\s*[..、、]", one_item["stem"])) >= 4):
  119. one_item["type"] = "选择题"
  120. one_item = option_structure(one_item, con, ans, item_no_type)
  121. if 'options' not in one_item:
  122. one_item["options"] = []
  123. elif re.findall('(有|存在|[是为==])[ \s]{3,}[a-zA-Z]', one_item["stem"]):
  124. one_item["blank_num"] = len(re.findall('(有|存在|[是为==])[ \s]{3,}[a-zA-Z]', one_item["stem"]))
  125. elif re.findall('[ \s]{3,}[a-zA-Z]\s*[,;.。;,]', one_item["stem"]):
  126. one_item["blank_num"] = len(re.findall('\s{3,}\n*\s*[a-zA-Z]\s*[,;.。;,.]', one_item["stem"]))
  127. elif re.search(pattern1, one_item["stem"]) is None and re.search(pattern2, one_item["stem"]) is None:
  128. stem = re.sub("<img src=.*?/>|[,,.。.、、]", "", one_item["stem"])
  129. if len(stem) > 2:
  130. one_item["type"] = "解答题"
  131. # print('------------------------------------------------')
  132. if one_item:
  133. # if re.match(r"(\[.*?\])?\s*\(.*?(\d+)分\)", one_item["stem"].strip()):
  134. # # print(one_item["stem"])
  135. # score_info = re.match(r"(\[.*?\])?\s*\(.*?(\d+)分\)", one_item["stem"].strip())
  136. # one_item["score"] = float(score_info.group(2))
  137. one_item["stem"] = re.sub(r"(\[.*?\])?\(.*?\d+分\)", "", one_item["stem"][:20]) + one_item["stem"][20:]
  138. return one_item
  139. def non_option_structure(one_item, con, parse, ans, topic_type):
  140. """
  141. :return:
  142. """
  143. if topic_type in ["作文", "书面表达", "写作"]:
  144. one_item["answer_type"] = "语文作文"
  145. pass
  146. elif re.search("_{2,}</td>|^\s*.{,6}(表格|作文|书面表达).{,7}|写一篇文章|作文题目", con):
  147. if re.search("_{2,}</td>|^\s*.{,6}表格", con) is None:
  148. one_item["answer_type"] = "语文作文"
  149. else:
  150. blank_num = len(re.findall(r"_{2,}", con))
  151. if blank_num > 0:
  152. one_item["blank_num"] = blank_num
  153. one_item["answer_type"] = "填空题"
  154. else:
  155. one_item["answer_type"] = "解答题"
  156. if re.search("_{2,}</td>", con): # 也可以在带小题中
  157. one_item = get_slave(one_item, con, parse, ans)
  158. else:
  159. one_item = get_slave(one_item, con, parse, ans)
  160. # if 'options' not in one_item:
  161. # one_item["options"] = []
  162. return one_item