stem_ans_split.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224
  1. #!/usr/bin/env python
  2. # -*- coding:utf-8 -*-
  3. import re
  4. from utils.washutil import table_label_cleal
  5. def stem_ans_split(one_item_dict, case):
  6. """
  7. 将切出来的一道题 按 答案解析 进一步细分
  8. :param one_item_dict: 单道题的初步结构字典{"stem": , "item_id": , "errmsgs": [],"type":,}
  9. :param case: 属于哪种情况
  10. :return: {"stem": ,"key": ,"parse":}
  11. """
  12. one_item = one_item_dict["stem"]
  13. item_type = one_item_dict["type"]
  14. # print(one_item)
  15. if case == 'case0': # 没“答案”关键字
  16. inside_split = re.split(r"【(解析|解答|分析|详解|点评|点睛|考点|专题)】\n*?",
  17. table_label_cleal(one_item))
  18. inside_split = ['【' + a + '】' if str(a).strip() in ['解答', '分析', '解析', '详解', '点评', '点睛']
  19. else str(a).replace('None', '').strip() for a in inside_split]
  20. # print(':::', inside_split)
  21. # print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
  22. dd = dict(zip(["stem", "parse_title"], inside_split[0:2]))
  23. dd["parse"] = str(dd["parse_title"]) + "\n".join(inside_split[2:]).replace("\n\n", "\n")
  24. dd["parse"] = re.sub(r"^\s*【解析】", "", dd["parse"])
  25. dd["key"] = ""
  26. else: # if case == 'case1': # 有“答案”关键字
  27. dd = dict(zip(["stem", "key"], re.split(r"【答案】\n?|(?<=[\n】])\s*答案\s*[::]",
  28. table_label_cleal(one_item), maxsplit=1)))
  29. # pprint(dd) # 一般默认‘答案’在‘解析’的前面
  30. subdd = dict(zip(["key", "parse_title", "parse"],
  31. re.split(r"【(解析|解答|分析|详解|点评|点睛)】\n?", dd["key"], maxsplit=1)))
  32. dd["key"] = subdd["key"]
  33. if "parse_title" in subdd:
  34. # print(subdd["parse_title"])
  35. dd["parse"] = "【" + subdd["parse_title"] + "】" + subdd["parse"]
  36. if subdd["parse_title"] not in ["解析", "解答", "详解"]:
  37. may_parse = re.split(r"【(解析|解答|分析|详解)】\n*?", dd["stem"], maxsplit=1)
  38. if len(may_parse) == 3:
  39. dd["stem"] = may_parse[0]
  40. dd["parse"] = "【" + may_parse[1] + "】" + may_parse[2] + dd["parse"]
  41. dd["parse"] = re.sub(r"^\s*【解析】", "", dd["parse"])
  42. else:
  43. dd["parse"] = ""
  44. dd["stem"] = re.sub(r"[1-9][0-9]?\s*[..、、]", "", dd["stem"][:5]) + dd["stem"][5:]
  45. # 获取答案
  46. if not dd["key"]:
  47. dd["key"] = get_ans_from_parse(dd["parse"], item_type, dd["stem"])
  48. # 补充!!!------------------------------------------
  49. # if item_type in ["单选题", "多选题", "选择题"]: # (故选[::]([A-Z;;和与、、]+)|
  50. # ans = re.search(r'故选[::]?<imgsrc=[^>]+?data-latex="\$?([A-Z;;和与、、\s]+)\$?".+?/>|故选[::]?([A-Z;;和与、、\s]+)',
  51. # dd["parse"].replace("$", "").replace(" ", ""))
  52. # if ans:
  53. # dd["key"] = ans.group(1) if ans.group(1) is not None else ans.group(2) # ans.group(1) != None
  54. # else:
  55. # dd["key"] = ""
  56. # else:
  57. # dd["key"] = "见解析"
  58. # ans = re.search(r'故\s*[::]?\s*答案分?别?[为是]?\s*[::]?\s*(.+?)[..]\s*\n', dd["parse"])
  59. # if ans:
  60. # dd["key"] = ans.group(1)
  61. # ------------------------------------------------------
  62. if "parse_title" in dd:
  63. del dd["parse_title"]
  64. return dd
  65. def get_ans_from_parse(item_parse, item_type, res_con):
  66. """
  67. 从已知解析中 挑选 答案
  68. :param item_parse: 总解析
  69. :param item_type: 题型
  70. :return:
  71. """
  72. item_parse = re.split("【点评】|【点睛】", item_parse)[0].strip()
  73. # 将解析中末尾出现的图片去掉
  74. while re.search('\n\s*<imgsrc\d+\sw_h=(\d+\*\d{3})/>\s*$', item_parse):
  75. item_parse = re.sub('\n\s*<imgsrc\d+\sw_h=(\d+\*\d{3})/>\s*$', "", item_parse)
  76. item_ans = ""
  77. if item_type.replace("题", "") in ["单选", "多选", "选择", "不定选择"]:
  78. ans = re.search(r'故选\s*[::]?\s*<imgsrc\d+\sdata-latex="([A-Z;;和与、、\s]+)"/>'
  79. r'|故选\s*[::]?\s*([A-Z;;和与、、\s]+)', item_parse.replace("$", ""))
  80. if ans:
  81. item_ans = ans.group(1) if ans.group(1) is not None else ans.group(2)
  82. item_ans = re.sub(r"[.;;.]\s*$", "", item_ans)
  83. elif not ans:
  84. item_ans = "见解析"
  85. elif item_type:
  86. ans0 = re.search(r'故选\s*[::]?\s*([A-Z;;和与、、\s]+)[..;;。]?$', item_parse) # 试验题中可能还有选择题
  87. ans01 = re.search(r'故选\s*[::]\s*<imgsrc\d+\sdata-latex="([A-Z;;和与、、\s]+)"/>', item_parse.replace("$", "").replace("\[", "").replace("\]", "")) # 选择题的题型可能前面分错
  88. ans1 = re.search(r'(故|因[而此]|所以)\s*[::]?\s*(答案分?别?[为是填]?|填)\s*[::]?\s*(((?!(<img)).)+?)[..]?\s*(\n|$)', item_parse)
  89. ans11 = re.search(r'((?<!解)答\s*[::]|整理得\s*[::]?)\s*(.+?)([..;;]?\s*$|[..]\s*\n)', item_parse)
  90. ans2 = re.search(r'(故|因[而此]|所以)\s*[::]?\s*(答案分?别?[为是填]?|填)\s*[::]?\s*(<imgsrc.+?/>)[..]?\s*(\n|$)', item_parse, re.S)
  91. ans22 = re.search(r'(故|因[而此]|所以)\s*[::]?\s*(答案分?别?[为是填]?|填)\s*[::]?\s*([^∴∵因所故即【】]+?)([..]\s*(\n|$)|$)', item_parse)
  92. ans21 = re.search(r'综上所述\s*[::]\s*([^∴∵故因所即【】]+?)[..;;]\s*$', item_parse)
  93. ans3 = re.search(r'(故|因[而此]|所以|∴)\s*[::]?.+?[为是填]\s*[::]?\s*([^∴∵故因所即【】]+?)([..;;,,]\s*$|[..]\s*\n)', item_parse)
  94. ans31 = re.search(r'(故|因[而此]|所以|∴)\s*([^当为是填∴∵故因所即则【】]+?)[..;;]\s*$', item_parse)
  95. ans32 = re.search(r'(故|因[而此]|所以)\s*[::]?[^当为是填∴∵故因所即【】]+?[为是填]\s*[::]?\s*(<imgsrc.+?/>)[..]?\s*(\n|$)',
  96. item_parse, re.S)
  97. ans4 = re.search(r'\n\s*[==]([^=\n]+?)[..]?\s*$', item_parse)
  98. ans42 = re.search(r'[==](?!")(((?!([故=∴即]|原式|因[而此]|所以|\n|=[^"])).)+?)[..]?\s*$', item_parse)
  99. ans41 = re.search(r'原式\s*[==].+?[==](?!")(((?!(=|=[^"])).)+?|\s*<imgsrc.+?/>)([..]?\s*$|[..]\s*\n)', item_parse)
  100. if not (item_type == '填空题' and len(re.findall(r"_{2,}|_+([^_]*?)_+", res_con)) == 1) and \
  101. len(re.findall(r"[((]\d[))]|[\n::;;。】]([((](i{1,3}|[ⅰⅱⅲⅳⅠⅡⅢIV①②③④])[))]|[①②③④]\s*(?![+-]))",
  102. item_parse.replace(" ", ""))) > 1 or "证明" in item_parse:
  103. item_ans = "见解析"
  104. elif ans0:
  105. item_ans = ans0.group(1)
  106. elif ans01:
  107. item_ans = ans01.group(1)
  108. elif ans1 or ans11:
  109. item_ans = ans1.group(3) if ans1 else ans11.group(2)
  110. elif ans2:
  111. item_ans = ans2.group(3)
  112. elif ans22:
  113. item_ans = ans22.group(3)
  114. elif ans21:
  115. item_ans = ans21.group(1)
  116. elif (ans3 or ans31 or ans32) and '证明' not in item_parse:
  117. if ans3:
  118. item_ans = ans3.group(2)
  119. if ans31:
  120. item_ans = ans31.group(2)
  121. if ans32:
  122. item_ans = ans32.group(2)
  123. elif (ans4 or ans41 or ans42) and '证明' not in item_parse:
  124. if ans4:
  125. item_ans = ans4.group(1)
  126. if ans41:
  127. item_ans = ans41.group(1)
  128. if ans42:
  129. item_ans = ans42.group(1)
  130. else:
  131. item_ans = "见解析"
  132. return item_ans
  133. def get_split_pos(row_list):
  134. """
  135. 获取题目、答案的切分位置
  136. :return:
  137. """
  138. # 寻找题目和答案的切分点,一定要有“答案”关键字
  139. split_p1 = [k for k, v in enumerate(row_list)
  140. if re.match(r'(参考|试[题卷]|考试|物理|理综|数学|化学|生物)答案.{,5}$|答案[和与及]?解析([((].*?[))])?$' # |答\s*案$
  141. r'|.{,15}(参考|考试|(考?试|检测)[题卷]|物理|理综|数学|化学|生物)(答案|解析|答案[及与和]评分(标准|意见|细则|参考))\s*$'
  142. r'|.{,15}评分(标准|参考)|((参考|(考?试|检测)[题卷]|考试|物理|理综|数学|化学|生物)答案|答案[和与及]解析)[\dA-E\s..、、]+$'
  143. r'|.{,15}(参考|考试|(考?试|检测)[题卷])(答案|解析|答案[及与和]评分(标准|意见|细则|参考))\s*(物理|理综|数学|化学|生物)?\s*$'
  144. r'|.{,15}解析[和与及]答案$',
  145. re.sub(r"[上下]?学[年期]|[\d—【】..·、、::(())年\s]|[中大]学|模拟|[中高]考|年级|[学期][末中]"
  146. r"|[高初][一二三]|部分", "", v.strip()))]
  147. if split_p1 and split_p1[0] < 30:
  148. if len(re.sub("<imgsrc.*?/>|\s", "", "".join(row_list[:split_p1[0]])).strip())<60:
  149. return "题文全是图片,本通道无法解析"
  150. split_p1 = [p for p in split_p1 if p > 30]
  151. print("答案split_p1:", split_p1)
  152. # 没有答案关键字时
  153. split_p0 = [k for k, v in enumerate(row_list)
  154. if re.search("([中高联月]考|单元测试|随堂练|(摸底|模拟|收心)?考试(评估)?|学期|[寒暑]假作业)[一二三四五六七八九试题((卷))\s]*?$",
  155. re.sub(r"[上下]?学[年期]度?|[\d—【】..、、::(())年第\s]|[中大]学|模拟|[中高]考|年级|[学期][末中]|[高初][一二三]", "", v.strip()))]
  156. split_p0 = [p for p in split_p0 if p > 30]
  157. print("试卷标题split_p0:", split_p0)
  158. if not split_p0:
  159. split_p0 = [k for k, v in enumerate(row_list)
  160. if re.search("^\s*第\s*[一IⅠ]\s*卷\s*([((]|非?选择题)", v.strip())]
  161. split_p0 = [p for p in split_p0 if p > 30]
  162. print("试卷标题split_p01:", split_p0)
  163. # if not split_p and len(re.split("【答案】", "@@\n".join(row_list))) == 2: # 参考答案的关键字只用了【答案】
  164. # split_p.insert(0, len(re.split("【答案】", "@@\n".join(row_list))[0].split("@@\n"))-1)
  165. # print("split_p1:", split_p1)
  166. items_list, ans_list = [], []
  167. # ===================================题目切分======================================================
  168. pattern1 = re.compile("([中高联月]考|单元测试|随堂练|(摸底|模拟|收心)考试|学期)[试题((卷))\s]*?$"
  169. "|密[…O•.\s]*?封[….O•\s]*?装?[…O•.\s]*?订?[….O•\s]*?线?"
  170. "|((学校|班级|姓名|座位?号|准考号|学号)[\s::_]*?){2,}|^\n*\s*\n*$")
  171. is_may_ans = 0
  172. if not split_p1 and split_p0:
  173. split_p1 = [split_p0[-1]] # 优先当答案使用,选最后一个
  174. is_may_ans = 1
  175. if split_p1:
  176. new_p1 = split_p1[0]
  177. may_omit_info = re.match("((参考|试[题卷]|考试|物理|理综|数学|化学|生物)答案|答案和解析)([\dA-E\s..、、]+)$", row_list[new_p1])
  178. if may_omit_info and re.findall("[A-E]", may_omit_info.group(3)):
  179. row_list.insert(new_p1+1, may_omit_info.group(3))
  180. while re.search(pattern1, row_list[new_p1 - 1]):
  181. new_p1 -= 1
  182. items_list = row_list[:new_p1]
  183. ans_list = row_list[split_p1[0] + 1:]
  184. # 再判断是否有答题卷
  185. split_p2 = [k for k, v in enumerate(row_list[:split_p1[0]])
  186. if re.match(".*?(答题?[卷卡页]纸?|试卷细目表)\s*$|\s*本卷.*?答题卡",
  187. re.sub(r"[\d—【】年]|[中大]学|模拟|[中高]考|物理|理综|数学|化学|生物", "", v.strip()))]
  188. if split_p2: # 答案前
  189. while re.search(pattern1, items_list[split_p2[0] - 1]):
  190. split_p2[0] -= 1
  191. items_list = items_list[:split_p2[0]]
  192. else:
  193. split_p2 = [k for k, v in enumerate(ans_list)
  194. if re.match(".*?(答题?[卷卡页]纸?|试卷细目表)\s*$|\s*本卷.*?答题卡",
  195. re.sub(r"[\d—【】年]|[中大]学|模拟|[中高]考|物理|理综|数学|化学|生物", "", v.strip()))]
  196. if split_p2: # 答案后
  197. while re.search(pattern1, ans_list[split_p2[0] - 1]):
  198. split_p2[0] -= 1
  199. ans_list = ans_list[:split_p2[0]]
  200. # else:
  201. # 没有答案页,但可能也有答题卡
  202. split_p2 = [k for k, v in enumerate(row_list)
  203. if re.match(".*?(答题?[卷卡页]纸?|试卷细目表)\s*$|\s*本卷.*?答题卡",
  204. re.sub(r"[\d—【】年]|[中大]学|模拟|[中高]考|物理|理综|数学|化学|生物", "", v.strip()))]
  205. print("答题卡split_p2:", split_p2)
  206. if split_p2:
  207. while re.search(pattern1, row_list[split_p2[0] - 1]):
  208. split_p2[0] -= 1
  209. row_list = row_list[:split_p2[0]]
  210. return row_list, items_list, ans_list, is_may_ans