stem_ans_split.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217
#!/usr/bin/env python
  2. # -*- coding:utf-8 -*-
  3. import re
  4. from utils.washutil import table_label_cleal
  5. def stem_ans_split(one_item_dict, case):
  6. """
  7. 将切出来的一道题 按 答案解析 进一步细分
  8. :param one_item_dict: 单道题的初步结构字典{"content": , "item_id": , "errmsgs": [],"item_topic_name":,}
  9. :param case: 属于哪种情况
  10. :return: {"content": ,"answer": ,"parse":}
  11. """
  12. one_item = one_item_dict["content"]
  13. item_type = one_item_dict["item_topic_name"]
  14. # print(one_item)
  15. if case == 'case0': # 没“答案”关键字
  16. inside_split = re.split(r"【(解析|解答|分析|详解|点评|点睛|考点|专题)】\n*?",
  17. table_label_cleal(one_item))
  18. inside_split = ['【' + a + '】' if str(a).strip() in ['解答', '分析', '解析', '详解', '点评', '点睛']
  19. else str(a).replace('None', '').strip() for a in inside_split]
  20. # print(':::', inside_split)
  21. # print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
  22. dd = dict(zip(["content", "parse_title"], inside_split[0:2]))
  23. dd["parse"] = str(dd["parse_title"]) + "\n".join(inside_split[2:]).replace("\n\n", "\n")
  24. dd["parse"] = re.sub(r"^\s*【解析】", "", dd["parse"])
  25. dd["answer"] = ""
  26. else: # if case == 'case1': # 有“答案”关键字
  27. dd = dict(zip(["content", "answer"], re.split(r"【答案】\n?",
  28. table_label_cleal(one_item), maxsplit=1)))
  29. # pprint(dd) # 一般默认‘答案’在‘解析’的前面
  30. subdd = dict(zip(["answer", "parse_title", "parse"],
  31. re.split(r"【(解析|解答|分析|详解|点评|点睛)】\n?", dd["answer"], maxsplit=1)))
  32. dd["answer"] = subdd["answer"]
  33. if "parse_title" in subdd:
  34. dd["parse"] = "【" + subdd["parse_title"] + "】" + subdd["parse"]
  35. dd["parse"] = re.sub(r"^\s*【解析】", "", dd["parse"])
  36. else:
  37. dd["parse"] = ""
  38. dd["content"] = re.sub(r"[1-9][0-9]?\s*[..、、]", "", dd["content"][:5]) + dd["content"][5:]
  39. # 获取答案
  40. if not dd["answer"]:
  41. dd["answer"] = get_ans_from_parse(dd["parse"], item_type, dd["content"])
  42. # 补充!!!------------------------------------------
  43. # if item_type in ["单选题", "多选题", "选择题"]: # (故选[::]([A-Z;;和与、、]+)|
  44. # ans = re.search(r'故选[::]?<imgsrc=[^>]+?data-latex="\$?([A-Z;;和与、、\s]+)\$?".+?/>|故选[::]?([A-Z;;和与、、\s]+)',
  45. # dd["parse"].replace("$", "").replace(" ", ""))
  46. # if ans:
  47. # dd["answer"] = ans.group(1) if ans.group(1) is not None else ans.group(2) # ans.group(1) != None
  48. # else:
  49. # dd["answer"] = ""
  50. # else:
  51. # dd["answer"] = "见解析"
  52. # ans = re.search(r'故\s*[::]?\s*答案分?别?[为是]?\s*[::]?\s*(.+?)[..]\s*\n', dd["parse"])
  53. # if ans:
  54. # dd["answer"] = ans.group(1)
  55. # ------------------------------------------------------
  56. if "parse_title" in dd:
  57. del dd["parse_title"]
  58. return dd
  59. def get_ans_from_parse(item_parse, item_type, res_con):
  60. """
  61. 从已知解析中 挑选 答案
  62. :param item_parse: 总解析
  63. :param item_type: 题型
  64. :return:
  65. """
  66. item_parse = re.split("【点评】|【点睛】", item_parse)[0].strip()
  67. # 将解析中末尾出现的图片去掉
  68. while re.search('\n\s*<imgsrc\d+\sw_h=(\d+\*\d{3})/>\s*$', item_parse):
  69. item_parse = re.sub('\n\s*<imgsrc\d+\sw_h=(\d+\*\d{3})/>\s*$', "", item_parse)
  70. item_ans = ""
  71. if item_type.replace("题", "") in ["单选", "多选", "选择", "单项选择", "多项选择"]:
  72. ans = re.search(r'故选\s*[::]?\s*<imgsrc\d+\sdata-latex="\$?([A-Z;;和与、、\s]+)\$?"/>'
  73. r'|故选\s*[::]?\s*([A-Z;;和与、、\s]+)', item_parse.replace("$", ""))
  74. if ans:
  75. item_ans = ans.group(1) if ans.group(1) is not None else ans.group(2)
  76. item_ans = re.sub(r"[.;;.]\s*$", "", item_ans)
  77. elif not ans:
  78. item_ans = "见解析"
  79. elif item_type:
  80. ans0 = re.search(r'故选\s*[::]?\s*([A-Z;;和与、、\s]+)[..;;。]?$', item_parse) # 试验题中可能还有选择题
  81. ans01 = re.search(r'故选\s*[::]\s*<imgsrc\d+\sdata-latex="\$?([A-Z;;和与、、\s]+)\$?"/>', item_parse) # 选择题的题型可能前面分错
  82. ans1 = re.search(r'(故|因[而此]|所以)\s*[::]?\s*(答案分?别?[为是填]?|填)\s*[::]?\s*(((?!(<img)).)+?)[..]?\s*(\n|$)', item_parse)
  83. ans11 = re.search(r'((?<!解)答\s*[::]|整理得\s*[::]?)\s*(.+?)([..;;]?\s*$|[..]\s*\n)', item_parse)
  84. ans2 = re.search(r'(故|因[而此]|所以)\s*[::]?\s*(答案分?别?[为是填]?|填)\s*[::]?\s*(<imgsrc.+?/>)[..]?\s*(\n|$)', item_parse, re.S)
  85. ans22 = re.search(r'(故|因[而此]|所以)\s*[::]?\s*(答案分?别?[为是填]?|填)\s*[::]?\s*([^∴∵因所故即【】]+?)([..]\s*(\n|$)|$)', item_parse)
  86. ans21 = re.search(r'综上所述\s*[::]\s*([^∴∵故因所即【】]+?)[..;;]\s*$', item_parse)
  87. ans3 = re.search(r'(故|因[而此]|所以|∴)\s*[::]?.+?[为是填]\s*[::]?\s*([^∴∵故因所即【】]+?)([..;;,,]\s*$|[..]\s*\n)', item_parse)
  88. ans31 = re.search(r'(故|因[而此]|所以|∴)\s*([^当为是填∴∵故因所即则【】]+?)[..;;]\s*$', item_parse)
  89. ans32 = re.search(r'(故|因[而此]|所以)\s*[::]?[^当为是填∴∵故因所即【】]+?[为是填]\s*[::]?\s*(<imgsrc.+?/>)[..]?\s*(\n|$)',
  90. item_parse, re.S)
  91. ans4 = re.search(r'\n\s*[==]([^=\n]+?)[..]?\s*$', item_parse)
  92. ans42 = re.search(r'[==](?!")(((?!([故=∴即]|原式|因[而此]|所以|\n|=[^"])).)+?)[..]?\s*$', item_parse)
  93. ans41 = re.search(r'原式\s*[==].+?[==](?!")(((?!(=|=[^"])).)+?|\s*<imgsrc.+?/>)([..]?\s*$|[..]\s*\n)', item_parse)
  94. if not (item_type == '填空题' and len(re.findall(r"_{2,}|_+([^_]*?)_+", res_con)) == 1) and \
  95. len(re.findall(r"[((]\d[))]|[\n::;;。】]([((](i{1,3}|[ⅰⅱⅲⅳⅠⅡⅢIV①②③④])[))]|[①②③④]\s*(?![+-]))",
  96. item_parse.replace(" ", ""))) > 1 or "证明" in item_parse:
  97. item_ans = "见解析"
  98. elif ans0:
  99. item_ans = ans0.group(1)
  100. elif ans01:
  101. item_ans = ans01.group(1)
  102. elif ans1 or ans11:
  103. item_ans = ans1.group(3) if ans1 else ans11.group(2)
  104. elif ans2:
  105. item_ans = ans2.group(3)
  106. elif ans22:
  107. item_ans = ans22.group(3)
  108. elif ans21:
  109. item_ans = ans21.group(1)
  110. elif (ans3 or ans31 or ans32) and '证明' not in item_parse:
  111. if ans3:
  112. item_ans = ans3.group(2)
  113. if ans31:
  114. item_ans = ans31.group(2)
  115. if ans32:
  116. item_ans = ans32.group(2)
  117. elif (ans4 or ans41 or ans42) and '证明' not in item_parse:
  118. if ans4:
  119. item_ans = ans4.group(1)
  120. if ans41:
  121. item_ans = ans41.group(1)
  122. if ans42:
  123. item_ans = ans42.group(1)
  124. else:
  125. item_ans = "见解析"
  126. return item_ans
  127. def get_split_pos(row_list):
  128. """
  129. 获取题目、答案的切分位置
  130. :return:
  131. """
  132. # 寻找题目和答案的切分点,一定要有“答案”关键字
  133. split_p1 = [k for k, v in enumerate(row_list)
  134. if re.match(r'(参考|试[题卷]|考试|物理|理综|数学|化学|生物)答案.{,5}$|答案[和与及]?解析([((].*?[))])?$' # |答\s*案$
  135. r'|.{,15}(参考|考试|(考?试|检测)[题卷]|物理|理综|数学|化学|生物)(答案|解析|答案[及与和]评分(标准|意见|细则|参考))\s*$'
  136. r'|.{,15}评分(标准|参考)|((参考|(考?试|检测)[题卷]|考试|物理|理综|数学|化学|生物)答案|答案[和与及]解析)[\dA-E\s..、、]+$'
  137. r'|.{,15}(参考|考试|(考?试|检测)[题卷])(答案|解析|答案[及与和]评分(标准|意见|细则|参考))\s*(物理|理综|数学|化学|生物)?\s*$'
  138. r'|.{,15}解析[和与及]答案$',
  139. re.sub(r"[上下]?学[年期]|[\d—【】..·、、::(())年\s]|[中大]学|模拟|[中高]考|年级|[学期][末中]"
  140. r"|[高初][一二三]|部分", "", v.strip()))]
  141. if split_p1 and split_p1[0] < 30:
  142. if len(re.sub("<imgsrc.*?/>|\s", "", "".join(row_list[:split_p1[0]])).strip())<60:
  143. return "题文全是图片,本通道无法解析"
  144. split_p1 = [p for p in split_p1 if p > 30]
  145. print("答案split_p1:", split_p1)
  146. # 没有答案关键字时
  147. split_p0 = [k for k, v in enumerate(row_list)
  148. if re.search("([中高联月]考|单元测试|随堂练|(摸底|模拟|收心)考试(评估)?|学期|[寒暑]假作业)[一二三四五六七八九试题((卷))\s]*?$",
  149. re.sub(r"[上下]?学[年期]|[\d—【】..、、::(())年\s]|[中大]学|模拟|[中高]考|年级|[学期][末中]|[高初][一二三]", "", v.strip()))]
  150. split_p0 = [p for p in split_p0 if p > 30]
  151. print("试卷标题split_p0:", split_p0)
  152. if not split_p0:
  153. split_p0 = [k for k, v in enumerate(row_list)
  154. if re.search("^\s*第\s*[一IⅠ]\s*卷\s*([((]|非?选择题)", v.strip())]
  155. split_p0 = [p for p in split_p0 if p > 30]
  156. print("试卷标题split_p01:", split_p0)
  157. # if not split_p and len(re.split("【答案】", "@@\n".join(row_list))) == 2: # 参考答案的关键字只用了【答案】
  158. # split_p.insert(0, len(re.split("【答案】", "@@\n".join(row_list))[0].split("@@\n"))-1)
  159. # print("split_p1:", split_p1)
  160. items_list, ans_list = [], []
  161. # ===================================题目切分======================================================
  162. pattern1 = re.compile("([中高联月]考|单元测试|随堂练|(摸底|模拟|收心)考试|学期)[试题((卷))\s]*?$"
  163. "|密[…O•.\s]*?封[….O•\s]*?装?[…O•.\s]*?订?[….O•\s]*?线?"
  164. "|((学校|班级|姓名|座位?号|准考号|学号)[\s::_]*?){2,}|^\n*\s*\n*$")
  165. is_may_ans = 0
  166. if not split_p1 and split_p0:
  167. split_p1 = split_p0 # 优先当答案使用
  168. is_may_ans = 1
  169. if split_p1:
  170. new_p1 = split_p1[0]
  171. may_omit_info = re.match("((参考|试[题卷]|考试|物理|理综|数学|化学|生物)答案|答案和解析)([\dA-E\s..、、]+)$", row_list[new_p1])
  172. if may_omit_info and re.findall("[A-E]", may_omit_info.group(3)):
  173. row_list.insert(new_p1+1, may_omit_info.group(3))
  174. while re.search(pattern1, row_list[new_p1 - 1]):
  175. new_p1 -= 1
  176. items_list = row_list[:new_p1]
  177. ans_list = row_list[split_p1[0] + 1:]
  178. # 再判断是否有答题卷
  179. split_p2 = [k for k, v in enumerate(row_list[:split_p1[0]])
  180. if re.match(".*?(答题?[卷卡页]|试卷细目表)\s*$|\s*本卷.*?答题卡",
  181. re.sub(r"[\d—【】年]|[中大]学|模拟|[中高]考|物理|理综|数学|化学|生物", "", v.strip()))]
  182. if split_p2: # 答案前
  183. while re.search(pattern1, items_list[split_p2[0] - 1]):
  184. split_p2[0] -= 1
  185. items_list = items_list[:split_p2[0]]
  186. else:
  187. split_p2 = [k for k, v in enumerate(ans_list)
  188. if re.match(".*?(答题?[卷卡页]|试卷细目表)\s*$|\s*本卷.*?答题卡",
  189. re.sub(r"[\d—【】年]|[中大]学|模拟|[中高]考|物理|理综|数学|化学|生物", "", v.strip()))]
  190. if split_p2: # 答案后
  191. while re.search(pattern1, ans_list[split_p2[0] - 1]):
  192. split_p2[0] -= 1
  193. ans_list = ans_list[:split_p2[0]]
  194. # else:
  195. # 没有答案页,但可能也有答题卡
  196. split_p2 = [k for k, v in enumerate(row_list)
  197. if re.match(".*?(答题?[卷卡页]|试卷细目表)\s*$|\s*本卷.*?答题卡",
  198. re.sub(r"[\d—【】年]|[中大]学|模拟|[中高]考|物理|理综|数学|化学|生物", "", v.strip()))]
  199. print("答题卡split_p2:", split_p2)
  200. if split_p2:
  201. while re.search(pattern1, row_list[split_p2[0] - 1]):
  202. split_p2[0] -= 1
  203. row_list = row_list[:split_p2[0]]
  204. return row_list, items_list, ans_list, is_may_ans