dati2slave.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251
#!/usr/bin/env python
  2. # -*- coding:utf-8 -*-
  3. import re
  4. from structure.ans_structure import only_parse_split
  5. from structure.option import option_structure
  6. from structure.stems_to_groups import suojin
  7. def get_slave(one_item, con, parse, ans, parse_split=1):
  8. """
  9. 带小问的大题 按小问切分
  10. parse_split=1:解析拆
  11. :return:
  12. """
  13. # if re.search(r"[;;]", ans) and len(re.findall(r"[((]\s*\d\s*[))]", con)) > 1:
  14. # 模板要求老师小题题号(1)(2)
  15. con = re.sub("(<[/a-z]+>|[((]\s*\d+\s*分\s*[))])\s*([((]\s*\d\s*[))])", r"\1" + "\n" + r"\2", con)
  16. parse = re.sub("(<[/a-z]+>)\s*([((]\s*\d\s*[))])", r"\1" + "\n" + r"\2", parse)
  17. # kuo_num = len(re.findall(r"[((]\d[))]", con.replace(" ", "")))
  18. # circle_num = len(re.findall(r"\n[((](i{1,3}|[ⅰⅱⅲⅳ①②③④⑤])[))]|\n[①②③④⑤]\s*(?![+-])", con.replace(" ", "")))
  19. # 小题干按\n(\d)拆
  20. if len(re.findall(r"\n[((]\d[))](?!小?题中)", con.replace(" ", ""))) > 1:
  21. # by_sub_item = True # 答案是按照小题获取
  22. # -----------------------题干拆分-----------------------------
  23. con = re.sub(r"((?<=[\n::;;。求])|^)\s*([((]\s*[1-9]\s*[))])\s*(?!小?题中)|\n\s*[((]\s*\d{2}\s*[))]",
  24. "【ⅳ】", con)
  25. con_list = re.split(r"【ⅳ】", con)
  26. con_list = [con_list[0]] + [c for c in con_list[1:] if c.strip()]
  27. # ---------------答案和解析拆分---------------------------------------
  28. # ans_list = [] if ans != "见解析" else "见解析"
  29. ans_list = []
  30. parse_list = []
  31. analy_comment = []
  32. com_ans = ""
  33. if parse_split: # 解析需拆分
  34. # 答案
  35. if re.search(r"(\n|\s{2,})[((](\d)[))]", ans.replace(" ", "")):
  36. if len(re.findall(r"[((]\d[))]", ans.replace(" ", ""))) > 1: # 优先按(\d)拆分
  37. ans = re.sub(r"((?<=[\n::;;。])\s*|\s{2,}|^\s*)([((]\s*\d\s*[))])", "【ⅳ】", ans)
  38. ans_list.extend(re.split(r"【ⅳ】", ans))
  39. com_ans = ans_list[0]
  40. ans_list = ans_list[1:]
  41. while ans_list and not ans_list[0]:
  42. ans_list = ans_list[1:]
  43. elif ans.strip() == "见解析":
  44. ans_list = ["见解析"] * (len(con_list) - 1)
  45. # 解析
  46. if parse:
  47. if re.search('【(详解|解析|解答|分析)】', parse): # 2020-6-10
  48. temp_parse = re.split('【详解】|【解析】|【解答】', parse, maxsplit=1)
  49. if len(temp_parse) == 1:
  50. temp_parse = re.split('【分析】', parse, maxsplit=1)
  51. parse = temp_parse[1]
  52. analy_comment.append(temp_parse[0])
  53. # 若分析也分小问来,则单独拆分
  54. # if len(re.findall(r"[((]\d[))]", temp_parse[0].replace(" ", ""))) > 1:
  55. # syn = re.sub(r"((?<=[\n::;;。】])|^)\s*([((]\s*\d\s*[))])", "【ⅳ】", temp_parse[0])
  56. # syn_list.extend(re.split(r"【ⅳ】", syn))
  57. # syn_list.append(temp_parse[0]) # 【详解】|【解析】|【解答】 前面的部分
  58. if re.search("【(点评|点睛)】", parse):
  59. comment = re.split('(【点评】|【点睛】)', parse, maxsplit=1)
  60. analy_comment.append(comment[-2] + comment[-1])
  61. parse = comment[0]
  62. other_parse_info = re.search("\n\s*" + str(one_item["item_id"]) + "\s*[、..、]\s*[((]\s*1\s*[))]",
  63. "\n"+parse) # \d、(1)xxxx
  64. if other_parse_info:
  65. parse = "(1)" + parse[other_parse_info.end():]
  66. analy_comment.append(parse[:other_parse_info.start()])
  67. # 解析拆分小问
  68. if len(re.findall(r"[((]\d[))]", parse.replace(" ", ""))) > 1:
  69. parse = re.sub(r"((?<=[\n::;;。])|^)\s*([((]\s*\d\s*[))])", "【ⅳ】", parse)
  70. parse = re.sub(r"(/>)\s*([((]\s*\d\s*[))])", r"\1【ⅳ】", parse)
  71. # 将解析末尾出现的‘故答案为’在成功slave后删掉,还是容易出错
  72. if re.search('(故|因[而此]|所以)\s*[::]?\s*答案分?别?([为是]|填)?\s*[::]\s*(.+?)(\n|$)', parse):
  73. ans_s = re.search('(\n.*?|^.*?|<p>)((故|因[而此]|所以)\s*[::]?\s*答案分?别?'
  74. '([为是]|填)?\s*[::]\s*(.+?))(\n|$)', parse)
  75. if ans_s.group(5) and ans_s.group(5).count("【ⅳ】") > 1: # 答案综述中(\d)出现多个时
  76. ans_summarize = ans_s.group(2)
  77. ans_s_index = parse.index(ans_summarize) if ans_s.group(1) == '<p>' or not ans_s.group(1).strip() \
  78. or ans_s.group(1).strip() is None else parse.index(ans_s.group(1))
  79. # ans_summarize = [ans_s.group(2), ans_s_index]
  80. # parse = parse.replace(ans_summarize[0], "")
  81. parse = parse.replace(ans_summarize, "")
  82. elif ans_s.group(5) and "【ⅳ】" in ans_s.group(5):
  83. aa5 = ans_s.group(5).replace("【ⅳ】", "")
  84. parse = parse.replace(ans_s.group(5), aa5)
  85. # -----------------------------------------------
  86. parse = re.sub("(【ⅳ】\s*解答?\s[::])\s*【ⅳ】", r"\1", parse)
  87. little_parse = re.split(r"【ⅳ】", parse)
  88. if len(little_parse) > 1:
  89. parse_list = little_parse[1:]
  90. if re.search("\n\s*(【参考译文】|参考译文\s*[::])", "\n"+little_parse[0]):
  91. analy_comment.append(little_parse[0])
  92. # -----------------------------------拆分后组合----------------------------------------
  93. one_item, yiwen = split2little_con(con_list, ans_list, parse_list, one_item) # 解析不拆分时,小问也要拆
  94. if parse_split and "slave" in one_item: # 解析需拆分; # 小问解析个数与小问一致时才组合的
  95. # 里层答案/解析存在时,外层就不需要了
  96. if one_item["slave"][0]["parse"].strip():
  97. one_item['parse'] = "\n".join(analy_comment)
  98. if one_item["slave"][0]["key"].strip(): # 里层答案存在时,外层就不需要了
  99. one_item['key'] = com_ans.strip()
  100. if yiwen:
  101. one_item['parse'] += "\n" + yiwen
  102. else: # 题干不存在多个\n(\d),不存在多问
  103. if re.findall(r"_{2,}", one_item["stem"]):
  104. one_item["blank_num"] = len(re.findall(r"_{2,}", one_item["stem"]))
  105. one_item["answer_type"] = "填空题"
  106. # if parse_split and re.search("^[A-Z]{2,}$", re.sub("\W", "", ans)):
  107. # one_item["type"] = "多选题"
  108. elif len(re.findall(r"[\n\s\u4e00-\u9fa5]\s*[A-D]\s*[..、、]", one_item["stem"])) >= 3: # 增加对选项的拆分处理
  109. one_item = option_structure(one_item, con, ans, 1)
  110. one_item["answer_type"] = "选择题"
  111. if 'options' not in one_item:
  112. one_item["options"] = []
  113. elif re.search("[((]\s+[))][\s\n]*$", one_item["stem"]) or one_item["type"] == "判断题":
  114. one_item["answer_type"] = "判断题"
  115. if re.match("【?(对的?|正确的?|[T√])】?$", one_item["key"].strip()):
  116. one_item["key"] = "正确"
  117. elif re.match("【?(错误?的?|不对的?|不正确的?|[F×])】?$", one_item["key"].strip()):
  118. one_item["key"] = "错误"
  119. elif re.search("[横划画]线处填写", one_item["stem"]) and "com_stem" in one_item:
  120. one_item["answer_type"] = "填空题"
  121. if "com_stem" in one_item:
  122. blank_num = len(re.findall(r"_{2,}", one_item["com_stem"]))
  123. if blank_num > 0:
  124. one_item["blank_num"] = blank_num
  125. else:
  126. one_item["answer_type"] = "解答题"
  127. else:
  128. one_item["answer_type"] = "解答题"
  129. return one_item
  130. def split2little_con(con_list, ans_list, parse_list, one_item):
  131. """
  132. 将按小问切分开的题干、答案、解析 进行 【结构化组合】,
  133. 包含将小问题干继续按选项拆分的情况
  134. :param con_list:切开了小问的题干
  135. :param ans_list:切开了小问的答案
  136. :param parse_list:切开了小问的解析
  137. :param one_item: 初步切开的一道题目
  138. :param is_sub_item: 答案是否按小题号获取(还是按照空的个数获取)的标志
  139. :param ans_summarize: 解析中的综述 [内容,索引]
  140. :return:
  141. """
  142. # print(con_list)
  143. # print(ans_list)
  144. # print(parse_list)
  145. # print('***********************')
  146. yiwen = ""
  147. if len(con_list) > 1:
  148. if con_list[0] == "": # 说明全是小题,没有总题文
  149. one_item["stem"] = ""
  150. else:
  151. # one_item["stem"] = suojin(con_list[0]) # 公共题文缩进处理
  152. one_item["stem"] = con_list[0]
  153. slave = []
  154. for index, s in enumerate(con_list[1:]): # 以题干拆分为主
  155. blank_num = len(re.findall(r"_{2,}", s))
  156. s = re.sub(r"[((]\d+分[))]", "", s[:9]) + s[9:]
  157. # 格式行调整
  158. if index > 0 and re.search('<p style=".*?">\n+$', slave[index - 1]["stem"]):
  159. slave[index - 1]["stem"], b, _ = re.split('(<p style=".*?">\n+)$', slave[index - 1]["stem"])
  160. s = b + s
  161. elif re.search('<p style=".*?">\n+$', one_item["stem"]):
  162. one_item["stem"], b, _ = re.split('(<p style=".*?">\n+)$', one_item["stem"])
  163. s = b + s
  164. one_slave = {"slave_no": "(%s)" % (index + 1),
  165. "stem": s,
  166. "key": "",
  167. "parse": "",
  168. "answer_type": "解答题",
  169. "errmsgs": [],
  170. }
  171. if len(con_list) - len(parse_list) == 1:
  172. one_slave["parse"] = parse_list[index] # 按索引取解析
  173. if index == len(parse_list) - 1 and re.search("\n\s*(【参考译文】|参考译文\s*[::])", "\n"+parse_list[index]):
  174. one_slave["parse"], yiwen = re.split("\n\s*【参考译文】|\n\s*参考译文\s*[::]",
  175. "\n"+parse_list[index])
  176. yiwen = "【参考译文】" + yiwen
  177. if isinstance(ans_list, list) and len(con_list) - len(ans_list) == 1:
  178. one_slave["key"] = ans_list[index]
  179. # 将小题干中的材料拿到公共题干中
  180. # may_stem_info = re.search("\n材料[一二三四五六七八九十]\s", s)
  181. # if may_stem_info:
  182. # one_slave["stem"] = s[:may_stem_info.start()]
  183. # one_item["stem"] += s[may_stem_info.start()+1:]
  184. # 判断小题干是否可以是选择题
  185. if len(re.findall(r"[\n\s\u4e00-\u9fa5]\s*[A-D]\s*[..、、]", s)) >= 3:
  186. raw_ans = one_slave["key"]
  187. raw_stem = one_slave["stem"]
  188. one_slave = option_structure(one_slave, s, one_slave["key"], 1)
  189. one_slave["answer_type"] = "选择题"
  190. if "options" not in one_slave or not one_slave["options"]:
  191. one_slave["key"] = raw_ans # 选择题解析不成功时,答案还原
  192. elif blank_num > 1:
  193. one_slave["answer_type"] = "填空题"
  194. one_slave["key"] = raw_ans
  195. one_slave["stem"] = raw_stem
  196. del one_slave["options"], one_slave["options_rank"]
  197. if "options" not in one_slave or not one_slave["options"]:
  198. if blank_num > 0:
  199. one_slave["blank_num"] = blank_num
  200. one_slave["answer_type"] = "填空题"
  201. elif re.search("[((]\s+[))]\s*$|判断.*?正误", s):
  202. one_slave["answer_type"] = "判断题"
  203. elif re.search("[横划画]线处填写", s):
  204. one_slave["answer_type"] = "填空题"
  205. blank_num = len(re.findall(r"_{2,}", one_item["stem"]))
  206. if blank_num > 0:
  207. one_slave["blank_num"] = blank_num
  208. else:
  209. one_slave["answer_type"] = "解答题"
  210. if "errmsgs" in one_slave:
  211. del one_slave["errmsgs"]
  212. # 对带小题的大题,对每个小题的答案重新再提取一次
  213. if one_slave["parse"].strip() and (not ans_list or "key" not in one_slave or not one_slave["key"]
  214. or one_slave["key"] == '见解析'):
  215. new_ans = only_parse_split(one_slave["parse"], one_item["type"], one_slave["stem"], reparse_n=2) # 再解析
  216. if new_ans["key"]:
  217. one_slave["key"] = new_ans["key"]
  218. if not new_ans["parse"]:
  219. one_slave["parse"] = ""
  220. slave.append(one_slave)
  221. one_item["slave"] = slave
  222. one_item["slave_no"] = "1-{}".format(len(slave)) if len(slave) > 1 else "1"
  223. # if slave:
  224. # pass
  225. # else:
  226. # if ans_summarize:
  227. # one_item["parse"] = one_item["parse"][:ans_summarize[1]] + '\n' + ans_summarize[0] \
  228. # + '\n' + one_item["parse"][ans_summarize[1]:]
  229. return one_item, yiwen