#!/usr/bin/env python
# -*- coding:utf-8 -*-
# dati2slave_2.py
  3. import re
  4. from structure.ans_structure import only_parse_split
  5. from structure.option import option_structure
  6. from structure.stems_to_groups import suojin
  7. def get_slave(one_item, con, parse, ans, flag=1):
  8. """
  9. 带小问的大题 按小问切分
  10. flag=1:解析先不拆
  11. :return:
  12. """
  13. # if re.search(r"[;;]", ans) and len(re.findall(r"[((]\s*\d\s*[))]", con)) > 1: # 模板要求老师小题题号(1)(2)
  14. th1 = {"(Ⅰ)": "(1)", "(Ⅱ)": "(2)", "(Ⅲ)": "(3)", "(IV)": "(4)", "(Ⅳ)": "(4)", "(Ⅴ)": "(5)",
  15. "Ⅰ": "(1)", "Ⅱ": "(2)", "Ⅲ": "(3)", "IV": "(4)", "Ⅳ": "(4)", "Ⅴ": "(5)"}
  16. con = re.sub(r"([\n】])\s*[((]\s*(" + "|".join(th1.keys()) + ")\s*[))]", lambda x: x.group(1) + th1[x.group(2)], con)
  17. parse = re.sub(r"([\n】])\s*[((]\s*(" + "|".join(th1.keys()) + ")\s*[))]", lambda x: x.group(1) + th1[x.group(2)],
  18. parse)
  19. con = re.sub("(<[/a-z]+>|[((]\s*\d+\s*分\s*[))])\s*([((]\s*\d\s*[))])", r"\1" + "\n" + r"\2", con)
  20. parse = re.sub("(<[/a-z]+>)\s*([((]\s*\d\s*[))])", r"\1" + "\n" + r"\2", parse)
  21. # parse = re.sub("(答案分?别?[为是]?\s*[::])\s*[((]\s*(\d)\s*[))]", r"\1[#[\2]#]", parse)
  22. kuo_num = len(re.findall(r"[((]\d[))]", con.replace(" ", "")))
  23. circle_num = len(re.findall(r"\n[((](i{1,3}|[ⅰⅱⅲⅳ①②③④⑤])[))]|\n[①②③④⑤]\s*(?![+-])", con.replace(" ", "")))
  24. if len(re.findall(r"[((]\d[))]|\n[((](i{1,3}|[ⅰⅱⅲⅳ①②③④⑤])[))]|\n[①②③④⑤]\s*(?![+-])", con.replace(" ", ""))) > 1:
  25. by_sub_item = True # 答案是按照小题获取还是按照空的个数,答案老师有时候全部用;隔开,有时候又会分题号
  26. # 题干
  27. if kuo_num > 1:
  28. con = re.sub(r"((?<=[\n::;;。求])|^)\s*([((]\s*\d\s*[))])\s*(?!小?题?中)", "【ⅳ】", con)
  29. elif circle_num > 1:
  30. con = re.sub(r"((?<=[\n::;;。求])|^)\s*([((]\s*(\d|i{1,3}|[ⅰⅱⅲⅳ①②③④⑤])\s*[))]|[①②③④⑤]\s*(?![+-]))", "【ⅳ】", con)
  31. # print(con)
  32. # print('-------------------------------')
  33. con_list = re.split(r"【ⅳ】", con)
  34. # print(con_list)
  35. # ---------------答案和解析拆分---------------------------------------
  36. # ans_list = [] if ans != "见解析" else "见解析"
  37. ans_list = []
  38. parse_list = []
  39. syn_list = []
  40. analy_comment = []
  41. parse_common = ""
  42. ans_summarize = ""
  43. if not flag:
  44. # 答案 不能只用空格隔开
  45. if re.search(r"[;;]|\n[((](\d|i{1,3}|[ⅰⅱⅲⅳ①②③④⑤])[))]|\n[①②③④⑤]\s*(?![+-])", ans.replace(" ", "")):
  46. if len(re.findall(r"[((]\d[))]", ans.replace(" ", ""))) > 1: # 优先按(\d)拆分
  47. # and len(re.findall(r"\n[((](i{1,3}|[ⅰⅱⅲⅳ①②③④])[))]|\n[①②③④]\s*(?![+-])", ans.replace(" ", ""))) > 0:
  48. ans = re.sub(r"((?<=[\n::;;。])|^)\s*([((]\s*\d\s*[))])", "【ⅳ】", ans)
  49. elif len(re.findall(r"\n[((](i{1,3}|[ⅰⅱⅲⅳ①②③④⑤])[))]|\n[①②③④⑤]\s*(?![+-])", ans.replace(" ", ""))) > 1:
  50. ans = re.sub(r"((?<=[\n::;;。])|^)\s*([((]\s*(\d|i{1,3}|[ⅰⅱⅲⅳ①②③④⑤])\s*[))]|[①②③④⑤]\s*(?![+-]))",
  51. "【ⅳ】", ans)
  52. ans_list.extend(re.split(r"【ⅳ】", ans))
  53. while not ans_list[0]:
  54. ans_list = ans_list[1:]
  55. if len(ans_list) < len(con_list) - 1:
  56. ans_list = re.split(r"[;;](?! height)", ans)
  57. by_sub_item = False
  58. # 解析
  59. if parse:
  60. if re.search('【(详解|解析|解答)】', parse): # 2020-6-10
  61. temp_parse = re.split('【详解】|【解析】|【解答】', parse)
  62. parse = temp_parse[1]
  63. # parse_list.append(temp_parse[0])
  64. # 若分析也分小问来,则单独拆分
  65. if len(re.findall(r"[((]\d[))]", temp_parse[0].replace(" ", ""))) > 1:
  66. syn = re.sub(r"((?<=[\n::;;。】])|^)\s*([((]\s*\d\s*[))])", "【ⅳ】", temp_parse[0])
  67. syn_list.extend(re.split(r"【ⅳ】", syn))
  68. syn_list.append(temp_parse[0]) # 【详解】|【解析】|【解答】 前面的部分
  69. if re.search("【(点评|点睛)】", parse):
  70. comment = re.split('(【点评】|【点睛】)', parse)
  71. analy_comment.append(comment[-2] + comment[-1])
  72. parse = comment[0]
  73. # 解析拆分小问
  74. if len(re.findall(r"[((]\d[))]", parse.replace(" ", ""))) > 1:
  75. parse = re.sub(r"((?<=[\n::;;。])|^)\s*([((]\s*\d\s*[))])", "【ⅳ】", parse)
  76. parse = re.sub(r"(/>)\s*([((]\s*\d\s*[))])", r"\1【ⅳ】", parse)
  77. else:
  78. parse = re.sub(r"((?<=[\n::;;。])|^)\s*([((]\s*(\d|i{1,3}|[ⅰⅱⅲⅳ①②③④⑤])\s*[))]|[①②③④⑤]\s*(?![+-]))", "【ⅳ】", parse)
  79. parse = re.sub(r"(/>)\s*([((]\s*(\d|i{1,3}|[ⅰⅱⅲⅳ①②③④⑤])\s*[))]|[①②③④⑤]\s*(?![+-]))", r"\1【ⅳ】", parse)
  80. # 将解析末尾出现的‘故答案为’在成功slave后删掉
  81. if re.search('(故|因[而此]|所以)\s*[::]?\s*答案分?别?([为是]|填)?\s*[::]\s*(.+?)(\n|$)', parse):
  82. ans_s = re.search('(\n.*?|^.*?|<p>)((故|因[而此]|所以)\s*[::]?\s*答案分?别?([为是]|填)?\s*[::]\s*(.+?))(\n|$)', parse)
  83. # print("ans_s:",ans_s)
  84. # print(parse)
  85. if ans_s.group(5) and ans_s.group(5).count("【ⅳ】") > 1:
  86. ans_summarize = ans_s.group(2)
  87. ans_s_index = parse.index(ans_summarize) if ans_s.group(1) == '<p>' or not ans_s.group(1).strip() \
  88. or ans_s.group(1).strip() is None else parse.index(ans_s.group(1))
  89. ans_summarize = [ans_s.group(2), ans_s_index]
  90. parse = parse.replace(ans_summarize[0], "")
  91. elif ans_s.group(5) and "【ⅳ】" in ans_s.group(5):
  92. aa5 = ans_s.group(5).replace("【ⅳ】", "")
  93. parse = parse.replace(ans_s.group(5), aa5)
  94. parse = re.sub("(【ⅳ】\s*解答?\s[::])\s*【ⅳ】", r"\1", parse)
  95. little_parse = re.split(r"【ⅳ】", parse)
  96. if len(syn_list) - 1 == len(little_parse) and len(little_parse) > 2: # 不拼接;syn_list长有4,little_parse长为3
  97. parse_list = ["分析:{}\n解答:{}".format(syn_list[k + 1], p) for k, p in
  98. enumerate(little_parse[1:])]
  99. parse_common = syn_list[0] + '\n' + little_parse[0] # 分小问解析的共同部分
  100. else:
  101. if syn_list and len(re.sub("[^\u4e00-\u9fa5]", "", syn_list[-1])) > 4: # 有4个汉字以上
  102. analy = syn_list[-1]
  103. analy_comment.insert(0, analy)
  104. parse_list.extend(little_parse)
  105. if len(parse_list) > 1:
  106. # if parse_list[0].strip():
  107. # common = parse_list[0]
  108. # parse_list = ["{} {}".format(common, p) for p in parse_list]
  109. parse_common = parse_list[0]
  110. parse_list = parse_list[1:]
  111. # ---------------------------------------------------------------------------------
  112. one_item = split2little_con(con_list, ans_list, parse_list, one_item, by_sub_item, ans_summarize)
  113. if "slave" in one_item and one_item["slave"]:
  114. if not flag:
  115. one_item['parse'] = parse_common
  116. if analy_comment:
  117. one_item['analy'] = "\n".join(analy_comment)
  118. else:
  119. if re.findall(r"_{2,}", one_item["stem"]):
  120. one_item["blank_num"] = len(re.findall(r"_{2,}", one_item["stem"]))
  121. if flag and re.search("^[A-Z]{2,}$", re.sub("\W", "", ans)):
  122. one_item["type"] = "多选题"
  123. if len(re.findall(r"[\n\s\u4e00-\u9fa5]\s*[A-D]\s*[..、、]", one_item["stem"])) >= 3: # 增加对选项的拆分处理
  124. one_item = option_structure(one_item, con, ans, 1)
  125. return one_item
  126. def split2little_con(con_list, ans_list, parse_list, one_item, is_sub_item, ans_summarize):
  127. """
  128. 将按小问切分开的题干、答案、解析 进行 【结构化组合】
  129. :param con_list:切开了小问的题干
  130. :param ans_list:切开了小问的答案
  131. :param parse_list:切开了小问的解析
  132. :param one_item: 初步切开的一道题目
  133. :param is_sub_item: 答案是否按小题号获取(还是按照空的个数获取)的标志
  134. :param ans_summarize: 解析中的综述 [内容,索引]
  135. :return:
  136. """
  137. # print(con_list)
  138. # print(ans_list)
  139. # print(parse_list)
  140. # print('***********************')
  141. old_con = one_item["stem"]
  142. if len(con_list) > 1:
  143. if con_list[0] == "": # 说明全是小题,没有总题文
  144. one_item["stem"] = ""
  145. else:
  146. # 添加缩进属性<p style="text-indent: 2em">、居中属性<p style="text-align:center">
  147. # com_stem_list = re.split("\n+", con_list[0])
  148. # com_stem = '<p style="text-indent: 2em">' + '</p><p style="text-indent: 2em">'.join(
  149. # com_stem_list) + "</p>"
  150. one_item["stem"] = suojin(con_list[0])
  151. slave = []
  152. for index, s in enumerate(con_list[1:]):
  153. blank_num = len(re.findall(r"_{2,}", s))
  154. s = re.sub(r"[((]\d+分[))]", "", s[:9]) + s[9:]
  155. one_slave = {}
  156. if len(con_list) - len(parse_list) == 1:
  157. one_slave = {"slave_no": "(%s)" % (index + 1), # index + 1,
  158. "stem": s,
  159. # "key": ans_list[index],
  160. "parse": parse_list[index]} # 按索引取解析
  161. if isinstance(ans_list, list) and len(con_list) - len(ans_list) == 1:
  162. one_slave["key"] = ans_list[index]
  163. elif not parse_list and isinstance(ans_list, list) and len(con_list) - len(ans_list) == 1:
  164. one_slave = {"slave_no": "(%s)" % (index + 1), # index + 1,
  165. "stem": s,
  166. "key": ans_list[index],
  167. }
  168. elif (not ans_list or ans_list == "见解析") and not parse_list:
  169. one_slave = {"slave_no": "(%s)" % (index + 1),
  170. "stem": s,
  171. "key": "",
  172. "parse": "",
  173. "errmsgs": [],
  174. }
  175. may_stem_info = re.search("\n材料[一二三四五六七八九十]\s", s)
  176. if may_stem_info:
  177. one_slave["stem"] = s[:may_stem_info.start()]
  178. one_item["stem"] += s[may_stem_info.start()+1:]
  179. if len(re.findall(r"[\n\s\u4e00-\u9fa5]\s*[A-D]\s*[..、、]", s)) >= 4:
  180. one_slave = option_structure(one_slave, s, "", 1)
  181. del one_slave["errmsgs"]
  182. if one_slave:
  183. one_slave["blank_num"] = blank_num
  184. if ans_list:
  185. if is_sub_item is False: # 有具体答案的情况,可能要按照空的个数来拿,但至少一个小题一个答案
  186. if blank_num >= 1: # 填空题
  187. one_ans = re.sub(r"(\n|^)\s*[((]\s*" + str(index + 1) + r"\s*[))](.+)", r"\2",
  188. ";".join(ans_list[:blank_num]))
  189. one_ans = re.sub(r"((?<=[\n;;。])|^)\s*([((]\s*(i{1,3}|[ⅰⅱⅲⅳ①②③④])\s*[))]|[①②③④]\s*(?![+-]))(.+)",
  190. r"【ⅳ】\4", one_ans)
  191. one_slave["key"] = one_ans.replace("【ⅳ】", "")
  192. ans_list = ans_list[blank_num:]
  193. else: # 没有空的时候,这个题也应该至少有一个答案
  194. one_slave["key"] = ans_list[0] if len(ans_list) > 0 else ""
  195. ans_list = ans_list[1:]
  196. elif type(ans_list) == str: # 无具体答案的情况:答案为‘见解析’
  197. one_slave["key"] = ans_list
  198. # if one_item["type"] == "填空题":
  199. # one_item["type"] = "解答题"
  200. # if blank_num > 0:
  201. # one_slave["blank_num"] = blank_num
  202. # else:
  203. # if one_item["type"] == "填空题":
  204. # one_item["type"] = "解答题"
  205. # 对带小题的大题,对每个小题的答案重新再提取一次
  206. if not ans_list or "key" not in one_slave or not one_slave["key"] or one_slave["key"] == '见解析':
  207. new_ans = only_parse_split(one_slave["parse"], one_item["type"], one_slave["stem"], reparse_n=2) # 再解析
  208. if new_ans["key"]:
  209. one_slave["key"] = new_ans["key"]
  210. if not new_ans["parse"]:
  211. one_slave["parse"] = ""
  212. slave.append(one_slave)
  213. one_item["slave"] = slave
  214. one_item["slave_no"] = "1-{}".format(len(slave)) if len(slave) > 1 else "1"
  215. # 有slave的话,就把外面的ans和parse字段给删除掉
  216. if slave:
  217. pass
  218. # del one_item["key"]
  219. # if parse_list:
  220. # del one_item["parse"]
  221. else:
  222. one_item["stem"] = old_con
  223. if ans_summarize:
  224. one_item["parse"] = one_item["parse"][:ans_summarize[1]] + '\n' + ans_summarize[0] + '\n' + one_item["parse"][ans_summarize[1]:]
  225. return one_item