stems_to_groups2.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400
  1. #!/usr/bin/env python
  2. # -*- coding:utf-8 -*-
  3. import re
  4. from pprint import pprint
def regroup(res_list, item_groups, ans_groups):
    """
    Regroup questions that share one common stem into a parent item holding a
    "slave" list (e.g. geography papers).

    In item_groups["groups_data"] each key is the 0-based position of a question
    in res_list; the value is one of:
        'fei'               - this question is not a multi-part (grouped) question;
        '<digits>-<digits>' - the range of question numbers merged as slaves;
        ''  (empty)         - start of a common-stem group whose slave range is unknown.
    Example: item_groups: {'is_groups': 1, 'groups_data':
        {0: 'fei', 5: '', 8: '', 11: 'fei', 12: '', 15: '', 19: '20-21', 23: 'fei'}}

    Side effects visible in this function: it prints start_no, deletes the
    "com_stem" key from grouped entries of res_list, and rewrites fields of the
    returned items in place.

    NOTE(review): this function's nesting was reconstructed from a listing that
    had lost its indentation; places where another nesting is plausible are
    marked NOTE(review) and should be confirmed against the original file.

    :param res_list: list of question dicts (reads "topic_num", "type", "parse",
        "errmsgs", "key", "stem" and optionally "com_stem", "slave", "options")
    :param item_groups: dict with a "groups_data" mapping as described above
    :param ans_groups: mapping "<start>-<end>" -> dict with "key" and "parse";
        may be empty/falsy
    :return: the regrouped list of question dicts
    """
    new_res_dict = []
    groups_data = item_groups["groups_data"]
    start_no = list(groups_data.keys())
    start_no.sort()  # ascending positions

    def takefirst(elem):
        # Sort key: the first number of a "<start>-<end>" string.
        return int(elem.split("-")[0])

    ans_start_no = []
    if ans_groups:
        ans_start_no = list(ans_groups.keys())
        ans_start_no.sort(key=takefirst)  # order answer ranges by their first number
    # 0-based positions covered by some answer range.
    contained_no = []
    for ans_no in ans_start_no:
        st1, ed1 = ans_no.split("-")
        contained_no.extend(list(range(int(st1)-1, int(ed1))))
    not_contained_no = set(range(len(res_list))) - set(contained_no)
    added_nos = []  # question numbers that have already been grouped
    # Case: the paper starts with flat (single-level) questions.
    temp_no = -1
    while groups_data and groups_data[start_no[0]] == "fei":
        new_res_dict.append(res_list[start_no[0]])
        temp_no = start_no[0]
        del start_no[0]
    # NOTE(review): assumed to run once after the loop (temp_no starts at -1);
    # confirm it is not part of the loop body. start_no[0] raises IndexError if
    # every entry was 'fei'.
    if start_no[0] > 0:
        new_res_dict.extend(res_list[temp_no+1:start_no[0]])
    one_group = {}
    alone_item_nos = []
    print("start_no:", start_no)
    for n, group_no in enumerate(start_no):
        if "com_stem" not in res_list[group_no]:  # no "com_stem" on this question
            if group_no == start_no[-1] and groups_data[group_no] == "fei":  # ignore the last one
                continue
            new_res_dict.append(res_list[group_no])
            added_nos.append(group_no+1)
            continue
        # A question carrying "com_stem": move the stem onto the new group.
        one_group["com_stem"] = res_list[group_no]["com_stem"]
        del res_list[group_no]["com_stem"]
        if "-" in groups_data[group_no]:  # slave range is known
            st, end = groups_data[group_no].split("-")
            if not added_nos:  # first group
                # The end number stated in the common stem is wrong; trust the next key instead.
                if n + 1 < len(start_no) and start_no[n + 1] <= int(end):
                    one_group["slave"] = res_list[int(st) - 1: start_no[n + 1]]
                    added_nos.append(start_no[n + 1])
                else:
                    added_nos.append(int(end))
                    if len(res_list) < int(end):
                        # Range exceeds the list length: rebase it on the first item's topic_num.
                        st = int(st) - int(res_list[0]["topic_num"]) + 1
                        end = int(end) - int(res_list[0]["topic_num"]) + 1
                    one_group["slave"] = res_list[int(st) - 1:int(end)]
            elif int(st) <= added_nos[-1]:  # the start number stated in the common stem is wrong
                if n + 1 < len(start_no):  # not the last group
                    if int(end) < start_no[n + 1]:
                        one_group["slave"] = res_list[added_nos[-1]:int(end)]
                        added_nos.append(int(end))
                    else:  # end number is wrong
                        one_group["slave"] = res_list[added_nos[-1]: start_no[n + 1]]
                        added_nos.append(start_no[n + 1])
                else:
                    if int(end) >= added_nos[-1]:
                        one_group["slave"] = res_list[added_nos[-1]:int(end)]
                        added_nos.append(int(end))
                    else:  # end value is wrong
                        if str(group_no+1) + "-" in "#".join(ans_groups.keys()):
                            # NOTE(review): inside a character class "$" is a literal, so "[$#]"
                            # does not match end-of-string; the pattern also formats group_no
                            # while the test above uses group_no+1 - confirm intent.
                            end = re.search("[^#]{}-(\d+)[$#]".format(group_no), "#".join(ans_groups.keys())).group(1)
                            one_group["slave"] = res_list[group_no: int(end) + 1]
                        else:
                            # Offsets (relative to added_nos[-1]) where the question type changes.
                            endp = [m for m, j in enumerate(res_list[added_nos[-1]:])
                                    if j["type"] != res_list[added_nos[-1]]["type"]]
                            if endp:
                                one_group["slave"] = res_list[added_nos[-1]:endp[0] + len(res_list[:added_nos[-1]])]
                                added_nos.append(endp[0] + len(res_list[:added_nos[-1]]))
                            else:
                                one_group["slave"] = res_list[group_no:]
            else:
                added_nos.append(int(end))
                one_group["slave"] = res_list[int(st) - 1:int(end)]
                # NOTE(review): added_nos[-1] is already int(end) here, so this
                # condition compares st against end + 1 - confirm intent.
                if int(st) > added_nos[-1] + 1:
                    new_res_dict.extend(res_list[added_nos[-1]:int(st) - 1])
        else:  # slave range is unknown
            if group_no != start_no[-1]:  # not the last entry
                # print("yyy:", group_no, start_no, groups_data)
                if groups_data[group_no] == "fei":
                    new_res_dict.append(res_list[group_no])
                    added_nos.append(group_no)
                    continue
                elif "#" + str(group_no + 1) + "-" in "#" + "#".join(ans_groups.keys()):  # trust the answer numbering
                    aa = ("#" + "#".join(ans_groups.keys())).split("#{}-".format(group_no + 1))
                    end = aa[-1].split("#", maxsplit=1)[0]
                    one_group["slave"] = res_list[group_no: int(end)]
                    added_nos.append(int(end))
                    if int(end) < start_no[n+1]:  # standalone questions in between
                        alone_item_nos.append([int(end), start_no[n + 1]])
                        added_nos.append(start_no[n + 1])
                    # new_res_dict.extend(res_list[int(end)+1:])
                else:
                    one_group["slave"] = res_list[group_no: start_no[n+1]]
                    added_nos.append(start_no[n+1])
            else:
                if groups_data[group_no] == "fei":
                    continue
                elif str(group_no + 1) + "-" in "#".join(ans_groups.keys()):  # trust the answer numbering
                    aa = ("#" + "#".join(ans_groups.keys())).split("#{}-".format(group_no + 1))
                    # NOTE(review): the non-last branch above takes index [0] of this split,
                    # this one takes [-1] - confirm which is intended.
                    end = aa[-1].split("#", maxsplit=1)[-1]
                    one_group["slave"] = res_list[group_no: int(end)]
                    added_nos.append(int(end))
                else:
                    endp = [m for m, j in enumerate(res_list[added_nos[-1]:])
                            if j["type"] != res_list[added_nos[-1]]["type"]] if added_nos else []
                    if endp:
                        one_group["slave"] = res_list[added_nos[-1]:endp[0] + len(res_list[:added_nos[-1]])]
                        added_nos.append(endp[0] + len(res_list[:added_nos[-1]]))
                        # new_res_dict.extend(res_list[added_nos[-1]:])
                    else:
                        one_group["slave"] = res_list[group_no:]
                        added_nos.append(len(res_list))
        # Summary fields of the group, derived from its slaves.
        one_group["type"] = one_group["slave"][0]["type"] if one_group["slave"] else ""
        one_group["que_num"] = len(one_group["slave"])
        if one_group["slave"]:
            if one_group["slave"][-1]["topic_num"] != one_group["slave"][0]["topic_num"]:
                one_group["topic_num"] = "{}-{}".format(one_group["slave"][0]["topic_num"], one_group["slave"][-1]["topic_num"])
            else:
                one_group["topic_num"] = one_group["slave"][0]["topic_num"]
        else:
            one_group["topic_num"] = ""
        if ans_start_no:
            for k in ans_start_no:
                if k == one_group["topic_num"]:
                    st1, end1 = k.split("-")  # actual question-number range
                    parse_list = []
                    if len(re.findall("【详解】", ans_groups[k]["parse"])) > 1:
                        parse_list = re.split("【详解】", ans_groups[k]["parse"])[1:]
                    else:
                        t_seq_no = list(range(int(st1), int(end1)+1))
                        t_seq_no = list(map(str, t_seq_no))
                        if any([True if len(no) > 1 else False for no in t_seq_no]):
                            # Multi-digit numbers need an alternation; its capture group puts the
                            # numbers at even indices, so only odd indices (the texts) are kept.
                            parse_list = re.split(r"(?<=[】\n])\s*(" + "|".join(t_seq_no) + r")\s*[、..、]",
                                                  "\n" + ans_groups[k]["parse"])[1:]
                            parse_list = [pr for idn, pr in enumerate(parse_list) if idn % 2 == 1]
                        else:
                            parse_list = re.split(r"(?<=[】\n])\s*["+"".join(t_seq_no)+r"]\s*[、..、]",
                                                  "\n"+ans_groups[k]["parse"])[1:]
                    if len(parse_list) > 1:
                        ans_list = re.split("(?<=[】\s])\d{1,2}\s*[、..、]|^\d{1,2}\s*[、..、]", ans_groups[k]["key"])[1:]
                        if len(parse_list) == int(end1)+1 - int(st1):
                            for i in range(len(parse_list)):
                                pr = parse_list[i].strip()
                                # A trailing "译文" section on the last analysis goes to the group itself.
                                if i == len(parse_list) - 1 and re.search("\n\s*[【参考]*?译文\s*[】::]", pr):
                                    pr, hd, one_group["parse"] = re.split("\n\s*([【参考]*?译文\s*[】::])", pr)
                                    one_group["parse"] = hd + one_group["parse"]
                                one_group["slave"][i]["parse"] = pr
                                if "本题缺少答案和解析" in one_group["slave"][i]["errmsgs"]:
                                    one_group["slave"][i]["errmsgs"] = one_group["slave"][i]["errmsgs"]\
                                        .replace("本题缺少答案和解析", "")
                                if "slave" in one_group["slave"][i]:  # split the analysis one level further
                                    slave_parse_list = re.split("(?<=[\s\n])[((]\s*\d{1,2}[))]", "\n" + parse_list[i].strip())
                                    if len(slave_parse_list)-1 == len(one_group["slave"][i]["slave"]):
                                        for pi in range(len(slave_parse_list)-1):
                                            one_group["slave"][i]["slave"][pi]["parse"] = slave_parse_list[pi+1].strip()
                                        one_group["slave"][i]["parse"] = slave_parse_list[0].strip()
                        else:
                            # Counts differ: keep all analyses together on the group.
                            one_group["parse"] = ans_groups[k]["parse"]
                        if len(ans_list) == int(end1)+1 - int(st1):
                            for j in range(len(ans_list)):
                                one_group["slave"][j]["key"] = ans_list[j].strip()
                                if "本题缺少答案和解析" in one_group["slave"][j]["errmsgs"]:
                                    one_group["slave"][j]["errmsgs"] = one_group["slave"][j]["errmsgs"]\
                                        .replace("本题缺少答案和解析", "")
                                if "slave" in one_group["slave"][j]:  # split the answer one level further
                                    slave_ans = re.sub(r"([((]\s*\d\s*[))])\s*[、..、,,::]\s*\1", r"\1", ans_list[j])
                                    slave_ans_list = re.split("(?<=[\s\n])[((]\s*\d{1,2}[))]", "\n" + slave_ans.strip())
                                    if len(slave_ans_list) - 1 == len(one_group["slave"][j]["slave"]):
                                        for aj in range(len(slave_ans_list)-1):
                                            one_group["slave"][j]["slave"][aj]["key"] = slave_ans_list[aj+1].strip()
                                        one_group["slave"][j]["key"] = slave_ans_list[0].strip()
                        else:
                            one_group["key"] = ans_groups[k]["key"]
                        # This answer range is consumed; stop scanning.
                        ans_start_no.remove(k)
                        break
                    else:
                        # The analysis could not be split: attach answer and analysis to the group.
                        one_group['key'] = ans_groups[k]["key"]
                        one_group['parse'] = ans_groups[k]["parse"]
                        for si, s in enumerate(one_group["slave"]):
                            if "本题缺少答案和解析" in s["errmsgs"]:
                                one_group["slave"][si]["errmsgs"] = s["errmsgs"].replace("本题缺少答案和解析", "")
        else:
            # NOTE(review): assumed to pair with `if ans_start_no:` - confirm.
            # For answers placed at the end and laid out as
            # 【答案】1.xx 2.xx \n【解析】1.xx 2.xx \n【答案】3.xx 4.xx \n【解析】3.xx 4.xx
            if one_group["slave"][0]["parse"] in ["略", ""] and one_group["slave"][-1]["parse"]:
                st1, end1 = one_group["topic_num"].split("-")  # actual question-number range
                t_seq_no = list(range(int(st1), int(end1) + 1))
                t_seq_no = list(map(str, t_seq_no))
                parse_list = re.split(r"(?<=[】\n])\s*(" + "|".join(t_seq_no) + r")\s*[、..、]",
                                      "\n" + one_group["slave"][-1]["parse"])[1:]
                parse_list = [pr.strip() for idn, pr in enumerate(parse_list) if idn % 2 == 1]
                if len(parse_list) == int(end1) + 1 - int(st1):
                    for ni, pr in enumerate(parse_list):
                        if ni == int(end1) - int(st1):
                            pr = re.sub("\n\s*【答案】$", "", pr)
                            if re.search("\n\s*[【参考]*?译文\s*[】::]", pr):
                                # NOTE(review): unlike the similar split earlier, this pattern has no
                                # capture group, so one match yields two pieces, not three - confirm.
                                pr, hd, one_group["parse"] = re.split("\n\s*[【参考]*?译文\s*[】::]", pr)
                                one_group["parse"] = hd + one_group["parse"]
                        one_group["slave"][ni]["parse"] = pr
        new_res_dict.append(one_group)
        if alone_item_nos:
            # Emit the standalone questions that sit between this group and the next.
            for alone_no in alone_item_nos:
                new_res_dict.extend(res_list[alone_no[0]: alone_no[1]])
            alone_item_nos = []
        one_group = {}
    # Append whatever follows the last grouped question.
    if added_nos[-1] < len(res_list):
        new_res_dict.extend(res_list[added_nos[-1]:])
    if not_contained_no:
        for one_no in not_contained_no:
            for idx, one_res in enumerate(new_res_dict):
                if one_no+1 == one_res["topic_num"]:
                    # NOTE(review): this rebinds the loop variable only; the returned
                    # object is not written back into new_res_dict.
                    one_res = parse_split2group(one_res)
                    # if "slave" in one_res:
                    #     print(one_res)
    for one_res in new_res_dict:
        if "com_stem" in one_res:  # blank counts are not considered for common stems for now
            # Add the indent attribute <p style="text-indent: 2em"> / centre attribute <p style="text-align:center">
            new_com_stem = suojin(one_res["com_stem"])
            new_com_stem = new_com_stem.replace(" ", "&nbsp;&nbsp;")  # keep spaces so manual adjustment survives
            one_res["stem"] = new_com_stem + "\n" + one_res["stem"] if "stem" in one_res else new_com_stem
            del one_res["com_stem"]
        elif "slave" in one_res:
            new_stem = suojin(one_res["stem"])
            one_res["stem"] = new_stem
        one_res["topic_num"] = str(one_res["topic_num"])
        # Final type label depends only on which keys the item has.
        if "slave" in one_res:
            one_res['type'] = '小题多问类'
        elif "options" in one_res:
            one_res['type'] = '选择类'
        else:
            one_res['type'] = '解答类'
        ind_label = '<p style="text-indent: 2em">'
        if "【范文】" in one_res['key']:  # writing / composition answers
            anss = re.split("\n+", one_res['key'])
            ids = [n for n, a in enumerate(anss) if "【范文】" in a][0]
            may_title = anss[ids].replace("【范文】", "").strip()
            if not may_title:
                # Nothing after the marker on its own line: try the next line as the title.
                ids += 1
                may_title = anss[ids].strip()
            if 0 < len(may_title) < 5:
                new_ans = "\n".join(anss[:ids]) + '<p style="text-align:center">' + anss[ids] + "</p>" \
                          + ind_label + ('</p>' + ind_label).join(anss) + "</p>"
            else:
                new_ans = ind_label + '</p><p style="text-indent: 2em">'.join(anss) + "</p>"
            one_res['key'] = new_ans
        elif re.search(r"(阅读|针对).{,4}[资材]料|(\n|^)\s*材料一\s", one_res['stem']) \
                and "text-indent: 2em" not in one_res['stem']:
            one_res['stem'] = suojin(one_res['stem'])
    return new_res_dict
  271. def suojin(item_str):
  272. """
  273. 文本缩进处理
  274. :param item_str:
  275. :return:
  276. """
  277. ind_label = '<p style="text-indent: 2em">'
  278. con_list = re.split("\n+", item_str.strip())
  279. if len(con_list) > 1 and re.search("(阅读|针对).{,4}[资材]料", con_list[0]):
  280. new_con = con_list[0] + ind_label + ('</p>' + ind_label).join(con_list[1:]) + "</p>"
  281. else:
  282. new_con = ind_label + ('</p>' + ind_label).join(con_list) + "</p>"
  283. new_con = re.sub(r'<p style="text-indent: 2em">(\s*<img .+?)</p>($|<p style="text-indent: 2em">)',
  284. r'\1\n\2', new_con, flags=re.S).strip()
  285. return new_con
  286. def parse_split2group(item_list):
  287. """
  288. 有slave的题目将外层的解析拆入salve中
  289. :return:
  290. """
  291. # print(item_list)
  292. raw_item_list = item_list.copy()
  293. flag = 0
  294. # print(item_list)
  295. if "com_stem" in item_list and "slave" in item_list and len(item_list["slave"]) == 1: # 嵌套
  296. item_list = item_list["slave"][0]
  297. flag = 1
  298. if "slave" in item_list and (item_list["key"] or item_list["parse"]) and \
  299. any([True if not (s["key"] + s["parse"]).strip() else False for s in item_list["slave"]]):
  300. # 解析
  301. parse_list = re.split(r"(?<=[\s\n】])[((]\s*[\dl]{1,2}\s*[))]", "\n" + item_list["parse"].strip())
  302. if len(parse_list) - 1 == len(item_list["slave"]):
  303. for pi in range(len(parse_list) - 1):
  304. item_list["slave"][pi]["parse"] = parse_list[pi + 1].strip()
  305. item_list["parse"] = parse_list[0].strip()
  306. # 答案
  307. ans = re.sub(r"([((]\s*\d\s*[))])\s*[、..、,,::]\s*(\1)", r"\2", item_list["key"])
  308. ans_list = re.split("(?<=[\s\n】])[((]\s*[\dl]{1,2}\s*[))]", "\n" + ans.strip())
  309. if len(ans_list) - 1 == len(item_list["slave"]):
  310. for aj in range(len(ans_list) - 1):
  311. item_list["slave"][aj]["key"] = ans_list[aj + 1].strip()
  312. item_list["key"] = ans_list[0].strip()
  313. # 2021-12-21
  314. if "com_stem" in item_list:
  315. item_list["stem"] = item_list["com_stem"].strip() + "<br/>" + item_list["stem"] \
  316. if "stem" in item_list else item_list["com_stem"]
  317. del item_list["com_stem"]
  318. if flag:
  319. raw_item_list["slave"] = [item_list]
  320. item_list = raw_item_list
  321. return item_list
  322. def regroup_old(res_list, item_groups):
  323. """
  324. 将多个题共用一个题干的情况进行slave重组,如地理
  325. :param res_list: 拆分为小题后的结果
  326. :return:
  327. """
  328. new_res_dict = []
  329. start_no = [i for i in item_groups.keys() if i != "pos"]
  330. if not start_no:
  331. return res_list
  332. def takefirst(elem):
  333. return int(elem.split("-")[0])
  334. start_no.sort(key=takefirst) # 排序
  335. print(start_no)
  336. one_group = {}
  337. added_nos = [] # 已经slave了的题号
  338. for n, group_no in enumerate(start_no):
  339. one_group["common_stem"] = item_groups[group_no]
  340. st, end = group_no.split("-") # 真实题号组
  341. if not added_nos: # 开始
  342. if item_groups["pos"][n + 1] <= int(end): # 公共题文中的结束序号信息有误
  343. one_group["slave"] = res_list[int(st) - 1:item_groups["pos"][n + 1] - 1]
  344. added_nos.append(item_groups["pos"][n + 1] - 1)
  345. else:
  346. added_nos.append(int(end))
  347. one_group["slave"] = res_list[int(st) - 1:int(end)]
  348. elif int(st) <= added_nos[-1]: # 公共题文中的初始序号信息有误
  349. if n + 1 < len(item_groups["pos"]): # 不是最后一组
  350. if int(end) < item_groups["pos"][n + 1]:
  351. one_group["slave"] = res_list[added_nos[-1]:int(end)]
  352. added_nos.append(int(end))
  353. else: # 结束序号有误,以pos为主
  354. one_group["slave"] = res_list[added_nos[-1]:item_groups["pos"][n + 1] - 1]
  355. added_nos.append(item_groups["pos"][n + 1] - 1)
  356. else:
  357. if int(end) >= added_nos[-1]:
  358. one_group["slave"] = res_list[added_nos[-1]:int(end)]
  359. added_nos.append(int(end))
  360. else: # end值出错
  361. endp = [m for m, j in enumerate(res_list[added_nos[-1]:])
  362. if j["type"] != res_list[added_nos[-1]]["type"]]
  363. if endp:
  364. one_group["slave"] = res_list[added_nos[-1]:endp[0] + len(res_list[:added_nos[-1]])]
  365. added_nos.append(endp[0] + len(res_list[:added_nos[-1]]))
  366. else:
  367. added_nos.append(int(end))
  368. one_group["slave"] = res_list[int(st) - 1:int(end)]
  369. if int(st) > added_nos[-1] + 1:
  370. new_res_dict.extend(res_list[added_nos[-1]:int(st) - 1])
  371. one_group["type"] = one_group["slave"][0]["type"]
  372. one_group["que_num"] = len(one_group["slave"])
  373. new_res_dict.append(one_group)
  374. one_group = {}
  375. if added_nos[-1] < len(res_list):
  376. new_res_dict.extend(res_list[added_nos[-1]:])
  377. return new_res_dict