#!/usr/bin/env python
# -*- coding:utf-8 -*-
# stems_structure.py
  3. import re
  4. from utils.item_type_line import get_item_head_info, get_item_head_info_cn
  5. from utils.topic_no import judge_item_no_type, get_right_no, pre_get_item_no
  6. def stems_structure_byno(stem_con, subject="", is_danti=0):
  7. """
  8. 按题号进行切分;
  9. 针对无解析的试卷中所有题目的拆分;
  10. :return:{"stem": , "item_id": , "errmsgs": [],"type":,}
  11. """
  12. head_cons = []
  13. if not is_danti:
  14. try:
  15. while stem_con and ((re.search(r"[\u4e00-\u9fa5]", stem_con[0]) is None) or
  16. (((subject != "语文" and re.search(r"[一二三四五]\s*[、..、]\s*[^必考基础综合中等]{2,4}题", stem_con[0]) is None)
  17. or (subject == "语文" and
  18. re.search(r"[一二三四五]\s*[、..、]\s*.{,4}(运用|阅读|写作|选择|单选|默写|语言表达|作文|综合|诗歌鉴赏)|^([\[【]题文[\]】]|阅读)", stem_con[0]) is None))
  19. and re.search(r"(^|\n)\s*[1-9][0-9]?\s*[..、、]((?!(答题卡|涂黑|本卷|2B铅笔|签字笔|密封线)).)*?$",
  20. stem_con[0]) is None)):
  21. head_cons.append(stem_con[0])
  22. del stem_con[0]
  23. except:
  24. return "本份试卷开头格式有问题,请按试卷格式来!"
  25. # else:
  26. head_cons = "\n".join(head_cons).strip()
  27. stem_str = "\n" + "\n".join(stem_con)
  28. # 题号格式有条件清洗
  29. # def sub1(ss):
  30. # if int(ss.group(5)) - int(ss.group(2)) in [1, 2]:
  31. # return ss.group(1)+"\n"+ss.group(5)+"、"+ss.group(6)
  32. # else:
  33. # return ss.group(0)
  34. # stem_str = re.sub(r"(\n\s*([1-9][0-9]|[1-9])\s*[..、、]((?!\n\s*([1-9][0-9]|[1-9])\s*[..、、]).)+?)\n+\s*([1-9][0-9]|[1-9])\s*(?![..、、])([^\s\d]+)", sub1, stem_str, flags=re.S)
  35. # 也可以用下面方法,但比较啰嗦
  36. pattern_3 = re.compile(r"(\n\s*([1-9][0-9]|[1-9])\s*[..、、]((?!\n\s*([1-9][0-9]|[1-9])\s*[..、、]).)+?)"
  37. r"\n+\s*([1-9][0-9]|[1-9])\s*(?![..、、])([^\s\d]+)", re.S)
  38. while re.search(pattern_3, stem_str):
  39. wrong_id_info = re.search(pattern_3, stem_str)
  40. stem_str = stem_str.replace(wrong_id_info.group(0),
  41. wrong_id_info.group(1)+"\n"+wrong_id_info.group(5)+"、"+wrong_id_info.group(6))
  42. # print(stem_str)
  43. # 1、获取题型行信息、按题型行切分
  44. if subject == "语文":
  45. con11, title_info_dict, choice_class = get_item_head_info_cn(stem_str)
  46. else:
  47. con11, title_info_dict, choice_class = get_item_head_info(stem_str)
  48. all_type = title_info_dict["all_type"]
  49. title_type_num = title_info_dict["title_type_num"]
  50. select_type_id = title_info_dict["select_type_id"]
  51. each_item_score, each_item_score2 = title_info_dict["each_item_score"], title_info_dict["each_item_score2"]
  52. # -------------------------------------------------------
  53. res = []
  54. item_type_classify = {} # 记录每类题型中含有的题目个数,含合并的情况
  55. item_type_num = [] # 题型不合并
  56. pic_no = {} # 记录每个题的图
  57. new_item_no = [] # 将纠错后的题号再记录一份
  58. item_groups = {"is_groups": 0, "groups_data": {}} # 公共题干的位置,从哪个题开始,比如地理选择题
  59. if not all_type:
  60. print("不存在大题题型行或题型行格式或名称有问题")
  61. # 初步获取题号,题号类型
  62. stem_str, item_no_info, item_no_type = judge_item_no_type(stem_str)
  63. # 获取正确题号的位置,进行切分
  64. new_item_no, items_no_idx = get_right_no(item_no_info)
  65. one_item_split = [stem_str[i:j] for i, j in zip(items_no_idx, items_no_idx[1:] + [None])]
  66. if items_no_idx and stem_str[:items_no_idx[0]]:
  67. if is_danti or "【题文】" in stem_str[:items_no_idx[0]]:
  68. head_cons = stem_str[:items_no_idx[0]]
  69. dd = {}
  70. for n, one_item in enumerate(one_item_split):
  71. if subject in ["地理", "语文"]:
  72. res, com_stem, item_groups = split_with_comstem(head_cons, one_item_split, res, n, item_groups, subject)
  73. if com_stem:
  74. # print("com_stem:", com_stem)
  75. dd.update(com_stem)
  76. elif is_danti and not n: # 2022/2/24
  77. dd.update({"com_stem": head_cons})
  78. item_groups["is_groups"] = 1
  79. item_groups["groups_data"][0] = ""
  80. # if not n:
  81. # mix_con = "\n".join(head_cons).strip()
  82. # else:
  83. # mix_con = one_item_split[n - 1].strip()
  84. # pattern_3 = re.search("(完成|回答)下?[面列]?的?.*?[\d小]题.{,2}$", mix_con)
  85. # if pattern_3:
  86. # com_stem, bef_con = "", ""
  87. # if re.search("【题文】", mix_con):
  88. # bef_con, com_stem = mix_con.split("【题文】", maxsplit=1)
  89. # else:
  90. # com_stem_idx = [i.end() for i in re.finditer(r"(所以|故而?).{,3}(选择?[A-Fa-f]选?项?"
  91. # r"|选项[A-Fa-f]正确).{,2}\n"
  92. # r"|\n[A-E]\s*[..、、::].*?\n", mix_con)]
  93. # if com_stem_idx:
  94. # com_stem = mix_con[com_stem_idx[-1]+1:]
  95. # bef_con = mix_con[:com_stem_idx[-1]+1]
  96. # else:
  97. # if not n:
  98. # com_stem = mix_con
  99. # else:
  100. # mix_con_list = mix_con.split(r"(详解】|解析】|答案】)")
  101. # if len(mix_con_list) > 2:
  102. # bef_con = "".join(bef_con[:-1])
  103. # mix_con = bef_con[-1]
  104. # else:
  105. # mix_con = bef_con[0]
  106. # paras = bef_con.split("\n")
  107. # paras = [para for para in paras if para.strip()]
  108. # if len(paras)>1:
  109. # if len(paras) == 2 or len(paras[-1]) >= 20:
  110. # com_stem = paras[-1]
  111. # bef_con = bef_con + "\n".join(paras[:-1])
  112. # else:
  113. # com_stem = paras[-2:]
  114. # bef_con = bef_con + "\n".join(paras[:-2])
  115. # else:
  116. # bef_con = bef_con + "\n".join(paras)
  117. # dd["com_stem"] = com_stem
  118. # res[-1]["stem"] = bef_con
  119. dd["stem"] = one_item
  120. dd["item_id"] = new_item_no[n] # 题目本身的题号
  121. dd["type"] = "" # 先题型不备注,根据答案再看
  122. dd["errmsgs"] = []
  123. # dd["score"] = 0
  124. # if select_type_id and dd["item_id"] in select_type_id:
  125. # dd['is_optional'] = 'true'
  126. res.append(dd)
  127. dd = {}
  128. # 先不做拆图处理了
  129. else:
  130. # print(all_type, len(con11))
  131. if len(all_type) == len(con11)-1: # 第一部分的题型行掉了
  132. all_type.insert(0, "")
  133. each_item_score.insert(0, 0) # 按题型行拿的分数也会掉,先补上默认的0
  134. elif len(all_type) != len(con11):
  135. print("第二种试卷格式:存在题型行没有换行") # 可能造成题目和题型行在同一行
  136. # return "存在题型行末尾没有换行或题型行中题型不明确"
  137. # else:
  138. # # print(all_type)
  139. # if "非选择题" in all_type:
  140. # error_info1 = "第" + str(all_type.index("非选择题")+1) + "大题的题型不明确"
  141. # if any([True for one_type in all_type if re.search("必考基础综合中等面列下各", one_type)]):
  142. # error_info1 = "存在题型行中题型不明确"
  143. # ---------------------------------------------------------------------
  144. # 思路:>>>>先纠正题号,再拆分题目;等切分好后再纠正(删减添加操作)比较费时
  145. # 按题号切分,可再加些细节!!!! 1>>题号不要求连续
  146. # 初步判断题号类型
  147. stem_str, item_no_info, item_no_type = judge_item_no_type(stem_str)
  148. # >>>>切分题目
  149. for num, one_type in enumerate(con11):
  150. # 初步获取题号
  151. item_no_info = pre_get_item_no("\n"+one_type, item_no_type)
  152. # 获取正确题号的位置,进行切分
  153. if res:
  154. items_no_temp, items_no_idx = get_right_no(item_no_info, have_type=1, last_id=res[-1]["item_id"])
  155. else:
  156. items_no_temp, items_no_idx = get_right_no(item_no_info)
  157. if not items_no_temp:
  158. res.append({
  159. "stem": one_type,
  160. "item_id": res[-1]["item_id"]+1 if res else 1,
  161. "type": all_type[num],
  162. "errmsgs": [],
  163. })
  164. continue
  165. is_from_0 = 1
  166. if items_no_temp and items_no_idx[0] != 0 and ((not num and items_no_temp[0] > 1) or
  167. (num and items_no_temp[0] > new_item_no[-1]+1)): # 以防出现题号漏了的情况
  168. items_no_idx.insert(0, 0)
  169. is_from_0 = 0
  170. one_item_split = [("\n" + one_type)[i:j] for i, j in zip(items_no_idx, items_no_idx[1:] + [None])]
  171. # 针对多个题共用一段材料的情况2021-11-3
  172. head_item = one_type[0:items_no_idx[0]]
  173. if not head_item.strip() and head_cons.strip() and not num: # 2022.2.24
  174. head_item = head_cons
  175. # if head_item and subject == "地理":
  176. # res, com_stem, item_group = split_with_comstem(head_item, [], res, 0)
  177. # item_groups.update(item_group)
  178. # if com_stem:
  179. # dd["com_stem"] = com_stem
  180. # common_stem_may = re.search("(完成|回答)下?[面列]?的?第?(\d{1,2})[-到至第~~-]+?(\d{1,2})题", head_item.replace(" ", ""))
  181. # if common_stem_may:
  182. # if not item_groups:
  183. # item_groups["pos"].append(1)
  184. # else:
  185. # item_groups["pos"].append(len(res)+1)
  186. # item_groups["{}-{}".format(common_stem_may.group(2), common_stem_may.group(3))] = head_item
  187. # ------------------每个大题的第一题前面的图可能有漏的情况------------------------
  188. may_oimt_pic = []
  189. if not is_from_0:
  190. may_oimt_pic_info = re.search("\n(<imgsrc.+?/>(\s*<imgsrc((?!/>).)+?/>)*?)\s*\n?$", one_item_split[0])
  191. if may_oimt_pic_info:
  192. if len(one_item_split) > 1 and re.search("如[上下左右]?图", one_item_split[1]):
  193. may_oimt_pic.extend(re.findall("<imgsrc.+?/>", may_oimt_pic_info.group(1)))
  194. # print("may_oimt_pic:", may_oimt_pic)
  195. # ------------------------------------------------------------------
  196. if len(all_type) == len(con11):
  197. if not is_from_0:
  198. if all_type[num] and all_type[num].replace("题", "") not in ['选择', '单选', '多选', '不定选择']:
  199. one_item_split = one_item_split[1:]
  200. else: # 针对选择题第1题或前面几题题号漏了的情况
  201. one_item_split1 = one_item_split[1:]
  202. # ------针对分错的细节继续拆分,如上一题选项行与下一题题干没有换行-----------------
  203. new_one_item_split = []
  204. pattern_1 = re.compile(
  205. r"([CDE]\s*[..、、].+?)(?<![::])\s([1-9]|1[0-9])\s*[..、、](.+?([是为有]|等于)[((]\s*[))]\n)")
  206. pattern_2 = re.compile(
  207. r"(([CDE]\s*[..、、]|\([CDE]\)).+?)(?<![::])\s\(([1-9]|1[0-9])\)(.+?([是为有]|等于)[((]\s*[))]\n)")
  208. for nn, one_item in enumerate(one_item_split1):
  209. if item_no_type == 1 and re.search(pattern_1, one_item):
  210. err_optcon = re.sub(pattern_1, r"\1【】\3", one_item) # 太粗糙了
  211. new_one_item_split.extend(err_optcon.split("【】"))
  212. items_no_temp.insert(nn+1, int(re.search(pattern_1, one_item).group(2)))
  213. elif item_no_type == 2 and re.search(pattern_2, one_item):
  214. err_optcon = re.sub(pattern_2, r"\1【】\4", one_item)
  215. new_one_item_split.extend(err_optcon.split("【】"))
  216. items_no_temp.insert(nn + 1, int(re.search(pattern_2, one_item).group(3)))
  217. else:
  218. new_one_item_split.append(one_item)
  219. if re.match(r'(^|\n)+\s*[A-Z]\s*[..、、]|(^|\n)+\s*[((]\s*[A-Z]\s*[))]\s*[..、、]?',
  220. str(one_item_split[0]).strip(), re.S) or \
  221. re.search("如[上下左右]图|[((]\s+[))]\s*($|\n)", one_item_split[0].strip()):
  222. new_one_item_split.insert(0, one_item_split[0])
  223. items_no_temp.insert(0, items_no_temp[0]-1)
  224. one_item_split = new_one_item_split
  225. # ----------------------------------------------------------
  226. if all_type[num] in item_type_classify: # 统计每类题型含有的题目个数
  227. item_type_classify[all_type[num]] += len(one_item_split)
  228. elif all_type[num]:
  229. item_type_classify[all_type[num]] = len(one_item_split)
  230. item_type_num.append((all_type[num], len(one_item_split)))
  231. else:
  232. if not is_from_0:
  233. one_item_split = one_item_split[1:]
  234. new_item_no.extend(items_no_temp)
  235. # ---------------------------------------------------------------
  236. # 从题型行中判断单选和多选 放在这里收集也可以:可可能出现两不同题型行提到的序号一样
  237. # choice_class = {}
  238. # if all_type[num] == "选择题":
  239. # multi_choice_info = re.findall("[\s,,;;((]+第?(\d+)[至到\-~]+(\d+)题[是为]([多单])项?选择?题?",
  240. # all_type_info[num][2])
  241. # if multi_choice_info:
  242. # for mu in multi_choice_info:
  243. # choice_class[mu[2]] = list(range(int(mu[0]), int(mu[1]) + 1))
  244. # ---------------------------------------------------------------
  245. dd = {}
  246. for nn, one_item in enumerate(one_item_split):
  247. # 针对多个题共用一段材料的情况2021-11-3
  248. if subject in ["地理", "语文"]:
  249. res, com_stem, item_groups = split_with_comstem(head_item, one_item_split, res, nn, item_groups, subject)
  250. if com_stem:
  251. dd.update(com_stem)
  252. # if subject == "语文":
  253. # if not nn and head_item.strip():
  254. # dd["com_stem"] = head_item.strip()
  255. # common_stem_may = re.search("(完成|回答)下?[面列]?的?第?(\d{1,2})[-到至第~~-]+?(\d{1,2})题",
  256. # one_item.replace(" ", "").replace("\n", ""))
  257. # if subject == "地理" and common_stem_may:
  258. # common_stem_info = re.search("((\n\s*[ABCDE][^\n]+?\n?)+)\n", one_item, flags=re.S)
  259. # if common_stem_info:
  260. # common_stem = one_item[common_stem_info.end():]
  261. # item_groups["pos"].append(up_num+nn+2)
  262. # item_groups["{}-{}".format(common_stem_may.group(2),
  263. # common_stem_may.group(3))] = common_stem
  264. # one_item = one_item[:common_stem_info.end()]
  265. # item_groups["{}-{}".format(common_stem_may.group(1), common_stem_may.group(2))] = head_item
  266. dd["stem"] = one_item
  267. # ------------------对每个大题的第一题加上may_oimt_pic--------------------------
  268. if nn == 0 and may_oimt_pic:
  269. dd['susp_pic'] = may_oimt_pic
  270. # ------------------------------------------------------------------------
  271. dd["item_id"] = items_no_temp[nn]
  272. # dd["score"] = each_item_score[num]
  273. # if not dd["score"] and each_item_score2 and str(dd["item_id"]) in each_item_score2.keys():
  274. # dd["score"] = each_item_score2[str(dd["item_id"])]
  275. dd["errmsgs"] = []
  276. if all_type[num] and re.search("必考基础综合中等面列下各非", all_type[num]) is None:
  277. dd["type"] = all_type[num] if re.sub('[((]', "", all_type[num]) != '本大题' else "解答题"
  278. else:
  279. dd["type"] = ""
  280. if choice_class:
  281. for k, v in choice_class.items():
  282. if dd["item_id"] in v:
  283. dd["type"] = k + "选题"
  284. # elif len(choice_class) == 1:
  285. # dd["type"] = "多选题" if k == "单" else "单选题"
  286. if select_type_id and dd["item_id"] in select_type_id:
  287. dd['is_optional'] = 'true'
  288. # if len(one_item_split) == 1 and dd["score"] == 0.0 and title_info_dict["total_score"][num] > 0.0:
  289. # dd["score"] = title_info_dict["total_score"][num]
  290. res.append(dd)
  291. # 多图在一起的情况进行拆分
  292. # if len(re.findall(r"第?\(?([1-9]|[1-9][0-9])\)?\s*题图", one_item)) > 1 and \
  293. # re.search(r"<imgsrc\d.+?\n+\s*第?\(?([1-9]|[1-9][0-9])\)?\s*题图", one_item, re.S):
  294. pic_info = re.search(r"(<imgsrc\d.+?)\n+\s*((第?\(?([1-9]|[1-9][0-9])\)?\s*题图.?\s*){2,})", one_item, re.S)
  295. if pic_info:
  296. pic_list = re.findall(r"<imgsrc\d.*?/>", pic_info.group(1))
  297. pic_w = re.findall("([1-9]|[1-9][0-9])[))]?\s*(?=题)", pic_info.group(2))
  298. if len(pic_list) >= len(pic_w):
  299. pic_no = {int(p): pic_list[len(pic_list) - len(pic_w) + k] for k, p in enumerate(pic_w)}
  300. dd["stem"] = dd["stem"].replace(pic_info.group(2), "")
  301. for k, pic in enumerate(pic_list[::-1]):
  302. if k < len(pic_w):
  303. dd["stem"] = dd["stem"].replace(pic, "")
  304. # -----------------------------------------------------
  305. dd = {}
  306. if pic_no:
  307. for i in list(pic_no.keys()):
  308. res[i-1]["stem"] = res[i-1]["stem"].strip() + "\n" + pic_no[i] + "\n" + "第" + str(i) + "题图"
  309. # ----------------------------------------------------------------------
  310. # 可能出现选择题类的选做题
  311. # --------最后判断一下题量是否正确-----------------------------------------
  312. # 针对拆分后题量特别多的情况,将拆分后题量与已知题量一样的题目保留
  313. new_res = []
  314. right_type = [] # 记录与已知题目个数一样的分块{第几个分块,题型}
  315. if all_type and sum(list(item_type_classify.values())) > 40: # 很可能存在大片不是题号的题号
  316. title_type_num_all = sum([t[1] for t in title_type_num]) # 题型行给出的题目个数
  317. if title_type_num_all > 0:
  318. for idx, type_num in enumerate(item_type_num):
  319. if type_num[1] == title_type_num[idx][1] > 0: # 同一题型是否题量一致
  320. right_type.append((idx, type_num[0]))
  321. elif title_type_num[idx][1] == 0 and type_num[1] == 1:
  322. title_type_num[idx][1] = 1
  323. right_type.append((idx, type_num[0]))
  324. if right_type:
  325. for rtype in right_type:
  326. for idx, item in enumerate(res):
  327. r_st = 0 if rtype[0]==0 else sum([t[1] for t in item_type_num[:rtype[0]]])
  328. r_ed = r_st + item_type_num[rtype[0]][1]
  329. if item["type"] == rtype[1] and r_st<=idx<r_ed:
  330. new_res.append(item)
  331. res = new_res
  332. item_type_num = title_type_num # 将题目已知题型数量信息作为正确信息
  333. new_item_type_classify = {}
  334. for i in right_type:
  335. if title_type_num[i[0]][0] not in new_item_type_classify:
  336. new_item_type_classify[title_type_num[i[0]][0]] = title_type_num[i[0]][1]
  337. else:
  338. new_item_type_classify[title_type_num[i[0]][0]] += title_type_num[i[0]][1]
  339. item_type_classify = new_item_type_classify
  340. return res, all_type, item_type_classify, item_no_type, item_type_num, new_item_no, item_groups
  341. def split_with_comstem(head_cons, one_item_split, bef_res, pc_idx, item_groups, subject):
  342. """
  343. head_cons: 每类题最前面的题文部分
  344. one_item_split:按题号切分的试题list
  345. bef_res:前面已经初步结构化好的题目
  346. pc_idx: 索引计时器
  347. :return:
  348. """
  349. if not pc_idx: # 没有题型行,第一题前
  350. mix_con = head_cons
  351. else:
  352. mix_con = one_item_split[pc_idx - 1].strip()
  353. common_stem_info1 = re.search("(完成|回答)下?[面列]?的?第?(\d{1,2})[-到至第~~-]+?(\d{1,2})小?题",
  354. mix_con.replace(" ", "").replace("\n", ""))
  355. common_stem_info2 = re.search("(完成|回答)下?[面列]?的?.*?[\d小各]题.{,2}\n*?(<img\s*src.*?/>[\s\n]*?)*?.{,2}$",
  356. mix_con, flags=re.S)
  357. if subject == "语文":
  358. common_stem_info2 = re.search("(完成|回答)下?[面列]?的?.*?[\d小各]题.{,2}\n|\n\s*阅读.*?按要求[作回]答.{,2}\n", mix_con)
  359. bef_con = ""
  360. one_stem = {}
  361. if common_stem_info2:
  362. item_groups["is_groups"] = 1
  363. if common_stem_info1:
  364. st = common_stem_info1.group(2)
  365. end = common_stem_info1.group(3)
  366. if not item_groups["groups_data"]:
  367. item_groups["groups_data"][0] = "{}-{}".format(st, end)
  368. else:
  369. item_groups["groups_data"][len(bef_res)] = "{}-{}".format(st, end)
  370. elif subject == "地理":
  371. item_groups["groups_data"][len(bef_res)] = ""
  372. # item_groups["groups_data"][len(bef_res)] = ""
  373. # elif common_stem_info1:
  374. # item_groups["is_groups"] = 1
  375. if (item_groups["is_groups"] and common_stem_info2) or "【题文】" in mix_con or subject == "语文":
  376. com_stem = ""
  377. if re.search("【题文】", mix_con):
  378. bef_con, com_stem = mix_con.split("【题文】", maxsplit=1)
  379. if len(bef_res) not in item_groups["groups_data"]:
  380. item_groups["groups_data"][len(bef_res)] = ""
  381. if not item_groups["is_groups"]:
  382. item_groups["is_groups"] = 1
  383. else:
  384. com_id = sorted(item_groups["groups_data"])
  385. common_stem_info3 = re.search("(完成|回答)下?[面列]?的?.*?问题.{,2}\n", one_item_split[pc_idx]) # 带多问的大题
  386. com_stem_idx = [i.end() for i in re.finditer(r"(所以|故而?|答案).{,3}(选择?[A-Fa-f]选?项?.{,2}"
  387. r"|选项[A-Fa-f]+正确|[A-Fa-f]+项正确).{,2}($|\n\s*【点睛】\s*\n.+?\n|\n)"
  388. r"|(选?项?[A-Fa-f]+项?错误?|[A-D]正确|排除[A-D]"
  389. r"|[A-D][A-D、、]*?不符?合题意).{,2}\s*($|\n\s*【点睛】\s*\n.+?\n|\n)"
  390. r"|(\n\s*[A-E]\s*[..、、::][^\n]*?\n?)+(\n|$)", mix_con,
  391. flags=re.S)] # add 第1条后部分 2022-2-14
  392. # print("common_stem_info3:", common_stem_info3)
  393. if subject == "地理" and common_stem_info3:
  394. item_groups["groups_data"][com_id[-1]] = "{}-{}".format(com_id[-1] + 1, len(bef_res)-1)
  395. elif com_stem_idx and subject == "地理":
  396. com_stem = mix_con[com_stem_idx[-1]:]
  397. bef_con = mix_con[:com_stem_idx[-1]]
  398. # print("com_stem:", com_stem)
  399. # print("bef_con:", bef_con)
  400. else:
  401. if not pc_idx:
  402. com_stem = mix_con
  403. if len(com_stem) < 25 and re.search("任选一题", com_stem):
  404. com_stem = ""
  405. if not com_stem:
  406. item_groups["groups_data"][len(bef_res)] = "fei"
  407. else:
  408. if subject == "语文":
  409. mix_con_list = re.split(r"(\n\s*阅读.*?按要求[作回]答.{,2}\n"
  410. r"|\n\s*阅读下[面列].*?[完成回答]+各题.{,2}\n"
  411. r"|[((][一二三四五][))]\s*.{,6}\n|[((][一二三四五][))]\s*.{,6}$)",
  412. # r"|\n+\s*[((][一二三四五][))])", # 与上一条都满足时,匹配短的
  413. re.sub("[((][本题共\d小\s]*?\d{1,2}分\s*[))].?", "", mix_con))
  414. if len(mix_con_list) == 1:
  415. mix_con_list = re.split(r"([((][一二三四五][))].*?[完成回答]+下?[面列]?的?.*?[\d小]\s*?题.{,2})\n",
  416. re.sub("[((]\s*\d{1,2}\s*分\s*[))].?", "", mix_con))
  417. if len(mix_con_list) == 1:
  418. mix_con_list = re.split(r"\n+\s*[((][一二三四五][))]",
  419. re.sub("[((][本题共\d小\s]*?\d{1,2}分\s*[))].?", "", mix_con))
  420. if len(mix_con_list) >= 3:
  421. com_stem = "\n".join(mix_con_list[-2:])
  422. # 有([一二三四五]),但不一定有公共题文
  423. if re.search("[((]\s*[一二三四五]\s*[))](语言文字运用|微?写作|选择|单选|语言表达|作文"
  424. r"|.{,4}(文本阅读|诗歌阅读|文言文阅读|名著阅读|默写)题?).{,10}$", com_stem.strip()):
  425. com_stem = ""
  426. bef_con = "".join(mix_con_list[:-2])
  427. if com_id[-1] not in item_groups["groups_data"] or not item_groups["groups_data"][com_id[-1]]:
  428. item_groups["groups_data"][com_id[-1]] = "{}-{}".format(com_id[-1]+1, len(bef_res))
  429. common_stem_info4 = re.search("(完成|回答)下?[面列]?的?第?(\d{1,2})[-到至第~~-]+?(\d{1,2})小?题",
  430. mix_con_list[-2].replace(" ", "").replace("\n", ""))
  431. if common_stem_info4:
  432. item_groups["groups_data"][len(bef_res)] = "{}-{}".format(
  433. common_stem_info1.group(2), common_stem_info1.group(3))
  434. elif common_stem_info2:
  435. item_groups["groups_data"][len(bef_res)] = ""
  436. common_stem_list = re.split("([完成回答]下?[面列]?的?.*?[\d小各]题.{,2}\n"
  437. "|\n\s*阅读.*?按要求[作回]答.{,2}\n)", mix_con)
  438. if len(common_stem_list) > 3:
  439. com_stem = common_stem_list[-1]
  440. mix_con = "".join(common_stem_list[:-1])
  441. else:
  442. com_stem = mix_con[common_stem_info2.end():]
  443. mix_con = mix_con[:common_stem_info2.end()] # 没切全,还需进一步
  444. # print("mix_con:",mix_con)
  445. paras = mix_con.split("\n")
  446. paras = [para for para in paras if para.strip()] # 去掉空行
  447. if len(paras) > 1:
  448. com_stem = paras[-1] + com_stem
  449. bef_con = "\n".join(paras[:-1])
  450. # print(bef_con)
  451. else:
  452. bef_con = mix_con
  453. else:
  454. mix_con_list = re.split(r"(详解】|解析】|答案】)", mix_con)
  455. if len(mix_con_list) > 2: # 含解析:
  456. bef_con = "".join(mix_con_list[:-1])
  457. mix_con = mix_con_list[-1]
  458. else:
  459. mix_con = mix_con_list[0]
  460. paras = mix_con.split("\n") # 将混淆的部分换行拆分
  461. paras = [para for para in paras if para.strip()] # 去掉空行
  462. if len(paras) > 1:
  463. if len(paras) == 2 or len(paras[-1]) >= 20: # 2段or段长
  464. com_stem = paras[-1]
  465. if re.search("^\s*[((]\s*\d\s*[))]", com_stem):
  466. com_stem = ""
  467. else:
  468. bef_con = bef_con + "\n".join(paras[:-1])
  469. else:
  470. com_stem = "\n".join(paras[-2:])
  471. bef_con = bef_con + "\n".join(paras[:-2])
  472. else:
  473. bef_con = bef_con + "\n".join(paras)
  474. # 此时 com_stem 为空
  475. if subject == "地理":
  476. if com_stem:
  477. one_stem["com_stem"] = com_stem
  478. if bef_res and pc_idx: # 不包括第一题
  479. bef_res[-1]["stem"] = bef_con
  480. else:
  481. if re.sub("</?tbody>|</?table>|\n", "", com_stem):
  482. one_stem["com_stem"] = com_stem
  483. item_groups["is_groups"] = 1
  484. if len(bef_res) not in item_groups["groups_data"]:
  485. item_groups["groups_data"][len(bef_res)] = ""
  486. if bef_res and bef_con:
  487. bef_res[-1]["stem"] = bef_con
  488. # elif not pc_idx:
  489. # item_groups["groups_data"][len(bef_res)] = ""
  490. return bef_res, one_stem, item_groups