#!/usr/bin/env python
# -*- coding:utf-8 -*-
# three_parse_structure.py
# paper3_process: third Word exam-paper layout, where the questions and the answers are given separately
# split2one_item: group all text lines by question type, then split each group into individual questions
# split2one_item_by_topicno: group all text lines by question type, then split each group into questions by question number
"""
Three strategies in total: 1. teacher's edition (answer/analysis keywords inline);
2. split by question number; 3. separate the questions from the answers, then split by question number.
"""
import re
from collections import Counter
from pprint import pprint

# NOTE(review): del_no, anss_structure_with_type and ans_structure_step1 are not
# imported by name anywhere in this file, so they presumably come from this star import.
from structure.ans_structure import *
from structure.stems_structure import stems_structure_byno
from utils.insert_keywords import get_con
from utils.item_resplit import resplit
from utils.item_type_line import get_item_head_info, get_item_head_info_cn
from utils.stem_ans_split import stem_ans_split
from utils.topic_no import judge_item_no_type, get_right_no
from utils.washutil import table_label_cleal
  19. def items_ans_reform(items_list, ans_list, subject):
  20. """
  21. 第三种word试卷格式, 题目和答案分开的情况
  22. 答案也有几种类型:带题型?
  23. :param items_list:
  24. :param ans_list:
  25. :param subject:
  26. :return:
  27. """
  28. con1 = list(filter(lambda x: x.strip() != "", items_list)) # 题目
  29. anss1 = list(filter(lambda x: x.strip() != "", ans_list)) # 答案,list中的每个元素为一行
  30. if re.match(".+?省.+?试[卷题]", con1[-1]):
  31. con1 = con1[:-1]
  32. if re.match(".+?省.+?试[卷题]|.*?答题?[卷卡页]", anss1[0]):
  33. anss1 = anss1[1:]
  34. # --------------答案页也包含题目的情况----------但可能题目不存在-----------------------
  35. ans_n = re.findall("【答案】", "\n".join(anss1))
  36. if subject not in ["地理", "语文"] and ans_n and len(ans_n) == len(re.findall("【解析】", "\n".join(anss1))) > 2: # 带相同个数的答案和解析
  37. print("答案页中有相同个数的答案和解析,可以答案中也带题干")
  38. item_res = split_by_keywords(anss1, subject)
  39. print("item_res:", item_res)
  40. if type(item_res) != str:
  41. # 还要判断题目是否为空
  42. if len(item_res[0]) > 10 and len([i["item_id"] for i in item_res[0] if len(i["stem"].strip()) < 5]) < 2:
  43. return item_res
  44. # ----------------- 【解析 题目】----------------------------
  45. print('---------------解析 题目-------------------')
  46. ress = stems_structure_byno(con1, subject)
  47. if type(ress) == str:
  48. return ress
  49. else:
  50. item_res, all_type, item_type_classify, item_no_type, \
  51. item_type_num, new_item_no, item_groups = ress # 全题目(不含解析)的结构化
  52. print("item_groups:",item_groups)
  53. # pprint(item_groups)
  54. # 将空题目去掉
  55. new_res = []
  56. for k, sub_res in enumerate(item_res):
  57. if sub_res['stem'].strip():
  58. sub_res['stem'] = del_no(sub_res['stem'])
  59. new_res.append(sub_res)
  60. item_res = new_res
  61. # 先对题目的切分结果进行纠正!!!!!
  62. item_res = resplit(item_res)
  63. print("item_type_classify:", item_type_classify)
  64. print("item_type_num:", item_type_num)
  65. print('----------解析 答案---------------')
  66. # -------------解析 答案---------------------------
  67. # 分两种情况:1>>答案中又按题型排列, 如一、选择题 1.答案 2.答案
  68. # 2>>答案中不含题型关键字,只按序号排列
  69. # 3>>答案中不含题型关键字,且题目中也没有,all_type, item_type_classify为空
  70. rd1_is_fail = 0
  71. have_type_line = re.search(r"[一二三四五六七八九十]\s*[、..、]\s*[^必考基础综合中等((\[]{2,5}题", "\n".join(anss1))
  72. anss1_cy = anss1.copy() # 复制一份,保证不能影响后面
  73. if have_type_line and subject != "语文":
  74. # 这里的anss1的清洗不应该影响rd2_is_fail中的原始文本!!先不修改看看再说
  75. while re.search(r"<td><p>[A-F]</p></td>|</td><td>[A-F]</td><td>|([A-F]\s*){3,}", anss1_cy[0]) is None and \
  76. (re.search(r"[\u4e00-\u9fa5]", anss1_cy[0]) is None
  77. or re.search(r"[一二三四五六七八九十]\s*[、..、]\s*(<imgsrc.*?/>)?\s*(.{2,5}题"
  78. r"|.{,4}(运用|阅读|写作|选择|单选|默写|语言表达|作文|综合|诗歌鉴赏))", anss1_cy[0]) is None):
  79. del anss1_cy[0]
  80. # 答案中的题型
  81. all_type2 = re.findall(r"\n\s*[一二三四五六七八九十]\s*[、..、::]\s*([^必考基础综合中共等::((\[]{2,5}题)|"
  82. r"\n\s*[、..、::]?\s*(单选题|非?选择题|不定选择题|多选题|填空题|计算题|[解简]答题|实验题|作图题|论述题|探究题)",
  83. "\n" + "\n".join(anss1_cy))
  84. all_type2 = ["".join(a) for a in all_type2]
  85. # '本大题' 后面处理
  86. print("答案中的题型:", all_type2)
  87. ans_str = "\n" + "\n".join(anss1_cy)
  88. try:
  89. item_res, rd1_is_fail = anss_structure_with_type(item_res, ans_str, all_type, all_type2, item_type_num, item_type_classify)
  90. except:
  91. rd1_is_fail = 1
  92. # 没有题型行或第一次解析失败
  93. rd2_is_fail = 0
  94. if not have_type_line or rd1_is_fail or subject == "语文": # 答案中没有题型行 或题型行名称不规范
  95. print('没有题型行或题目和答案的题型个数不一致或第一次解析失败')
  96. anss1 = list(
  97. map(lambda x: re.sub(r"(\n|^)\s*[((]?\s*[一二三四五六七八九十]\s*[))]?\s*[、..、::]?\s*(<p>)?"
  98. r"(\s*(.{2,5}题|.{,4}(运用|阅读|写作|选择|单选|默写|语言表达|作文|综合|诗歌鉴赏)).+?分\s*[.。]?\s*$"
  99. r"|.*?[((].+?[得共]\d+分.*?[))].*?$"
  100. r"|\s*(.{2,5}题|.{,4}(运用|阅读|写作|选择|单选|默写|语言表达|作文|综合|诗歌鉴赏))\s*([((].+?[))])?).*?$"
  101. r"|(\n|^)\s*([^\d]{2,5}题|.{,4}(运用|阅读|写作|选择|单选|默写|语言表达|作文|综合|诗歌鉴赏))(.+?分\s*[))])?\s*$", "", x), anss1))
  102. # print("anss1:", anss1)
  103. raw_item_res = item_res
  104. # try:
  105. item_res = ans_structure_step1(anss1, item_type_classify, item_res) # 答案整体结构化
  106. if str(raw_item_res) != str(item_res):
  107. rd2_is_fail = 1
  108. # except:
  109. # rd2_is_fail = 1
  110. # for i, one_item in enumerate(item_res):
  111. # item_res[i].update({'key': "", 'parse': ""})
  112. # return item_res, item_no_type, rd2_is_fail
  113. for i, one_item in enumerate(item_res):
  114. if 'key' not in one_item:
  115. item_res[i]['key'] = ""
  116. if 'parse' not in one_item:
  117. item_res[i]['parse'] = ""
  118. return item_res, item_no_type, rd2_is_fail, item_groups
  119. def split_by_keywords(con_list, subject):
  120. """
  121. 第一种试卷格式:教师用卷,含答案和解析关键字
  122. 切分思路:
  123. 1.根据大题型分,再按【答案|解析】初步拆分题目,再在‘解析’和‘答案’间细分‘题干’和‘解析’
  124. :param con_list:
  125. :return: 每个切分后的题目组成的dict
  126. """
  127. # items_con = "\n" + "\n".join(con_list)
  128. # judge_item_no_type(items_con)
  129. # item_no_type = 1
  130. # all_con = table_label_cleal()
  131. # item_no = [int(no) for no in re.findall(r'\n+\s*([1-9][0-9]?)\s*[..、、]', all_con)]
  132. # if len(item_no) <= 2:
  133. # item_no_type = 2
  134. # item_no = [int(no) for no in re.findall(r'\n+\s*[((]\s*([1-9][0-9]?)\s*[))]\s*[..、、]?', all_con)]
  135. # if len(item_no) > 3:
  136. # 去掉多余空格,作用不大
  137. con2 = ["【delete】" if (k < len(con_list) - 1 and v.strip() == "" and (
  138. re.match(r"【(答案|解析)】|(答案|解析)\s*[::]|<imgsrc\d+|\s+", con_list[k + 1].strip()) or
  139. re.match(r"(([1-9]|[1-9][0-9])\s*[..、、]|[一二三四五六七八九十]\s*[、..、]\s*[^必考基础综合中等]{2,4}题)",
  140. con_list[k + 1].strip()) is None))
  141. or (k > 0 and v.strip() == "" and (
  142. re.match(r"【(答案|解析)】$|(答案|解析)\s*[::]", con_list[k - 1].strip()) or
  143. re.match(r"[a-z<>/\s]*?[一二三四五六七八九十]\s*[、..、]\s*[^必考基础综合中等]{2,4}题",
  144. con_list[k - 1].strip())))
  145. else v for k, v in enumerate(con_list)]
  146. con3 = list(filter(lambda x: x != "【delete】", con2))
  147. while con3 and con3[-1].strip() == "":
  148. del con3[-1]
  149. while con3 and con3[0].strip() == "":
  150. del con3[0]
  151. con3.append("") # 不然最后一个题就漏掉了
  152. # 开头没用信息处理
  153. con3[0] = re.sub(r"([一二三四五六七八九十]\s*[、..、]\s*[^必考基础综合中等]{2,4}题)", r"\n\1", con3[0])
  154. while con3 and ((re.search(r"[\u4e00-\u9fa5]", con3[0]) is None) or
  155. (((subject != "语文" and re.search(r"[一二三四五]\s*[、..、]\s*[^必考基础综合中等]{2,4}题", con3[0]) is None)
  156. or (subject == "语文" and re.search(r"[一二三四五]\s*[、..、]\s*.{,4}(运用|阅读|写作|选择|单选|默写|语言表达|作文)", con3[0]) is None))
  157. and re.search(r"(^|\n)\s*[1-9][0-9]?\s*[..、、]((?!(答题卡|涂黑|本卷|2B铅笔|签字笔|密封线)).)*?$", con3[0]) is None)):
  158. # while con3 and (re.search(r"[\u4e00-\u9fa5]", con3[0]) is None
  159. # or (re.search(r"[一二三四五六七八九十]\s*[、..、]\s*[^必考基础综合中等]{2,4}题", con3[0]) is None
  160. # and re.match("\s*[1-9]\s*[、..、].+?", con3[0]) is None)):
  161. del con3[0]
  162. # ----------------------------------开始结构化---------------------------------------------
  163. items_con = "\n" + "\n".join(con3)
  164. # 初步获取题号,题号类型
  165. items_con, item_no_info, item_no_type = judge_item_no_type(items_con)
  166. # 1、获取题型行信息、按题型行切分
  167. if subject == "语文":
  168. con4, title_info_dict, choice_class = get_item_head_info_cn(items_con)
  169. else:
  170. con4, title_info_dict, choice_class = get_item_head_info(items_con)
  171. all_type = title_info_dict["all_type"]
  172. select_type_id = title_info_dict["select_type_id"]
  173. each_item_score, each_item_score2 = title_info_dict["each_item_score"], title_info_dict["each_item_score2"]
  174. # 2、据是否有题型行分两步进行
  175. # 没有做拆图处理
  176. res = []
  177. if not all_type:
  178. print("不存在大题题型行或题型行格式有问题")
  179. if len(re.findall(r"\n\s*【答案】", items_con)) != len(re.findall(r"\n\s*【解析】", items_con)):
  180. return "不存在大题题型行或题型行格式有问题"
  181. else:
  182. item_no = []
  183. subcon = re.split(r"((?<=\n)\s*【答案】|(?<=\n)\s*【解析】)\n?", items_con.strip())
  184. pattern1 = re.compile(r"([1-9]|[1-9][0-9])\s*[..、、].+?")
  185. if re.match(pattern1, subcon[0].strip()):
  186. st_id = re.match(pattern1, subcon[0].strip()).group(1)
  187. if int(st_id) > 1:
  188. item_no.append(int(st_id))
  189. else:
  190. item_no.append(1)
  191. else:
  192. item_no.append(1)
  193. if len(subcon) == 5: # 只有1道题
  194. dd = dict(zip(["stem", "key", "parse"],
  195. re.split(r"(?<=\n)\s*【答案】|(?<=\n)\s*【解析】", table_label_cleal(items_con))))
  196. dd["type"] = ""
  197. dd["stem"] = re.sub(r"^\d+\s*[..、、]", "", dd["stem"][:5]) + dd["stem"][5:]
  198. # dd["score"] = 0
  199. dd["errmsgs"] = []
  200. dd["item_id"] = item_no[0] # 要用实际id 不是索引序号
  201. res.append(dd)
  202. else:
  203. # ------在下一题【解析】在本题【答案】之间找到下一题【stem】的位置--------
  204. all_item, item_no, errmsg_dict, count = get_con(subcon, item_no_type, item_no, index=0)
  205. # item_no.extend(local_item_no)
  206. for idk, one_item in enumerate(all_item):
  207. if one_item:
  208. dd = dict(zip(["stem", "key", "parse"],
  209. re.split(r"(?<=\n)\s*【答案】\n?|(?<=\n)\s*【解析】\n?",
  210. table_label_cleal(one_item))))
  211. dd["type"] = ""
  212. dd["stem"] = re.sub(r"\d+\s*[..、、]", "", dd["stem"][:5]) + dd["stem"][5:]
  213. # dd["score"] = 0
  214. dd["errmsgs"] = [errmsg_dict[idk]] if idk in errmsg_dict else []
  215. dd["item_id"] = item_no[idk]
  216. res.append(dd)
  217. else:
  218. if len(all_type) != len(con4):
  219. print("存在题型行没有换行")
  220. return "存在题型行末尾没有换行,请在所有题型行末尾重新换行" # 放第【2】种方案中进行处理
  221. else:
  222. # if "非选择题" in all_type:
  223. # return "第" + str(all_type.index("非选择题")+1) + "大题的题型不明确"
  224. index = 0 # 每个大题的第一题的题号索引位置
  225. for num, one_type in enumerate(con4):
  226. count = 1
  227. if len(re.findall(r"\n\s*【答案】", one_type)) == len(re.findall(r"\n\s*【解析】", one_type)):
  228. subcon = re.split(r"((?<=\n)\s*【答案】|(?<=\n)\s*【解析】)\n?", one_type.strip())
  229. # index根据第一道题的题号进行纠正
  230. item_no = []
  231. pattern1 = re.compile(r"([1-9]|[1-9][0-9])\s*[..、、].+?")
  232. if re.match(pattern1, subcon[0].strip()):
  233. st_id = re.match(pattern1, subcon[0].strip()).group(1)
  234. if num == 0 and int(st_id) != 1:
  235. index = int(st_id) - 1
  236. item_no.append(int(st_id))
  237. else:
  238. item_no.append(index+1)
  239. if len(subcon) == 5: # 只有1道题
  240. dd = dict(zip(["stem", "key", "parse"],
  241. re.split(r"(?<=\n)\s*【答案】|(?<=\n)\s*【解析】", table_label_cleal(one_type))))
  242. dd["type"] = all_type[num]
  243. dd["stem"] = re.sub(r"^\d+\s*[..、、]", "", dd["stem"][:5]) + dd["stem"][5:]
  244. # dd["score"] = each_item_score[num]
  245. dd["errmsgs"] = []
  246. dd["item_id"] = item_no[0] # 要用实际id 不是索引序号
  247. # if not dd["score"] and each_item_score2 and str(dd["item_id"]) in each_item_score2.keys():
  248. # dd["score"] = each_item_score2[str(dd["item_id"])]
  249. if select_type_id and dd["item_id"] in select_type_id:
  250. dd['is_optional'] = 'true'
  251. # if dd["score"] == 0.0 and title_info_dict["total_score"][num] > 0.0:
  252. # dd["score"] = title_info_dict["total_score"][num]
  253. res.append(dd)
  254. else:
  255. # ------在下一题【解析】在本题【答案】之间找到下一题【stem】的位置,再按此3个关键字进行 切分--------
  256. all_item, item_no, errmsg_dict, count = get_con(subcon, item_no_type, item_no,
  257. all_type=all_type, num=num, index=index)
  258. # item_no.extend(local_item_no)
  259. for idk, one_item in enumerate(all_item):
  260. dd = dict(zip(["stem", "key", "parse"],
  261. re.split(r"(?<=\n)\s*【答案】\n?|(?<=\n)\s*【解析】\n?",
  262. table_label_cleal(one_item))))
  263. dd["type"] = all_type[num]
  264. dd["stem"] = re.sub(r"\d+\s*[..、、]", "", dd["stem"][:5]) + dd["stem"][5:]
  265. # dd["score"] = each_item_score[num]
  266. dd["errmsgs"] = [errmsg_dict[idk]] if idk in errmsg_dict else []
  267. dd["item_id"] = item_no[idk] # idk+1+index 为序号
  268. if choice_class:
  269. for k, v in choice_class.items():
  270. if dd["item_id"] in v:
  271. dd["type"] = k + "选题"
  272. # elif len(choice_class) == 1:
  273. # dd["type"] = "多选题" if k == "单" else "单选题"
  274. # if not dd["score"] and each_item_score2 and str(dd["item_id"]) in each_item_score2.keys():
  275. # dd["score"] = each_item_score2[str(dd["item_id"])]
  276. if select_type_id and dd["item_id"] in select_type_id:
  277. dd['is_optional'] = 'true'
  278. res.append(dd)
  279. # pprint(res)
  280. else:
  281. return "第" + str(num + 1) + "大题《" + all_type[num] + "》中【答案】或【解析】格式有误或其中某道题中出现多个相同关键字或漏关键字"
  282. index += count
  283. for i, one_item in enumerate(res):
  284. if 'key' not in one_item:
  285. res[i]['key'] = ""
  286. if 'parse' not in one_item:
  287. res[i]['parse'] = ""
  288. return res, item_no_type
  289. def split_by_topicno(con_list, subject, is_dati=0):
  290. """
  291. 第二种试卷格式: 不同时或都不含有{答案}和{解析}关键字
  292. 按题号切分每个题目
  293. 将所有行文本 按题型分大类,再在每个大类中切分每个题目
  294. :param con_list: 所有行文本组成的list
  295. :return: [{},{}]
  296. """
  297. con1 = list(filter(lambda x: x.strip() != "", con_list))
  298. ress = stems_structure_byno(con1, subject, is_dati) # 按题号切分后的初步结构化
  299. if type(ress) == str:
  300. return ress
  301. else:
  302. res, all_type, item_type_classify, item_no_type, item_type_num, new_item_no, item_groups = ress
  303. # res, all_type, item_type_classify = stems_structure_byno(con1)
  304. print("item_type_num:", item_type_num)
  305. # pprint(res)
  306. # 格式行调整
  307. for nn, one_i in enumerate(res):
  308. if "com_stem" in one_i and re.search('<p style=".*?">\n+$', one_i["com_stem"]):
  309. one_i["com_stem"], b, _ = re.split('(<p style=".*?">\n+)$', one_i["com_stem"])
  310. one_i["stem"] = b + one_i["stem"]
  311. if nn > 0 and re.search('<p style=".*?">\n+$', res[nn-1]["stem"]):
  312. res[nn - 1]["stem"], b, _ = re.split('(<p style=".*?">\n+)$', res[nn-1]["stem"])
  313. if "com_stem" not in one_i:
  314. one_i["stem"] = b + one_i["stem"]
  315. # 可能存在有的题目有解析,有的没有
  316. last_comstem_id = 0
  317. ans_groups = {}
  318. no_ans_n = 0
  319. for k, one_res in enumerate(res):
  320. if item_groups["is_groups"]:
  321. if "com_stem" in one_res:
  322. last_comstem_id = k
  323. if re.search('\n【(答案|[解分][析答]|详解|点[评睛]|考点|专题)】', one_res["stem"]):
  324. case = "case1" # 默认有“答案”关键字
  325. if re.search(r'\n【答案】|[\n】]\s*答案\s*[::]', one_res["stem"]) is None:
  326. # 没“答案”关键字
  327. case = "case0"
  328. dd1 = stem_ans_split(one_res, case) # 对切分后的每道题再细分
  329. one_res["stem"] = dd1["stem"]
  330. del dd1["stem"]
  331. if not dd1["key"] and not dd1["parse"]:
  332. no_ans_n += 1
  333. else:
  334. if subject in ["地理", "语文"] and no_ans_n == k-last_comstem_id > 0:
  335. if (k+1 < len(res) and ("com_stem" in res[k+1] or k+1 in item_groups["groups_data"])) or len(re.findall("【\d?题?详解】", dd1["parse"])) > 1\
  336. or len(re.findall(r"(?<=[】\s\n])\d{1,2}\s*[、..、]|^\d{1,2}\s*[、..、]", dd1["key"])) > 1:
  337. # 默认是前后都是题组的情况
  338. if is_dati:
  339. ans_groups["{}-{}".format(last_comstem_id + res[0]["item_id"], k + res[0]["item_id"])] = dd1
  340. else:
  341. ans_groups["{}-{}".format(last_comstem_id + 1, k + 1)] = dd1
  342. dd1 = {"key": "", "parse": ""}
  343. no_ans_n = 0
  344. one_res.update(dd1)
  345. else: # 没有解析的情况
  346. one_res.update({"key": "", "parse": ""})
  347. no_ans_n += 1
  348. one_res["stem"] = del_no(one_res["stem"], item_no_type)
  349. if 'pic' in one_res:
  350. one_res["stem"] += "\n" + "\n".join(one_res["pic"])
  351. del one_res["pic"]
  352. # 先对题目的切分结果进行纠正!!!!!
  353. res = resplit(res)
  354. # pprint(res)
  355. # 对最后一个题后面带个别答案(无答案页)
  356. # if res:
  357. # pattern1 = re.search('\n\s*([1-9]|[1-9][0-9])\s*[..、、]\s*(解\s*[::]|【解析|【答案)', res[-1]["stem"])
  358. # if pattern1:
  359. # breakp = pattern1.start()
  360. # ans_str = res[-1]["stem"][breakp:]
  361. # ans_no_info = pre_get_item_no(ans_str, item_no_type)
  362. # ans_no, ans_no_idx = get_right_no(ans_no_info)
  363. # all_ans = [del_no(ans_str[i:j]) for i, j in zip(ans_no_idx, ans_no_idx[1:] + [None])]
  364. # res[-1]["stem"] = res[-1]["stem"][:breakp]
  365. # res = get_ans_match(res, all_ans, ans_no)
  366. # else:
  367. # ans_str = res[-1]["stem"] + res[-1]["parse"]
  368. # ans_no_info = pre_get_item_no(ans_str, item_no_type)
  369. # ans_no, ans_no_idx = get_right_no(ans_no_info)
  370. # if len(ans_no) == len(res):
  371. # all_ans = [del_no(ans_str[i:j]) for i, j in zip(ans_no_idx, ans_no_idx[1:] + [None])]
  372. # res[-1]["stem"] = res[-1]["stem"][:ans_no_idx[0]]
  373. # res = get_ans_match(res, all_ans, ans_no)
  374. # elif ans_no_idx:
  375. # try:
  376. # ans_no1, table_ans, st = get_table_ans(res[-1]["stem"][:ans_no_idx[0]], [], flag=1)
  377. # if table_ans and 0 < ans_no[0] - ans_no1[-1] < 3:
  378. # all_ans = table_ans
  379. # all_ans.extend([del_no(ans_str[i:j]) for i, j in zip(ans_no_idx, ans_no_idx[1:] + [None])])
  380. # new_ans_no = ans_no1
  381. # new_ans_no.extend(ans_no)
  382. # if st >= 0:
  383. # res[-1]["stem"] = res[-1]["stem"][:st]
  384. # else:
  385. # res[-1]["stem"] = res[-1]["stem"][:ans_no_idx[0]]
  386. # res = get_ans_match(res, all_ans, new_ans_no)
  387. # except:
  388. # if len(ans_no) > 4 and all([True if not one_res["key"] and not one_res["parse"]
  389. # else False for one_res in res[:-1]]):
  390. # all_ans = [del_no(ans_str[i:j]) for i, j in zip(ans_no_idx, ans_no_idx[1:] + [None])]
  391. # res[-1]["stem"] = res[-1]["stem"][:ans_no_idx[0]]
  392. # res = get_ans_match(res, all_ans, ans_no)
  393. # 没有识别出答案切分点的情况,很可能答案里的部分也当成题文进行拆分,所以先判断下是否有相同的id
  394. all_no = [one_res['item_id'] for one_res in res]
  395. if len(list(set(all_no))) - len(all_no) < -2:
  396. Count_no = sorted(dict(Counter(all_no)).items(), key=lambda d: d[1], reverse=True)
  397. if Count_no[0][1] > 1:
  398. split_idx = [i for i, no in enumerate(all_no) if no == Count_no[0][0]][1]
  399. for one_res in res[split_idx:]:
  400. if re.search("[((]\s+[))]|(等于|存在|[是有为])多少|求.*?[??]",
  401. one_res["stem"] + "\n" + one_res["parse"]) is None:
  402. bef_no = [k for k, j in enumerate(res[:split_idx]) if j["item_id"]==one_res["item_id"]]
  403. if bef_no and not res[:split_idx][bef_no[0]]["parse"]:
  404. res[:split_idx][bef_no[0]]["parse"] = one_res["stem"] + "\n" + one_res["parse"]
  405. return res[:split_idx], item_no_type
  406. return res, item_no_type, item_groups, ans_groups