parse_chunk.py 32 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506
#!/usr/bin/env python
  2. # -*- coding:utf-8 -*-
  3. # 本文件包含以下函数
  4. # stem_ans_split:将切出来的一道题 按答案解析 进一步细分
  5. # correct_wrong_no :针对分错的题号进行 纠正 或 报错
  6. # stems_structure_byno:按题号进行切分;
  7. # dati2slave :带小问的大题 按小问切分
  8. # split2little_con: 将带小问的填空题或解答题 按 小问 继续划分,小问已切分好
  9. # get_options_arrange: 判断word中选项每行排版个数
  10. import re
  11. from washutil import table_label_cleal
  12. from ans_structrue import only_parse_split, get_ans_from_parse
  13. from pprint import pprint
  14. from collections import Counter
  15. def stem_ans_split(one_item_dict, case):
  16. """
  17. 将切出来的一道题 按 答案解析 进一步细分
  18. :param one_item_dict: 单道题的初步结构字典{"content": , "item_id": , "errmsgs": [],"item_topic_name":,}
  19. :param case: 属于哪种情况
  20. :return: {"content": ,"answer": ,"parse":}
  21. """
  22. one_item = one_item_dict["content"]
  23. item_type = one_item_dict["item_topic_name"]
  24. # print(one_item)
  25. if case == 'case0': # 没“答案”关键字
  26. inside_split = re.split(r"【(解析|解答|分析|详解|点评|点睛|考点|专题)】\n*?",
  27. table_label_cleal(one_item))
  28. inside_split = ['【' + a + '】' if str(a).strip() in ['解答', '分析', '解析', '详解', '点评', '点睛']
  29. else str(a).replace('None', '').strip() for a in inside_split]
  30. # print(':::', inside_split)
  31. # print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
  32. dd = dict(zip(["content", "parse_title"], inside_split[0:2]))
  33. dd["parse"] = str(dd["parse_title"]) + "\n".join(inside_split[2:]).replace("\n\n", "\n")
  34. dd["parse"] = re.sub(r"^\s*【解析】", "", dd["parse"])
  35. dd["answer"] = ""
  36. else: # if case == 'case1': # 有“答案”关键字
  37. dd = dict(zip(["content", "answer"], re.split(r"【答案】\n?",
  38. table_label_cleal(one_item), maxsplit=1)))
  39. # pprint(dd) # 一般默认‘答案’在‘解析’的前面
  40. subdd = dict(zip(["answer", "parse_title", "parse"],
  41. re.split(r"【(解析|解答|分析|详解|点评|点睛)】\n?", dd["answer"], maxsplit=1)))
  42. dd["answer"] = subdd["answer"]
  43. if "parse_title" in subdd:
  44. dd["parse"] = "【" + subdd["parse_title"] + "】" + subdd["parse"]
  45. dd["parse"] = re.sub(r"^\s*【解析】", "", dd["parse"])
  46. else:
  47. dd["parse"] = ""
  48. dd["content"] = re.sub(r"[1-9][0-9]?\s*[..、、]", "", dd["content"][:5]) + dd["content"][5:]
  49. # 获取答案
  50. if not dd["answer"]:
  51. dd["answer"] = get_ans_from_parse(dd["parse"], item_type, dd["content"])
  52. # 补充!!!------------------------------------------
  53. # if item_type in ["单选题", "多选题", "选择题"]: # (故选[::]([A-Z;;和与、、]+)|
  54. # ans = re.search(r'故选[::]?<imgsrc=[^>]+?data-latex="\$?([A-Z;;和与、、\s]+)\$?".+?/>|故选[::]?([A-Z;;和与、、\s]+)',
  55. # dd["parse"].replace("$", "").replace(" ", ""))
  56. # if ans:
  57. # dd["answer"] = ans.group(1) if ans.group(1) is not None else ans.group(2) # ans.group(1) != None
  58. # else:
  59. # dd["answer"] = ""
  60. # else:
  61. # dd["answer"] = "见解析"
  62. # ans = re.search(r'故\s*[::]?\s*答案分?别?[为是]?\s*[::]?\s*(.+?)[..]\s*\n', dd["parse"])
  63. # if ans:
  64. # dd["answer"] = ans.group(1)
  65. # ------------------------------------------------------
  66. if "parse_title" in dd:
  67. del dd["parse_title"]
  68. return dd
  69. def stem_ans_split2(one_type_list, idx1, idx2, item_type, case):
  70. """
  71. 将切出来的一道题 按答案解析 进一步细分
  72. :param one_type_list: 一类题文的list
  73. :param idx1:题目开头,包含
  74. :param idx2:下一题开头
  75. :param item_type:题型
  76. :param case: 属于哪种情况
  77. :return:{"content": ,"answer": ,"parse":}
  78. """
  79. one_item = one_type_list[idx1:idx2]
  80. if idx2 == -1:
  81. one_item = one_type_list[idx1:]
  82. if case == 'case1': # 没“答案”关键字
  83. inside_split = re.split(r"【(解析|解答|分析|详解|点评|点睛|考点|专题)】\n*?",
  84. table_label_cleal("\n".join(one_item)))
  85. inside_split = ['【' + a + '】' if str(a).strip() in ['解答', '分析', '解析', '详解', '点评', '点睛']
  86. else str(a).replace('None', '').strip() for a in inside_split]
  87. dd = dict(zip(["content", "parse_title"], inside_split[0:2]))
  88. dd["parse"] = str(dd["parse_title"]) + "\n".join(inside_split[2:]).replace("\n\n", "\n")
  89. else:
  90. dd = dict(zip(["content", "answer"], re.split(r"【答案】\n?|答案\s*[::]\n?",
  91. table_label_cleal("\n".join(one_item)), maxsplit=1)))
  92. subdd = dict(zip(["answer", "parse_title", "parse"],
  93. re.split(r"【(解析|解答|分析|详解|点评|点睛)】\n?|(解析|解答|分析|详解|点评|点睛)\s*[::]", dd["answer"], maxsplit=1)))
  94. dd["answer"] = subdd["answer"]
  95. if "parse_title" in subdd:
  96. dd["parse"] = "【" + subdd["parse_title"] + "】" + subdd["parse"]
  97. dd["parse"] = re.sub(r"^\s*【解析】", "", dd["parse"])
  98. dd["content"] = re.sub(r"[1-9][0-9]?\s*[..、、]", "", dd["content"][:5]) + dd["content"][5:]
  99. dd["item_topic_name"] = item_type if re.sub('[((]', "", item_type) != '本大题' else "解答题"
  100. if item_type in ["单选题", "多选题", "选择题"]: # (故选[::]([A-Z;;和与、、]+)|
  101. ans = re.search(r'故选[::]?<imgsrc\d+data-latex="([A-Z;;和与、、\s]+)"/>|故选[::]?([A-Z;;和与、、\s]+)',
  102. dd["parse"].replace("$", "").replace(" ", ""))
  103. if ans:
  104. dd["answer"] = ans.group(1) if ans.group(1) is not None else ans.group(2) # ans.group(1) != None
  105. else:
  106. dd["answer"] = ""
  107. else:
  108. dd["answer"] = "见解析"
  109. ans = re.search(r'故\s*[::]?\s*答案分?别?[为是]?\s*[::]?\s*(.+?)[..]\s*\n', dd["parse"])
  110. if ans:
  111. dd["answer"] = ans.group(1)
  112. del dd["parse_title"]
  113. return dd
  114. # def correct_wrong_no(con_list, items_no, item_no_type):
  115. # """
  116. # 针对分错的题号进行纠正 ;;带解析的划分题目最好按关键字拆分!!!!
  117. # 题号划分错误有:题号重复,题号遗漏,题号偏离很远的错误如88.等
  118. # 无题型行时,con_list中每个元素代表每一行
  119. # 有题型行时,con_list中每个元素代表每个题型中的所有题目
  120. # items_no:初步找到的所有题号
  121. # :return: con_list
  122. # """
  123. # # items_no = [1,2,3,4,5,6,7, 8, 9, 10, 11, 6, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
  124. # seq_no = find_seq_num(items_no) # 找到连续的分组
  125. # print("items_no:", items_no)
  126. # print("seq_no:", seq_no)
  127. #
  128. # err_no_idx = {} # 分错的分组序号和错误题号,主要针对2个以内成组的序号
  129. # double_no = [] # 针对2个以上成组,且重复序号 分错的情况
  130. # omit_no = [] # 因没有换行或无题号导致 没有 切分出来的题号
  131. # right_no_list = []
  132. # if len(seq_no) > 1: # 存在分断或分错的地方
  133. # print('按题号切分的过程中,存在分断或分错的地方')
  134. # right_no = [i for i in seq_no if len(i) > 2]
  135. # if len(find_seq_num(sum(right_no, []))) == 1: # 2个以上成的所有组是连续的
  136. # # 题号序列异常值判断
  137. # right_seq = del_exception_value(items_no) # 主要去掉异常的大值
  138. # # print("right_seq:",right_seq)
  139. # right_max_v = -1
  140. # if not right_seq:
  141. # right_max_v = max(items_no)
  142. # else:
  143. # right_max_v = right_seq[-1]
  144. # # print("right_max_v:", right_max_v)
  145. # if sum(right_no, [])[0] == 1 and sum(right_no, [])[-1] == right_max_v: # 题号从1开始
  146. # # [1,2,3,4,5,6,7, 8, 9, 10, 11, 6, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
  147. # right_no_list.extend([i for k, i in enumerate(seq_no) if len(i) > 2])
  148. # err_no_idx.update({k: i for k, i in enumerate(seq_no) if len(i) <= 2}) # 出现重复题号
  149. # else: # 说明左右两边有遗漏
  150. # # [[1, 2], [4, 5], [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]]
  151. # # [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18], [20, 21]]
  152. # # [[1, 2], [4, 5], [7, 8, 9, 10, 11], [6], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21]]
  153. # # todo_no = [i for i in seq_no if len(i) <= 2]
  154. # right_no_idx = [k for k, i in enumerate(seq_no) if len(i) > 2]
  155. # if seq_no[:right_no_idx[0]]: # k>=1 左边有遗漏
  156. # que_no = set(range(1, sum(right_no, [])[0])) - set(sum(seq_no[:right_no_idx[0]], []))
  157. # omit_no.extend(list(que_no))
  158. # elif len(right_no_idx) == 1 and seq_no[right_no_idx[0]+1:]: # 右边有遗漏
  159. # que_no = set(range(sum(right_no, [])[-1]+1, right_max_v)) - set(sum(seq_no[right_no_idx[0]+1:], []))
  160. # omit_no.extend(list(que_no))
  161. # # print("omit_no:",omit_no)
  162. # # 既遗漏又有重复的错误不同时考虑!!!!,先报遗漏错误,教师修改后再对重复部分进行纠正
  163. # else:
  164. # # 存在题号错误:一种是与正确的重复,另一种是与序号偏离的很远,如81,目前是暂定取99内的数字作为序号
  165. # # [[1, 2], [4, 5, 6, 7, 8, 9, 10, 11], [13, 14], [16, 17, 18, 19, 20, 21]]
  166. # # [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [13, 14], [16, 17, 18, 19, 20, 21]]
  167. # num_count = Counter(items_no)
  168. # # print("num_count:",num_count)
  169. # if len(set(num_count.values())) > 1:
  170. # print("存在{题号重复}的切分错误")
  171. # for k, v in num_count.items():
  172. # if v >= 2: # 重复2次以上
  173. # # print(items_no.index(k)) # 只能获取第一个元素的索引值
  174. # v2_index = [index for (index, value) in enumerate(items_no) if value == k][1:] # 重复序号的索引
  175. # # 判断重复序号哪个是错误的,这里没有考虑题号遗漏的情况
  176. # if v2_index[0]+items_no[0] > k: # 位置 > 序号, 一般要求题号从1开始
  177. # for subi in v2_index:
  178. # # print(subi, k)
  179. # double_no.append((k, 'xiao'))
  180. # del items_no[subi]
  181. # if v2_index[0]++items_no[0] < k: # 位置 < 序号
  182. # for subi in v2_index:
  183. # double_no.append((k, 'da'))
  184. # del items_no[subi]
  185. #
  186. # else: # 存在题号遗漏
  187. # print("存在题号遗漏")
  188. # for k, i in enumerate(right_no):
  189. # if k == 0:
  190. # if i[0] == 2:
  191. # omit_no.append(1)
  192. # if i[0] > 2:
  193. # omit_no.append("1~"+str(i[0]-1))
  194. # if 0 < k < len(right_no):
  195. # omit_no.extend(list(range(right_no[k-1][-1]+1, i[0])))
  196. # # if omit_no:
  197. # # return "第" + ",".join(map(str, omit_no)) + "题的格式是否正确,不要放在表格中,且要求题号从1开始并连续;" \
  198. # # "若格式正确,请将第" + ",".join(map(str, omit_no)) + "题的题号(包括题号后的标点符号)重新手输且与上一题重新换行"
  199. #
  200. # if double_no and len(find_seq_num(items_no)) == 1:
  201. # # 在分错题号前加标识
  202. # all_con = "@@\n" + "@@\n".join(con_list)
  203. # for db in double_no:
  204. # may_no_st = re.search(r"\n\s*" + str(db[0]) + r'\s*([..、、].+?)',
  205. # all_con, re.S).start() # 分错位置在全文中的索引
  206. # if item_no_type == 2:
  207. # may_no_st = re.search(r"\n\s*[((]\s*" + str(db[0]) + r'\s*[))]\s*([..、、]?.+?)',
  208. # all_con, re.S).start() # 分错位置在全文中的索引
  209. # if db[1] == 'xiao': # 重复的切分错误的序号在正确的后面,第一个匹配到的是正确的
  210. # # all_con = all_con[:may_no_st] + re.sub(r"\s+((?!src).)+?", r"\1", all_con[may_no_st:][:15]) + all_con[may_no_st:][15:]
  211. # # 该正则表示空格后面是src字符串时,空格保留;最开始时图片已做过替换,这里也可以去掉图片信息中的空格
  212. #
  213. # err_no_st = re.search(r"\n\s*" + str(db[0]) + r'\s*([..、、].+?)',
  214. # all_con[may_no_st+10:], re.S).start() # 分错位置在全文中的索引
  215. # if item_no_type == 2:
  216. # err_no_st = re.search(r"\n\s*[((]\s*" + str(db[0]) + r'\s*[))]\s*([..、、]?.+?)',
  217. # all_con[may_no_st + 10:], re.S).start() # 分错位置在全文中的索引
  218. # # print("err_no_st:", err_no_st, all_con[may_no_st + err_no_st+10:may_no_st + err_no_st+20])
  219. #
  220. # all_con = all_con[:may_no_st + err_no_st + 11] + "【fei】" \
  221. # + all_con[may_no_st + err_no_st + 11:] # 在分错题号前加标识
  222. #
  223. # if db[1] == 'da': # 重复的切分错误的序号在正确的前面,第一个匹配到的是错误的
  224. # all_con = all_con[:may_no_st + 1] + "【fei】" \
  225. # + all_con[may_no_st + 1:] # 在分错题号前加标识
  226. # # print("all_con:",all_con)
  227. # con_list = all_con.split("@@\n")[1:]
  228. #
  229. # # 针对2个以内成组的序号 加错误标识
  230. # sorted_idx = sorted(err_no_idx.keys(), reverse=False) # 对字典按索引位置排序
  231. # print("err_no_idx:", err_no_idx, "sorted_idx:", sorted_idx)
  232. # if err_no_idx:
  233. # if sorted_idx[0] > 0:
  234. # all_con = "@@\n" + "@@\n".join(con_list)
  235. # st_flag = str(seq_no[sorted_idx[0] - 1][-1]) # 分错位置的前一个题号
  236. # # 分错位置的前一个题号在全文中的索引
  237. # # if err_no_idx[sorted_idx[0]][0] == int(st_flag):
  238. # # return st_flag + "题题号出现重复"
  239. # st_flag_index = re.search(r"\n+\s*" + st_flag + r'\s*([..、、].+?)', all_con, re.S).start()
  240. # if item_no_type == 2:
  241. # st_flag_index = re.search(r"\n+\s*[((]\s*" + st_flag + r'\s*[))]\s*([..、、]?.+?)', all_con, re.S).start()
  242. # for k in sorted_idx: # 遍历键
  243. # for subk in err_no_idx[k]: # 遍历 键 的值
  244. # # print('*****************')
  245. # # print("st_flag:", st_flag, '---subk:', subk)
  246. # # print("st_flag_index:",st_flag_index)
  247. # err_no_st = re.search(r"\n\s*" + str(subk) + r'\s*([..、、].+?)',
  248. # all_con[st_flag_index:], re.S).start() # 分错位置在全文中的索引
  249. # if item_no_type == 2:
  250. # err_no_st = re.search(r"\n\s*[((]\s*" + str(subk) + r'\s*[))]\s*([..、、]?.+?)',
  251. # all_con[st_flag_index:], re.S).start() # 分错位置在全文中的索引
  252. # all_con = all_con[:st_flag_index + err_no_st + 1] + "【fei】" \
  253. # + all_con[st_flag_index + err_no_st + 1:] # 在分错题号前加标识
  254. # con_list = all_con.split("@@\n")[1:]
  255. # else: # 拿到了前面不是题号的序号 [27, 27, 1, 2, 3, 4, 5, 6, 7]
  256. # all_con = "@@\n" + "@@\n".join(con_list)
  257. # if items_no.count(1) == 1:
  258. # con_1 = re.split(r"@@\n\s*1\s*[..、、]", all_con)[1]
  259. # con_list = ("1、"+con_1).split("@@\n") # right_no_list = sum(right_no_list, [])
  260. # # right_no_list = str(right_no_list).replace("[", "").replace("]", "").replace(" ", "").split(",")
  261. #
  262. # # con_list = re.split(r"\n\s*("+ r"|".join(right_no_list) + ")\s*[..、、]", all_con)[1:]
  263. # # if len(con_list) > 1:
  264. # # con_list = [con for k, con in enumerate(con_list) if k % 2 == 1]
  265. # return con_list
def split2one_item(con_list):
    """
    Split a paper into single questions (paper format 1).

    Paper format 1 is the teacher's edition, which contains the answer and
    analysis keywords. The input comes from an HTML file; the document is
    first separated by major question section.

    Splitting approach:
    1. Split on blank lines: first remove the blank ``<p> </p>`` lines that
       precede 【答案】, 【解析】 or an ``<img src="files/image<N>.png">`` tag,
       then split directly on ``<p></p>``.

    Format requirements: each question is numbered with digits plus an ASCII
    period (e.g. ``21.``); each major section is numbered with a Chinese
    numeral (一二三四) plus the Chinese enumeration comma.

    NOTE(review): every line that follows this docstring in the visible source
    is commented out, so as shown the function has no executable body -
    confirm whether this function is still in use.

    :param con_list: not read by any live code in the visible source.
    :return:
    """
  275. # item_no_type = 1
  276. # # all_con = table_label_cleal("\n" + "\n".join(con_list))
  277. # # item_no = [int(no) for no in re.findall(r'\n+\s*([1-9][0-9]?)\s*[..、、]', all_con)]
  278. # # if len(item_no) <= 2:
  279. # # item_no_type = 2
  280. # # item_no = [int(no) for no in re.findall(r'\n+\s*[((]\s*([1-9][0-9]?)\s*[))]\s*[..、、]?', all_con)]
  281. # # if len(item_no) > 3:
  282. # # all_con = re.sub(r'\n\s*\(([1-9][0-9]?)\)\s*[..、、]?', "\n" + r"【@\1、", all_con)
  283. # # con_list = all_con.replace("【@", "").split("\n")[1:]
  284. # # ----------------------------------------------------------------------------
  285. # # 去掉多余空格,作用不大
  286. # con2 = ["【delete】" if (k < len(con_list) - 1 and v.strip() == "" and (
  287. # re.match(r"【(答案|解析)】|(答案|解析)\s*[::]|<imgsrc\d+|\s+", con_list[k + 1].strip()) or
  288. # re.match(r"(([1-9]|[1-4][0-9])\s*[..、、]|[一二三四五六七八九十]\s*[、..、]\s*[^必考基础综合中等]{2,4}题)",
  289. # con_list[k + 1].strip()) is None))
  290. # or (k > 0 and v.strip() == "" and (
  291. # re.match(r"【(答案|解析)】$|(答案|解析)\s*[::]", con_list[k - 1].strip()) or
  292. # re.match(r"[a-z<>/\s]*[一二三四五六七八九十]\s*[、..、]\s*[^必考基础综合中等]{2,4}题",
  293. # con_list[k - 1].strip())))
  294. # else v for k, v in enumerate(con_list)]
  295. # con3 = list(filter(lambda x: x != "【delete】", con2))
  296. # while len(con3) > 0:
  297. # if con3[-1].strip() == "":
  298. # del con3[-1]
  299. # if con3[0].strip() == "":
  300. # del con3[0]
  301. # con3.append("") # 不然最后一个题就漏掉了
  302. #
  303. # # 开头没用信息处理
  304. # con3[0] = re.sub(r"([一二三四五六七八九十]\s*[、..、]\s*[^必考基础综合中等]{2,4}题)", r"\n\1", con3[0])
  305. # while con3 and (re.search(r"[\u4e00-\u9fa5]", con3[0]) is None
  306. # or re.search(r"[一二三四五六七八九十]\s*[、..、]\s*[^必考基础综合中等]{2,4}题", con3[0]) is None):
  307. # del con3[0]
  308. #
  309. # # ----------------------解析 方案【1】-------------------------------------------------------------
  310. # # 根据大题型分,再按【答案|解析】初步拆分题目,再在‘解析’和‘答案’间细分‘题干’和‘解析’
  311. # # 1、获取题型行信息、按题型行切分
  312. # con4, all_type_info, all_type, each_item_score, each_item_score2, select_type_id, choice_class \
  313. # = get_item_head_info("\n" + "\n".join(con3))
  314. #
  315. # # 2、据是否有题型行分两步进行
  316. # res = []
  317. # if not all_type:
  318. # print("不存在大题题型行或题型行格式有问题")
  319. # return "不存在大题题型行或题型行格式有问题,请检查" # 放第【2】种方案中进行处理
  320. # else:
  321. # if len(all_type) != len(con4):
  322. # print("存在题型行没有换行")
  323. # return "存在题型行末尾没有换行,请在所有题型行末尾重新换行" # 放第【2】种方案中进行处理
  324. # else:
  325. # # if "非选择题" in all_type:
  326. # # return "第" + str(all_type.index("非选择题")+1) + "大题的题型不明确"
  327. # index = 0
  328. # for num, one_type in enumerate(con4):
  329. # count = 1
  330. # if len(re.findall(r"\n\s*【答案】", one_type)) == len(re.findall(r"\n\s*【解析】", one_type)):
  331. # subcon = re.split(r"((?<=\n)\s*【答案】|(?<=\n)\s*【解析】)\n?", one_type.strip())
  332. # # index根据第一道题的题号进行纠正
  333. # st_pat = re.match(r"([1-9]|[1-6][0-9])\s*[..、、].+?", subcon[0].strip())
  334. # if st_pat and num == 0:
  335. # st_id = st_pat.group(1)
  336. # if int(st_id) != 1:
  337. # index = int(st_id) - 1
  338. #
  339. # if len(subcon) == 5: # 只有1道题
  340. # dd = dict(zip(["content", "answer", "parse"],
  341. # re.split(r"(?<=\n)\s*【答案】|(?<=\n)\s*【解析】", table_label_cleal(one_type))))
  342. # dd["item_topic_name"] = all_type[num]
  343. # dd["content"] = re.sub(r"\d+\s*[..、、]", "", dd["content"][:5]) + dd["content"][5:]
  344. # dd["score"] = each_item_score[num]
  345. # dd["errmsgs"] = []
  346. # dd["item_id"] = count + index
  347. # if not dd["score"] and each_item_score2 and str(dd["item_id"]) in each_item_score2.keys():
  348. # dd["score"] = each_item_score2[str(dd["item_id"])]
  349. # if select_type_id and dd["item_id"] in select_type_id:
  350. # dd['is_optional'] = 'true'
  351. # res.append(dd)
  352. # # count += 1
  353. # else:
  354. # # ------在下一题【解析】在本题【答案】之间找到下一题【content】的位置--------
  355. # for id in range(len(subcon)):
  356. # if re.match(r"\n*\s*【解析】", subcon[id]) and id < len(subcon) - 2: # 不是最后一个解析,倒数第二个是最后一个解析
  357. # count += 1
  358. # ssub = subcon[id + 1].strip().split("\n") # 首尾空行先去掉
  359. # blank_line = [i for i, v in enumerate(ssub) if v.strip() == ""] # 空格索引
  360. # # 索引to题号字典
  361. # con_id_line_dict = {i: re.match(r"([1-9]|[1-6][0-9])\s*[..、、]", v.strip()).group(1)
  362. # for i, v in enumerate(ssub)
  363. # if re.match(r"([1-9]|[1-6][0-9])\s*[..、、]", v.strip())}
  364. # # print("con_id_line_dict",con_id_line_dict)
  365. # con_id_line = list(con_id_line_dict.keys()) # 行索引,第几行
  366. # topicno = list(con_id_line_dict.values()) # 题号序列
  367. # topicno_line_idx = dict(zip(topicno, con_id_line)) # 题号to行索引字典
  368. # if len(con_id_line) != len(topicno_line_idx):
  369. # return all_type[num] + "第" + str(count) + "道题(在整篇文档中为第" + str(
  370. # index + count) + "题)的题文和上一题的解析之间出现【多个相同的题目序号】,请重新确认!"
  371. # else:
  372. # if len(blank_line) == 1 and len(con_id_line) == 1: # 一般情况只有一个空行
  373. # if con_id_line[0] > blank_line[0]:
  374. # ssub.insert(con_id_line[0], "【content】")
  375. # else:
  376. # if str(count + index) == topicno[0]: # 该题的序号正确,优先按序号拆
  377. # ssub.insert(con_id_line[0], "【content】")
  378. # else:
  379. # ssub[blank_line[0]] = "【content】" # 该题序号不对时再考虑空行
  380. # elif len(blank_line) != 1:
  381. # if len(con_id_line) >= 1: # 优先考虑题目序号,多个序号时
  382. # # ssub.insert(con_id_line[-1], "【content】") # 默认最后一个,很粗糙
  383. # if str(count + index) in topicno:
  384. # ssub.insert(topicno_line_idx[str(count + index)], "【content】")
  385. # else:
  386. # return all_type[num] + "第" + str(count) + "道题(在整篇文档中为第" + str(
  387. # index + count) + "题)的题文和上一题的解析之间出现【题目序号不连续】,请检查该题目序号并重新手输!"
  388. # elif len(blank_line) > 1: # 题目序号有误,多个空行时
  389. # # ssub[blank_line[-1]] = "【content】"
  390. # return all_type[num] + "第" + str(count) + "道题(在整篇文档中为第" + str(
  391. # index + count) + "题)的题文和上一题的解析之间出现【题目序号有误】,请将题目序号重新手输!"
  392. # else: # 无序号,无空行
  393. # return all_type[num] + "第" + str(count) + "道题(在整篇文档中为第" + str(
  394. # index + count) + "题)的题文和上一题的解析之间出现【题目序号或空行都有误】,请将题目序号重新手输并查看空行!"
  395. # # 如果存在空行有误,且题目序号有误时,那基本就会拆分错误
  396. # else: # len(con_id_line)!=1
  397. # if not con_id_line: # 一个空行,没有序号时
  398. # # ssub[blank_line[0]] = "【content】"
  399. # return all_type[num] + "第" + str(count) + "道题(在整篇文档中为第" + str(
  400. # index + count) + "题)的题文和上一题的解析之间出现【题目序号有误】,请将题目序号重新手输!"
  401. # else: # 1个空行,多个序号时
  402. # print(all_type[num], "第", count, "道题的题文和上一题的解析之间存在【多个题目序号】")
  403. # if str(count + index) in topicno:
  404. # ssub.insert(topicno_line_idx[str(count + index)], "【content】")
  405. # else:
  406. # return all_type[num] + "第" + str(count) + "道题(在整篇文档中为第" + str(
  407. # index + count) + "题)的题文和上一题的解析之间出现【题目序号不连续】,请检查该题目序号并重新手输!"
  408. # # ssub.insert(con_id_line[-1], "【content】") # 须优化
  409. # subcon[id + 1] = "\n".join(ssub)
  410. # # ----------------------------------------------------------------
  411. # all_item = re.split(r"【content】", "\n".join(subcon).strip())
  412. # for idk, one_item in enumerate(all_item):
  413. # dd = dict(zip(["content", "answer", "parse"],
  414. # re.split(r"(?<=\n)\s*【答案】\n?|(?<=\n)\s*【解析】\n?",
  415. # table_label_cleal(one_item))))
  416. # dd["item_topic_name"] = all_type[num]
  417. # dd["content"] = re.sub(r"\d+\s*[..、、]", "", dd["content"][:5]) + dd["content"][5:]
  418. # dd["score"] = each_item_score[num]
  419. # dd["errmsgs"] = []
  420. # dd["item_id"] = idk + 1 + index
  421. # if choice_class:
  422. # for k, v in choice_class.items():
  423. # if count + index in v:
  424. # dd["item_topic_name"] = k + "选题"
  425. # elif len(choice_class) == 1:
  426. # dd["item_topic_name"] = "多选题" if k == "单" else "单选题"
  427. # if not dd["score"] and each_item_score2 and str(dd["item_id"]) in each_item_score2.keys():
  428. # dd["score"] = each_item_score2[str(dd["item_id"])]
  429. # if select_type_id and dd["item_id"] in select_type_id:
  430. # dd['is_optional'] = 'true'
  431. # res.append(dd)
  432. # # pprint(res)
  433. # # print('------------------')
  434. # else:
  435. # # print("【答案】或【解析】格式有误")
  436. # return "第" + str(num + 1) + "大题《" + all_type[num] + "》中【答案】或【解析】格式有误或其中某道题中出现多个相同关键字或漏关键字"
  437. # index += count
  438. # return res, item_no_type
  439. #
  440. # def only_parse_split(one_item_ans, item_type, reparse_n = 1):
  441. # """
  442. # 拆分出答案和解析
  443. # :one_item: 一道题的答案解析部分,
  444. # :return:{'answer': ,"parse": }
  445. # """
  446. # dd = {'parse': one_item_ans, 'answer': ""}
  447. # simp_item = re.sub("(【([解分][析答]|详解|点[评睛])】|答案|解析|详解)\s*[::]?", "", one_item_ans)
  448. # simp_item = re.sub("[^\u4e00-\u9fa5∵∴]", "", simp_item)
  449. # if len(simp_item) < 10 and re.search("因为?|因此|所以|根据|依据|若|假设", simp_item) is None:
  450. # dd['parse'] = ""
  451. #
  452. # if re.search(r"【(解析|解答|分析|详解|点评|点睛)】\n?|(解析|解答|分析|详解|点评|点睛)\s*[::]", one_item_ans):
  453. # dd1 = dict(zip(["answer", "parse_title", "parse"],
  454. # re.split(r"【(解析|解答|分析|详解|点评|点睛)】\n?", one_item_ans, maxsplit=1)))
  455. # dd["parse"] = "【" + dd1["parse_title"] + "】" + dd1["parse"]
  456. # del dd1["parse_title"]
  457. #
  458. # if item_type in ["单选题", "多选题", "选择题", "单项选择", "多项选择"]:
  459. # ans = re.search(r'故选\s*[::]\s*<img src=[^>]+?data-latex="([A-Z;;和与、、\s]+)".+?/>|故选\s*[::]?\s*([A-Z;;和与、、\s]+)',
  460. # dd["parse"].replace("$", ""))
  461. # if ans:
  462. # dd["answer"] = ans.group(1) if ans.group(1) is not None else ans.group(2)
  463. # elif not dd['answer']:
  464. # dd['answer'] = one_item_ans.strip()
  465. # dd['answer'] = re.sub("[.;;.]", "", dd['answer'])
  466. # else:
  467. # ans1 = re.search(r'故\s*[::]?\s*(答案分?别?[为是]?|填)\s*[::]?\s*(.+?)[..]\s*(\n|$)', dd["parse"])
  468. # ans2 = re.search(r'故\s*[::]?\s*(答案分?别?[为是]?|填)\s*[::]?\s*(<img src=.+?/>)[..]?\s*(\n|$)', dd["parse"])
  469. # if reparse_n != 2 and "【答案】" not in one_item_ans and \
  470. # len(re.findall(r"[((]\d[))]|[\n::;;。】]([((](i{1,3}|[ⅰⅱⅲⅳⅠⅡⅢIV①②③④])[))]|[①②③④])",
  471. # one_item_ans.replace(" ", ""))) > 1:
  472. # dd["answer"] = "见解析"
  473. # elif ans1:
  474. # dd["answer"] = ans1.group(2)
  475. # elif ans2:
  476. # dd["answer"] = ans2.group(2)
  477. # elif not dd['parse']:
  478. # dd['answer'] = one_item_ans.strip()
  479. # else:
  480. # dd["answer"] = "见解析"
  481. #
  482. # return dd