stems_structure.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253
  1. #!/usr/bin/env python
  2. # -*- coding:utf-8 -*-
  3. import re
  4. from utils.item_type_line import get_item_head_info
  5. from utils.topic_no import judge_item_no_type, get_right_no, pre_get_item_no
  6. def stems_structure_byno(stem_con):
  7. """
  8. 按题号进行切分;
  9. 针对无解析的试卷中所有题目的拆分;
  10. :return:{"stem": , "item_id": , "errmsgs": [],"type":,}
  11. """
  12. try:
  13. while stem_con and ((re.search(r"[\u4e00-\u9fa5]", stem_con[0]) is None) or
  14. (re.search(r"[一二三四]\s*[、..、]\s*[^必考基础综合中等]{2,4}题", stem_con[0]) is None
  15. and re.search(r"(^|\n)\s*[1-9][0-9]?\s*[..、、]((?!(答题卡|涂黑|本卷|2B铅笔|签字笔|密封线)).)*?$",
  16. stem_con[0]) is None)):
  17. del stem_con[0]
  18. except:
  19. return "本份试卷开头格式有问题,请按试卷格式来!"
  20. stem_str = "\n" + "\n".join(stem_con)
  21. # 题号格式有条件清洗
  22. # def sub1(ss):
  23. # if int(ss.group(5)) - int(ss.group(2)) in [1, 2]:
  24. # return ss.group(1)+"\n"+ss.group(5)+"、"+ss.group(6)
  25. # else:
  26. # return ss.group(0)
  27. # stem_str = re.sub(r"(\n\s*([1-4][0-9]|[1-9])\s*[..、、]((?!\n\s*([1-4][0-9]|[1-9])\s*[..、、]).)+?)\n+\s*([1-4][0-9]|[1-9])\s*(?![..、、])([^\s\d]+)", sub1, stem_str, flags=re.S)
  28. # 也可以用下面方法,但比较啰嗦
  29. while re.search(r"(\n\s*([1-9][0-9]|[1-9])\s*[..、、]((?!\n\s*([1-9][0-9]|[1-9])\s*[..、、]).)+?)"
  30. r"\n+\s*([1-9][0-9]|[1-9])\s*(?![..、、])([^\s\d]+)", stem_str, re.S):
  31. wrong_id_info = re.search(r"(\n\s*([1-9][0-9]|[1-9])\s*[..、、]((?!\n\s*([1-9][0-9]|[1-9])\s*[..、、]).)+?)"
  32. r"\n+\s*([1-9][0-9]|[1-9])\s*(?![..、、])([^\s\d]+)", stem_str, re.S)
  33. stem_str = stem_str.replace(wrong_id_info.group(0),
  34. wrong_id_info.group(1)+"\n"+wrong_id_info.group(5)+"、"+wrong_id_info.group(6))
  35. # print(stem_str)
  36. # 1、获取题型行信息、按题型行切分
  37. con11, title_info_dict, choice_class = get_item_head_info(stem_str)
  38. all_type = title_info_dict["all_type"]
  39. title_type_num = title_info_dict["title_type_num"]
  40. select_type_id = title_info_dict["select_type_id"]
  41. each_item_score, each_item_score2 = title_info_dict["each_item_score"], title_info_dict["each_item_score2"]
  42. # -------------------------------------------------------
  43. res = []
  44. item_type_classify = {} # 记录每类题型中含有的题目个数,含合并的情况
  45. item_type_num = [] # 题型不合并
  46. pic_no = {} # 记录每个题的图
  47. new_item_no = [] # 将纠错后的题号再记录一份
  48. if not all_type:
  49. print("不存在大题题型行或题型行格式或名称有问题")
  50. # 初步获取题号,题号类型
  51. stem_str, item_no_info, item_no_type = judge_item_no_type(stem_str)
  52. # 获取正确题号的位置,进行切分
  53. new_item_no, items_no_idx = get_right_no(item_no_info)
  54. one_item_split = [stem_str[i:j] for i, j in zip(items_no_idx, items_no_idx[1:] + [None])]
  55. dd = {}
  56. for n, one_item in enumerate(one_item_split):
  57. dd["stem"] = one_item
  58. dd["item_id"] = new_item_no[n] # 题目本身的题号
  59. dd["type"] = "" # 先题型不备注,根据答案再看
  60. dd["errmsgs"] = []
  61. dd["score"] = 0
  62. # if select_type_id and dd["item_id"] in select_type_id:
  63. # dd['is_optional'] = 'true'
  64. res.append(dd)
  65. dd = {}
  66. # 先不做拆图处理了
  67. else:
  68. # print(all_type, len(con11))
  69. if len(all_type) == len(con11)-1: # 第一部分的题型行掉了
  70. all_type.insert(0, "")
  71. each_item_score.insert(0, 0) # 按题型行拿的分数也会掉,先补上默认的0
  72. elif len(all_type) != len(con11):
  73. print("第二种试卷格式:存在题型行没有换行") # 可能造成题目和题型行在同一行
  74. # return "存在题型行末尾没有换行或题型行中题型不明确"
  75. # else:
  76. # # print(all_type)
  77. # if "非选择题" in all_type:
  78. # error_info1 = "第" + str(all_type.index("非选择题")+1) + "大题的题型不明确"
  79. # if any([True for one_type in all_type if re.search("必考基础综合中等面列下各", one_type)]):
  80. # error_info1 = "存在题型行中题型不明确"
  81. # ---------------------------------------------------------------------
  82. # 思路:>>>>先纠正题号,再拆分题目;等切分好后再纠正(删减添加操作)比较费时
  83. # 按题号切分,可再加些细节!!!! 1>>题号不要求连续
  84. # 初步判断题号类型
  85. stem_str, item_no_info, item_no_type = judge_item_no_type(stem_str)
  86. # >>>>切分题目
  87. for num, one_type in enumerate(con11):
  88. # 初步获取题号
  89. item_no_info = pre_get_item_no("\n"+one_type, item_no_type)
  90. # 获取正确题号的位置,进行切分
  91. if res:
  92. items_no_temp, items_no_idx = get_right_no(item_no_info, have_type=1, last_id=res[-1]["item_id"])
  93. else:
  94. items_no_temp, items_no_idx = get_right_no(item_no_info)
  95. is_from_0 = 1
  96. if items_no_temp and items_no_idx[0] != 0 and items_no_temp[0] > 1: # 以防出现题号漏了的情况
  97. items_no_idx.insert(0, 0)
  98. is_from_0 = 0
  99. one_item_split = [("\n" + one_type)[i:j] for i, j in zip(items_no_idx, items_no_idx[1:] + [None])]
  100. # ------------------每个大题的第一题前面的图可能有漏的情况------------------------
  101. may_oimt_pic = []
  102. if not is_from_0:
  103. may_oimt_pic_info = re.search("\n(<imgsrc.+?/>(\s*<imgsrc((?!/>).)+?/>)*?)\s*\n?$", one_item_split[0])
  104. if may_oimt_pic_info:
  105. if len(one_item_split) > 1 and re.search("如[上下左右]?图", one_item_split[1]):
  106. may_oimt_pic.extend(re.findall("<imgsrc.+?/>", may_oimt_pic_info.group(1)))
  107. # print("may_oimt_pic:", may_oimt_pic)
  108. # ------------------------------------------------------------------
  109. if len(all_type) == len(con11):
  110. if not is_from_0:
  111. if all_type[num] and all_type[num].replace("题", "") not in ['选择', '单选', '多选', '不定选择']:
  112. one_item_split = one_item_split[1:]
  113. else: # 针对选择题第1题或前面几题题号漏了的情况
  114. one_item_split1 = one_item_split[1:]
  115. # ------针对分错的细节继续拆分,如上一题选项行与下一题题干没有换行-----------------
  116. new_one_item_split = []
  117. pattern_1 = re.compile(
  118. r"([CDE]\s*[..、、].+?)(?<![::])\s([1-9]|1[0-9])\s*[..、、](.+?([是为有]|等于)[((]\s*[))]\n)")
  119. pattern_2 = re.compile(
  120. r"(([CDE]\s*[..、、]|\([CDE]\)).+?)(?<![::])\s\(([1-9]|1[0-9])\)(.+?([是为有]|等于)[((]\s*[))]\n)")
  121. for nn, one_item in enumerate(one_item_split1):
  122. if item_no_type == 1 and re.search(pattern_1, one_item):
  123. err_optcon = re.sub(pattern_1, r"\1【】\3", one_item) # 太粗糙了
  124. new_one_item_split.extend(err_optcon.split("【】"))
  125. items_no_temp.insert(nn+1, int(re.search(pattern_1, one_item).group(2)))
  126. elif item_no_type == 2 and re.search(pattern_2, one_item):
  127. err_optcon = re.sub(pattern_2, r"\1【】\4", one_item)
  128. new_one_item_split.extend(err_optcon.split("【】"))
  129. items_no_temp.insert(nn + 1, int(re.search(pattern_2, one_item).group(3)))
  130. else:
  131. new_one_item_split.append(one_item)
  132. if re.search(r'(^|\n)+\s*[A-Z]\s*[..、、]|(^|\n)+\s*[((]\s*[A-Z]\s*[))]\s*[..、、]?',
  133. str(one_item_split[0]).strip(), re.S) or \
  134. re.search("如[上下左右]图|[((]\s+[))]\s*($|\n)", one_item_split[0].strip()):
  135. new_one_item_split.insert(0, one_item_split[0])
  136. items_no_temp.insert(0, items_no_temp[0]-1)
  137. one_item_split = new_one_item_split
  138. # ----------------------------------------------------------
  139. if all_type[num] in item_type_classify: # 统计每类题型含有的题目个数
  140. item_type_classify[all_type[num]] += len(one_item_split)
  141. elif all_type[num]:
  142. item_type_classify[all_type[num]] = len(one_item_split)
  143. item_type_num.append((all_type[num], len(one_item_split)))
  144. else:
  145. if not is_from_0:
  146. one_item_split = one_item_split[1:]
  147. new_item_no.extend(items_no_temp)
  148. # ---------------------------------------------------------------
  149. # 从题型行中判断单选和多选 放在这里收集也可以:可可能出现两不同题型行提到的序号一样
  150. # choice_class = {}
  151. # if all_type[num] == "选择题":
  152. # multi_choice_info = re.findall("[\s,,;;((]+第?(\d+)[至到\-~]+(\d+)题[是为]([多单])项?选择?题?",
  153. # all_type_info[num][2])
  154. # if multi_choice_info:
  155. # for mu in multi_choice_info:
  156. # choice_class[mu[2]] = list(range(int(mu[0]), int(mu[1]) + 1))
  157. # ---------------------------------------------------------------
  158. dd = {}
  159. for nn, one_item in enumerate(one_item_split):
  160. dd["stem"] = one_item
  161. #------------------对每个大题的第一题加上may_oimt_pic--------------------------
  162. if nn==0 and may_oimt_pic:
  163. dd['susp_pic'] = may_oimt_pic
  164. # ------------------------------------------------------------------------
  165. dd["item_id"] = items_no_temp[nn]
  166. dd["score"] = each_item_score[num]
  167. if not dd["score"] and each_item_score2 and str(dd["item_id"]) in each_item_score2.keys():
  168. dd["score"] = each_item_score2[str(dd["item_id"])]
  169. dd["errmsgs"] = []
  170. if all_type[num] and re.search("必考基础综合中等面列下各非", all_type[num]) is None:
  171. dd["type"] = all_type[num] if re.sub('[((]', "", all_type[num]) != '本大题' else "解答题"
  172. else:
  173. dd["type"] = ""
  174. if choice_class:
  175. for k, v in choice_class.items():
  176. if dd["item_id"] in v:
  177. dd["type"] = k + "选题"
  178. # elif len(choice_class) == 1:
  179. # dd["type"] = "多选题" if k == "单" else "单选题"
  180. if select_type_id and dd["item_id"] in select_type_id:
  181. dd['is_optional'] = 'true'
  182. if len(one_item_split) == 1 and dd["score"] == 0.0 and title_info_dict["total_score"][num] > 0.0:
  183. dd["score"] = title_info_dict["total_score"][num]
  184. res.append(dd)
  185. # 多图在一起的情况进行拆分
  186. # if len(re.findall(r"第?\(?([1-9]|[1-4][0-9])\)?\s*题图", one_item)) > 1 and \
  187. # re.search(r"<imgsrc\d.+?\n+\s*第?\(?([1-9]|[1-4][0-9])\)?\s*题图", one_item, re.S):
  188. pic_info = re.search(r"(<imgsrc\d.+?)\n+\s*((第?\(?([1-9]|[1-9][0-9])\)?\s*题图.?\s*){2,})", one_item, re.S)
  189. if pic_info:
  190. pic_list = re.findall(r"<imgsrc\d.*?/>", pic_info.group(1))
  191. pic_w = re.findall("([1-9]|[1-9][0-9])[))]?\s*(?=题)", pic_info.group(2))
  192. if len(pic_list) >= len(pic_w):
  193. pic_no = {int(p): pic_list[len(pic_list) - len(pic_w) + k] for k, p in enumerate(pic_w)}
  194. dd["stem"] = dd["stem"].replace(pic_info.group(2), "")
  195. for k, pic in enumerate(pic_list[::-1]):
  196. if k < len(pic_w):
  197. dd["stem"] = dd["stem"].replace(pic, "")
  198. # -----------------------------------------------------
  199. dd = {}
  200. if pic_no:
  201. for i in list(pic_no.keys()):
  202. res[i-1]["stem"] = res[i-1]["stem"].strip() + "\n" + pic_no[i] + "\n" + "第" + str(i) + "题图"
  203. # ----------------------------------------------------------------------
  204. # 可能出现选择题类的选做题
  205. # --------最后判断一下题量是否正确-----------------------------------------
  206. # 针对拆分后题量特别多的情况,将拆分后题量与已知题量一样的题目保留
  207. new_res = []
  208. right_type = [] # 记录与已知题目个数一样的分块{第几个分块,题型}
  209. if all_type and sum(list(item_type_classify.values())) > 40: # 很可能存在大片不是题号的题号
  210. title_type_num_all = sum([t[1] for t in title_type_num]) # 题型行给出的题目个数
  211. if title_type_num_all > 0:
  212. for idx, type_num in enumerate(item_type_num):
  213. if type_num[1] == title_type_num[idx][1] > 0: # 同一题型是否题量一致
  214. right_type.append((idx, type_num[0]))
  215. # elif title_type_num[idx][1] == 0 and type_num[1] == 1:
  216. # title_type_num[idx][1] = 1
  217. # right_type.append((idx, type_num[0]))
  218. elif title_type_num[idx][1] == 0 and type_num[1] >= 1: # 2023.6.26
  219. title_type_num[idx][1] = type_num[1]
  220. right_type.append((idx, type_num[0]))
  221. print("right_type::item_type_num", right_type, item_type_num)
  222. if right_type:
  223. for rtype in right_type:
  224. for idx, item in enumerate(res):
  225. r_st = 0 if rtype[0]==0 else sum([t[1] for t in item_type_num[:rtype[0]]])
  226. r_ed = r_st + item_type_num[rtype[0]][1]
  227. if item["type"] == rtype[1] and r_st<=idx<r_ed:
  228. new_res.append(item)
  229. res = new_res
  230. item_type_num = title_type_num # 将题目已知题型数量信息作为正确信息
  231. new_item_type_classify = {}
  232. for i in right_type:
  233. if title_type_num[i[0]][0] not in new_item_type_classify:
  234. new_item_type_classify[title_type_num[i[0]][0]] = title_type_num[i[0]][1]
  235. else:
  236. new_item_type_classify[title_type_num[i[0]][0]] += title_type_num[i[0]][1]
  237. item_type_classify = new_item_type_classify
  238. return res, all_type, item_type_classify, item_no_type, item_type_num, new_item_no