topic_no.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345
  1. #!/usr/bin/env python
  2. # -*- coding:utf-8 -*-
  3. import re
  4. from utils.washutil import table_label_cleal
  5. from operator import itemgetter
  6. from itertools import groupby
  7. def find_seq_num(num_list):
  8. """
  9. 针对切分题号时切错的序号进行纠正,考虑序号是连续且正常的情况下
  10. 将连续的数字进行分组
  11. :param num_list:输入[3, 4, 8, 9, 12, 13, 14]
  12. :return: [[3, 4],[8, 9],[12, 13, 14]]
  13. """
  14. seq_ranges = []
  15. for k, g in groupby(enumerate(num_list), lambda x: x[0] - x[1]):
  16. group = (map(itemgetter(1), g))
  17. group = list(map(int, group))
  18. seq_ranges.append(group)
  19. return seq_ranges
  20. def judge_item_no_type(items_con):
  21. # 判断该份试卷的题号类型:(1)or 1、
  22. item_no_type = 1 # 题号目前有两种类型 1、 和 (1)
  23. all_con = table_label_cleal(items_con)
  24. item_no_info = [(m.start(), m.group(1)) for m in re.finditer(r'\n+\s*([1-9]|[1-4][0-9])\s*[..、、::]', all_con) if m]
  25. if len(item_no_info) <= 2:
  26. # item_no = [int(no) for no in re.findall(r'\n+\s*[((]\s*([1-9]|[1-4][0-9])\s*[))]\s*[..、、]?', all_con)]
  27. item_no_info = [(m.start(), m.group(1)) for m in
  28. re.finditer(r'\n+\s*[((]\s*([1-9]|[1-4][0-9])\s*[))]\s*[..、、::]?', all_con) if m]
  29. if len(item_no_info) <= 6:
  30. print("本份试卷题号有问题!")
  31. item_no_type = 1
  32. else:
  33. item_no_type = 2
  34. return all_con, item_no_info, item_no_type
  35. def judge_ans_no_type(items_ans, item_type_num):
  36. # 判断该份试卷的答案的题号类型:(1)or 1、
  37. # 这里没有细分
  38. ans_item_no = [] # 答案中的初步题号,主要用于题号纠错
  39. ans_item_no_type = 1
  40. for num, one_type in enumerate(items_ans):
  41. one_type = table_label_cleal("\n" + one_type)
  42. one_type_no = [int(no) for no in re.findall(r'\n+\s*([1-9]|[1-4][0-9])\s*[..、、::]', one_type)]
  43. if not one_type_no and "table" in one_type: # 有的答案放在表格中,也需先记后纠错
  44. if not ans_item_no:
  45. ans_item_no.extend(range(1, item_type_num[num][1] + 1)) # 表格中的答案默认从1开始!!!!
  46. else:
  47. ans_item_no.extend(range(ans_item_no[-1] + 1, len(ans_item_no) + item_type_num[num][1] + 1))
  48. ans_item_no.extend(one_type_no)
  49. if len(ans_item_no) <= 2: # 没考虑表格里存答案的情况
  50. ans_item_no = []
  51. for num, one_type in enumerate(items_ans):
  52. one_type = table_label_cleal("\n" + one_type)
  53. one_type_no = [int(no) for no in re.findall(r'\n+\s*\(([1-9]|[1-4][0-9])\)\s*[..、、::]?', one_type)]
  54. ans_item_no.extend(one_type_no)
  55. if len(ans_item_no) > 6:
  56. ans_item_no_type = 2
  57. return ans_item_no_type
  58. def pre_get_item_no(items_con, item_no_type, flag=0):
  59. """
  60. 根据题号类型初步获取题号及其位置信息
  61. :param items_con:
  62. :param item_no_type:
  63. :return:
  64. """
  65. item_no_info = [(m.start(), m.group(1)) for m in re.finditer(r'\n+\s*([1-9]|[1-4][0-9])\s*[..、、::]', items_con) if m] \
  66. if item_no_type == 1 else [(m.start(), m.group(1)) for m in
  67. re.finditer(r'\n+\s*[((]\s*([1-9]|[1-4][0-9])\s*[))]\s*[..、、::]?', items_con) if m]
  68. if flag:
  69. item_no_info = [(m.start(), m.group(1)) for m in re.finditer(
  70. r'\n+\s*([1-9]|[1-4][0-9])\s*[((]\s*1\s*[))]', items_con) if m] \
  71. if item_no_type == 1 else [(m.start(), m.group(1)) for m in
  72. re.finditer(r'\n+\s*[((]\s*([1-9]|[1-4][0-9])\s*[))]', items_con) if m]
  73. return item_no_info
  74. def del_exception_value(item_list, items_no_idx):
  75. """
  76. 去列表中的异常值,题目越多,越容易突出异常值
  77. :return:
  78. """
  79. import numpy as np
  80. max_v = max(item_list)
  81. arr_mean = np.mean(item_list) # 均值
  82. arr_var = np.var(item_list) # 方差
  83. while max_v > len(item_list)+4:
  84. item_list.remove(max_v)
  85. print(item_list)
  86. arr_mean = np.mean(item_list) # 去最大值后的均值
  87. arr_var = np.var(item_list) # 去最大值后的方差
  88. max_v = max(item_list)
  89. # print("均值与方差:",arr_mean,arr_var)
  90. if abs((item_list[-1] - item_list[0] + 1) - len(item_list)) <= 3:
  91. return item_list, items_no_idx
  92. else:
  93. exception_value = []
  94. for i in item_list:
  95. # print(abs((i - arr_mean) / arr_var), i)
  96. if(abs((i - arr_mean)/arr_var)) > 0.3:
  97. exception_value.append(i)
  98. if not exception_value:
  99. return item_list, items_no_idx
  100. else:
  101. right_seq = []
  102. new_no_idx = []
  103. for k, i in enumerate(item_list):
  104. if i not in exception_value:
  105. right_seq.append(i)
  106. new_no_idx.append(items_no_idx[k])
  107. if right_seq:
  108. return right_seq, new_no_idx
  109. return item_list, items_no_idx
  110. def get_right_no(items_no_info, flag=0, have_type=0, last_id=0):
  111. """
  112. 针对分错的题号进行纠正 ;;带解析的划分题目最好按关键字拆分!!!!
  113. 题号划分错误有:题号重复,题号遗漏,题号偏离很远的错误如88.等
  114. 无题型行时,con_list中每个元素代表每一行
  115. 有题型行时,con_list中每个元素代表每个题型中的所有题目
  116. items_no:初步找到的所有题号
  117. :return: con_list
  118. """
  119. # items_no = [1,2,3,4,5,6,7, 8, 9, 10, 11, 6, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
  120. items_no, items_no_idx = [], []
  121. if flag:
  122. items_no, items_no_idx = items_no_info[1], items_no_info[0]
  123. else:
  124. items_no = [int(i[1]) for i in items_no_info]
  125. items_no_idx = [i[0] for i in items_no_info]
  126. idx2no_dict = dict(zip(items_no_idx, items_no))
  127. seq_no = find_seq_num(items_no) # 找到连续的分组
  128. print("---items_no:", items_no)
  129. # print("items_no_idx:", items_no_idx)
  130. print("---seq_no:", seq_no)
  131. if len(seq_no) > 1: # 存在分断或分错的地方
  132. print('按题号切分的过程中,存在分断或分错的地方')
  133. # items_no, items_no_idx = del_exception_value(items_no, items_no_idx) # 主要去掉异常的大值
  134. # 方案:从前往后,统计相差序号为2个以内的序列,序列最长的一组作为正确题号
  135. all_seq_idx = []
  136. for i, no in enumerate(items_no):
  137. # print('-----------i:',i)
  138. seq_idx = [items_no_idx[i]]
  139. num = i
  140. items_no_1 = items_no[i+1:]
  141. while items_no_1:
  142. # print(items_no_1)
  143. # print("n0---------:",no)
  144. flag1 = 0; next_n = 0
  145. if no + 1 in items_no_1:
  146. next_n = items_no_1.index(no + 1)
  147. flag1 = 1
  148. no += 1
  149. elif no + 2 in items_no_1:
  150. next_n = items_no_1.index(no + 2)
  151. flag1 = 1
  152. no += 2
  153. if flag1:
  154. # print("next_n",next_n)
  155. num += next_n + 1
  156. if num<len(items_no_idx):
  157. seq_idx.append(items_no_idx[num])
  158. items_no_1 = items_no_1[next_n+1:]
  159. else:
  160. break
  161. # print(items_no_1)
  162. all_seq_idx.append(seq_idx)
  163. # print("all_seq_idx:",all_seq_idx)
  164. if have_type: # 是否含题型行
  165. all_seq_no = [[idx2no_dict[i] for i in k] for k in all_seq_idx]
  166. right_seq_idx = [k for k, no in enumerate(all_seq_no) if no and no[0] in [last_id+1, last_id+2]]
  167. if right_seq_idx:
  168. return all_seq_no[right_seq_idx[0]], all_seq_idx[right_seq_idx[0]]
  169. seq_len = [len(k) for k in all_seq_idx]
  170. max_seq_idx = all_seq_idx[seq_len.index(max(seq_len))]
  171. max_seq_no = [idx2no_dict[k] for k in max_seq_idx]
  172. print("get_right_no最后的题号:", max_seq_no)
  173. print("get_right_no最后的题号位置:", max_seq_idx)
  174. return max_seq_no, max_seq_idx
  175. return items_no, items_no_idx
  176. def get_consecutive_no(items_no):
  177. """
  178. 获取连续的题号
  179. :return:
  180. """
  181. seq_no = find_seq_num(items_no) # 找到连续的分组
  182. # print("seq_no:",seq_no)
  183. if len(seq_no) > 1: # 存在分断或分错的地方
  184. print('按题号切分的过程中,存在分断或分错的地方')
  185. # items_no, items_no_idx = del_exception_value(items_no, items_no_idx) # 主要去掉异常的大值
  186. # 方案:从前往后,统计相差序号为2个以内的序列,序列最长的一组作为正确题号
  187. all_seq_no = []
  188. for i, no in enumerate(items_no):
  189. # print('-----------i:',i)
  190. seq_no = [items_no[i]]
  191. num = i
  192. items_no_1 = items_no[i+1:]
  193. while items_no_1:
  194. # print(items_no_1)
  195. # print("n0---------:",no)
  196. flag1 = 0; next_n = 0
  197. if no + 1 in items_no_1:
  198. next_n = items_no_1.index(no + 1)
  199. flag1 = 1
  200. no += 1
  201. elif no + 2 in items_no_1:
  202. next_n = items_no_1.index(no + 2)
  203. flag1 = 1
  204. no += 2
  205. if flag1:
  206. # print("next_n",next_n)
  207. num += next_n + 1
  208. seq_no.append(items_no[num])
  209. items_no_1 = items_no_1[next_n+1:]
  210. else:
  211. break
  212. all_seq_no.append(seq_no)
  213. # print(all_seq_no)
  214. seq_len = [len(k) for k in all_seq_no]
  215. max_seq_no = all_seq_no[seq_len.index(max(seq_len))]
  216. print("get_consecutive_no最长连续的题号:", max_seq_no)
  217. return max_seq_no
  218. else:
  219. return items_no
  220. def get_many_ans_no(items_str, ans_item_no_type, reget=0):
  221. """
  222. 针对一行多个的答案,获取答案
  223. :param items:
  224. :param ans_item_no_type:
  225. :return:
  226. """
  227. ans_no1 = []
  228. # ans_no2 = []
  229. ans_no_idx1 = []
  230. # ans_no_idx2 = []
  231. if reget:
  232. # 这里在遇到前面没空格的两位数字的情况下分拿1位和拿2位,不太准,应该是各种组合
  233. # 最后选择取前面不是数字的的序号
  234. if ans_item_no_type == 1:
  235. for m in re.finditer(r'\n+\s*([1-9]|[1-4][0-9])\s*[..、、::]'
  236. r'|(?<![::..、、+\-*/((\[{])\s+([1-9]|[1-4][0-9])\s*[..、、::](?!png)'
  237. r'|(?<![::..、、+\-*/((\[{\d])([1-9]|[1-4][0-9])\s*[..、、::](?!png)', items_str):
  238. aa = m.groups()[0]
  239. if m.groups()[1]:
  240. aa = m.groups()[1]
  241. elif m.groups()[2]:
  242. aa = m.groups()[2]
  243. ans_no1.append(int(aa))
  244. ans_no_idx1.append(m.start())
  245. # 下面为第一次方案,不要
  246. # if m.groups()[0] or (m.groups()[1] and len(m.groups()[1])==1):
  247. # aa = m.groups()[0] if m.groups()[0] else m.groups()[1]
  248. # ans_no1.append(int(aa))
  249. # ans_no_idx1.append(m.start())
  250. # ans_no2.append(int(aa))
  251. # ans_no_idx2.append(m.start())
  252. # if m.groups()[1] and len(m.groups()[1])==2:
  253. # ans_no1.append(int(m.groups()[1][-1])) # 拿一个数
  254. # ans_no2.append(int(m.groups()[1])) # 拿2个数
  255. # ans_no_idx1.append(m.start()+1)
  256. # ans_no_idx2.append(m.start())
  257. else:
  258. for m in re.finditer(r'\n+\s*[((]\s*([1-9]|[1-4][0-9])\s*[))]\s*[..、、::]?'
  259. r'|(?<![::+\-、、*/])\s*[((]\s*([1-9]|[1-4][0-9])\s*[))]\s*[..、、::]?', items_str):
  260. aa = m.groups()[0] if m.groups()[0] else m.groups()[1]
  261. ans_no1.append(int(aa))
  262. ans_no_idx1.append(m.start())
  263. # if m.groups()[0] or (m.groups()[1] and len(m.groups()[1])==1):
  264. # aa = m.groups()[0] if m.groups()[0] else m.groups()[1]
  265. # ans_no1.append(int(aa))
  266. # ans_no_idx1.append(m.start())
  267. # ans_no2.append(int(aa))
  268. # ans_no_idx2.append(m.start())
  269. # if m.groups()[1] and len(m.groups()[1])==2:
  270. # ans_no1.append(int(m.groups()[1][-1]))
  271. # ans_no2.append(int(m.groups()[1]))
  272. # ans_no_idx1.append(m.start()+1)
  273. # ans_no_idx2.append(m.start())
  274. return ans_no1, ans_no_idx1
  275. else:
  276. # 序号前要求换行或空格
  277. if ans_item_no_type == 1:
  278. for m in re.finditer(r'\n+\s*([1-9]|[1-4][0-9])\s*[..、、::]'
  279. r'|(?<![::..、、+\-*/((\[{])\s+([1-9]|[1-4][0-9])\s*[..、、::](?!png)', items_str):
  280. aa = m.groups()[0] if m.groups()[0] else m.groups()[1]
  281. ans_no1.append(int(aa))
  282. ans_no_idx1.append(m.start())
  283. else:
  284. for m in re.finditer(r'\n+\s*[((]\s*([1-9]|[1-4][0-9])\s*[))]\s*[..、、::]?'
  285. r'|(?<![::+\-、、*/])\s+[((]\s*([1-9]|[1-4][0-9])\s*[))]\s*[..、、::]?', items_str):
  286. aa = m.groups()[0] if m.groups()[0] else m.groups()[1]
  287. ans_no1.append(int(aa))
  288. ans_no_idx1.append(m.start())
  289. return ans_no1, ans_no_idx1
  290. def get_seq_no(item_no, rest_item_split, flag=1):
  291. """
  292. 对于漏缺的题号进一步判断获取!!!
  293. :param item_no:
  294. :param rest_item_split:
  295. :param flag: 题号为18(1)类型
  296. :return:
  297. """
  298. seq_no = find_seq_num(item_no)
  299. if len(seq_no) > 0:
  300. for num, s in enumerate(seq_no[1:]):
  301. last_idx = item_no.index(seq_no[num][-1])
  302. last_item = rest_item_split[last_idx]
  303. for que_no in list(range(seq_no[num][-1] + 1, s[0])):
  304. new_split = re.split("\s+" + str(que_no) + "\s*[((]\s*1\s*[))]",
  305. last_item, maxsplit=1)
  306. if len(new_split) == 2:
  307. rest_item_split[last_idx] = new_split[0]
  308. rest_item_split.insert(last_idx + 1, "(1)"+new_split[1])
  309. item_no.insert(item_no.index(s[0]), que_no)
  310. last_idx += 1
  311. last_item = rest_item_split[last_idx]
  312. return item_no, rest_item_split
  313. if __name__ == '__main__':
  314. ans_no4=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20]
  315. nos = get_consecutive_no(ans_no4)
  316. print(nos)