#!/usr/bin/env python
# -*- coding:utf-8 -*-
# topic_no.py
import re
from itertools import groupby
from operator import itemgetter

from utils.washutil import table_label_cleal
  7. def find_seq_num(num_list):
  8. """
  9. 针对切分题号时切错的序号进行纠正,考虑序号是连续且正常的情况下
  10. 将连续的数字进行分组
  11. :param num_list:输入[3, 4, 8, 9, 12, 13, 14]
  12. :return: [[3, 4],[8, 9],[12, 13, 14]]
  13. """
  14. seq_ranges = []
  15. for k, g in groupby(enumerate(num_list), lambda x: x[0] - x[1]):
  16. group = (map(itemgetter(1), g))
  17. group = list(map(int, group))
  18. seq_ranges.append(group)
  19. return seq_ranges
  20. def judge_item_no_type(items_con):
  21. # 判断该份试卷的题号类型:(1)or 1、
  22. item_no_type = 1 # 题号目前有两种类型 1、 和 (1)
  23. all_con = table_label_cleal(items_con)
  24. item_no_info = [(m.start(), m.group(1)) for m in re.finditer(r'\n+\s*([1-9]|[1-9][0-9])\s*[..、、::]', all_con) if m]
  25. if len(item_no_info) <= 2:
  26. # item_no = [int(no) for no in re.findall(r'\n+\s*[((]\s*([1-9]|[1-9][0-9])\s*[))]\s*[..、、]?', all_con)]
  27. item_no_info2 = [(m.start(), m.group(1)) for m in
  28. re.finditer(r'\n+\s*[((]\s*([1-9]|[1-9][0-9])\s*[))]\s*[..、、::]?', all_con) if m]
  29. if len(item_no_info) <= 6:
  30. print("本份试卷题号有问题!")
  31. item_no_type = 1
  32. else:
  33. item_no_info = item_no_info2
  34. item_no_type = 2
  35. return all_con, item_no_info, item_no_type
  36. def judge_ans_no_type(items_ans, item_type_num):
  37. # 判断该份试卷的答案的题号类型:(1)or 1、
  38. # 这里没有细分
  39. ans_item_no = [] # 答案中的初步题号,主要用于题号纠错
  40. ans_item_no_type = 1
  41. for num, one_type in enumerate(items_ans):
  42. one_type = table_label_cleal("\n" + one_type)
  43. one_type_no = [int(no) for no in re.findall(r'\n+\s*([1-9]|[1-9][0-9])\s*[..、、::]', one_type)]
  44. if not one_type_no and "table" in one_type: # 有的答案放在表格中,也需先记后纠错
  45. if not ans_item_no:
  46. ans_item_no.extend(range(1, item_type_num[num][1] + 1)) # 表格中的答案默认从1开始!!!!
  47. else:
  48. ans_item_no.extend(range(ans_item_no[-1] + 1, len(ans_item_no) + item_type_num[num][1] + 1))
  49. ans_item_no.extend(one_type_no)
  50. if len(ans_item_no) <= 2: # 没考虑表格里存答案的情况
  51. ans_item_no = []
  52. for num, one_type in enumerate(items_ans):
  53. one_type = table_label_cleal("\n" + one_type)
  54. one_type_no = [int(no) for no in re.findall(r'\n+\s*\(([1-9]|[1-9][0-9])\)\s*[..、、::]?', one_type)]
  55. ans_item_no.extend(one_type_no)
  56. if len(ans_item_no) > 6:
  57. ans_item_no_type = 2
  58. return ans_item_no_type
  59. def pre_get_item_no(items_con, item_no_type, flag=0):
  60. """
  61. 根据题号类型初步获取题号及其位置信息
  62. :param items_con:
  63. :param item_no_type:
  64. :return:
  65. """
  66. item_no_info = [(m.start(), m.group(1)) for m in re.finditer(r'\n+\s*([1-9]|[1-9][0-9])\s*[..、、::]', items_con) if m] \
  67. if item_no_type == 1 else [(m.start(), m.group(1)) for m in
  68. re.finditer(r'\n+\s*[((]\s*([1-9]|[1-9][0-9])\s*[))]\s*[..、、::]?', items_con) if m]
  69. if flag:
  70. item_no_info = [(m.start(), m.group(1)) for m in re.finditer(
  71. r'\n+\s*([1-9]|[1-9][0-9])\s*[((]\s*1\s*[))]', items_con) if m] \
  72. if item_no_type == 1 else [(m.start(), m.group(1)) for m in
  73. re.finditer(r'\n+\s*[((]\s*([1-9]|[1-9][0-9])\s*[))]', items_con) if m]
  74. return item_no_info
  75. def del_exception_value(item_list, items_no_idx):
  76. """
  77. 去列表中的异常值,题目越多,越容易突出异常值
  78. :return:
  79. """
  80. import numpy as np
  81. max_v = max(item_list)
  82. arr_mean = np.mean(item_list) # 均值
  83. arr_var = np.var(item_list) # 方差
  84. while max_v > len(item_list)+4:
  85. item_list.remove(max_v)
  86. print(item_list)
  87. arr_mean = np.mean(item_list) # 去最大值后的均值
  88. arr_var = np.var(item_list) # 去最大值后的方差
  89. max_v = max(item_list)
  90. # print("均值与方差:",arr_mean,arr_var)
  91. if abs((item_list[-1] - item_list[0] + 1) - len(item_list)) <= 3:
  92. return item_list, items_no_idx
  93. else:
  94. exception_value = []
  95. for i in item_list:
  96. # print(abs((i - arr_mean) / arr_var), i)
  97. if(abs((i - arr_mean)/arr_var)) > 0.3:
  98. exception_value.append(i)
  99. if not exception_value:
  100. return item_list, items_no_idx
  101. else:
  102. right_seq = []
  103. new_no_idx = []
  104. for k, i in enumerate(item_list):
  105. if i not in exception_value:
  106. right_seq.append(i)
  107. new_no_idx.append(items_no_idx[k])
  108. if right_seq:
  109. return right_seq, new_no_idx
  110. return item_list, items_no_idx
  111. def get_right_no(items_no_info, flag=0, have_type=0, last_id=0):
  112. """
  113. 针对分错的题号进行纠正 ;;带解析的划分题目最好按关键字拆分!!!!
  114. 题号划分错误有:题号重复,题号遗漏,题号偏离很远的错误如88.等
  115. 无题型行时,con_list中每个元素代表每一行
  116. 有题型行时,con_list中每个元素代表每个题型中的所有题目
  117. items_no:初步找到的所有题号
  118. :return: con_list
  119. """
  120. # items_no = [1,2,3,4,5,6,7, 8, 9, 10, 11, 6, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
  121. items_no, items_no_idx = [], []
  122. if flag:
  123. items_no, items_no_idx = items_no_info[1], items_no_info[0]
  124. else:
  125. items_no = [int(i[1]) for i in items_no_info]
  126. items_no_idx = [i[0] for i in items_no_info]
  127. idx2no_dict = dict(zip(items_no_idx, items_no))
  128. seq_no = find_seq_num(items_no) # 找到连续的分组
  129. print("---items_no:", items_no)
  130. # print("items_no_idx:", items_no_idx)
  131. print("---seq_no:", seq_no)
  132. if len(seq_no) > 1: # 存在分断或分错的地方
  133. print('按题号切分的过程中,存在分断或分错的地方')
  134. # items_no, items_no_idx = del_exception_value(items_no, items_no_idx) # 主要去掉异常的大值
  135. # 方案:从前往后,统计相差序号为2个以内的序列,序列最长的一组作为正确题号
  136. all_seq_idx = []
  137. for i, no in enumerate(items_no):
  138. # print('-----------i:',i)
  139. seq_idx = [items_no_idx[i]]
  140. num = i
  141. items_no_1 = items_no[i+1:]
  142. while items_no_1:
  143. # print(items_no_1)
  144. # print("n0---------:",no)
  145. flag1 = 0; next_n = 0
  146. if no + 1 in items_no_1:
  147. next_n = items_no_1.index(no + 1)
  148. flag1 = 1
  149. no += 1
  150. elif no + 2 in items_no_1:
  151. next_n = items_no_1.index(no + 2)
  152. flag1 = 1
  153. no += 2
  154. if flag1:
  155. # print("next_n",next_n)
  156. num += next_n + 1
  157. if num<len(items_no_idx):
  158. seq_idx.append(items_no_idx[num])
  159. items_no_1 = items_no_1[next_n+1:]
  160. else:
  161. break
  162. # print(items_no_1)
  163. all_seq_idx.append(seq_idx)
  164. # print("all_seq_idx:",all_seq_idx)
  165. if have_type: # 是否含题型行
  166. all_seq_no = [[idx2no_dict[i] for i in k] for k in all_seq_idx]
  167. right_seq_idx = [k for k, no in enumerate(all_seq_no) if no and no[0] in [last_id+1, last_id+2]]
  168. if right_seq_idx:
  169. return all_seq_no[right_seq_idx[0]], all_seq_idx[right_seq_idx[0]]
  170. seq_len = [len(k) for k in all_seq_idx]
  171. max_seq_idx = all_seq_idx[seq_len.index(max(seq_len))]
  172. max_seq_no = [idx2no_dict[k] for k in max_seq_idx]
  173. print("get_right_no最后的题号:", max_seq_no)
  174. print("get_right_no最后的题号位置:", max_seq_idx)
  175. return max_seq_no, max_seq_idx
  176. return items_no, items_no_idx
  177. def get_consecutive_no(items_no):
  178. """
  179. 获取连续的题号
  180. :return:
  181. """
  182. seq_no = find_seq_num(items_no) # 找到连续的分组
  183. # print("seq_no:",seq_no)
  184. if len(seq_no) > 1: # 存在分断或分错的地方
  185. print('按题号切分的过程中,存在分断或分错的地方')
  186. # items_no, items_no_idx = del_exception_value(items_no, items_no_idx) # 主要去掉异常的大值
  187. # 方案:从前往后,统计相差序号为2个以内的序列,序列最长的一组作为正确题号
  188. all_seq_no = []
  189. for i, no in enumerate(items_no):
  190. # print('-----------i:',i)
  191. seq_no = [items_no[i]]
  192. num = i
  193. items_no_1 = items_no[i+1:]
  194. while items_no_1:
  195. # print(items_no_1)
  196. # print("n0---------:",no)
  197. flag1 = 0; next_n = 0
  198. if no + 1 in items_no_1:
  199. next_n = items_no_1.index(no + 1)
  200. flag1 = 1
  201. no += 1
  202. elif no + 2 in items_no_1:
  203. next_n = items_no_1.index(no + 2)
  204. flag1 = 1
  205. no += 2
  206. if flag1:
  207. # print("next_n",next_n)
  208. num += next_n + 1
  209. seq_no.append(items_no[num])
  210. items_no_1 = items_no_1[next_n+1:]
  211. else:
  212. break
  213. all_seq_no.append(seq_no)
  214. # print(all_seq_no)
  215. seq_len = [len(k) for k in all_seq_no]
  216. max_seq_no = all_seq_no[seq_len.index(max(seq_len))]
  217. print("get_consecutive_no最长连续的题号:", max_seq_no)
  218. return max_seq_no
  219. else:
  220. return items_no
  221. def get_many_ans_no(items_str, ans_item_no_type, reget=0):
  222. """
  223. 针对一行多个的答案,获取答案
  224. :param items:
  225. :param ans_item_no_type:
  226. :return:
  227. """
  228. ans_no1 = []
  229. # ans_no2 = []
  230. ans_no_idx1 = []
  231. # ans_no_idx2 = []
  232. if reget:
  233. # 这里在遇到前面没空格的两位数字的情况下分拿1位和拿2位,不太准,应该是各种组合
  234. # 最后选择取前面不是数字的的序号
  235. if ans_item_no_type == 1:
  236. for m in re.finditer(r'\n+\s*([1-9]|[1-9][0-9])\s*[..、、::]'
  237. r'|(?<![::..、、+\-*/((\[{])\s+([1-9]|[1-9][0-9])\s*[..、、::](?!png)'
  238. r'|(?<![::..、、+\-*/((\[{\d])([1-9]|[1-9][0-9])\s*[..、、::](?!png)', items_str):
  239. aa = m.groups()[0]
  240. if m.groups()[1]:
  241. aa = m.groups()[1]
  242. elif m.groups()[2]:
  243. aa = m.groups()[2]
  244. ans_no1.append(int(aa))
  245. ans_no_idx1.append(m.start())
  246. # 下面为第一次方案,不要
  247. # if m.groups()[0] or (m.groups()[1] and len(m.groups()[1])==1):
  248. # aa = m.groups()[0] if m.groups()[0] else m.groups()[1]
  249. # ans_no1.append(int(aa))
  250. # ans_no_idx1.append(m.start())
  251. # ans_no2.append(int(aa))
  252. # ans_no_idx2.append(m.start())
  253. # if m.groups()[1] and len(m.groups()[1])==2:
  254. # ans_no1.append(int(m.groups()[1][-1])) # 拿一个数
  255. # ans_no2.append(int(m.groups()[1])) # 拿2个数
  256. # ans_no_idx1.append(m.start()+1)
  257. # ans_no_idx2.append(m.start())
  258. else:
  259. for m in re.finditer(r'\n+\s*[((]\s*([1-9]|[1-9][0-9])\s*[))]\s*[..、、::]?'
  260. r'|(?<![::+\-、、*/])\s*[((]\s*([1-9]|[1-9][0-9])\s*[))]\s*[..、、::]?', items_str):
  261. aa = m.groups()[0] if m.groups()[0] else m.groups()[1]
  262. ans_no1.append(int(aa))
  263. ans_no_idx1.append(m.start())
  264. # if m.groups()[0] or (m.groups()[1] and len(m.groups()[1])==1):
  265. # aa = m.groups()[0] if m.groups()[0] else m.groups()[1]
  266. # ans_no1.append(int(aa))
  267. # ans_no_idx1.append(m.start())
  268. # ans_no2.append(int(aa))
  269. # ans_no_idx2.append(m.start())
  270. # if m.groups()[1] and len(m.groups()[1])==2:
  271. # ans_no1.append(int(m.groups()[1][-1]))
  272. # ans_no2.append(int(m.groups()[1]))
  273. # ans_no_idx1.append(m.start()+1)
  274. # ans_no_idx2.append(m.start())
  275. return ans_no1, ans_no_idx1
  276. else:
  277. # 序号前要求换行或空格
  278. if ans_item_no_type == 1:
  279. for m in re.finditer(r'\n+\s*([1-9]|[1-9][0-9])\s*[..、、::]'
  280. r'|(?<![::..、、+\-*/((\[{])\s+([1-9]|[1-9][0-9])\s*[..、、::](?!png)', items_str):
  281. aa = m.groups()[0] if m.groups()[0] else m.groups()[1]
  282. ans_no1.append(int(aa))
  283. ans_no_idx1.append(m.start())
  284. else:
  285. for m in re.finditer(r'\n+\s*[((]\s*([1-9]|[1-9][0-9])\s*[))]\s*[..、、::]?'
  286. r'|(?<![::+\-、、*/])\s+[((]\s*([1-9]|[1-9][0-9])\s*[))]\s*[..、、::]?', items_str):
  287. aa = m.groups()[0] if m.groups()[0] else m.groups()[1]
  288. ans_no1.append(int(aa))
  289. ans_no_idx1.append(m.start())
  290. return ans_no1, ans_no_idx1
  291. def get_seq_no(item_no, rest_item_split, flag=1):
  292. """
  293. 对于漏缺的题号进一步判断获取!!!
  294. :param item_no:
  295. :param rest_item_split:
  296. :param flag: 题号为18(1)类型
  297. :return:
  298. """
  299. seq_no = find_seq_num(item_no)
  300. if len(seq_no) > 0:
  301. for num, s in enumerate(seq_no[1:]):
  302. last_idx = item_no.index(seq_no[num][-1])
  303. last_item = rest_item_split[last_idx]
  304. for que_no in list(range(seq_no[num][-1] + 1, s[0])):
  305. new_split = re.split("\s+" + str(que_no) + "\s*[((]\s*1\s*[))]",
  306. last_item, maxsplit=1)
  307. if len(new_split) == 2:
  308. rest_item_split[last_idx] = new_split[0]
  309. rest_item_split.insert(last_idx + 1, "(1)"+new_split[1])
  310. item_no.insert(item_no.index(s[0]), que_no)
  311. last_idx += 1
  312. last_item = rest_item_split[last_idx]
  313. return item_no, rest_item_split
  314. if __name__ == '__main__':
  315. ans_no4=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20]
  316. nos = get_consecutive_no(ans_no4)
  317. print(nos)