#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
from itertools import groupby
from operator import itemgetter

from utils.washutil import table_label_cleal
def find_seq_num(num_list):
    """Split a list of numbers into runs of consecutive values.

    Two neighbouring entries stay in the same run only when the second is
    exactly one larger than the first. This is used to spot breaks in a list
    of question numbers that should be continuous.

    :param num_list: numbers in their original order,
        e.g. ``[3, 4, 8, 9, 12, 13, 14]``
    :return: list of runs, e.g. ``[[3, 4], [8, 9], [12, 13, 14]]``; every
        value is passed through ``int()``. An empty input gives ``[]``.
    """
    # Inside a run of consecutive values, (position - value) stays constant,
    # so grouping on that difference yields exactly one group per run.
    return [
        [int(value) for _, value in run]
        for _, run in groupby(enumerate(num_list),
                              key=lambda pair: pair[0] - pair[1])
    ]
def judge_item_no_type(items_con):
    """Decide which question-number style a paper uses and locate the numbers.

    Two styles are recognised: style 1 is a bare number followed by a
    punctuation mark (``1、``), style 2 is a number in parentheses (``(1)``).
    Style 1 is accepted when more than 2 such numbers are found; otherwise
    style 2 is accepted when more than 6 are found; otherwise a warning is
    printed and the (sparse) style-1 result is returned.

    :param items_con: text of the paper's questions
    :return: ``(all_con, item_no_info, item_no_type)`` where ``all_con`` is
        the text returned by ``table_label_cleal``, ``item_no_info`` is a list
        of ``(match start offset, number as string)`` and ``item_no_type`` is
        1 or 2
    """
    # NOTE(review): the doubled characters in the classes below (``((``,
    # ``..``, ``::``) look like full-width variants lost to Unicode
    # normalisation -- confirm against the original file.
    plain_pattern = r'\n+\s*([1-9]|[1-9][0-9])\s*[..、、::]'
    paren_pattern = r'\n+\s*[((]\s*([1-9]|[1-9][0-9])\s*[))]\s*[..、、::]?'

    # table_label_cleal is a project helper; presumably it strips table
    # markup -- its exact behaviour is not visible from here.
    all_con = table_label_cleal(items_con)

    plain_hits = [(m.start(), m.group(1))
                  for m in re.finditer(plain_pattern, all_con)]
    if len(plain_hits) > 2:
        return all_con, plain_hits, 1

    paren_hits = [(m.start(), m.group(1))
                  for m in re.finditer(paren_pattern, all_con)]
    if len(paren_hits) > 6:
        return all_con, paren_hits, 2

    # Neither style is convincing: warn and fall back to style 1.
    print("本份试卷题号有问题!")
    return all_con, plain_hits, 1
def judge_ans_no_type(items_ans, item_type_num):
    """Decide which question-number style the answer section uses.

    A first pass collects style-1 numbers (``1、``) from every answer chunk.
    A chunk with no such number that contains the text ``"table"`` is assumed
    to hold its answers in a table, and placeholder numbers are generated for
    it. If that pass finds at most 2 numbers, a second pass counts style-2
    numbers (``(1)``); more than 6 of them selects style 2.

    :param items_ans: list of answer text chunks
    :param item_type_num: indexable per chunk; element ``[num][1]`` is used
        as the amount of placeholder numbers to generate for a table chunk
        (presumably the question count of that section -- confirm with caller)
    :return: 1 for the ``1、`` style, 2 for the ``(1)`` style
    """
    # NOTE(review): the doubled characters in the class (``..``, ``::``) look
    # like full-width variants lost to Unicode normalisation -- confirm.
    plain_pattern = r'\n+\s*([1-9]|[1-9][0-9])\s*[..、、::]'
    paren_pattern = r'\n+\s*\(([1-9]|[1-9][0-9])\)\s*[..、、::]?'

    ans_item_no = []  # provisional numbers seen in the answers
    for num, one_type in enumerate(items_ans):
        one_type = table_label_cleal("\n" + one_type)
        one_type_no = [int(no) for no in re.findall(plain_pattern, one_type)]
        if not one_type_no and "table" in one_type:
            # Answers kept in a table: number them from 1, or continue right
            # after the last number seen so far. The original bounded the
            # continuation by len(ans_item_no), which produces the wrong
            # amount whenever the numbers so far do not run 1..len.
            count = item_type_num[num][1]
            first = ans_item_no[-1] + 1 if ans_item_no else 1
            ans_item_no.extend(range(first, first + count))
        ans_item_no.extend(one_type_no)

    if len(ans_item_no) > 2:
        return 1

    # Too few style-1 numbers: count style-2 numbers instead (table chunks
    # are not considered in this pass).
    paren_no = []
    for one_type in items_ans:
        one_type = table_label_cleal("\n" + one_type)
        paren_no.extend(int(no) for no in re.findall(paren_pattern, one_type))
    return 2 if len(paren_no) > 6 else 1
def pre_get_item_no(items_con, item_no_type, flag=0):
    """Collect candidate question numbers and their offsets from the text.

    :param items_con: text to scan; a number is only recognised after at
        least one newline
    :param item_no_type: 1 for the ``1、`` style, anything else for the
        ``(1)`` style
    :param flag: when truthy, use the alternative patterns: for style 1 a
        number directly followed by ``(1)`` (e.g. ``18(1)``), for style 2 a
        parenthesised number without the optional trailing punctuation
    :return: list of ``(match start offset, number as string)`` in text order
    """
    # NOTE(review): the doubled characters in the classes (``((``, ``..``,
    # ``::``) look like full-width variants lost to Unicode normalisation --
    # confirm against the original file.
    if flag:
        if item_no_type == 1:
            pattern = r'\n+\s*([1-9]|[1-9][0-9])\s*[((]\s*1\s*[))]'
        else:
            pattern = r'\n+\s*[((]\s*([1-9]|[1-9][0-9])\s*[))]'
    elif item_no_type == 1:
        pattern = r'\n+\s*([1-9]|[1-9][0-9])\s*[..、、::]'
    else:
        pattern = r'\n+\s*[((]\s*([1-9]|[1-9][0-9])\s*[))]\s*[..、、::]?'

    # Pick the pattern first so the text is scanned once (the original
    # scanned with the default pattern and discarded the result when flag
    # was set).
    return [(m.start(), m.group(1)) for m in re.finditer(pattern, items_con)]
def del_exception_value(item_list, items_no_idx):
    """Drop outlier question numbers together with their offsets.

    Step 1 repeatedly removes the largest number while it exceeds
    ``len(list) + 4``. Step 2 returns early when the remaining numbers span
    roughly as many values as there are entries (difference of at most 3).
    Step 3 otherwise discards every number whose deviation from the mean,
    divided by the variance, exceeds 0.3; if that would discard everything
    the step-1 result is returned instead.

    Unlike the original, the inputs are not modified and the offset list is
    kept aligned with the number list when step 1 removes a value.

    :param item_list: candidate question numbers
    :param items_no_idx: offsets parallel to ``item_list``
    :return: ``(numbers, offsets)`` as two new lists
    """
    import numpy as np

    numbers = list(item_list)
    offsets = list(items_no_idx)

    # Step 1: strip implausibly large values (e.g. a stray "88.").
    while numbers and max(numbers) > len(numbers) + 4:
        drop_at = numbers.index(max(numbers))
        del numbers[drop_at]
        if drop_at < len(offsets):
            # Keep the offsets parallel to the numbers.
            del offsets[drop_at]
        print(numbers)

    if not numbers:
        return numbers, offsets

    # Step 2: first/last span is close to the entry count -> accept as is.
    if abs((numbers[-1] - numbers[0] + 1) - len(numbers)) <= 3:
        return numbers, offsets

    # Step 3: deviation test. Note the divisor is the variance (not the
    # standard deviation); the 0.3 threshold belongs to that scale.
    arr_mean = np.mean(numbers)
    arr_var = np.var(numbers)
    if arr_var == 0:
        # All values identical: nothing can be an outlier, avoid 0-division.
        return numbers, offsets

    outliers = {n for n in numbers if abs((n - arr_mean) / arr_var) > 0.3}
    if not outliers:
        return numbers, offsets

    kept = [(n, off) for n, off in zip(numbers, offsets) if n not in outliers]
    if not kept:
        return numbers, offsets
    right_seq = [n for n, _ in kept]
    new_no_idx = [off for _, off in kept]
    return right_seq, new_no_idx
def _chain_positions(numbers, start):
    """Return the positions of the greedy chain that begins at ``start``.

    From the current number the walk jumps to the first later occurrence of
    ``number + 1`` or, failing that, ``number + 2``, and stops when neither
    appears in the rest of the list.
    """
    positions = [start]
    expected = numbers[start]
    cursor = start
    while True:
        tail = numbers[cursor + 1:]
        if expected + 1 in tail:
            expected += 1
        elif expected + 2 in tail:
            expected += 2
        else:
            return positions
        cursor += tail.index(expected) + 1
        positions.append(cursor)


def get_right_no(items_no_info, flag=0, have_type=0, last_id=0):
    """Repair a list of detected question numbers that is not continuous.

    Typical detection errors are repeated numbers, missing numbers and stray
    far-off numbers such as ``88.``. When the numbers do not form a single
    consecutive run, a chain is built from every starting position (each step
    moves to the next later number that is 1 or 2 larger). With
    ``have_type`` set, the first chain that starts at ``last_id + 1`` or
    ``last_id + 2`` is returned; otherwise, or if no such chain exists, the
    first longest chain is returned.

    :param items_no_info: with ``flag`` falsy, a list of
        ``(offset, number)`` pairs where the number is converted with
        ``int()``; with ``flag`` truthy, a pair ``(offsets, numbers)`` of
        parallel sequences
    :param flag: selects the layout of ``items_no_info`` (see above)
    :param have_type: truthy when the numbers belong to one section that
        follows a section-heading line, so numbering should continue after
        ``last_id``
    :param last_id: last question number of the preceding section
    :return: ``(numbers, offsets)`` of the accepted questions
    """
    if flag:
        items_no, items_no_idx = items_no_info[1], items_no_info[0]
    else:
        items_no = [int(info[1]) for info in items_no_info]
        items_no_idx = [info[0] for info in items_no_info]

    seq_no = find_seq_num(items_no)  # runs of consecutive numbers
    print("---items_no:", items_no)
    print("---seq_no:", seq_no)
    if len(seq_no) <= 1:
        # Already one consecutive run (or empty): nothing to repair.
        return items_no, items_no_idx

    print('按题号切分的过程中,存在分断或分错的地方')
    # One candidate chain per starting position. Positions without a
    # matching offset are ignored, as the original did for chain members.
    offset_count = len(items_no_idx)
    chains = [
        [pos for pos in _chain_positions(items_no, start) if pos < offset_count]
        for start in range(len(items_no))
    ]

    if have_type:
        wanted = (last_id + 1, last_id + 2)
        for chain in chains:
            if chain and items_no[chain[0]] in wanted:
                return ([items_no[pos] for pos in chain],
                        [items_no_idx[pos] for pos in chain])

    best = max(chains, key=len)  # first of the longest chains
    max_seq_no = [items_no[pos] for pos in best]
    max_seq_idx = [items_no_idx[pos] for pos in best]
    print("get_right_no最后的题号:", max_seq_no)
    print("get_right_no最后的题号位置:", max_seq_idx)
    return max_seq_no, max_seq_idx
def _greedy_chain(numbers, start):
    """Return the numbers of the greedy chain that begins at ``start``.

    Each step moves to the first later occurrence of ``last + 1`` or, failing
    that, ``last + 2``; the walk ends when neither is found.
    """
    chain = [numbers[start]]
    cursor = start
    while True:
        tail = numbers[cursor + 1:]
        if chain[-1] + 1 in tail:
            wanted = chain[-1] + 1
        elif chain[-1] + 2 in tail:
            wanted = chain[-1] + 2
        else:
            return chain
        cursor += tail.index(wanted) + 1
        chain.append(wanted)


def get_consecutive_no(items_no):
    """Return the longest near-consecutive chain of question numbers.

    If every number is exactly one larger than its predecessor the input is
    returned unchanged. Otherwise a chain is built from each starting
    position, allowing a step of 1 or 2 between members, and the first of the
    longest chains is returned.

    :param items_no: detected question numbers in document order
    :return: ``items_no`` itself when already consecutive (or empty),
        otherwise a new list holding the longest chain
    """
    # Same condition as "more than one consecutive run", checked directly on
    # neighbouring pairs.
    has_break = any(nxt - cur != 1
                    for cur, nxt in zip(items_no, items_no[1:]))
    if not has_break:
        return items_no

    print('按题号切分的过程中,存在分断或分错的地方')
    chains = [_greedy_chain(items_no, start) for start in range(len(items_no))]
    max_seq_no = max(chains, key=len)  # first of the longest chains
    print("get_consecutive_no最长连续的题号:", max_seq_no)
    return max_seq_no
def get_many_ans_no(items_str, ans_item_no_type, reget=0):
    """Find answer numbers in text that may hold several answers per line.

    A number is always accepted at the start of a line. In addition:

    * default mode: a number is accepted when it is preceded by whitespace
      (and that whitespace does not follow an operator/punctuation/bracket
      character listed in the look-behind);
    * ``reget`` mode: the whitespace requirement is relaxed -- for style 1 a
      number glued to the preceding text is accepted unless the preceding
      character is a digit or one of the excluded characters; for style 2
      the whitespace before the parenthesis becomes optional.

    For style 1 a candidate followed by ``png`` is rejected (this looks like
    a guard against image file names such as ``1.png``).

    :param items_str: answer text to scan
    :param ans_item_no_type: 1 for the ``1、`` style, anything else for the
        ``(1)`` style
    :param reget: truthy to use the relaxed patterns
    :return: ``(numbers, offsets)`` -- parallel lists with each number as
        ``int`` and the start offset of its match
    """
    # NOTE(review): the doubled characters in the classes (``((``, ``..``,
    # ``::``) look like full-width variants lost to Unicode normalisation --
    # confirm against the original file.
    line_plain = r'\n+\s*([1-9]|[1-9][0-9])\s*[..、、::]'
    spaced_plain = (r'|(?<![::..、、+\-*/((\[{])\s+([1-9]|[1-9][0-9])'
                    r'\s*[..、、::](?!png)')
    glued_plain = (r'|(?<![::..、、+\-*/((\[{\d])([1-9]|[1-9][0-9])'
                   r'\s*[..、、::](?!png)')
    line_paren = r'\n+\s*[((]\s*([1-9]|[1-9][0-9])\s*[))]\s*[..、、::]?'
    loose_paren = (r'|(?<![::+\-、、*/])\s*[((]\s*([1-9]|[1-9][0-9])'
                   r'\s*[))]\s*[..、、::]?')
    spaced_paren = (r'|(?<![::+\-、、*/])\s+[((]\s*([1-9]|[1-9][0-9])'
                    r'\s*[))]\s*[..、、::]?')

    if ans_item_no_type == 1:
        pattern = line_plain + spaced_plain
        if reget:
            pattern += glued_plain
    elif reget:
        pattern = line_paren + loose_paren
    else:
        pattern = line_paren + spaced_paren

    ans_no1 = []
    ans_no_idx1 = []
    for m in re.finditer(pattern, items_str):
        # Exactly one alternative matched, so exactly one group is set.
        number = next(group for group in m.groups() if group)
        ans_no1.append(int(number))
        ans_no_idx1.append(m.start())
    return ans_no1, ans_no_idx1
def get_seq_no(item_no, rest_item_split, flag=1):
    """Recover question numbers that were missed while splitting.

    For every gap in ``item_no`` (a number not followed by its successor)
    each missing number ``n`` is searched for inside the text of the question
    just before the gap, written as ``n(1)`` after whitespace. When found,
    that text is split in two: the head stays in place, the tail (prefixed
    with ``"(1)"``) is inserted as a new item, and ``n`` is inserted into
    ``item_no``.

    Both lists are modified in place and also returned.

    :param item_no: question numbers parallel to ``rest_item_split``
    :param rest_item_split: question texts
    :param flag: unused; the original note says it marks numbers of the
        ``18(1)`` kind
    :return: ``(item_no, rest_item_split)``
    """
    # Snapshot the gaps before any insertion changes the list.
    gaps = [(cur, nxt)
            for cur, nxt in zip(item_no, item_no[1:]) if nxt - cur != 1]

    for run_end, next_start in gaps:
        last_idx = item_no.index(run_end)
        for que_no in range(run_end + 1, next_start):
            # NOTE(review): the doubled ``((`` / ``))`` look like full-width
            # variants lost to Unicode normalisation -- confirm.
            pieces = re.split(
                r"\s+" + str(que_no) + r"\s*[((]\s*1\s*[))]",
                rest_item_split[last_idx],
                maxsplit=1,
            )
            if len(pieces) != 2:
                continue
            rest_item_split[last_idx] = pieces[0]
            rest_item_split.insert(last_idx + 1, "(1)" + pieces[1])
            item_no.insert(item_no.index(next_start), que_no)
            # Later missing numbers are looked for in the item just created.
            last_idx += 1
    return item_no, rest_item_split
if __name__ == '__main__':
    # Manual smoke check: the list below has a single gap (16 is missing).
    ans_no4 = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20]
    nos = get_consecutive_no(ans_no4)
    print(nos)
|