#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Locate question numbers in exam-paper text and repair mis-split numbering.

NOTE(review): this module was recovered from a copy in which all line breaks
had been collapsed and every span between a ``<`` and the following ``>`` had
been stripped.  Three definitions lost part of their text and could not be
restored with confidence: ``get_right_no``, ``get_many_ans_no`` and one
function at the end of the file whose header is missing.  They are marked
below; restore them from version control.
"""
import re
from itertools import groupby
from operator import itemgetter

try:
    from utils.washutil import table_label_cleal
except ImportError as _import_error:
    # Keep the pure numbering helpers usable when the project package is not
    # on the path; the failure is deferred to the first call that needs it.
    _WASHUTIL_IMPORT_ERROR = _import_error

    def table_label_cleal(text):
        """Placeholder that re-raises the original import failure on use."""
        raise ImportError(
            "utils.washutil.table_label_cleal is required for this call"
        ) from _WASHUTIL_IMPORT_ERROR


# Building blocks for the question-number patterns.  The full-width variants
# are written as escapes so that text normalisation cannot fold them into
# their ASCII twins (the recovered source showed such folded duplicates,
# e.g. "[..]" and "[((]").
_NUMBER = r'([1-9][0-9]?)'                 # 1..99, same matches as ([1-9]|[1-9][0-9])
_DELIM = r'[.\uFF0E\u3001\uFF64:\uFF1A]'   # . and full-width dot, both ideographic commas, : and full-width colon
_LPAREN = r'[(\uFF08]'
_RPAREN = r'[)\uFF09]'

# "1." / "1、" style numbers at the start of a line.
_PLAIN_NO = re.compile(r'\n+\s*' + _NUMBER + r'\s*' + _DELIM)
# "(1)" style numbers at the start of a line, optional trailing delimiter.
_PAREN_NO = re.compile(
    r'\n+\s*' + _LPAREN + r'\s*' + _NUMBER + r'\s*' + _RPAREN + r'\s*' + _DELIM + r'?'
)
# "3(1)": a plain number immediately followed by its first sub-question.
_PLAIN_NO_BEFORE_SUB = re.compile(
    r'\n+\s*' + _NUMBER + r'\s*' + _LPAREN + r'\s*1\s*' + _RPAREN
)
# "(1)" with no trailing delimiter allowed in the pattern.
_PAREN_NO_BARE = re.compile(r'\n+\s*' + _LPAREN + r'\s*' + _NUMBER + r'\s*' + _RPAREN)


def _scan(pattern, text):
    """Return ``(match_start, number_text)`` for every match of *pattern*."""
    return [(m.start(), m.group(1)) for m in pattern.finditer(text)]


def find_seq_num(num_list):
    """Group numbers into runs of consecutive values.

    Used when repairing wrongly split question numbers, on the assumption
    that correct numbers are consecutive.

    :param num_list: e.g. ``[3, 4, 8, 9, 12, 13, 14]``
    :return: e.g. ``[[3, 4], [8, 9], [12, 13, 14]]``
    """
    # Inside a run that grows by one per element, (position - value) is
    # constant, so it serves as the grouping key.
    return [
        list(map(int, map(itemgetter(1), run)))
        for _, run in groupby(enumerate(num_list), lambda pair: pair[0] - pair[1])
    ]


def judge_item_no_type(items_con):
    """Decide which question-number style a paper uses.

    Two styles are recognised: ``1、`` (type 1) and ``(1)`` (type 2).

    :param items_con: raw paper text; it is passed through
        ``table_label_cleal`` before scanning
    :return: ``(cleaned_text, item_no_info, item_no_type)`` where
        ``item_no_info`` is a list of ``(match_start, number_text)`` tuples
    """
    all_con = table_label_cleal(items_con)
    item_no_info = _scan(_PLAIN_NO, all_con)
    item_no_type = 1
    if len(item_no_info) <= 2:
        # Too few "1、" numbers: see whether the paper uses "(1)" instead.
        bracket_info = _scan(_PAREN_NO, all_con)
        if len(bracket_info) <= 6:
            print("本份试卷题号有问题!")
        else:
            item_no_info = bracket_info
            item_no_type = 2
    return all_con, item_no_info, item_no_type


def judge_ans_no_type(items_ans, item_type_num):
    """Decide which question-number style the answer part uses.

    :param items_ans: one answer text per section
    :param item_type_num: per-section info; ``item_type_num[i][1]`` is used
        as the number of answers to assume when section *i* keeps its
        answers in a table (presumably the item count of that section;
        verify against the caller)
    :return: 1 for the ``1、`` style, 2 for the ``(1)`` style
    """
    ans_item_no = []  # preliminary answer numbers, only their count is used
    for num, one_type in enumerate(items_ans):
        one_type = table_label_cleal("\n" + one_type)
        one_type_no = [int(no) for no in _PLAIN_NO.findall(one_type)]
        if not one_type_no and "table" in one_type:
            # Answers held in a table carry no visible numbers: assume they
            # continue after the last number seen (or start from 1).
            # The original end bound mixed len(ans_item_no) with the last
            # number and added the wrong count when numbering had gaps.
            count = item_type_num[num][1]
            first = ans_item_no[-1] + 1 if ans_item_no else 1
            ans_item_no.extend(range(first, first + count))
        ans_item_no.extend(one_type_no)

    ans_item_no_type = 1
    if len(ans_item_no) <= 2:
        # Retry with the "(1)" style; answers stored in tables are not
        # considered here.  Uses the same bracket pattern as the question
        # side (the original accepted ASCII brackets only).
        bracket_no = []
        for one_type in items_ans:
            one_type = table_label_cleal("\n" + one_type)
            bracket_no.extend(int(no) for no in _PAREN_NO.findall(one_type))
        if len(bracket_no) > 6:
            ans_item_no_type = 2
    return ans_item_no_type


def pre_get_item_no(items_con, item_no_type, flag=0):
    """Collect candidate question numbers and their positions.

    :param items_con: text to scan; numbers are only recognised after a
        newline
    :param item_no_type: 1 for the ``1、`` style, anything else for ``(1)``
    :param flag: when truthy, use the stricter variants: for type 1 a number
        directly followed by ``(1)``, otherwise a bracketed number matched
        without the optional trailing delimiter
    :return: list of ``(match_start, number_text)`` tuples
    """
    # Pick the single pattern that applies instead of scanning twice and
    # discarding the first result when flag is set.
    if flag:
        pattern = _PLAIN_NO_BEFORE_SUB if item_no_type == 1 else _PAREN_NO_BARE
    else:
        pattern = _PLAIN_NO if item_no_type == 1 else _PAREN_NO
    return _scan(pattern, items_con)


def del_exception_value(item_list, items_no_idx):
    """Drop outlier question numbers together with their positions.

    The more questions there are, the more an outlier stands out.

    :param item_list: question numbers
    :param items_no_idx: positions parallel to *item_list*
    :return: ``(numbers, positions)`` as new lists; the arguments are not
        modified.  If filtering would leave nothing, the unfiltered data is
        returned.
    """
    numbers = list(item_list)
    positions = list(items_no_idx)
    if not numbers:
        return numbers, positions

    # A number far beyond the count of numbers found cannot be right.  The
    # matching position is removed too so both lists stay aligned.
    while numbers and max(numbers) > len(numbers) + 4:
        drop = numbers.index(max(numbers))
        del numbers[drop]
        del positions[drop]
        print(numbers)
    if not numbers:
        # Everything looked like an outlier: leave the input as it was.
        return list(item_list), list(items_no_idx)

    # Nearly contiguous from first to last: accept as is.
    if abs((numbers[-1] - numbers[0] + 1) - len(numbers)) <= 3:
        return numbers, positions

    import numpy as np  # only needed on this path

    arr_mean = np.mean(numbers)
    arr_var = np.var(numbers)
    if arr_var == 0:
        # All values equal: nothing can be singled out (avoids 0/0).
        return numbers, positions
    # NOTE(review): the deviation is divided by the variance, not the
    # standard deviation; kept as found, with the same 0.3 threshold.
    outliers = {n for n in numbers if abs((n - arr_mean) / arr_var) > 0.3}
    kept = [(n, p) for n, p in zip(numbers, positions) if n not in outliers]
    if not kept:
        return numbers, positions
    right_seq = [n for n, _ in kept]
    new_no_idx = [p for _, p in kept]
    return right_seq, new_no_idx


def _index_from(values, target, start):
    """Return the first position >= *start* holding *target*, or -1."""
    try:
        return values.index(target, start)
    except ValueError:
        return -1


def _longest_chain_positions(items_no):
    """Return positions in *items_no* of the longest near-consecutive chain.

    From every starting position the chain is extended with the first later
    occurrence of ``current + 1`` or, failing that, ``current + 2``.  The
    longest chain wins; on a tie the one starting earliest is kept.
    """
    best = []
    for start, current in enumerate(items_no):
        chain = [start]
        while True:
            for step in (1, 2):
                found = _index_from(items_no, current + step, chain[-1] + 1)
                if found != -1:
                    break
            else:
                break  # neither successor exists later in the list
            current += step
            chain.append(found)
        if len(chain) > len(best):
            best = chain
    return best


def get_right_no(items_no_info, flag=0, have_type=0, last_id=0):
    """Repair wrongly split question numbers (INCOMPLETE, see note).

    Splitting errors include repeated numbers, missing numbers and numbers
    far off the sequence such as ``88.``.

    :param items_no_info: with *flag* falsy, a list of
        ``(position, number_text)`` tuples; with *flag* truthy, a pair
        ``(positions, numbers)``
    :param flag: selects the shape of *items_no_info*
    :param have_type: not referenced in the surviving code
    :param last_id: not referenced in the surviving code
    :raises NotImplementedError: always.  The source is truncated inside the
        chain-building loop (right after ``num += next_n + 1`` / ``if num``);
        everything from there on, including the return value, is unknown.
    """
    if flag:
        items_no, items_no_idx = items_no_info[1], items_no_info[0]
    else:
        items_no = [int(info[1]) for info in items_no_info]
        items_no_idx = [info[0] for info in items_no_info]
    # Built in the surviving code but only consumed by the lost part.
    idx2no_dict = dict(zip(items_no_idx, items_no))
    seq_no = find_seq_num(items_no)  # runs of consecutive numbers
    print("---items_no:", items_no)
    print("---seq_no:", seq_no)
    if len(seq_no) > 1:  # the numbering is broken or mis-split somewhere
        print('按题号切分的过程中,存在分断或分错的地方')
        # Approach: scanning forward, collect sequences whose successive
        # numbers differ by at most 2; the longest one is taken as correct.
        # NOTE(review): the surviving loop collected text positions in the
        # same way get_consecutive_no collects numbers; assumed to mirror it.
        seq_idx = [items_no_idx[p] for p in _longest_chain_positions(items_no)]
    raise NotImplementedError(
        "get_right_no: the remainder of this function was lost from the "
        "source file; restore it from version control"
    )


def get_consecutive_no(items_no):
    """Return the longest near-consecutive chain of question numbers.

    NOTE(review): the header of this function was lost from the source.  The
    signature is reconstructed from the ``__main__`` call and the final
    ``return items_no``; the guard is assumed to be
    ``len(find_seq_num(items_no)) > 1``.  Confirm against version control.

    :param items_no: candidate question numbers in document order
    :return: *items_no* itself when it is a single consecutive run (or
        empty); otherwise a new list holding the longest chain in which each
        number is followed by the first later ``+1`` or ``+2`` value
    """
    if len(find_seq_num(items_no)) > 1:  # broken or mis-split numbering
        print('按题号切分的过程中,存在分断或分错的地方')
        max_seq_no = [items_no[p] for p in _longest_chain_positions(items_no)]
        print("get_consecutive_no最长连续的题号:", max_seq_no)
        return max_seq_no
    return items_no


def get_many_ans_no(items_str, ans_item_no_type, reget=0):
    """Extract answer numbers when one line holds several answers (INCOMPLETE).

    :param items_str: answer text
    :param ans_item_no_type: 1 for the ``1、`` style, 2 for ``(1)``
    :param reget: truthy to use the alternative extraction; the surviving
        comment says it keeps numbers that are not preceded by a digit,
        because guessing between one and two digits was unreliable
    :raises NotImplementedError: always.  Only the start of the body
        survives in the source:

            ans_no1 = []
            ans_no_idx1 = []
            if reget:
                if ans_item_no_type == 1:
                    for m in re.finditer(r'\\n+\\s*([1-9]|[1-9][0-9])\\s*[..、、::]'
                                         r'|(?   <-- truncated here
    """
    raise NotImplementedError(
        "get_many_ans_no: the body of this function was lost from the "
        "source file; restore it from version control"
    )


# NOTE(review): orphaned fragment.  The header of the function below (its
# name, parameters and the statements before the loop) was lost from the
# source, so it is preserved as text instead of being given an invented
# signature.  Indentation is reconstructed; verify when restoring.
#
#     ... > 0:
#         for num, s in enumerate(seq_no[1:]):
#             last_idx = item_no.index(seq_no[num][-1])
#             last_item = rest_item_split[last_idx]
#             for que_no in list(range(seq_no[num][-1] + 1, s[0])):
#                 new_split = re.split("\s+" + str(que_no) + "\s*[((]\s*1\s*[))]",
#                                      last_item, maxsplit=1)
#                 if len(new_split) == 2:
#                     rest_item_split[last_idx] = new_split[0]
#                     rest_item_split.insert(last_idx + 1, "(1)"+new_split[1])
#                     item_no.insert(item_no.index(s[0]), que_no)
#                     last_idx += 1
#                     last_item = rest_item_split[last_idx]
#     return item_no, rest_item_split


if __name__ == '__main__':
    ans_no4 = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20]
    nos = get_consecutive_no(ans_no4)
    print(nos)