#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
from collections import Counter

# Marker inserted where a new question stem is believed to start.
_STEM = "【stem】"


def _record_error(errmsg_dict, key, message):
    """Store *message* under *key*, joining with ";" if an entry already exists."""
    if key in errmsg_dict:
        errmsg_dict[key] = errmsg_dict[key] + ";" + message
    else:
        errmsg_dict[key] = message


def get_con(subcon, item_no_type, item_no, **kwargs):
    """Insert a stem marker into every fourth chunk of *subcon* and split on it.

    For each element ``subcon[4]``, ``subcon[8]``, ... (except one that is the
    very last element) the text is split into lines and the position where the
    next question's stem begins is guessed. Question-number lines are the
    primary signal; a single blank line is used as a secondary hint.
    The marker ``【stem】`` is written into the chunk, and finally the whole
    joined text is split on that marker.

    :param subcon: list of text chunks; modified in place.
        NOTE(review): presumably the result of splitting a document on
        answer/explanation tags - confirm against the caller.
    :param item_no_type: ``1`` selects numbers such as ``"12."``; any other
        value selects parenthesised numbers such as ``"(12)"``.
    :param item_no: list of question numbers found so far; extended in place.
        Most branches read ``item_no[-1]``, so it normally must be non-empty.
    :param kwargs: ``index`` (required int offset added to the running count
        in messages); ``all_type`` and ``num`` (optional, ``all_type[num]`` is
        used as a prefix in messages).
    :return: tuple ``(all_item, item_no, errmsg_dict, count)`` where
        ``all_item`` is the list of pieces between markers, ``errmsg_dict``
        maps integer positions (``pos // 4 - 1`` and ``pos // 4``) to warning
        text, and ``count`` is 1 plus the number of chunks processed.
    """
    errmsg_dict = {}
    type_info = kwargs['all_type'][kwargs['num']] if 'all_type' in kwargs else ""
    index = kwargs['index']
    if type_info:
        errmsg_bef = type_info + "第{0}道题(在整篇文档中为第{1}题)的题文和上一题的解析之间"
    else:
        errmsg_bef = "整篇文档中第{0}题的题文和上一题的解析之间{1}"

    # Compiled once: both patterns are loop-invariant.
    number_re = re.compile(
        r"([1-9]|[1-9][0-9])\s*[..、、]" if item_no_type == 1
        else r"\(([1-9]|[1-9][0-9])\)\s*[..、、]?"
    )
    high_number_re = re.compile(r"[5-9][0-9]\s*[..、、]")

    def flag(slot, ordinal, reason):
        """Record *reason* for both neighbouring positions ``slot - 1`` and ``slot``."""
        if type_info:
            text = errmsg_bef.format(str(ordinal), str(index + ordinal)) + reason
        else:
            text = errmsg_bef.format(str(index + ordinal), "") + reason
        _record_error(errmsg_dict, slot - 1, text)
        _record_error(errmsg_dict, slot, text)

    count = 1  # number of questions seen so far
    last_index = len(subcon) - 1
    for pos in range(4, len(subcon), 4):
        if pos >= last_index:
            continue  # the final element needs no marker
        count += 1
        # Integer division keeps the dictionary keys as ints on Python 3.
        slot = pos // 4

        ssub = subcon[pos].strip().split("\n")  # drop leading/trailing blank lines first
        blank_line = [i for i, v in enumerate(ssub) if v.strip() == ""]

        # line index -> candidate question number (as matched text)
        line_topicno = {}
        for i, v in enumerate(ssub):
            found = number_re.match(v.strip())
            if found:
                line_topicno[i] = found.group(1)
        con_id_line = list(line_topicno)
        topicno = list(line_topicno.values())
        # question number -> line index; duplicates collapse to the last line
        topicno_line = dict(zip(topicno, con_id_line))

        if len(con_id_line) != len(topicno_line):
            # The same candidate number appears on several lines.
            unique = [no for no, seen in Counter(topicno).items() if seen == 1]
            if len(unique) == 1 and 0 <= int(unique[0]) - (item_no[-1] + 1) <= 1:
                ssub.insert(topicno_line[unique[0]], _STEM)
                item_no.append(int(unique[0]))
            else:
                item_no.append(item_no[-1] + 1)
                flag(slot, count, "【多个相同的题目序号或题目序号有误,可能导致拆分题目错误】")
                if len(blank_line) == 1:
                    # A single blank line is the most likely boundary.
                    ssub[blank_line[0]] = _STEM
                else:
                    # Character counts before / from each candidate line.
                    spans = [(len("".join(ssub[0:r])), len("".join(ssub[r:])))
                             for r in con_id_line]
                    picks = [k for k, span in enumerate(spans)
                             if span[0] > 100 and 50 <= span[1] <= 200]
                    if len(picks) == 1:
                        print("【多个相同的题目序号】切分不严谨")
                        ssub.insert(con_id_line[picks[0]], _STEM)
                    else:
                        # Cannot decide: put the marker at the very top for now.
                        ssub[:0] = ["", _STEM]
        elif len(con_id_line) == 1:
            line = con_id_line[0]
            number = int(topicno[0])
            blank_right_before = len(blank_line) == 1 and line - blank_line[0] == 1
            # A gap of at most one skipped number is tolerated.
            if blank_right_before or 0 <= number - (item_no[-1] + 1) <= 1:
                ssub.insert(line, _STEM)
                item_no.append(number)
            else:
                if len(blank_line) == 1:
                    # The number looks wrong, so fall back to the blank line.
                    ssub[blank_line[0]] = _STEM
                else:
                    ssub.insert(line, _STEM)
                item_no.append(item_no[-1] + 1)
                # NOTE(review): the collapsed source does not show whether this
                # warning applied to both outcomes above or only the second;
                # both have a discontinuous number, so it is recorded for both.
                flag(slot, count, "【题目序号不连续,可能导致拆分题目错误】")
        elif len(con_id_line) > 1:
            expected = item_no[-1] + 1
            if str(expected) in topicno_line:
                item_no.append(expected)
                ssub.insert(topicno_line[str(expected)], _STEM)
            elif str(expected + 1) in topicno_line:
                item_no.append(expected + 1)
                ssub.insert(topicno_line[str(expected + 1)], _STEM)
            else:
                item_no.append(expected)
                ssub[:0] = ["", _STEM]  # cannot decide: marker at the top
                flag(slot, count, "【题目序号不连续,可能导致拆分题目错误】")
        else:
            # No candidate number at all.
            item_no.append(item_no[-1] + 1)
            flag(slot, count, "【没有题目序号,可能导致拆分题目错误】")
            if len(blank_line) == 1:
                ssub[blank_line[0]] = _STEM
            elif len(ssub) == 2:
                # Exactly one line break: split there.
                ssub.insert(1, _STEM)
            elif item_no[-1] > 49:
                hits = [i for i, v in enumerate(ssub)
                        if high_number_re.match(v.strip())]
                if len(hits) == 1:
                    ssub.insert(hits[0], _STEM)
                else:
                    ssub[:0] = ["", _STEM]
            else:
                ssub[:0] = ["", _STEM]

        subcon[pos] = "\n".join(ssub)

    all_item = "\n".join(subcon).strip().split(_STEM)
    return all_item, item_no, errmsg_dict, count