cdZWj
/
new_tiku_structure_2021


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import re


def _record_errmsg(errmsg_dict, slot, errmsg):
    """Attach *errmsg* to keys ``slot - 1`` and ``slot`` of *errmsg_dict*.

    A message already stored under a key is kept and the new one is appended
    after a ``";"`` separator.
    """
    for key in (slot - 1, slot):
        if key in errmsg_dict:
            errmsg_dict[key] = errmsg_dict[key] + ";" + errmsg
        else:
            errmsg_dict[key] = errmsg


def get_con(subcon, item_no_type, **kwargs):
    """Mark where each following question starts and split on that mark.

    Every fourth element of *subcon* starting at index 4 (the last element of
    the list is never examined) is scanned line by line. The position of the
    next question's text is located mainly through its leading question
    number, with blank lines used as a secondary hint, and a ``【content】``
    marker is written at that position. Finally the whole list is joined with
    newlines and split on the marker.

    NOTE(review): presumably *subcon* is the result of splitting a document on
    its answer/analysis markers, so that the examined elements hold "analysis
    of the previous question + text of the next question" -- confirm against
    the caller.

    :param subcon: list of str; the examined elements are rewritten in place.
    :param item_no_type: ``1`` selects numbers written like ``12.``; any other
        value selects numbers written like ``(12)``. Only 1-49 are recognised.
    :param kwargs: ``index`` (required) is added to the running count when an
        error message is built; ``all_type`` and ``num`` (optional) select a
        type label (``all_type[num]``) that prefixes error messages.
    :return: ``(all_item, item_no, errmsg_dict, count)`` -- the split pieces,
        the question number chosen at each examined boundary, a dict of error
        messages keyed by ``pos // 4 - 1`` and ``pos // 4``, and one plus the
        number of examined boundaries.
    """
    from collections import Counter

    marker = "【content】"
    errmsg_dict = {}
    item_no = []
    type_info = kwargs['all_type'][kwargs['num']] if 'all_type' in kwargs else ""
    index = kwargs['index']
    if type_info:
        errmsg_bef = type_info + "第{0}道题(在整篇文档中为第{1}题)的题文和上一题的解析之间"
    else:
        errmsg_bef = "整篇文档中第{0}题的题文和上一题的解析之间{1}"

    # The pattern does not change between iterations, so compile it once.
    if item_no_type == 1:
        no_pattern = re.compile(r"([1-9]|[1-4][0-9])\s*[.．、､]")
    else:
        no_pattern = re.compile(r"\(([1-9]|[1-4][0-9])\)\s*[.．、､]?")

    def flag(reason, count, pos):
        # Build the full message lazily so the formatting only runs on errors.
        if type_info:
            prefix = errmsg_bef.format(str(count), str(index + count))
        else:
            prefix = errmsg_bef.format(str(index + count), "")
        # Floor division keeps the keys integral on Python 3; they still hash
        # and compare equal to the floats that ``pos / 4`` would produce.
        _record_errmsg(errmsg_dict, pos // 4, prefix + reason)

    count = 1  # number of questions seen so far in this section
    for pos in range(4, len(subcon), 4):
        if pos >= len(subcon) - 1:  # the last element is left untouched
            continue
        count += 1  # position of the upcoming question inside this section

        ssub = subcon[pos].strip().split("\n")  # drop leading/trailing blank lines first
        blank_line = [i for i, v in enumerate(ssub) if v.strip() == ""]

        # line index -> question number, for every line that starts with a number
        line_topicno_dict = {}
        for i, v in enumerate(ssub):
            found = no_pattern.match(v.strip())
            if found:
                line_topicno_dict[i] = found.group(1)
        con_id_line = list(line_topicno_dict.keys())
        topicno = list(line_topicno_dict.values())
        # number -> line index; for a repeated number the last line wins
        topicno_line_dict = dict(zip(topicno, con_id_line))

        # NOTE(review): nothing records the number of the first question, so
        # until a boundary has been resolved we assume numbering follows the
        # position inside the section. Previously this case raised IndexError.
        prev_no = item_no[-1] if item_no else count - 1

        if len(con_id_line) != len(topicno_line_dict):
            # Some number appears more than once, so the repeated ones are
            # not treated as question numbers.
            unique_nos = [no for no, times in Counter(topicno).items() if times == 1]
            if len(unique_nos) == 1 and 0 <= int(unique_nos[0]) - (prev_no + 1) <= 1:
                ssub.insert(topicno_line_dict[unique_nos[0]], marker)
                item_no.append(int(unique_nos[0]))
            else:
                item_no.append(prev_no + 1)
                flag("【多个相同的题目序号或题目序号有误】", count, pos)
                if len(blank_line) == 1:  # a lone blank line is the likeliest boundary
                    ssub[blank_line[0]] = marker
                else:
                    # Text length before / from each numbered line.
                    spans = [(len("".join(ssub[0:r])), len("".join(ssub[r:])))
                             for r in con_id_line]
                    plausible = [k for k, (before, after) in enumerate(spans)
                                 if before > 100 and 50 <= after <= 200]
                    if len(plausible) == 1:
                        print("【多个相同的题目序号】切分不严谨")
                        ssub.insert(con_id_line[plausible[0]], marker)
                    else:
                        # Cannot decide: put the marker in front of everything.
                        # (list.extend returns None, so concatenate instead.)
                        ssub = ["", marker] + ssub
        elif len(con_id_line) == 1:  # exactly one numbered line
            if len(blank_line) == 1 and con_id_line[0] - blank_line[0] == 1:
                # blank line immediately followed by the numbered line
                ssub.insert(con_id_line[0], marker)
                item_no.append(int(topicno[0]))
            elif 0 <= int(topicno[0]) - (prev_no + 1) <= 1:  # tolerate skipping one number
                ssub.insert(con_id_line[0], marker)
                item_no.append(int(topicno[0]))
            else:
                # The number looks wrong; fall back to a blank line if any,
                # otherwise to the front of the element.
                if blank_line:
                    ssub[blank_line[0]] = marker
                else:
                    ssub = ["", marker] + ssub
                item_no.append(prev_no + 1)
                flag("【题目序号不连续】", count, pos)
        elif len(con_id_line) > 1:  # several distinct numbered lines
            for step in (1, 2):
                wanted = prev_no + step
                if str(wanted) in topicno:
                    item_no.append(wanted)
                    # Look up the number that was just chosen, not the one after it.
                    ssub.insert(topicno_line_dict[str(wanted)], marker)
                    break
            else:
                item_no.append(prev_no + 1)
                ssub = ["", marker] + ssub
                flag("【题目序号不连续】", count, pos)
        else:  # no numbered line at all
            item_no.append(prev_no + 1)
            flag("【没有题目序号】", count, pos)
            if len(blank_line) == 1:
                ssub[blank_line[0]] = marker
            else:
                ssub = ["", marker] + ssub

        subcon[pos] = "\n".join(ssub)

    all_item = re.split(r"【content】", "\n".join(subcon).strip())

    return all_item, item_no, errmsg_dict, count