# lightxu / exam-segment-django
# @Author  : lightXu
# @File    : group_text.py
import re

# Subject identifiers accepted by the is_topic_* helpers; any other value
# makes those helpers raise ValueError.
subjects = [
    'unknown_subject',
    'math',
    'math_zxhx',
    'english',
    'chinese',
    'physics',
    'chemistry',
    'biology',
    'politics',
    'history',
    'geography',
    'science_comprehensive',
    'arts_comprehensive',
]


# ------------- A line matching one of these patterns starts a topic -------------
# Raw strings are used so that regex escapes such as \s and \d are not treated
# as (invalid) Python string escapes.

# "<number><separator>" followed by a non-digit or by exactly four digits,
# e.g. "12. ..." or "3.2019"; the separator is mandatory.
general_start = re.compile(r"^\s*\d+\s*[\.、:：]\D|^\s*\d+\s*[\.、:：]\d{4}")
# "(<number>)" with an optional separator, e.g. "(1)".
math_start = re.compile(r"^\s*\(\d+\)\s*[\.、:：]?")
# A single Chinese numeral in half- or full-width parentheses, e.g. "（一）".
chinese_start = re.compile(r"^\s*[(（]\s*[一二三四五六七八九十]\s*[）)]\s*[\.、:：]?")
# Same pattern as math_start; only referenced from commented-out code in the
# visible part of this module.
write_start = re.compile(r"^\s*\(\d+\)\s*[\.、:：]?")


def is_topic_start(s, subject):
    """Tell whether line ``s`` opens a new topic for ``subject``.

    The generic numbered-item pattern applies to every supported subject.
    'math' additionally accepts lines containing "本题" / "本小题" or starting
    with "(n)"; 'chinese' additionally accepts lines starting with a
    parenthesised Chinese numeral.

    Returns:
        bool: True when the line is a topic start, False otherwise.

    Raises:
        ValueError: if ``subject`` is not listed in ``subjects``.
    """
    if subject not in subjects:
        raise ValueError("subject={} is not supported!".format(subject))

    # Generic rule shared by all subjects.
    if general_start.match(s):
        return True

    # Subject-specific extras.
    if subject == 'math':
        return "本题" in s or "本小题" in s or math_start.match(s) is not None
    if subject == 'chinese':
        return chinese_start.match(s) is not None
    return False


# ------------- A line matching one of these patterns ends a topic -------------
# Raw strings are used so that regex escapes such as \s are not treated as
# (invalid) Python string escapes.

# A "D" followed by a separator anywhere in the line (used with search()).
general_end = re.compile(r"D\s*[\.、:：]")
# A line starting with E, F or G followed by a separator or by whitespace.
english_end = re.compile(r"^\s*[EFG]\s*[\.、:：]|^\s*[EFG]\s+")
# A line starting with E, F or G followed by a separator.
chinese_end = re.compile(r"^\s*[EFG]\s*[\.、:：]")
# Writing-section headings; only referenced from commented-out code in the
# visible part of this module.
written_expression = re.compile(r'书面表达')
written_expression1 = re.compile(r'短文改错|翻译句子')


def is_topic_end(s, subject):
    """Tell whether line ``s`` closes the current topic for ``subject``.

    Returns:
        True  - for 'english' when the line contains a "D" option marker, or
                for 'chinese' when the line starts with an E/F/G option marker;
        "G"   - for 'english' when the line starts with an E/F/G option
                (and no "D" marker was found);
        False - in every other case, including all remaining subjects.

    Raises:
        ValueError: if ``subject`` is not listed in ``subjects``.
    """
    if subject not in subjects:
        raise ValueError("subject={} is not supported!".format(subject))

    if subject == 'english':
        # The "D" marker takes priority over the E/F/G marker.
        if general_end.search(s):
            return True
        return "G" if english_end.search(s) else False

    if subject == 'chinese':
        return chinese_end.search(s) is not None

    return False


# ------------- A line matching one of these is skipped (discarded) -------------
# Raw strings are used so that regex escapes such as \s and \d are not treated
# as (invalid) Python string escapes.

# Section headings numbered with Chinese numerals, e.g. "一、" or "十二:".
general_filter1 = re.compile(r"^\s*[一二三四五六七八九十]+\s*[\.、:：]")
# Substrings that mark a line as instructions / section boilerplate.
# (The exact duplicate of '题目要求' has been removed; membership is unchanged.)
general_filter2 = ['选择题', '单选题', '多选题', '综合题', '答案无效', '题目要求',
                   '填空题', '单空题', '多空题', '计算题', '演算步骤', '单元测试', '古代诗歌阅读',
                   '解答题', '简答题', '证明题', '按要求填写下列空格', '单项选择题', '注意事项',
                   '选做题', '实验题', '第II卷', '第Ⅱ卷', '一律得零分', '证明过程', '现代文阅读',
                   '第二卷', '答题卡', '试卷满分', '选题人', '最佳选项', '填写结果', '选不全', '文言文阅读',
                   '答题时间', '分值', '阅读下面文字', '阅读下面短文', '阅读下列短文',
                   '甲必考题', '必考题', '读一遍', '题卡', '符合题目要求', '规定区域', '符合要求', '阅读下面']

# Page markers such as "第3页" / "共(12)页". The former trailing alternative
# anchored with "$" was unreachable: the same alternative without the anchor
# already matches every string it could match, so it has been dropped.
end_sign = re.compile(r"第\d+页|共\d+页|页\d+第|\d+第|第[(（]\d+[)）]页|共[(（]\d+[)）]页")

# English-only skip rules: a line is skipped when it contains every word of at
# least one of these groups.
english_filter = [["听", "材料"], ["听", "对话"], ["听", "独白"],
                  ['第二节'], ['语言知识运用'], ['第II卷'], ['录音'], ['作答时间'], ['选项'],
                  ['第二部分'], ['第三部分'], ['第四部分'], ['第一部分'], ['第一节'], ['阅读下列短文'], ['短文'], ['阅读下面短文'], ['阅读']]


def contains_all(s, words):
    """Return True when ``s`` contains every item of at least one group.

    ``words`` is an iterable of groups; a group matches when each of its
    items is a substring of ``s``. Note that an empty group matches any
    string (``all`` of nothing is True), while an empty ``words`` never
    matches.
    """
    return any(all(item in s for item in group) for group in words)


def is_topic_skip(s, subject):
    """Tell whether line ``s`` is boilerplate that should be dropped.

    A line is skipped when it starts with a Chinese-numeral section heading,
    contains a page marker, or contains any ``general_filter2`` keyword.
    For 'english' the ``english_filter`` word groups are checked as well.

    Raises:
        ValueError: if ``subject`` is not listed in ``subjects``.
    """
    if subject not in subjects:
        raise ValueError("subject={} is not supported!".format(subject))

    is_generic_noise = (
        general_filter1.match(s) is not None
        or end_sign.search(s) is not None
        or any(keyword in s for keyword in general_filter2)
    )
    if is_generic_noise:
        return True

    if subject == 'english':
        return contains_all(s, english_filter)
    return False


# ----------------------- all_end ---------------------

# Word pairs that, when both occur in one line, mark the end of the usable
# text (answer sections, or what look like the headers of another paper).
end_words = [
    ["参考", "答案"],
    ["试题", "答案"],
    ["试卷", "答案"],
    ["省", "学年"],
    ["省", "学期"],
    ["市", "学年"],
    ["市", "学期"],
]


def all_end(s, subject):
    """Tell whether line ``s`` marks the end of the whole text.

    Always False for 'chinese' and 'english'; for any other subject the line
    must contain both words of at least one ``end_words`` pair.
    """
    if subject in ("chinese", 'english'):
        return False
    return any(all(part in s for part in pair) for pair in end_words)


# ----------------------action----------------------
# Headings such as "一、选择题：" that open the first question section.
_SECTION_HEADING = re.compile(
    r"[一]?[\.。、(（:：,，]?选择题[\.。、(（:：,，]"
    r"|一[\.、(（。:：,，]?填空题[\.。、(（:：,，]"
    r"|[一]?[\.。、(（:：,，]?单项选择题"
    r"|[一]?[\.。、(（:：,，]?单项选择"
    r"|[一]?[\.。、(（:：,，]?现代文阅读[\.。、(（:：,，]"
    r"|[一]?[\.。、(（:：,，]?单选题[\.。、(（:：,，]"
)
# "Part I" style headings.
_PART_ONE_HEADING = re.compile(r'第I卷|第〡卷|第Ⅰ卷|第I卷阅读题|第一部分')


def group_pictures1(abcd_texts, subject=''):
    """Split the lines in ``abcd_texts['text']`` into topic index ranges.

    Each line is classified, in this order, as a section heading (discards
    everything collected so far), a topic start, a topic end, a skippable
    line, or an end-of-text marker; ``[start, end]`` index pairs are collected
    accordingly.

    Args:
        abcd_texts: mapping whose 'text' entry is a sequence of line strings.
        subject: subject name passed on to the ``is_topic_*`` helpers.

    Returns:
        list of ``[start, end]`` pairs of line indices (they appear to be
        half-open ranges, since pairs with ``start == end`` are dropped as
        empty).

    Raises:
        ValueError: from the ``is_topic_*`` helpers when a line has to be
            classified and ``subject`` is not supported.
    """
    texts = abcd_texts['text']
    groups = []
    start = 0
    text_end = 0

    for i, t in enumerate(texts):
        if _SECTION_HEADING.match(t) or _PART_ONE_HEADING.match(t):
            # Everything before a section heading is preamble: start over.
            groups.clear()
            start = i + 1
            continue

        if is_topic_start(t, subject):
            groups.append([start, i])
            start = i
            continue

        # Evaluate once; the result is True, "G" or False.
        end_kind = is_topic_end(t, subject)
        if end_kind:
            if end_kind == "G" and start != 0 and groups:
                # An E/F/G option line extends the previous group. The
                # ``groups`` check avoids popping from an empty list when a
                # section heading has just cleared it.
                groups[-1] = [groups[-1][0], i + 1]
            else:
                groups.append([start, i + 1])
            start = i + 1
        elif is_topic_skip(t, subject):
            if i > start:
                groups.append([start, i])
            start = i + 1
        elif all_end(t, subject):
            text_end = i

    len_text = len(texts)
    if len_text > start:
        # NOTE(review): text_end is not compared with start, so a marker seen
        # before the last topic start yields an inverted pair; behaviour kept
        # as in the original - confirm what is intended.
        groups.append([start, text_end if text_end else len_text])

    # Drop empty ranges. Building a new list (instead of deleting while
    # iterating) makes sure consecutive empty ranges are all removed.
    groups = [pair for pair in groups if pair[0] != pair[1]]

    return groups


def segment(texts):
    """Split ``texts`` at the first writing-section heading.

    A heading is any line containing "短文改错", "翻译句子" or "书面表达".

    Returns:
        A pair ``(choice_part, writing_part)``. ``choice_part`` is a dict with
        'start_index' 0 and the lines before the heading. ``writing_part`` is
        a dict with the heading's index as 'start_index' and the lines from
        the heading onwards, or an empty list when no heading was found.
    """
    heading = re.compile(r'短文改错|翻译句子|书面表达')
    # Indices come out in ascending order, so the first one is the smallest.
    hits = [idx for idx, line in enumerate(texts) if heading.search(line)]

    if not hits:
        return {'start_index': 0, 'text': texts}, []

    cut = hits[0]
    choice_part = {'start_index': 0, 'text': texts[:cut]}
    writing_part = {'start_index': cut, 'text': texts[cut:]}
    return choice_part, writing_part


def match_writing_section(texts, subject='english'):
    """Build index ranges for the parts of an English writing section.

    ``texts`` is a dict with 'start_index' (offset of its first line) and
    'text' (the lines). Every line containing "短文改错", "翻译句子" or
    "书面表达" is a boundary, as is the end of the text; one
    ``[boundary + 1, next_boundary]`` pair (shifted by the offset) is produced
    per adjacent pair of boundaries, so the heading line itself is excluded.

    Returns an empty list for any subject other than 'english'.
    """
    if subject != 'english':
        return []

    offset = texts['start_index']
    lines = texts['text']
    heading = re.compile(r'短文改错|翻译句子|书面表达')

    boundaries = {idx for idx, line in enumerate(lines) if heading.search(line)}
    boundaries.add(len(lines))
    ordered = sorted(boundaries)

    return [[lo + offset + 1, hi + offset]
            for lo, hi in zip(ordered, ordered[1:])]


def group_text(all_texts, subject):
    """Group the lines of ``all_texts`` into topic index ranges.

    For 'english' the text is first split at the writing section: the part
    before it goes through ``group_pictures1`` and, when a writing section
    exists, its ranges from ``match_writing_section`` are appended. Every
    other subject is handled by ``group_pictures1`` alone.
    """
    if subject != 'english':
        return group_pictures1({'text': all_texts}, subject)

    choice_part, writing_part = segment(all_texts)
    groups = group_pictures1(choice_part, subject)
    if len(writing_part) > 0:
        groups = groups + match_writing_section(writing_part, subject)
    return groups


if __name__ == '__main__':
    # Manual smoke run against a local sample file.
    subject = 'english'
    txt_path = r'G:\write\112.txt'
    # Use a context manager so the file handle is closed after reading.
    with open(txt_path, 'r') as f:
        all_texts = f.readlines()
    group_list = group_text(all_texts, subject)
    print(group_list)