# @Author  : lightXu
# @File    : group_text.py
"""Group the text lines of an exam paper into per-question index ranges.

Each line is classified with regex / keyword heuristics as a question start,
a question end, a line to skip (headings, instructions, page footers) or the
end of the paper (answer-key heading).  ``group_text`` is the entry point and
returns a list of ``[start, end]`` line-index pairs.
"""
import re

subjects = ['unknown_subject', 'math', 'math_zxhx', 'english', 'chinese',
            'physics', 'chemistry', 'biology', 'politics', 'history',
            'geography', 'science_comprehensive', 'arts_comprehensive']

# NOTE(review): several character classes below contain the same ASCII
# character twice (e.g. "::", "((", "))", ",,").  They look like half-width /
# full-width pairs that lost their full-width form somewhere -- confirm against
# the upstream file.  The patterns are reproduced exactly as found.

# ---------------- a line matching these STARTS a question ----------------
# Raw strings are used so that "\s", "\d", ... are not treated as (invalid)
# string escape sequences; the resulting pattern text is unchanged.
general_start = re.compile(r"^\s*\d+\s*[\.、::]\D|^\s*\d+\s*[\.、::]\d{4}")
math_start = re.compile(r"^\s*\(\d+\)\s*[\.、::]?")
chinese_start = re.compile(r"^\s*[((]\s*[一二三四五六七八九十]\s*[))]\s*[\.、::]?")
write_start = re.compile(r"^\s*\(\d+\)\s*[\.、::]?")


def is_topic_start(s, subject):
    """Return True when line ``s`` looks like the first line of a question.

    Every supported subject uses ``general_start``; 'math' additionally
    accepts lines containing "本题"/"本小题" or matching ``math_start``, and
    'chinese' accepts lines matching ``chinese_start``.

    Raises:
        ValueError: if ``subject`` is not listed in ``subjects``.
    """
    if subject not in subjects:
        raise ValueError("subject={} is not supported!".format(subject))

    if general_start.match(s):
        return True
    if subject == 'math':
        return bool("本题" in s or "本小题" in s or math_start.match(s))
    if subject == 'chinese':
        return bool(chinese_start.match(s))
    return False


# ----------------- a line matching these ENDS a question -----------------
general_end = re.compile(r"D\s*[\.、::]")
english_end = re.compile(r"^\s*[EFG]\s*[\.、::]|^\s*[EFG]\s+")
chinese_end = re.compile(r"^\s*[EFG]\s*[\.、::]")
written_expression = re.compile(r'书面表达')
written_expression1 = re.compile(r'短文改错|翻译句子')


def is_topic_end(s, subject):
    """Classify line ``s`` as the last line of a question.

    Returns:
        True  -- english line containing option "D" (``general_end``), or a
                 chinese line matching ``chinese_end``;
        "G"   -- english line starting with option E/F/G (``english_end``);
        False -- otherwise.

    Raises:
        ValueError: if ``subject`` is not listed in ``subjects``.
    """
    if subject not in subjects:
        raise ValueError("subject={} is not supported!".format(subject))

    if subject == 'english':
        if general_end.search(s):
            return True
        if english_end.search(s):
            return "G"
    if subject == 'chinese':
        if chinese_end.search(s):
            return True
    return False


# --------------- a line matching these is SKIPPED (dropped) ---------------
general_filter1 = re.compile(r"^\s*[一二三四五六七八九十]+\s*[\.、::]")
general_filter2 = [
    '选择题', '单选题', '多选题', '综合题', '答案无效', '题目要求',
    '填空题', '单空题', '多空题', '计算题', '演算步骤', '单元测试',
    '古代诗歌阅读', '解答题', '简答题', '证明题', '按要求填写下列空格',
    '单项选择题', '注意事项', '选做题', '实验题', '第II卷', '第Ⅱ卷',
    '一律得零分', '证明过程', '现代文阅读', '第二卷', '答题卡', '试卷满分',
    '选题人', '最佳选项', '填写结果', '选不全', '文言文阅读', '答题时间',
    '分值', '题目要求', '阅读下面文字', '阅读下面短文', '阅读下列短文',
    '甲必考题', '必考题', '读一遍', '题卡', '符合题目要求', '规定区域',
    '符合要求', '阅读下面',
]
end_sign = re.compile(
    r"第\d+页|共\d+页|页\d+第|\d+第|第[((]\d+[))]页|共[((]\d+[))]页|共[((]\d+[))]页$")
english_filter = [["听", "材料"], ["听", "对话"], ["听", "独白"], ['第二节'],
                  ['语言知识运用'], ['第II卷'], ['录音'], ['作答时间'],
                  ['选项'], ['第二部分'], ['第三部分'], ['第四部分'],
                  ['第一部分'], ['第一节'], ['阅读下列短文'], ['短文'],
                  ['阅读下面短文'], ['阅读']]


def contains_all(s, words):
    """Return True if ``s`` contains every keyword of at least one group.

    ``words`` is an iterable of keyword groups.  Note that ``all()`` of an
    empty group is True, so an empty group matches every string.
    """
    return any(all(w in s for w in word) for word in words)


def is_topic_skip(s, subject):
    """Return True when line ``s`` should be dropped (heading, instruction,
    page footer, ...) rather than attached to a question.

    Raises:
        ValueError: if ``subject`` is not listed in ``subjects``.
    """
    if subject not in subjects:
        raise ValueError("subject={} is not supported!".format(subject))

    if general_filter1.match(s) or end_sign.search(s):
        return True
    if any(topic_type in s for topic_type in general_filter2):
        return True
    if subject == 'english':
        return contains_all(s, english_filter)
    return False


# ------------------------------- all_end -------------------------------
end_words = [["参考", "答案"], ["试题", "答案"], ["试卷", "答案"],
             ["省", "学年"], ["省", "学期"], ["市", "学年"], ["市", "学期"]]


def all_end(s, subject):
    """Return True when ``s`` marks the end of the question part of the paper.

    Always False for 'chinese' and 'english'; for other subjects the line must
    contain both keywords of one pair in ``end_words``.
    """
    if subject in ("chinese", 'english'):
        return False
    return any(all(w in s for w in word) for word in end_words)


# -------------------------------- action --------------------------------
# Section headings ("选择题", "填空题", ...) and part headings ("第I卷", ...):
# everything collected before such a heading is discarded.
_SECTION_HEADING = re.compile(
    r"[一]?[\.。、((::,,]?选择题[\.。、((::,,]"
    r"|一[\.、((。::,,]?填空题[\.。、((::,,]"
    r"|[一]?[\.。、((::,,]?单项选择题"
    r"|[一]?[\.。、((::,,]?单项选择"
    r"|[一]?[\.。、((::,,]?现代文阅读[\.。、((::,,]"
    r"|[一]?[\.。、((::,,]?单选题[\.。、((::,,]")
_PART_HEADING = re.compile(r'第I卷|第〡卷|第Ⅰ卷|第I卷阅读题|第一部分')
# Headings that open the English writing sections.
_WRITING_SECTION = re.compile(r'短文改错|翻译句子|书面表达')


def group_pictures1(abcd_texts, subject=''):
    """Split ``abcd_texts['text']`` into ``[start, end]`` line-index groups.

    Args:
        abcd_texts: dict whose 'text' entry is the list of lines.
        subject: one of ``subjects``.

    Returns:
        List of ``[start, end]`` pairs (end exclusive), with empty groups
        (start == end) removed.

    Raises:
        ValueError: propagated from the classifiers for unknown subjects.
    """
    texts = abcd_texts['text']
    groups = []
    start = 0
    text_end = 0

    for i, t in enumerate(texts):
        if _SECTION_HEADING.match(t) or _PART_HEADING.match(t):
            # A section/part heading invalidates everything seen so far.
            groups.clear()
            start = i + 1
            continue

        if is_topic_start(t, subject):
            groups.append([start, i])
            start = i
            continue

        end_marker = is_topic_end(t, subject)  # evaluated once per line
        if end_marker:
            if end_marker == "G" and start != 0 and groups:
                # Option E/F/G belongs to the previous question: extend it.
                # The ``groups`` guard avoids popping from an empty list when
                # a heading has just cleared the groups.
                groups[-1] = [groups[-1][0], i + 1]
            else:
                groups.append([start, i + 1])
            start = i + 1
            continue

        if is_topic_skip(t, subject):
            if i > start:
                groups.append([start, i])
            start = i + 1
            continue

        if all_end(t, subject):
            text_end = i

    len_text = len(texts)
    if len_text > start:
        groups.append([start, text_end if text_end else len_text])

    # Drop empty groups.  Filtering into a new sequence avoids deleting from
    # the list while iterating over it, which skips elements.
    groups[:] = [g for g in groups if g[0] != g[1]]
    return groups


def segment(texts):
    """Split ``texts`` at the first writing-section heading.

    Returns:
        ``(abcd_texts, writing_texts)``; each is a dict with 'start_index'
        and 'text'.  When no heading is found ``writing_texts`` is ``[]``.
    """
    first_hit = next(
        (i for i, t in enumerate(texts) if _WRITING_SECTION.search(t)), None)
    if first_hit is None:
        return {'start_index': 0, 'text': texts}, []

    abcd_texts1 = {'start_index': 0, 'text': texts[:first_hit]}
    writing_texts2 = {'start_index': first_hit, 'text': texts[first_hit:]}
    return abcd_texts1, writing_texts2


def match_writing_section(texts, subject='english'):
    """Group the writing part of an English paper.

    Args:
        texts: dict with 'start_index' (offset of the first line in the whole
            paper) and 'text' (list of lines).
        subject: only 'english' produces groups.

    Returns:
        List of ``[start, end]`` pairs in whole-paper indices; each group is
        the body between one writing heading (excluded) and the next heading
        or the end of the text.  ``[]`` for other subjects.
    """
    if subject != 'english':
        return []

    start_index = texts['start_index']
    texts_content = texts['text']

    boundaries = {i for i, t in enumerate(texts_content)
                  if _WRITING_SECTION.search(t)}
    boundaries.add(len(texts_content))
    boundaries = sorted(boundaries)

    return [[prev + start_index + 1, nxt + start_index]
            for prev, nxt in zip(boundaries, boundaries[1:])]


def group_text(all_texts, subject):
    """Return the question groups for the lines ``all_texts`` of a paper.

    English papers are split into a multiple-choice part and a writing part
    that are grouped separately; every other subject is grouped in one pass.
    """
    if subject == 'english':
        abcd_sec, writing_sec = segment(all_texts)
        group_list = group_pictures1(abcd_sec, subject)
        if len(writing_sec) > 0:
            group_list = group_list + match_writing_section(writing_sec, subject)
    else:
        group_list = group_pictures1({'text': all_texts}, subject)
    return group_list


if __name__ == '__main__':
    subject = 'english'
    txt_path = r'G:\write\112.txt'
    # Context manager so the file handle is closed deterministically.
    with open(txt_path, 'r') as f:
        all_texts = f.readlines()
    group_list = group_text(all_texts, subject)
    print(group_list)