123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246 |
- # @Author : lightXu
- # @File : group_text.py
- import re
# Subject identifiers that the topic-boundary helpers accept; any other value
# makes them raise ValueError.
subjects = [
    'unknown_subject',
    'math',
    'math_zxhx',
    'english',
    'chinese',
    'physics',
    'chemistry',
    'biology',
    'politics',
    'history',
    'geography',
    'science_comprehensive',
    'arts_comprehensive',
]
# ------------- A line matching one of these patterns starts a topic -------------
# Raw strings are used so that regex escapes such as \s, \d, \. and \( are not
# parsed as (invalid) Python string escapes, which the interpreter warns about.
# NOTE(review): several character classes contain the same character twice
# (e.g. "::", "(("); this looks like a full-width/ASCII pair whose full-width
# half was normalised away -- confirm against the intended patterns.

# Leading number + separator, followed by a non-digit or by four digits,
# e.g. "12. text" or "3.2019".
general_start = re.compile(r"^\s*\d+\s*[\.、::]\D|^\s*\d+\s*[\.、::]\d{4}")
# Parenthesised number with an optional separator, e.g. "(1)".
math_start = re.compile(r"^\s*\(\d+\)\s*[\.、::]?")
# Parenthesised single Chinese numeral with an optional separator, e.g. "(一)".
chinese_start = re.compile(r"^\s*[((]\s*[一二三四五六七八九十]\s*[))]\s*[\.、::]?")
# Same expression as math_start, kept under its own name.
write_start = re.compile(r"^\s*\(\d+\)\s*[\.、::]?")
def is_topic_start(s, subject):
    """Start node: report whether line ``s`` opens a new topic.

    Every supported subject is tested against ``general_start``. For ``math``
    a line also counts as a start when it contains "本题" or "本小题" or
    matches ``math_start``; for ``chinese`` when it matches ``chinese_start``.

    Args:
        s: one line of text.
        subject: subject identifier; must be a member of ``subjects``.

    Returns:
        True if the line starts a topic, otherwise False.

    Raises:
        ValueError: if ``subject`` is not in ``subjects``.
    """
    if subject not in subjects:
        raise ValueError("subject={} is not supported!".format(subject))

    if general_start.match(s):
        return True
    if subject == 'math':
        return bool("本题" in s or "本小题" in s or math_start.match(s))
    if subject == 'chinese':
        return bool(chinese_start.match(s))
    return False
# ------------- A line matching one of these patterns ends a topic -------------
# Raw strings are used so that \s and \. reach the regex engine unchanged
# instead of being parsed as (invalid) Python string escapes.

# A "D" followed by a separator anywhere in the line, e.g. "... D. xxx".
general_end = re.compile(r"D\s*[\.、::]")
# Line starting with E, F or G followed by a separator or by whitespace.
english_end = re.compile(r"^\s*[EFG]\s*[\.、::]|^\s*[EFG]\s+")
# Line starting with E, F or G followed by a separator.
chinese_end = re.compile(r"^\s*[EFG]\s*[\.、::]")
# Headings of the writing-type sections.
written_expression = re.compile(r'书面表达')
written_expression1 = re.compile(r'短文改错|翻译句子')
def is_topic_end(s, subject):
    """End node: report whether line ``s`` closes the current topic.

    Only ``english`` and ``chinese`` have end rules; every other supported
    subject yields False.

    Args:
        s: one line of text.
        subject: subject identifier; must be a member of ``subjects``.

    Returns:
        True when ``s`` matches ``general_end`` (english) or ``chinese_end``
        (chinese); the string "G" when an english line matches ``english_end``
        but not ``general_end``; otherwise False.

    Raises:
        ValueError: if ``subject`` is not in ``subjects``.
    """
    if subject not in subjects:
        raise ValueError("subject={} is not supported!".format(subject))

    if subject == 'english':
        # general_end takes precedence over the E/F/G option marker.
        if general_end.search(s):
            return True
        if english_end.search(s):
            return "G"
    elif subject == 'chinese':
        if chinese_end.search(s):
            return True
    return False
# ------------- A line matching one of these rules is skipped / discarded -------------
# Raw strings are used for the regexes so that \s and \d are not parsed as
# (invalid) Python string escapes.

# Line starting with Chinese numerals followed by a separator, e.g. "一、".
general_filter1 = re.compile(r"^\s*[一二三四五六七八九十]+\s*[\.、::]")
# Substrings whose presence anywhere in a line causes the line to be skipped.
general_filter2 = ['选择题', '单选题', '多选题', '综合题', '答案无效', '题目要求',
                   '填空题', '单空题', '多空题', '计算题', '演算步骤', '单元测试', '古代诗歌阅读',
                   '解答题', '简答题', '证明题', '按要求填写下列空格', '单项选择题', '注意事项',
                   '选做题', '实验题', '第II卷', '第Ⅱ卷', '一律得零分', '证明过程', '现代文阅读',
                   '第二卷', '答题卡', '试卷满分', '选题人', '最佳选项', '填写结果', '选不全', '文言文阅读',
                   '答题时间', '分值', '题目要求', '阅读下面文字', '阅读下面短文', '阅读下列短文',
                   '甲必考题', '必考题', '读一遍', '题卡', '符合题目要求', '规定区域', '符合要求', '阅读下面']
# Page-number fragments such as "第3页" / "共10页".
end_sign = re.compile(r"第\d+页|共\d+页|页\d+第|\d+第|第[((]\d+[))]页|共[((]\d+[))]页|共[((]\d+[))]页$")
# Keyword groups for english: a group applies when all of its words occur in the line.
english_filter = [["听", "材料"], ["听", "对话"], ["听", "独白"],
                  ['第二节'], ['语言知识运用'], ['第II卷'], ['录音'], ['作答时间'], ['选项'],
                  ['第二部分'], ['第三部分'], ['第四部分'], ['第一部分'], ['第一节'], ['阅读下列短文'], ['短文'], ['阅读下面短文'], ['阅读']]
def contains_all(s, words):
    """Return True if at least one keyword group in ``words`` is fully present in ``s``.

    Each element of ``words`` is a group of substrings; a group matches when
    *all* of its substrings occur in ``s`` (``all``), and one matching group is
    enough (``any``). Note that an empty group matches trivially because
    ``all`` of nothing is True, while an empty ``words`` gives False.
    """
    return any(all(piece in s for piece in group) for group in words)
def is_topic_skip(s, subject):
    """Skip node: decide whether line ``s`` can be dropped.

    A line is skipped when it matches ``general_filter1`` or ``end_sign``, or
    contains any entry of ``general_filter2``. For ``english`` a line that
    passed those checks is additionally tested against ``english_filter``.

    Args:
        s: one line of text.
        subject: subject identifier; must be a member of ``subjects``.

    Returns:
        True if the line should be skipped, otherwise False.

    Raises:
        ValueError: if ``subject`` is not in ``subjects``.
    """
    if subject not in subjects:
        raise ValueError("subject={} is not supported!".format(subject))

    if general_filter1.match(s) or end_sign.search(s):
        return True
    if any(keyword in s for keyword in general_filter2):
        return True
    if subject == 'english':
        return contains_all(s, english_filter)
    return False
# ----------------------- all_end -----------------------
# Keyword pairs; a line containing both words of any one pair is an end marker.
end_words = [
    ["参考", "答案"],
    ["试题", "答案"],
    ["试卷", "答案"],
    ["省", "学年"],
    ["省", "学期"],
    ["市", "学年"],
    ["市", "学期"],
]


def all_end(s, subject):
    """Return True when ``s`` contains every word of some ``end_words`` pair.

    The check is disabled for ``chinese`` and ``english``: those subjects
    always get False.
    """
    if subject in ("chinese", 'english'):
        return False
    return any(all(part in s for part in pair) for pair in end_words)
# ---------------------- action ----------------------
def group_pictures1(abcd_texts, subject=''):
    """Partition text lines into ``[start, end]`` index ranges, one per topic.

    Lines are scanned in order and classified with ``is_topic_start``,
    ``is_topic_end``, ``is_topic_skip`` and ``all_end``. The end index of a
    range is exclusive (the last range ends at ``len(lines)``).

    Args:
        abcd_texts: mapping whose ``'text'`` entry is the sequence of lines.
        subject: subject identifier forwarded to the classification helpers;
            they raise ValueError for identifiers they do not support.

    Returns:
        List of ``[start, end]`` pairs; pairs with ``start == end`` are removed.
    """
    texts = abcd_texts['text']
    groups = []
    start = 0
    text_end = 0
    section_heading = re.compile(r"[一]?[\.。、((::,,]?选择题[\.。、((::,,]|一[\.、((。::,,]?填空题[\.。、((::,,]|[一]?[\.。、((::,,]?单项选择题|[一]?[\.。、((::,,]?单项选择|[一]?[\.。、((::,,]?现代文阅读[\.。、((::,,]|[一]?[\.。、((::,,]?单选题[\.。、((::,,]")
    volume_heading = re.compile(r'第I卷|第〡卷|第Ⅰ卷|第I卷阅读题|第一部分')

    for i, t in enumerate(texts):
        if section_heading.match(t) or volume_heading.match(t):
            # A section/volume heading discards everything collected so far
            # and restarts grouping on the following line.
            groups.clear()
            start = i + 1
            continue

        if is_topic_start(t, subject):
            # Close the running range just before this line; the line itself
            # becomes the first line of the next range.
            groups.append([start, i])
            start = i
            continue

        # Evaluate once: the result is True, "G" or False.
        end_marker = is_topic_end(t, subject)
        if end_marker:
            if end_marker == "G" and start != 0 and groups:
                # Extend the previous range through this line. The check on
                # `groups` avoids popping from an empty list right after a
                # heading has cleared it.
                groups[-1] = [groups[-1][0], i + 1]
            else:
                groups.append([start, i + 1])
            start = i + 1
        elif is_topic_skip(t, subject):
            # Flush any pending lines, then step over the skipped line.
            if i > start:
                groups.append([start, i])
            start = i + 1
        elif all_end(t, subject):
            text_end = i

    len_text = len(texts)
    if len_text > start:
        # NOTE(review): text_end == 0 is treated as "no end marker seen", and
        # an end marker located before `start` yields an inverted range here;
        # confirm whether scanning should stop at the first end marker.
        if text_end:
            groups.append([start, text_end])
        else:
            groups.append([start, len_text])

    # Build a filtered list instead of deleting while iterating, which skipped
    # the element following each deletion and left empty ranges behind.
    return [pair for pair in groups if pair[0] != pair[1]]
def segment(texts):
    """Split ``texts`` at the first line containing a writing-section heading.

    A heading is any line containing "短文改错", "翻译句子" or "书面表达".

    Returns:
        A pair ``(head, tail)``. ``head`` is ``{'start_index': 0, 'text': ...}``
        holding the lines before the first heading (all lines when there is
        none). ``tail`` is ``{'start_index': k, 'text': ...}`` holding the
        lines from the heading at index ``k`` onwards, or ``[]`` when no
        heading was found.
    """
    heading = re.compile(r'短文改错|翻译句子|书面表达')
    hits = [idx for idx, line in enumerate(texts) if heading.search(line)]

    if not hits:
        return {'start_index': 0, 'text': texts}, []

    # Indices are collected in ascending order, so the first hit is the minimum.
    cut = hits[0]
    head = {'start_index': 0, 'text': texts[:cut]}
    tail = {'start_index': cut, 'text': texts[cut:]}
    return head, tail
def match_writing_section(texts, subject='english'):
    """Build index ranges for the writing sections of an english paper.

    Args:
        texts: mapping with ``'start_index'`` (offset added to every index)
            and ``'text'`` (the lines to scan).
        subject: only ``'english'`` is processed.

    Returns:
        For ``'english'``: one ``[start, end]`` pair per heading line (a line
        containing "短文改错", "翻译句子" or "书面表达"), where ``start`` is
        the offset index of the line after the heading and ``end`` is the
        offset index of the next heading, or of the end of the lines for the
        last one. For any other subject: ``[]``.
    """
    if subject != 'english':
        return []

    offset = texts['start_index']
    lines = texts['text']
    heading = re.compile(r'短文改错|翻译句子|书面表达')

    # Heading positions come out ascending and unique; len(lines) is appended
    # as the closing boundary of the last section.
    bounds = [pos for pos, line in enumerate(lines) if heading.search(line)]
    bounds.append(len(lines))

    return [[lo + offset + 1, hi + offset] for lo, hi in zip(bounds, bounds[1:])]
def group_text(all_texts, subject):
    """Group the lines of a paper into topic index ranges.

    For ``english`` the lines are first divided by ``segment``; the leading
    part is grouped with ``group_pictures1`` and, when a writing part exists,
    the ranges from ``match_writing_section`` are appended. Every other
    subject is passed to ``group_pictures1`` as a whole.

    Args:
        all_texts: sequence of text lines.
        subject: subject identifier.

    Returns:
        List of ``[start, end]`` index pairs.
    """
    if subject != 'english':
        return group_pictures1({'text': all_texts}, subject)

    abcd_sec, writing_sec = segment(all_texts)
    group_list = group_pictures1(abcd_sec, subject)
    if len(writing_sec) > 0:
        group_list = group_list + match_writing_section(writing_sec, subject)
    return group_list
if __name__ == '__main__':
    # Manual run: group the lines of one sample file and print the ranges.
    subject = 'english'
    txt_path = r'G:\write\112.txt'
    # Context manager so the file handle is closed even if reading fails.
    with open(txt_path, 'r') as txt_file:
        all_texts = txt_file.readlines()
    group_list = group_text(all_texts, subject)
    print(group_list)
|