# @Author : lightXu
# @File : group_text.py
import re
  4. subjects = ['unknown_subject', 'math', 'math_zxhx', 'english',
  5. 'chinese', 'physics', 'chemistry',
  6. 'biology', 'politics', 'history', 'geography',
  7. 'science_comprehensive', 'arts_comprehensive']
  8. # -------------------------符合下列条件的则为开始---------------------------
  9. # general_start = re.compile("^\s*\d+\s*[\.、::]?\D|^\s*\d+\s*[\.、::]?\d{4}]")
  10. general_start = re.compile("^\s*\d+\s*[\.、::]\D|^\s*\d+\s*[\.、::]\d{4}")
  11. math_start = re.compile("^\s*\(\d+\)\s*[\.、::]?")
  12. chinese_start = re.compile("^\s*[((]\s*[一二三四五六七八九十]\s*[))]\s*[\.、::]?")
  13. write_start = re.compile("^\s*\(\d+\)\s*[\.、::]?")
  14. def is_topic_start(s, subject):
  15. '''开始节点'''
  16. if subject in subjects:
  17. if general_start.match(s):
  18. return True
  19. if subject == 'math':
  20. if "本题" in s or "本小题" in s:
  21. return True
  22. elif math_start.match(s):
  23. return True
  24. elif subject == 'chinese':
  25. if chinese_start.match(s):
  26. return True
  27. # elif subject == 'english':
  28. # if '注意' in s or '内容包括' in s:
  29. # if write_start.search(s):
  30. # return False
  31. return False
  32. else:
  33. raise ValueError("subject={} is not supported!".format(subject))
  34. # -------------------------符合下列条件的则为结束-------------------------
  35. general_end = re.compile("D\s*[\.、::]")
  36. # english_end = re.compile("^\s*G\s*[\.、::]")
  37. english_end = re.compile("^\s*[EFG]\s*[\.、::]|^\s*[EFG]\s+")
  38. chinese_end = re.compile("^\s*[EFG]\s*[\.、::]")
  39. written_expression = re.compile(r'书面表达')
  40. written_expression1 = re.compile(r'短文改错|翻译句子')
  41. def is_topic_end(s, subject):
  42. '''结束节点'''
  43. if subject in subjects:
  44. if subject == 'english':
  45. if general_end.search(s):
  46. return True
  47. if english_end.search(s):
  48. return "G"
  49. # elif written_expression.search(s):
  50. # return '书面表达'
  51. if subject == 'chinese':
  52. if chinese_end.search(s):
  53. return True
  54. return False
  55. else:
  56. raise ValueError("subject={} is not supported!".format(subject))
  57. # -------------------------符合下列条件的则为跳过舍去-------------------------
  58. general_filter1 = re.compile("^\s*[一二三四五六七八九十]+\s*[\.、::]")
  59. general_filter2 = ['选择题', '单选题', '多选题', '综合题', '答案无效', '题目要求',
  60. '填空题', '单空题', '多空题', '计算题', '演算步骤', '单元测试', '古代诗歌阅读',
  61. '解答题', '简答题', '证明题', '按要求填写下列空格', '单项选择题', '注意事项',
  62. '选做题', '实验题', '第II卷', '第Ⅱ卷', '一律得零分', '证明过程', '现代文阅读',
  63. '第二卷', '答题卡', '试卷满分', '选题人', '最佳选项', '填写结果', '选不全', '文言文阅读',
  64. '答题时间', '分值', '题目要求', '阅读下面文字', '阅读下面短文', '阅读下列短文',
  65. '甲必考题', '必考题', '读一遍', '题卡', '符合题目要求', '规定区域', '符合要求', '阅读下面']
  66. end_sign = re.compile("第\d+页|共\d+页|页\d+第|\d+第|第[((]\d+[))]页|共[((]\d+[))]页|共[((]\d+[))]页$")
  67. english_filter = [["听", "材料"], ["听", "对话"], ["听", "独白"],
  68. ['第二节'], ['语言知识运用'], ['第II卷'], ['录音'], ['作答时间'], ['选项'],
  69. ['第二部分'], ['第三部分'], ['第四部分'], ['第一部分'], ['第一节'], ['阅读下列短文'], ['短文'], ['阅读下面短文'], ['阅读']]
  70. def contains_all(s, words):
  71. for word in words:
  72. if all([w in s for w in word]): # any()与all()函数的区别: any是任意,而all是全部。
  73. return True # 空元组、空列表返回值为True,这里要特别注意
  74. return False
  75. def is_topic_skip(s, subject):
  76. '''判断该行是否可以去掉,跳跃节点'''
  77. if subject in subjects:
  78. if general_filter1.match(s):
  79. return True
  80. elif end_sign.search(s):
  81. return True
  82. for topic_type in general_filter2:
  83. if topic_type in s:
  84. return True
  85. if subject == 'english':
  86. return contains_all(s, english_filter)
  87. return False
  88. else:
  89. raise ValueError("subject={} is not supported!".format(subject))
  90. # -----------------------all_end---------------------
  91. end_words = [["参考", "答案"], ["试题", "答案"], ["试卷", "答案"],
  92. ["省", "学年"], ["省", "学期"], ["市", "学年"], ["市", "学期"]]
  93. def all_end(s, subject):
  94. if subject not in ["chinese", 'english']:
  95. for word in end_words:
  96. if all([w in s for w in word]):
  97. return True
  98. return False
  99. return False
# ----------------------action----------------------
  101. def group_pictures1(abcd_texts, subject=''):
  102. texts = abcd_texts['text']
  103. groups = []
  104. start = 0
  105. text_end = 0
  106. pattern = re.compile(r"[一]?[\.。、((::,,]?选择题[\.。、((::,,]|一[\.、((。::,,]?填空题[\.。、((::,,]|[一]?[\.。、((::,,]?单项选择题|[一]?[\.。、((::,,]?单项选择|[一]?[\.。、((::,,]?现代文阅读[\.。、((::,,]|[一]?[\.。、((::,,]?单选题[\.。、((::,,]")
  107. pattern1 = re.compile(r'第I卷|第〡卷|第Ⅰ卷|第I卷阅读题|第一部分')
  108. for i, t in enumerate(texts):
  109. if pattern.match(t):
  110. groups.clear()
  111. start = i + 1
  112. elif pattern1.match(t):
  113. groups.clear()
  114. start = i + 1
  115. elif is_topic_start(t, subject):
  116. groups.append([start, i])
  117. start = i
  118. elif is_topic_end(t, subject):
  119. if is_topic_end(t, subject) == "G" and start != 0:
  120. new_list = groups.pop()
  121. new_start = new_list[0]
  122. groups.append([new_start, i + 1])
  123. start = i + 1
  124. else:
  125. groups.append([start, i + 1])
  126. start = i + 1
  127. elif is_topic_skip(t, subject):
  128. if i > start:
  129. groups.append([start, i])
  130. start = i + 1
  131. elif all_end(t, subject):
  132. text_end = i
  133. len_text = len(texts)
  134. if len_text > start:
  135. if text_end:
  136. # print(text_end)
  137. groups.append([start, text_end])
  138. else:
  139. # print(len_text)
  140. groups.append([start, len_text])
  141. for i, lst in enumerate(groups):
  142. if lst[0] == lst[1]:
  143. del groups[i]
  144. # print('\n', groups)
  145. return groups
  146. def segment(texts):
  147. text_correct = re.compile(r'短文改错|翻译句子|书面表达')
  148. seg_index = []
  149. for i, t in enumerate(texts):
  150. if text_correct.search(t):
  151. seg_index.append(i)
  152. if len(seg_index) < 1:
  153. abcd_texts1 = {'start_index': 0, 'text': texts}
  154. return abcd_texts1, []
  155. else:
  156. seg_index_number = min(seg_index)
  157. abcd_texts1 = {'start_index': 0, 'text': texts[:seg_index_number]}
  158. writing_texts2 = {'start_index': seg_index_number, 'text': texts[seg_index_number:]}
  159. return abcd_texts1, writing_texts2
  160. def match_writing_section(texts, subject='english'):
  161. if subject == 'english':
  162. start_index = texts['start_index']
  163. texts_content = texts['text']
  164. text_correct = re.compile(r'短文改错|翻译句子|书面表达')
  165. seg_index_list = []
  166. for i, t in enumerate(texts_content):
  167. if text_correct.search(t):
  168. seg_index_list.append(i)
  169. seg_index_list.append(len(texts_content))
  170. seg_index_list = sorted(list(set(seg_index_list)))
  171. groups_list = []
  172. if len(seg_index_list) == 1 and seg_index_list[0] == 0:
  173. pass
  174. for i, number in enumerate(seg_index_list[1:]):
  175. groups_list.append([seg_index_list[i]+start_index+1, number+start_index])
  176. # print(groups_list)
  177. return groups_list
  178. else:
  179. return []
  180. def group_text(all_texts, subject):
  181. # txt_path = r'F:\nine_subject\english_test\write\57.txt'
  182. # text = open(txt_path, 'r').readlines()
  183. if subject == 'english':
  184. abcd_sec, writing_sec = segment(all_texts)
  185. if len(writing_sec) > 0:
  186. abcd_list = group_pictures1(abcd_sec, subject)
  187. writing_list = match_writing_section(writing_sec, subject)
  188. group_list = abcd_list + writing_list
  189. else:
  190. group_list = group_pictures1(abcd_sec, subject)
  191. else:
  192. all_texts = {'text': all_texts}
  193. group_list = group_pictures1(all_texts, subject)
  194. return group_list
  195. if __name__ == '__main__':
  196. subject = 'english'
  197. txt_path = r'G:\write\112.txt'
  198. all_texts = open(txt_path, 'r').readlines()
  199. group_list = group_text(all_texts, subject)
  200. print(group_list)