get_ans_structure.txt 44 KB


  1. # -*- coding: utf-8 -*-
  2. """
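  3. Answer-extraction principle: work line by line, extracting the different answer types one after another.
  4. 1. Extract the listening transcript (it can only appear at the very start or the very end of the answers).
  5. Features: start keywords such as 听力(原文|材料|文稿); end keyword: Text 10
  6. 2. Extract the objective answers (answers made of option letters such as ABCD).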
  7. """
  8. import os
  9. from collections import Counter, OrderedDict
  10. from pprint import pprint
  11. import jieba
  12. import operator
  13. import Levenshtein
  14. from flow.get_item_type import type_keyw_judge, item_type_classify
  15. from util import *
  # Module-level buffer shared by the functions below: ans_structure() appends
  # every line it replaces with the 'chinese' placeholder, and get_essay_ans()
  # and repeated_id_partandtype() index into it to recover the original text of
  # such a placeholder.
  16. chinese_content = []
  17. def get_obj_ans(one_line):
  18. """
  19. Decide whether one line holds objective answers, i.e. answers made only of option letters (ABCDEFG).
  20. 1. the line contains a run of option letters such as ACBD
  21. 2. it may also contain English words, e.g. shanghai test_paper: "Cloze:21-25 BADCB" << only the start of the line has English words
  22. :param one_line: one line of answer text
  23. :return: {} if the line is not objective (no ABCD); a dict {id: ans} when ids and letters line up; a list when there are answers but no ids (needs further processing)
  24. """
  # Only lines that contain a capital A-K, and in which no capital A-K is
  # directly followed by a lowercase letter, are examined at all.
  25. if re.search(r"[A-K]",one_line) and re.search(r"[A-K][a-z]",one_line) is None:
  # Replace "<digits>分" with "分数" so those digits are gone before ids are collected.
  26. one_line = re.sub(r'\d+分', '分数', one_line)
  # Next to a digit or a range separator, rewrite l -> 1, S/s -> 5, G -> 6 and
  # o/O -> 0 (presumably OCR misreads of question numbers - TODO confirm).
  # "犇" is a temporary marker inserted by some of these substitutions; it is
  # removed again right after them.
  27. one_line = re.sub(r'l\s*[\.\-—~]\s*(\d)', r"1-\1", one_line)
  28. one_line = re.sub(r'(\d)l\s*[\.\-—~]',r"\1犇1-",one_line)
  29. one_line = re.sub(r'l(\d)\s*[\.\-—~]', r"1\1", one_line)
  30. one_line = re.sub(r'[Ss](\d)\s*[\.\-—~]', r"5\1", one_line)
  31. one_line = re.sub(r'(\d)[Ss]\s*[\.\-—~]', r"\1犇5-", one_line)
  32. one_line = re.sub(r'[Ss]\s*[\.\-—~]\s*(\d)', r"5-\1", one_line)
  33. one_line = re.sub(r'(\d)G\s*[\.\-—~]', r"\1犇6-" , one_line)
  34. one_line = re.sub(r'G(\d)\s*[\.\-—~]', r"6\1", one_line)
  35. one_line = re.sub(r'G[\.\-—~]\s*(\d)', r"6-\1", one_line)
  36. one_line = re.sub(r'(\d)[oO]\s*[\.\-—~]', r"\1犇0-", one_line)
  37. one_line = one_line.replace("犇","")
  38. ### are the answers given as blocks or one answer per question id
  # block_ans is a match object, True, or False: truthy when the line has a
  # numeric range ("21-25") or a run of two or more letters A-D.
  39. block_ans = re.search(r"\d+([-—]+|~)\d+",one_line) or len(re.findall(r"[A-D]{2,}",one_line))> 0
  40. one_line_list = [i for i in re.split(r'\s+', one_line)] # if re.search(r'[\u4e00-\u9fa5]', i) == None]
  41. ids = []
  42. ABCD = []
  43. id_list = [] # consecutive run of question ids
  # NOTE(review): "== False" is only true when block_ans is exactly False,
  # i.e. re.search() returned None and no letter run was found.
  44. if block_ans == False: # possibly one answer written per question id (ids and answers map one-to-one)
  45. id_list = [i for i in re.findall('\d+', one_line) if int(i) <= 120]
  46. one_line = re.sub(r"[\s\n]","",one_line)
  47. if len(one_line) > len("".join(map(str,id_list)))+2*len(id_list)+6: # length relation between the extracted digits and the answers in the one-to-one case: x2 allows for a "." after each id, +6 allows for a short description at the start of the line
  48. id_list = []
  49. ids = id_list # ids is used in many places below
  50. else:
  # Block form: collect the first and last character of each "<digits>..<digits>"
  # pair as ints, then expand min..max into a run of id strings.
  51. ids = sum([ [int(i[0]),int(i[-1])] for i in re.findall(r"(\d+)[^\d]{0,3}(\d+)",one_line)], [])
  52. if len(ids):
  53. id_list = [str(i) for i in range(min(ids), max(ids) + 1)]
  54. if len(id_list):
  55. one_line = one_line[one_line.index(id_list[0]):]
  # english_alpha_extract comes from outside this file (presumably util);
  # it is used here as the source of the option-letter groups.
  56. ABCD = [i for i in english_alpha_extract(one_line)]
  57. if len(id_list) == len("".join(ABCD)): # number of ids equals number of answer letters
  58. return dict(zip(id_list,list("".join(ABCD))))
  59. else:
  60. obj_id_ans = OrderedDict()
  61. if len(ABCD) >= 1 and len(re.findall('[a-z]{2,15}', one_line)) <= 3: # Keys: 1-5 BCABA
  62. sids = sorted([int(i) for i in ids])
  63. int_ids = [int(i) for i in ids]
  # Duplicate ids or ids that are not in ascending order: re-extract the
  # letters from the whitespace-free line.
  64. if len(set(ids)) < len(ids) or operator.eq(sids, int_ids) == False:
  65. one_line = ' '.join(one_line_list) # e.g. 听力(30分) 1-5 CCAAB 6-10 ABBCB 11-15 CBCAB 16-20 CACAB
  66. int_ids = [int(i) for i in ids]
  67. one_line = one_line.replace(' ', '')
  68. ABCD = english_alpha_extract(one_line)
  69. len_ABCD = sum([1 for i in ABCD if len(i) > 1])
  70. if len_ABCD != len(ABCD):
  71. ABCD = []
  72. if len(ids) == 2 * len(ABCD): # the ids are the two endpoints of each range and none is missing
  73. for k, v in enumerate(ABCD):
  74. lid = int_ids[k * 2]
  75. rid = int_ids[k * 2 + 1]
  76. if len(v) == rid - lid + 1: # ABCD none deletion
  77. sub_id = [str(iid) for iid in range(lid, rid + 1)]
  78. sub_v = [i for i in v]
  79. if len(sub_id) == len(sub_v):
  80. obj_id_ans.update(dict(zip(sub_id, sub_v)))
  81. else:
  82. if len(int_ids) > 0:
  83. all_ids = [str(i) for i in range(min(int_ids), max(int_ids) + 1)]
  84. opts = [i for i in ''.join(ABCD) if i != '']
  85. if len(all_ids) == len(opts):
  86. obj_id_ans.update(dict(zip(all_ids, opts)))
  87. elif len(all_ids) > len(opts): # 61-64 63-66: CDBD << some options are missing; handle every part separately
  88. for i, iid in enumerate(ids):
  89. if i % 2 != 0:
  90. st = one_line.index(str(iid))
  91. ed = 0
  92. if i < len(int_ids) - 1:
  93. # print('one_line:{},索引值是:{}'.format(one_line,str(int_ids[i+1])))
  # NOTE(review): ids[i] is iid, so ed is always st + 1 here and str_part is a
  # single character; the commented-out print above refers to int_ids[i+1],
  # which may be what was intended - confirm.
  94. ed = one_line.index(str(ids[i])) + 1
  95. else:
  96. ed = len(one_line)
  97. str_part = one_line[st:ed]
  98. sub_ABCD = re.findall(r'[ABCDEFGHIJK]{2,15}', str_part)
  99. if len(sub_ABCD) > 0:
  100. sub_ids = [str(i) for i in range(int_ids[i - 1], int(iid) + 1)]
  101. sub_A_B_C_D = [i.strip() for i in ''.join(sub_ABCD) if i.strip() != '']
  102. if len(sub_A_B_C_D) == len(sub_ids):
  103. obj_id_ans.update(dict(zip(sub_ids, sub_A_B_C_D)))
  # Letters found but not enough ids to pair them with: hand back the raw list.
  104. elif len(ids) <= len(ABCD) and len(ABCD) > 1:
  105. return ABCD
  106. else:
  107. A_B_C_D = re.findall('[ABCDEFGHJIK]\s*', one_line)
  108. if re.search('[a-z]{2,15}', one_line) == None and len(A_B_C_D) >= 3: # the answers are scattered
  109. ids = [i for i in re.findall('\d+', one_line) if int(i) < 120]
  110. if len(ids) == len(A_B_C_D):
  111. obj_id_ans.update(dict(zip(ids, [i.strip() for i in A_B_C_D])))
  112. return obj_id_ans
  113. else:
  114. return {}
  115. def get_word_ans(one_line):
  116. """
  117. Extract answers that are English words (a single word or a word group): no other symbols like →, no Chinese, may have \、
  118. Works for grammar fill-in, word spelling, text completion, passage completion, completing phrases from Chinese prompts, and sentence translation.
  119. Lines judged by is_parse() to be explanations are stored under a "parse_<id>" key instead.
  120. :param one_line: one line of answer text
  121. :return: OrderedDict {id: answer} or {"parse_<id>": line}; empty when nothing was extracted
  122. """
  # NOTE(review): word_id_ans is only assigned when get_obj_ans() returns {};
  # for any other line the final "return word_id_ans" raises NameError.
  # ans_structure() only calls this function after that same check.
  123. if get_obj_ans(one_line) == {}:
  124. word_id_ans = OrderedDict()
  125. if is_parse(one_line) == False:
  126. ids = [i for i in re.findall('\d+', one_line) if int(i) < 120]
  127. ids = [iid for iid in ids if
  128. len(re.findall(r'[a-z]{1,15}', one_line[one_line.index(iid):], re.I)) > 0] ## an id must be followed by a word
  129. if len(ids) > 0:# if three words, or more than 10 characters, come before the first id, the line is not treated as an answer
  130. if len(ids)==1: # with a single id it is most likely not an answer but a number inside a sentence that was picked up by mistake
  131. if len(re.findall(r'[a-zA-Z]{1,15}', one_line[:one_line.index(ids[0])])) >= 1: # as soon as one word precedes the number, it is not treated as an answer
  132. ids = []
  133. else:
  134. if len(re.findall(r'[a-zA-Z]{1,15}', one_line[:one_line.index(ids[0])])) >= 3 or one_line.index(
  135. ids[0]) >= 10:
  136. ids = []
  137. # word = re.findall('[a-z]{2,15}', one_line[one_line.index(ids[0]):])
  # del_outlier comes from outside this file (presumably util).
  138. ids = del_outlier(ids)
  139. if len(ids) > 0: # re.search(r'[,,!!→]|[\u4e00-\u9fa5]', one_line) == None and
  140. # seen once: "1. branches 2. predict" - the regex recognises the first "1" but str.index cannot find it (different glyph/font)
  # Split the line on the id strings; every piece that contains letters is a
  # candidate answer.
  141. try:
  142. nword = [strip_point(i) for i in re.split('|'.join(ids), one_line[one_line.index(ids[0]):]) if re.search(r'[A-Za-z]+', i) != None]
  143. except:
  144. nword = [strip_point(i) for i in re.split('|'.join(ids), one_line) if re.search(r'[A-Za-z]+', i) != None]
  # b stays True only if every candidate occurs in the text after its own id.
  145. b = True
  146. if len(nword) == len(ids):
  147. for k, w in enumerate(nword):
  148. try:
  149. if w not in one_line[one_line.index(ids[k]):]:
  150. b = False
  151. break
  152. except:
  153. if w not in one_line:
  154. b = False
  155. break
  156. if b:
  157. word_id_ans.update(dict(zip(ids, nword)))
  158. else:
  159. nids = del_outlier(ids) # e.g. 57.词组 (10分) 1) dress up 2) keep her word 3) getting away with 4) cut down
  160. if len(nids) == len(nword):
  161. word_id_ans.update(dict(zip(nids, nword))) # 38.dress up 39.keep her word 40.$25.3
  162. else:
  163. nword = [' '.join(re.findall(r'[A-Za-z]+', i)) for i in
  164. re.split('|'.join(nids), one_line[one_line.index(nids[0]):]) if
  165. len(re.findall(r'[A-Za-z]+', i)) > 0]
  166. if len(nword) == len(nids):
  167. word_id_ans.update(dict(zip(nids, nword)))
  168. else: # the line was judged to be an explanation:
  # Use a number found in the first 5 characters as the id; without one the
  # line is stored as "parse_0".
  169. may_id = re.search(r"(\d+)",one_line[:5])
  170. if may_id:
  171. word_id_ans["parse_{}".format(may_id.group(1))] = one_line
  172. else:
  173. if re.search(r"改[为成]|删[除掉]|[增添]加",one_line) is None: # otherwise error-correction answers would get cut apart here
  174. word_id_ans["parse_0"] = one_line
  175. return word_id_ans
  176. return word_id_ans
  177. def chinese_start_line(line):
  178. """
  179. judge this line is ans_description
  180. :param line:
  181. :return:
  182. """
  183. line = re.sub("[\s\n]","",line)
  184. # chinese = re.match(r'[\u4e00-\u9fa5VI]',line)
  185. # chinese = re.findall(r'[\u4e00-\u9fa5]', line)
  186. # english = re.findall(r'[a-zA-Z]{2,15}', line)
  187. if re.search(r'(篇章|语篇)解析|[答考试真\d大小本该]题',line[:8]) and re.search(r'[a-zA-Z]',line) == None:
  188. return True
  189. elif re.match(r'[ⅫⅪⅩⅨⅧⅦⅥⅤⅣⅢⅡⅠ]', line) != None :
  190. return True
  191. elif re.match(r'(IV|VI|III|V|VII|VIII)\.', line) != None:
  192. return True
  193. elif re.search(r'读后续写|书面表达|单词拼写|写作|作文|七选五|改错|阅读(理解|表达)|语法填空|完[型形]|任务型阅读|One possible version|Writing version|Translation', line,re.I) :#
  194. #and len(re.findall(r'[a-z]{1,15}', line, re.I)) <= 5:
  195. return True
  196. elif re.search(r'第.(部分|小?节|卷)',line):
  197. return True
  198. elif len(re.findall(r"[评给总打满\d]分|档次|分数",line)) > 1:
  199. return True
  200. else:
  201. return False
  202. # def get_essay_ans(ans_list, ans_result, chinese_index):
  203. def get_essay_ans(ans_result):
  204. """
  205. :param ans_list:
  206. :return:
  207. """
  208. chinese_index = [k for k, v in enumerate(ans_result) if
  209. v == 'chinese' or isinstance(v, dict) or isinstance(v, OrderedDict)]
  210. # chinese_index = chinese_index.reverse()
  211. for i, ci in enumerate(chinese_index):
  212. start = ci
  213. end = 0
  214. if i < len(chinese_index) -1 :
  215. end = chinese_index[i + 1]
  216. else:
  217. end = len(ans_result)
  218. if end - start >= 3 and end - start <= 20:
  219. part = "\n".join(ans_result[start + 1:end])
  220. ###把最后面没有英文字母的连续文字去掉,否则可能出现,最后一行是噪声汉字,但由于汉字较多,影响了下文的判断
  221. count = -30
  222. while count < 0:
  223. if re.search(r"[\u4e00-\u9fa5]",part[count:]) is not None and re.search(r"[a-zA-Z]",part[count:]) is not None:
  224. count += 1
  225. else:
  226. if count == -30:
  227. count = len(part)
  228. break
  229. part = part[:count]
  230. if len(set(list_ele_type(part))) == 1 and 'str' in list_ele_type(part):
  231. english_word = re.findall(r'[a-z]{2,15}', part)
  232. chinese_word = re.findall(r'[\u4e00-\u9fa5]', part)
  233. nums = del_outlier(re.findall(r'\d+', part))
  234. if len(chinese_word) <= 4 and len(english_word) >= 80 and len(nums) < end - start:
  235. nans_result = []
  236. for k, v in enumerate(ans_result):
  237. if k <= start or k >= end:
  238. nans_result.append(v)
  239. #####利用答案前面的说明做题型分类
  240. ty = "短文"
  241. if start > 2:
  242. exp = ""
  243. for i in range(1,start-2):
  244. line_exp = ans_result[start-i]
  245. if isinstance(line_exp,str):
  246. if line_exp == "chinese":
  247. exp += chinese_content[ans_result[:start].count("chinese")-1]
  248. else:
  249. exp += line_exp
  250. else:
  251. break
  252. if exp != "":
  253. ty = type_keyw_judge(exp)
  254. if ty is None:
  255. ty = "短文"
  256. nans_result.insert(start + 1, {ty: part})
  257. ans_result = nans_result
  258. return get_essay_ans(ans_result)
  259. else:
  260. if end == len(ans_result):
  261. return ans_result
  262. else:
  263. pass
  264. else:
  265. if end == len(ans_result):
  266. return ans_result
  267. else:
  268. pass
  269. def get_listening_ans(ans_lines):
  270. """
  271. :param ans_lines: list,all ans lines
  272. :return:
  273. """
  274. start = -1
  275. endL = 0
  276. endR = 0
  277. for k, i in enumerate(ans_lines):
  278. if re.search(r'听力材料|听力原文|录音原文|听力录音稿|听力录音材料|听力部分录音稿|听力文稿|Text\s*1(?![0o])|Text\s+[Oo0]ne', i) != None:
  279. start = k
  280. break
  281. if start > 5: # that is : listening is the last ans
  282. listening = {'听力原文': ''.join(ans_lines[start:])}
  283. ans_lines = ans_lines[:start]
  284. ans_lines.append(listening)
  285. return ans_lines
  286. else:
  287. b1 = re.search(r'Text\s+Ten|Text\s+(10|l0|1o)|听录音,根据短文内容完成', ' '.join(ans_lines)) != None
  288. if b1:
  289. for k, i in enumerate(ans_lines):
  290. if re.search(r'Text\s+Ten|Text\s+(10|l0|1o)|听录音,根据短文内容完成', i) != None:
  291. endL = k
  292. break
  293. for k2, i2 in enumerate(ans_lines[endL + 1:]):
  294. if k2 > endL + 1 and re.match(r'[\u4e00-\u9fa5ⅫⅪⅩⅨⅧⅦⅥⅤⅣⅢⅡⅠ]', i2) != None or get_obj_ans(i2) != {}:
  295. endR = endL + k2
  296. break
  297. else:
  298. #####没有结尾标识,并且start<=5或许还等于-1
  299. if start != -1:
  300. for k3, i3 in enumerate(ans_lines[start + 1:]):
  301. if start != -1 and k > 20 + start and \
  302. re.match(r'[\u4e00-\u9fa5ⅫⅪⅩⅨⅧⅦⅥⅤⅣⅢⅡⅠ]',i) != None or get_obj_ans(i) != {}:
  303. endR = k3 + start
  304. if endR != 0:
  305. if start == -1:
  306. start = 0
  307. nans_lines = ans_lines[:start]
  308. nans_lines.insert(start, {'听力原文': '\n'.join(ans_lines[start:endR])})
  309. nans_lines.extend(ans_lines[endR + 1:])
  310. return nans_lines
  311. else:
  312. return ans_lines
  313. def error_correct(ans_result):
  314. """
  315. 提取短文改错
  316. :return:
  317. """
  318. # chinese_index = [k for k, v in enumerate(ans_result) if
  319. # v == 'chinese' or isinstance(v, dict) or isinstance(v, OrderedDict) or (
  320. # k > 5 and v.startswith("Dear"))]
  321. chinese_index = [k for k, v in enumerate(ans_result) if
  322. v == 'chinese' or isinstance(v, dict) or isinstance(v, OrderedDict)]# or (
  323. # k > 5 and "".join(v).startswith("Dear"))] #2019-7-9把Dear行当chinese,会使得答案里少这行的内容
  324. # chinese_index = chinese_index.reverse()
  325. for i, ci in enumerate(chinese_index):
  326. start = ci
  327. end = 0
  328. if i < len(chinese_index) - 1:
  329. end = chinese_index[i + 1]
  330. else:
  331. end = len(ans_result)
  332. # if end - start > 5 and end - start < 30:
  333. part = ans_result[start + 1:end]
  334. if len(set(list_ele_type(part))) == 1 and 'str' in list_ele_type(part):
  335. each_line_word = ['0' if len(re.findall(r'[a-z]{2,15}', ep)) < 5 else '1' for ep in part]
  336. chinese_word = re.findall(r'[\u4e00-\u9fa5]', ''.join(part))
  337. b1 = len(re.findall(r'添加|加上|去掉|插入|删掉|删除|∧|︿|改为', ''.join(part))) > 0
  338. b2 = len(re.findall(r'[a-z]{2,15}', ''.join(part))) > 90 and len(chinese_word) < 20
  339. b3 = '101010' in ''.join(each_line_word)
  340. b4 = False
  341. if re.search(r'\d+', ''.join(part)) == None:
  342. part_str = ''.join(part).replace(' ', '#')
  343. partL = [strip_point(i) for i in re.split(r'[^a-zA-Z0-9\s]', part_str) if strip_point(i) != '']
  344. editd = []
  345. if len(partL) % 2 == 0:
  346. for i in range(0, len(partL)):
  347. if i % 2 == 0:
  348. editd.append(Levenshtein.jaro(partL[i], partL[i + 1]))
  349. if sum(editd) / (len(editd) + 0.0001) > 0.6:
  350. b4 = True
  351. if (b2 and b3) or b1 or b4:
  352. nans_result = []
  353. for k, v in enumerate(ans_result):
  354. if k <= start or k >= end:
  355. nans_result.append(v)
  356. # nans_result = [ans_result[i] for i in range(len(ans_result)) if i<=ci or i>=chinese_index[i+1]]
  357. # dd = len(nans_result)
  358. if len(re.findall(r'[a-zA-Z]{2,}','\n'.join(part))) >= 15: #10个题目,英语单词至少得有15
  359. nans_result.insert(start + 1, {'短文改错': '\n'.join(part)})
  360. ans_result = nans_result
  361. else:
  362. continue
  363. else:
  364. continue
  365. return ans_result
  366. def ans_structure(ans_list):
  367. """
  368. anslist:del empty line first
  369. :param ans_list:
  370. :return:
  371. """
  372. ans_list = [i for i in ans_list if strip_point(i) != '' and re.search(r"共.[1,5]页|第.[1,5]页",i) is None]
  373. ans_result = []
  374. # chinese_index = []
  375. ans_list = get_listening_ans(ans_list) #提取听力材料
  376. for line in ans_list:
  377. if isinstance(line, dict) :
  378. ans_result.append(line)
  379. else:
  380. obj_ans = get_obj_ans(line) #提取客观题答案
  381. if obj_ans != {}:
  382. ans_result.append(obj_ans)
  383. else:
  384. word_ans = get_word_ans(line) #提取英语单词类答案
  385. if word_ans != {}:
  386. ans_result.append(word_ans)
  387. else:
  388. if chinese_start_line(line) == True:
  389. ans_result.append('chinese')
  390. chinese_content.append(line)
  391. # chinese_index.append(ans_list.index(line))
  392. else:
  393. # ans_result.append(None)
  394. ans_result.append(line)
  395. if 'chinese' in ans_result and 'str' in list_ele_type(ans_result):
  396. ans_result = error_correct(ans_result) #提取短文改错
  397. # if 'chinese' in ans_result and 'str' in list_ele_type(ans_result):
  398. if 'str' in list_ele_type(ans_result):
  399. ans_result = get_essay_ans(ans_result) #提取短文类答案
  400. ans_result = double_check_ans_structure(ans_result)
  401. new_ans_result = repeated_id_partandtype(ans_result, ans_list)
  402. parse_0_correct(new_ans_result)
  403. parse_extract(new_ans_result) #都是原地修改的,不用重新赋值
  404. return new_ans_result
  405. def double_check_ans_structure(ans_result):
  406. """
  407. Second pass over the result of ans_structure(): using the ids of the answers extracted before and after a line,
  408. re-extract the answers that were not extracted the first time.
  409. :param ans_result: list of entries (dict, str, list); modified in place
  410. :return: new ans_result (entries marked '已提取' are filtered out), or ans_result itself when nothing qualified or an error occurred
  411. """
  # NOTE(review): the bare except at the bottom swallows every error raised in
  # here, prints a message and returns the (possibly half-modified) list.
  412. try:
  413. ty = list_ele_type(ans_result)
  # Candidate lines: plain strings other than the 'chinese' placeholder that
  # contain at least one Latin letter.
  414. ans_str_ind = []
  415. for k, t in enumerate(ty):
  416. if (t == 'str' and ans_result[k] != 'chinese'): # list:obj ans but no id
  417. if re.search(r'[a-z]{1,15}', ans_result[k], re.I) != None:
  418. ans_str_ind.append(k)
  419. if len(ans_str_ind) > 0:
  420. for astr in ans_str_ind:
  421. # option letters (ABCD) that were not extracted
  422. # forward:
  # f_id: largest id of the nearest preceding dict with numeric keys;
  # b_id: smallest id of the nearest following one. -1 means "not found".
  423. f_id = -1
  424. b_id = -1
  425. if len([int(i) for i in re.findall(r'\d+', ans_result[astr]) if int(i) < 120]) > 0:
  # NOTE(review): range(astr - 1, 0, -1) stops before index 0, so the first
  # entry of ans_result is never inspected.
  426. for d1 in range(astr - 1, 0, -1):
  427. if isinstance(ans_result[d1], dict) or isinstance(ans_result[d1], OrderedDict):
  428. keyL1 = list(ans_result[d1].keys())
  429. str_keyL1 = list(map(str, keyL1))
  430. if (len(keyL1) > 0) and re.search(r'[\u4e00-\u9fa5]|parse', ''.join(str_keyL1)) == None:
  431. f_id = max([int(i) for i in list(keyL1)])
  432. break
  433. for d2 in range(astr + 1, len(ans_result)):
  434. if isinstance(ans_result[d2], dict) or isinstance(ans_result[d2], OrderedDict):
  435. keyL2 = list(ans_result[d2].keys())
  436. str_keyL2 = list(map(str, keyL2))
  437. if (len(keyL2) > 0) and re.search(r'[\u4e00-\u9fa5]|parse', ''.join(str_keyL2)) == None:
  438. b_id = min([int(i) for i in list(keyL2)])
  439. break
  # Both neighbours known: the ids strictly between them are expected on this
  # line (or on this line plus the lines that follow it).
  440. if f_id != -1 and b_id != -1:
  441. split_ind = [str(i) for i in range(f_id + 1, b_id)]
  442. if len(split_ind) > 0 and split_ind[0] in ans_result[astr]:
  443. ct = [strip_point(i) for i in
  444. re.split('|'.join(split_ind), (ans_result[astr])[ans_result[astr].index(split_ind[0]):])
  445. if strip_point(i) != '']
  446. if len(ct) == len(split_ind):
  447. ans_result[astr] = dict(zip(split_ind, ct))
  448. elif len(ct) == 1: # first id in line
  # Look ahead (at most 2 * len(split_ind) entries) for the next 'chinese',
  # dict or list entry; the lines before it are joined and split again.
  449. kk = -1
  450. for k, v in enumerate(ans_result[astr:astr + 2 * len(split_ind)]):
  451. if (isinstance(v, str) and v == 'chinese') or \
  452. isinstance(v, dict) or isinstance(v,OrderedDict) or isinstance( v, list):
  453. kk = k
  454. break
  455. split_content = ''.join(ans_result[astr:astr + kk])
  456. ct = [strip_point(i) for i in re.split('|'.join(split_ind), split_content) if
  457. strip_point(i) != '']
  458. if len(ct) == len(split_ind):
  459. ans_result[astr] = dict(zip(split_ind, ct))
  # Mark the lines that were merged into this entry so they get filtered out.
  460. for i in range(1, kk):
  461. ans_result[astr + i] = '已提取'
  462. elif f_id != -1 and b_id == -1:
  463. # e.g. an error-correction item "【10】live前加to" that sits on its own line and was not extracted
  464. if str(f_id + 1) in ans_result[astr]:
  465. content = (ans_result[astr])[ans_result[astr].index(str(f_id + 1)):]
  466. ct = [strip_point(i) for i in content.split(str(f_id + 1)) if strip_point(i) != '']
  467. if len(ct) == 1:
  468. ans_result[astr] = {str(f_id + 1): ct[0]}
  469. elif f_id == -1 and b_id != -1:
  470. if str(b_id - 1) in ans_result[astr]:
  471. content = (ans_result[astr])[ans_result[astr].index(str(b_id - 1)):]
  472. ct = [strip_point(i) for i in content.split(str(b_id - 1)) if strip_point(i) != '']
  473. if len(ct) == 1:
  474. ans_result[astr] = {str(b_id - 1): ct[0]}
  475. return [i for i in ans_result if i != '已提取']
  476. else:
  477. return ans_result
  478. except:
  479. print("【答案提取】:double_check_ans_structure出错")
  480. return ans_result
  481. def repeated_id_partandtype(ans_result, ans_orig_list):
  482. """
  483. Split answers whose ids repeat into separate groups. Assumption: when ids repeat there must be a heading, either on the previous line or at the start of this line; fetch that heading, match it to a question type and group the answers by type.
  484. The previous line may also be an answer that for some reason was not extracted and stayed a str,
  485. eg:1:for effort/ hard work,2: No.2,3: Three
  486. {'4': 'How Parents Should Praise Their Kids/ Why Praise Can Be Bad For Kids'}
  487. 1. the previous line is a str, not 'chinese'; 2. no Chinese characters were found at the start of this line either; 3. this line's keys do not start from 1 >>> the previous line is very likely an unextracted answer too, so it is split with the numbers from 1 up to this line's smallest key
  488. Once an answer has become a dict, the original line (needed for the Chinese characters at its start) is located like this: match the values of the dict against every original line, collect the indices, and take the most frequent index as the dict's original line
  489. << a key may occur on several lines, so a single value is not enough to locate it
  490. """
  # NOTE(review): the bare except at the bottom swallows every error raised in
  # here, prints a message and returns the (possibly half-modified) list.
  491. try:
  492. last_line_max_id = 0
  493. last_parse_id = 0
  494. for i, v in enumerate(ans_result):
  495. if isinstance(v, dict) :
  496. kl = list(v.keys())
  # Single-key dicts whose key contains a character other than a digit or "+"
  # (e.g. '短文', 'parse_3') are left alone.
  497. if len(v) == 1 and re.search(r'[^\d+]+',kl[0]):
  498. pass
  499. else:
  500. this_line_min_id = min(map(int, v.keys()))
  # NOTE(review): eval(str(...)) is applied to a value this function sets
  # itself (0 or the result of max(map(int, ...))).
  501. if this_line_min_id > eval(str(last_line_max_id)):
  502. last_line_max_id = max(map(int, v.keys()))
  503. else:
  504. # explanations get their ids first
  505. vvalue = ' '.join(list(v.values()))
  506. chinese_words = []
  507. for w in get_chinese_char(vvalue):
  508. chinese_words.extend(list(jieba.cut(w)))
  509. if re.search(r'考查|故选|根据|因此|可知|所以|解析', vvalue) != None: # explanation
  510. if last_parse_id == 0:
  511. ans_result[i] = {'parse_{}'.format(kl[0]): vvalue}
  512. last_parse_id = int(kl[0])
  513. elif int(kl[0]) > last_parse_id and int(
  514. kl[0]) < last_parse_id + 10: # an id within a small range after the previous parse id counts as a properly extracted explanation
  515. ans_result[i] = {'parse_{}'.format(kl[0]): vvalue}
  516. last_parse_id = int(kl[0])
  517. else:
  518. ans_result[i] = '已提取' # badly extracted explanation, discard it
  519. # /(len(re.findall(r'[a-zA-Z]{1-15}',vvalue))+0.0001)> 1:
  # More than 5 Chinese tokens besides the editing verbs: also treated as an
  # explanation.
  520. elif len([ii for ii in chinese_words if ii not in ['改为', '去掉', '删除', '删掉', '添加','加上']]) > 5:
  521. if last_parse_id == 0:
  522. ans_result[i] = {'parse_{}'.format(kl[0]): vvalue}
  523. last_parse_id = int(kl[0])
  524. elif int(kl[0]) > last_parse_id and int(kl[0]) < last_parse_id + 10:
  525. ans_result[i] = {'parse_{}'.format(kl[0]): vvalue}
  526. last_parse_id = int(kl[0])
  527. else:
  528. ans_result[i] = '已提取' # badly extracted explanation, discard it
  529. # not an explanation and the ids repeat: this must be a new group of questions; split it off and fetch its heading
  530. else:
  # NOTE(review): "next" shadows the builtin and is not read again in this function.
  531. next = -1
  532. type_content = ''
  533. fc = ans_result[i - 1]
  534. if fc == 'chinese':
  535. type_content = chinese_content[appear_times(ans_result, 'chinese')[i - 1] - 1]
  536. else:
  537. may_ids = []
  538. for iv in list(v.values()):
  539. sm = [Levenshtein.ratio(strip_point(origc), iv) for origc in ans_orig_list]
  540. may_ids.append(sm.index(max(sm))) # where this line ans is in orig ans list
  541. may_id = Counter(may_ids).most_common()[0][0]
  542. chinesew = get_chinese_char(ans_orig_list[may_id]) # Chinese characters at the start of the line
  543. if len(chinesew) > 0:
  544. type_content = chinesew[0]
  545. elif kl[0] != '1' and isinstance(fc, str):
  546. split_num = [n for n in re.findall('\d+', fc) if int(n) in range(1, int(kl[0]))]
  547. rr = re.split('(' + '|'.join(split_num) + r')(\.|、|.)', fc)
  548. num = [strip_point(nm) for nm in rr if re.match(r'\d+$', strip_point(nm)) != None]
  549. aa = [strip_point(a) for a in rr if re.search(r'[a-zA-Z]', strip_point(a)) != None]
  550. if len(num) == len(aa):
  551. v.update(dict(zip(num, aa)))
  552. ans_result[i - 1] = '已提取'
  553. if i > 2 and ans_result[i - 2] == 'chinese':
  554. type_content = chinese_content[appear_times(ans_result, 'chinese')[i - 2] - 1]
  # Absorb the following answer dicts whose smallest id is still below
  # last_line_max_id into this group; stop at the first entry that does not qualify.
  555. for ii, vv in enumerate(ans_result[i + 1:]):
  556. if isinstance(vv, dict) and list(vv.keys())[0] not in ['短文', '短文改错', '听力原文']:
  557. if min(map(int, list(vv.keys()))) < last_line_max_id:
  558. v.update(vv)
  559. ans_result[i + 1 + ii] = '已提取'
  560. else:
  561. break
  562. else:
  563. break
  # item_type_classify is imported from flow.get_item_type; its first result is
  # used as the group label, with "写作" renamed to "短文".
  564. tylable = item_type_classify([([type_content], [str(v)])])[0].replace("写作","短文")
  565. if "单" not in tylable :
  566. ans_result[i] = {tylable: v}
  567. else:
  568. pass
  569. # nkeys = [str(last_line_max_id) + '_'+ v for v in list(v.keys())]
  570. # last_line_max_id = nkeys[-1]
  571. # ans_result[i] = dict(zip(nkeys,v.values()))
  572. return [i for i in ans_result if i != '已提取']
  573. except:
  574. print("【答案提取】:repeated_id_partandtype出错")
  575. return ans_result
  576. def is_parse(text:str):
  577. """
  578. 判断该行内容是否为解析
  579. :param text:
  580. :return:
  581. """
  582. if chinese_start_line(text) == False:
  583. if len(jieba.lcut(text)) > 5:
  584. if re.search(r'(?<!(篇章|语篇))解析|(?<![答考试真小大\d本该]){2,6}题',text[:20]) :
  585. return True
  586. elif re.search(r'答案为|考查|故选|根据|因此|可[知得]|可以?[猜推]出|所以|[A-G](选?项)?应?该?[是为](对|正确)|故[A-D]选?项',text) : # 如果没有题号,那就要求在开头要有"解析"两个字
  587. return True
  588. return False
  589. def parse_extract(ans_result):
  # Turn string entries that is_parse() judges to be explanations into
  # {"parse_<id>": text} dicts, then merge each parse entry with the entry that
  # follows it when that is a plain string or belongs to the same (or an
  # earlier) id. Works in place and returns nothing.
  # NOTE(review): the bare except at the bottom swallows every error raised in
  # here and leaves ans_result in whatever state it had reached.
  590. if "str" in list_ele_type(ans_result):
  591. try:
  # NOTE(review): idd is initialised once, before the loop, so an id found
  # for an earlier line carries over to later lines for which none is found.
  592. idd = ""
  593. parse_index = []
  594. for k, ss in enumerate(ans_result):
  595. # without a question id, the line is required to start with the word "解析"
  596. # an explanation may be a whole-passage explanation (judged by whether it contains ABCD); those are skipped for now because they are easily mistaken for the explanation of a single question
  597. if k > 0 and isinstance(ss, str) and is_parse(ss):
  598. id_find = re.search(r'(\d+)', ss[:4])
  599. if id_find == None:
  600. before_one_line = ans_result[k - 1]
  601. if isinstance(before_one_line, dict) and len(before_one_line) > 0: # the answer was already extracted
  602. may_id = re.search(r"(\d+)",list(before_one_line.keys())[-1])
  603. if may_id:
  604. idd = may_id.group(1) # the last id in the dict becomes the id of this explanation
  605. elif isinstance(before_one_line, str):
  606. may_id = re.search(r'(\d+)', before_one_line[:4])
  607. if may_id:
  608. idd = may_id.group(1)
  609. elif id_find != None: # got an id
  # NOTE(review): eval() is applied to text taken from the input line. The
  # regex limits it to digits, but a number with a leading zero ("08") is not
  # a valid Python 3 literal and would raise here; int() would be the safer
  # call - left unchanged in this pass.
  610. idd = id_find.group(1) if eval(id_find.group(1)) < 120 else "0" # ids of 120 or more are set to 0
  611. ########### end of id lookup
  612. if idd != "":
  613. ans_result[k] = {'parse_{}'.format(idd): ss}
  614. parse_index.append(k) # incomplete: the indices of explanations extracted earlier are not in it yet
  615. for k, v in enumerate(ans_result):
  616. if k not in parse_index and isinstance(v, dict) and "parse" in str(v.keys()):
  617. parse_index.append(k)
  618. parse_index.sort()
  619. len_ppc = len(parse_index.copy())
  620. count = 0
  621. while len(parse_index) > 0 and count < 2*len_ppc : # parse_index keeps growing inside the loop, so allow extra iterations to make sure merging can finish
  622. pi = parse_index[0]
  623. #### an unextracted line between two parses is folded into the preceding parse
  624. if pi < len(ans_result)-1:
  625. next_ans = ans_result[pi+1]
  626. this_id = list(ans_result[pi].keys())[0]
  627. if isinstance(next_ans,str) and next_ans!="chinese" :
  # The merged text moves down to slot pi+1 and slot pi is overwritten with the
  # 'chinese' placeholder.
  628. ans_result[pi+1] = {this_id:list(ans_result[pi].values())[0]+"\n"+next_ans}
  629. if pi+1 not in parse_index:
  630. parse_index.insert(1,pi+1) # several str or parse lines may follow line pi; without this insert only the directly adjacent line would be handled
  631. ans_result[pi] = "chinese"
  632. elif isinstance(next_ans,dict):
  633. next_id = list(next_ans.keys())[0]
  # NOTE(review): eval() of a string built from dict keys; it only runs when
  # next_id contains a digit, but the keys are not otherwise validated here.
  634. if re.search(r"\d",next_id) and eval("{}-{}".format(next_id.replace("parse_",""),this_id.replace("parse_",""))) <= 0: # both belong to the explanation of the same question: merge them
  635. ans_result[pi+1] = {this_id:list(ans_result[pi].values())[0]+"\n"+list(ans_result[pi+1].values())[0]}
  636. if pi + 1 not in parse_index:
  637. parse_index.insert(1,pi+1)
  638. ans_result[pi] = "chinese"
  639. del parse_index[0]
  640. count += 1
  641. # return ans_result
  642. except:
  643. print("【答案提取】:parse_extract出错")
  644. pass
  645. def parse_0_correct(ans_result):
  646. """
  647. 在get_word_ans中,对于上一行是答案,下一行是解析,且解析前没有题号的情况,会提取成parse_0
  648. 1.如果上一行已经提取为dict了,就把上一行的题号给这一行
  649. 2.如果上一行还是str,就找这一行靠前的数字作为这一题的题号
  650. :param ans_result:
  651. :return:
  652. """
  653. if "parse_0" in str(ans_result):
  654. parse_0_index = [k for k,v in enumerate(ans_result) if k > 0 and isinstance(v,dict) and "parse_0" in v]
  655. while len(parse_0_index) > 0 :
  656. parse = ans_result[parse_0_index[0]]["parse_0"]
  657. last_line = ans_result[parse_0_index[0]-1]
  658. if isinstance(last_line,dict):
  659. may_id = list(last_line.keys())[-1]
  660. elif isinstance(last_line,str):
  661. may_id = last_line[:6]
  662. new_id = re.search(r"(\d+)",may_id).group(1) if re.search(r"(\d+)",may_id) else "0"
  663. if new_id != "0":
  664. ans_result[parse_0_index[0]] = {"parse_{}".format(new_id):parse}
  665. del parse_0_index[0]
  666. if __name__ == '__main__':
  667. # b = get_obj_ans("school is over at 5 o'clock. After school, we often play basketball or do some other sport on the")
  668. # print(b)
  669. # b2 = get_word_ans("21.B推断题。通读全文得出,第2项赛事的起跑点和终点之间落差最大,因此该赛事的下坡跑最著名。由该")
  670. # print(b2)
  671. # # a = ['21.D\n', '21.根据第2段可知,文章推荐了一些免费观看喜剧电影的网站。\n', '22.C\n', '22.根据Comedy Movies at Crackle部分中的You can browse by year or title可知,你可以通过搜索电影标题来找到自己最喜欢的电影。\n', '23.\n', "23.C根据Hulu's Free Comedy Movies部分中的“Hulu has more diversities of free comedy movies than any other place.”可知答案。\n", '24.C\n', '24.根据文章第1段中He is an unconventional man 和第2段中Everything about Eliza Doolittle seems to challenge any conventional concept可知,他们都反对传统观念。\n', '25.C\n', '25.根据第1段第2句“..and uses all manners of recording and...understandable units.”可知答案。\n', '26.D\n', '26.根据文章第2段末句可知,独立自主的个性使Eliza Doolittle获得了尊重。\n', '27.A\n', '27.文章分析的是戏剧《卖花女》中的人物性格特征,所以我们很可能在文学杂志中读到这样的文章。\n', '28.B\n', '28.根据文章第1段第2、3句可知,科学家发现改变植物基因会使植物的光合作用速度加快,将来会增加全球植物的产量。\n', '29.B\n', '29.根据文章第4段第2、3、4句可知,作者提到镜片是为了说明植物的保护系统的工作原理与镜片相同。\n', '30.D\n', '30.根据文章最后一句可知答案。\n', '31.B\n', '31.根据文章内容可知,科学家改变植物的基因,加快光合作用的速度,使植物叶子生长迅速,从而提高植物产量。\n', '32.B\n', '32.根据文中列举的提高记忆力的3种方法可知,有些充分利用思维的秘诀,一旦我们了解了,就简单易行。\n', '33.D\n', '33.根据第1段最后一句可知,本文主要介绍提高记忆力的几种方法。\n', '34.C\n', '34.根据文章最后一段第1句可知,图像联系记忆是那些记忆力比赛获胜者常用的方法。\n', '35.D\n', '35.根据文章的主题一—提高记忆力的方法可知,这篇文章与思维有关。\n', '36~40 FADCE\n', '41.B\n', '41.根据上文可知,父亲正急切地等着作者的回答。\n', '42.C\n', '42.作者仔细地看了看花园,然后回答“非常好!”。\n', '43.A\n', '43.然后,作者列举了她发现的园子里的所有变化,父亲满意地笑了。\n', '44.D\n', '44.根据下文可知,作者的母亲在一次车祸中去世了,留下父亲抚养三个年幼的女儿。\n', '45.C\n', '46.A\n', '46.根据下文可知,一开始,生活并不顺利。\n', '47.D\n', '47.根据下文可知,父亲在鼓励我们。\n', '48.B\n', '48.同时,父亲尽自己最大的努力去证明那个信念。\n', '49.\n', '49.A1972年,父亲在Okaloosa Island开垦了一片被废弃的土地。\n', '50.A\n', '50.根据下文可知,这里指在每年的初春。\n', '51.B\n', '51.经过父亲不断的辛勤劳作,这片土地逐渐变成了美丽的花园。\n', '52.C\n', '52.根据下文可知,作者有时候和父亲一起在他的花园里愉快地劳动。\n', '53.B\n', '53.根据上文可知,作者会邀请她的朋友们参观花园。\n', '54.D\n', '54.根据下文可知,父亲把他的花园打理得非常不错。\n', '55.A\n', '55.根据下文可知,多年以来,每当作者心烦的时候,就会去父亲的花园。\n', '56.D\n', '56.因为它会使作者想起父亲的信念。\n', '57.B\n', '57.根据下文可知,正是父亲和他的花园给了作者力量。\n', '58.C\n', '58.这力量让作者能够继续生活下去并且战胜生活中的挑战。\n', '59.A\n', '59.根据下文可知,父亲说他不再继续打理这个花园。\n', '60.D\n', '60.根据下文可知,我们沉默地坐着。\n', '语法填空\n', '61.earlier 62.what 63.probably 
64.ways 65.a 66.in 67.grown 68.was brought 69.that 70.was\n', '短文改错\n', 'Being responsible is actually not that difficult.I used to thinking that it was hard to grow up into a respon-sible member of the society.An incident which happened in a rainy Sunday afternoon changed my attitude.I was on my way to a bookstore and was waiting for the green light when a girl knocked down by a passing car,that drove off quickly.A man immediately gave her first aid and I had joined in without hesitation.Soon many help was given to the girl.Because we sent her to the nearest hospitals in time,she was able to receive properly treatment.Not badly injured,I expressed gratitude to those giving help.Comparing with the escaped driver,I was proud of what I had done.\n', '71. thinking—think in—on 在girl 和knocked 中间加was that-- which had去掉 many--much hospitals-hospital properly--proper I--she Comparing-- Compared\n', '书面表达\n', "One possible version:Dear John,I'm glad you are concerned about my school life.You asked me what amateur activities we had recently.Do you like stage drama?It's fun to watch and do.On Nov.22,students in Grade 2 from our school acted out their own stage drama Red Crag.It's from the famous novel with the same name.Students acted and directed the drama by themselves.They are all members of the drama club.Through the drama,students said they had a better understanding of the novel.If you have time you can surf the Internet for this story and exchange your idea with me.\n", 'Looking forward to your reply.\n', 'Yours,LiHua\n']
  672. ans_list = []
  673. for root, dirs, files in os.walk (r'E:\online_old\ans_txt'):
  674. files_list = [files]
  675. for file in files:
  676. print("成功:{}".format(file))
  677. with open(os.path.join(root,file)) as f:
  678. a = f.readlines()
  679. a2 = ans_structure(a)
  680. ans_list.append( (file,a2) )
  681. import json
  682. with open (r'./structure_ans.json','w') as ff:
  683. ff.write(json.dumps({"ans":ans_list}))
  684. # with open(r"../ans_txt/[tiku.gaokao.com]吉林省九校联合体届高三第二次摸底考试英语试题.txt",'r') as f:
  685. # a = f.readlines()
  686. # a2 = ans_structure(a)
  687. # b = repeated_id_partandtype(a2, a)
  688. # # pprint(b)
  689. # parse_extract(b)
  690. # pprint(b)
  691. # print(chinese_start_line("2.C解析]推理判断题。根据第四个项目的第三句话,可知EF Ciep Yer可以让学生自主地设定学习时间"))
  692. ######山东省桓台第二中学2017-2018学年高一下学期4月月考英语试题Word版含答案.txt
  693. # with open(r"E:\online_old\ans_txt\四川省资阳市2018届高三第二次诊断性考试试题英语Word版含答案.txt","r") as f:
  694. # ans_list = f.readlines()
  695. # res = ans_structure(ans_list)
  696. # pprint(res)
  697. # print(is_parse("第三节(共5小题;每小题1分,共5分"))