# -*- coding: utf-8 -*-
"""Structure OCR'd English exam questions of three kinds: single-choice
("单项填空"), cloze ("完形填空") and reading comprehension ("阅读理解").

Approach:

1. Label every text piece, committing only to labels that are certain and
   using ``N`` for everything else:

       A / B / C / D : an option
       @             : a question stem
       N             : other (usually passage text, or anything uncertain)

2. Options laid out as ``A.xxx B.xxx C.xxx D.xxx`` on one or two lines are
   split so each option is its own piece; that makes neighbour-based
   inference possible.

3. Repair labels by inference:
   (1) ``ABND`` -> ``ABCD``, ``ANBCD`` -> ``AABCD``.
   (2) Cloze and reading questions start with a passage, so the label
       sequence looks like
           reading: ``NNNNN@ABCD@@ABCD...``
           cloze:   ``NNNNABCDABCDABCD...``
       and e.g. ``NNANNNABCD`` is corrected to ``NNNNNNABCD``.

4. With labels repaired, extract the passage first, then group the remaining
   pieces into sub-questions and structure each group.

5. Question numbers: when the largest (or smallest) parsed number has close
   neighbours among the parsed numbers it is trusted and the range is derived
   from it plus the number of sub-questions; otherwise the longest consecutive
   run of numbers (``max_len_series``) is used.
"""
import re
from pprint import pprint

from util import max_len_series

# An option marker "B." / "C," / "D、" ... .  The original character classes
# listed "," twice, which looks like a fullwidth comma lost to normalisation,
# so both the ASCII and the fullwidth (U+FF0C) comma are accepted here.
_OPTION_MARK_RE = re.compile(r"([B-D][,\uff0c。.、])")
_PIECE_SPLIT_RE = re.compile(r"\s{3,}")


def _strip_parens(text):
    """Return *text* with ASCII parentheses removed."""
    return text.replace("(", "").replace(")", "")


def _offset_skipping_parens(text, n_chars):
    """Return the index in *text* just past its first *n_chars* characters
    that are not parentheses."""
    seen = 0
    for pos, char in enumerate(text):
        if seen == n_chars:
            return pos
        if char not in "()":
            seen += 1
    return len(text)


def _split_glued_options(line):
    """Insert a wide gap before every B/C/D option marker in *line*.

    The gap must be at least three whitespace characters because the pieces
    are later separated with ``\\s{3,}``.
    """
    return _OPTION_MARK_RE.sub(r"   \1", line)


def _forward_fill_options(labels):
    """Within one line, turn an ``N`` that follows A/B/C into the next option."""
    for _ in range(len(labels)):
        if not re.search(r"[A-C]N", labels):
            break
        labels = labels.replace("AN", "AB").replace("BN", "BC").replace("CN", "CD")
    return labels


def _label_line(pieces, ty, infer):
    """Label the pieces of one line and return ``(label, piece)`` pairs.

    The pairs are sorted by label because OCR sometimes emits the options of
    a line out of order (e.g. C and D before A and B).
    """
    labels = "".join(label_abcd(piece, ty) for piece in pieces)
    if len(labels) > 1:
        labels = infer(labels, is_oneline=True)
    return sorted(zip(labels, pieces), key=lambda pair: pair[0])


def error_B_find_and_rep(con_list):
    """Repair option markers where OCR read "B" as 8, 3 or 13.

    A line is repaired when it starts with such a number marker, the previous
    line starts with an A marker, the next line starts with a C marker, and a
    D marker follows either on the line after that or later on the C line.

    The list is modified in place; nothing is returned.

    :param con_list: list of text lines
    :return: None
    """
    total = len(con_list)
    # i + 1 must exist; i + 2 is optional because C and D may share a line.
    for i in range(1, total - 1):
        is_b = re.match(r"([^\d]{0,3}(8|13|3)[,.。、])", _strip_parens(con_list[i]))
        if not is_b:
            continue
        next_line = _strip_parens(con_list[i + 1])
        is_a = re.match(r".{0,3}A[,.。、]", _strip_parens(con_list[i - 1]))
        is_c = re.match(r".{0,3}[CcG][,.。、]", next_line)
        is_d = i + 2 < total and re.match(r".{0,3}D[,.。、]", _strip_parens(con_list[i + 2]))
        is_cd = re.match(r".{0,3}[CcG][,.。、].*?D[,.。、].+?", next_line)
        if is_a and is_c and (is_d or is_cd):
            marker = is_b.group(1).replace(is_b.group(2), "B")
            # The match ran on the parenthesis-free text, so map its end back
            # onto the original line before slicing off the remainder.
            tail_start = _offset_skipping_parens(con_list[i], is_b.end())
            con_list[i] = marker + con_list[i][tail_start:]


def infer_abcd_cloze(opt_label_str, is_oneline=False):
    """Resolve uncertain ``N`` labels in a cloze label sequence.

    An ``N`` directly before B/C/D becomes the preceding option letter, and an
    ``N`` between a non-``N`` label and an ``A`` becomes a continuation of that
    label.  With ``is_oneline`` the labels belong to one line, so an ``N``
    after A/B/C is additionally taken to be the next option; otherwise a
    trailing ``N`` is taken to be the final ``D``.

    :param opt_label_str: label sequence, one character per piece
    :param is_oneline: True when the sequence covers a single line
    :return: the corrected label sequence (same length)
    """
    for _ in range(len(opt_label_str)):
        if not re.search(r"N[B-D]", opt_label_str):
            break
        opt_label_str = (
            opt_label_str.replace("NAA", "DAA")
            .replace("NB", "AB")
            .replace("NC", "BC")
            .replace("ND", "CD")
        )
    # Raw replacement string: "\1" in a normal literal is the control
    # character \x01, not a back-reference.
    opt_label_str = re.sub(r"([^N])NA", r"\1\1A", opt_label_str)
    if is_oneline:
        opt_label_str = _forward_fill_options(opt_label_str)
    elif opt_label_str.endswith("N"):
        opt_label_str = opt_label_str[:-1] + "D"
    return opt_label_str


def infer_abcd_rs(opt_label_str, is_oneline=False):
    """Resolve uncertain ``N`` labels for reading and single-choice questions.

    These sequences contain stems (``@ABCD``), so the rules differ from the
    cloze variant: an ``N`` before ``A`` is a stem, and an ``N`` between an
    option and a stem continues that option.

    :param opt_label_str: label sequence, one character per piece
    :param is_oneline: True when the sequence covers a single line
    :return: the corrected label sequence (same length)
    """
    for _ in range(len(opt_label_str)):
        if not re.search(r"N[A-D]|([A-D])N@|@N+@", opt_label_str):
            break
        opt_label_str = (
            opt_label_str.replace("NA", "@A")
            .replace("NB", "AB")
            .replace("NC", "BC")
            .replace("ND", "CD")
        )
        opt_label_str = re.sub(r"([A-D])N@", r"\1\1@", opt_label_str)
    if is_oneline:
        # Inside one line, infer forwards from the preceding option.
        return _forward_fill_options(opt_label_str)
    # Whole sequence: everything between two stems is treated as stem text.
    opt_label_str = re.sub(
        r"@([N]+)[@A]", lambda m: "@" * len(m.group(0)), opt_label_str
    )
    # No "N[A-D]" is left at this point, but a trailing N can remain.
    # NOTE(review): the collapsed source does not show whether this rule also
    # applied to the one-line case; it is kept whole-sequence only, matching
    # infer_abcd_cloze - confirm against the original file.
    if opt_label_str.endswith("N"):
        opt_label_str = opt_label_str[:-1] + "D"
    return opt_label_str


def label_abcd(one_item, ty):
    """Return the label of one text piece.

    Only the first five characters (parentheses removed) are inspected.

    :param one_item: the text piece
    :param ty: question type; for "完形填空" a leading question number marks
        the A option (number and A share a line), otherwise it marks a stem
    :return: "@" (stem), "A"/"B"/"C"/"D" (option) or "N" (other)
    """
    head = re.sub(r"[()]", "", one_item)[:5]
    if re.search(r"^[^A-D]{0,3}\d[,\uff0c。.、]", head):
        return "A" if ty == "完形填空" else "@"
    marked = re.search(r"([A-Dc])\s*[,\uff0c.。、]", head)
    if marked:
        # A lower-case "c" is accepted as an OCR variant of "C".
        return marked.group(1).upper()
    # Marker punctuation lost: an option letter followed by a capitalised word.
    bare = re.search(r"([A-D])\s+[A-Z]", head)
    return bare.group(1) if bare else "N"


def essay_label_correct(all_label_str):
    """Clean up the passage part of a label sequence (reading and cloze only).

    Within the first ten labels, a non-``N`` label followed by four or more
    ``N`` is turned into ``N``; afterwards any single non-``N`` label wedged
    between a run of >=4 and a run of >=3 ``N`` is turned into ``N`` too, so
    the passage can later be cut out as one block of ``N``.

    :param all_label_str: label sequence of every piece of the question
    :return: the corrected label sequence (same length)
    """

    def all_n(match):
        return "N" * len(match.group(1))

    for _ in range(10):
        head = all_label_str[:10]
        if not re.search(r"([^N]N{4,})", head):
            break
        all_label_str = re.sub(r"([^N]N{4,})", all_n, head) + all_label_str[10:]
    # Strays beyond the first ten labels.
    return re.sub(r"(N{4,}[^N]N{3,})", all_n, all_label_str)


def _new_item(ty):
    """Return an empty sub-question dict for question type *ty*."""
    item = {"A": "", "B": "", "C": "", "D": "", "topic_no": "0", "answer": ""}
    if ty in ("阅读理解", "单项填空"):
        item["content"] = ""
    return item


def _finalize_item(item, ty):
    """Strip markers and numbers from a collected sub-question, in place.

    :return: the question number found (a digit string), or 0 if none
    """
    topic = 0
    # Only values of existing keys are reassigned, so iterating is safe.
    for key, value in item.items():
        if key == "content":
            found = re.search(r"(\d+)", value)
            topic = found.group(1) if found else 0
            item["topic_no"] = str(topic)
            head = value[:8].replace(topic, "") if found else value[:8]
            # Drop number/punctuation noise from the start of the stem but
            # keep spaces and blanks ("___") so words are not glued together.
            item["content"] = re.sub(r"[^a-zA-Z _]", "", head).lstrip() + value[8:]
        elif re.match(r"[A-D]", key):
            if key == "A" and ty == "完形填空":
                # Cloze: the question number shares a line with option A.
                found = re.search(r"(\d+)", value[:5])
                topic = found.group(1) if found else 0
                if found:
                    value = re.sub(topic + r"[,\uff0c。.]", "", value, count=1)
            # Remove the leading "A." style marker.
            item[key] = re.sub(r".*?[,\uff0c。.]", "", value, count=1).strip()
    return topic


def sub_item_group(label, con, ty):
    """Group labelled pieces into sub-questions.

    The pieces must already have the passage removed.  A new sub-question
    starts whenever the label does not increase (order ``@ < A < B < C < D``);
    a repeated label appends to the current field.  Sub-questions without any
    option text are dropped.

    :param label: label sequence, one character per entry of *con*
    :param con: list of text pieces (not modified)
    :param ty: "单项填空", "完形填空" or "阅读理解"; cloze items have only
        A-D, the other two also have a "content" stem
    :return: ``(slave, topic_no)`` - the sub-question dicts and the parsed
        question numbers as ints (0 when no number was found)
    """
    slave = []
    topic_no = []
    current = _new_item(ty)
    last_label = "0"
    # A trailing sentinel forces the last sub-question to be flushed.
    labelled = zip(list(label) + ["0"], list(con) + ["END_MARK"])
    for tag, text in labelled:
        key = tag.replace("@", "content")
        if tag > last_label and tag != "N":
            current[key] = text
            last_label = tag
        elif tag == last_label:
            current[key] = current.get(key, "") + " " + text
        else:
            topic = _finalize_item(current, ty)
            if any(current[option] != "" for option in "ABCD"):
                topic_no.append(int(topic))
                slave.append(current)
            current = _new_item(ty)
            current[key] = text
            last_label = tag
    return slave, topic_no


def single_filling_structure(con_list):
    """Structure a block of single-choice questions.

    :param con_list: list of text lines
    :return: list of sub-question dicts
    """
    ty = "单项填空"
    # Keep lines that contain a letter or digit and fewer than 5 CJK characters.
    lines = [
        line
        for line in con_list
        if re.search(r"[a-zA-Z0-9]", line)
        and len(re.findall(r"[\u4e00-\u9fa5]", line)) < 5
    ]
    error_B_find_and_rep(lines)
    items_all = []
    for line in lines:
        line = _split_glued_options(line)
        # Require at least 4 non-space characters per piece ("A.as"); shorter
        # pieces are noise produced by the split.
        pieces = [
            piece
            for piece in _PIECE_SPLIT_RE.split(line)
            if len(piece.replace(" ", "")) >= 4
        ]
        items_all.extend(_label_line(pieces, ty, infer_abcd_rs))
    con_all = [text for _, text in items_all]
    label_str = essay_label_correct("".join(tag for tag, _ in items_all))
    # Up to three uncertain pieces right before the first A are folded into A.
    label_str = re.sub(r"(^N{1,3}A)", lambda m: "A" * len(m.group(0)), label_str)
    slave, topic_no = sub_item_group(label_str, con_all, ty)
    topic_no = max_len_series(topic_no)
    for i, item in enumerate(slave):
        item["topic_no"] = str(topic_no[i])
        item["topic_type_name"] = ty
        item["topic_type_id"] = 1
        item["parse"] = ""
    return slave


def cloze_structure(con_list):
    """Structure a cloze question: a passage followed by A-D option groups.

    :param con_list: list of text lines (not modified)
    :return: dict with the passage under 'content', the sub-questions under
        'slave', and 'topic_no' as a "first-last" range string
    """
    ty = "完形填空"
    lines = list(con_list)  # the repair below edits in place; spare the caller
    error_B_find_and_rep(lines)
    items_all = []
    for k, line in enumerate(lines):
        # Decide whether this is an option line whose options OCR glued
        # together and which therefore has to be split.
        # NOTE(review): part of this condition was missing from the source
        # (text lost between 're.search(r"(?' and '=3 or word_n'); the guard
        # and the two counters are reconstructed by analogy with
        # reading_structure - confirm against version control.
        if k > 5 and _OPTION_MARK_RE.search(line):
            word_n = len(re.findall(r"[a-zA-Z]+", line))
            opt_n = len(_OPTION_MARK_RE.findall(line))
            if opt_n >= 3 or word_n <= (opt_n + 1) * 3:
                line = _split_glued_options(line)
        pieces = [piece for piece in _PIECE_SPLIT_RE.split(line) if piece != ""]
        items_all.extend(_label_line(pieces, ty, infer_abcd_cloze))
    con_all = [text for _, text in items_all]
    label_str = essay_label_correct("".join(tag for tag, _ in items_all))
    # The first three pieces are always treated as passage.
    boundary = re.search(r"N{5,}[^N]", label_str[3:])
    if boundary:
        cut = 3 + boundary.end() - 1
    else:
        # No clear passage block: arbitrarily take the first three pieces.
        print("完形填空短文少于3行:{}".format(con_all))
        cut = 3
    essay = con_all[:cut]
    opt = con_all[cut:]
    # Second inference pass over the whole option sequence.
    label_str = infer_abcd_cloze(label_str[cut:])
    slave, topic_no = sub_item_group(label_str, opt, ty)
    # With more than 20 sub-questions, a first number that shows up again
    # among the last five was probably a blank number inside the final
    # passage lines; move that pseudo sub-question back into the passage.
    for _ in range(6):
        if len(slave) <= 20 or topic_no[0] not in topic_no[-5:]:
            break
        essay.append(
            "\n".join(slave[0][key] for key in ("A", "B", "C", "D") if slave[0][key] != "")
        )
        del slave[0]
        del topic_no[0]
    topic_no = max_len_series(topic_no)
    for i, item in enumerate(slave):
        item["topic_no"] = str(topic_no[i])
    if len(topic_no) > 2 and topic_no[-1] - topic_no[0] + 1 == len(slave):
        s_topic_no = "{}-{}".format(topic_no[0], topic_no[-1])
    else:
        s_topic_no = "{}-{}".format(101, 100 + len(slave))
    return {
        'content': "\n".join(essay),
        'topic_type_name': '完形填空',
        'slave': slave,
        'topic_type_id': 2,
        'topic_no': s_topic_no,
    }


def _reading_topic_range(slave, topic_no):
    """Return the "first-last" question-number range of a reading question.

    May overwrite the ``topic_no`` of the first or last sub-question when the
    opposite end of the parsed range looks reliable.
    """
    fallback = "{}-{}".format(100, 100 + len(slave) - 1)
    if not topic_no:
        return fallback
    max_id = max(topic_no)
    min_id = min(topic_no)
    num_items = len(slave)
    if max_id - min_id + 1 == num_items:
        return "{}-{}".format(min_id, max_id)
    if max_id - 1 in topic_no or max_id - 2 in topic_no:
        # The maximum has neighbours, so it is right and the minimum is wrong.
        n_min_id = max_id - num_items + 1
        slave[0]["topic_no"] = str(n_min_id)
        return "{}-{}".format(n_min_id, max_id)
    if min_id + 1 in topic_no or min_id + 2 in topic_no:
        # The minimum has neighbours, so it is the reliable end.
        n_max_id = min_id + num_items - 1
        slave[-1]["topic_no"] = str(n_max_id)
        return "{}-{}".format(min_id, n_max_id)
    try:
        series = max_len_series(topic_no)
    except Exception:
        # Numbering is best effort; never fail the whole parse because of it.
        return fallback
    if len(series) >= 2:
        return "{}-{}".format(series[0], series[-1])
    print("阅读理解的题号严重有问题:{}".format(series))
    return fallback


def reading_structure(con_list):
    """Structure a reading-comprehension question: passage, then stems with
    A-D options.

    :param con_list: list of text lines
    :return: dict with the passage under 'content', the sub-questions under
        'slave', and 'topic_no' as a "first-last" range string
    """
    ty = "阅读理解"
    # Keep lines that contain a letter or digit and fewer than 8 CJK characters.
    lines = [
        line
        for line in con_list
        if re.search(r"[a-zA-Z0-9]", line)
        and len(re.findall(r"[\u4e00-\u9fa5]", line)) < 8
    ]
    error_B_find_and_rep(lines)
    longest_head_line = max(map(len, con_list[:10]), default=0)
    items_all = []
    for k, line in enumerate(lines):
        if k > 5:
            word_n = len(re.findall(r"[a-zA-Z]+", line))
            opt_n = len(_OPTION_MARK_RE.findall(line))
            # Only short lines are split, so passage sentences that merely
            # contain something like "D.C." stay in one piece.
            if word_n <= (opt_n + 1) * 8 and len(line) <= longest_head_line:
                line = _split_glued_options(line)
        # Require at least 4 non-space characters per piece ("A.as"); shorter
        # pieces are noise produced by the split.
        pieces = [
            piece
            for piece in _PIECE_SPLIT_RE.split(line)
            if len(piece.replace(" ", "")) >= 4
        ]
        items_all.extend(_label_line(pieces, ty, infer_abcd_rs))
    con_all = [text for _, text in items_all]
    label_str = essay_label_correct("".join(tag for tag, _ in items_all))
    boundary = re.search(r"N{5,}[^N]", label_str)
    if boundary:
        cut = boundary.end() - 1
    else:
        # No clear passage block: arbitrarily take the first three pieces.
        print("阅读理解短文少于3行:{}".format(con_all))
        cut = 3
    essay = con_all[:cut]
    opt = con_all[cut:]
    label_str = infer_abcd_rs(label_str[cut:])
    slave, topic_no = sub_item_group(label_str, opt, ty)
    return {
        'content': '\n'.join(essay),
        'slave': slave,
        'topic_type_id': 3,
        'topic_type_name': '阅读理解',
        'topic_no': _reading_topic_range(slave, topic_no),
    }


def main():
    """Run the cloze parser on a small built-in sample and print the result."""
    # reading_structure(con_list) and single_filling_structure(con_list) take
    # the same kind of input: a list of text lines.
    con_list = [
        "Somebody might say, “I want to be a big fish, as big as Bill Gates, in a ",
        "big pool, as large as Microsoft.” However, we all know it is   __1__   for a green",
        "hand (生手,没经验的人,即菜鸟)in the field. Then you have to   __2__   the",
        "question carefully. Certainly no matter  __3__   side you choose to take, you have",
        "your chance to succeed. Now the problem is which can provide you more  __4__  .",
        "I choose to be a big fish in a small pool. A big company may provide you a ",
        " __5__ starting point, but a small company offers you opportunity to practice",
        "various   __6__  . During the   __7__  , you may  __ 8__   yourself, recognize your   __9__",
        "points and find your potentiality(潜力). What’s more,   __10__  so many",
        "limitations and rules in a small company, if you are ready competent(有能力的)",
        "you have   __11__   chances to climb to a higher point. Finally, being a big fish",
        "(although) in a small pool, gives a green hand   __12__   self-confidence which is",
        "quite important for   __13__  .",
        "__14__  , you should not be confined(限制,使局限) to your small pool, and be",
        "__15__   with being a big fish there. You should always dream of being a big fish",
        "in a big pool!",
        "1. A. impossible B. uninteresting C. unnecessary D. uncorrectable",
        "2. A. think B. recognize C. weigh D. realize",
        "3. what B. whether C. how D. which",
        "4. A. help B. chance C. advice D. money",
        "5. A. high B. proper C. rich D. practical",
        "6. A. jobs B. tools C. skills D. topics",
        "7. A. success B. perform C. products D. process",
        "8. A. increase B. improve C. impress D. encourage",
        "9. A. weak B. reasonable C. strong D. amazing",
        "10. without B. because of C. except for D. with",
        "11. A. less B. equal C. no D. more",
        "12. A. various B. serious C. precious D. previous",
        "13. A. future B. success C. boss D. market",
        "14. A. Finally B. Certainly C. Immediately D. Generally",
        "15. A. satisfied B. proud C. relaxed D. regretted",
    ]
    res = cloze_structure(con_list)
    pprint(res)


if __name__ == '__main__':
    main()