# -*- coding: utf-8 -*-
# xh/word_parse_of_eng

"""
obj_reparse是在get_obj_structure_8基础上修改的。
主要修改原则是：再解析中label每一行的时候写的更严格更死板
                题干：23.xxxxxx
				选项：A.、xxxxx  B.、xxxx			
                

"""
import re
from pprint import pprint
from util import max_len_series


# def error_B_find_and_rep(con_list):
#     """
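#     Scan the list for consecutive lines of the form A, (8|3|13|l3), C, D and replace the misread middle label with B.
#     The list is modified in place: nothing is returned and no assignment is needed at the call site.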
#     :param con_list:
#     :return:
#     """
#     for i in range(1, len(con_list) - 2):
#         is_b = re.match(r"([^\d]{0,3}(8|13|3)[,.。、])", con_list[i].replace("(", "").replace(")", ""))
#         if is_b:
#             is_a = re.match(r".{0,3}A[,.。、]", con_list[i - 1].replace("(", "").replace(")", ""))
#             is_c = re.match(r".{0,3}[CcG][,.。、]", con_list[i + 1].replace("(", "").replace(")", ""))
#             is_d = re.match(r".{0,3}D[,.。、]", con_list[i + 2].replace("(", "").replace(")", ""))
#             is_cd = re.match(r".{0,3}[CcG][,.。、].*?D[,.。、].+?",
#                              con_list[i + 1].replace("(", "").replace(")", ""))  # CD可能在同一行
#             if is_a and is_c and (is_d or is_cd):
#                 con_list[i] = is_b.group(1).replace(is_b.group(2), "B") + con_list[i][is_b.end():]


def infer_abcd_cloze(opt_label_str, is_oneline=False):
    """
    Infer uncertain cloze option labels ("N") from the labels around them.

    The input is a string with one character per text fragment: "A".."D"
    for recognised options and "N" for fragments whose label is unknown.

    :param opt_label_str: label sequence to repair
    :param is_oneline: True when the sequence describes fragments of a single
        line; unknown labels are then also inferred forwards from the label
        before them. Otherwise only a trailing "N" is turned into "D".
    :return: the repaired label sequence (same length as the input)
    """
    # Backward inference: an unknown label directly before B/C/D must be the
    # preceding letter. Repeat so that runs such as "NNCD" resolve fully; the
    # counter bounds the loop by the sequence length.
    count = 0
    while re.search(r"N[B-D]", opt_label_str) and count < len(opt_label_str):
        opt_label_str = opt_label_str.replace("NAA", "DAA") \
            .replace("NB", "AB") \
            .replace("NC", "BC") \
            .replace("ND", "CD")
        count += 1
    # An unknown fragment between a known label X and the next "A" is treated
    # as a continuation of X. The replacement must be a raw string: without
    # the r-prefix "\1" is the control character chr(1), not a backreference.
    opt_label_str = re.sub(r"([^N])NA", r"\1\1A", opt_label_str)

    count2 = 0
    if is_oneline:  # within one line, infer forwards from the preceding label
        while re.search(r"[A-C]N", opt_label_str) and count2 < len(opt_label_str):
            opt_label_str = opt_label_str.replace("AN", "AB"). \
                replace("BN", "BC"). \
                replace("CN", "CD")
            count2 += 1
    else:
        # Over the whole sequence only a trailing unknown is resolved.
        if opt_label_str.endswith("N"):
            opt_label_str = opt_label_str[:-1] + "D"
    return opt_label_str


def infer_abcd_rs(opt_label_str, is_oneline=False):
    """
    Infer uncertain labels ("N") for reading / single-choice questions.

    Unlike the cloze variant, the label alphabet here also contains "@"
    (a question-stem line), so the inference rules differ slightly.

    :param opt_label_str: label sequence made of "@", "A".."D" and "N"
    :param is_oneline: True when the sequence describes fragments of one line
    :return: the repaired label sequence
    """
    labels = opt_label_str
    unresolved = re.compile(r"N[A-D]|([A-D])N@|@N+@")
    backward_rules = (("NA", "@A"), ("NB", "AB"), ("NC", "BC"), ("ND", "CD"))

    # Backward pass, bounded by the sequence length.
    rounds = 0
    while rounds < len(labels) and unresolved.search(labels):
        for old, new in backward_rules:
            labels = labels.replace(old, new)
        # An unknown between an option and the next stem continues the option.
        labels = re.sub(r"([A-D])N@", r"\1\1@", labels)
        rounds += 1

    if is_oneline:
        # Inside a single line, infer forwards from the preceding label.
        forward_rules = (("AN", "AB"), ("BN", "BC"), ("CN", "CD"))
        rounds = 0
        while rounds < len(labels) and re.search(r"[A-C]N", labels):
            for old, new in forward_rules:
                labels = labels.replace(old, new)
            rounds += 1
        return labels

    # Whole-sequence mode: a run of unknowns enclosed by stem markers is
    # rewritten (together with both delimiters) as stem lines.
    labels = re.sub(r"@([N]+)[@A]", lambda m: "@" * len(m.group(0)), labels)
    # A trailing unknown can still remain at this point; make it "D".
    if labels.endswith("N"):
        labels = labels[:-1] + "D"
    return labels


def label_abcd(one_item, ty):
    """
    Classify one text fragment by looking at its first five characters.

    Returns "@" for a numbered line (question stem), "A".."D" for an option
    line, and "N" for anything else. For the cloze question type a numbered
    line is reported as "A" instead of "@".
    """
    # Parentheses are dropped before taking the 5-character prefix.
    head = re.sub(r"[()]", "", one_item)[:5].strip()

    # "<digits>." marks a numbered line.
    if re.search(r"\d+\s*[\.．]", head):
        return "A" if ty == "完形填空" else "@"

    # First try "<letter>.", then fall back to "<letter> <Capital>".
    for pattern in (r"([A-D])\s*[\.．]", r"([A-D])\s+[A-Z]"):
        hit = re.search(pattern, head)
        if hit:
            return hit.group(1).upper()
    return "N"


def essay_label_correct(all_label_str):
    """
    Clean up stray labels inside the passage part of a label sequence.

    Used for reading and cloze questions only; `all_label_str` holds the
    labels of every line of the question.
    """

    def blank_out(match):
        # Replace the whole captured span with unknown labels.
        return "N" * len(match.group(1))

    head_pattern = r"([^N]N{4,})"
    # Within the first 10 labels, a non-N label followed by four or more N is
    # rewritten to N, so the passage forms one N block. At most 10 passes.
    for _ in range(10):
        if not re.search(head_pattern, all_label_str[:10]):
            break
        cleaned_head = re.sub(head_pattern, blank_out, all_label_str[:10])
        all_label_str = cleaned_head + all_label_str[10:]

    # Anywhere in the sequence: a single non-N label with 4+ N before it and
    # 3+ N after it is rewritten to N as well.
    return re.sub(r"(N{4,}[^N]N{3,})", blank_out, all_label_str)


def sub_item_group(label, con, ty):
    """
    Group labelled option fragments into one dict per sub-question.

    Only the option part is grouped, so for cloze and reading questions the
    passage must already have been removed from `label` / `con`.

    :param label: label string, one character per entry of `con`
        ("A".."D", "N", and "@" for a question-stem line)
    :param con: list of text fragments aligned with `label`; it is not modified
    :param ty: question type, one of "单项填空", "完形填空", "阅读理解"; the reading
        and single-choice types additionally carry a "content" (stem) field
    :return: (slave, topic_no) where slave is the list of sub-question dicts
        and topic_no the list of their question numbers as ints (0 = not found)
    """

    def _new_item():
        # Fresh, empty sub-question record.
        item = {"A": "", "B": "", "C": "", "D": "", "topic_no": "0", "answer": ""}
        if ty in ["阅读理解", "单项填空"]:
            item["content"] = ""
        return item

    def _strip_option_label(text):
        # Remove the leading "A." style marker from an option's text.
        # Options written as "B He was ..." (letter, whitespace, capital) have
        # no punctuation after the letter; cutting up to the first punctuation
        # mark would delete the option text itself, so drop only the letter.
        bare = re.match(r"\s*[A-D]\s+(?=[A-Z])", text)
        if bare:
            return text[bare.end():].strip()
        return re.sub(r".*?[,，。.．]", "", text, count=1).strip()

    def _finalize(item):
        # Clean the collected texts in place and return the question number.
        tn = 0
        for key in "ABCD":
            v = item[key]
            if key == "A" and ty == "完形填空":
                # In cloze questions the number shares a line with option A.
                num = re.search(r"(\d+)", v[:5])
                if num:
                    tn = num.group(1)
                    v = re.sub(re.escape(tn) + r"[,，。.．]", "", v, count=1)
            item[key] = _strip_option_label(v)
        if "content" in item:
            v = item["content"]
            num = re.search(r"(\d+)", v)
            tn = num.group(1) if num else 0
            item["topic_no"] = str(tn)
            head = v[:8]
            if num:
                head = head.replace(tn, "", 1)
            # Strip only the leading non-letter junk of the prefix; stripping
            # every non-letter would also delete the spaces between words.
            item["content"] = re.sub(r"^[^a-zA-Z]+", "", head) + v[8:]
        return tn

    slave = []
    topic_no = []
    if not label:
        # Nothing to group (the sentinel logic below needs at least one label).
        return slave, topic_no

    # Work on copies and append a sentinel so the last item gets flushed.
    labels = list(label) + ["0"]
    texts = list(con) + ["END_MARK"]

    one_item = _new_item()
    flag = "0"  # label of the fragment currently being collected
    for abcd, text in zip(labels, texts):
        key = abcd.replace("@", "content")
        if abcd > flag and abcd != "N":
            # Labels advance in character order ("@" < "A" < ... < "D").
            one_item[key] = text
            flag = abcd
        elif abcd == flag:
            # Same label again: continuation of the previous fragment.
            one_item[key] = one_item[key] + " " + text
        else:
            # The order restarted: close the current sub-question.
            tn = _finalize(one_item)
            if any(one_item[opt] != "" for opt in "ABCD"):  # skip items without options
                topic_no.append(int(tn))
                slave.append(one_item)
            one_item = _new_item()
            one_item[key] = text
            flag = abcd
    return slave, topic_no


def table_clear(con, k):
    """
    Strip HTML table markup from a line whose options sit inside a table.

    The line is only touched when it contains at least `k` "A. .. B. .. C. .. D."
    option sequences; otherwise it is returned unchanged.
    """
    option_rows = re.findall(r"A\..+?B\..+?C\..+?D\..+?", con)
    if len(option_rows) < k:
        return con

    # Drop the opening/closing table wrapper first ...
    without_wrapper = re.sub(r'<tbody><tr class="firstRow"><td>|</td></tr></tbody>', "", con)
    # ... then turn each cell/row tag up to the next option marker (or "<n>、")
    # into three spaces followed by that marker.
    return re.sub(r"</?t[dr]>.*?([BCD]\.|\d+、)", r"   \1", without_wrapper)


def reparse_cloze_structure(con_list):
    """
    Re-parse the raw lines of one cloze (完形填空) question.

    Every line is split into fragments and labelled, the leading block of
    unlabelled lines is taken as the passage, and the remaining fragments are
    grouped into sub-questions.

    :param con_list: list of text lines of the question
    :return: dict with keys 'content' (passage text), 'slave' (list of
        sub-question dicts), 'topic_no' ("first-last" string),
        'topic_type_id' (2) and 'topic_type_name' ('完形填空')
    """
    ty = "完形填空"
    items_all = []
    for k, line in enumerate(con_list):
        line = table_clear(line, k=10)
        one_item_con = [line]
        # Decide whether this is an option line whose B/C/D options are glued
        # together (seen in OCR output) and therefore must be split apart.
        if k > 0:
            word_n = len(re.findall(r"[a-zA-Z]+", line))
            opt_n = len(re.findall(r'[B-D]\.', line))
            if opt_n >= 3 or word_n <= (opt_n + 1) * 3:
                line = re.sub(r"([B-D]\.)", r"    \1", line)
                one_item_con = [i for i in re.split(r"\s{3,}", line) if i != ""]

        # Label every fragment of the line, then repair unknowns in-line.
        one_line_label = "".join(label_abcd(i.strip(), ty) for i in one_item_con)
        if len(one_line_label) > 1:
            one_line_label = infer_abcd_cloze(one_line_label, is_oneline=True)
        # Keep the fragments of a line in A-B-C-D order: OCR has been seen to
        # put C/D before A/B on the same line.
        items_all.extend(sorted(zip(one_line_label, one_item_con), key=lambda x: x[0]))

    label_all = [i[0] for i in items_all]
    con_all = [i[1] for i in items_all]
    label_str = "".join(label_all)
    label_str = essay_label_correct(label_str)

    # The first run of N labels followed by a non-N label delimits the passage.
    # (The match must be checked before it is used: it is None when the
    # sequence contains no such run.)
    ij = re.search(r"N+[^N]", label_str)
    if ij:
        split_at = ij.end() - 1
        essay = con_all[:split_at]
        opt = con_all[split_at:]  # option fragments
        label_str = label_str[split_at:]  # labels of the option fragments
    else:
        # No passage block found: arbitrarily take the first 3 fragments.
        print("完形填空短文少于3行:{}".format(con_all))
        essay = con_all[:3]
        opt = con_all[3:]
        label_str = label_str[3:]

    # Run the inference once more over the whole option sequence.
    label_str = infer_abcd_cloze(label_str)
    slave, topic_no = sub_item_group(label_str, opt, ty)

    # Sub-question filtering: with more than 20 items, a first number that
    # reappears among the last five probably came from a numbered blank in
    # the final sentences of the passage, so move that item back to the essay.
    count = -1
    while len(slave) > 20 and count < 5:
        count += 1
        if topic_no[0] in topic_no[-5:]:
            essay.append("\n".join([slave[0][i] for i in ['A', 'B', 'C', 'D'] if slave[0][i] != ""]))
            del slave[0]
            del topic_no[0]
        else:
            break
    topic_no = max_len_series(topic_no)

    # NOTE(review): this indexes topic_no by sub-question position, so it
    # assumes max_len_series returns at least len(slave) numbers — confirm.
    for i, s in enumerate(slave):
        s["topic_no"] = str(topic_no[i])
    if len(topic_no) > 2 and topic_no[-1] - topic_no[0] + 1 == len(slave):
        s_topic_no = "{}-{}".format(topic_no[0], topic_no[-1])
    else:
        # Numbers are not a clean consecutive range: use a placeholder range.
        s_topic_no = "{}-{}".format(101, 100 + len(slave))

    return {'content': "\n".join(essay), 'topic_type_name': '完形填空', 'slave': slave,
            'topic_type_id': 2, 'topic_no': s_topic_no}


def reparse_reading_structure(con_list):
    """
    Re-parse the raw lines of one reading-comprehension (阅读理解) question.

    Every line is split into fragments and labelled, the leading block of
    unlabelled lines is taken as the passage, and the remaining fragments are
    grouped into sub-questions whose numbers are then reconciled.

    :param con_list: list of text lines of the question
    :return: dict with keys 'content' (passage text), 'slave' (list of
        sub-question dicts), 'topic_no' ("first-last" string),
        'topic_type_id' (3) and 'topic_type_name' ('阅读理解')
    """
    ty = "阅读理解"
    items_all = []
    # Loop-invariant: longest line length among the first 10 lines.
    max_head_len = max(map(len, con_list[:10]), default=0)
    for k, line in enumerate(con_list):
        # Options placed in an HTML table directly after a "<n>、" stem line.
        if k > 1 and re.match(r"\d+、", con_list[k-1]):
            line = table_clear(line, k=1)
        one_line_con = [line]
        if k > 0:
            word_n = len(re.findall(r"[a-zA-Z]+", line))
            opt_n = len(re.findall(r'[B-D][\.．]', line))
            # Short lines with option markers are split into one fragment per option.
            if opt_n >= 3 or (word_n <= (opt_n + 1) * 8 and word_n < max_head_len):
                line = re.sub(r"([B-D][\.．])", r"        \1", line)
                one_line_con = re.split(r"\s{4,}", line.strip())

        # Label every fragment of the line, then repair unknowns in-line.
        one_line_label = "".join(label_abcd(i.strip(), ty) for i in one_line_con)
        if len(one_line_label) > 1:
            one_line_label = infer_abcd_rs(one_line_label, is_oneline=True)
        # Keep the fragments of a line in A-B-C-D order: OCR has been seen to
        # put C/D before A/B on the same line.
        items_all.extend(sorted(zip(one_line_label, one_line_con), key=lambda x: x[0]))

    label_all = [i[0] for i in items_all]
    con_all = [i[1] for i in items_all]
    label_str = "".join(label_all)
    label_str = essay_label_correct(label_str)

    # The first run of N labels followed by a non-N label delimits the passage.
    ij = re.search(r"N+[^N]", label_str)
    if ij:
        split_at = ij.end() - 1
        essay = con_all[:split_at]
        opt = con_all[split_at:]  # stems and options
        label_str = label_str[split_at:]  # their labels
    else:
        # No passage block found: arbitrarily take the first 3 fragments.
        print("阅读理解短文少于3行:{}".format(con_all))
        essay = con_all[:3]
        opt = con_all[3:]
        label_str = label_str[3:]

    label_str = infer_abcd_rs(label_str)
    slave, topic_no = sub_item_group(label_str, opt, ty)
    try:
        max_id = max(topic_no)  # ValueError when no sub-question was found
        min_id = min(topic_no)
        num_items = len(slave)
        s_topic_no = "{}-{}".format(min_id, max_id)
        if max_id - min_id + 1 != num_items:
            if max_id - 1 in topic_no or max_id - 2 in topic_no:
                # The maximum looks right, so the minimum is the wrong one.
                n_min_id = max_id - num_items + 1
                s_topic_no = "{}-{}".format(n_min_id, max_id)
                slave[0]["topic_no"] = str(n_min_id)  # overwrite the first item's number
            elif min_id + 1 in topic_no or min_id + 2 in topic_no:
                # The minimum looks right, so the maximum is the wrong one.
                n_max_id = min_id + num_items - 1
                s_topic_no = "{}-{}".format(min_id, n_max_id)
                slave[-1]["topic_no"] = str(n_max_id)
            else:
                topic_no = max_len_series(topic_no)
                if len(topic_no) >= 2:
                    s_topic_no = "{}-{}".format(topic_no[0], topic_no[-1])
                else:
                    s_topic_no = "{}-{}".format(100, 100 + len(slave))
                    print("阅读理解的题号严重有问题：{}".format(topic_no))

        return {'content': '\n'.join(essay), 'slave': slave, 'topic_type_id': 3,
                'topic_type_name': '阅读理解', 'topic_no': s_topic_no}
    except Exception:
        # Best-effort fallback with a placeholder number range. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        # NOTE(review): this range ends at 100 + len(slave) - 1 while the
        # placeholder above ends at 100 + len(slave) — confirm which is intended.
        return {'content': '\n'.join(essay), 'slave': slave, 'topic_type_id': 3,
                'topic_type_name': '阅读理解', 'topic_no': "{}-{}".format(100, 100 + len(slave) - 1)}


if __name__ == '__main__':
    # Manual smoke test: parse one sample reading-comprehension question and
    # pretty-print the structured result.
    #
    # NOTE(review): two sample literals below were physically split across
    # lines in the middle of a string. The stray break is assumed to be a
    # Unicode line separator and is written as the escape \u2028 so that each
    # literal stays on one physical line — confirm against the original data.
    #
    # Alternative sample inputs (kept commented out):
    # con_list = [ "A ship that sank off the coast of California decades ago was recently reconstructed in remarkable detail.The 3D digital model even included hundreds\xa0of sponges(海绵动物) that have gathered on the ship's' surface in the years\xa0Since it sank\n", 'Named A merican Heritage, the boat-a supply ship that once serviced\n', "oil platforms-sank in Santa Monica Bay on May 4, 1995, and for decades\xa0\xa0its precise location(位置) was unknown. Researchers with the Monterey Bay Aquarium Research Institute (MBARI) spotted a strange shape in that area in\xa02008. But it wasn't until May 2018 that MBARI scientists identified its precise\xa0location and mapped the site in greater detail, showing what appeared to be a\xa0shipwreck(失事船只). It measured197 feet long and rested nearly2300 feet\xa0\xa0below the surface.\n", 'Even then, the identity of the shipwreck was uncertain. Yet another MBARI team revisited the location to do further exploration.They sent remotely operated\xa0vehicles (ROVs) and took photos of the damaged ship. Though it was covered\xa0with deep-sea sponges and other animals, the scientists were able to spot\xa0\xa0letters spelling out its name, confirming that the shipwreck was American\xa0Heritage.\n', 'As one of the MBARI scientists who found American Heritage, chief\n', 'ROV pilot Knute Brekke had previously worked on the ship. And he was\xa0 on duty with the diving company American Pacific Marine\n', '-the owner of\xa0American Heritage -the night the ship began taking on water and eventually\xa0sank.\n', 'MBARI spokesperson Kim Fulton-Bennett said to Live Science about the\xa0discovery, "The model is not complete, as floating ropes and poor visibility\xa0kept the pilots from getting too close to the wreck. Nevertheless, the 3D\xa0\xa0reconstruction is detailed enough to show that American Hertage is now home\xa0\xa0to potentially thousands of sponges. \u2028Shipwrecks- accidental and intentional-often transform into the shelter for diverse communities of ocean life.\n', '12.what is the main idea of the text?\n', 'A.A valuable treasure was discovered.\n', 'B.Special sponges were found under sea.\n', 'C.3D model reconstructed sunken ship.\n', 'D.A sunken ship was gotten out of water.\n', '13.which is the right order of the following events?\n', '① something strange was found in the area\n', '②ROVs were sent under sea to take photos.\n', '③A ship sank in Santa Monica Bay.\n', '④The identity of the ship was confirmed.\n', '⑤Scientists tried to locate the shipwreck\n', 'A.②③⑤④①B.③①⑤②④C.⑤③①④②D.④③①②⑤\n', '14. What can we learn about Knute Brekke?\n', 'A. He was familiar with the sunken ship.\n', 'B\xa0He was in charge of diving company.\n', 'C He was responsible for the rescue work\n', 'D\xa0\xa0He was the first one to witness the accident.\n', "15. What's Kim Fulton-Bennett's attitude towards the 3D model?\n", 'A. Critical.\n', 'B. Doubtful.\n', 'C.Amazed\n', 'D.Objective\n', '.\n', '.\n']
    # con_list = ['Washington, D.C. Bicycle Tours\n', 'Cherry Blossom Bike Tour in Washington, D.C.\n',
    #                 'Duration Tour\n',
    #                 'This small group bike tour is a fantastic way to see a world-famous cherry trees with beautiful flowers of Washington, D.C. Your guide will provide a history lesson about the trees and the famous monuments where they blossom. Reserve your spot before availability — the cherry blossoms—disappear!\n',
    #                 'Washington Capital Monuments Bicycle Tour\n', 'Duration:3 hours (4 miles)\n',
    #                 'Join a guided bike tour and view some of the most popular monuments in Washington, D.C. Explore the monuments and memorials on the National Mall as your guide shares unique facts and history at each stop. Guided tour includes bike, helmet, cookies and bottled water.\n',
    #                 'Capital City Bike Tour in Washington, D.C.\n', 'Duration:3 hours\n',
    #                 'Morning or Afternoon, this bike tour is the perfect tour for D. C. newcomers and locals looking to experience Washington, D.C. in a healthy way with minimum effort. Knowledgeable guides will entertain you with the most ,interesting stories about Presidents, Congress, memorials, and parks. Comfortable bikes and a smooth tour route(路线）make cycling between the sites fun and relaxing.zxxk\n',
    #                 'Washington Capital Sites at Night Bicycle Tour\n', 'Duration:3 hours(7miles)\n',
    #                 'Join a small group bike tour for an evening of exploration in the heart of Washington, D.C. Get up close to the monuments and memorials as you bike the sites of Capitol Hill and the National Mall. Frequent stops are made for photo taking as your guide offers unique facts and history. Tour includes bike, helmet, and bottled water. All riders are equipped with reflective vests and safety lights.\n',
    #                 '21.Which tour do you need to book in advance?\n',
    #                 'A. Cherry Blossom Bike Tour in Washington, D.C.\n',
    #                 'B. Washington Capital Monuments Bicycle Tour.\n',
    #                 'C. Capital City Bike Tour in Washington, D.C.\n',
    #                 'D. Washington Capital Sites at Night Bicycle Tour.\n',
    #                 '22.What will you do on the Capital City Bike Tour?\n',
    #                 'A. Meet famous people. \xa0 \xa0B. Go to a national park.\n',
    #                 'C. Visit well-known museums. \xa0 \xa0D. Enjoy interesting stories.\n',
    #                 '23.Which of the following does the bicycle tour at night provide?\n',
    #                 'a. City maps. \xa0 \xa0 \xa0b. Cameras.\n', 'c. Meals. d. Safety lights.\n', 'A.b->a->a->d->c\n',
    #                 'B.c->b->a->d\n', 'C.d->c->b->a\n', 'D.a->b->d->c\n']

    # res = reading_structure(con_list)
    # pprint(res)

    # Active sample: a passage followed by three questions whose options are
    # wrapped in HTML tables.
    con_list = ['一、阅读理解(本部分共4大题，15小题，每小题2.0分，共30.0分)\n', 'A\n', 'Monthly Talks at London Canal Museum\n', 'Our monthly talks start at 19:30 on the first Thursday of each month except August.\n', 'Admission is at normal charges and you don’t need to book. They end around 21:00.\n', 'November_7th\n', 'The Canal Pioneers, by Chris Lewis. James Brindley is recognized as one of the leading early canal engineers. He was also a major player in training others in the art of canal planning and building. Chris Lewis will explain how Brindley made a positive contribution to the education of that group of early “civil engineers”.\n', 'December_5th\n', 'Ice for the Metropolis, by Malcolm Tucker. Well before the arrival of freezers, there was a demand for ice for food preservation and catering.\xa0Malcolm will explain the history of importing natural ice and the technology of building ice wells, and how London’s ice trade grew.\n', 'February_6th\n', 'An Update on the Cotsword Canals, by Liz Payne. The Stroudwater Canal is moving towards reopening. The Thames and Severn Canal will take a little longer. We will have a report on the present state of play.\n', 'March_6th\n', 'Eyots and Aits—Thames Islands, by Miranda Vickers. The Thames had many islands. Miranda has undertaken a review of all of them. \u2028She will tell us about those of greatest interest.\n', 'Online bookings:www.canalmuseum.org.uk/book\n', 'More info:www.canalmuseum.org.uk/whatson\n', 'London Cannal Museum\n', '12-13 New Wharf Road, London NI 9RT\n', 'www.canalmuseum.org.uk\xa0\xa0\xa0\xa0www.canalmuseum.mobi\n', 'Tel:02077130836\n', '1、When is the talk on James Brindley?\n', '<table><tbody><tr class="firstRow"><td>A.February 6th.</td><td>B.March 6th.</td><td>C.November 7th.</td><td>D.December 5th.</td></tr></tbody></table>\n', '2、What is the topic of the talk in February?\n', '<table><tbody><tr class="firstRow"><td>A.The Canal Pioneers.</td></tr><tr><td>B.Ice for the Metropolis</td></tr><tr><td>C.Eyots and Aits—Thames Islands</td></tr><tr><td>D.An Update on the Cotsword Canals</td></tr></tbody></table>\n', '3、Who will give the talk on the islands in the Thames?\n', '<table><tbody><tr class="firstRow"><td>A.Miranda Vickers.</td><td>B.Malcolm Tucker.</td><td>C.Chris Lewis.</td><td>D.Liz Payne.</td></tr></tbody></table>\n']

    res = reparse_reading_structure(con_list)
    pprint(res)