#!/usr/bin/env python
# -*- coding:utf-8 -*-
# new_tiku_structure_v3_sci

import re, os
import configs
from utils.washutil import table_label_cleal
import numpy as np
from PIL import Image


def option2block(option_con, item_no_type):
    """
    Split the text of a multiple-choice item into its stem and option blocks.

    Option labels are first rewritten into explicit markers of the form
    ``【【X､】】`` (X in A-H) by a sequence of regex passes, then the text is
    split on those markers.

    Ideally option splitting would self-correct the way item-number splitting
    does, but then a letter mistyped by the teacher could break the parsing.

    :param option_con: raw item text (stem followed by options), possibly
        containing HTML tags such as ``<table>``, ``<p>`` and ``<img>``.
    :param item_no_type: label style selector; 1 handles ``A.``-style labels,
        2 handles ``(A)``-style labels. Type 1 is switched to 2 locally when
        at most two ``A.``-style markers but more than two ``(X)`` labels
        are found.
    :return: ``(con_list, con)`` where ``con`` is the marker-annotated text and
        ``con_list`` is ``con`` split on the markers. When at least one marker
        was found, element 0 is passed through ``table_label_cleal`` and the
        remaining elements have their table/div/p tags removed.
    """
    def del_table(ss):
        """Remove table/div/p tags; an opening cell ``<td><p>`` becomes a space first."""
        ss = re.sub(r"</?t[dr]>|</?tbody>|</?table>|</?div>|</?p>", "", ss.replace("<td><p>", " "))
        return ss
    # Options laid out in a table: when exactly one row starts with an "A" label
    # cell, flatten the markup from that point on and put it on its own line.
    # NOTE(review): the findall pattern starts at "<tr>" while the search pattern
    # requires "<table><tbody><tr>"; if the "A" row is not the first row of the
    # table the search returns None and .start() raises AttributeError - confirm.
    if '<table><tbody><tr>' in option_con and \
        len(re.findall('<tr><td><p>(A\s*[.．、､：:].+?|\(A\)\s*[.．、､]?.+?)</tr>', option_con.strip())) == 1:
        st_opt = re.search('<table><tbody><tr><td><p>(A\s*[.．、､：:].+?|\(A\)\s*[.．、､]?.+?)</tr>',
                           option_con.strip()).start()
        option_con = option_con.strip()[0:st_opt] + '\n' + del_table(option_con.strip()[st_opt:])
    # An "A" label that directly follows a closing </table> gets its marker here.
    option_con = re.sub(r"</table>\n*\s*(<p>)?\s*(A\s*[、､.．：:]|\(A\)\s*[、､.．]?)(.+?)", r"</table>【【A､】】\3",
                        option_con, flags=re.S)
    # No line starts with an upper-case "C" but one starts with a lower-case "c":
    # upper-case it so that it can be recognised as a label.
    if re.search("\n\s*C", option_con) is None and re.search("\n\s*c", option_con):
        option_con = re.sub("\n\s*c", "\nC", option_con)
    # A line made of <img .../> tag(s) immediately followed by an "A" label:
    # insert a newline so that the label starts its own line.
    option_con = re.sub(r"(\n\s*(<img\s*src=((?![/>]).)*?/>\s*)+?\s*)(A[、､.．：:].+?)", r"\1\n\4", option_con.strip())
    # An "A、" at the very beginning of the text must not be treated as a label,
    # hence the strip() (the pattern requires a preceding newline).
    con = re.sub(r"\n\s*([A-H])\s*[、､.．：:](.+?)", r"\n【【\1､】】\2", option_con.strip())
    if item_no_type == 1 and len(re.findall(r'【【[A-H]\s*[.．、､]】】', con)) <= 2 and \
            len(re.findall(r'\([A-H]\)', con)) > 2:  # stem uses the first label style, options use the second
        item_no_type = 2

    if item_no_type == 2:
        # Restart from the un-annotated text and mark line-initial "(X)" labels.
        con = re.sub(r"\n\s*\(([A-Hc])\)\s*[、､.．]?(.+?)", r"\n【【\1､】】\2", option_con)

    con = con.replace("</table>【【", "</table>\n【【")
    # With three or fewer markers so far, some labels are probably not at the
    # start of a line; the loops below look for them in other positions.
    # NOTE(review): each loop tests a copy of `con` with spaces removed but
    # substitutes on `con` itself; if the two patterns disagree the loop would
    # not terminate - confirm inputs cannot trigger this.
    if item_no_type == 1:
        if len(re.findall(r'【【[A-H]\s*[.．、､]】】', con)) <= 3:
            # Line-initial letter directly followed by an image.  (2020/7/15)
            while re.search(r"\n\s*[A-H]\s*<img\s*src=.+?", con.replace(" ", "")):
                con = re.sub(r"\n\s*([A-H])\s*(<img\s*src=.+?)", "\n" + r"【【\1､】】\2", con)
            # Label that follows a line-initial image.
            while re.search(r"(\n\s*<img\s*src=.+?)([A-H][.．、､])(.+?)", con.replace(" ", "")):
                con = re.sub(r"(\n\s*<img\s*src=.+?)(?<!【)([A-H]\s*[.．、､：:])(.+?)", r"\1" + "\n" + r"【【\2】】\3", con)
            # Two unmarked labels after a marked one, the first at the end of a line.
            while re.search(r"(\n【【[A-H][.．、､]】】.+?)(?<!【)([A-Hc][.．、､])\n+(.+?)(?<!【)([A-H][.．、､])(.+?)",
                            con.replace(" ", ""), re.S):
                con = re.sub(r"(\n\s*【【[A-H]\s*[.．、､]】】.+?)(?<!【)([A-H]\s*[.．、､：:])\s*\n+(.+?)"
                             r"(?<!【)([A-H]\s*[.．、､：:])(.+?)", r"\1【【\2】】\3【【\4】】\5", con, flags=re.S)
            # One unmarked label after a marked one, at the end of a line.
            while re.search(r"(\n【【[A-H][.．、､]】】.+?)(?<!【)([A-H][.．、､])\n+(.+?)", con.replace(" ", ""), re.S):
                con = re.sub(r"(\n\s*【【[A-H]\s*[.．、､]】】.+?)(?<!【)([A-H]\s*[.．、､：:])\s*\n+(.+?)",
                             r"\1【【\2】】\3", con, flags=re.S)
            # Unmarked label on the same line as a marked one.
            while re.search(r"(\n【【[A-H][.．、､]】】.+?)(?<!【)([A-H][.．、､])(.+?)", con.replace(" ", "")):
                con = re.sub(r"(\n\s*【【[A-H]\s*[.．、､]】】.+?)(?<!【)([A-H]\s*[.．、､：:])(.+?)", r"\1【【\2】】\3", con)
            while re.search(r"(\n【【[A-H][.．、､]】】[^【]+?/>\s+)(?<!【)([B-H][.．、､])(.+?)", con.replace(" ", ""), re.S):
                con = re.sub(r"(\n\s*【【[A-H]\s*[.．、､]】】[^【]+?/>\s+)(?<!【)([B-H]\s*[.．、､：:])\s*(.+?)",
                             r"\1【【\2】】\3", con, flags=re.S)  # option letter preceded by an image (9/8)
    if item_no_type == 2:
        # Same recovery passes for "(X)"-style labels.
        if len(re.findall(r'【【[A-H][.．、､]】】', con)) <= 3:
            while re.search(r"\n\s*\([A-H]\)\s*<imgsrc=.+?", con.replace(" ", "")):  # 2020/7/15
                con = re.sub(r"\n\s*\(([A-H])\)\s*(<img src=.+?)", "\n" + r"【【\1､】】\2", con)
            while re.search(r"(\n\s*<imgsrc=.+?)(\([A-H]\)[.．、､]?)(.+?)", con.replace(" ", "")):
                con = re.sub(r"(\n\s*<img src=.+?)\(([A-H])\)\s*[.．、､]?(.+?)", r"\1" + "\n" + r"【【\2､】】\3", con)
            while re.search(r"(\n【【[A-H]､】】.+?)\(([A-H])\)[.．、､]?\n+(.+?)\(([A-H])\)[.．、､]?(.+?)",
                            con.replace(" ", ""), re.S):
                con = re.sub(r"(\n\s*【【[A-H]､】】.+?)\(([A-H])\)\s*[.．、､]?\s*\n+(.+?)"
                             r"\(([A-H])\)\s*[.．、､]?(.+?)", r"\1【【\2､】】\3【【\4､】】\5", con, flags=re.S)
            while re.search(r"(\n【【[A-H]､】】.+?)\(([A-H])\)[.．、､]?\n+(.+?)", con.replace(" ", ""),re.S):
                con = re.sub(r"(\n\s*【【[A-H]､】】.+?)\(([A-H])\)\s*[.．、､]?\s*\n+(.+?)",
                             r"\1【【\2､】】\3", con, flags=re.S)
            while re.search(r"(\n【【[A-H]､】】.+?)\(([A-H])\)[.．、､]?(.+?)", con.replace(" ", "")):
                con = re.sub(r"(\n\s*【【[A-H]､】】.+?)\(([A-H])\)\s*[.．、､]?(.+?)", r"\1【【\2､】】\3", con)

    # Element 0 is everything before the first marker, the rest are the options.
    con_list = re.split(r"【【[A-H]\s*[.．、､]】】", con)

    if len(con_list) > 1:
        stem_opt = table_label_cleal(con_list[0])
        con_list = list(map(del_table, con_list[1:]))
        con_list.insert(0, stem_opt)  # tables in the stem are not stripped by del_table

    return con_list, con

recur_n = 1  # recursion counter shared by option_structure calls (module-level state)

def option_structure(one_item, con, ans, item_no_type, is_danti=0, is_slave=0):
    """
    Split the options of a multiple-choice item and store them in structured form.

    TODO (from the original author): still need to check that the number of
    options matches the item type.

    :param one_item: item dict; ``one_item["stem"]`` and ``one_item["errmsgs"]``
        (a list) are read, ``one_item["key"]`` is overwritten, and
        ``spliterr_point`` may be set when ``item_id`` is present.
    :param con: item text to split (stem followed by options).
    :param ans: raw answer string; ``#`` separates several answers.
    :param item_no_type: label style forwarded to ``option2block``
        (1 for ``A.``-style, 2 for ``(A)``-style).
    :param is_danti: when truthy, the "glued to the next item" checks are skipped.
    :param is_slave: when truthy, the generic format error message is not appended.
    :return: on success a NEW dict (copy of ``one_item`` with ``stem``,
        ``options`` and ``options_rank`` set); otherwise ``one_item`` itself,
        usually with a message appended to ``errmsgs``.
    """
    global recur_n
    # NOTE(review): recur_n is reset to 1 only in the branch below; the success
    # returns further down leave it at whatever value a previous recursion set.
    # Confirm that this carry-over between unrelated calls is intended.
    if recur_n>2:
        if 'options' not in one_item and not is_slave:
            one_item["errmsgs"].append("选项格式不正确")
        recur_n = 1
        return one_item

    # Normalise the answer: keep only the letters A-G of each '#'-separated part
    # that is non-empty and shorter than 8 characters (spaces ignored).
    ans = re.sub("[;；.]+", "", ans)
    ans2 = []
    for a in ans.split("#"):
        if 0<len(a.replace(" ", "")) < 8:
            ans2.append("、".join(re.findall(r"[A-G]", a)))
    one_item["key"] = "; ".join(ans2)
    options_rank = get_options_arrange(one_item["stem"])

    con_list, repl_con = option2block(con, item_no_type)
    # Preliminary screening: fewer than 4 options found and the first marker is
    # "B" - try to recover an "A" label that trails an image at the end of the stem.
    if len(con_list) < 5:
        opt_letter = re.findall(r"【【([A-H])\s*[.．、､]】】", repl_con)
        if opt_letter and opt_letter[0] == 'B' and re.search("<img src=.+?/>\s*A\s*[.．、､].+?$", con_list[0]):
            re_split = re.sub("(<img src=.+?/>)\s*A\s*[.．、､](.+?)$", r"\1【【A、】】\2", con_list[0])
            con_list[0] = re_split.split("【【A、】】")[0]
            con_list.insert(1, re_split.split("【【A、】】")[1])
    if len(con_list) >= 5:
        # Patterns for text that looks like the stem of a following item
        # ("... 是/为/有/等于 ( )" followed by a newline) inside an option.
        pattern_1 = re.compile(r"\s([1-9]|1[0-9])[.．、､].+?([是为有]|等于)[(（]\s*[)）]\n", re.S)
        pattern_2 = re.compile(r"\s\(([1-9]|1[0-9])\).+?([是为有]|等于)[(（]\s*[)）]\n", re.S)
        pattern_3 = re.compile(r"([是为有]|等于)[(（]\s*[)）]\n", re.S)
        # The first error targets items that have no answer/analysis section;
        # otherwise it indicates an option-splitting error.
        if not is_danti:
            if (item_no_type == 1 and any([True for op in con_list[1:] if re.search(pattern_1, op)])) or \
                    (item_no_type == 2 and any([True for op in con_list[1:] if re.search(pattern_2, op)])):
                one_item["errmsgs"].append("本题选项与下一题题干间没有换行符，请注意重新换行！！！")  # usually only one item is glued to its neighbour
                if 'item_id' in one_item:
                    one_item['spliterr_point'] = one_item['item_id']
                return one_item
            elif any([True for op in con_list[1:] if re.search(pattern_3, op)]):
                one_item["errmsgs"].append("本题的下一题的题号有问题，请注意重新输入！！！")
                if 'item_id' in one_item:
                    one_item['spliterr_point'] = one_item['item_id']
        # ------------------------------------------------------------------------
        aft_opt = []  # images that follow the last option are moved back to the stem
        if "\n" in con_list[-1]:
            ccon = re.split("\n+", con_list[-1])
            while re.match("<img src=", ccon[-1]) and len(ccon) > 1:
                aft_opt.insert(0, ccon[-1])
                ccon = ccon[:-1]
            if aft_opt:
                con_list[0] += "\n" + "\n".join(aft_opt)
                con_list[-1] = "\n".join(ccon)
        # -------------------------------------------------------------------------
        # Option correction.
        # Drop a "(N分)" score annotation if it occurs within the first 9 characters of the stem.
        con_list[0] = re.sub(r"\(\d+分\)", "", con_list[0][:9]) + con_list[0][9:]
        opt_letter = re.findall(r"【【([A-H])\s*[.．、､]】】", repl_con)
        # Accept when the sorted letters form a contiguous run of the alphabet
        # (substring test) or one of the listed four-of-five combinations.
        if "".join(sorted(opt_letter)) in "ABCDEFGHIJ" or "".join(sorted(opt_letter)) in ["ABCE", "ABDE", "ACDE", "BCDE"]:
            if con_list:
                return dict(one_item, **{"stem": con_list[0],
                                         "options": [re.sub("(<br/>|\n)\s*$|\s+$", "", i) for i in con_list[1:]],
                                         "options_rank": options_rank,
                                         })
        else:
            # Error handling for the first split attempt: try to repair the labels.
            con_list = option_label_correct(opt_letter, con_list, repl_con)
            if type(con_list) == str:
                # A string result is an error message.
                one_item["errmsgs"].append(con_list)
                return one_item
            else:
                if con_list:
                    return dict(one_item,
                                **{"stem": con_list[0],
                                   "options": [re.sub("(<br/>|\n)\s*$|\s+$", "", i) for i in con_list[1:]],
                                   "options_rank": options_rank,
                                   })
    else:
        # The options may be inside a table (or consist of images only).
        is_fail = 0
        con_list2 = re.split(r"\n+", con)
        errmsgs = ""
        if len(con_list2) == 2:  # options made of 4 images on a single line
            option_array = len(re.findall("(^|\n)<img src=.+?", con_list2[1].strip()))
            if option_array > 2:  # layout
                options_rank = 1
            elif option_array > 1:
                options_rank = 3
            else:
                options_rank = 2

            ims = con_list2[1].split("<img src=")
            if len(ims) == 5 and re.search(r"[\u4e00-\u9fa5]", ims[0]) is None:
                con_list2 = [con_list2[0] if k == 0 else "<img src=" + v
                             for k, v in enumerate(con_list2[1].split("<img src="))]  # the first piece of the "<img src=" split is dropped
                con_list2[0] = re.sub(r"\(\d+分\)", "", con_list2[0].replace(" ", "")[:9]) + con_list2[0][9:]
                return dict(one_item, **{"stem": con_list2[0],
                                         "options": [re.sub("(<br/>|\n)\s*$|\s+$", "", i) for i in con_list2[1:]],
                                         "options_rank": options_rank,
                                         })
            else:
                errmsgs = """选项格式不正确,请改为: A.xxxx  B.xxx 或 (A)xxxx (B)xxx,全文选项和题号格式要统一。
                                   【注意】1>>选项和题干间要换行,选项不要放在表格中；2>>选项【如A.】重新手输；3>>选项图片时用嵌入式；
                                           4>>选项太长时，每项之间要换行，上一项的内容不要与下一项在同一行！！"""
                is_fail = 1
        else:
            # Stem followed by exactly four lines that each start with an image.
            con_list3 = re.split(r"\n(?=<img)", con)
            if len(con_list3) == 5:
                return dict(one_item, **{"stem": con_list3[0],
                                         "options": [re.sub("(<br/>|\n)\s*$|\s+$", "", i) for i in con_list3[1:]],
                                         "options_rank": options_rank,
                                         })
            else:
                errmsgs = """选项格式不正确,请改为: A.xxxx  B.xxx 或 (A)xxxx (B)xxx,全文选项和题号格式要统一。
                                   【注意】1>>选项和题干间要换行,选项不要放在表格中；2>>选项【如A.】重新手输；3>>选项图片时用嵌入式；
                                           4>>选项太长时，每项之间要换行，上一项的内容不要与下一项在同一行！！"""
                is_fail = 1

        # Text after the last empty bracket pair is taken as the option part;
        # everything before it is re-joined as the stem and ends with "（  ）".
        op_con = re.split("[(（]\s*[)）]", con)[-1]
        stem_con = "".join(re.split("[(（]\s*[)）]", con)[:-1])+"（  ）\n"
        if is_fail:
            if "table" in op_con:
                # Exactly one table in the option part: strip its tags and retry.
                # NOTE(review): this retry neither increments recur_n nor
                # forwards is_slave, unlike the retry below - confirm intent.
                to_clean_con = re.findall('<table>(((?!(</?table>)).)*)</table>', op_con, re.S)
                if len(to_clean_con) == 1:
                    op_con = re.sub("</?table>|</?tr>|</?td>", "", op_con)
                    one_item = option_structure(one_item, stem_con+op_con, ans, item_no_type)
            else:
                # Exactly four distinct letters among A-E: add the missing
                # punctuation after each letter and retry.
                aa = re.findall("[A-E]", op_con)
                if len(aa)==len(set(aa)) == 4:
                    recur_n += 1
                    op_con = re.sub("(?<!\\\)([A-E])\s*(?![.．、､])", r"\1､", op_con)
                    one_item = option_structure(one_item, stem_con + op_con, ans, item_no_type, is_slave=is_slave)

            if not is_slave and 'options' not in one_item and "选项格式不正确" not in "".join(one_item["errmsgs"]):
                one_item["errmsgs"].append(errmsgs)

    return one_item


def get_options_arrange(cont):
    """
    Work out how many options per line the Word layout used.

    :param cont: item text (stem plus options), possibly containing a table.
    :return: layout code - 1 vertical (one option per line), 2 horizontal
        (all options on one line), 3 two options per line.
    """
    if '<table><tbody><tr>' in cont:
        # Options inside a table: count the labels found in the first matching row.
        label_rows = re.findall(r'<tr>.+?>([A-H]\s*[.．、､].+?|\([A-Z]\)\s*[.．、､]?.+?)</tr>', cont.strip())
        labels_in_row = 0
        if label_rows:
            labels_in_row = len(re.findall(r'[A-H]\s*[.．、､].+?|\([A-Z]\)\s*[.．、､]?.+?', label_rows[0]))
        if labels_in_row > 2:
            return 2
        if labels_in_row == 2:
            return 3
        return 1

    # Plain text: count the lines that begin with an option label.
    line_start_label = r"^\s*[A-H]\s*[.．、､].+?|^\s*[A-H]\s*<img src=.+?|^\s*\([A-Z]\)\s*[.．、､]?.+?"
    labelled_lines = sum(1 for line in cont.split("\n") if re.search(line_start_label, line.strip()))
    if labelled_lines == 2:
        return 3  # two per line
    if labelled_lines < 2:
        return 2  # horizontal
    return 1  # vertical


def new_options_rank(options):
    """
    Choose an option layout from the estimated printed width of the options.

    The target page is B5 paper (176 x 250 mm). Characters are measured at
    10.5 pt; characters in the narrow class (lower-case letters, digits and
    common ASCII punctuation) count as 3/4 of a full-width character, every
    other character counts as one full-width character.

    :param options: list of option strings (may contain ``<img .../>`` tags).
    :return: 0 when an option contains a ``$...$`` formula (width unknown),
        otherwise 1 (vertical), 2 (all options on one line) or 3 (two per
        line). 0 may also be returned when an image width is unknown and
        neither the one-line nor the two-per-line test passes.
    """
    pt_to_mm = 25.4 / 72
    px_to_mm = (25.4 / 72) * (3 / 4)
    char_mm = (10.5 / 72) * 25.4  # width of one full-width character
    usable_mm = 176 - 40  # page width minus the margin allowance
    narrow_char = re.compile(r"[a-z\d,.!?;'\-/:<>=*+$~%()\[\]{}\" ]")

    options_rank = 1  # vertical layout
    option_len = []
    for opt in options:
        if re.search(r"\$.*?\$", opt):
            return 0
        pic_len = []
        if "<img " in opt:
            for img in re.findall(r"<img src=.*?/>", opt):
                w_info1 = re.search(r' style=".*?width: (\d+[.\d]*?)\s*([pxtin]*?);.*?"', img)
                w_info2 = re.search(r' width="(\d+[.\d]*?)\s*([pxt]*?)"', img)
                if w_info1:
                    # A style width with an unrecognised unit contributes nothing.
                    if w_info1.group(2) == 'pt':
                        pic_len.append(pt_to_mm * float(w_info1.group(1)))
                    elif w_info1.group(2) == 'px':
                        pic_len.append(px_to_mm * float(w_info1.group(1)))
                    elif w_info1.group(2) == 'in':
                        pic_len.append(25.4 * float(w_info1.group(1)))
                elif w_info2:
                    pic_len.append(px_to_mm * float(w_info2.group(1)))
                else:
                    print("选项中存在图片宽高未知")
                    # Images without a size are mostly ones newly pasted in the
                    # editor and saved locally; read the width from the file.
                    w_info3 = re.search(r'<img src=.*?(/[^/]*?/new_image.*?)"', img)
                    if w_info3:
                        local_p = configs.IMG_FOLDER + w_info3.group(1)
                        if os.path.exists(local_p):
                            # Context manager so the file handle is released.
                            with Image.open(local_p) as local_img:
                                w = local_img.size[0]
                            pic_len.append(px_to_mm * float(w))
                        else:
                            print("选项中存在d的宽高未知图片不存在本地")
                            options_rank = 0
                    else:
                        options_rank = 0
                opt = opt.replace(img, "")

        # Character width: narrow characters first, the remainder is full width.
        char_en_l = len(narrow_char.findall(opt))
        opt = narrow_char.sub("", opt)
        char_zh_l = len(opt)
        char_len = char_mm * (char_en_l * 0.75 + char_zh_l)
        option_len.append(sum(pic_len) + char_len)

    # NOTE(review): a rank of 0 set above for an unknown image width is
    # overwritten when one of the tests below passes - confirm this is intended.
    if sum(option_len) + (len(options)*2 + (len(options)-1)*4)*(10.5/72)*25.4 < usable_mm:
        options_rank = 2
    else:
        widest = sorted(option_len, reverse=True)
        # Two per line needs at least two options; a single over-long option
        # keeps the current rank instead of raising IndexError.
        if len(widest) >= 2 and widest[0] + widest[1] + (2*2+1*4)*(10.5/72)*25.4 <= usable_mm:
            options_rank = 3

    return options_rank


def option_label_correct(opt_letter, con_list, con):
    """
    Repair an option split whose labels are repeated or out of sequence.

    Splitting too few options is reported as an error, so splits that produced
    too MANY pieces are repaired first.

    :param opt_letter: list of option letters in marker order; entries are
        overwritten in place with ``'0'`` for pieces judged to belong to the stem.
    :param con_list: marker-annotated text split on the markers (element 0 is
        the stem); modified in place - each piece gets its label text prefixed.
    :param con: the marker-annotated text itself.
    :return: the corrected ``[stem, option, ...]`` list, an error message
        string when the labels cannot be repaired, or ``None`` when a stem
        bracket was found but fewer than four pieces follow it.
    """
    lable_sign = re.findall(r"【【([A-H][.．、､])】】", con.replace(" ", ""))
    con_list2 = con_list.copy()  # pieces without their label text
    for i, j in enumerate(lable_sign):  # put the label text back in front of each piece
        con_list[i + 1] = j + con_list[i + 1]

    p1 = 0  # index in con_list where the real options start
    for k, v in enumerate(con_list[1:]):
        if re.search(r"[(（]\s*[)）]", v):  # a choice stem usually ends with an empty bracket pair
            opt_letter[k] = '0'
            p1 = k + 2
    if p1 and p1 < len(con_list[1:]):  # the '0' is not in the last position
        option_list = con_list2[p1:]
        if len(option_list) >= 4:
            # Everything up to the bracket is the stem. list.extend() returns
            # None, so the list has to be built by concatenation.
            return ["".join(con_list[:p1])] + option_list
        return None

    # Only the ABCD and ACBD orders are considered below.
    label_str = "".join(opt_letter)
    if re.match("A", "".join(opt_letter)) is None:
        label_str = re.sub("[^A]A", "0A", "".join(opt_letter), count=1)
    # -------------------------------------------------------------
    # No empty bracket in the item but the labels still contain "AA": decide
    # from the piece lengths whether the leading pieces belong to the stem.
    if re.search("AA", label_str):
        label_bcd_idx = [k for k, i in enumerate(label_str) if i != 'A']
        label_a_idx = [k for k, i in enumerate(label_str) if i == 'A']
        length_all = []
        for i1 in label_bcd_idx:  # images are collapsed to "<img>" before measuring
            l1 = len(re.sub(r"<img\s*src\s*=\s*((?!/>).)+?/>", "<img>", con_list2[i1+1]).replace(" ", ""))
            length_all.append(l1)
        aver_length = np.mean(length_all)
        st_a = label_str.index("AA")
        for i2 in label_a_idx:
            l2 = len(re.sub(r"<img\s*src\s*=\s*((?!/>).)+?/>", "<img>", con_list2[i2+1]).replace(" ", ""))
            if abs(l2 - aver_length) >= 12:
                if i2 >= st_a:
                    st_a = i2+1
        if st_a < len(label_str)-3:
            label_str = "".join(["0" if k < st_a else i for k, i in enumerate(label_str)])
    # -----------------------------------------------------------------
    # A label followed by an unexpected letter is treated as a repeat of itself.
    label_str = re.sub("A[^BC]", "AA", label_str)
    label_str = re.sub("B[^CD]", "BB", label_str)
    label_str = re.sub("C[^BD]", "CC", label_str)
    label_str = re.sub("D[^E]", "DD", label_str)

    # Merge the pieces that share a (repeated) label; keep the others as they are.
    new_con_list = [con_list[0]]
    local_w = 0
    while local_w < len(label_str):
        # Pieces marked '0' are dropped; the bound check keeps a run of
        # trailing '0's from indexing past the end of label_str.
        while local_w < len(label_str) and label_str[local_w] == '0':
            local_w += 1
        if local_w >= len(label_str):
            break
        double_num = label_str.count(label_str[local_w])
        if double_num >= 2:
            new_con_list.append(con_list2[local_w + 1] + "".join(con_list[2 + local_w:local_w+double_num + 1]))
        else:
            new_con_list.append(con_list2[local_w + 1])
        local_w += double_num
    new_opt_letter = label_str.replace('AA', "A").replace('BB', "B").replace('CC', "C").replace('DD', "D")
    if len(new_con_list) >= 4:
        if "".join(sorted(new_opt_letter)) in "ABCDEFGHIJ" or "".join(sorted(new_opt_letter)) in ["ABCE", "ABDE", "ACDE", "BCDE"]:
            return new_con_list
    return "选项格式不正确,1、请改为: A.xxxx  B.xxx,手动输入选项字母及后面的标点符号；" \
           "2.第一个选项A与题干之间要换行，各选项按ABCD排序；3.选项含图片时用嵌入式；"


def _option_cells_signature(cells):
    """Join cell texts with '#', dropping label punctuation and whitespace."""
    return re.sub(r"[.．、､，,\s]", "", "#".join(cells).strip()) + "#"


def _option_letters_from_signature(signature):
    """Map a cell signature to ['A'..'D'] or ['A'..'E']; anything else gives []."""
    if "A#B#C#D#E#F#" in signature:
        return []
    if "A#B#C#D#E#" in signature:
        return ["A", "B", "C", "D", "E"]
    if "A#B#C#D#" in signature:
        return ["A", "B", "C", "D"]
    return []


def table_option_struc(stem):
    """
    Derive option letters from a table that holds the options.

    This layout is common in chemistry multiple-choice items. The table itself
    is left untouched; only the option letters are derived from it.

    The first column of the LAST table in ``stem`` is checked first; when it
    does not contain the run A, B, C, D, the first row is checked instead.

    :param stem: item stem (HTML); tables are matched without ``re.S``, so a
        table has to sit on a single line to be found.
    :return: ``["A", "B", "C", "D"]``, ``["A", "B", "C", "D", "E"]`` or ``[]``
        when no such table layout is found (six or more letters also give ``[]``).
    """
    may_options = re.findall(r"<table>(((?!(</?table>)).)*)</table>", stem)
    if not may_options:
        return []
    options_data = may_options[-1][0]

    # Labels down the first column.
    first_col = re.findall(r"<tr><td>(.*?)</td>", options_data)
    col_signature = _option_cells_signature(first_col)
    if "A#B#C#D#" in col_signature:
        return _option_letters_from_signature(col_signature)

    # Labels along the first row; a table without any row has no options.
    data_rows = re.findall(r"<tr>(.*?)</tr>", options_data)
    if not data_rows:
        return []
    first_row = re.findall(r"<td>(.*?)</td>", data_rows[0])
    row_signature = _option_cells_signature(first_row)
    if "A#B#C#D#" in row_signature:
        return _option_letters_from_signature(row_signature)
    return []


if __name__ == '__main__':
    # Sample 1: chemistry item whose options are the first row of a table
    # (input for table_option_struc; the call is currently disabled).
    table_sample = """
     下列物质与危险化学品标志的对应关系不正确的是<br/><table><tr><td>A</td><td>B</td><td>C</td><td>D</td></tr><tr><td>汽油</td><td>天然气</td><td>浓硫酸</td><td>氢氧化钠</td></tr><tr><td><img src="files/image2.png" width="125px" height="116px" /></td><td><img src="files/image3.png" width="117px" height="117px" /></td><td><img src="files/image4.png" width="118px" height="119px" /></td><td><img src="files/image5.png" width="122px" height="118px" /></td></tr></table>
    """
    # print(table_option_struc(table_sample))

    # Sample 2: physics item with LaTeX formulas and the options on one line.
    sample_parse = ('【详解】根据题意可知，辐射出的光子能量$\\varepsilon = 3 . 5 2 \\times 1 0 ^ { - 1 9 } '
                    'J$，由光子的能量$\\varepsilon = h v$得<br/>$\\nu = \\frac { \\varepsilon } '
                    '{ h } = 5 . 3 1 \\times 1 0 ^ { 1 4 } H z$<br/>故选C。')
    sample_stem = ('近年来，江西省科学家发明硅衬底氮化镓基系列发光二极管，开创了国际上第三条$L E D$技术路线。某氮化镓基$L E '
                   'D$材料的简化能级如图所示，若能级差为$2.20\\text{eV}$（约$3 . 5 2 \\times 1 0 ^ { - 1 9 '
                   '} J$）,普朗克常量$h = 6 . 6 3 \\times 1 0 ^ { - 3 4 } J \\cdot '
                   's$，则发光频率约为（）<br/><img height="112px" src="/word/media/image5.png" '
                   'width="140px"/><br/>A．$6 . 3 8 \\times 1 0 ^ { 1 4 } H z$B．$5 . 6 7 '
                   '\\times 1 0 ^ { 1 4 } H z$C．$5 . 3 1 \\times 1 0 ^ { 1 4 } H z$D．$4 '
                   '. 6 7 \\times 1 0 ^ { 1 4 } H z$')
    demo_item = {
        'errmsgs': [],
        'key': 'C',
        'parse': sample_parse,
        'stem': sample_stem,
        'topic_num': 1,
        'type': '选择题',
    }
    # Structure the options of the demo item and show the result.
    demo_result = option_structure(demo_item, demo_item["stem"], demo_item["key"], 1)
    print(demo_result)