#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Source: HJ-AI / new_tiku_structure_2021

import re
from utils.washutil import table_label_cleal


def stem_ans_split(one_item_dict, case):
    """
    Split one already-isolated question into stem, answer key and analysis.

    The text in ``one_item_dict["stem"]`` is first passed through
    ``table_label_cleal`` and then cut on the bracketed headings
    (``【答案】``, ``【解析】``, ``【解答】`` ...).

    :param one_item_dict: preliminary structure of a single question; this
        function reads the ``"stem"`` (raw text) and ``"type"`` (question type)
        entries.
    :param case: ``'case0'`` when the text carries no ``【答案】`` heading; any
        other value means the ``【答案】`` heading is expected.  If the heading is
        nevertheless missing, the text is handled like ``'case0'`` instead of
        raising ``KeyError``.
    :return: ``{"stem": ..., "key": ..., "parse": ...}``.  When no key can be
        read directly, it is derived from the analysis via
        ``get_ans_from_parse``.
    """
    item_type = one_item_dict["type"]
    text = table_label_cleal(one_item_dict["stem"])

    if case != 'case0' and "【答案】" in text:
        # The answer heading is assumed to come before the analysis heading.
        stem, rest = re.split(r"【答案】\n?", text, maxsplit=1)
        answer_parts = re.split(r"【(解析|解答|分析|详解|点评|点睛)】\n?", rest, maxsplit=1)
        key = answer_parts[0]
        if len(answer_parts) == 3:
            parse = "【" + answer_parts[1] + "】" + answer_parts[2]
            # A leading 【解析】 heading is redundant inside the analysis field.
            parse = re.sub(r"^\s*【解析】", "", parse)
        else:
            parse = ""
    else:
        # With one capturing group re.split alternates: text, heading, text, ...
        pieces = re.split(r"【(解析|解答|分析|详解|点评|点睛|考点|专题)】\n*?", text)
        wrapped_titles = ('解答', '分析', '解析', '详解', '点评', '点睛')
        normalized = []
        for idx, piece in enumerate(pieces):
            piece = piece.strip()
            # Only captured headings (odd positions) get their brackets back;
            # 考点 / 专题 headings stay bare, as before.
            if idx % 2 == 1 and piece in wrapped_titles:
                piece = '【' + piece + '】'
            normalized.append(piece)

        stem = normalized[0]
        if len(normalized) > 1:
            parse = normalized[1] + "\n".join(normalized[2:]).replace("\n\n", "\n")
            parse = re.sub(r"^\s*【解析】", "", parse)
        else:
            # No analysis heading at all: the whole text is the stem.
            parse = ""
        key = ""

    # Drop the question number ("12." / "3、" ...) from the first 5 characters.
    stem = re.sub(r"[1-9][0-9]?\s*[.．、､]", "", stem[:5]) + stem[5:]

    # No explicit key: try to extract it from the analysis text.
    if not key:
        key = get_ans_from_parse(parse, item_type, stem)

    return {"stem": stem, "key": key, "parse": parse}


# Everything from the first 【点评】/【点睛】 heading onwards is commentary, not answer.
_PARSE_COMMENT_RE = re.compile(r"【点评】|【点睛】")
# An image placeholder sitting on its own at the very end of the analysis.
_PARSE_TRAILING_IMG_RE = re.compile(r'\n\s*<imgsrc\d+\sw_h=(\d+\*\d{3})/>\s*$')
# Question types (with "题" removed) that are answered by option letters.
_CHOICE_TYPES = ("单选", "多选", "选择", "不定选择")
_CHOICE_ANSWER_RE = re.compile(
    r'故选\s*[：:]?\s*<imgsrc\d+\sdata-latex="([A-Z;；和与、､\s]+)"/>'
    r'|故选\s*[：:]?\s*([A-Z;；和与、､\s]+)')
# Sub-question markers such as （1） / (ii) / ①.
_SUBQUESTION_RE = re.compile(
    r"[（(]\d[)）]|[\n:：;；。】]([（(](i{1,3}|[ⅰⅱⅲⅳⅠⅡⅢIV①②③④])[)）]|[①②③④]\s*(?![+-]))")
# Answer-extraction rules for non-choice questions, highest priority first.
# Each rule is (pattern, group holding the answer, search the LaTeX-stripped text?).
_ANSWER_RULES = (
    # A choice-style conclusion may also close e.g. an experiment question.
    (re.compile(r'故选\s*[：:]?\s*([A-Z;；和与、､\s]+)[.．；;。]?$'), 1, False),
    # The question type may have been mis-detected for a choice question.
    (re.compile(r'故选\s*[：:]\s*<imgsrc\d+\sdata-latex="([A-Z;；和与、､\s]+)"/>'), 1, True),
    (re.compile(r'(故|因[而此]|所以)\s*[：:]?\s*(答案分?别?[为是填]?|填)\s*[：:]?\s*(((?!(<img)).)+?)[.．]?\s*(\n|$)'),
     3, False),
    (re.compile(r'((?<!解)答\s*[：:]|整理得\s*[：:]?)\s*(.+?)([.．；;]?\s*$|[.．]\s*\n)'), 2, False),
    (re.compile(r'(故|因[而此]|所以)\s*[：:]?\s*(答案分?别?[为是填]?|填)\s*[：:]?\s*(<imgsrc.+?/>)[.．]?\s*(\n|$)', re.S),
     3, False),
    (re.compile(r'(故|因[而此]|所以)\s*[：:]?\s*(答案分?别?[为是填]?|填)\s*[：:]?\s*([^∴∵因所故即【】]+?)([.．]\s*(\n|$)|$)'),
     3, False),
    (re.compile(r'综上所述\s*[：:]\s*([^∴∵故因所即【】]+?)[.．；;]\s*$'), 1, False),
    (re.compile(r'(故|因[而此]|所以)\s*[：:]?[^当为是填∴∵故因所即【】]+?[为是填]\s*[：:]?\s*(<imgsrc.+?/>)[.．]?\s*(\n|$)', re.S),
     2, False),
    (re.compile(r'(故|因[而此]|所以|∴)\s*([^当为是填∴∵故因所即则【】]+?)[.．；;]\s*$'), 2, False),
    (re.compile(r'(故|因[而此]|所以|∴)\s*[：:]?.+?[为是填]\s*[：:]?\s*([^∴∵故因所即【】]+?)([.．；;，,]\s*$|[.．]\s*\n)'),
     2, False),
    (re.compile(r'[＝=](?!")(((?!([故＝∴即]|原式|因[而此]|所以|\n|=[^"])).)+?)[.．]?\s*$'), 1, False),
    (re.compile(r'原式\s*[＝=].+?[＝=](?!")(((?!(＝|=[^"])).)+?|\s*<imgsrc.+?/>)([.．]?\s*$|[.．]\s*\n)'), 1, False),
    (re.compile(r'\n\s*[＝=]([^＝\n]+?)[.．]?\s*$'), 1, False),
)


def get_ans_from_parse(item_parse, item_type, res_con):
    """
    Pick the answer out of a question's analysis text.

    :param item_parse: the complete analysis text; anything from the first
        ``【点评】`` / ``【点睛】`` heading onwards is ignored.
    :param item_type: question type, e.g. ``"单选题"`` or ``"填空题"``.  An empty
        (or ``None``) type yields an empty answer.
    :param res_con: the question stem; only inspected for fill-in-the-blank
        questions, to count the blanks.
    :return: the extracted answer, or ``"见解析"`` ("see analysis") when none
        can be extracted.
    """
    item_parse = _PARSE_COMMENT_RE.split(item_parse or "")[0].strip()
    # Peel off image placeholders that trail the analysis, one per pass.
    while _PARSE_TRAILING_IMG_RE.search(item_parse):
        item_parse = _PARSE_TRAILING_IMG_RE.sub("", item_parse)

    item_type = item_type or ""
    if item_type.replace("题", "") in _CHOICE_TYPES:
        found = _CHOICE_ANSWER_RE.search(item_parse.replace("$", ""))
        if not found:
            return "见解析"
        answer = found.group(1) if found.group(1) is not None else found.group(2)
        # The option character class also swallows trailing punctuation and
        # whitespace/newlines; neither belongs to the answer.
        answer = re.sub(r"[.;；．]\s*$", "", answer).strip()
        return answer or "见解析"

    if not item_type:
        return ""

    single_blank = (item_type == '填空题'
                    and len(re.findall(r"_{2,}|_+([^_]*?)_+", res_con or "")) == 1)
    has_sub_questions = len(_SUBQUESTION_RE.findall(item_parse.replace(" ", ""))) > 1
    # Multi-part answers and proofs cannot be condensed into a single key.
    if (not single_blank and has_sub_questions) or "证明" in item_parse:
        return "见解析"

    latex_free = item_parse.replace("$", "").replace("\\[", "").replace("\\]", "")
    for pattern, group, use_latex_free in _ANSWER_RULES:
        hit = pattern.search(latex_free if use_latex_free else item_parse)
        if hit:
            return hit.group(group)
    return "见解析"


# A row that heads the answer section (matched after noise removal).
_ANSWER_HEADING_RE = re.compile(
    r'(参考|试[题卷]|考试|物理|理综|数学|化学|生物)答案.{,5}$|答案[和与及]?解析([(（].*?[)）])?$'  # |答\s*案$
    r'|.{,15}(参考|考试|(考?试|检测)[题卷]|物理|理综|数学|化学|生物)(答案|解析|答案[及与和]评分(标准|意见|细则|参考))\s*$'
    r'|.{,15}评分(标准|参考)|((参考|(考?试|检测)[题卷]|考试|物理|理综|数学|化学|生物)答案|答案[和与及]解析)[\dA-E\s.．、､]+$'
    r'|.{,15}(参考|考试|(考?试|检测)[题卷])(答案|解析|答案[及与和]评分(标准|意见|细则|参考))\s*(物理|理综|数学|化学|生物)?\s*$'
    r'|.{,15}解析[和与及]答案$')
_ANSWER_HEADING_NOISE_RE = re.compile(
    r"[上下]?学[年期]|[\d—【】.．·、､：:(（）)年\s]|[中大]学|模拟|[中高]考|年级|[学期][末中]"
    r"|[高初][一二三]|部分")
# A row that looks like the title of an exam paper (matched after noise removal).
_PAPER_TITLE_RE = re.compile(
    r"([中高联月]考|单元测试|随堂练|(摸底|模拟|收心)?考试(评估)?|学期|[寒暑]假作业)[一二三四五六七八九试题（(卷)）\s]*?$")
_PAPER_TITLE_NOISE_RE = re.compile(
    r"[上下]?学[年期]度?|[\d—【】.．、､：:(（）)年第\s]|[中大]学|模拟|[中高]考|年级|[学期][末中]|[高初][一二三]")
# "第Ⅰ卷 (选择题)"-style heading, used when no paper title is found.
_PART_ONE_RE = re.compile(r"^\s*第\s*[一IⅠ]\s*卷\s*([（(]|非?选择题)")
# Header clutter directly above a section: paper title, seal line, name/class fields, blank row.
_HEADER_ROW_RE = re.compile(
    r"([中高联月]考|单元测试|随堂练|(摸底|模拟|收心)考试|学期)[试题（(卷)）\s]*?$"
    r"|密[…O•.\s]*?封[….O•\s]*?装?[…O•.\s]*?订?[….O•\s]*?线?"
    r"|((学校|班级|姓名|座位?号|准考号|学号)[\s：:_]*?){2,}|^\n*\s*\n*$")
# Answer heading that carries the choice answers on the same row.
_INLINE_ANSWERS_RE = re.compile(
    r"((参考|试[题卷]|考试|物理|理综|数学|化学|生物)答案|答案和解析)([\dA-E\s.．、､]+)$")
# A row that starts an answer sheet / specification table (matched after noise removal).
_SHEET_ROW_RE = re.compile(r".*?(答题?[卷卡页]纸?|试卷细目表)\s*$|\s*本卷.*?答题卡")
_SHEET_NOISE_RE = re.compile(r"[\d—【】年]|[中大]学|模拟|[中高]考|物理|理综|数学|化学|生物")


def _find_sheet_rows(rows):
    """Return the indices of the rows that start an answer sheet."""
    return [idx for idx, row in enumerate(rows)
            if _SHEET_ROW_RE.match(_SHEET_NOISE_RE.sub("", row.strip()))]


def _rewind_over_headers(rows, pos):
    """Move ``pos`` up past header rows directly above it, never leaving ``rows``."""
    while 0 < pos <= len(rows) and _HEADER_ROW_RE.search(rows[pos - 1]):
        pos -= 1
    return pos


def get_split_pos(row_list):
    """
    Locate where the questions end and the answers begin in a paper.

    :param row_list: the rows (strings) of the whole paper.  When the answer
        heading also carries the choice answers, they are inserted into this
        list as a separate row right after the heading.
    :return: the string ``"题文全是图片，本通道无法解析"`` when an answer heading
        occurs within the first 30 rows and fewer than 60 non-image characters
        precede it; otherwise the tuple
        ``(row_list, items_list, ans_list, is_may_ans)`` where ``row_list`` is
        cut at the first answer sheet, ``items_list`` / ``ans_list`` are the
        question and answer rows (both empty when no split point exists) and
        ``is_may_ans`` is 1 when the split point is only a paper title rather
        than an explicit answer heading.
    """
    # Candidate split points: rows carrying an explicit answer heading.
    split_p1 = [idx for idx, row in enumerate(row_list)
                if _ANSWER_HEADING_RE.match(_ANSWER_HEADING_NOISE_RE.sub("", row.strip()))]
    if split_p1 and split_p1[0] < 30:
        leading_text = re.sub(r"<imgsrc.*?/>|\s", "", "".join(row_list[:split_p1[0]])).strip()
        if len(leading_text) < 60:
            return "题文全是图片，本通道无法解析"
    split_p1 = [p for p in split_p1 if p > 30]
    print("答案split_p1:", split_p1)

    # Fallback candidates for papers without an answer heading: a second paper title.
    split_p0 = [idx for idx, row in enumerate(row_list)
                if _PAPER_TITLE_RE.search(_PAPER_TITLE_NOISE_RE.sub("", row.strip()))]
    split_p0 = [p for p in split_p0 if p > 30]
    print("试卷标题split_p0:", split_p0)
    if not split_p0:
        split_p0 = [idx for idx, row in enumerate(row_list) if _PART_ONE_RE.search(row.strip())]
        split_p0 = [p for p in split_p0 if p > 30]
        print("试卷标题split_p01:", split_p0)

    items_list, ans_list = [], []
    is_may_ans = 0
    if not split_p1 and split_p0:
        split_p1 = [split_p0[-1]]  # treat the last title as the start of the answers
        is_may_ans = 1

    if split_p1:
        heading_pos = split_p1[0]
        inline_answers = _INLINE_ANSWERS_RE.match(row_list[heading_pos])
        if inline_answers and re.findall("[A-E]", inline_answers.group(3)):
            # Keep answers written on the heading row by giving them their own row.
            row_list.insert(heading_pos + 1, inline_answers.group(3))
        items_list = row_list[:_rewind_over_headers(row_list, heading_pos)]
        ans_list = row_list[heading_pos + 1:]

        # An answer sheet may sit before the answers ...
        sheet_rows = _find_sheet_rows(row_list[:heading_pos])
        if sheet_rows:
            items_list = items_list[:_rewind_over_headers(items_list, sheet_rows[0])]
        else:
            # ... or after them.
            sheet_rows = _find_sheet_rows(ans_list)
            if sheet_rows:
                ans_list = ans_list[:_rewind_over_headers(ans_list, sheet_rows[0])]

    # Independently of any answer section, cut the full paper at the first answer sheet.
    sheet_rows = _find_sheet_rows(row_list)
    print("答题卡split_p2:", sheet_rows)
    if sheet_rows:
        row_list = row_list[:_rewind_over_headers(row_list, sheet_rows[0])]

    return row_list, items_list, ans_list, is_may_ans