#!/usr/bin/env python
# -*- coding:utf-8 -*-

import re
from utils.item_type_line import get_item_head_info, get_item_head_info_cn
from utils.topic_no import judge_item_no_type, get_right_no, pre_get_item_no


def stems_structure_byno(stem_con, subject="", is_danti=0):
    """
    Split an exam paper that has no answer/analysis part into questions, by question number.

    :param stem_con: list of text lines of the paper. Leading header lines (everything
        before the first section-title line or numbered question line) are removed
        from this list IN PLACE when ``is_danti`` is falsy.
    :param subject: subject name. "语文" selects the Chinese-specific section-title rules;
        "地理" and "语文" additionally enable shared-material detection through
        ``split_with_comstem``.
    :param is_danti: when truthy the header-stripping step is skipped and, for the
        subjects above, the text before the first question number is attached to the
        first question as ``com_stem`` (presumably "single question" mode - confirm
        with callers).
    :return: normally the 7-tuple
        ``(res, all_type, item_type_classify, item_no_type, item_type_num, new_item_no, item_groups)``
        where ``res`` is a list of dicts with keys ``stem``, ``item_id``, ``type``,
        ``errmsgs`` and optionally ``com_stem``, ``susp_pic``, ``is_optional``.
        If scanning the header lines raises, an error-message string is returned instead.
    """
    head_cons = []
    if not is_danti:
        try:
            # A line is treated as header text when it contains no Chinese character, or
            # when it is neither a section-title line (rule depends on subject) nor the
            # start of a numbered question.
            while stem_con and ((re.search(r"[\u4e00-\u9fa5]", stem_con[0]) is None) or
                                (((subject != "语文" and re.search(r"[一二三四五]\s*[、.．､]\s*[^必考基础综合中等]{2,4}题", stem_con[0]) is None)
                                 or (subject == "语文" and
                                     re.search(r"[一二三四五]\s*[、.．､]\s*.{,4}(运用|阅读|写作|选择|单选|默写|语言表达|作文|综合|诗歌鉴赏)|^([\[【]题文[\]】]|阅读)", stem_con[0]) is None))
                                 and re.search(r"(^|\n)\s*[1-9][0-9]?\s*[.．、､]((?!(答题卡|涂黑|本卷|2B铅笔|签字笔|密封线)).)*?$",
                                               stem_con[0]) is None)):
                head_cons.append(stem_con[0])
                del stem_con[0]
        except Exception:
            # e.g. a non-string element in stem_con; KeyboardInterrupt/SystemExit are no
            # longer swallowed here.
            return "本份试卷开头格式有问题，请按试卷格式来！"

    head_cons = "\n".join(head_cons).strip()
    stem_str = "\n" + "\n".join(stem_con)

    # Conditional clean-up of question numbers: a line that starts with a bare number
    # (no separator after it) directly following a properly numbered question gets the
    # separator "､" inserted so it is later recognised as a question number.
    pattern_3 = re.compile(r"(\n\s*([1-9][0-9]|[1-9])\s*[.．、､]((?!\n\s*([1-9][0-9]|[1-9])\s*[.．、､]).)+?)"
                           r"\n+\s*([1-9][0-9]|[1-9])\s*(?![.．、､])([^\s\d]+)", re.S)
    wrong_id_info = pattern_3.search(stem_str)
    while wrong_id_info:
        stem_str = stem_str.replace(wrong_id_info.group(0),
                                    wrong_id_info.group(1)+"\n"+wrong_id_info.group(5)+"､"+wrong_id_info.group(6))
        wrong_id_info = pattern_3.search(stem_str)

    # 1. Collect section-title ("question type line") information and split by it.
    if subject == "语文":
        con11, title_info_dict, choice_class = get_item_head_info_cn(stem_str)
    else:
        con11, title_info_dict, choice_class = get_item_head_info(stem_str)

    all_type = title_info_dict["all_type"]
    title_type_num = title_info_dict["title_type_num"]
    select_type_id = title_info_dict["select_type_id"]
    each_item_score, each_item_score2 = title_info_dict["each_item_score"], title_info_dict["each_item_score2"]
    # -------------------------------------------------------
    res = []
    item_type_classify = {}  # number of questions per question type (merged across sections)
    item_type_num = []  # (type, count) per section, not merged
    pic_no = {}  # question number -> picture tag that has to be moved to that question
    new_item_no = []   # corrected question numbers
    item_groups = {"is_groups": 0, "groups_data": {}}  # where shared stems start (e.g. geography choice questions)
    if not all_type:
        print("不存在大题题型行或题型行格式或名称有问题")
        # Preliminary question numbers and numbering style.
        stem_str, item_no_info, item_no_type = judge_item_no_type(stem_str)
        # Positions of the accepted question numbers; split the text there.
        new_item_no, items_no_idx = get_right_no(item_no_info)
        one_item_split = [stem_str[i:j] for i, j in zip(items_no_idx, items_no_idx[1:] + [None])]
        if items_no_idx and stem_str[:items_no_idx[0]]:
            if is_danti or "【题文】" in stem_str[:items_no_idx[0]]:
                head_cons = stem_str[:items_no_idx[0]]
        for n, one_item in enumerate(one_item_split):
            dd = {}
            if subject in ["地理", "语文"]:
                res, com_stem, item_groups = split_with_comstem(head_cons, one_item_split, res, n, item_groups, subject)
                if com_stem:
                    dd.update(com_stem)
                elif is_danti and not n:  # 2022/2/24
                    dd.update({"com_stem": head_cons})
                    item_groups["is_groups"] = 1
                    item_groups["groups_data"][0] = ""
            dd["stem"] = one_item
            dd["item_id"] = new_item_no[n]  # the question's own number
            dd["type"] = ""   # type is left blank here; decided later from the answer
            dd["errmsgs"] = []
            res.append(dd)
            # Pictures are not re-distributed in this branch.
    else:
        if len(all_type) == len(con11)-1:  # the section title of the first part is missing
            all_type.insert(0, "")
            each_item_score.insert(0, 0)  # the per-title score is missing too; pad with 0
        elif len(all_type) != len(con11):
            # A section title did not end with a line break, so a question may share
            # its line with the title.
            print("第二种试卷格式：存在题型行没有换行")

        # ---------------------------------------------------------------------
        # Approach: correct the question numbers first, then split. Numbers are not
        # required to be consecutive.

        # Preliminary numbering style.
        stem_str, item_no_info, item_no_type = judge_item_no_type(stem_str)

        # Patterns for an option line glued to the next question's stem on one line.
        pattern_1 = re.compile(
            r"([CDE]\s*[.．、､].+?)(?<![:：])\s([1-9]|1[0-9])\s*[.．、､](.+?([是为有]|等于)[(（]\s*[)）]\n)")
        pattern_2 = re.compile(
            r"(([CDE]\s*[.．、､]|\([CDE]\)).+?)(?<![:：])\s\(([1-9]|1[0-9])\)(.+?([是为有]|等于)[(（]\s*[)）]\n)")

        for num, one_type in enumerate(con11):
            # all_type can be shorter than con11 in the "title without line break" case
            # warned about above; fall back to an empty type instead of raising IndexError.
            cur_type = all_type[num] if num < len(all_type) else ""
            # Preliminary question numbers of this section.
            item_no_info = pre_get_item_no("\n"+one_type, item_no_type)
            # Positions of the accepted question numbers.
            if res:
                items_no_temp, items_no_idx = get_right_no(item_no_info, have_type=1, last_id=res[-1]["item_id"])
            else:
                items_no_temp, items_no_idx = get_right_no(item_no_info)

            if not items_no_temp:
                # No usable number in this section: keep the whole section as one question.
                res.append({
                    "stem": one_type,
                    "item_id": res[-1]["item_id"]+1 if res else 1,
                    "type": cur_type,
                    "errmsgs": [],
                })
                continue

            is_from_0 = 1
            # Guard against question numbers missing at the start of the section.
            # new_item_no may still be empty when every earlier section took the
            # "no number" path above, hence the extra truthiness check.
            if items_no_temp and items_no_idx[0] != 0 and ((not num and items_no_temp[0] > 1) or
               (num and new_item_no and items_no_temp[0] > new_item_no[-1]+1)):
                items_no_idx.insert(0, 0)
                is_from_0 = 0
            one_item_split = [("\n" + one_type)[i:j] for i, j in zip(items_no_idx, items_no_idx[1:] + [None])]
            # Text in front of the first question: material shared by several questions (2021-11-3).
            head_item = one_type[0:items_no_idx[0]]
            if not head_item.strip() and head_cons.strip() and not num:  # 2022.2.24
                head_item = head_cons

            # ------ a picture in front of the first question of a section may get lost ------
            may_oimt_pic = []
            if not is_from_0:
                may_oimt_pic_info = re.search(r"\n(<imgsrc.+?/>(\s*<imgsrc((?!/>).)+?/>)*?)\s*\n?$", one_item_split[0])
                if may_oimt_pic_info:
                    if len(one_item_split) > 1 and re.search(r"如[上下左右]?图", one_item_split[1]):
                        may_oimt_pic.extend(re.findall(r"<imgsrc.+?/>", may_oimt_pic_info.group(1)))
            # ------------------------------------------------------------------
            if len(all_type) == len(con11):
                if not is_from_0:
                    if all_type[num] and all_type[num].replace("题", "") not in ['选择', '单选', '多选', '不定选择']:
                        one_item_split = one_item_split[1:]
                    else:  # choice section whose first question number(s) are missing
                        one_item_split1 = one_item_split[1:]
                        # Split further where the previous question's option line and the
                        # next question's stem were not separated by a line break.
                        new_one_item_split = []
                        for nn, one_item in enumerate(one_item_split1):
                            hit_1 = pattern_1.search(one_item) if item_no_type == 1 else None
                            hit_2 = pattern_2.search(one_item) if item_no_type == 2 else None
                            if hit_1:
                                err_optcon = pattern_1.sub(r"\1【】\3", one_item)  # crude: only one number is recorded
                                new_one_item_split.extend(err_optcon.split("【】"))
                                items_no_temp.insert(nn+1, int(hit_1.group(2)))
                            elif hit_2:
                                err_optcon = pattern_2.sub(r"\1【】\4", one_item)
                                new_one_item_split.extend(err_optcon.split("【】"))
                                items_no_temp.insert(nn + 1, int(hit_2.group(3)))
                            else:
                                new_one_item_split.append(one_item)

                        # The un-numbered leading chunk is kept as a question when it starts
                        # with an option line or refers to a figure / contains an answer blank.
                        if re.match(r'(^|\n)+\s*[A-Z]\s*[.．、､]|(^|\n)+\s*[(（]\s*[A-Z]\s*[)）]\s*[.．、､]?',
                                    str(one_item_split[0]).strip(), re.S) or \
                           re.search(r"如[上下左右]图|[(（]\s+[）)]\s*($|\n)", one_item_split[0].strip()):
                            new_one_item_split.insert(0, one_item_split[0])
                            items_no_temp.insert(0, items_no_temp[0]-1)
                        one_item_split = new_one_item_split
                    # ----------------------------------------------------------
                if all_type[num] in item_type_classify:  # count questions per type
                    item_type_classify[all_type[num]] += len(one_item_split)
                elif all_type[num]:
                    item_type_classify[all_type[num]] = len(one_item_split)
                item_type_num.append((all_type[num], len(one_item_split)))
            else:
                if not is_from_0:
                    one_item_split = one_item_split[1:]

            new_item_no.extend(items_no_temp)
            # ---------------------------------------------------------------
            for nn, one_item in enumerate(one_item_split):
                dd = {}
                # Material shared by several questions (2021-11-3).
                if subject in ["地理", "语文"]:
                    res, com_stem, item_groups = split_with_comstem(head_item, one_item_split, res, nn, item_groups, subject)
                    if com_stem:
                        dd.update(com_stem)
                dd["stem"] = one_item
                # Attach the possibly orphaned pictures to the first question of the section.
                if nn == 0 and may_oimt_pic:
                    dd['susp_pic'] = may_oimt_pic
                dd["item_id"] = items_no_temp[nn]
                dd["errmsgs"] = []
                # NOTE(review): this searches for the literal 13-character string, not for any
                # of its characters; it looks like a character class was intended (compare the
                # header regex above). Left unchanged because switching would blank many types.
                if cur_type and re.search(r"必考基础综合中等面列下各非", cur_type) is None:
                    dd["type"] = cur_type if re.sub(r'[(（]', "", cur_type) != '本大题' else "解答题"
                else:
                    dd["type"] = ""
                if choice_class:
                    # The section title may state which numbers are single/multiple choice.
                    for k, v in choice_class.items():
                        if dd["item_id"] in v:
                            dd["type"] = k + "选题"
                if select_type_id and dd["item_id"] in select_type_id:
                    dd['is_optional'] = 'true'
                res.append(dd)

                # Several pictures followed by a run of "第N题图" captions: remember which
                # picture belongs to which question and strip them from this stem.
                pic_info = re.search(r"(<imgsrc\d.+?)\n+\s*((第?\(?([1-9]|[1-9][0-9])\)?\s*题图.?\s*){2,})", one_item, re.S)
                if pic_info:
                    pic_list = re.findall(r"<imgsrc\d.*?/>", pic_info.group(1))
                    pic_w = re.findall(r"([1-9]|[1-9][0-9])[)）]?\s*(?=题)", pic_info.group(2))
                    if len(pic_list) >= len(pic_w):
                        # Accumulate instead of overwriting, otherwise pictures stripped from
                        # an earlier question are never re-attached.
                        pic_no.update({int(p): pic_list[len(pic_list) - len(pic_w) + k]
                                       for k, p in enumerate(pic_w)})
                        dd["stem"] = dd["stem"].replace(pic_info.group(2), "")
                        for k, pic in enumerate(pic_list[::-1]):
                            if k < len(pic_w):
                                dd["stem"] = dd["stem"].replace(pic, "")
                # -----------------------------------------------------
    if pic_no:
        for i in list(pic_no.keys()):
            # The caption number is used as a 1-based position in res; skip numbers that
            # do not correspond to an existing entry instead of raising IndexError.
            if 0 < i <= len(res):
                res[i-1]["stem"] = res[i-1]["stem"].strip() + "\n" + pic_no[i] + "\n" + "第" + str(i) + "题图"
    # ----------------------------------------------------------------------
    # Final sanity check on the number of questions: when far too many questions were
    # produced, keep only the sections whose count agrees with the count announced in
    # the section title.
    new_res = []
    right_type = []  # (section index, type) of sections whose count matches
    if all_type and sum(list(item_type_classify.values())) > 40:  # probably many false question numbers
        title_type_num_all = sum([t[1] for t in title_type_num])  # total announced in the titles
        if title_type_num_all > 0:
            for idx, type_num in enumerate(item_type_num):
                if idx >= len(title_type_num):
                    break  # no announced count for the remaining sections
                if type_num[1] == title_type_num[idx][1] > 0:  # counts agree
                    right_type.append((idx, type_num[0]))
                elif title_type_num[idx][1] == 0 and type_num[1] == 1:
                    title_type_num[idx][1] = 1
                    right_type.append((idx, type_num[0]))
        if right_type:
            for blk_idx, blk_type in right_type:
                # Position range of this section's questions inside res.
                r_st = sum([t[1] for t in item_type_num[:blk_idx]])
                r_ed = r_st + item_type_num[blk_idx][1]
                new_res.extend(item for item in res[r_st:r_ed] if item["type"] == blk_type)

            res = new_res
            item_type_num = title_type_num   # trust the counts announced by the paper
            new_item_type_classify = {}
            for i in right_type:
                if title_type_num[i[0]][0] not in new_item_type_classify:
                    new_item_type_classify[title_type_num[i[0]][0]] = title_type_num[i[0]][1]
                else:
                    new_item_type_classify[title_type_num[i[0]][0]] += title_type_num[i[0]][1]
            item_type_classify = new_item_type_classify

    return res, all_type, item_type_classify, item_no_type, item_type_num, new_item_no, item_groups


def split_with_comstem(head_cons, one_item_split, bef_res, pc_idx, item_groups, subject):
    """
    Detect material shared by several questions ("common stem") in front of question ``pc_idx``.

    The text examined is ``head_cons`` for the first question (``pc_idx == 0``) and the
    stripped text of the previous split (``one_item_split[pc_idx - 1]``) otherwise, because
    a shared passage that introduces the next questions ends up at the tail of the
    previous question's chunk.

    :param head_cons: text in front of the first question of the section
    :param one_item_split: list of question chunks obtained by splitting on question numbers
    :param bef_res: questions structured so far; the ``stem`` of its last entry may be
        overwritten in place with the text that remains after removing the shared passage
    :param pc_idx: index of the current question inside ``one_item_split``
    :param item_groups: group bookkeeping, updated in place, e.g.
        ``{'is_groups': 1, 'groups_data': {0: 'fei', 8: '', 10: '6-8'}}`` where the keys of
        ``groups_data`` are positions in ``bef_res`` at which a group starts
    :param subject: subject name; "地理" and "语文" have dedicated rules
    :return: ``(bef_res, one_stem, item_groups)`` where ``one_stem`` is either ``{}`` or
        ``{"com_stem": <shared passage>}``
    """
    range_pat = r"(完成|回答)下?[面列]?的?第?(\d{1,2})[-到至第～~－]+?(\d{1,2})小?题"
    cn_intro_pat = r"(完成|回答)下?[面列]?的?.*?[\d小各]题.{,2}\n|\n\s*阅读.*?按要求[作回]答.{,2}\n"
    answer_mark_pat = r"\n\s*[\[【]\s*(答案|解析)"
    score_note_pat = r"[（(][本题共\d小\s]*?\d{1,2}分\s*[)）].?"

    if not pc_idx:  # first question of the section (or no section title at all)
        mix_con = head_cons
    else:
        mix_con = one_item_split[pc_idx - 1].strip()

    groups_data = item_groups["groups_data"]

    # "complete questions N-M": explicit range of the group.
    common_stem_info1 = re.search(range_pat, mix_con.replace(" ", "").replace("\n", ""))
    st_2 = ed_2 = 0
    if subject == "语文":
        # Locate the LAST introductory sentence in the chunk.
        common_stem_info2 = re.search(cn_intro_pat, mix_con)
        if common_stem_info2:
            st_2, ed_2 = common_stem_info2.start(), common_stem_info2.end()
            later = re.search(cn_intro_pat, mix_con[ed_2:])
            while later:
                st_2 = later.start() + ed_2
                ed_2 = later.end() + ed_2
                later = re.search(cn_intro_pat, mix_con[ed_2:])
    else:
        # Introductory sentence at the very end of the chunk, optionally followed by pictures.
        common_stem_info2 = re.search(
            r"(完成|回答)下?[面列]?的?.*?[\d小各]题.{,2}\n*?(<img\s*src.*?/>[\s\n]*?)*?.{,2}$",
            mix_con, flags=re.S)

    bef_con = ""
    one_stem = {}
    if "【题文】" in mix_con and len(mix_con.split("【题文】", maxsplit=1)[-1]) > 10:
        item_groups["is_groups"] = 1
    if common_stem_info2:
        if re.search(answer_mark_pat, mix_con[st_2:]) is None:
            item_groups["is_groups"] = 1
        if common_stem_info1:
            st = common_stem_info1.group(2)
            end = common_stem_info1.group(3)
            if not groups_data:
                groups_data[0] = "{}-{}".format(st, end)
            else:
                groups_data[len(bef_res)] = "{}-{}".format(st, end)
        elif subject == "地理":
            groups_data[len(bef_res)] = ""

    if not (item_groups["is_groups"] or subject == "语文"):
        return bef_res, one_stem, item_groups

    com_stem = ""
    if re.search("【题文】", mix_con):
        bef_con, com_stem = mix_con.split("【题文】", maxsplit=1)
    else:
        com_id = sorted(groups_data)
        # "answer the following questions": a big question with several sub-questions.
        common_stem_info3 = re.search(r"(完成|回答)下?[面列]?的?.*?问题.{,2}\n", one_item_split[pc_idx])
        # End positions of things that close the previous question: a concluding
        # "so choose X" sentence, an option verdict, or a block of option lines.
        com_stem_idx = [i.end() for i in re.finditer(r"(所以|故而?|答案).{,3}(选择?\s*[A-Fa-f]选?项?.{,2}"
                                                     r"|选项\s*[A-Fa-f]+\s*正确|[A-Fa-f]+\s*项正确).{,2}($|\n\s*【点睛】\s*\n.+?\n|\n)"
                                                     r"|(选?项?[A-Fa-f]+项?错误?|[A-D]正确|排除\s*[A-D]"
                                                     r"|[A-D][A-D、､]*?不符?合题意).{,2}\s*($|\n\s*【点睛】\s*\n.+?\n|\n)"
                                                     r"|(\n\s*[A-E]\s*[.．、､：:][^\n]*?\n?)+(\n|$)", mix_con,
                                                     flags=re.S)]   # first alternative extended 2022-2-14
        # Start of an indented paragraph also counts as a possible boundary (added 2024-9-12).
        com_stem_idx_2 = [i.start() for i in re.finditer(r"\n<p style=\"text-indent: 2em;\">", mix_con,
                                                         flags=re.S)]
        com_stem_idx.extend(com_stem_idx_2)
        com_stem_idx.sort()
        if subject == "地理" and common_stem_info3:
            # Close the range of the most recent group, if there is one to close.
            if com_id:
                groups_data[com_id[-1]] = "{}-{}".format(com_id[-1] + 1, len(bef_res)-1)
        elif com_stem_idx and subject == "地理":
            com_stem = mix_con[com_stem_idx[-1]:]
            bef_con = mix_con[:com_stem_idx[-1]]
        elif not pc_idx:
            com_stem = mix_con
            if len(com_stem) < 25 and re.search("任选一题", com_stem):
                com_stem = ""
            if not com_stem:
                groups_data[len(bef_res)] = "fei"
        elif subject == "语文":
            # Try progressively looser ways of cutting the chunk at a sub-section heading
            # such as "(二) ..." or an introductory "阅读..." sentence.
            mix_con_list = re.split(r"(\n\s*阅读.*?按要求[作回]答.{,2}\n"
                                    r"|\n\s*阅读下[面列].*?[完成回答]+各题.{,2}\n"
                                    r"|[(（][一二三四五][)）]\s*.{,6}\n|[(（][一二三四五][)）]\s*.{,6}$)",
                                    re.sub(score_note_pat, "", mix_con))
            if len(mix_con_list) == 1:
                mix_con_list = re.split(r"([(（][一二三四五][)）].*?[完成回答]+下?[面列]?的?.*?[\d小]\s*?题.{,2})\n",
                                        re.sub(r"[（(]\s*\d{1,2}\s*分\s*[)）].?", "", mix_con))
            if len(mix_con_list) == 1:
                mix_con_list = re.split(r"\n+\s*[(（][一二三四五][)）]",
                                        re.sub(score_note_pat, "", mix_con))
            if len(mix_con_list) >= 3:
                com_stem = "\n".join(mix_con_list[-2:])
                # A "(一/二/...)" heading does not necessarily carry shared material.
                if re.search(r"[(（]\s*[一二三四五]\s*[)）](语言文字运用|微?写作|选择|单选|语言表达|作文"
                             r"|.{,4}(文本阅读|诗歌阅读|文言文阅读|名著阅读|默写)题?).{,10}$", com_stem.strip()):
                    com_stem = ""
                bef_con = "".join(mix_con_list[:-2])
                # groups_data may still be empty here; then there is no previous group to close.
                last_key = com_id[-1] if com_id else None
                if last_key is None or not groups_data[last_key]:
                    if last_key is not None:
                        groups_data[last_key] = "{}-{}".format(last_key+1, len(bef_res))
                    common_stem_info4 = re.search(range_pat,
                                                  mix_con_list[-2].replace(" ", "").replace("\n", ""))
                    if common_stem_info4:
                        # Use the range found in the heading itself (common_stem_info4), not
                        # the first range found anywhere in the chunk (common_stem_info1),
                        # which may be None or belong to an earlier passage.
                        groups_data[len(bef_res)] = "{}-{}".format(
                            common_stem_info4.group(2), common_stem_info4.group(3))
            elif common_stem_info2 and re.search(answer_mark_pat, mix_con[st_2:]) is None:
                groups_data[len(bef_res)] = ""
                # NOTE(review): "[完成回答]" is a character class here while the search above
                # uses "(完成|回答)"; kept as is because adding a group would change re.split output.
                common_stem_list = re.split(r"([完成回答]下?[面列]?的?.*?[\d小各]题.{,2}\n"
                                            r"|\n\s*阅读.*?按要求[作回]答.{,2}\n)", mix_con)
                if len(common_stem_list) > 3:
                    com_stem = common_stem_list[-1]
                    mix_con = "".join(common_stem_list[:-1])
                else:
                    com_stem = mix_con[st_2:]
                    mix_con = mix_con[:st_2]  # not fully separated yet; refined below
                paras = [para for para in mix_con.split("\n") if para.strip()]  # drop blank lines
                if len(paras) > 1:
                    com_stem = paras[-1] + com_stem
                    bef_con = "\n".join(paras[:-1])
                else:
                    bef_con = mix_con
        else:
            mix_con_list = re.split(r"(详解】|解析】|答案】)", mix_con)
            if len(mix_con_list) > 2:  # the chunk contains an analysis part
                bef_con = "".join(mix_con_list[:-1])
                mix_con = mix_con_list[-1]
            else:
                mix_con = mix_con_list[0]
            paras = [para for para in mix_con.split("\n") if para.strip()]  # non-blank paragraphs
            if len(paras) > 1:
                if len(paras) == 2 or len(paras[-1]) >= 20:  # two paragraphs, or a long last one
                    com_stem = paras[-1]
                    if re.search(r"^\s*[（(]\s*\d\s*[）)]", com_stem):
                        com_stem = ""  # a "(1)" sub-question, not shared material
                    else:
                        bef_con = bef_con + "\n".join(paras[:-1])
                else:
                    com_stem = "\n".join(paras[-2:])
                    bef_con = bef_con + "\n".join(paras[:-2])
            else:
                bef_con = bef_con + "\n".join(paras)
                # com_stem stays empty here

    if subject == "地理":
        if com_stem:
            one_stem["com_stem"] = com_stem
            if bef_res and pc_idx:  # never for the first question
                bef_res[-1]["stem"] = bef_con
    else:
        if re.sub(r"</?tbody>|</?table>|\n", "", com_stem):
            one_stem["com_stem"] = com_stem
            item_groups["is_groups"] = 1
            if len(bef_res) not in groups_data:
                groups_data[len(bef_res)] = ""
            if bef_res and bef_con:
                bef_res[-1]["stem"] = bef_con

    return bef_res, one_stem, item_groups