cdZWj
/
new_tiku_structure_2021


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import re


# Content that consists solely of image tags (plus whitespace); such items are
# never stripped, because the pictures ARE the question text.
_ONLY_PICS_RE = re.compile(r"\n*\s*(<imgsrc.*?/>[\s\n]*)*?$")
# Detects a run of ``<imgsrcN w_h=.../>`` tags that closes the content.
_TRAILING_PICS_RE = re.compile(
    r"\n(<imgsrc\d+ w_h=[^A-Z$]*?/>\s*(<imgsrc\d+ w_h=[^A-Z$]*?/>)*?)\n?$"
)
# Cuts the closing run of image tags off the content.
_TRAILING_PICS_CUT_RE = re.compile(r"\n<imgsrc.+?/>\s*(<imgsrc.+?/>)*?\n?$")
# A single image tag.
_PIC_TAG_RE = re.compile(r"<imgsrc.+?/>")
# Wording that announces a figure ("as shown in the figure", "in the figure below", ...).
_PIC_HINT_RE = re.compile("如[上下左右]?图|下[面列]图中")


def _reattach_own_pics(items, trailing_pics, idx):
    """Append the detached trailing images of item *idx* back onto that item."""
    items[idx]["content"] += "\n" + "\n".join(trailing_pics[idx])
    trailing_pics[idx] = ""  # consumed, so nobody can use them a second time


def _hand_over_last_pic(items, trailing_pics, src, dst):
    """Give the last trailing image of *src* to *dst*; the others return to *src*."""
    pics = trailing_pics[src]
    items[dst]["content"] += "\n" + pics[-1]  # by default only one image moves
    if len(pics) > 1:
        items[src]["content"] += "\n" + "\n".join(pics[:-1])
    trailing_pics[src] = ""  # consumed


def img_regroup(item_list):
    """
    Detect images attached to the wrong question and move them to their owner.

    A question whose text announces a figure but contains no image, while the
    previous question does contain one, is taken as a sign that the figure was
    glued to the end of the previous question during the first split.  The
    trailing images of every question are therefore detached first, handed to
    the question that needs them, and whatever is left over is re-attached to
    the question it came from.  Questions whose content is nothing but images
    are left alone.

    The item dicts are modified in place: the returned list holds the very
    same dict objects as ``item_list``.

    :param item_list: result of the first question split; a list of dicts.
        Each dict must have a ``"content"`` string.  ``'susp_pic'`` (a list of
        image tags) is optional.  ``'errmsgs'`` (a list) and ``'item_id'`` are
        read only when a question is found to be missing its picture.
    :return: ``item_list`` itself, or a new list with the same dict objects
        when a correction pass ran without reporting a missing picture.
    """
    # ------------ first pass: gather statistics -----------------
    no_pic_id = []          # announces a figure, has none, previous question has one
    item_pic_end = []       # per question: detached trailing image tags ("" if none)
    have_pic_info = [0] * len(item_list)  # 1 when the text announces a figure
    item_pic_type1_id = []  # questions carrying a 'susp_pic' entry
    temp_res = []           # same dict objects as item_list, in the same order
    # Only consulted for k > 0, so an empty seed is enough (and keeps an empty
    # item_list from raising IndexError).
    bef_con = ""
    for k, sub_res in enumerate(item_list):
        raw_cont = sub_res["content"]
        detached = ""
        if (_ONLY_PICS_RE.match(raw_cont) is None
                and _TRAILING_PICS_RE.search(raw_cont)):
            kept = _TRAILING_PICS_CUT_RE.split(raw_cont)[0]
            # Collect the tags from the text that is actually removed, so the
            # detached list can never disagree with what is left in content.
            detached = _PIC_TAG_RE.findall(raw_cont[len(kept):])
            sub_res["content"] = kept
        item_pic_end.append(detached)
        temp_res.append(sub_res)

        if 'susp_pic' in sub_res:
            item_pic_type1_id.append(k)
        # Record questions that announce a figure; among them, those without an
        # image whose predecessor has one are candidates for correction.
        if _PIC_HINT_RE.search(raw_cont):
            have_pic_info[k] = 1
            if "imgsrc" not in raw_cont:
                if k > 0 and "imgsrc" in bef_con and 'susp_pic' not in sub_res:
                    no_pic_id.append(k)
        # Keep the unstripped text: item_list[k - 1]["content"] has already
        # been shortened in place by the stripping above.
        bef_con = raw_cont
    print("no_pic_id索引:", no_pic_id)
    print("item_pic_end:", item_pic_end, len(item_pic_end)) if False else None
    print("item_pic_type1_id:", item_pic_type1_id)
    print("item_pic_end:",item_pic_end,len(item_pic_end))

    # ------------ second pass: fix image ownership -----------------
    corrected_ok = True
    for nn, i in enumerate(no_pic_id):  # the image may be a formula picture/screenshot
        st = 0 if nn == 0 else no_pic_id[nn - 1] + 1
        if item_pic_end[i - 1]:
            _hand_over_last_pic(temp_res, item_pic_end, i - 1, i)

        for j in reversed(range(st, i)):  # keep checking backwards from here
            cur = temp_res[j]
            if have_pic_info[j]:  # the text announces a figure
                if "imgsrc" not in cur["content"]:  # ... but holds no image
                    if 'susp_pic' in cur:
                        cur["content"] += "\n" + "\n".join(cur['susp_pic'])
                        del cur['susp_pic']
                    elif not item_pic_end[j]:  # no trailing image of its own
                        if j > 0 and item_pic_end[j - 1]:
                            _hand_over_last_pic(temp_res, item_pic_end, j - 1, j)
                        else:
                            cur['errmsgs'].append("本题缺图片")
                            print("第{}题缺图片".format(str(cur['item_id'])))
                            corrected_ok = False
                    else:
                        _reattach_own_pics(temp_res, item_pic_end, j)
                # Has an image (not necessarily the announced one) and the
                # previous question ends with images.  ``j > 0`` keeps index
                # -1 from wrapping round to the last question.
                elif j > 0 and item_pic_end[j - 1]:
                    print("第{}题可能漏图片".format(str(cur['item_id'])))
                    # Previous question announces no figure, or the one before
                    # it also ends with images.
                    if not have_pic_info[j - 1] or (j > 1 and item_pic_end[j - 2]):
                        if not item_pic_end[j]:
                            _hand_over_last_pic(temp_res, item_pic_end, j - 1, j)
                        else:  # has trailing images of its own: use those
                            _reattach_own_pics(temp_res, item_pic_end, j)
                    else:
                        _reattach_own_pics(temp_res, item_pic_end, j - 1)
                elif item_pic_end[j]:  # this question ends with images
                    _reattach_own_pics(temp_res, item_pic_end, j)
            else:  # no known figure wording matched; some cases may be missed
                if item_pic_end[j]:
                    _reattach_own_pics(temp_res, item_pic_end, j)

    # Whatever was detached but never handed to another question goes back to
    # its owner; otherwise those images would silently vanish from the output.
    for idx, pics in enumerate(item_pic_end):
        if pics:
            _reattach_own_pics(temp_res, item_pic_end, idx)

    # NOTE: both lists hold the same dict objects, so this only decides which
    # list object is returned, not what the items contain.
    if no_pic_id and corrected_ok:
        item_list = temp_res
    for i in item_pic_type1_id:
        if 'susp_pic' in temp_res[i]:
            temp_res[i]["content"] += "\n" + "\n".join(temp_res[i]['susp_pic'])

    return item_list