cdZWj
/
new_tiku_structure_v3_sci


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import re


# Marker appended to an item's trailing-picture list when the first trailing
# picture is followed by fewer than two blank rows in ``row_list``; only
# marked lists are considered for hand-over to the following item.
_MOVABLE_MARK = "#@#"

# One or more ``<imgsrcN w_h=W*H .../>`` tags sitting on their own at the very
# end of a stem (optionally carrying a ``data-latex`` attribute).
# NOTE(review): the original author questioned whether formula images should
# be counted as trailing pictures at all — confirm.
_TRAILING_PIC_RE = re.compile(
    r"\n(<imgsrc\d+ w_h=[\d.]+\*[\d.]+(\sdata-latex=.*?\")?\s*/>\s*"
    r"(<imgsrc\d+ w_h=[\d.]+\*[\d.]+(\sdata-latex=.*?\")?\s*/>)*?)\n?$")
# Looser pattern used to cut the trailing pictures off the stem text.
_TRAILING_PIC_SPLIT_RE = re.compile(
    r"\n<imgsrc.+?/>\s*(<imgsrc((?!/>).)+?/>)*?\n?$")
_PIC_TAG_RE = re.compile(r"<imgsrc.+?/>")
# Phrases in a stem that refer to a figure ("as shown in the figure", ...).
_PIC_HINT_RE = re.compile("如[上下左右]?图|下[面列]图中")


def _blank_rows_after(row_list, start):
    """Count consecutive whitespace-only rows of ``row_list`` from ``start``."""
    count = 0
    for row in row_list[start:]:
        if re.sub(r"[\s\n\t]", "", row):
            break
        count += 1
    return count


def _is_movable(pics):
    """Return True if ``pics`` is a non-empty picture list carrying the marker."""
    return bool(pics) and _MOVABLE_MARK in pics


def _append_own_pics(item, pics):
    """Put an item's own trailing pictures back at the end of its stem."""
    item["stem"] += "\n" + "\n".join(pics).replace(_MOVABLE_MARK, "")


def _hand_over_last_pic(temp_res, item_pic_end, idx):
    """Move the last trailing picture of item ``idx - 1`` to item ``idx``.

    Any other trailing pictures go back to item ``idx - 1``; its list is then
    cleared so the pictures cannot be used twice.
    """
    pics = item_pic_end[idx - 1]
    temp_res[idx]["stem"] += "\n" + pics[-2]  # pics[-1] is the marker
    remaining = pics[:-2]
    if remaining:
        temp_res[idx - 1]["stem"] += "\n" + "\n".join(remaining)
    item_pic_end[idx - 1] = ""


def _report_missing_pic(work_item, source_item):
    """Record the "missing picture" error on an item.

    The working copy is a shallow copy, so it normally shares its ``errmsgs``
    list with the caller's item. When the key is absent it is created on both
    so the message is still visible if the original list is returned.
    """
    errs = source_item.setdefault('errmsgs', [])
    work_item.setdefault('errmsgs', errs).append("本题缺图片")


def img_regroup(item_list, row_list):
    """
    Check whether pictures were attached to the wrong item and reassign them.

    A picture at the very end of an item's stem may really belong to the next
    item. The function works on shallow copies of the items: it strips the
    trailing pictures, then redistributes them using the figure hints found in
    the stems. Per the original author's note, items whose stem consists only
    of pictures are meant to be excluded.

    :param item_list: result of the first item split; a list of dicts with a
        "stem" string. "susp_pic", "errmsgs" and "item_id" are read when used.
    :param row_list: all text rows of the document, used to see whether a
        trailing picture is followed by blank rows.
    :return: the corrected list of item copies when at least one item needed a
        picture and every such item could be served; otherwise the original
        ``item_list``. "本题缺图片" is appended to the ``errmsgs`` of items
        whose picture could not be found.
    """
    if not item_list:
        return item_list

    # ---- Pass 1: collect statistics and strip trailing pictures ----
    no_pic_id = []  # items with a figure hint but no picture
    item_pic_end = []  # per item: trailing pictures (list) or "" if none
    have_pic_info = [0] * len(item_list)  # 1 if the stem has a figure hint
    item_pic_type1_id = []  # items carrying a 'susp_pic' key
    temp_res = []  # working copies of the items
    bef_con = ""  # raw stem of the previous item
    for k, item in enumerate(item_list):
        sub_res = item.copy()
        raw_cont = sub_res["stem"]
        end_pic = _TRAILING_PIC_RE.findall(raw_cont)
        if end_pic:
            pics = _PIC_TAG_RE.findall(end_pic[0][0])
            sub_res["stem"] = _TRAILING_PIC_SPLIT_RE.split(raw_cont)[0]
            # Mark the pictures as movable unless the first one is followed by
            # at least two blank rows. If it cannot be located in row_list
            # there is no evidence either way, so it stays with its own item.
            pic_rows = [n for n, row in enumerate(row_list) if pics[0] in row]
            if pic_rows and _blank_rows_after(row_list, pic_rows[0] + 1) < 2:
                pics.append(_MOVABLE_MARK)
            item_pic_end.append(pics)
        else:
            item_pic_end.append("")
        temp_res.append(sub_res)

        if 'susp_pic' in sub_res:
            item_pic_type1_id.append(k)
        # Item has a figure hint but no picture, while the previous item has
        # one: candidate for receiving the previous item's trailing picture.
        if _PIC_HINT_RE.search(raw_cont):
            have_pic_info[k] = 1
            if ("imgsrc" not in raw_cont and k > 0 and "imgsrc" in bef_con
                    and 'susp_pic' not in sub_res):
                no_pic_id.append(k)
        bef_con = raw_cont
    print("no_pic_id索引:", no_pic_id)
    print("item_pic_type1_id:", item_pic_type1_id)
    print("have_pic_info:", have_pic_info)
    print("item_pic_end:", item_pic_end, len(item_pic_end))

    # ---- Pass 2: redistribute the stripped pictures ----
    right_after_corret = True
    for nn, i in enumerate(no_pic_id):
        # Items between the previous picture-less item and this one.
        st = 0 if nn == 0 else no_pic_id[nn - 1] + 1
        if _is_movable(item_pic_end[i - 1]):
            _hand_over_last_pic(temp_res, item_pic_end, i)
        else:
            _report_missing_pic(temp_res[i], item_list[i])
            right_after_corret = False

        # Walk backwards from the item just before i down to st.
        for j in range(i - 1, st - 1, -1):
            cur = temp_res[j]
            if not have_pic_info[j]:
                # No recognised figure hint: keep its own trailing pictures.
                if item_pic_end[j]:
                    _append_own_pics(cur, item_pic_end[j])
                    item_pic_end[j] = ""
                continue

            if "imgsrc" not in cur["stem"]:
                # Hint present but no picture left in the stem.
                if 'susp_pic' in cur:
                    cur["stem"] += "\n" + "\n".join(cur['susp_pic'])
                    del cur['susp_pic']
                elif item_pic_end[j]:
                    _append_own_pics(cur, item_pic_end[j])
                    item_pic_end[j] = ""
                elif j > 0 and _is_movable(item_pic_end[j - 1]):
                    _hand_over_last_pic(temp_res, item_pic_end, j)
                else:
                    _report_missing_pic(cur, item_list[j])
                    print("第{}题缺图片".format(str(cur['item_id'])))
                    right_after_corret = False
            elif j > 0 and _is_movable(item_pic_end[j - 1]):
                # The stem has a picture (not necessarily the hinted one) and
                # the previous item ends with a movable picture. The j > 0
                # guard stops index -1 from wrapping to the last item.
                print("第{}题可能漏图片".format(str(cur['item_id'])))
                if not have_pic_info[j - 1] or (
                        j > 1 and _is_movable(item_pic_end[j - 2])):
                    # The previous item has no hint of its own, or can be
                    # served by the item before it.
                    if item_pic_end[j]:
                        _append_own_pics(cur, item_pic_end[j])
                        item_pic_end[j] = ""
                    else:
                        _hand_over_last_pic(temp_res, item_pic_end, j)
                else:
                    _append_own_pics(temp_res[j - 1], item_pic_end[j - 1])
                    item_pic_end[j - 1] = ""
            elif item_pic_end[j]:
                _append_own_pics(cur, item_pic_end[j])
                item_pic_end[j] = ""

    if no_pic_id and right_after_corret:
        item_list = temp_res
        # Give every picture that was not moved back to its own item.
        for mi, pics in enumerate(item_pic_end):
            if pics:
                _append_own_pics(item_list[mi], pics)

    # NOTE(review): this writes to the working copies, so it only affects the
    # return value when the corrected copies were adopted above — confirm
    # that this is intended.
    for i in item_pic_type1_id:
        if 'susp_pic' in temp_res[i]:
            temp_res[i]["stem"] += "\n" + "\n".join(temp_res[i]['susp_pic'])

    return item_list