cdZWj
/
new_tiku_structure_v3_sci


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262
#!/usr/bin/env python
# -*- coding:utf-8 -*-


from pprint import pprint
# from utils.exam_type import get_exam_type
# from utils.get_data import Mongo
from structure.final_structure import one_item_structure
from utils.stem_ans_split import get_split_pos
from utils.washutil import *
from structure.three_parse_structure import *
from utils.pic_pos_judge import img_regroup
from func_timeout import func_set_timeout

from utils.xuanzuoti2slave import toslave_bef, toslave_aft

# Labels describing which paper layout the parser detected. The labels are
# returned to callers as the second element of WordParseStructure.structure().
# Note that the list index runs opposite to the number in the label:
#   [0] "third format"  - questions and answers are in separate sections
#   [1] "second format" - the {答案}/{解析} keywords are not both present,
#                         or neither is present
#   [2] "first format"  - teacher's edition with answer and analysis keywords
paper_types = [
    "第三种试卷格式：题目与答案分开",
    "第二种试卷格式: 不同时含有或都不含有{答案}和{解析}关键字",
    "第一种试卷格式：教师用卷，含答案和解析关键字",
]

class WordParseStructure:
    """Template-free structuring of an exam paper from wordbin HTML output.

    The HTML is cleaned, split into individual items (questions) using one of
    the layouts listed in ``paper_types``, and each item is then structured
    by ``one_item_structure``.
    """

    def __init__(self, html, wordid, is_reparse=0, must_latex=0):
        """Store the inputs of one structuring run.

        :param html: HTML text of the paper.
        :param wordid: identifier forwarded to ``HtmlWash``.
        :param is_reparse: forwarded to ``HtmlWash``; when truthy the result
            of :meth:`structure` also carries an ``"html"`` entry.
        :param must_latex: forwarded to ``HtmlWash`` as ``must_latex``.
        """
        self.html = html
        self.is_reparse = is_reparse
        self.wordid = wordid
        self.must_latex = must_latex
        # Mapping from an <img .../> tag found in item text to the tag that
        # should replace it. Overwritten by structure(); initialised here so
        # img_repl() can be called safely before structure() has run.
        self.subs2src = {}

    def _restore_imgs(self, text):
        """Return ``text`` with every mapped ``<img .../>`` tag replaced.

        A tag is looked up in ``self.subs2src`` after a space has been
        inserted before any ``w_h=`` / ``data-latex=`` attribute that is not
        already preceded by whitespace. The tag as it actually appears in
        ``text`` is the one that gets replaced; tags without a mapping are
        left untouched.
        """
        for raw_tag in re.findall(r"<img.*?/>", text):
            lookup_tag = re.sub(r"(?<!\s)(w_h|data-latex)=", r" \1=", raw_tag)
            if lookup_tag in self.subs2src:
                text = text.replace(raw_tag, self.subs2src[lookup_tag])
        return text

    def img_repl(self, one_dict):
        """Replace image tags in one preliminarily split item.

        :param one_dict: item dict; must contain the string fields ``stem``,
            ``key`` and ``parse``; ``analy`` is processed when present.
        :return: the same dict, updated in place.
        """
        # Process all mandatory fields before writing anything back, so a
        # missing field raises KeyError without leaving a half-updated item.
        restored = {
            field: self._restore_imgs(one_dict[field])
            for field in ("stem", "key", "parse")
        }
        one_dict.update(restored)
        if "analy" in one_dict:
            one_dict["analy"] = self._restore_imgs(one_dict["analy"])
        return one_dict

    # @func_set_timeout(30)
    def structure(self):
        """Entry point: clean the HTML, split it into items, structure them.

        :return: a 2-tuple ``(result, paper_type)``.

            * On failure ``result`` is
              ``{"errcode": 1, "errmsgs": <message>, "data": {}}``.
            * On success ``result`` is ``{"items": [...]}``; when
              ``self.is_reparse`` is truthy it also has an ``"html"`` entry
              holding the third value returned by ``HtmlWash.html_cleal()``.

            ``paper_type`` is an entry of ``paper_types`` or ``""``.
        """
        # Step 1: cleaning.
        row_list, self.subs2src, new_html = HtmlWash(
            self.html, self.wordid, self.is_reparse, must_latex=self.must_latex
        ).html_cleal()
        if not row_list:
            return {"errcode": 1, "errmsgs": "题文没有有效信息", "data": {}}, ""

        # Step 2: look for the split point between questions and answers.
        # A string result is an error message.
        split_res = get_split_pos(row_list)
        if isinstance(split_res, str):
            return {"errcode": 1, "errmsgs": split_res, "data": {}}, paper_types[0]
        row_list, items_list, ans_list, is_may_ans = split_res

        rd2_is_fail = 0
        rd1_may_fail = 0
        item_res, paper_type, item_no_type = "", "", 1
        items_text = "".join(items_list)
        if "【答案】" in items_text or "【解析】" in items_text:
            # Answer/analysis keywords inside the question part: the
            # "questions and answers separated" split is not trusted.
            rd1_may_fail = 1
        elif items_list:
            paper_type = paper_types[0]
            reform_res = items_ans_reform(items_list, ans_list)
            if isinstance(reform_res, str):
                return {"errcode": 1, "errmsgs": reform_res, "data": {}}, paper_type
            if len(reform_res) == 2:
                # Kept as a pair; unpacked below if it is a tuple.
                item_res = reform_res
            else:
                item_res, item_no_type, rd2_is_fail = reform_res

        if not items_list or rd1_may_fail or (is_may_ans and rd2_is_fail):
            all_text = "\n".join(row_list)
            ans_count = all_text.count("【答案】")
            parse_count = all_text.count("【解析】")
            # Same number of answer and analysis keywords, and more than 10.
            if ans_count == parse_count and parse_count > 10:
                paper_type = paper_types[2]
                item_res = split_by_keywords(row_list)
                if isinstance(item_res, str) and re.search(
                        "格式有误|没有换行|题型不明确|题型行格式有问题", item_res):
                    print("第一种试卷格式解析格式有误")
                    # Fall back to splitting by item number. If that raises,
                    # report the keyword-split error message instead.
                    paper_type = paper_types[1]
                    try:
                        item_res = split_by_topicno(row_list)
                    except Exception:
                        # Not a bare except: KeyboardInterrupt, SystemExit and
                        # other BaseException subclasses must propagate.
                        return {"errcode": 1, "errmsgs": item_res, "data": {}}, paper_type
            else:
                paper_type = paper_types[1]
                item_res = split_by_topicno(row_list)

        # A string at this point is an error message from one of the splitters.
        if isinstance(item_res, str):
            return {"errcode": 1, "errmsgs": item_res, "data": {}}, paper_type

        item_list = item_res
        if isinstance(item_res, tuple):
            item_list, item_no_type = item_res
        print('****************初步切分题目的个数*****************', len(item_list))

        res = []
        if item_list:
            item_list = img_regroup(item_list, row_list)  # regroup images
            if self.subs2src:
                item_list = list(map(self.img_repl, item_list))  # replace image tags
            # NOTE: an optional step that further split "选修/选学/选考" items
            # into sub-items (toslave_bef / toslave_aft) is currently disabled.

            # Structure each item. A thread pool was tried here earlier; the
            # items are processed sequentially.
            res = [
                one_item_structure((item, 'toslave', item_no_type))
                for item in item_list
            ]
            # Final cleaning pass.
            res = wash_after(res)

        if self.is_reparse:
            return {"html": new_html, "items": res}, paper_type
        return {"items": res}, paper_type

    @staticmethod
    def _get_all_errors(res):
        """Collect the error messages of all structured items into one list.

        The result looks like
        ``[{"<type>第<n>题(在整份word中的序号为<m>题)": <errors>}, ...]`` where
        ``n`` numbers the item among items of the same type and ``m`` is its
        position in the whole document. Items without errors are omitted.

        :param res: list of item dicts; each must have ``"type"`` and may have
            ``"text_errmsgs"`` and ``"spliterr_point"``.
        :return: list of single-entry dicts as described above.
        """
        type_names = [one_res["type"] for one_res in res]
        errmgs = [one_res.get("text_errmsgs", "") for one_res in res]
        spliterr_point = [
            one_res['spliterr_point'] for one_res in res if 'spliterr_point' in one_res
        ]

        all_errors = []
        seen_per_type = {}  # type name -> number of earlier items of that type
        for k, (type_name, error) in enumerate(zip(type_names, errmgs)):
            if type_name:
                # Exact-match count. Counting inside str(list) would also match
                # a type name that is a substring of another type name.
                nums = seen_per_type.get(type_name, 0)
                seen_per_type[type_name] = nums + 1
            else:
                nums = k
            # Called for every item, with or without errors, exactly as the
            # numbering offsets were computed before.
            add_n = insert_sort2get_idx(spliterr_point, k + 1) if spliterr_point else 0
            name = "{}第{}题(在整份word中的序号为{}题)".format(
                type_name, nums + 1 + add_n, k + 1 + add_n)
            if error:
                all_errors.append({name: error})
        return all_errors


if __name__ == '__main__':
    # Manual smoke test: structure a single paper read from a local HTML file.
    # json / ObjectId are only needed by the optional snippets noted below.
    import json
    from bson.objectid import ObjectId

    # Path of the HTML file to parse; point it at any wordbin HTML output.
    # Instead of reading a file, an inline HTML string (or the content of a
    # JSON dump loaded with json.load) can be assigned to `html` directly.
    # Samples that are known to make a regular expression hang:
    #   6258cc7af84c0e279ac64301.html, 629073b9f84c0e279ac64811.html
    # Other sample ids noted during debugging:
    #   62650d5cf84c0e279ac643f1, 62660fa2f84c0e279ac643f5
    path2 = r"C:\Users\Python\Desktop\62e0ac706c6aff2279346f3b.html"
    with open(path2, "r", encoding="utf-8") as html_file:
        html = html_file.read()

    res1 = WordParseStructure(html, "", 1).structure()
    pprint(res1)
    # structure() returns an error dict without "items" on failure, so fall
    # back to an empty list instead of raising KeyError.
    print('题目数量：', len(res1[0].get("items", [])))

    # Optional: dump the result to disk, e.g.
    #     with open("res.json", "w", encoding="utf-8") as out_file:
    #         json.dump(res1, out_file, ensure_ascii=False)
    #
    # Optional: fetch the input from MongoDB instead of a file, e.g.
    #     data = Mongo().get_data_info({"_id": ObjectId("5fc64c9c4994183dda7e75b2")})
    #     res1 = WordParseStructure(data["item_ocr"], images_url1).structure()
    #
    # Optional: batch run over a folder of JSON dumps: json.load each file and
    # call WordParseStructure(html, "").structure() on the loaded content.

    # Open issues noted while testing individual papers:
    # - 6837: item numbering is somewhat out of order.
    # - 6836: image position and format are wrong.
    # - 6822: how to handle numbers such as "16A、" / "16B、"; 'item_id' occurs
    #   both as int and as str and needs to be unified.
    # - 6820: the answer page has no clear marker.
    # - 14.html: only answers, no question stems.
    # - 21.html: several papers in one file, each restarting at number 1; the
    #   last item swallows all following items. Should this be detected?