#!/usr/bin/env/python # -*- coding:utf-8 -*- from pprint import pprint # from utils.exam_type import get_exam_type from structure.final_structure import one_item_structure from utils.stem_ans_split import get_split_pos from utils.washutil import * from structure.three_parse_structure import * from utils.pic_pos_judge import img_regroup from structure.paper_text_structure import WordParseStructure from func_timeout import func_set_timeout from utils.xuanzuoti2slave import toslave_bef, toslave_aft paper_types = ["第三种试卷格式：题目与答案分开", "第二种试卷格式: 不同时含有或都不含有{答案}和{解析}关键字", "第一种试卷格式：教师用卷，含答案和解析关键字"] class StructureExporter(WordParseStructure): """ 基于wordbin出来的html结果进一步做试卷类型非模板结构化 """ def img_repl(self, one_dict): """ 初步拆分题目后，图片信息的替换 :return: """ imgs = {s: re.findall("", one_dict[s]) for s in ['stem', 'key', 'parse']} for k, imgs_seq in imgs.items(): for img in imgs_seq: img = re.sub("(?", one_dict["analy"]): img = re.sub("(? 10: # 带相同个数的答案和解析 paper_type = paper_types[2] item_res = split_by_keywords(row_list) if type(item_res) == str and re.search("格式有误|没有换行|题型不明确|题型行格式有问题", item_res): print("第一种试卷格式解析格式有误") try: paper_type = paper_types[1] item_res = split_by_topicno(row_list, self.subject) except: return {"errcode": 1, "errmsgs": item_res, "data": {}}, paper_type else: paper_type = paper_types[1] item_res = split_by_topicno(row_list, self.subject) item_list =[] if type(item_res) == str: return {"errcode": 1, "errmsgs": item_res, "data": {}}, paper_type else: if type(item_res) == tuple: if len(item_res) == 2: item_list, item_no_type = item_res else: item_list, item_no_type, item_groups, ans_groups = item_res # pprint(item_list) print('****************初步切分题目的个数*****************', len(item_list)) res = [] if item_list: item_list = img_regroup(item_list, row_list) # 图片重组判断 if self.subs2src: item_list = list(map(self.img_repl, item_list)) # 图片信息替换还原 # ---------初步拆分题目错误判断-------------------- # ---------新题型进一步拆分-------------------- new_item = [[k, i] for k, i in enumerate(item_list) if re.search("选[修学]", i["stem"][:10])] have_slave = 0 to_slave = [] if new_item: try: have_slave = 1 for one in new_item: new_res = toslave_bef(one[1]) if type(new_res) == list: to_slave.extend(new_res) item_list.remove(one[1]) else: item_list[one[0]] = new_res except: pass if to_slave: item_list.extend(to_slave) # ==========小题结构化======== # from multiprocessing.dummy import Pool as ThreadPool # pool = ThreadPool(2) # 比# pool = multiprocessing.Pool(3)速度快 consumer = ['noslave'] * len(item_list) items_no_type = [item_no_type] * len(item_list) xyz = zip(item_list, consumer, items_no_type) # res = list(pool.map(one_item_structure, xyz)) res = list(map(one_item_structure, xyz)) # 和多进程相比，这样速度也很快 # ==========最后的清洗========= res = wash_after(res, item_groups, ans_groups) if have_slave and not to_slave: res = list(map(toslave_aft, res)) # 结果返回 if self.is_reparse: return {"html":self.new_html, "items": res}, paper_type else: return {"items": res}, paper_type @staticmethod def _get_all_errors(res): """ 整套试卷结构化完成以后,把所有报错放在一个list里面： all_errors = [{"单选题第1题目":[]},{"解答题第2题":[]},{},{}] :param res: :return: """ type_names = [] errmgs = [] spliterr_point = [] for one_res in res: type_names.append(one_res["type"]) if "text_errmsgs" in one_res: errmgs.append(one_res["text_errmsgs"]) else: errmgs.append("") if 'spliterr_point' in one_res: spliterr_point.append(one_res['spliterr_point']) # 给同种题型的名字重新编码 new_names = [] for k, v in enumerate(type_names): if v: nums = str(type_names[:k]).count(v) else: nums = k if spliterr_point: add_n = insert_sort2get_idx(spliterr_point, k+1) new_names.append("{}第{}题(在整份word中的序号为{}题)".format(v, nums + 1 + add_n, k + 1 + add_n)) else: new_names.append("{}第{}题(在整份word中的序号为{}题)".format(v, nums + 1, k + 1)) all_errors = [] for name, error in zip(new_names, errmgs): if len(error) > 0: all_errors.append({name: error}) return all_errors if __name__ == '__main__': # 单份试卷测试 import json from bson.objectid import ObjectId # path1 = r"F:\zwj\parse_2021\data\fail\2\2.txt" # path = r"F:\zwj\parse_2021\res_folder\13.html" # images_url1 = "" # "http://49.233.23.58:11086/ser_static/4439/files/" # html = "

"+"

".join(html.split("\n"))+"

" # with open(r"F:\zwj\Text_Structure\fail_files3\c5e222c5fbded2a2264ae002907fc92c__2021_04_16_18_43_23.json", 'r') as load_f: # html = json.load(load_f) # print(load_dict) path2 = r"F:\zwj\new_word_text_extract_2021\data\地理\3\安徽高三地理.html" # path2 = r"F:\zwj\new_word_text_extract_2021\data\地理\2\2020-2021学年广东省揭阳市揭西县五校九年级（下）第二次联考地理试卷-普通用卷.html" # path2 = r"F:\zwj\new_word_parse_2021\data\huaxue\huexue2.html" # path2 = r"F:\zwj\new_word_text_extract_2021\data\phy_clean.html" html = open(path2, "r", encoding="utf-8").read() # print(html) res1 = StructureExporter(html, "",1, "地理").export() # new_fpath = os.path.join(r"F:\zwj\Text_Structure\fail_files", "res1.html") # re_f = open(new_fpath, 'a+', encoding='utf-8') # for i in res1[0]["items"]: # re_f.write(str(i)) pprint(res1[0]["items"]) print('题目数量：', len(res1[0]["items"])) # new_fpath = r"F:\zwj\Text_Structure\new_tiku_structure_2021\res_folder\10-28.json" # re_f = open(new_fpath, 'w', encoding='utf-8') # json.dump(res1, re_f, ensure_ascii=False) # mongo = Mongo() # data = mongo.get_data_info({"_id": ObjectId("5fc64c9c4994183dda7e75b2")}) # # pprint(data["item_ocr"]) # res1 = WordParseStructure(data["item_ocr"], images_url1).structure() # print(res1) # print('题目数量：', len(res1[0]["items"])) # 6837 序号有些乱 6836 图片位置和格式有问题 # 6822 16A、和16B、类型的序号怎么处理 'item_id'有int和 str 型，须统一处理下 # 6820 答案页没有明显标识 # 14.html 只有答案，没有题干 # 21.html 多套题目在一起，多个从1开始的序号，最后一道题，把后面题目都放在一起了，需要判断一下吗？ # import json # re_f = open("207.txt", 'w', encoding='utf-8') # json.dump(res1[0], re_f) # json文件 # for file in os.listdir(r"F:\zwj\Text_Structure\fail_files"): # path1 = os.path.join(r"F:\zwj\Text_Structure\fail_files", file) # # path1 = r"F:\zwj\Text_Structure\fail_files\89a6911f57bf89aba898651b27d2a2fc__2021_04_09_18_50_19.json" # with open(path1,'r',encoding='utf-8') as f: # html= json.load(f) # pprint(html) # # try: # # res1 = WordParseStructure(html, "").structure() # # os.remove(path1) # # except: # # pass # res1 = WordParseStructure(html, "").structure() # pprint(res1) # print('题目数量：', len(res1[0]["items"]))