#!/usr/bin/env/python # -*- coding:utf-8 -*- import os import re from pprint import pprint from utils.exam_type import get_exam_type from utils.get_data import Mongo from utils.stem_ans_split import get_split_pos from utils.washutil import * from structure.option import option_structure from structure.three_parse_structure import * from utils.pic_pos_judge import img_regroup from utils.dati2slave import get_slave from func_timeout import func_set_timeout # 各题型结构化 def one_item_structure(xyz): """ 判断解析类型,解析类型为: if: 1.content不需要再做其他处理<-- 答案没有[;;],且答案不是ABCDEFG 2.选择题类,需要把content中的ABCD各选项内容提取出来<--答案是ABCDEFG else: 都要看是否含有小题,如果含有小题,需要把小题提取出来,slave 3.填空题类,(1)需要提取content中下划线的个数 选择题结构化:单选或者多选<--要把各选项是什么提取出来放在slave中 one_item:{"content":xxxx,"answer":xxx,"parse":xxx} consumer: 分“高中数学”还是“全学科”; item_no_type:题号是否以(\d)的形式 :return: """ one_item, consumer, item_no_type = xyz # print(one_item) if "【章节】" in one_item["parse"]: # 属于后一个题的,后面须调整 one_item["chapter"] = one_item["parse"].split("【章节】")[1].split("\n")[0] one_item["parse"] = one_item["parse"].replace("【章节】" + one_item["chapter"], "") if "【章节】" in one_item["content"]: # 属于后一个题的,后面须调整 one_item["chapter"] = one_item["content"].split("【章节】")[1].split("\n")[0] one_item["content"] = one_item["content"].replace("【章节】" + one_item["chapter"], "") if "【选做题】" in one_item["content"] + one_item["answer"] + one_item["parse"]: opt_str = re.search(r"【选做题】:'(\d+)分'", one_item["content"] + one_item["answer"] + one_item["parse"]) one_item["option_st"] = "选做题,"+opt_str.group(1) if opt_str else "选做题" # 选做题开始的位置,后面的题开始是选做题 one_item["content"] = re.sub("【选做题】(:'(\d+)分')?", "", one_item["content"]) one_item["answer"] = re.sub("【选做题】(:'(\d+)分')?", "", one_item["answer"]) one_item["parse"] = re.sub("【选做题】(:'(\d+)分')?", "", one_item["parse"]) ans = one_item["answer"] con = one_item["content"] parse = re.sub(r"((?<=[\n】])|^)\s*解\s*[::]", "", one_item["parse"]) if not one_item["item_topic_name"]: # one_item["errmsgs"].append("本题没有给出明确题型!") # return one_item if re.match(r"[A-Z][A-Z;;和与、、\s]*?$", ans.strip()): one_item["item_topic_name"] = "单选题" if len(ans.strip()) == 1 else "多选题" elif re.search(r"[((]\s*[))]", one_item["content"]) or \ len(re.findall(r"[\n\s]\s*[A-D]\s*[..、、]", one_item["content"])) >= 4: one_item["item_topic_name"] = "选择题" elif re.findall(r"_{2,}", one_item["content"]): one_item["item_topic_name"] = "填空题" else: one_item["item_topic_name"] = "简答题" topic_type = one_item["item_topic_name"] # print(topic_type) if topic_type.replace("题", "") in ["单选", "多选", "选择"]: one_item = option_structure(one_item, con, ans, item_no_type) elif consumer == 'toslave': # 拆小题 one_item = get_slave(one_item, con, parse, ans) if ('slave' not in one_item or not one_item['slave']) and 'analy' in one_item: del one_item['analy'] if one_item["item_topic_name"] == "多选题": one_item = option_structure(one_item, con, ans, item_no_type) else: # 不拆小题,非选择题 pattern1 = re.compile(r"([是为点]|等于|=|=|有|存在)\s*_+((|[^_;;。?!])+?)_+([cdkm上]?m?\s*.?[。.?]?\s*($|
|<==×÷/()()﹙﹚\[\]﹛﹜{\}∧∨∠▰▱△∆⊙⌒" r"⊆⊂⊇⊃∈∩∉∪⊕∥∣≌∽∞∝⊥∫∬∮∯Φ∅≮≯∁∴∵∷←↑→↓↖↗↘↙‖〒¤○′″¢°℃℉" r"αβγδεζηθικλμνξορστυφχψωϕ%‰℅㎎㎏㎜㎝㎞㎡㎥㏄㏎㏕$£¥º¹²³⁴ⁿ₁₂₃₄·∶½⅓⅔¼¾⅛⅜⅝⅞" r"ΑΒΓΔΕΖΗΘΙΚΜ]", "", ans)): one_item["item_topic_name"] = "多选题" one_item = option_structure(one_item, con, ans, item_no_type) if one_item["item_topic_name"] == "填空题" and re.search("_{2,}", one_item['content']) is None: # -----放在huanhang_wash_after中调整-------------- # blank_ans =[] # while re.search(pattern1, one_item["content"]): # 答案直接填在____上的情况 # blank_con1 = re.search(pattern1, one_item["content"]) # one_item["content"] = one_item["content"].replace(blank_con1.group(0), # blank_con1.group(1) + "____" + blank_con1.group(4)) # blank_ans.append(blank_con1.group(2)) # while re.search(pattern2, one_item["content"]): # 答案直接填在____上的情况 # blank_con1 = re.search(pattern2, one_item["content"]) # one_item["content"] = one_item["content"].replace(blank_con1.group(0), # blank_con1.group(1) + "____" + blank_con1.group(3)) # blank_ans.append(blank_con1.group(2)) # if not ans: # one_item["answer"] = ";".join(blank_ans) # one_item["blank_num"] = len(blank_ans) # ---------------------------------------------- if re.match(r"[A-Z][A-Z;;和与、、\s]*?$", ans.strip()): one_item["item_topic_name"] = "单选题" if len(ans.strip()) == 1 else "多选题" one_item = option_structure(one_item, con, ans, item_no_type) elif re.search(r"[((]\s*[))]", one_item["content"]) or ('步骤' not in one_item["content"] and len(re.findall(r"[\n\s]\s*[A-D]\s*[..、、]", one_item["content"])) >= 4): one_item["item_topic_name"] = "选择题" one_item = option_structure(one_item, con, ans, item_no_type) elif re.findall('(有|存在|[是为==])[ \s]{3,}[a-zA-Z]', one_item["content"]): one_item["blank_num"] = len(re.findall('(有|存在|[是为==])[ \s]{3,}[a-zA-Z]', one_item["content"])) elif re.findall('[ \s]{3,}[a-zA-Z]\s*[,;.。;,]', one_item["content"]): one_item["blank_num"] = len(re.findall('\s{3,}\n*\s*[a-zA-Z]\s*[,;.。;,.]', one_item["content"])) elif re.search(pattern1, one_item["content"]) is None and re.search(pattern2, one_item["content"]) is None: stem = re.sub("|[,,.。.、、]", "", one_item["content"]) if len(stem) > 2: one_item["item_topic_name"] = "解答题" # print('------------------------------------------------') if one_item: if re.match(r"(\[.*?\])?\s*\(.*?(\d+)分\)", one_item["content"].strip()): one_item["score"] = float(re.match(r"(\[.*?\])?\(.*?(\d+)分\)", one_item["content"].strip()).group(2)) one_item["content"] = re.sub(r"(\[.*?\])?\(.*?\d+分\)", "", one_item["content"][:20]) + one_item["content"][20:] return one_item paper_types = ["第三种试卷格式:题目与答案分开", "第二种试卷格式: 不同时含有或都不含有{答案}和{解析}关键字", "第一种试卷格式:教师用卷,含答案和解析关键字"] @func_set_timeout(30) class WordParseStructure: """ 基于wordbin出来的html结果进一步做 试卷类型 非模板结构化 """ def __init__(self, html, images_url, is_reparse=0): self.html = html self.img_url = images_url self.is_reparse = is_reparse def img_repl(self, one_dict): """ 初步拆分题目后,图片信息的替换 :return: """ # print("one_dict:", one_dict) # imgs = {s: re.findall("", one_dict[s]) for s in ['content', 'answer', 'parse']} for k, imgs_seq in imgs.items(): for img in imgs_seq: img = re.sub("(?", one_dict["analy"]): img = re.sub("(? 10: # 带相同个数的答案和解析 paper_type = paper_types[2] item_res = split_by_keywords(row_list) if type(item_res) == str and re.search("格式有误|没有换行|题型不明确", item_res): print("第一种试卷格式解析格式有误") try: paper_type = paper_types[1] item_res = split_by_topicno(row_list) except: return {"errcode": 1, "errmsgs": item_res, "items": []}, paper_type else: paper_type = paper_types[1] item_res = split_by_topicno(row_list) # print(paper_type) # pprint(item_res) if type(item_res) == str: return {"errcode": 1, "errmsgs": item_res, "items": []}, paper_type else: item_list = item_res if type(item_res) == tuple: item_list, item_no_type = item_res # pprint(item_list) print('****************初步切分题目的个数*****************', len(item_list)) res = [] if item_list: item_list = img_regroup(item_list) # 图片重组判断 # 图片信息替换还原------------------------ item_list = list(map(self.img_repl, item_list)) # ---------初步拆分题目错误判断-------------------- # ===================================小题结构化====================================================== # from multiprocessing.dummy import Pool as ThreadPool # pool = ThreadPool(2) # 比# pool = multiprocessing.Pool(3)速度快 consumer = ['noslave'] * len(item_list) # consumer = ['all'] * len(item_list) items_no_type = [item_no_type] * len(item_list) xyz = zip(item_list, consumer, items_no_type) # res = list(pool.map(one_item_structure, xyz)) res = list(map(one_item_structure, xyz)) # res = list(map(one_item_structure, item_list, consumer, items_no_type)) # 这样速度也很快 # pprint(res) # ===================================最后的清洗====================================================== res = huanhang_wash_after(res) # add_all_error = self._get_all_errors(res) # if add_all_error: # # errmsg = add_all_error # errmsg = "" # errcode = 1 # else: # errmsg = "" # errcode = 0 # "errcode": errcode, "errmsgs": "
".join(map(lambda x: str(x), errmsg)), return {"items": res}, paper_type # 整合了所有错误的结果 @staticmethod def _get_all_errors(res): """ 整套试卷结构化完成以后,把所有报错放在一个list里面: all_errors = [{"单选题第1题目":[]},{"解答题第2题":[]},{},{}] :param res: :return: """ type_names = [] errmgs = [] spliterr_point = [] for one_res in res: type_names.append(one_res["type"]) if "text_errmsgs" in one_res: errmgs.append(one_res["text_errmsgs"]) else: errmgs.append("") if 'spliterr_point' in one_res: spliterr_point.append(one_res['spliterr_point']) # 给同种题型的名字重新编码 new_names = [] for k, v in enumerate(type_names): if v: nums = str(type_names[:k]).count(v) else: nums = k if spliterr_point: add_n = insert_sort2get_idx(spliterr_point, k+1) new_names.append("{}第{}题(在整份word中的序号为{}题)".format(v, nums + 1 + add_n, k + 1 + add_n)) else: new_names.append("{}第{}题(在整份word中的序号为{}题)".format(v, nums + 1, k + 1)) all_errors = [] for name, error in zip(new_names, errmgs): if len(error) > 0: all_errors.append({name: error}) return all_errors if __name__ == '__main__': # 单份试卷测试 import json from bson.objectid import ObjectId # path1 = r"F:\zwj\parse_2021\data\fail\2\2.txt" # path = r"F:\zwj\parse_2021\res_folder\13.html" # images_url1 = "" # "http://49.233.23.58:11086/ser_static/4439/files/" # html = "

"+"

\n

".join(html.split("\n"))+"

" # with open(r"F:\zwj\WL\parse_2021\res_folder\9aa310629f1153f0b20951e550611359__2021_03_12_10_42_44.json", # 'r') as load_f: # html = json.load(load_f) # print(load_dict) # path2 = r"F:\zwj\parse_2021\data\fail\doc\11\11.html" path2 = r"F:\zwj\parse_2021\res_folder\2021_04_02_18_01_41.html" html = open(path2, "r", encoding="utf-8").read() # print(html) res1 = WordParseStructure(html, "").structure() pprint(res1) print('题目数量:', len(res1[0]["items"])) # mongo = Mongo() # data = mongo.get_data_info({"_id": ObjectId("5fc64c9c4994183dda7e75b2")}) # # pprint(data["item_ocr"]) # res1 = WordParseStructure(data["item_ocr"], images_url1).structure() # print(res1) # print('题目数量:', len(res1[0]["items"])) # 6837 序号有些乱 6836 图片位置和格式有问题 # 6822 16A、和16B、类型的序号怎么处理 'item_id'有int和 str 型,须统一处理下 # 6820 答案页没有明显标识 # 14.html 只有答案,没有题干 # 21.html 多套题目在一起,多个从1开始的序号,最后一道题,把后面题目都放在一起了,需要判断一下吗? # import json # re_f = open("207.txt", 'w', encoding='utf-8') # json.dump(res1[0], re_f) # json文件 # path1 = r"F:\zwj\parse_2021\res_folder\674a594b0dd55d8ecdf9406f9f699359__2021_03_30_13_08_54.json" # with open(path1,'r',encoding='utf-8') as f: # html= json.load(f) # pprint(html) # res1 = WordParseStructure(html, "").structure() # print(res1) # print('题目数量:', len(res1[0]["items"]))