#!/usr/bin/env python
# -*- coding:utf-8 -*-
from pprint import pprint
# from utils.exam_type import get_exam_type
from structure.final_structure import one_item_structure
from utils.stem_ans_split import get_split_pos
from utils.washutil import *
from structure.three_parse_structure import *
from utils.pic_pos_judge import img_regroup
from structure.paper_text_structure import WordParseStructure
from func_timeout import func_set_timeout
from utils.xuanzuoti2slave import toslave_bef, toslave_aft
# Labels for the detected paper layout; one of them is returned to the caller
# alongside the structuring result. In the code visible in this file, index 2
# (the "第一种试卷格式" entry: teacher's paper containing answer and analysis
# keywords) is selected before split_by_keywords is tried, and index 1 (the
# "第二种试卷格式" entry) is selected before split_by_topicno is used.
# Index 0 ("第三种试卷格式": questions and answers separated) is not referenced
# in the visible code.
paper_types = ["第三种试卷格式:题目与答案分开",
"第二种试卷格式: 不同时含有或都不含有{答案}和{解析}关键字",
"第一种试卷格式:教师用卷,含答案和解析关键字"]
class StructureExporter(WordParseStructure):
"""
Non-template structuring of an exam paper, by paper type, built on the HTML
result produced by wordbin.
"""
def img_repl(self, one_dict):
"""
Replace image information in one item after the initial question split.

:param one_dict: one split item; the visible code reads its 'stem', 'key'
    and 'parse' entries.
:return: not visible here -- the remainder of this method is damaged in this
    copy of the file.
"""
# NOTE(review): the regex literal in the next statement is empty, and the two
# `re.sub(` lines further down are syntactically broken (unbalanced parenthesis
# on the first, unterminated string on the second). It looks as if markup-like
# text inside the patterns was stripped; restore these lines from version
# control before relying on this method.
imgs = {s: re.findall("", one_dict[s]) for s in ['stem', 'key', 'parse']}
for k, imgs_seq in imgs.items():
for img in imgs_seq:
# NOTE(review): the tails of the next two lines (`one_dict["analy"]):` and
# `10:`) do not belong to a `re.sub` call; presumably they are the remains of
# later statements, including the start of the following method -- confirm.
img = re.sub("(?", one_dict["analy"]):
img = re.sub("(? 10: # 带相同个数的答案和解析
# NOTE(review): the `def` line and the opening statements of this method are not
# visible in this copy of the file (the preceding lines are damaged). Presumably
# this is the body of `export()`, which the __main__ block at the end of the
# file calls -- confirm against version control. Indentation is also missing,
# so any nesting mentioned in the comments below is inferred from statement order.
#
# Try the keyword-based split first and label the paper with paper_types[2].
paper_type = paper_types[2]
item_res = split_by_keywords(row_list)
# A string result matching one of these messages is treated as a failed keyword
# split; the code then falls back to splitting by question number.
if type(item_res) == str and re.search("格式有误|没有换行|题型不明确|题型行格式有问题", item_res):
print("第一种试卷格式解析格式有误")
try:
paper_type = paper_types[1]
item_res = split_by_topicno(row_list, self.subject)
# NOTE(review): bare `except` -- any exception is swallowed here. If
# split_by_topicno raised, item_res still holds the error string from the
# keyword split, and that is what gets reported.
except:
return {"errcode": 1, "errmsgs": item_res, "data": {}}, paper_type
# Presumably the alternative to the damaged `... > 10:` condition above: go
# straight to the question-number split.
else:
paper_type = paper_types[1]
item_res = split_by_topicno(row_list, self.subject)
item_list =[]
# A string result from the splitter is an error message: report it and stop.
if type(item_res) == str:
return {"errcode": 1, "errmsgs": item_res, "data": {}}, paper_type
else:
# The splitter returns either (item_list, item_no_type) or
# (item_list, item_no_type, item_groups, ans_groups).
# NOTE(review): within the visible lines, item_groups / ans_groups are only
# bound in the 4-tuple case, and item_no_type only when item_res is a tuple,
# yet all three are used further down. Confirm they are initialised in the
# part of the method that is not visible.
if type(item_res) == tuple:
if len(item_res) == 2:
item_list, item_no_type = item_res
else:
item_list, item_no_type, item_groups, ans_groups = item_res
# pprint(item_list)
print('****************初步切分题目的个数*****************', len(item_list))
res = []
if item_list:
item_list = img_regroup(item_list, row_list) # image regrouping check
if self.subs2src:
item_list = list(map(self.img_repl, item_list)) # restore the replaced image information
# --------- error check for the initial question split ---------
# --------- further splitting of new question types ---------
# Collect [index, item] pairs for items whose stem matches "选[修学]" within its
# first 10 characters.
new_item = [[k, i] for k, i in enumerate(item_list) if re.search("选[修学]", i["stem"][:10])]
have_slave = 0
to_slave = []
if new_item:
try:
have_slave = 1
for one in new_item:
new_res = toslave_bef(one[1])
# A list result is queued in to_slave and the original item is removed;
# any other result replaces the item in place.
# NOTE(review): the indices in new_item were taken before any removal, so
# after a `remove` the later `item_list[one[0]]` assignments may hit the
# wrong position -- verify.
if type(new_res) == list:
to_slave.extend(new_res)
item_list.remove(one[1])
else:
item_list[one[0]] = new_res
# NOTE(review): bare `except: pass` silently hides any failure in the loop above.
except:
pass
if to_slave:
item_list.extend(to_slave)
# ========== structure each item ==========
# from multiprocessing.dummy import Pool as ThreadPool
# pool = ThreadPool(2) # faster than pool = multiprocessing.Pool(3)
# Each item is passed to one_item_structure as an (item, 'noslave', item_no_type) tuple.
consumer = ['noslave'] * len(item_list)
items_no_type = [item_no_type] * len(item_list)
xyz = zip(item_list, consumer, items_no_type)
# res = list(pool.map(one_item_structure, xyz))
res = list(map(one_item_structure, xyz)) # also fast compared with multiprocessing
# ========== final cleanup ==========
res = wash_after(res, item_groups, ans_groups)
# toslave_aft is applied only when matching items were found but none of them
# was expanded into to_slave.
if have_slave and not to_slave:
res = list(map(toslave_aft, res))
# Return the result; self.new_html is included as well when self.is_reparse is set.
if self.is_reparse:
return {"html":self.new_html, "items": res}, paper_type
else:
return {"items": res}, paper_type
@staticmethod
def _get_all_errors(res):
"""
整套试卷结构化完成以后,把所有报错放在一个list里面:
all_errors = [{"单选题第1题目":[]},{"解答题第2题":[]},{},{}]
:param res:
:return:
"""
type_names = []
errmgs = []
spliterr_point = []
for one_res in res:
type_names.append(one_res["type"])
if "text_errmsgs" in one_res:
errmgs.append(one_res["text_errmsgs"])
else:
errmgs.append("")
if 'spliterr_point' in one_res:
spliterr_point.append(one_res['spliterr_point'])
# 给同种题型的名字重新编码
new_names = []
for k, v in enumerate(type_names):
if v:
nums = str(type_names[:k]).count(v)
else:
nums = k
if spliterr_point:
add_n = insert_sort2get_idx(spliterr_point, k+1)
new_names.append("{}第{}题(在整份word中的序号为{}题)".format(v, nums + 1 + add_n, k + 1 + add_n))
else:
new_names.append("{}第{}题(在整份word中的序号为{}题)".format(v, nums + 1, k + 1))
all_errors = []
for name, error in zip(new_names, errmgs):
if len(error) > 0:
all_errors.append({name: error})
return all_errors
if __name__ == '__main__':
    # Ad-hoc smoke test: structure a single local HTML export and print the
    # resulting items together with their count.

    # Alternative sample inputs, kept for quick switching:
    # path2 = r"F:\zwj\new_word_text_extract_2021\data\地理\2\2020-2021学年广东省揭阳市揭西县五校九年级(下)第二次联考地理试卷-普通用卷.html"
    # path2 = r"F:\zwj\new_word_parse_2021\data\huaxue\huexue2.html"
    # path2 = r"F:\zwj\new_word_text_extract_2021\data\phy_clean.html"
    path2 = r"F:\zwj\new_word_text_extract_2021\data\地理\3\安徽高三地理.html"

    # Use a context manager so the file handle is closed even if read() fails.
    with open(path2, "r", encoding="utf-8") as html_file:
        html = html_file.read()

    res1 = StructureExporter(html, "", 1, "地理").export()
    items = res1[0]["items"]
    pprint(items)
    print('题目数量:', len(items))

    # Known problem samples noted during manual testing:
    # 6837: item numbering is somewhat messy; 6836: image position and format are wrong.
    # 6822: how should numbering such as "16A、" / "16B、" be handled? 'item_id' is
    #       sometimes int and sometimes str and needs to be unified.
    # 6820: the answer page has no obvious marker.
    # 14.html: answers only, no question stems.
    # 21.html: several papers in one file, numbering restarts from 1 several times;
    #          the last question swallowed everything after it -- should this be detected?