#!/usr/bin/env python
# -*- coding:utf-8 -*-
from pprint import pprint
# from utils.exam_type import get_exam_type
from structure.final_structure import one_item_structure
from utils.stem_ans_split import get_split_pos
from utils.washutil import *
from structure.three_parse_structure import *
from utils.pic_pos_judge import img_regroup
from structure.paper_text_structure import WordParseStructure
from func_timeout import func_set_timeout
from utils.xuanzuoti2slave import toslave_bef, toslave_aft
# Human-readable labels for the three paper layouts this module distinguishes.
# The strings are runtime data and are intentionally kept in Chinese.
# Note the ordering: index 0 holds the "third" format and index 2 the "first".
# NOTE(review): how these labels are indexed is not visible in this part of
# the file -- confirm against the callers before reordering.
paper_types = [
    # Third format: questions and answers are in separate sections.
    "第三种试卷格式:题目与答案分开",
    # Second format: the {答案} (answer) and {解析} (analysis) keywords are
    # not both present, or neither is present.
    "第二种试卷格式: 不同时含有或都不含有{答案}和{解析}关键字",
    # First format: teacher's copy containing the answer and analysis keywords.
    "第一种试卷格式:教师用卷,含答案和解析关键字",
]
class StructureExporter(WordParseStructure):
"""
基于wordbin出来的html结果进一步做 试卷类型 非模板结构化
"""
def img_repl(self, one_dict):
"""
初步拆分题目后,图片信息的替换
:return:
"""
imgs = {s: re.findall("
"+"
\n".join(html.split("\n"))+"
" # with open(r"F:\zwj\Text_Structure\fail_files3\c5e222c5fbded2a2264ae002907fc92c__2021_04_16_18_43_23.json", 'r') as load_f: # html = json.load(load_f) # print(load_dict) # path2 = r"G:\zwj\WL\Text_Structure\new_tiku_structure_v3_art\data\地理\3\安徽高三地理.html" # path2 = r"G:\zwj\WL\Text_Structure\new_tiku_structure_v3_art\data\地理\2\gd1.html" # path2 = r"G:\zwj\WL\Text_Structure\new_tiku_structure_v3_art\data\地理\shuguang.html" # path2 = r"G:\zwj\WL\Text_Structure\new_tiku_structure_v3_art\data\huaxue\huexue2.html" # path2 = r"F:\zwj\new_word_text_extract_2021\data\phy_clean.html" # path2 = r"G:\zwj\Word2Html\data\yuwen\yuwen1.html" # path2 = r"F:\zwj\Text_Structure\new_tiku_structure_v3_art\data\语文\bj.html" # path2 = r"G:\zwj\WL\Text_Structure\new_tiku_structure_v3_art\data\语文\2\tianjin.html" # path2 = r"G:\zwj\WL\Text_Structure\new_tiku_structure_v3_art\data\语文\2\61c5380666e78ea2a20b4ff0.html" # path2 = r"G:\zwj\WL\Text_Structure\new_tiku_structure_v3_art\data\政治\jiexi_bj.html" # path2 = r"F:\zwj\Text_Structure\accept_files\62a2f9b9765759d85567a3e4.html" # 6239991e6ca622396925f66b 624cf82d12cd45a7836f3430 626b4b1f81b582c0470d01b0 # 627b64b0814132f0d7b12589 627b622981b582c0470d020e # 6294326cf84c0e279ac6484e.html 62903acaf84c0e279ac647fb path2 = r"C:\Users\Administrator\Desktop\666a67fec3c4da9e7009b531.html" path2 = r"C:\Users\Administrator\Desktop\66459c62c3c4da9e7009ae9d.html" # path2 = r"F:\zwj\Text_Structure\accept_files\62aae86a765759d85567a475.html" html = open(path2, "r", encoding="utf-8").read() # html = json.loads(html) 621845626ca622396925f55c html2 = """ 1. I’m anxious___________ your injury.Are you feeling any better now? 2. After he was back on his feet, he was anxious___________ (return) to school as soon as possible. 3. Helen was ___________ to death when she saw the ___________scene.She hid herself in the corner, shaking with___________(fright). 4. The music outside___________ (annoy) Tom soon. 
He couldn’t keep his___________ (concentrate) with such ___________ (annoy) music going on yesterday. 5. With so many people talking around, he still concentrated ___________ doing his homework,which left a deep impression on me. 6. The result was far beyond ___________ we had expected, which brought great joy to every one of us. 7. If the dress doesn’t fit you, take it back and you can exchange it ___________ another one of the same price. 8. The dictionary is out of date:many words have been added ___________ the language since it came out. 9. This vacation I went to an island on the Pacific and ___________ by its scenery. The island has left a lasting ___________ on me. 10. We are confident about the future and will never lose our confidence ___________ the achievements we will make. 11. He has worked for nearly 20 years, so he is senior ___________ most of his workmates. 12. Although he is three years junior ___________ me, he has more work experience. """ res1 = StructureExporter(html, "202406131725", "语文", 1).export() # new_fpath = os.path.join(r"G:\zwj\WL\Text_Structure\fail_files", "res_政治.json") # re_f = open(new_fpath, 'w', encoding='utf-8') # json.dump(res1[0]["items"], re_f, ensure_ascii=False) # for i in res1[0]["items"]: # re_f.write(str(i)) pprint(res1[0]["items"]) print('题目数量:', len(res1[0]["items"])) # new_fpath = r"F:\zwj\Text_Structure\new_tiku_structure_2021\res_folder\10-28.json" # re_f = open(new_fpath, 'w', encoding='utf-8') # json.dump(res1, re_f, ensure_ascii=False) # mongo = Mongo() # data = mongo.get_data_info({"_id": ObjectId("5fc64c9c4994183dda7e75b2")}) # # pprint(data["item_ocr"]) # res1 = WordParseStructure(data["item_ocr"], images_url1).structure() # print(res1) # print('题目数量:', len(res1[0]["items"])) # 6837 序号有些乱 6836 图片位置和格式有问题 # 6822 16A、和16B、类型的序号怎么处理 'item_id'有int和 str 型,须统一处理下 # 6820 答案页没有明显标识 # 14.html 只有答案,没有题干 # 21.html 多套题目在一起,多个从1开始的序号,最后一道题,把后面题目都放在一起了,需要判断一下吗? 
# import json # re_f = open("207.txt", 'w', encoding='utf-8') # json.dump(res1[0], re_f) # json文件 # for file in os.listdir(r"F:\zwj\Text_Structure\fail_files"): # path1 = os.path.join(r"F:\zwj\Text_Structure\fail_files", file) # # path1 = r"F:\zwj\Text_Structure\fail_files\89a6911f57bf89aba898651b27d2a2fc__2021_04_09_18_50_19.json" # with open(path1,'r',encoding='utf-8') as f: # html= json.load(f) # pprint(html) # # try: # # res1 = WordParseStructure(html, "").structure() # # os.remove(path1) # # except: # # pass # res1 = WordParseStructure(html, "").structure() # pprint(res1) # print('题目数量:', len(res1[0]["items"]))