# structure_main.py
#!/usr/bin/env python
  2. # -*- coding:utf-8 -*-
  3. # import sys
  4. # sys.path.append(r"F:\zwj\Text_Structure\new_tiku_structure_v3_sci")
  5. from pprint import pprint
  6. from typing import Any
  7. # from utils.exam_type import get_exam_type
  8. # from utils.get_data import Mongo
  9. from structure.final_structure import one_item_structure
  10. from utils.stem_ans_split import get_split_pos
  11. from utils.washutil import *
  12. from utils.washutil_for_DL_way import HtmlWash_2
  13. from structure.three_parse_structure import *
  14. from utils.pic_pos_judge import img_regroup
  15. from func_timeout import func_set_timeout
  16. import requests
  17. import time
  18. from structure.ans_structure import get_ans_match
  19. from utils.xuanzuoti2slave import toslave_bef, toslave_aft
# Module-level logger, obtained from the project's `configs.myLog` helper with
# the "reparse_ruku_log" log category.
logger = configs.myLog(__name__, log_cate="reparse_ruku_log").getlog()
# Human-readable labels for the paper layouts; one of them (or "") is returned
# as the second element of every structuring result.
#   [0] layout 3: questions and answers are in separate sections
#   [1] layout 2: the {答案}/{解析} keywords are not both present, or both absent
#   [2] layout 1: teacher's edition, containing the answer and analysis keywords
paper_types = ["第三种试卷格式:题目与答案分开",
               "第二种试卷格式: 不同时含有或都不含有{答案}和{解析}关键字",
               "第一种试卷格式:教师用卷,含答案和解析关键字"]
  24. class WordParseStructure:
  25. """
  26. 基于wordbin出来的html结果进一步做 试卷类型 非模板结构化
  27. """
  28. def __init__(self, html, wordid, is_reparse=0, must_latex=0, source="zxhx", subject="数学"):
  29. self.html = html
  30. self.is_reparse = is_reparse
  31. self.wordid = wordid
  32. self.must_latex = must_latex
  33. self.source = source
  34. self.subject = subject
  35. def __call__(self):
  36. if self.source in ["school", "qtk"] and re.search("物理|数学", self.subject): # "school" "xue_guan", "teacher"
  37. t1 = time.time()
  38. res = self.structure_combine_DL()
  39. if not res[0]:
  40. logger.info("----【paper_id:{}】模型切题没切出来".format(self.wordid))
  41. return self.structure()
  42. logger.info("----【paper_id:{}】采用切题服务花费时间:{}".format(self.wordid, time.time()-t1))
  43. return res
  44. else:
  45. return self.structure()
  46. def structure_combine_DL(self):
  47. # 第一步:清洗
  48. htmltext, row_list, new_html = HtmlWash_2(self.html, self.wordid, self.is_reparse,
  49. must_latex=self.must_latex).html_cleal()
  50. if not row_list:
  51. return {"errcode": 1, "errmsgs": "题文没有有效信息", "data": {}}, ""
  52. # 第二步:寻找题目和答案的切分点,一定要有“答案”关键字
  53. split_res = get_split_pos(row_list)
  54. if type(split_res) == str:
  55. return {"errcode": 1, "errmsgs": split_res, "data": {}}, paper_types[0]
  56. row_list, items_list, ans_list, _ = split_res
  57. rd1_may_fail = 0
  58. paper_type = ""
  59. item_res = {}
  60. if "【答案】" in "".join(items_list) or "【解析】" in "".join(items_list):
  61. rd1_may_fail = 1
  62. elif items_list:
  63. paper_type = "第三种试卷格式:题目与答案分开"
  64. try:
  65. r1 = requests.post(url=configs.topic_segment_ip,
  66. json={"content": "<br>".join(items_list), "subject": self.subject,
  67. "paper_id": self.wordid, "text_type": "stem_block"})
  68. item_res = r1.json()["res"]
  69. # 试卷开头容易切错,需判断一下;也可以不判断
  70. if len(item_res)>1 and re.match('<img .*?height="[1-4][0-9]?.*?/>', item_res[0]['stem']) and \
  71. "$" not in item_res[0]['stem'] and (re.search("试[题卷]", item_res[0]['stem']) or
  72. re.match("<img .*?/>\s*[\u4e00-\u9fa5\d]{,20}$")):
  73. item_res = item_res[1:]
  74. # print(item_res)
  75. r2 = requests.post(url=configs.topic_segment_ip,
  76. json={"content": "<br>".join(ans_list), "subject": self.subject,
  77. "paper_id": self.wordid, "text_type": "answer_block"})
  78. all_ans, ans_no = r2.json()["res"]
  79. # print(1111111111111,all_ans)
  80. print(ans_no)
  81. # 根据ans_no纠正切错的all_ans,如[2, 6, 4, None, 7, None, 5, None, 1]
  82. if abs(len([i for i in ans_no if i]) - len(item_res)) <= 2:
  83. last_idx = None
  84. new_ans_no = ans_no.copy()
  85. for i, no in enumerate(ans_no):
  86. if no is not None:
  87. last_idx = i
  88. if i > 0 and no is None and last_idx is not None:
  89. all_ans[last_idx] += "\n"+all_ans[i]
  90. all_ans[i] = ""
  91. new_ans_no[i] = "del"
  92. all_ans = [j for j in all_ans if j]
  93. ans_no = [i for i in new_ans_no if i != 'del']
  94. if abs(len(ans_no) - len(item_res)) > 2:
  95. item_res = ans_block_split(ans_list, item_res)
  96. else:
  97. item_res = get_ans_match(item_res, all_ans, ans_no, {}, 'model_split')
  98. except Exception as e:
  99. logger.info("----【paper_id:{}】切题服务异常:{}".format(self.wordid, e))
  100. else:
  101. rd1_may_fail = 1
  102. if rd1_may_fail:
  103. try:
  104. r3 = requests.post(url=configs.topic_segment_ip,
  105. json={"content": "<br>".join(row_list), "subject": self.subject,
  106. "paper_id": self.wordid, "text_type": "stem_block"})
  107. item_res = r3.json()["res"]
  108. # 还需判断下教师卷
  109. for k, one_res in enumerate(item_res):
  110. if re.search('\n【(答案|[解分][析答]|详解|点[评睛]|考点|专题)】', one_res["stem"]):
  111. case = "case1" # 默认有“答案”关键字
  112. if re.search(r'\n【答案】|[\n】]\s*答案\s*[::]', one_res["stem"]) is None:
  113. # 没“答案”关键字
  114. case = "case0"
  115. dd1 = stem_ans_split(one_res, case) # 对切分后的每道题再细分
  116. one_res["stem"] = dd1["stem"]
  117. del dd1["stem"]
  118. one_res.update(dd1)
  119. else: # 没有解析的情况
  120. one_res.update({"key": "", "parse": ""})
  121. except Exception as e:
  122. logger.info("----【paper_id:{}】切题服务异常:{}".format(self.wordid, e))
  123. # ==========小题结构化========
  124. if item_res:
  125. # 答案解析字段完善
  126. for i, one_item in enumerate(item_res):
  127. if 'key' not in one_item:
  128. item_res[i]['key'] = ""
  129. if 'parse' not in one_item:
  130. item_res[i]['parse'] = ""
  131. # 单题结构化
  132. consumer = ['noslave'] * len(item_res)
  133. items_no_type = [1] * len(item_res)
  134. xyz = zip(item_res, consumer, items_no_type)
  135. res = list(map(one_item_structure, xyz)) # 和多进程相比,这样速度也很快
  136. # pprint(res)
  137. # ==========最后的清洗=========
  138. res = wash_after(res, self.wordid, self.subject)
  139. # 针对模型可能切错的地方纠正,放在切割模型预测中纠正了
  140. # for i, one_item in enumerate(res):
  141. # if i>0 and one_item['topic_num'] is None and res[i-1]['topic_num'] is not None and res[i+1]['topic_num'] is not None \
  142. # and res[i+1]['topic_num'] - res[i-1]['topic_num'] == 1 and not one_item['key'] and not one_item['parse']:
  143. # if res[i-1]["parse"]:
  144. # res[i - 1]["parse"] += one_item['stem']
  145. # del res[i]
  146. # elif res[i-1]["key"]:
  147. # res[i - 1]["key"] += one_item['stem']
  148. # del res[i]
  149. # pprint(res)
  150. # 结果返回
  151. if self.is_reparse:
  152. return {"html":new_html, "items": res}, paper_type
  153. else:
  154. return {"items": res}, paper_type
  155. else:
  156. return {}, paper_type
  157. def img_repl(self, one_dict):
  158. """
  159. 初步拆分题目后,图片信息的替换
  160. :return:
  161. """
  162. imgs = {s: re.findall("<img.*?/>", one_dict[s]) for s in ['stem', 'key', 'parse']}
  163. for k, imgs_seq in imgs.items():
  164. for img in imgs_seq:
  165. img = re.sub("(?<!\s)(w_h|data-latex)=", r" \1=", img)
  166. if img in self.subs2src:
  167. one_dict[k] = one_dict[k].replace(img, self.subs2src[img])
  168. # if type(self.img_url) == str and self.img_url:
  169. # one_dict[k] = re.sub(r'<img src="files/', '<img src="' + str(self.img_url), str(one_dict[k]))
  170. if "analy" in one_dict:
  171. for img in re.findall("<img.*?/>", one_dict["analy"]):
  172. img = re.sub("(?<!\s)(w_h|data-latex)=", r" \1=", img)
  173. one_dict["analy"] = one_dict["analy"].replace(img, self.subs2src[img])
  174. return one_dict
  175. # @func_set_timeout(30)
  176. def structure(self):
  177. """结构化入口"""
  178. # 第一步:清洗
  179. row_list, self.subs2src, new_html = HtmlWash(self.html, self.wordid, self.is_reparse,
  180. must_latex=self.must_latex).html_cleal()
  181. # pprint(row_list)
  182. if not row_list:
  183. return {"errcode": 1, "errmsgs": "题文没有有效信息", "data": {}}, ""
  184. # 判断考试类型
  185. # paper_other_info = get_exam_type(row_list)
  186. # 第二步:寻找题目和答案的切分点,一定要有“答案”关键字
  187. split_res = get_split_pos(row_list)
  188. if type(split_res) == str:
  189. return {"errcode": 1, "errmsgs": split_res, "data": {}}, paper_types[0]
  190. row_list, items_list, ans_list, is_may_ans = split_res
  191. rd2_is_fail = 0
  192. rd1_may_fail = 0
  193. item_res, paper_type, item_no_type = "", "", 1
  194. if "【答案】" in "".join(items_list) or "【解析】" in "".join(items_list):
  195. rd1_may_fail = 1
  196. else:
  197. if items_list:
  198. paper_type = paper_types[0]
  199. reform_res = items_ans_reform(items_list, ans_list)
  200. if type(reform_res) == str:
  201. return {"errcode": 1, "errmsgs": reform_res, "data": {}}, paper_type
  202. else:
  203. if len(reform_res)==2:
  204. item_res = reform_res
  205. else:
  206. item_res, item_no_type, rd2_is_fail = reform_res
  207. if not items_list or rd1_may_fail or (is_may_ans and rd2_is_fail):
  208. ans_n = re.findall("【答案】", "\n".join(row_list))
  209. if ans_n and len(ans_n) == len(re.findall("【解析】", "\n".join(row_list))) > 10: # 带相同个数的答案和解析
  210. paper_type = paper_types[2]
  211. item_res = split_by_keywords(row_list)
  212. if type(item_res) == str and re.search("格式有误|没有换行|题型不明确|题型行格式有问题", item_res):
  213. print("第一种试卷格式解析格式有误")
  214. try:
  215. paper_type = paper_types[1]
  216. item_res = split_by_topicno(row_list)
  217. except:
  218. return {"errcode": 1, "errmsgs": item_res, "data": {}}, paper_type
  219. else:
  220. paper_type = paper_types[1]
  221. item_res = split_by_topicno(row_list)
  222. if type(item_res) == str:
  223. return {"errcode": 1, "errmsgs": item_res, "data": {}}, paper_type
  224. else:
  225. item_list = item_res
  226. if type(item_res) == tuple:
  227. item_list, item_no_type = item_res
  228. # pprint(item_list)
  229. print('****************初步切分题目的个数*****************', len(item_list))
  230. res = []
  231. if item_list:
  232. item_list = img_regroup(item_list, row_list) # 图片重组判断
  233. if self.subs2src:
  234. item_list = list(map(self.img_repl, item_list)) # 图片信息替换还原
  235. # ---------初步拆分题目错误判断--------------------
  236. # ---------新题型进一步拆分--------------------
  237. # new_item = [[k, i] for k, i in enumerate(item_list) if re.search("选[修学考]", i["stem"][:10])]
  238. # have_slave = 0
  239. # to_slave = {}
  240. # if new_item:
  241. # try:
  242. # have_slave = 1
  243. # for one in new_item:
  244. # new_res = toslave_bef(one[1])
  245. # item_list[one[0]] = new_res
  246. # if type(new_res) == list:
  247. # to_slave[one[0]] = new_res
  248. # except:
  249. # pass
  250. # if to_slave:
  251. # item_list = [i if type(i) == list else [i] for i in item_list]
  252. # item_list = sum(item_list, [])
  253. # ==========小题结构化========
  254. # from multiprocessing.dummy import Pool as ThreadPool
  255. # pool = ThreadPool(2) # 比# pool = multiprocessing.Pool(3)速度快
  256. # pprint(item_list)
  257. consumer = ['toslave'] * len(item_list)
  258. items_no_type = [item_no_type] * len(item_list)
  259. xyz = zip(item_list, consumer, items_no_type)
  260. # res = list(pool.map(one_item_structure, xyz))
  261. res = list(map(one_item_structure, xyz)) # 和多进程相比,这样速度也很快
  262. # pprint(res)
  263. # ==========最后的清洗=========
  264. res = wash_after(res, self.wordid, self.subject)
  265. # if have_slave and not to_slave:
  266. # res = list(map(toslave_aft, res))
  267. # 结果返回
  268. if self.is_reparse:
  269. return {"html":new_html, "items": res}, paper_type
  270. else:
  271. return {"items": res}, paper_type
  272. @staticmethod
  273. def _get_all_errors(res):
  274. """
  275. 整套试卷结构化完成以后,把所有报错放在一个list里面:
  276. all_errors = [{"单选题第1题目":[]},{"解答题第2题":[]},{},{}]
  277. :param res:
  278. :return:
  279. """
  280. type_names = []
  281. errmgs = []
  282. spliterr_point = []
  283. for one_res in res:
  284. type_names.append(one_res["type"])
  285. if "text_errmsgs" in one_res:
  286. errmgs.append(one_res["text_errmsgs"])
  287. else:
  288. errmgs.append("")
  289. if 'spliterr_point' in one_res:
  290. spliterr_point.append(one_res['spliterr_point'])
  291. # 给同种题型的名字重新编码
  292. new_names = []
  293. for k, v in enumerate(type_names):
  294. if v:
  295. nums = str(type_names[:k]).count(v)
  296. else:
  297. nums = k
  298. if spliterr_point:
  299. add_n = insert_sort2get_idx(spliterr_point, k+1)
  300. new_names.append("{}第{}题(在整份word中的序号为{}题)".format(v, nums + 1 + add_n, k + 1 + add_n))
  301. else:
  302. new_names.append("{}第{}题(在整份word中的序号为{}题)".format(v, nums + 1, k + 1))
  303. all_errors = []
  304. for name, error in zip(new_names, errmgs):
  305. if len(error) > 0:
  306. all_errors.append({name: error})
  307. return all_errors
  308. if __name__ == '__main__':
  309. # 单份试卷测试
  310. import json
  311. from bson.objectid import ObjectId
  312. # path1 = r"F:\zwj\parse_2021\data\fail\2\2.txt"
  313. # path = r"F:\zwj\parse_2021\res_folder\13.html"
  314. # images_url1 = "" # "http://49.233.23.58:11086/ser_static/4439/files/"
  315. # html = "<p>"+"</p>\n<p>".join(html.split("\n"))+"</p>"
  316. # with open(r"F:\zwj\Text_Structure\fail_files3\c5e222c5fbded2a2264ae002907fc92c__2021_04_16_18_43_23.json", 'r') as load_f:
  317. # html = json.load(load_f)
  318. # print(load_dict)
  319. # path2 = r"C:\Users\Python\Desktop\bug\5-9\663c90361ec1003b58557474.html"
  320. path2 = r"F:\zwj\Text_Structure\accept_files\667cb9c0c3c4da9e7009b8c4.html"
  321. path2 = r"F:\zwj\Text_Structure\accept_files\668f4d57c3c4da9e7009bcd8.html"
  322. # path2 = r"C:\Users\Python\Desktop\bug\6419746d11a1cdad550f5502.html"
  323. # path2 = r"F:\zwj\Text_Structure\new_tiku_structure_v3_sci\data\620bbf7aa7d375f4518b98e1.html"
  324. # path2 = r"F:\zwj\new_word_text_extract_v2\data\地理\2\2020-2021学年广东省揭阳市揭西县五校九年级(下)第二次联考地理试卷-普通用卷.html"
  325. # path2 = r"F:\zwj\new_word_parse_2021\data\huaxue\huexue2.html"
  326. # path2 = r"C:\Users\Python\Desktop\bug\6258cc7af84c0e279ac64301.html" # 正则卡死
  327. # path2 = r"C:\Users\Python\Desktop\bug\629073b9f84c0e279ac64811.html" # 正则卡死
  328. # 62650d5cf84c0e279ac643f1 6258cc7af84c0e279ac64301 62660fa2f84c0e279ac643f5
  329. # path2 = r"C:\Users\Python\Desktop\123\666fcb5bc3c4da9e7009b607_2.html"
  330. html = open(path2, "r", encoding="utf-8").read()
  331. # html = """
  332. # <html><head><meta charset="utf-8" /></head><body>\n<p>1.下列化学符号中的数字“<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image1.png" width="13px" height="17px" data-latex="$$" />”表示的意义不正确的是</p>\n<p>A.<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image2.png" width="21px" height="24px" data-latex="$$" />:“<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image1.png" width="13px" height="17px" data-latex="$$" />”表示两个氧原子</p>\n<p>B.<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image3.png" width="34px" height="24px" data-latex="$$" />:“<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image1.png" width="13px" height="17px" data-latex="$$" />”表示一个二氧化氮分子含有两个氧原子</p>\n<p>C.<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image4.png" width="40px" height="21px" data-latex="$$" />:“<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image1.png" width="13px" height="17px" data-latex="$$" />”表示两个氢氧根离子</p>\n<p>D.<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image5.png" width="36px" height="28px" data-latex="$$" />:“<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image6.png" width="21px" height="17px" data-latex="$$" />”表示氧化镁中镁元素的化合价为<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image6.png" width="21px" height="17px" data-latex="$$" />价</p>\n<p>【答案】</p>\n<p>A</p>\n<p>【解析】</p>\n<p>根据元素符号前面的数字表示原子的个数,元素符号右下角的数字表示一个分子中的原子个数,离子符号前面的数字表示离子的个数,元素符号正上方的数字表示元素的化合价。A.<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image2.png" width="21px" height="24px" data-latex="$$" />:“<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image1.png" width="13px" height="17px" data-latex="$$" />”表示一个氧分子由两个氧原子组成,故选项表示的意义不正确;B.元素符号右下角的数字表示一个分子中的原子个数,故<img 
src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image3.png" width="34px" height="24px" data-latex="$$" />:“<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image1.png" width="13px" height="17px" data-latex="$$" />”表示一个二氧化氮分子含有两个氧原子,故表示的意义正确;C.离子符号前面的数字表示离子的个数,故<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image4.png" width="40px" height="21px" data-latex="$$" />:“<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image1.png" width="13px" height="17px" data-latex="$$" />”表示两个氢氧根离子,故表示的意义正确;D.元素符号正上方的数字表示元素的化合价,故<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image7.png" width="36px" height="28px" data-latex="$$" />中的“<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image1.png" width="13px" height="17px" data-latex="$$" />”表示镁元素的化合价为<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image6.png" width="21px" height="17px" data-latex="$$" />价,故表示的意义正确。故选:A。</p>\n<p> </p>\n<p>2.亚油酸具有降低人体血液中胆固醇及血脂的作用,它的化学式为<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image8.png" width="64px" height="24px" data-latex="$$" />,下列说法中正确的是</p>\n<p>A.亚油酸是由三个元素构成的化合物</p>\n<p>B.每个亚油酸分子中含有<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image9.png" width="20px" height="18px" data-latex="$$" />个原子</p>\n<p>C.亚油酸中碳.氧元素的质量比为<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image10.png" width="38px" height="18px" data-latex="$$" /></p>\n<p>D.每个亚油酸分子中含有<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image11.png" width="18px" height="18px" data-latex="$$" />个碳原子、<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image12.png" width="20px" height="18px" data-latex="$$" />个氢原子、<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image13.png" width="9px" height="17px" 
data-latex="$$" />个氧分子</p>\n<p>【答案】</p>\n<p>C</p>\n<p>【解析】</p>\n<p>A.由化学式可知,亚油酸是由<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image14.png" width="16px" height="18px" data-latex="$$" />、<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image15.png" width="17px" height="17px" data-latex="$$" />、<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image16.png" width="16px" height="18px" data-latex="$$" />三种元素组成的化合物,A错误。B.每个亚油酸分子中含有<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image17.png" width="102px" height="18px" data-latex="$$" />个原子,B错误。C.亚油酸中碳.氧元素的质量比为<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image18.png" width="184px" height="18px" data-latex="$$" />,C正确。D.每个亚油酸分子中含有<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image11.png" width="18px" height="18px" data-latex="$$" />个碳原子、<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image12.png" width="20px" height="18px" data-latex="$$" />个氢原子、<img src="http://192.168.1.140:8800/ser_static/1638177060408824/word/media/image19.png" width="13px" height="17px" data-latex="$$" />个氧原子,D错误。故选:C。</p>\n</body></html>
  333. # """
  334. # print(html)
  335. # html = "\n1、已知集合M满足{1,2}≤M≤{1,2,5,6,7},则\n符合条件的集合M有__个."
  336. # html = html.replace('<img src="files', '<img src="/word/media')
  337. res1 = WordParseStructure(html, "668f4d57c3c4da9e7009bcd8",
  338. is_reparse=1, must_latex=1,
  339. source="qtk", subject="数学")()
  340. # new_fpath = os.path.join(r"F:\zwj\Text_Structure\fail_files", "res1.html")
  341. # re_f = open(new_fpath, 'a+', encoding='utf-8')
  342. # for i in res1[0]["items"]:
  343. # re_f.write(str(i))
  344. # pprint(res1)
  345. pprint(res1[0]['items'])
  346. print('题目数量:', len(res1[0]["items"]))
  347. # with open(r"F:\zwj\Text_Structure\accept_files\temp.json", "w",encoding='utf-8') as f:
  348. # json.dump(res1[0]["items"],f, ensure_ascii=False)
  349. # new_fpath = r"F:\zwj\Text_Structure\new_tiku_structure_2021\res_folder\10-28.json"
  350. # re_f = open(new_fpath, 'w', encoding='utf-8')
  351. # json.dump(res1, re_f, ensure_ascii=False)
  352. # mongo = Mongo()
  353. # data = mongo.get_data_info({"_id": ObjectId("5fc64c9c4994183dda7e75b2")})
  354. # # pprint(data["item_ocr"])
  355. # res1 = WordParseStructure(data["item_ocr"], images_url1).structure()
  356. # print(res1)
  357. # print('题目数量:', len(res1[0]["items"]))
  358. # 6837 序号有些乱 6836 图片位置和格式有问题
  359. # 6822 16A、和16B、类型的序号怎么处理 'item_id'有int和 str 型,须统一处理下
  360. # 6820 答案页没有明显标识
  361. # 14.html 只有答案,没有题干
  362. # 21.html 多套题目在一起,多个从1开始的序号,最后一道题,把后面题目都放在一起了,需要判断一下吗?
  363. # import json
  364. # re_f = open("207.txt", 'w', encoding='utf-8')
  365. # json.dump(res1[0], re_f)
  366. # json文件
  367. # for file in os.listdir(r"F:\zwj\Text_Structure\fail_files"):
  368. # path1 = os.path.join(r"F:\zwj\Text_Structure\fail_files", file)
  369. # # path1 = r"F:\zwj\Text_Structure\fail_files\89a6911f57bf89aba898651b27d2a2fc__2021_04_09_18_50_19.json"
  370. # with open(path1,'r',encoding='utf-8') as f:
  371. # html= json.load(f)
  372. # pprint(html)
  373. # # try:
  374. # # res1 = WordParseStructure(html, "").structure()
  375. # # os.remove(path1)
  376. # # except:
  377. # # pass
  378. # res1 = WordParseStructure(html, "").structure()
  379. # pprint(res1)
  380. # print('题目数量:', len(res1[0]["items"]))