structure_main.py

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
import sys

sys.path.append(r"F:\zwj\Text_Structure\new_tiku_structure_v3_art")
from pprint import pprint
# from utils.exam_type import get_exam_type
from structure.final_structure import one_item_structure
from utils.stem_ans_split import get_split_pos
from utils.washutil import *
from structure.three_parse_structure import *
from utils.pic_pos_judge import img_regroup
from structure.paper_text_structure import WordParseStructure
from func_timeout import func_set_timeout
from utils.xuanzuoti2slave import toslave_bef, toslave_aft

paper_types = ["第三种试卷格式:题目与答案分开",
               "第二种试卷格式: 不同时含有或都不含有{答案}和{解析}关键字",
               "第一种试卷格式:教师用卷,含答案和解析关键字"]
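# Note: the list above is ordered 3rd, 2nd, 1st format, so paper_types[0] refers to the
# "题目与答案分开" format and paper_types[2] to the teacher's edition with 【答案】/【解析】 keywords.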


class StructureExporter(WordParseStructure):
    """
    Non-template structuring of exam papers (by paper type), based on the html produced by wordbin.
    """

    def img_repl(self, one_dict):
        """
        After the initial question split, restore the image information.
        :return:
        """
        imgs = {s: re.findall("<img.*?/>", one_dict[s]) for s in ['stem', 'key', 'parse', 'com_stem'] if s in one_dict}
        for k, imgs_seq in imgs.items():
            for img in imgs_seq:
                img = re.sub(r"(?<!\s)(w_h|data-latex)=", r" \1=", img)
                one_dict[k] = one_dict[k].replace(img, self.subs2src[img])
                # if type(self.img_url) == str and self.img_url:
                #     one_dict[k] = re.sub(r'<img src="files/', '<img src="' + str(self.img_url), str(one_dict[k]))
        if "analy" in one_dict:
            for img in re.findall("<img.*?/>", one_dict["analy"]):
                img = re.sub(r"(?<!\s)(w_h|data-latex)=", r" \1=", img)
                one_dict["analy"] = one_dict["analy"].replace(img, self.subs2src[img])
        return one_dict
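
    # Illustrative example (added, not in the original code): the re.sub above only re-inserts a
    # space that the html extraction may have dropped before a w_h=/data-latex= attribute, so that
    # the tag can be found again in self.subs2src. With a hypothetical tag:
    #   re.sub(r"(?<!\s)(w_h|data-latex)=", r" \1=", '<img src="files/1.png"w_h="120,80"/>')
    #   -> '<img src="files/1.png" w_h="120,80"/>'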

    # @func_set_timeout(30)
    def export(self):
        """Structuring entry point."""
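        # Summary of the flow below (descriptive comment, added for readability):
        #   1. split the rows into a question block and an answer block (get_split_pos);
        #   2. if that split looks usable, re-attach answers to their questions (items_ans_reform);
        #   3. otherwise fall back to splitting by 【答案】/【解析】 keywords or by topic number;
        #   4. structure every item (one_item_structure), then run the final wash_after cleanup.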
        if not self.row_list:
            return {"errcode": 1, "errmsgs": "题文没有有效信息", "data": {}}, ""
        # print(self.row_list)
        # determine the exam type
        # paper_other_info = get_exam_type(row_list)
        # Step 2: find the split point between questions and answers; the "答案" keyword must be present
        split_res = get_split_pos(self.row_list)
        if type(split_res) == str:
            return {"errcode": 1, "errmsgs": split_res, "data": {}}, paper_types[0]
        row_list, items_list, ans_list, is_may_ans = split_res
        rd2_is_fail = 0
        rd1_may_fail = 0
        item_res, paper_type, item_no_type = "", "", 1
        item_groups, ans_groups = {}, {}
        if "【答案】" in "".join(items_list) or "【解析】" in "".join(items_list):
            rd1_may_fail = 1
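            # 【答案】/【解析】 markers are still inside the question block, so the stem/answer split
            # above is probably unreliable; fall through to the keyword / topic-number splitting below.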
        else:
            if items_list:
                paper_type = paper_types[0]
                reform_res = items_ans_reform(items_list, ans_list, self.subject)
                if type(reform_res) == str:
                    return {"errcode": 1, "errmsgs": reform_res, "data": {}}, paper_type
                else:
                    # item_res = reform_res
                    if len(reform_res) == 2:
                        item_res, item_no_type = reform_res
                    else:
                        item_res, item_no_type, rd2_is_fail, item_groups = reform_res
        if not items_list or rd1_may_fail or (is_may_ans and rd2_is_fail):
            ans_n = re.findall("【答案】", "\n".join(row_list))
            parse_n = len(re.findall("【解析】", "\n".join(row_list)))
            if self.subject not in ["地理", "语文"] and ans_n and len(ans_n) == parse_n > 10:  # same number of 【答案】 and 【解析】 markers
                paper_type = paper_types[2]
                item_res = split_by_keywords(row_list, self.subject)
                if type(item_res) == str and re.search("格式有误|没有换行|题型不明确|题型行格式有问题", item_res):
                    print("第一种试卷格式解析格式有误")
                    try:
                        paper_type = paper_types[1]
                        item_res = split_by_topicno(row_list, self.subject)
                    except:
                        return {"errcode": 1, "errmsgs": item_res, "data": {}}, paper_type
            else:
                paper_type = paper_types[1]
                item_res = split_by_topicno(row_list, self.subject)
        item_list = []
        if type(item_res) == str:
            return {"errcode": 1, "errmsgs": item_res, "data": {}}, paper_type
        else:
            if type(item_res) == tuple:
                if len(item_res) == 2:
                    item_list, item_no_type = item_res
                else:
                    item_list, item_no_type, item_groups, ans_groups = item_res
            elif type(item_res) == list:
                item_list = item_res
        # pprint(item_list)
        print('****************初步切分题目的个数*****************', len(item_list))
        res = []
        if item_list:
            item_list = img_regroup(item_list, row_list)  # decide whether misplaced images need regrouping
            if self.subs2src:
                item_list = list(map(self.img_repl, item_list))  # restore the original image info
            # ---------- error check on the initial question split ----------
            # ---------- further splitting of elective ("选修/选学") questions ----------
            # new_item = [[k, i] for k, i in enumerate(item_list) if re.search("选[修学]", i["stem"][:10])]
            # have_slave = 0
            # to_slave = []
            # if new_item:
            #     try:
            #         have_slave = 1
            #         for one in new_item:
            #             new_res = toslave_bef(one[1])
            #             if type(new_res) == list:
            #                 to_slave.extend(new_res)
            #                 item_list.remove(one[1])
            #             else:
            #                 item_list[one[0]] = new_res
            #     except:
            #         pass
            # if to_slave:
            #     item_list.extend(to_slave)
            # ========== per-item structuring ==========
            # from multiprocessing.dummy import Pool as ThreadPool
            # pool = ThreadPool(2)  # faster than pool = multiprocessing.Pool(3)
            # pprint(item_list)
            consumer = ['toslave'] * len(item_list)  # noslave
            items_no_type = [item_no_type] * len(item_list)
            sj = [self.subject] * len(item_list)
            xyz = zip(item_list, consumer, items_no_type, sj, [0] * len(item_list))
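            # Each tuple passed to one_item_structure is (item, consumer, item_no_type, subject, 0);
            # the meaning of the trailing 0 is not documented here (kept exactly as in the original call).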
            # res = list(pool.map(one_item_structure, xyz))
            res = list(map(one_item_structure, xyz))  # plain map is nearly as fast as the multiprocessing version here
            # ========== final cleanup ==========
            # pprint(res)
            res = wash_after(res, item_groups, ans_groups, self.subject)
            # if have_slave and not to_slave:
            #     res = list(map(toslave_aft, res))
        # return the result
        if self.is_reparse:
            return {"html": self.new_html, "items": res}, paper_type
        else:
            return {"items": res}, paper_type

    @staticmethod
    def _get_all_errors(res):
        """
        After the whole paper has been structured, collect all error messages into one list:
        all_errors = [{"单选题第1题目":[]},{"解答题第2题":[]},{},{}]
        :param res:
        :return:
        """
        type_names = []
        errmgs = []
        spliterr_point = []
        for one_res in res:
            type_names.append(one_res["type"])
            if "text_errmsgs" in one_res:
                errmgs.append(one_res["text_errmsgs"])
            else:
                errmgs.append("")
            if 'spliterr_point' in one_res:
                spliterr_point.append(one_res['spliterr_point'])
        # re-number questions within the same question type
        new_names = []
        for k, v in enumerate(type_names):
            if v:
                nums = str(type_names[:k]).count(v)
            else:
                nums = k
            if spliterr_point:
                add_n = insert_sort2get_idx(spliterr_point, k + 1)
                new_names.append("{}第{}题(在整份word中的序号为{}题)".format(v, nums + 1 + add_n, k + 1 + add_n))
            else:
                new_names.append("{}第{}题(在整份word中的序号为{}题)".format(v, nums + 1, k + 1))
        all_errors = []
        for name, error in zip(new_names, errmgs):
            if len(error) > 0:
                all_errors.append({name: error})
        return all_errors
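
    # Illustrative example (invented data) of what _get_all_errors produces:
    #   res = [{"type": "单选题", "text_errmsgs": "缺少答案"},
    #          {"type": "单选题"},
    #          {"type": "解答题", "text_errmsgs": "题干为空"}]
    #   -> [{"单选题第1题(在整份word中的序号为1题)": "缺少答案"},
    #       {"解答题第1题(在整份word中的序号为3题)": "题干为空"}]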


if __name__ == '__main__':
    # single-paper test
    import json
    from bson.objectid import ObjectId

    # path1 = r"F:\zwj\parse_2021\data\fail\2\2.txt"
    # path = r"F:\zwj\parse_2021\res_folder\13.html"
    # images_url1 = ""  # "http://49.233.23.58:11086/ser_static/4439/files/"
    # html = "<p>" + "</p>\n<p>".join(html.split("\n")) + "</p>"
    # with open(r"F:\zwj\Text_Structure\fail_files3\c5e222c5fbded2a2264ae002907fc92c__2021_04_16_18_43_23.json", 'r') as load_f:
    #     html = json.load(load_f)
    # print(load_dict)
    # path2 = r"G:\zwj\WL\Text_Structure\new_tiku_structure_v3_art\data\地理\3\安徽高三地理.html"
    # path2 = r"G:\zwj\WL\Text_Structure\new_tiku_structure_v3_art\data\地理\2\gd1.html"
    # path2 = r"G:\zwj\WL\Text_Structure\new_tiku_structure_v3_art\data\地理\shuguang.html"
    # path2 = r"G:\zwj\WL\Text_Structure\new_tiku_structure_v3_art\data\huaxue\huexue2.html"
    # path2 = r"F:\zwj\new_word_text_extract_2021\data\phy_clean.html"
    # path2 = r"G:\zwj\Word2Html\data\yuwen\yuwen1.html"
    # path2 = r"F:\zwj\Text_Structure\new_tiku_structure_v3_art\data\语文\bj.html"
    # path2 = r"G:\zwj\WL\Text_Structure\new_tiku_structure_v3_art\data\语文\2\tianjin.html"
    # path2 = r"G:\zwj\WL\Text_Structure\new_tiku_structure_v3_art\data\语文\2\61c5380666e78ea2a20b4ff0.html"
    # path2 = r"G:\zwj\WL\Text_Structure\new_tiku_structure_v3_art\data\政治\jiexi_bj.html"
    # path2 = r"F:\zwj\Text_Structure\accept_files\62a2f9b9765759d85567a3e4.html"
    # 6239991e6ca622396925f66b 624cf82d12cd45a7836f3430 626b4b1f81b582c0470d01b0
    # 627b64b0814132f0d7b12589 627b622981b582c0470d020e
    # 6294326cf84c0e279ac6484e.html 62903acaf84c0e279ac647fb
    path2 = r"C:\Users\Administrator\Desktop\666a67fec3c4da9e7009b531.html"
    path2 = r"C:\Users\Administrator\Desktop\66753958c3c4da9e7009b7ae.html"
    path2 = r"F:\zwj\Text_Structure\accept_files\66799166c3c4da9e7009b84f_2.html"
    html = open(path2, "r", encoding="utf-8").read()
    # html = json.loads(html)  # 621845626ca622396925f55c
    html2 = """
    1. I’m anxious___________ your injury.Are you feeling any better now?
    2. After he was back on his feet, he was anxious___________ (return) to school as soon as possible.
    3. Helen was ___________ to death when she saw the ___________scene.She hid herself in the corner, shaking with___________(fright).
    4. The music outside___________ (annoy) Tom soon. He couldn’t keep his___________ (concentrate) with such ___________ (annoy) music going on yesterday.
    5. With so many people talking around, he still concentrated ___________ doing his homework,which left a deep impression on me.
    6. The result was far beyond ___________ we had expected, which brought great joy to every one of us.
    7. If the dress doesn’t fit you, take it back and you can exchange it ___________ another one of the same price.
    8. The dictionary is out of date:many words have been added ___________ the language since it came out.
    9. This vacation I went to an island on the Pacific and ___________ by its scenery. The island has left a lasting ___________ on me.
    10. We are confident about the future and will never lose our confidence ___________ the achievements we will make.
    11. He has worked for nearly 20 years, so he is senior ___________ most of his workmates.
    12. Although he is three years junior ___________ me, he has more work experience.
    """
    res1 = StructureExporter(html, "202406251733", "语文", 0).export()
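    # The positional arguments appear to be (html, paper_id, subject, is_reparse); this is inferred
    # from the attributes used in export() (self.subject, self.is_reparse), since the constructor
    # lives in WordParseStructure. With is_reparse=1 the result would also contain the "html" key.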
    # new_fpath = os.path.join(r"G:\zwj\WL\Text_Structure\fail_files", "res_政治.json")
    # re_f = open(new_fpath, 'w', encoding='utf-8')
    # json.dump(res1[0]["items"], re_f, ensure_ascii=False)
    # for i in res1[0]["items"]:
    #     re_f.write(str(i))
    pprint(res1[0]["items"])
    # pprint(res1[0]["html"])
    print('题目数量:', len(res1[0]["items"]))
    # new_fpath = r"F:\zwj\Text_Structure\new_tiku_structure_2021\res_folder\10-28.json"
    # re_f = open(new_fpath, 'w', encoding='utf-8')
    # json.dump(res1, re_f, ensure_ascii=False)
    # mongo = Mongo()
    # data = mongo.get_data_info({"_id": ObjectId("5fc64c9c4994183dda7e75b2")})
    # # pprint(data["item_ocr"])
    # res1 = WordParseStructure(data["item_ocr"], images_url1).structure()
    # print(res1)
    # print('题目数量:', len(res1[0]["items"]))
    # 6837: question numbering is somewhat messy; 6836: image position and format problems
    # 6822: how to handle question numbers like "16A、" and "16B、"; 'item_id' mixes int and str and should be unified
    # 6820: the answer section has no obvious marker
    # 14.html: answers only, no question stems
    # 21.html: several papers concatenated, each numbering restarting from 1; the last question swallows all the following ones; should this be detected?
    # import json
    # re_f = open("207.txt", 'w', encoding='utf-8')
    # json.dump(res1[0], re_f)
    # json files
    # for file in os.listdir(r"F:\zwj\Text_Structure\fail_files"):
    #     path1 = os.path.join(r"F:\zwj\Text_Structure\fail_files", file)
    #     # path1 = r"F:\zwj\Text_Structure\fail_files\89a6911f57bf89aba898651b27d2a2fc__2021_04_09_18_50_19.json"
    #     with open(path1, 'r', encoding='utf-8') as f:
    #         html = json.load(f)
    #     pprint(html)
    #     # try:
    #     #     res1 = WordParseStructure(html, "").structure()
    #     #     os.remove(path1)
    #     # except:
    #     #     pass
    #     res1 = WordParseStructure(html, "").structure()
    #     pprint(res1)
    #     print('题目数量:', len(res1[0]["items"]))