structure_mian.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336
#!/usr/bin/env python
  2. # -*- coding:utf-8 -*-
  3. import os
  4. import re
  5. from pprint import pprint
  6. from utils.exam_type import get_exam_type
  7. from utils.get_data import Mongo
  8. from utils.stem_ans_split import get_split_pos
  9. from utils.washutil import *
  10. from structure.option import option_structure
  11. from structure.three_parse_structure import *
  12. from utils.pic_pos_judge import img_regroup
  13. from utils.dati2slave import get_slave
  14. from func_timeout import func_set_timeout
  15. # 各题型结构化
  16. def one_item_structure(xyz):
  17. """
  18. 判断解析类型,解析类型为:
  19. if:
  20. 1.content不需要再做其他处理<-- 答案没有[;;],且答案不是ABCDEFG
  21. 2.选择题类,需要把content中的ABCD各选项内容提取出来<--答案是ABCDEFG
  22. else:
  23. 都要看是否含有小题,如果含有小题,需要把小题提取出来,slave
  24. 3.填空题类,(1)需要提取content中下划线的个数
  25. 选择题结构化:单选或者多选<--要把各选项是什么提取出来放在slave中
  26. one_item:{"content":xxxx,"answer":xxx,"parse":xxx}
  27. consumer: 分“高中数学”还是“全学科”;
  28. item_no_type:题号是否以(\d)的形式
  29. :return:
  30. """
  31. one_item, consumer, item_no_type = xyz
  32. # print(one_item)
  33. if "【章节】" in one_item["parse"]: # 属于后一个题的,后面须调整
  34. one_item["chapter"] = one_item["parse"].split("【章节】")[1].split("\n")[0]
  35. one_item["parse"] = one_item["parse"].replace("【章节】" + one_item["chapter"], "")
  36. if "【章节】" in one_item["content"]: # 属于后一个题的,后面须调整
  37. one_item["chapter"] = one_item["content"].split("【章节】")[1].split("\n")[0]
  38. one_item["content"] = one_item["content"].replace("【章节】" + one_item["chapter"], "")
  39. if "【选做题】" in one_item["content"] + one_item["answer"] + one_item["parse"]:
  40. opt_str = re.search(r"【选做题】:'(\d+)分'", one_item["content"] + one_item["answer"] + one_item["parse"])
  41. one_item["option_st"] = "选做题,"+opt_str.group(1) if opt_str else "选做题" # 选做题开始的位置,后面的题开始是选做题
  42. one_item["content"] = re.sub("【选做题】(:'(\d+)分')?", "", one_item["content"])
  43. one_item["answer"] = re.sub("【选做题】(:'(\d+)分')?", "", one_item["answer"])
  44. one_item["parse"] = re.sub("【选做题】(:'(\d+)分')?", "", one_item["parse"])
  45. ans = one_item["answer"]
  46. con = one_item["content"]
  47. parse = re.sub(r"((?<=[\n】])|^)\s*解\s*[::]", "", one_item["parse"])
  48. if not one_item["item_topic_name"]:
  49. # one_item["errmsgs"].append("本题没有给出明确题型!")
  50. # return one_item
  51. if re.match(r"[A-Z][A-Z;;和与、、\s]*?$", ans.strip()):
  52. one_item["item_topic_name"] = "单选题" if len(ans.strip()) == 1 else "多选题"
  53. elif re.search(r"[((]\s*[))]", one_item["content"]) or \
  54. len(re.findall(r"[\n\s]\s*[A-D]\s*[..、、]", one_item["content"])) >= 4:
  55. one_item["item_topic_name"] = "选择题"
  56. elif re.findall(r"_{2,}", one_item["content"]):
  57. one_item["item_topic_name"] = "填空题"
  58. else:
  59. one_item["item_topic_name"] = "简答题"
  60. topic_type = one_item["item_topic_name"]
  61. # print(topic_type)
  62. if topic_type.replace("题", "") in ["单选", "多选", "选择"]:
  63. one_item = option_structure(one_item, con, ans, item_no_type)
  64. elif consumer == 'toslave': # 拆小题
  65. one_item = get_slave(one_item, con, parse, ans)
  66. if ('slave' not in one_item or not one_item['slave']) and 'analy' in one_item:
  67. del one_item['analy']
  68. if one_item["item_topic_name"] == "多选题":
  69. one_item = option_structure(one_item, con, ans, item_no_type)
  70. else: # 不拆小题,非选择题
  71. pattern1 = re.compile(r"([是为点]|等于|=|=|有|存在)\s*_+((<img src=.*?/>|[^_;;。?!])+?)_+([cdkm上]?m?\s*.?[。.?]?\s*($|<br/>|<img src|……))")
  72. pattern2 = re.compile(r"((有|存在|[是为])[\u4e00-\u9fa5]{0,2})\s*_+(\d+)_+\s*([\u4e00-\u9fa5,,;;。..])")
  73. if re.findall(r"_{2,}", one_item["content"]): # re.search("_+([^_]*?)_+", one_item['content']):
  74. one_item["blank_num"] = len(re.findall(r"_{2,}", one_item["content"]))
  75. else:
  76. # 是否只需将所有标点符号去除即可,这里容易判断错误!!!!
  77. if re.search("^[A-Z]{2,}$",
  78. re.sub(r"[^\w><≤≥≡≦≧+-≈≠﹢﹣±㏒㏑∑∏π><==×÷/()()﹙﹚\[\]﹛﹜{\}∧∨∠▰▱△∆⊙⌒"
  79. r"⊆⊂⊇⊃∈∩∉∪⊕∥∣≌∽∞∝⊥∫∬∮∯Φ∅≮≯∁∴∵∷←↑→↓↖↗↘↙‖〒¤○′″¢°℃℉"
  80. r"αβγδεζηθικλμνξορστυφχψωϕ%‰℅㎎㎏㎜㎝㎞㎡㎥㏄㏎㏕$£¥º¹²³⁴ⁿ₁₂₃₄·∶½⅓⅔¼¾⅛⅜⅝⅞"
  81. r"ΑΒΓΔΕΖΗΘΙΚΜ]", "", ans)):
  82. one_item["item_topic_name"] = "多选题"
  83. one_item = option_structure(one_item, con, ans, item_no_type)
  84. if one_item["item_topic_name"] == "填空题" and re.search("_{2,}", one_item['content']) is None:
  85. # -----放在huanhang_wash_after中调整--------------
  86. # blank_ans =[]
  87. # while re.search(pattern1, one_item["content"]): # 答案直接填在____上的情况
  88. # blank_con1 = re.search(pattern1, one_item["content"])
  89. # one_item["content"] = one_item["content"].replace(blank_con1.group(0),
  90. # blank_con1.group(1) + "____" + blank_con1.group(4))
  91. # blank_ans.append(blank_con1.group(2))
  92. # while re.search(pattern2, one_item["content"]): # 答案直接填在____上的情况
  93. # blank_con1 = re.search(pattern2, one_item["content"])
  94. # one_item["content"] = one_item["content"].replace(blank_con1.group(0),
  95. # blank_con1.group(1) + "____" + blank_con1.group(3))
  96. # blank_ans.append(blank_con1.group(2))
  97. # if not ans:
  98. # one_item["answer"] = ";".join(blank_ans)
  99. # one_item["blank_num"] = len(blank_ans)
  100. # ----------------------------------------------
  101. if re.match(r"[A-Z][A-Z;;和与、、\s]*?$", ans.strip()):
  102. one_item["item_topic_name"] = "单选题" if len(ans.strip()) == 1 else "多选题"
  103. one_item = option_structure(one_item, con, ans, item_no_type)
  104. elif re.search(r"[((]\s*[))]", one_item["content"]) or ('步骤' not in one_item["content"] and
  105. len(re.findall(r"[\n\s]\s*[A-D]\s*[..、、]", one_item["content"])) >= 4):
  106. one_item["item_topic_name"] = "选择题"
  107. one_item = option_structure(one_item, con, ans, item_no_type)
  108. elif re.findall('(有|存在|[是为==])[ \s]{3,}[a-zA-Z]', one_item["content"]):
  109. one_item["blank_num"] = len(re.findall('(有|存在|[是为==])[ \s]{3,}[a-zA-Z]', one_item["content"]))
  110. elif re.findall('[ \s]{3,}[a-zA-Z]\s*[,;.。;,]', one_item["content"]):
  111. one_item["blank_num"] = len(re.findall('\s{3,}\n*\s*[a-zA-Z]\s*[,;.。;,.]', one_item["content"]))
  112. elif re.search(pattern1, one_item["content"]) is None and re.search(pattern2, one_item["content"]) is None:
  113. stem = re.sub("<img src=.*?/>|[,,.。.、、]", "", one_item["content"])
  114. if len(stem) > 2:
  115. one_item["item_topic_name"] = "解答题"
  116. # print('------------------------------------------------')
  117. if one_item:
  118. if re.match(r"(\[.*?\])?\s*\(.*?(\d+)分\)", one_item["content"].strip()):
  119. one_item["score"] = float(re.match(r"(\[.*?\])?\(.*?(\d+)分\)", one_item["content"].strip()).group(2))
  120. one_item["content"] = re.sub(r"(\[.*?\])?\(.*?\d+分\)", "", one_item["content"][:20]) + one_item["content"][20:]
  121. return one_item
  122. paper_types = ["第三种试卷格式:题目与答案分开",
  123. "第二种试卷格式: 不同时含有或都不含有{答案}和{解析}关键字",
  124. "第一种试卷格式:教师用卷,含答案和解析关键字"]
  125. @func_set_timeout(30)
  126. class WordParseStructure:
  127. """
  128. 基于wordbin出来的html结果进一步做 试卷类型 非模板结构化
  129. """
  130. def __init__(self, html, images_url, is_reparse=0):
  131. self.html = html
  132. self.img_url = images_url
  133. self.is_reparse = is_reparse
  134. def img_repl(self, one_dict):
  135. """
  136. 初步拆分题目后,图片信息的替换
  137. :return:
  138. """
  139. # print("one_dict:", one_dict)
  140. #
  141. imgs = {s: re.findall("<img.*?/>", one_dict[s]) for s in ['content', 'answer', 'parse']}
  142. for k, imgs_seq in imgs.items():
  143. for img in imgs_seq:
  144. img = re.sub("(?<!\s)(w_h|data-latex)=", r" \1=", img)
  145. one_dict[k] = one_dict[k].replace(img, self.subs2src[img])
  146. if type(self.img_url) == str and self.img_url:
  147. one_dict[k] = re.sub(r'<img src="files/', '<img src="' + str(self.img_url), str(one_dict[k]))
  148. if "analy" in one_dict:
  149. for img in re.findall("<img.*?/>", one_dict["analy"]):
  150. img = re.sub("(?<!\s)(w_h|data-latex)=", r" \1=", img)
  151. one_dict["analy"] = one_dict["analy"].replace(img, self.subs2src[img])
  152. return one_dict
  153. def structure(self):
  154. """结构化入口"""
  155. # 第一步:清洗
  156. row_list, self.subs2src = html_cleal(self.html, self.img_url, self.is_reparse)
  157. # pprint(row_list)
  158. # print(self.subs2src)
  159. if not row_list:
  160. return {"errcode": 1, "errmsgs": "题文没有有效信息", "items": []}, ""
  161. # 判断考试类型
  162. # paper_other_info = get_exam_type(row_list)
  163. # 第二步:寻找题目和答案的切分点,一定要有“答案”关键字
  164. split_res = get_split_pos(row_list)
  165. if type(split_res) == str:
  166. return {"errcode": 1, "errmsgs": split_res, "items": []}, paper_types[0]
  167. row_list, items_list, ans_list, is_may_ans = split_res
  168. rd2_is_fail = 0
  169. item_res, paper_type, item_no_type = "", "", 1
  170. if items_list:
  171. paper_type = paper_types[0]
  172. reform_res = items_ans_reform(items_list, ans_list)
  173. if type(reform_res) == str:
  174. return {"errcode": 1, "errmsgs": reform_res, "items": []}, paper_type
  175. else:
  176. item_res, item_no_type, rd2_is_fail = reform_res
  177. if not items_list or (is_may_ans and rd2_is_fail):
  178. ans_n = re.findall("【答案】", "\n".join(row_list))
  179. if ans_n and len(ans_n) == len(re.findall("【解析】", "\n".join(row_list))) > 10: # 带相同个数的答案和解析
  180. paper_type = paper_types[2]
  181. item_res = split_by_keywords(row_list)
  182. if type(item_res) == str and re.search("格式有误|没有换行|题型不明确", item_res):
  183. print("第一种试卷格式解析格式有误")
  184. try:
  185. paper_type = paper_types[1]
  186. item_res = split_by_topicno(row_list)
  187. except:
  188. return {"errcode": 1, "errmsgs": item_res, "items": []}, paper_type
  189. else:
  190. paper_type = paper_types[1]
  191. item_res = split_by_topicno(row_list)
  192. #
  193. print(paper_type)
  194. # pprint(item_res)
  195. if type(item_res) == str:
  196. return {"errcode": 1, "errmsgs": item_res, "items": []}, paper_type
  197. else:
  198. item_list = item_res
  199. if type(item_res) == tuple:
  200. item_list, item_no_type = item_res
  201. # pprint(item_list)
  202. print('****************初步切分题目的个数*****************', len(item_list))
  203. res = []
  204. if item_list:
  205. item_list = img_regroup(item_list) # 图片重组判断
  206. # 图片信息替换还原------------------------
  207. item_list = list(map(self.img_repl, item_list))
  208. # ---------初步拆分题目错误判断--------------------
  209. # ===================================小题结构化======================================================
  210. # from multiprocessing.dummy import Pool as ThreadPool
  211. # pool = ThreadPool(2) # 比# pool = multiprocessing.Pool(3)速度快
  212. consumer = ['noslave'] * len(item_list)
  213. # consumer = ['all'] * len(item_list)
  214. items_no_type = [item_no_type] * len(item_list)
  215. xyz = zip(item_list, consumer, items_no_type)
  216. # res = list(pool.map(one_item_structure, xyz))
  217. res = list(map(one_item_structure, xyz))
  218. # res = list(map(one_item_structure, item_list, consumer, items_no_type)) # 这样速度也很快
  219. # pprint(res)
  220. # ===================================最后的清洗======================================================
  221. res = huanhang_wash_after(res)
  222. # add_all_error = self._get_all_errors(res)
  223. # if add_all_error:
  224. # # errmsg = add_all_error
  225. # errmsg = ""
  226. # errcode = 1
  227. # else:
  228. # errmsg = ""
  229. # errcode = 0
  230. # "errcode": errcode, "errmsgs": "<br/>".join(map(lambda x: str(x), errmsg)),
  231. return {"items": res}, paper_type # 整合了所有错误的结果
  232. @staticmethod
  233. def _get_all_errors(res):
  234. """
  235. 整套试卷结构化完成以后,把所有报错放在一个list里面:
  236. all_errors = [{"单选题第1题目":[]},{"解答题第2题":[]},{},{}]
  237. :param res:
  238. :return:
  239. """
  240. type_names = []
  241. errmgs = []
  242. spliterr_point = []
  243. for one_res in res:
  244. type_names.append(one_res["type"])
  245. if "text_errmsgs" in one_res:
  246. errmgs.append(one_res["text_errmsgs"])
  247. else:
  248. errmgs.append("")
  249. if 'spliterr_point' in one_res:
  250. spliterr_point.append(one_res['spliterr_point'])
  251. # 给同种题型的名字重新编码
  252. new_names = []
  253. for k, v in enumerate(type_names):
  254. if v:
  255. nums = str(type_names[:k]).count(v)
  256. else:
  257. nums = k
  258. if spliterr_point:
  259. add_n = insert_sort2get_idx(spliterr_point, k+1)
  260. new_names.append("{}第{}题(在整份word中的序号为{}题)".format(v, nums + 1 + add_n, k + 1 + add_n))
  261. else:
  262. new_names.append("{}第{}题(在整份word中的序号为{}题)".format(v, nums + 1, k + 1))
  263. all_errors = []
  264. for name, error in zip(new_names, errmgs):
  265. if len(error) > 0:
  266. all_errors.append({name: error})
  267. return all_errors
  268. if __name__ == '__main__':
  269. # 单份试卷测试
  270. import json
  271. from bson.objectid import ObjectId
  272. # path1 = r"F:\zwj\parse_2021\data\fail\2\2.txt"
  273. # path = r"F:\zwj\parse_2021\res_folder\13.html"
  274. # images_url1 = "" # "http://49.233.23.58:11086/ser_static/4439/files/"
  275. # html = "<p>"+"</p>\n<p>".join(html.split("\n"))+"</p>"
  276. # with open(r"F:\zwj\WL\parse_2021\res_folder\9aa310629f1153f0b20951e550611359__2021_03_12_10_42_44.json",
  277. # 'r') as load_f:
  278. # html = json.load(load_f)
  279. # print(load_dict)
  280. # path2 = r"F:\zwj\parse_2021\data\fail\doc\11\11.html"
  281. path2 = r"F:\zwj\parse_2021\res_folder\2021_04_02_18_01_41.html"
  282. html = open(path2, "r", encoding="utf-8").read()
  283. # print(html)
  284. res1 = WordParseStructure(html, "").structure()
  285. pprint(res1)
  286. print('题目数量:', len(res1[0]["items"]))
  287. # mongo = Mongo()
  288. # data = mongo.get_data_info({"_id": ObjectId("5fc64c9c4994183dda7e75b2")})
  289. # # pprint(data["item_ocr"])
  290. # res1 = WordParseStructure(data["item_ocr"], images_url1).structure()
  291. # print(res1)
  292. # print('题目数量:', len(res1[0]["items"]))
  293. # 6837 序号有些乱 6836 图片位置和格式有问题
  294. # 6822 16A、和16B、类型的序号怎么处理 'item_id'有int和 str 型,须统一处理下
  295. # 6820 答案页没有明显标识
  296. # 14.html 只有答案,没有题干
  297. # 21.html 多套题目在一起,多个从1开始的序号,最后一道题,把后面题目都放在一起了,需要判断一下吗?
  298. # import json
  299. # re_f = open("207.txt", 'w', encoding='utf-8')
  300. # json.dump(res1[0], re_f)
  301. # json文件
  302. # path1 = r"F:\zwj\parse_2021\res_folder\674a594b0dd55d8ecdf9406f9f699359__2021_03_30_13_08_54.json"
  303. # with open(path1,'r',encoding='utf-8') as f:
  304. # html= json.load(f)
  305. # pprint(html)
  306. # res1 = WordParseStructure(html, "").structure()
  307. # print(res1)
  308. # print('题目数量:', len(res1[0]["items"]))