washutil.py 54 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007
  1. #!/usr/bin/env python
  2. # -*- coding:utf-8 -*-
  3. # 本文件包含以下函数
  4. # table_label_cleal:去掉表格中的换行符
  5. # html_cleal :html文件清洗
  6. # wash_after: 处理最终结果多余的换行符
  7. import datetime
  8. import re
  9. import shutil
  10. # from operator import itemgetter
  11. # from itertools import groupby
  12. # from PIL import Image
  13. import base64, os, random
  14. import time
  15. import requests
  16. import hashlib
  17. from pprint import pprint
  18. # from bs4 import BeautifulSoup
  19. # UPLOAD_FOLDER = config.UPLOAD_FOLDER
  20. import configs
  21. from utils.equation_extract import get_equation_instr, get_simpstr2eqn
  22. from utils.field_eq2latex import get_latex
  23. from utils.html_again_parse import css_label_wash
  24. # from structure.structure_main import WordParseStructure
  25. def table_label_cleal(con):
  26. """
  27. 去掉表格中的【换行符】
  28. """
  29. # print(con)
  30. # print('------------------------------------------')
  31. con = re.sub(r"\n(\s|\n|\t)+", "\n", con)
  32. count = 1
  33. while re.search(r"</?[a-z]+>\n(</?[a-z]+>|<td\s+\n*[a-z=\"\d]+>)", con, re.S) and count <= 10:
  34. con = re.sub("(</?t[dr]>|</?table>|</?tbody>|</?div>)\n(</?t[dr]>|</div>|</?table>|</?tbody>|<p>)",
  35. r"\1\2", con, flags=re.S)
  36. con = re.sub(r'(</?t[rd]>)\n(<td\s.+?>)', r'\1\2', con, flags=re.S)
  37. count += 1
  38. # if re.search(r"<table>(.|\n)+?</table>", con, re.S|re.M):
  39. # aa = re.search(r"(<table>(.|\n)+?</table>)", con, re.S|re.M)
  40. # con = con.replace(aa.group(1),aa.group(1).replace("\n",""))
  41. # 将空表格的情况去掉
  42. con = re.sub(r'<table>[\s\n\t]*?<tbody>[\s\n\t]*?(<tr>[\s\n\t]*?<td[^<>]*?>[\s\n\t]*?<p>[\s\n\t]*?</p>'
  43. r'[\s\n\t]*?</td>[\s\n\t]*?</tr>[\s\n\t]*?)+</tbody>[\s\n\t]*?</table>[\s\n\t]*?<p>', "", con,
  44. flags=re.S)
  45. con = re.sub(r'(</table><p>)\s*([((]\s*\d\s*[))])', r'\1\n\2', con)
  46. return con
  47. def base642img(html_data, wordid):
  48. """
  49. 【基于mathjax渲染输出是css-html格式】
  50. 将base64编码的图片保存到本地
  51. :return:
  52. """
  53. # 二进制图片进行转化, 按“word_id”建立文件夹
  54. # time_str = datetime.datetime.strftime(datetime.datetime.now(), '%Y_%m_%d')
  55. # file_path = configs.IMG_FOLDER + '/' + str(self.wordid)
  56. # if not os.path.exists(file_path):
  57. # os.makedirs(file_path)
  58. # else:
  59. # 思路1:删除图片,重建文件夹,【所有的新图片都是以base64格式传过来的】
  60. # shutil.rmtree(file_path)
  61. # os.makedirs(file_path)
  62. # 思路2:每一次再解析都将base64图片保存到本地再以路径形式返回
  63. # st = len(os.listdir(file_path)) # 不要以序号索引的形式命名
  64. # 统计所有base64编码
  65. all_base64_image = re.findall(r'(<img ([a-z]+="[^"]*?" )?src="(data:image[^>"]+?)"(.*?)\s*/?>)', str(html_data), flags=re.S)
  66. if all_base64_image:
  67. file_path = configs.IMG_FOLDER + '/' + str(wordid)
  68. if not os.path.exists(file_path):
  69. os.makedirs(file_path)
  70. # 新图片命名
  71. name_list = random.sample(range(100000, 999999), len(all_base64_image))
  72. for n, img in enumerate(all_base64_image):
  73. img1 = img[2].split(",", maxsplit=1)
  74. img_type_info = re.search("data:image/(.+?);base64", img1[0])
  75. img_type = img_type_info.group(1) if img_type_info else ""
  76. # 可能还有alt和style的属性,暂时先不要
  77. w_info = re.search('( width="\d+")', img[3])
  78. h_info = re.search('( height="\d+")', img[3])
  79. img_data = base64.b64decode(str(img1[-1]))
  80. if img_type:
  81. # save_path = os.path.join(configs.new_img_ip, get_md5(n)+"."+img_tape)
  82. img_name = "new_image" + str(int(time.time())) + str(name_list[n]) + "." + img_type
  83. save_path = os.path.join(file_path, img_name)
  84. with open(save_path, 'wb') as f:
  85. f.write(img_data)
  86. # self.localnewpic_list.append(save_path)
  87. # put_key = "/zyk/uploadfiles/wording/" + str(self.wordid) + "/{}".format(img_name)
  88. # self.put_key_list.append(save_path)
  89. flag_behind = '" />'
  90. if w_info and h_info:
  91. flag_behind = '"' + w_info.group(1) + h_info.group(1) + ' />'
  92. temp_img = '<img src="' + configs.new_img_ip + '/' + str(wordid) + '/' + img_name + flag_behind
  93. # new_img = '<img src="http://' + configs.public_bucket_addr + put_key + '" />'
  94. html_data = html_data.replace(img[0], temp_img)
  95. return html_data
  96. class HtmlWash():
  97. def __init__(self, html, wordid, is_reparse=0, img_url="", must_latex=0):
  98. """
  99. html文本清洗
  100. 批量再解析中,新增图片信息替换的文本返回作为ocr保存文本,
  101. 继续往下清洗的文本,则进入结构化解析逻辑中
  102. """
  103. # super().__init__(html, wordid, is_reparse, must_latex)
  104. self.html = html
  105. self.img_url = img_url
  106. self.wordid = wordid
  107. self.is_reparse = is_reparse
  108. self.must_latex = must_latex
  109. # self.put_key_list = []
  110. # self.localnewpic_list =[]
  111. self.sub_list = ["</?div>", "</?b>", "</?caption>", "</?center>", "</?cite>", "</?code>", "</?colgroup>",
  112. "</?menu>", "</?dd>", "</?dir>", "</?li>", "</?em>", "</?article>", "</?header>", "</?ruby>",
  113. "</?summary>", "</?details>", "</?strong>", "</?strike>", "</?small>", "</?select>",
  114. "</?section>", "</?script>", "</?[su]>", "</?var>", "</?ul>", "</?tt>", "</?title>", "</?thead>",
  115. "</?tfoot>", "<hr />", "<hr>", ""]
  116. self.sub_dd = {'&times;': '×',
  117. '&divide;': '÷',
  118. '&deg;': '°',
  119. '&middot;': '·',
  120. '&plusmn;': '±',
  121. '&ordm;': 'º',
  122. '&sup1;': '¹',
  123. '&sup2;': '²',
  124. '&sup3;': '³',
  125. '&frac12;': '1/2',
  126. '&frac14;': '¼',
  127. '&frac34;': '¾',
  128. '&yen;': '¥',
  129. 'm&sup3;': 'm³',
  130. # '&lt;': '<',
  131. '&pound;': '£',
  132. # '∠&lt;': '&lt;',
  133. '&gt;': '>',
  134. "A": "A",
  135. "А": "A",
  136. "Α": "A",
  137. "B": "B",
  138. "В": "B",
  139. "в": "B",
  140. "Β": "B",
  141. "C": "C",
  142. "С": "C",
  143. "c": "c",
  144. "с": "c",
  145. "D": "D",
  146. "Ε": "E",
  147. "E": "E",
  148. "F": "F",
  149. "G": "G",
  150. "g": "g",
  151. "m": "m",
  152. "N": "N",
  153. "s": "s",
  154. "t": "t",
  155. "/": "/",
  156. "=": "=",
  157. "-": "-",
  158. "2": "2", "3": "3", "4":"4", "5":"5", "6":"6",
  159. "7": "7", "8": "8", "9":"9", "1":"1", "0":"0",
  160. '&nbsp;&nbsp;': ' ',
  161. '&nbsp;': ' ',
  162. "〖": '【',
  163. "〗": '】',
  164. "題": '题',
  165. "单项选择": '单选',
  166. "多项选择": '多选',
  167. # "不定项选择": '选择',
  168. "双项选择": '多选',
  169. "实验与探究题": '实验',
  170. "原理综合题": '原理题',
  171. }
  172. def new_pic_sub(self):
  173. """
  174. 针对base64图片先保存到本地,入库时再换成腾讯云线上地址
  175. # 第一版:再解析中,将二进制图片进行转化,图片怎么保存比较好,先再“天数”建立文件夹
  176. 第一版:再解析中,根据“word_id”建立文件夹
  177. :return:
  178. """
  179. if self.is_reparse:
  180. # css 标签清洗
  181. self.html = css_label_wash(self.html)
  182. # 保存base64编码的图片
  183. self.html = base642img(self.html, self.wordid)
  184. self.new_html = self.html
  185. def html_cleal(self):
  186. # =======清洗mathjax标签========
  187. if "MathJax" in self.html: # 再解析中存在mathjax公式渲染的标签
  188. all_mathjax = re.findall('(<span class="MathJax_Preview".*?</script>(</span>)*)', self.html)
  189. for jax in all_mathjax:
  190. latex = re.findall('<script .+?">(((?!(</)).)*?)</script>(</span>)*', jax[0])
  191. if latex:
  192. latex = "${}$".format(latex[0][0])
  193. self.html = self.html.replace(jax[0], latex)
  194. else:
  195. self.html = self.html.replace(jax[0], "")
  196. # ======再解析中的新图片处理=====
  197. self.new_pic_sub()
  198. # =====特殊符号处理=====
  199. html2txt = re.sub(r"|".join(self.sub_list), "", str(self.html)) # ("", " ") #2020/4/7
  200. html2txt = re.sub("|".join(self.sub_dd.keys()), lambda x: self.sub_dd[x.group()], html2txt) # 2020/4/1,4/7,4/20
  201. html2txt = re.sub("[不非]定[向项]选择", "不定选择", html2txt)
  202. html2txt = html2txt.replace(r"\\[{\\text{V}}V\]", "Ⓥ").replace(r"\\[{\\text{A}}A\]", "Ⓐ") \
  203. .replace(r"\\[{\\text{W}}W\]", "Ⓦ").replace(r"\\[{\\text{X}}X\]", "Ⓧ").replace(r"\\[{\\text{G}}G\]", "Ⓖ") \
  204. .replace("\uf067", "γ").replace('', "γ").replace('\uf020', "").replace("\u3000", " ")\
  205. .replace("\u2003", " ").replace("\x7f", " ").replace("\xa0", "")
  206. html2txt = re.sub(r"(<p>\s*)【例题(\d+)】", r"\1\2、", html2txt)
  207. html2txt = re.sub(r"\\\(|\\\)", "$", html2txt)
  208. # 域公式的转化处理;<sub>\<sup>可以在前端显示,不需要用latex渲染
  209. try:
  210. html2txt, newhml = get_latex(html2txt, self.is_reparse, self.wordid, self.must_latex)
  211. if newhml: # 存在域公式转图片时,需要将原文本的域公式也转为图片信息
  212. self.new_html = newhml
  213. html2txt = html2txt.replace("【omml-latex】", "")
  214. except:
  215. html2txt = html2txt.replace("【omml-latex】", "")
  216. # 字符串公式的处理:如Fe<sub>2</sub>O<sub>3</sub>, 在结构化之后处理比较好
  217. # <br/>处理
  218. html2txt = re.sub(r"<br\s*/?>", "\n", html2txt)
  219. html2txt = re.sub(r"[((]\s*(\d)\s*\$分\s*[))]", r"$(\1分)", html2txt)
  220. # =====题型行的统一处理=====
  221. # ---->>>>>题型行可能放在表格中
  222. if len(re.findall("</table>", html2txt)) >= 8: # 这个限制还不太严谨
  223. for tt in re.finditer('<tr>(((?!(</?tr>)).)*)</tr>', html2txt, re.S):
  224. tt_list = re.split(r'^\s*<td[^<>]*?>|</p></td>|</td>[\n\s]*?<td[^<>]*?>'
  225. r'|</td>\s*\n|</td>\s*$|\n\s*<td[^<>]*?>|<td[^<>]*?><p>',
  226. tt.group(1).strip()) # </td>\s*[$\n]这样无效
  227. tt_list = [col for col in tt_list if col.strip()]
  228. if " ".join(tt_list).replace(" ", "") in ['得分评卷人', '评卷人得分']:
  229. html2txt = html2txt.replace(tt.group(0), "")
  230. else:
  231. pass
  232. # html2txt = html2txt.replace(tt.group(0), "<p>" + " ".join(tt_list) + "</p>")
  233. # html2txt = re.sub(r"</?tbody>|</?table>|</?div>", "", html2txt)
  234. # ---->>>>>end
  235. html2txt = re.sub(r"(</table>)\s*([一二三四五六七八九十]\s*[、..、::]?.{2,6}题)", r"\1</p>\2", html2txt)
  236. html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,::]\s*(论述|填空|探究)题?[与和、、,,\s]*?(计算题|实验题)', r"\1、\3", html2txt)
  237. html2txt = re.sub(r'<td[^<>]*?><p>(([一二三四五六七八九十])\s*[、..、,,::]\s*(.{2,4}题)\s*</p>)</td>[^p]*?<p>', r"\1",
  238. str(html2txt), flags=re.S)
  239. html2txt = re.sub(r"<p>\s*([一二三四五六七八九十])\s*[、..、,,::]?\s*(计算|[解简]答|实验|作图)题?[与和、、,,\s]*?(计算|[解简]答|实验|作图)",
  240. r"<p>\1、\2题", html2txt)
  241. html2txt = re.sub(r'<p>\s*[((]\s*[一二三四五六]\s*[))]\s*必考题\s*(.?|.+?分\s*[.。.]?)\s*</p>', "", html2txt)
  242. html2txt = re.sub(r'<p>\s*[((]\s*[一二三四五六]\s*[))]\s*选考题\s*.?\s*.{,4}(?<!\d)(\d+分)\s*[,,。].{,50}</p>',
  243. r"<p>【选做题】:'\1'</p>", html2txt)
  244. html2txt = re.sub(r'<p>\s*[((]\s*[一二三四五六]\s*[))]\s*选考题\s*(.?|.+?分\s*[.。.]?)\s*</p>', "<p>【选做题】</p>", html2txt)
  245. html2txt = re.sub(r'<p>\s*([一二三四五六七八九十])\s*[、..、,,::]?\s*(单项?选择?|非?选择|多项?选择?|不定选择|填空|计算|[解简]答|实验|作图)题?\s*</p>',
  246. r"<p>\1、\2题</p>", html2txt)
  247. html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,::]?\s*(单选|单项选择|选择|不定选择|多选|多项选择|填空|计算|[解简]答|实验|作图)\s*(?!题)'
  248. r'([((]\s*本题|.*?\d分)', r"\1" + "、" + r'\2' + "题" + r"\3", html2txt)
  249. html2txt = re.sub(r'([一二三四五六])\s*[、..、,,::]?\s*(单选|单项选择|非?选择|不定选择|多选|多项选择|填空|计算|[解简]答|实验|作图)题',
  250. r"\1" + "、" + r'\2' + "题", html2txt)
  251. html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,::]?\s*[((]\s*本大题(.*?选项中)', r"\1" + "、" + "选择题", html2txt) # + r"\2"
  252. html2txt = re.sub(r'<p>\s*([一二三四五六七八九十])\s*[、..、,,]?\s*[((本大题]*?(.*?选项中)', r"\1" + "、" + "选择题", html2txt)
  253. html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,::]?\s*([((]\s*(每小题|本大?题)((?!(选项)).)+?[))]|综合题)',
  254. r"\1" + "、" + "解答题", html2txt)
  255. html2txt = re.sub(r'(?<!<p>)\s*([一二三四五六七八九十]\s*[、..、,,::]?\s*(单项?选择?|选择|不定选择|多项?选择?|填空|计算|[解简]答|实验|作图)题)',
  256. r'</p>\n<p>\1', html2txt)
  257. html2txt = re.sub(r'<p>\s*([一二三四五六七八九十])\s*[、..、,,::]?\s*[((]?本?大?题((?!(选项)).)+?[))]?\s*</p>', r"<p>\1、本大题</p>",
  258. html2txt)
  259. # html2txt = re.sub(r'<p>\s*[^一二三四五六七八九十]{,3}\s*[、..、]\s*(选择|不定选择|单选|多选|计算|[解简]答|实验|作图)题', r"<p>一、\1题", html2txt)
  260. # =====答案解析关键字的统一处理=====
  261. html2txt = re.sub(r'【\s*(<img src=((?!/>).)+?/>\s*)*?([解答])\s*(<img src=((?!/>).)+?/>\s*)*?([析案])\s*'
  262. r'(<img src=((?!/>).)+?/>\s*)*?】', r"【\3\6】", str(html2txt)) # 2022/4/28
  263. html2txt = re.sub(r'<p>\s*(解\s*[::])', r"<p>【解答】", str(html2txt))
  264. html2txt = re.sub(r'【[^【】]*?(答案|[解分][析答]|详解|点[评睛])[^【】]*?】', r"【\1】", str(html2txt))
  265. html2txt = re.sub(r'(\n\s*|<p>\s*|\s{2,}|\n\s*\d{,2}\s*[、..、]\s*)(答案|解析|解答|详解|点评|点睛|考点|专题)\s*[::]', r"\1【\2】", str(html2txt))
  266. html2txt = re.sub(r'(\n|^|<p>)\s*(([1-9]|[1-9][0-9])\s*[..、、])?\s*\[\s*(答案|解析|解答|详解|点评|点睛|考点|专题)\s*\]',
  267. r"\1\2【\4】", str(html2txt))
  268. html2txt = re.sub(r'([A-D])\s*\[\s*(解析|解答|详解|点评|点睛|考点|专题)\s*\]', r"\1\n【\2】", str(html2txt))
  269. html2txt = re.sub(r'(\n|^|<p>)\s*(分析)\s*[::]', r"【\2】", str(html2txt))
  270. if "【解析】" not in html2txt and "【解答】" in html2txt and "【分析】" not in html2txt:
  271. html2txt = re.sub(r'【解答】', "【解析】", str(html2txt))
  272. # =====其他关键字的处理=====
  273. html2txt = re.sub(r'<p>\s*(类型|知识点|考查角度|拔尖角度)[一二三四五六七八九十\d+][^p]*?</p>', "", str(html2txt))
  274. html2txt = re.sub(r'<p>\s*(选修[\d-]*?[::].{2,15})\s*</p>', r"<p>【章节】\1</p>", html2txt)
  275. html2txt = re.sub(r'<p>\s*([一二三四五六]\s*[、..、]?)?\s*(\[.{2}-*?选修[\d-]*?.*?\])\s*([((]\d+分[))])?\s*</p>',
  276. r"<p>【章节】\2</p>", html2txt)
  277. html2txt = re.sub(r'<p>\s*(基础|中档|综合)题[^p题]*?</p>|<p>\s*【(考点|专题)】[^p]*?</p>', "", str(html2txt))
  278. html2txt = re.sub(r'<p>\s*(基础训练|提升训练|探究培优)</p>', "", str(html2txt))
  279. html2txt = re.sub(r'<p>注意事项[::]\s*</p>(\n+\s*<p>\s*\d\s*[、..、][^/]+?</p>){1,}', "", html2txt, flags=re.S)
  280. html2txt = re.sub(r'<p>注意事项[::]\s*\d\s*[、..、][^/]+?</p>(\n+\s*<p>\s*\d\s*[、..、][^/]+?</p>){1,}', "", html2txt,
  281. flags=re.S)
  282. html2txt = re.sub(r'[((]\s*([A-Z\dⅠⅡⅢⅣⅤ]+|IV)\s*[))]', r"(\1)".replace(" ", "").replace("(IV)", "Ⅳ"), html2txt)
  283. html2txt = re.sub(r'[((](\s*\d\s*\d?\s*分?\s*)[))]', "(" + r'\1'.replace(" ", "") + ")", html2txt)
  284. html2txt = re.sub(r'\[来源:.*?\]', "", html2txt)
  285. html2txt = re.sub('<p>欢迎访问.*?</p>', '', html2txt)
  286. html2txt = re.sub('w\s*w\s*w\..*?(\.\s*c\s*o\s*m|\.cn)+|(?<!["“=\'])http:.*?\.(com|cn|org)', "",
  287. html2txt) # ww w.gkstk.c om
  288. html2txt = re.sub(r'<(table|tr) [a-z]+="\d+">', r'<\1>', html2txt) # <td rowspan="2">保留
  289. html2txt = re.sub(r'<(table)( [a-z]+=".*?")+>', r'<\1>', html2txt)
  290. html2txt = re.sub(r'<p>\s*第\s*[二三四ⅡⅢⅣ]\s*(卷|部分)\s*([((].*?[))]|非?选择题.{,8})?\s*</p>', "<p>【非选择题】</p>", html2txt)
  291. # == == =对可能的题型行的处理 == ==
  292. html2txt = re.sub("<p>【非选择题】</p>((\s|\n|<p>|</p>)*\d{1,2}\s*[..、、].+?)", r"<p>二、解答题</p>\1", html2txt)\
  293. .replace("【非选择题】", "")
  294. # =====选项的处理=====
  295. html2txt = re.sub(r'(<p>\s*([1-9]|[1-9][0-9])\s*[..、、].+?[((]\s*[))])\s*(A\s*[..、、][^/]*?</p>)',
  296. r"\1</p>\n<p>\3", str(html2txt))
  297. # =====题号的处理=====
  298. html2txt = re.sub(r'([ED]\s*[、..、].*?(\s|</su[pb]>\s*))(([1-9]|[1-9][0-9])\s*[、..、])',
  299. r"\1</p>\n<p>\3", html2txt)
  300. html2txt = re.sub(r'((</?p>|\n)\s*(<img src=.*?"\s*/?>\s*)?([1-9]|[1-9][0-9]))\s*'
  301. r'([((]\s*(\d{1,2}[.\s\d]*?分|.{2,3}题?)\s*[))]|解析?\s*[::]|【解析】)', r"</p>\1、\5", html2txt)
  302. html2txt = re.sub(r"<p>\s*([1-9]|[1-9][0-9])\s*([((]20\d{2}\s*[\u4e00-\u9fa5、、]{2,9}[))])", r"<p>\1、\2",
  303. html2txt)
  304. html2txt = re.sub(r"<p>\s*([1-9]|[1-9][0-9])\s*(【(解析?|答案?)】|(解析?|答案?)\s*[::]|\[(答案|解析)\])", r"<p>\1、\2",
  305. html2txt)
  306. html2txt = re.sub(r"<p>\s*([1-9]|[1-9][0-9])\s*([((]\s*\d+\s*分?\s*[))])?(【(解析?|答案?)】|(解析?|答案?)\s*[::]"
  307. r"|\[(答案|解析)\])", r"<p>\1、\2\3", html2txt)
  308. html2txt = re.sub(r"(</?p>|\n)\s*(<img src=((?!/>).)+?/>)\s*([1-9]|[1-9][0-9])\s*"
  309. r"([((]20\d{2}\s*[\u4e00-\u9fa5、、]{2,9}[))])", r"<p>\2</p>" + "\n" + r"<p>\4、\5", html2txt) # 【susp_img】
  310. html2txt = re.sub(r'(</?p>|\n)((\s*<su[bp]>\s*)?<img src=.*? height="[\d.]+p[tx]"\s*/?>(\s*</su[bp]>)?\s*)'
  311. r'(([1-9]|[1-9][0-9])\s*[、..、])', r"</p>\2</p>" + "\n" + r"\5", html2txt)
  312. html2txt = re.sub(r"(<p>((?!<p>).)+?(\s|[/\"]>))(([1-9]|[1-9][0-9])\s*[、..、].{,20}本[大小]?题\d+分)",
  313. r"\1</p>" + "\n<p>" + r"\4", html2txt)
  314. html2txt = re.sub(r"</?p>((\s*<su[bp]>\s*)?<img src=.*?/>(\s*</su[bp]>)?"
  315. r"((\s*<su[bp]>\s*)?<img src=((?!/>).)+?/>(\s*</su[bp]>)?)*?\s*)\s*(([1-9]|[1-9][0-9])\s*[、..、])",
  316. r"</p>\1</p>" + "\n<p>" + r"\8", html2txt, flags=re.S)
  317. html2txt = re.sub(r'(<p>\s*[一二三四五六七八九十].*?题\s*\(.+?分.*?\))\s*(([1-9]|[1-9][0-9])\s*[、..、].*?)</p>',
  318. r"\1</p>\n<p>\2</p>", html2txt)
  319. html2txt = re.sub(r'(<p>\s*[一二三四五六七八九十].*?题\s*\(.+?分.*?\))\s*(([1-9]|[1-9][0-9])\s*[、..、].*?)</p>',
  320. r"\1</p>\n<p>\2</p>", html2txt)
  321. html2txt = re.sub(r'(<p>.*?[..]{6,}\s*\d+分)\s*(([1-9]|[1-9][0-9])\s*[、..、].*?)</p>', r"\1</p>\n<p>\2</p>", html2txt)
  322. html2txt = re.sub(r'([1-9]|[1-9][0-9])\s*([((]\s*\d{1,2}[.\s\d]*?分\s*[))])\s*[、..、]', r"\1" + "、" + r"\2", html2txt)
  323. # =====图片的处理=====
  324. # 1>>根据图片宽高的异常值判断删除隐藏图片
  325. def sub1(ss):
  326. if float(ss.group(1)) <= 3 and float(ss.group(2)) <= 3:
  327. return ""
  328. else:
  329. return ss.group(0)
  330. html2txt = re.sub(r'<img src=.*? width="([\d.]+)p[xt]" height="([\d.]+)p[xt]"\s*/?>', sub1, html2txt)
  331. # 2>>将图片中带有的汉字去掉
  332. html2txt = re.sub(r'(<img src=.*?) alt=".+?"', r"\1", html2txt)
  333. # html2txt = re.sub(r'(<img src=.+?(?<!\\)\")>', r"\1 />", html2txt) # 将">换为" />
  334. html2txt = re.sub(r'(<img src=(?!\sstyle=)+?(?<!\\)\")>', r"\1 />", html2txt) # 将">换为" />
  335. # 3>>建立图片id字典,对原图片信息第一次替换
  336. html2txt = re.sub(r'( src=".*?files)\\image', r"\1/image", html2txt)
  337. all_image = re.findall(r'<img src=".*?image[\da-z]+\..*?[/\"]>', html2txt)
  338. src2subs = {}
  339. subs2src = {}
  340. for src in all_image:
  341. # 校本题库上传的图片名称是随机数,故设置映射
  342. # kk = re.search('(<img src=".*?image\d+\.(png|gif|jpg|jpeg))', src)
  343. # new_src = src.replace(kk.group(1), self.img_url[kk.group(1)]) if type(self.img_url) == dict and kk else src
  344. # 图片信息简化替换
  345. print(src)
  346. new_src = re.sub(r'( data-latex)="\s*\\\[(.*?)\\\]\s*"', r'\1="$\2$"', src)
  347. new_src = re.sub(r'( data-latex="\$[^"]+?\$")',
  348. lambda x: x.group(1).replace("<", " \lt ").replace(" ", " "), new_src)
  349. latex_info = re.search(r'<img src=".*?/(new_)?image([\da-z]+)\..*?(data-latex=".*?")', src)
  350. mathpix = " " + latex_info.group(3).replace("\n", "").strip().replace(" ", " ") if latex_info else ""
  351. if mathpix and len(mathpix) > 20:
  352. mathpix = ""
  353. w_h_info = re.search(r'<img src=".*?/(new_)?image([\da-z]+)\..*?width="([\d.]+)[pxt]*?"\s*height="([\d.]+)[pxt]*?"', src)
  354. w_h = " w_h=" + w_h_info.group(3).split('.')[0] + "*" + w_h_info.group(4).split('.')[0] \
  355. if w_h_info and not mathpix else "" # w_h 和 mathpix只存在一个
  356. # image_id = re.search(r'<img src=".*?/(new_)?image([\da-z]+)\.', src).group(2)
  357. image_info = re.search(r'<img src=".*?/([^/]+?)/(new_)?image([\da-z]+)\.', src) # 2023.12.1
  358. print(image_info.groups())
  359. image_id = image_info.group(1) + image_info.group(3)
  360. if len(image_id) > 10:
  361. image_id = image_id[-10:]
  362. src2subs[src] = '<imgsrc' + image_id + w_h + mathpix + "/>"
  363. subs2src['<imgsrc' + image_id + w_h + mathpix + "/>"] = new_src
  364. for k, v in src2subs.items():
  365. html2txt = html2txt.replace(k, v)
  366. # ------------------------------------------------------------------------
  367. # ========html 转 list=========
  368. html2txt = re.sub(r'(</?div>|</table>|</?body>)(\n\s*)*?<p>', r"\1</p>" + "\n<p>", html2txt, flags=re.S)
  369. # >>>>>> <table>先替换后再切割
  370. # 不能简单按 \n 切割,表格里面也可能有换行,应该先替换后再切割
  371. subs2table = {}
  372. all_table = re.findall(r'<table>.*?</table>', html2txt, flags=re.S)
  373. for k, v in enumerate(all_table):
  374. html2txt = html2txt.replace(v, "<t{}b>".format(k))
  375. # 将表格中的换行去掉
  376. v = re.sub(r'<p>\s*(</?t[drh]( .*?")?>|</?table>|</?tbody>)\s*</p>', r"\1", v)
  377. v = re.sub(r'</td></p>[\n\s]*<p><td>', "</td><td>", v)
  378. v = re.sub(r'<td>(<p>|\s|</p>|\n)*</td>', "<td> </td>", v)
  379. v = re.sub(r'</tbody></?p></table>', "</tbody></table>", v)
  380. v = re.sub(r'(</?t[drh]( .*?")?>|</?table>|</?tbody>)(\s*<p>\s*</p>)[\s\n]*?(<br\s*/?>|\n)+', r"\1", v, flags=re.S)
  381. v = re.sub(r'(</?t[drh]( .*?")?>|</?table>|</?tbody>)(<br\s*/?>|\n|</p>|\s)+', r"\1", v, flags=re.S)
  382. v = re.sub(r'(</t[drh]( .*?")?>|</table>|</tbody>)(<br\s*/?>|\n|<p>|\s)+', r"\1", v, flags=re.S)
  383. # 暂时还有table标签首尾的换行没去掉
  384. subs2table["<t{}b>".format(str(k))] = v
  385. # <造成的css标签冲突处理 2021-10-13
  386. def sub2(ss):
  387. if re.search(r'^(img|/?h[123456]|/?su[bp]>|t\d+b>|br\s*/?>'
  388. r'|/?(p|span|font|article|ul|ol|div|table|t?body|html|head|t[drh])(\s*|\s+style=.*?")>'
  389. r'|/?[a-z]+ style=.*?">)', ss.group(1)) is None:
  390. return "&lt;{}".format(ss.group(1))
  391. else:
  392. return "<{}".format(ss.group(1))
  393. html2txt = re.sub("<([^<]{1,30})", sub2, html2txt)
  394. # print(html2txt)
  395. # >>>>>> html 切割
  396. con_list = sum([re.split('<p>|<h[12345]>', i) if len(re.findall("<p>|<h[12345]>", i)) > 1 else [i] for i in
  397. re.split(r"\n+|</p>(?!</td>)|</h[12345]>", html2txt)], []) # html2txt)[:-1]
  398. con_list = [re.sub(r"^\n*\s*(<p>|<h[12345]>)+", "", ii) for ii in con_list]
  399. # >>>>>> <table> 替换回去
  400. if subs2table:
  401. con_list = [re.sub(r"|".join(subs2table.keys()), lambda x: subs2table[x.group()], ii) for ii in con_list]
  402. # 剩余个别标签处理
  403. con_list = [re.sub(r"^<([a-z]+)>[\s\t\n]*</\1>$", "", i.strip()) for i in con_list] # 2020/4/7,14
  404. con_list = [re.sub(r"^(<table>|</td>|<td[^<>]*?>|</?tr>)+?(.|\n)+?([一二三四五六七八九十])\s*[、..、]\s*(.{2,4}题)(.|\n)+?</table>",
  405. r"\3、\4", i.strip())
  406. for i in con_list]
  407. # 把最后可能还存在的</?p>或考号信息去掉
  408. con_list = [re.sub("</?p>|[…O•.\s]*?密[…O•.\s]*?封[….O•\s]*?装?[…O•.\s]*?订?[….O•\s]*?线?[….O•\s]*?$"
  409. "|((学校|班级|姓名|座位号|准考号|[学考]号)[\s::_]*?){2,}$", "", i.strip()) for i in con_list]
  410. # =====答案行格式处理====
  411. temp_list = [re.split(r"^((\s*<imgsrcw_h=[^/\"]*?(data-latex=.*?)?\s*[/\"]>\s*)+)", v.strip(), maxsplit=1)[1::3]
  412. if re.match(r'(\s*<imgsrcw_h=[^/\"]*?(data-latex=.*?)?\s*[/\"]>\s*)+?(参考|考试|试[题卷]|物理|理综|数学|化学|生物)(答案|解析|答案[及与和]评分(标准|意见|细则))\s*$'
  413. r'|(\s*<imgsrcw_h=[^/\"]*?(data-latex=.*?)?\s*[/\"]>\s*)+?评分标准'
  414. r'|(\s*<imgsrcw_h=[^/\"]*?(data-latex=.*?)?\s*[/\"]>\s*)+?(参考|考试|试[题卷])(答案|解析|答案[及与和]评分(标准|意见|细则))\s*(物理|理综|数学|化学|生物)?\s*$',
  415. re.sub(r"[上下]?学[年期]|[\d—【】..、、::(())年\s]|[中大]学|模拟|[中高]考|年级|[学期][末中]|[高初][一二三]", "",
  416. v.strip())) else [v] for v in con_list]
  417. con_list = sum(temp_list, [])
  418. # =====对可能的题号的处理==== 如2、3、4、5、 加了【fei】 # 重新修改!!!!!!!!!!
  419. con_list = [re.sub(r"^\s*([1-9][0-9]?\s*[..、、])", r"【fei】\1", i.strip())
  420. if (len(re.findall(r"(^|\s*[..、、])\s*[1-9][0-9]?\s*[..、、]", i)) >= 3
  421. and len(re.sub(r"[\d..、、\s]", "", i)) < 2) else i for i in con_list]
  422. # =====头尾清除没用的信息=====
  423. if con_list and re.search(r"[\u4e00-\u9fa5]|<img ", con_list[0]) is None:
  424. con_list = con_list[1:]
  425. while con_list and re.search(r"声明[::].*?著作权属.*?所有|(邮箱|用户|日期|QQ)\s*[::].+?", con_list[-1]):
  426. con_list = con_list[:-1]
  427. return con_list, subs2src, self.new_html
  428. def del_no(item, item_no_type=1):
  429. """去开头的题号"""
  430. if item_no_type == 2:
  431. item = re.sub(r'^\n*\s*[((]\s*([1-9]|[1-9][0-9])\s*[))]\s*[..、、::]?', "", item)
  432. return item
  433. item = re.sub(r'^\n*\s*([1-9]|[1-9][0-9])\s*[..、、::]', "", item)
  434. return item
  435. def html_cleal_test(htmlf): # 不用
  436. html2txt = re.sub(r"&nbsp;", "", htmlf.read()) # ("", " ")
  437. # html2txt.replace("①", "(1).").replace("②", "(2).").replace("③", "(3).")
  438. con_list = [re.sub(r"^\n+\s+<p>", "", ii) for ii in html2txt.split("</p>")[:-1]]
  439. # pprint(con_list)
  440. if re.search(r"[\u4e00-\u9fa5]", con_list[0]) is None:
  441. con_list = con_list[1:]
  442. return con_list
  443. def get_md5(image_id):
  444. """
  445. 由于hash不处理unicode编码的字符串(python3默认字符串是unicode)
  446. 所以这里判断是否字符串,如果是则进行转码
  447. 初始化md5、将image_name进行加密、然后返回加密字串
  448. """
  449. image_name = str(image_id) + str(time.time()) + str(random.random())
  450. image_name = image_name.encode("utf-8")
  451. md = hashlib.md5()
  452. md.update(image_name)
  453. return str(md.hexdigest())
  454. def wash_after(res_dict, subject="数学"):
  455. """
  456. 1.处理最终结果多余的换行符;2.对题文中已给答案的选择填空进行替换;3.选择题的细分
  457. :param res_dict:
  458. :return:
  459. """
  460. pattern1 = re.compile(
  461. r"([是为点]|等于|=|=|有|存在)\s*_+((<img src=((?!/>).)+?[/\"]>|[^_;;。?!,\n])+?)(?<![==_])_+([cdkm上]?m?\s*.?[。.?]?\s*"
  462. r"($|<br/>|<img src|……))")
  463. pattern2 = re.compile(r"((有|存在|[是为])[\u4e00-\u9fa5]{0,2})\s*_+(\d+)_+\s*([\u4e00-\u9fa5,,;;。..])")
  464. chapter_no = {}
  465. option_st = 0
  466. is_optional = False
  467. option_score = 0
  468. select_type_id = []
  469. all_content_str_list = []
  470. topic_type_list = []
  471. for num, sr in enumerate(res_dict):
  472. sr["stem"] = re.sub(r"\n[_\-\s]*密[…O•.\s]*封[….O•\s]*装?[…O•.\s]*订?[….O•\s]*线?"
  473. r"|\n\s*((学校|班级|姓名|座位?号|准考号|学号)[\s::_]*){2,}", "", sr["stem"])
  474. sr["stem"] = re.sub(r'\n\s*(第\s*[^\s]\s*卷|第[一二三四]部分)\s*([((].*?[))]|非?选择题.{,8})?\s*\n', "\n", sr["stem"])
  475. if num == len(res_dict) - 1: # 对拆分后的最后一道题进行特殊判断
  476. end_con = sr["stem"] + sr["parse"]
  477. if len(re.findall(r"[\u4e00-\u9fa5]", end_con)) > 1000 and (
  478. len(re.findall(r"\n\s*([1-9]|1[0-9])\s*[..、、].+?",
  479. end_con)) > 4 or len(re.findall(r"[((]\s*[))]|_{2,}", end_con)) > 6):
  480. sr['errmsgs'].append("原试卷格式有问题,导致本题可能包含了很多非本题的题文")
  481. if not re.sub(r"[(())\n\s]", "", sr["stem"]):
  482. sr['errmsgs'].append("本题没有题干,请检查题干格式是否正确")
  483. if "-" in str(sr["item_id"]) and sr['type'] in ["选择题", "填空题"]:
  484. if (not sr["key"] or sr["key"]=="见解析") and re.search("[A-H]+", re.sub("[;;、、\n(())\s]|\d+分", "", sr["parse"])):
  485. sr["key"] = re.sub("[;;、、\n(())\s]|\d+分", "", sr["parse"])
  486. sr["parse"] = ""
  487. # 把首尾的换行都去掉
  488. # sr["stem"] = table_label_cleal(re.sub(r"\n\s*","<br/>",sr.get("stem", "").lstrip()))
  489. # 将选择题和填空题中的题干中出现答案的情况 去掉答案
  490. kuo_con1 = re.search(r'([是为]|等于|[==有]|表示)\s*[((]\s*([A-Zc][A-Zc;;和与、、\s]*?)[))]\s*(.?($|\n|<br/>|<img))',
  491. sr["stem"])
  492. kuo_con2 = re.search("[((]\s*([A-Zc][A-Zc;;和与、、\s]*?)[))]\s*(.?($|\n|<br/>))", sr["stem"])
  493. if sr['type'].replace("题", "") in ["单选", "多选", "选择", "不定选择"]:
  494. # sr["type"] = "选择"
  495. # 针对选择题在题文中已给出答案的处理
  496. if kuo_con1:
  497. sr["stem"] = sr["stem"].replace(kuo_con1.group(0), kuo_con1.group(1) + "( )" + kuo_con1.group(3))
  498. sr["key"] = kuo_con1.group(2).replace("c", "C") if not sr["key"] else sr["key"]
  499. elif kuo_con2:
  500. sr["stem"] = sr["stem"].replace(kuo_con2.group(0), "( )" + kuo_con2.group(2))
  501. sr["key"] = kuo_con2.group(1).replace("c", "C") if not sr["key"] else sr["key"]
  502. # sr['options_text'] = ""
  503. elif sr['type'] == '填空题':
  504. # sr["type"] = "填空"
  505. ans_list = []
  506. # 针对填空题在题文中已给出答案的处理
  507. sub_n = 0
  508. while re.search(pattern1, sr["stem"]):
  509. blank_con1 = re.search(pattern1, sr["stem"])
  510. sr["stem"] = sr["stem"].replace(blank_con1.group(0),
  511. blank_con1.group(1) + "____" + blank_con1.group(5))
  512. ans_list.append(blank_con1.group(2))
  513. sub_n += 1
  514. if sub_n > 5:
  515. break
  516. while re.search(pattern2, sr["stem"]):
  517. blank_con2 = re.search(pattern2, sr["stem"])
  518. # 这里的限制条件易出错,可以再判断一下
  519. sr["stem"] = sr["stem"].replace(blank_con2.group(0),
  520. blank_con2.group(1) + "____" + blank_con2.group(4))
  521. ans_list.append(blank_con2.group(2))
  522. if re.findall(r"_{2,}", sr["stem"]):
  523. sr["blank_num"] = len(re.findall(r"_{2,}", sr["stem"]))
  524. if not sr["key"] and ans_list:
  525. sr["key"] = "; ".join(ans_list)
  526. # 已知题型是错误的情况,如解答题,放在填空题中
  527. if 'blank_num' not in sr and re.search("_+([^_]*?)_+", sr['stem']) is None:
  528. sr['errmsgs'].append("填空题题干中没有下划线(__),与题型(填空题)不符")
  529. # stem_c = re.sub("<img src=.*?/>|[,,.。.、、]", "", sr["stem"])
  530. # if len(stem_c) > 2: # 不自动纠错
  531. # sr["type"] = "解答题"
  532. # sr["type"] = "解答"
  533. # else: # 大题题型先不做范围判断
  534. # if sr['type'] and sr['type'].replace("题", "") not in ["解答", "计算", "实验", "作图"]:
  535. # sr["type1"] = "解答"
  536. # else:
  537. # sr["type1"] = sr['type'].replace("题", "")
  538. # if "is_optional" not in sr:
  539. # sr["is_optional"] = is_optional
  540. # sr["option_str"] = ""
  541. # 换行符处理!
  542. sr["stem"] = sr.get("stem", "").strip().replace("\n\n", "\n").replace("\n", "<br/>") # 2020/4/10 gai
  543. # sr["stem"] = get_equation_instr(sr["stem"])
  544. if "options" in sr: # 对选项部分进行格式处理
  545. for i in range(len(sr['options'])):
  546. sr['options'][i] = get_simpstr2eqn(sr['options'][i].strip()).replace("\n\n", "\n").replace("\n", "<br/>")
  547. # sr['options'][i] = get_equation_instr(sr['options'][i].strip()).replace("\n\n", "\n").replace("\n", "<br/>")
  548. if "slave" in sr and sr["slave"]:
  549. # 带小题的大题,格式处理,高中数学没有这一功能
  550. for s in sr["slave"]:
  551. s["stem"] = s.get("stem", "").strip().replace("\n\n", "\n").replace("\n", "<br/>")
  552. # 已分小问了的题号,是不会带小题号的,故不需要替换
  553. # s["stem"] = re.sub(r"[((]\s*(\d|ⅰⅱⅲⅳ|i{1,3})\s*[))]|[①②③④]\s*(?![+-])", "", s["stem"][:5]) + s["stem"][5:]
  554. s["parse"] = s.get("parse", "").strip().replace("\n\n", "\n").replace("\n", "<br/>")\
  555. .replace("解答:解:", "解答:").replace("解答:解:", "解答:")
  556. s["key"] = s.get("key", "").strip().replace("\n\n", "\n").replace("\n", "<br/>")
  557. # sr["slave"] = sr.get("slave", "").replace("\n", "<br>")
  558. if "answer_type" in s:
  559. s["answer_type"] = configs.answer_type[s["answer_type"]]
  560. else:
  561. # s["parse"] = css_conflict_deal(s["parse"]) # "css 冲突标签处理"
  562. sr["parse"] = sr.get("parse", "").lstrip().replace("\n\n", "\n").replace("\n", "<br/>")
  563. sr["parse"] = re.sub("^【解[答析]】\s*", "", sr["parse"])
  564. # sr["parse"] = get_equation_instr(sr["parse"])
  565. sr["key"] = sr.get("key", "").lstrip().replace("\n\n", "\n").replace("\n", "<br/>")
  566. # sr["key"] = get_equation_instr(sr["key"])
  567. if "answer_type" in sr:
  568. sr["answer_type"] = configs.answer_type[sr["answer_type"]]
  569. if not sr["parse"] and not sr["key"]: # 答案和解析都没有
  570. # sr["parse"] = "略"
  571. # sr["key"] = "略"
  572. sr['errmsgs'].append("本题缺少答案和解析")
  573. elif not sr["key"] and sr["parse"]:
  574. sr["key"] = "" # 见解析
  575. elif re.sub("见解析|略|空|无|没有|答案", "", sr["key"]) and not sr["parse"]:
  576. sr["parse"] = "略"
  577. # if "本选做题缺少解析" not in sr['errmsgs'] and "本题缺少解析" not in sr['errmsgs']:
  578. # sr['errmsgs'].append("本题缺少解析")
  579. # 辅助标签处理
  580. # sr["analysis"] = ""
  581. if "analy" in sr: # 存在题目分析时,将其放在解析里
  582. sr["analy"] = sr.get("analy", "").strip().replace("\n\n", "\n")
  583. if len(sr["analy"].replace(" ", "")) >= 10:
  584. sr["parse"] = "【分析】"+sr["analy"].replace("\n", "<br/>") + "<br/>【详解】" + sr["parse"]
  585. del sr["analy"]
  586. if "chapter" in sr: # 如选修4-5:不等式选讲
  587. if sr['item_id'] + 1 <= len(res_dict):
  588. chapter_no[sr['item_id']] = sr["chapter"]
  589. del sr["chapter"]
  590. # 是否为选做题"is_optional",两种形式不会同时出现
  591. if "option_st" in sr: # 带有此标签的后面的题目都是选做题option_score
  592. # option_st = sr['item_id']
  593. # is_optional = True
  594. # if "," in sr["option_st"]:
  595. # option_score = int(sr["option_st"].split(",")[-1])
  596. del sr["option_st"]
  597. # elif sr['type'] == '选做题': # 题型是选做题 如五、选做题
  598. # select_type_id.append(sr['item_id'])
  599. # sr['is_optional'] = 'true'
  600. # sr['score'] = option_score
  601. # elif "type1" in sr and sr["type1"] == "解答" and "is_optional" not in sr:
  602. # sr["is_optional"] = is_optional
  603. # if is_optional:
  604. # sr['score'] = option_score
  605. # if "type1" in sr:
  606. # del sr["type1"]
  607. # 题型纠正
  608. # 将选择题改为单选或多选,"is_multiple_choice"
  609. sr['type'] = re.sub("([单多])项选择题?", r"\1选题", sr['type'])
  610. sr['type'] = sr['type'].replace("题题", "题") # .replace("简答", "解答")
  611. # sr['type'] = re.sub("(计算|简答)题?", "解答题", sr['type'])
  612. if sr['type'] in ["选择", "选择题"]: # 有的科目只有选择题,不分单选和多选
  613. if len(re.findall("[A-Z]", sr["key"])) > 1:
  614. sr['type'] = '多选题'
  615. elif len(re.findall("[A-Z]", sr["key"])) == 1:
  616. sr['type'] = '单选题'
  617. elif "数学" in subject or "物理" in subject:
  618. sr['type'] = '单选题'
  619. info_x = re.search("^[((](多)选题?[))]", sr["stem"].replace(" ", ""))
  620. if info_x:
  621. sr['type'] = '{}选题'.format(info_x.group(1))
  622. if sr['type'] == '多选题':
  623. if len(re.findall("[A-Z]", sr["key"])) == 1:
  624. sr['errmsgs'].append("本题答案个数与题型(多选题)不符")
  625. # sr["is_multiple_choice"] = 'true'
  626. elif sr['type'] == '单选题':
  627. # sr["is_multiple_choice"] = 'false'
  628. if "options" in sr and len(sr["options"]) > 4:
  629. sr['errmsgs'].append("选项个数多于4个,与题型(单选题)不符")
  630. if len(re.findall("[A-Z]", sr["key"])) > 1:
  631. sr['errmsgs'].append("本题答案个数与题型(单选题)不符")
  632. elif sr['type'] == '不定选择题':
  633. if len(re.findall("[A-Z]", sr["key"])) > 1:
  634. sr['type'] = '多选题'
  635. elif len(re.findall("[A-Z]", sr["key"])) == 1:
  636. sr['type'] = '单选题'
  637. elif "数学" in subject or "物理" in subject:
  638. sr['type'] = '单选题'
  639. else:
  640. sr['type'] = '选择题'
  641. if "缺少答案" not in "".join(sr['errmsgs']):
  642. sr['errmsgs'].append("本题缺少答案")
  643. elif "数学" in subject:
  644. if sr['type'].replace("题", "") == "填空":
  645. if sr['blank_num'] > 1:
  646. sr['type'] = "多空题"
  647. else:
  648. sr['type'] = "单空题"
  649. elif sr['type'].replace("题", "") not in ["单空", "多空"]:
  650. sr['type'] = "解答题"
  651. # elif "物理" in subject:
  652. # # 用第一版模型预测
  653. # content = sr['stem']
  654. # if "options" in sr and sr["options"]:
  655. # content+= "\n" + "\n".join(["{}、{}".format(chr(ord('@') + idm + 1), option)
  656. # for idm, option in enumerate(sr["options"])])
  657. # try:
  658. # r = requests.post(url=configs.phy_topicType_ip,
  659. # json={"content": content, "period": "高中",
  660. # "topic_type": sr['type']})
  661. # sr['type'] = r.json()["res"]
  662. # if sr['type'] == "简答题":
  663. # sr['type'] = "解答题"
  664. # except Exception as e:
  665. # print(e)
  666. # if sr['type'].replace("题", "") in ["单空", "多空", "填空"]:
  667. # sr['type'] = "填空题"
  668. # else:
  669. # sr['type'] = "解答题"
  670. elif sr['type'].replace("题", "") in ["单空", "多空", "填空"]:
  671. sr['type'] = "填空题"
  672. elif sr['type'] not in ["选择", "选择题"]:
  673. sr['type'] = "解答题"
  674. content = sr['stem']
  675. if "options" in sr and sr["options"]:
  676. content += "\n" + "\n".join(["{}、{}".format(chr(ord('@') + idm + 1), option)
  677. for idm, option in enumerate(sr["options"])])
  678. all_content_str_list.append(content)
  679. topic_type_list.append(sr['type'])
  680. # """按照原先高中数学解析的最后输出格式整理输出"""
  681. # sr["type"] = sr['type'].replace("非选择", "解答").replace("题题", "题") #
  682. sr["topic_num"] = sr['item_id']
  683. sr['errmsgs'] = ";".join(sr['errmsgs'])
  684. sr["parse"] = re.sub(r"试题【([分解]析)】", r"试题\1:", sr["parse"]) # 解析
  685. sr["key"] = re.sub("([;;]|<br/>)\s*$", "", sr["key"])
  686. if 'susp_pic' in sr:
  687. del sr['susp_pic']
  688. if 'is_optional' in sr:
  689. del sr['is_optional']
  690. if 'spliterr_point' in sr:
  691. del sr['spliterr_point']
  692. if 'score' in sr:
  693. del sr['score']
  694. del sr['item_id']
  695. # ---------------------字符串公式处理--------------------------------
  696. # sr["stem"] = get_equation_instr(sr["stem"])
  697. # sr["key"] = get_equation_instr(sr["key"])
  698. # sr["parse"] = get_equation_instr(sr["parse"])
  699. # if "options" in sr:
  700. # sr["options"] = list(map(get_equation_instr, sr["options"]))
  701. # ----------------------------------------------------------------
  702. # 物理题型批量调接口:节约时间
  703. if "物理" in subject:
  704. epoches = int(len(all_content_str_list) / 10)
  705. pred_topic_types = []
  706. if epoches > 0:
  707. last = 0
  708. for epoch in range(epoches):
  709. input_data = {"content": all_content_str_list[last:(epoch+1)*10], "period": "高中",
  710. "topic_type": topic_type_list[last:(epoch+1)*10]}
  711. last = (epoch+1)*10
  712. try:
  713. r = requests.post(url=configs.phy_topicType_ip, json=input_data)
  714. pred_topic_types.extend(r.json()["res"])
  715. except Exception as e:
  716. print(e)
  717. pred_topic_types.extend([""]*10)
  718. rest_con = all_content_str_list[last:]
  719. rest_topic_type = topic_type_list[last:]
  720. else:
  721. rest_con = all_content_str_list
  722. rest_topic_type = topic_type_list
  723. if rest_con:
  724. input_data = {"content": rest_con, "period": "高中", "topic_type": rest_topic_type}
  725. try:
  726. r = requests.post(url=configs.phy_topicType_ip, json=input_data)
  727. pred_topic_types.extend(r.json()["res"])
  728. except Exception as e:
  729. print(e)
  730. pred_topic_types.extend([""] * len(rest_con))
  731. # 将预测题型替换到res_dict中
  732. if any([True for i in pred_topic_types if i]) and len(pred_topic_types) == len(res_dict):
  733. for idx, pred_type in enumerate(pred_topic_types):
  734. if pred_type and res_dict[idx]['type'] in ["填空题", "解答题"]:
  735. if pred_type == "简答题":
  736. pred_type = "解答题"
  737. res_dict[idx]['type'] = pred_type
  738. # --------------------------------------------------------------
  739. # 换行符替换
  740. convert_huanhang(res_dict)
  741. # ------------------------------------------------------------------------
  742. # if chapter_no: # 章节标签下移一位
  743. # for c, v in chapter_no.items():
  744. # res_dict[c]["chapter"] = v
  745. # 选做题"option_str"处理
  746. # if select_type_id:
  747. # for s in select_type_id:
  748. # if len(select_type_id) == 2:
  749. # res_dict[s - 1]['option_str'] = "2选1"
  750. # elif len(select_type_id) == 4:
  751. # res_dict[s - 1]['option_str'] = "4选2"
  752. # else:
  753. # res_dict[s - 1]['errmsgs'] += ";<br/>选做题不是“2选1”和“4选2”类型"
  754. # if option_st:
  755. # print("option_st:", option_st)
  756. # for s in range(option_st, len(res_dict)):
  757. # if (len(res_dict) - option_st) == 2:
  758. # res_dict[s]['option_str'] = "2选1"
  759. # elif (len(res_dict) - option_st) == 4:
  760. # res_dict[s]['option_str'] = "4选2"
  761. # else:
  762. # res_dict[s]['errmsgs'] += ";<br/>选做题不是“2选1”和“4选2”类型"
  763. # 再解析中的新图片上传腾讯云
  764. # 再设置一个入库接口,点击入库,才开始从本地上传图片
  765. return res_dict
  766. def convert_huanhang(items_list):
  767. """
  768. 递归 换行符替换:\n --> <br/>
  769. :param items_list:
  770. :return:
  771. """
  772. if isinstance(items_list, list):
  773. for k, one_i in enumerate(items_list):
  774. items_list[k] = convert_huanhang(one_i)
  775. elif isinstance(items_list, dict):
  776. for k, v in items_list.items():
  777. if k == "answer_type" and type(v) == str:
  778. items_list[k] = configs.answer_type[v]
  779. else:
  780. items_list[k] = convert_huanhang(v)
  781. if "answer_type" in items_list and items_list["answer_type"] == 2:
  782. if ("slave" not in items_list or not items_list["slave"]) and "stem" in items_list:
  783. items_list["stem"] = re.sub(r"(__{2,})", r'<span style="color:RGB(50%,40%,30%); blank space">\1</span>',
  784. items_list["stem"])
  785. elif isinstance(items_list, str):
  786. item_str = items_list.strip().replace("\n\n", "\n")
  787. item_str = re.sub(r'(</table>)(<br\s*/?>|\n)+', r"\1", item_str)
  788. return item_str.replace("\n", "<br/>")
  789. else:
  790. return items_list
  791. return items_list
  792. def css_conflict_deal(item):
  793. """
  794. 针对<a, <p 符号 在前端显示被过滤掉的问题:对“<”左右加$, 注意条件:“<”前$为双数时加,
  795. :return: str
  796. """
  797. # item = item.replace("<", "&lt;").replace(">", "&gt;") # 2021-8-24
  798. # item = re.sub("<(?!img src)", "&lt;", item) # 还有表格
  799. item = item.replace("$<$", "【*_*】") # 多次单题解析时会出现$<$
  800. item = re.sub(r"<(/?su[bp]|br\s*/?|/?table( .*?)?|/?tbody( .*?)?|/?t[rhd]( .*?)?)>", r"【\1】", item)
  801. if re.search(r"(?<!\\\()<", item):
  802. n1=0
  803. n2=0
  804. for i in re.finditer("<(?!img)", item):
  805. if item[:i.start()+2*n1+4*n2].count("$") % 2 == 0:
  806. item = item[:i.start()+2*n1+4*n2] + "$<$" + item[i.start()+1+2*n1+4*n2:]
  807. n1 += 1
  808. else:
  809. item = item[:i.start() + 2 * n1+4*n2]+" \lt " + item[i.start()+1+2*n1+4*n2:]
  810. n2 += 1
  811. # -----------------------------------------------------------
  812. item = item.replace("【*_*】", "$<$")
  813. item = re.sub(r"\\\)\s*\$<\$", r"\) &lt;", item)
  814. # while re.search(r"\\\(((?!\\\().)*?\$<\$((?!\\\().)*?\\\)", item): # 这个
  815. # item = re.sub(r"(\\\(((?!\\\().)*?)\$<\$(((?!\\\().)*?\\\))", r"\1 \lt \3", item) # 线上r"\1 &lt; \2"
  816. while re.search(r"\\\(.*?\$<\$.*?\\\)", item):
  817. item = re.sub(r"(\\\(.*?)\$<\$(.*?\\\))", r"\1 &lt; \2", item) # r"\1 \t \2"
  818. item = re.sub(r"【(/?su[bp]|br\s*/?|/?table( .*?)?|/?tbody( .*?)?|/?t[rhd]( .*?)?)】", r"<\1>", item)
  819. item = re.sub(r"(<br\s*/?>\s*|\n\s*)+<(/?table( .*?)?|/?tbody( .*?)?|/?t[rhd]( .*?)?)>\s*(<br\s*/?>\s*|\n\s*)+",
  820. r"<\2>", item)
  821. item = item.replace("$<$span class=", "<span class=")
  822. return item
  823. def insert_sort2get_idx(item_list, num):
  824. """
  825. :param item_list: 拍好序的列表
  826. :param num: 插入的数值
  827. :return: 插入的位置
  828. """
  829. add_n = 0
  830. for i in range(len(item_list)):
  831. if num > item_list[i]:
  832. add_n += 1
  833. else:
  834. break
  835. return add_n
  836. # def find_seq_num(num_list):
  837. # """
  838. # 针对切分题号时切错的序号进行纠正,考虑序号是连续且正常的情况下
  839. # 将连续的数字进行分组
  840. # :param num_list:输入[3, 4, 8, 9, 12, 13, 14]
  841. # :return: [[3, 4],[8, 9],[12, 13, 14]]
  842. # """
  843. # seq_ranges = []
  844. # for k, g in groupby(enumerate(num_list), lambda x: x[0] - x[1]):
  845. # group = (map(itemgetter(1), g))
  846. # group = list(map(int, group))
  847. # seq_ranges.append(group)
  848. # return seq_ranges
  849. # def del_exception_value(item_list):
  850. # """
  851. # 去列表中的异常值,题目越多,越容易突出异常值
  852. # :return:
  853. # """
  854. # import numpy as np
  855. # max_v = max(item_list)
  856. # arr_mean = np.mean(item_list) # 均值
  857. # arr_var = np.var(item_list) # 方差
  858. # while max_v > len(item_list)+4:
  859. # item_list.remove(max_v)
  860. # print(item_list)
  861. # arr_mean = np.mean(item_list) # 去最大值后的均值
  862. # arr_var = np.var(item_list) # 去最大值后的方差
  863. # max_v = max(item_list)
  864. # # print("均值与方差:",arr_mean,arr_var)
  865. # if abs((item_list[-1] - item_list[0] + 1) - len(item_list)) <= 3:
  866. # return item_list
  867. # else:
  868. # exception_value = []
  869. # for i in item_list:
  870. # # print(abs((i - arr_mean) / arr_var), i)
  871. # if(abs((i - arr_mean)/arr_var)) > 0.3:
  872. # exception_value.append(i)
  873. # right_seq = [i for i in item_list if i not in exception_value]
  874. # return right_seq
  875. def pic_transfer(con_list):
  876. aft_opt = [] # 针对选项后是题目图片的情况,进行移位
  877. if "\n" in con_list[-1]:
  878. ccon = re.split("\n+", con_list[-1])
  879. while re.match("<img src=", ccon[-1]) and len(ccon) > 1:
  880. aft_opt.insert(0, ccon[-1])
  881. ccon = ccon[:-1]
  882. if aft_opt:
  883. con_list[0] += "\n" + "\n".join(aft_opt)
  884. con_list[-1] = "\n".join(ccon)
  885. con_list[0] = re.sub(r"\(\d+分\)", "", con_list[0][:9]) + con_list[0][9:]
  886. return con_list
  887. def judge_split_error(item_list):
  888. """
  889. 转对试卷切分后的小题判断是否存在切分错误的情况,能纠错就纠错,不能则删除
  890. :return:
  891. """
  892. # for k, v in enumerate(item_list):
  893. # if k>0 and v['item_id'] - item_list[k-1]['item_id']>1:
  894. # if
  895. if __name__ == '__main__':
  896. # -------------生成requirements.txt---------------
  897. # pip freeze > requirements.txt
  898. # import os, sys
  899. #
  900. # project_root = os.path.dirname(os.path.realpath(__file__)) # 找到当前目录
  901. # print(project_root)
  902. #
  903. # # 找到解释器,虚拟环境目录
  904. # python_root = sys.exec_prefix
  905. # print(python_root)
  906. #
  907. # # 拼接生成requirements命令
  908. # command = python_root + '\Scripts\pip freeze > ' + project_root + '\\requirements.txt'
  909. # print(command)
  910. #
  911. # # 执行命令。
  912. # os.system(command)
  913. # ----------------一键安装 requirements.txt------------
  914. # pip install -r requirement.txt
  915. # python_root + '\Scripts\' + pip install -r requirements.txt
  916. # import os
  917. # rrr=os.path.basename(r"http:/pstatic.dev.xueping.com/data/word/2020/08/12/5f338d18e2cce.docx")
  918. # print(rrr)
  919. # item = "<a 我没发你的接口 $2366<a$ <a 我没发你的接口 $2366<a$ <img 我没发你的接口 $2366<a$ <a 我没发你的接口 $2366<a$ <a 我没发你的接口 $2366<a$"
  920. # item = r"2.下列选项中,使不等式\( x<\frac{1}{x}< x_{2} \)"
  921. # ww = css_conflict_deal(item)
  922. # print(ww)
  923. p1 = r"C:\Users\Python\Desktop\123\62314b31a7d375f4518b9afd.html"
  924. t1 = open(p1, 'r', encoding="utf8").read()
  925. res = HtmlWash(t1, '11111111',must_latex=1).html_cleal()
  926. print(res)
  927. # html, wordid, is_reparse=0, img_url="", must_latex=0)