123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248 |
# encoding=utf-8
import logging
import os
import re
import sys
import unicodedata

# Put the current working directory on sys.path before the project-local
# imports below are resolved.
sys.path.append(os.getcwd())

import requests
from bs4 import BeautifulSoup

from config import formula_url
from main_clear.latex2maple.latex2maple import structured
# Added by tjt: special handling for cleaning errors caused by taking
# data-latex straight from the rendered page.
# Special handling of '$' (method 1)
def non_data_latex_replace(s):
    r"""Convert ``$``-delimited segments of *s* with ``structured`` (method 1).

    ``\(`` and ``\)`` are first rewritten to ``$``.  Every ``$`` is then
    doubled and the text is split on ``$``, so each original delimiter shows
    up as an empty-string item between two segments, e.g.
    ``"x$a$y"`` -> ``['x', '', 'a', '', 'y']``.

    The list is scanned left to right.  At an empty item whose next item is
    non-empty and is itself followed by an empty item, that next item is
    percent-decoded (``%20``, ``%3C``, ``%3E``) and replaced by
    ``structured(item)``; the scan then resumes after the closing empty item.

    Args:
        s: Text that may contain ``$...$`` or ``\(...\)`` formulas.

    Returns:
        *s* itself when it contains no delimiter at all, otherwise the
        segments joined back together (the ``$`` signs are dropped).
    """
    # "\\(" is the two-character sequence backslash + "(".  The original
    # spelling "\(" is an invalid escape sequence and triggers a
    # SyntaxWarning on recent Python versions.
    s = s.replace("\\(", "$").replace("\\)", "$")

    parts = s.replace("$", "$$").split("$")
    total = len(parts)
    if total == 1:
        return s

    idx = 0
    while idx < total:
        is_opening_marker = (
            parts[idx] == ""
            and idx + 2 < total
            and parts[idx + 1] != ""
            and parts[idx + 2] == ""
        )
        if is_opening_marker:
            decoded = (
                parts[idx + 1]
                .replace("%20", " ")
                .replace("%3C", "<")
                .replace("%3E", ">")
            )
            parts[idx + 1] = structured(decoded)
            # Skip the converted segment and its closing marker.
            idx += 2
        idx += 1
    return "".join(parts)
# Special handling of '$' (method 2)
def non_data_latex_iter(s):
    r"""Convert ``$``-delimited segments of *s* with ``structured`` (method 2).

    ``\(`` and ``\)`` are first rewritten to ``$``.  Every ``$`` is then
    doubled and the text is split on ``$``, so each original delimiter shows
    up as an empty-string item between two segments, e.g.
    ``"x$a$y"`` -> ``['x', '', 'a', '', 'y']``.

    Every non-empty item that has an empty item directly before *and*
    directly after it is percent-decoded (``%20``, ``%3C``, ``%3E``) and
    replaced by ``structured(item)``.  The neighbour test always looks at the
    list as it was right after the split, not at already converted items.

    Args:
        s: Text that may contain ``$...$`` or ``\(...\)`` formulas.

    Returns:
        *s* itself when it contains no delimiter at all, otherwise the
        segments joined back together (the ``$`` signs are dropped).
    """
    # "\\(" is the two-character sequence backslash + "(".  The original
    # spelling "\(" is an invalid escape sequence and triggers a
    # SyntaxWarning on recent Python versions.
    s = s.replace("\\(", "$").replace("\\)", "$")

    parts = s.replace("$", "$$").split("$")
    total = len(parts)
    if total == 1:
        return s

    converted = []
    for idx, item in enumerate(parts):
        # The index bounds are checked first so that parts[idx - 1] and
        # parts[idx + 1] are only read for interior items.
        enclosed = (
            0 < idx < total - 1
            and item != ""
            and parts[idx - 1] == ""
            and parts[idx + 1] == ""
        )
        if enclosed:
            decoded = (
                item.replace("%20", " ").replace("%3C", "<").replace("%3E", ">")
            )
            converted.append(structured(decoded))
        else:
            converted.append(item)
    return "".join(converted)
# Special handling of '$' (method 3)
def non_data_latex_regexp(s):
    r"""Convert ``$...$`` spans of *s* with ``structured`` (method 3).

    ``\(`` and ``\)`` are first rewritten to ``$``.  Each non-greedy
    ``$...$`` match (delimiters included, never spanning a newline) is
    percent-decoded (``%20``, ``%3C``, ``%3E``) and replaced by the result of
    ``structured``.

    The substitution is done in one left-to-right ``re.sub`` pass.  The
    previous implementation collected the matches first and then applied
    ``str.replace(match, converted, 1)`` one by one, which could hit text
    produced by an earlier conversion instead of the intended match.  Its
    ``strip() != ''`` filter was always true because every match contains the
    ``$`` delimiters, so it has been dropped.

    Args:
        s: Text that may contain ``$...$`` or ``\(...\)`` formulas.

    Returns:
        The text with every ``$...$`` span converted; text without a
        complete ``$...$`` pair is returned with only the ``\(``/``\)``
        rewrite applied.
    """
    # "\\(" is the two-character sequence backslash + "(".  The original
    # spelling "\(" is an invalid escape sequence and triggers a
    # SyntaxWarning on recent Python versions.
    s = s.replace("\\(", "$").replace("\\)", "$")

    def _convert(match):
        span = match.group(0)
        span = span.replace("%20", " ").replace("%3C", "<").replace("%3E", ">")
        return structured(span)

    return re.sub(r"\$.*?\$", _convert, s)
# Special handling of escape characters
def escape_func(s):
    r"""Undo escape damage in *s* and rewrite a fixed set of LaTeX commands.

    The substitutions are plain ``str.replace`` calls applied strictly in the
    order listed below; several later rules rely on earlier ones having run
    (``\theta`` is only recognisable after a TAB has been turned back into
    ``\t``, and ``\right`` / ``\left`` must be stripped only after every
    arrow command starting with those prefixes has been handled).

    Fixes compared with the previous version:

    * ``'<\/'`` was an invalid escape sequence (SyntaxWarning on recent
      Python versions); it is now spelled ``'<\\/'``.
    * ``\rightharpoondown`` is now mapped to ``⇁``.  The old rule was
      written for ``\rightharpoonupdown`` and could never match, because the
      ``\rightharpoonup`` rule before it had already consumed that prefix;
      the real command fell through to the ``\right`` rule and degraded to
      ``harpoondown``.

    Args:
        s: Raw text or HTML.

    Returns:
        The rewritten string.
    """
    ordered_rules = (
        # Control characters back to their two-character escape spelling.
        ("\a", "\\a"),
        ("\b", "\\b"),
        ("\f", "\\f"),
        ("\n", "\\n"),
        ("\r", "\\r"),
        ("\t", "\\t"),
        ("\v", "\\v"),
        # Escaped closing tags and doubled backslashes.
        ("<\\/", "</"),
        ("\\\\", "\\"),
        # Special handling of selected commands.
        (r"*-*", ""),
        (r"\[", ""),
        (r"\]", ""),
        (r"\lt", "<"),
        (r"\gt", ">"),
        (r"\theta", "θ"),
        (r"\a*rg", "arg"),
        (r"\leftrightarrow", "↔"),
        (r"\Leftrightarrow", "⇔"),
        (r"\rightleftharpoons", "⇌"),
        (r"\leftharpoonup", "↼"),
        (r"\rightharpoonup", "⇀"),
        (r"\leftharpoondown", "↽"),
        (r"\rightharpoondown", "⇁"),
        (r"\leftarrow", "←"),
        (r"\Leftarrow", "⇐"),
        (r"\rightarrow", "→"),
        (r"\Rightarrow", "⇒"),
        # Sizing prefixes go last so they cannot truncate the arrows above.
        (r"\right", ""),
        (r"\left", ""),
        # Vectors: both arrow directions are treated alike.
        ("overleftarrow", "overrightarrow"),
        # Fractions: display-style \dfrac becomes \frac.
        ("dfrac", "frac"),
        # <latex> tags become '$' delimiters.
        ("<latex>", "$"),
        ("</latex>", "$"),
    )
    for old, new in ordered_rules:
        s = s.replace(old, new)
    return s
def _decode_formula_text(text):
    """Undo the ``%20`` / ``%3C`` / ``%3E`` percent-escapes in formula text."""
    return text.replace('%20', ' ').replace('%3C', '<').replace('%3E', '>')


def _ocr_formula_image(img_url, timeout):
    """POST *img_url* to ``formula_url``; return its text, or ``'img'`` on failure."""
    try:
        img_res = requests.post(
            formula_url, data={'img_url': img_url}, timeout=timeout
        ).json()
        if img_res['is_success'] == 1:
            text = img_res['texts']
        else:
            text = 'img'
    except Exception:
        # Best effort: any transport, decoding or schema problem degrades to
        # the generic placeholder, but is no longer silent.
        logging.getLogger(__name__).warning(
            "formula recognition failed for %s", img_url, exc_info=True
        )
        text = 'img'
    # <latex> tags in the recognised text become '$' delimiters.
    return text.replace('<latex>', '$').replace('</latex>', '$')


def _img_line_to_text(line, hnsw_index, is_train, ocr_timeout):
    """Return the text standing in for one prettified ``<img`` line, or None to skip it."""
    tag = BeautifulSoup(line, features="lxml").img
    if not tag:
        return None

    latex = tag.get('data-latex')
    if latex:
        return structured(_decode_formula_text(latex))

    src = tag.get('src')
    if not src:
        return None

    prefix = ''
    src_parts = src.split('?')
    if len(src_parts) == 2:
        # The part after the single '?' of the URL is used as the formula.
        text = src_parts[-1]
    elif hnsw_index == 0 or is_train is True:
        # Cloud question bank: fill-in-the-blank images become underscores,
        # every other image becomes the 'img' placeholder.
        if 'class="tiankong"' in line:
            prefix = '____'
            text = ''
        else:
            text = 'img'
    elif hnsw_index == 1 and is_train is False:
        # School question bank: ask the formula service to read the image.
        text = _ocr_formula_image(src, ocr_timeout)
    else:
        # Any other index: the URL itself is passed on unchanged.
        text = src
    return prefix + structured(_decode_formula_text(text))


def _convert_dollar_formulas(s):
    """Run the '$' handling, falling back from method 2 to method 3 on error."""
    log = logging.getLogger(__name__)
    try:
        return non_data_latex_iter(s)
    except Exception:
        log.warning("non_data_latex_iter failed; trying non_data_latex_regexp",
                    exc_info=True)
    try:
        return non_data_latex_regexp(s)
    except Exception:
        log.warning("non_data_latex_regexp failed; text left unconverted",
                    exc_info=True)
    return s


def _normalize_text(s):
    """Apply the ordered literal/regex clean-up rules to the extracted text."""
    s = re.sub(r'\s', " ", s)
    s = re.sub(r"(begin|end){?(gathered|array)", "", s)

    # prettify() writes '<' and '>' inside text as entities.  The original
    # lines here replaced "<" with "<" and ">" with ">", which did nothing
    # and let "&lt;" / "&gt;" through to the result.
    for old, new in (
        ("&lt;", "<"),
        ("&gt;", ">"),
        ('%20', ''),
        ('%3E', '>'),
        ('%3C', '<'),
        ('img', ''),
        ('$', ''),
        ('mathbf', ''),
        ('operatornamem', ''),
        ('beginarra*y', ''),
        ('slant', ''),
        ('endarra*y', ''),
        ('hfill', ''),
        ('#', ''),
        ('\n', ''),
        ('\r', ''),
        # Circled numbers must be rewritten before the NFKC step below.
        ('①', '(1)、'),
        ('②', '(2)、'),
        ('③', '(3)、'),
        ('④', '(4)、'),
    ):
        s = s.replace(old, new)

    s = re.sub(r'/images/1-50/[1-9].gif', '( )', s)
    s = re.sub(r'/([0-9a-z/*.]*?)(png|jpg|gif)', 'img', s)

    # Fold compatibility (e.g. fullwidth) characters to their plain forms.
    s = unicodedata.normalize('NFKC', s)
    # The original chain also mapped , : ; ? ! ( ) . onto themselves; those
    # identity replacements are omitted.
    for old, new in (
        ("“", "'"),
        ("”", "'"),
        ("。", ","),
        ("【", "["),
        ("】", "]"),
    ):
        s = s.replace(old, new)

    # tjt: supports a trailing "= ." or "=___.".
    s = re.sub(r"[==][ _]*\.?$", "等于多少", s)
    # NOTE(review): the second alternative is a plain capture group, so this
    # also strips trailing whitespace, '?' and one '.' at the end of the
    # text.  Kept as written - confirm that is intended.
    s = re.sub(r"\(\s*[??]*\s*\)|(\s*[??]*\s*)\.?$", "", s)
    s = re.sub(r"(为|是|等于|=|=)img\.?$", "等于多少", s)
    s = re.sub(r"img(为|是|等于|=|=)\.?$", "等于多少", s)
    s = re.sub(r"[fFGg]\^\(-1\)\*\(.*?\)", "反函数_y", s)
    s = s.replace("图像", "图象")
    s = s.replace("椭圆", "椭椭")
    s = s.replace("⇒", "所以")
    s = re.sub(r'img$', '', s)
    # Only '=' followed by an empty bracket pair or 1-5 underscores is a
    # blank to fill.  The previous pattern had a third alternative "(\s*)"
    # that matches the empty string, so it rewrote *every* '=' in the text
    # and made the final end-anchored '=' rule below unreachable.
    s = re.sub(r"=\s*(\(\s*\)|_{1,5})", "等于多少", s)
    s = re.sub(r"\(\s*[??]*\s*\)|(\s*[??]*\s*)$", "", s)
    s = re.sub(r"(为|是|等于)img$", "等于多少", s)
    s = re.sub(r"over(right|left)arrow", "向量", s)
    s = re.sub(r"[Vv]e\*?nn", "韦恩", s)
    s = re.sub(r"\^['′]", "'", s)
    # tjt modifications
    s = s.replace("×", "*")
    s = s.replace("%%", '')
    s = s.replace('\\n', '')
    s = s.replace('\\r', '')
    s = s.replace("^°", "°")
    s = re.sub(r"°\^([Cc])", r"°\1", s)
    s = re.sub(r"([0-9])\)°([^Cc])", r"\1°)\2", s)
    # tjt: supports a trailing "= ." or "=___.".
    s = re.sub(r"[==][ _]*\.?$", "等于多少", s)
    return s


def get_maplef_items(html, hnsw_index, is_train=False, ocr_timeout=30):
    """Turn question HTML into one cleaned line of text.

    Steps, in order:

    1. Newlines inside ``data-latex="..."`` attributes are removed and the
       markup is passed through ``escape_func``.
    2. The markup is parsed and pretty-printed one node per line.  Lines
       starting with ``<img`` are replaced by formula text (see below), other
       tag lines are dropped, and text lines are kept as they are.
    3. ``$``-delimited formulas are converted with ``non_data_latex_iter``;
       if that raises, ``non_data_latex_regexp`` is tried; if that raises as
       well the text is left unconverted.  Both failures are logged.
    4. A fixed sequence of literal and regex clean-ups is applied.

    For an image the first applicable source is used:

    * its ``data-latex`` attribute;
    * the query string of ``src`` when ``src`` contains exactly one ``?``;
    * when ``hnsw_index == 0`` or ``is_train is True``: ``____`` for images
      with ``class="tiankong"``, otherwise the placeholder ``img``;
    * when ``hnsw_index == 1`` and ``is_train is False``: the text returned
      by the service at ``formula_url`` (placeholder ``img`` on any failure);
    * otherwise the ``src`` URL itself.

    Images without ``src`` contribute nothing.

    Args:
        html: HTML fragment of the question.
        hnsw_index: Selects the image handling described above.
        is_train: Compared with ``is True`` / ``is False``, so only real
            booleans select a branch.
        ocr_timeout: Seconds to wait for the formula service.  New,
            backward-compatible keyword: the request previously had no
            timeout and could block forever.

    Returns:
        The cleaned text.
    """
    html = re.sub(
        '(data-latex=".*?")',
        lambda m: m.group(1).replace("\n", ""),
        html,
        flags=re.S,
    )
    html = escape_func(html)
    soup = BeautifulSoup(html, features="lxml")

    pieces = []
    for line in soup.prettify().split('\n'):
        stripped = line.strip()
        if stripped.startswith('<img'):
            text = _img_line_to_text(line, hnsw_index, is_train, ocr_timeout)
            if text is not None:
                pieces.append(text)
        elif not stripped.startswith('<'):
            pieces.append(line)

    # tjt: special handling for cleaning errors caused by taking data-latex
    # straight from the rendered page.
    s = _convert_dollar_formulas(''.join(pieces))
    return _normalize_text(s)
if __name__ == "__main__":
    # Manual smoke-test inputs.  Only the last entry is run; the earlier ones
    # are kept so they can be selected by hand.
    samples = (
        r"""<article><p>Have you ever wondered how your favorite NBA team received its famous name? All NBA teams have an interesting story or a history behind their names. Some of the names reflect the city's culture or history, others came from previous owners and many were selected through "Name the Team" contests.<br/>For teams like Los Angeles and Utah, the names were not always a reflection of the city. Even though Los Angeles has no lakes, the Laker name has been a city treasure for almost 40 years. Before going to Los Angeles, the team originated in Minneapolis, Minnesota. In 1948, team officials chose the name for its direct relationship to the state's motto, "The Land of 10,000 Lakes." The team name went unchanged after moving to Los Angeles in 1960.<br/>Because Utah's team originated in New Orleans, Louisiana, it was called the Jazz. In 1974, New Orleans club officials chose the name to represent the city for its reputation as the "jazz capital of the world." The name stayed with the team even after finding a new home in Salt Lake City, Utah in 1979.<br/>The Chicago Bull's original owner, Richard Klein, named the team the Bulls. He picked the name because a fighting bull is relentless, and never quits. Klein, who founded the club in1966, believed these qualities were necessary for a championship team and hoped his Chicago athletes would live up to the team name. A belief that Bulls—winner of the six NBA championships— have definitely followed.<br/>In 1967, the Indian Pacers selected their team name in a different way from most other teams. Their decision was based on what they wanted to accomplish in the NBA. Team officials chose the Pacers name because the organization wanted to set the "pace" in professional basketball.<br/></p></article>""",
        """<p>如图,已知<img src="http://zxhx-pro-1302712961.cos.ap-beijing.myqcloud.com/teacher/uploadfiles/wording/0/2022/07/28/c0b6d5bb7d966f18ef9353138f6352cc.png" style="width: 65.25pt; height: 14.25pt" data-type="math" data-latex="$\angle AOB = 40^\circ $" width="87" height="19" />,<img src="http://zxhx-pro-1302712961.cos.ap-beijing.myqcloud.com/teacher/uploadfiles/wording/0/2022/07/28/5dd55320d1a52d7d34b2fd5e94433c8c.png" style="width: 87.75pt; height: 14.25pt" data-type="math" data-latex="$\angle BOC = 3\angle AOB$" width="117" height="19" />,<img src="http://zxhx-pro-1302712961.cos.ap-beijing.myqcloud.com/teacher/uploadfiles/wording/0/2022/07/28/c1664fb157b79a64d991d7c77af335d7.png" style="width: 21pt; height: 14.25pt" data-type="math" data-latex="$OD$" width="28" height="19" />平分<img src="http://zxhx-pro-1302712961.cos.ap-beijing.myqcloud.com/teacher/uploadfiles/wording/0/2022/07/28/480718502d54647275c7fe8ac58bac75.png" style="width: 36.75pt; height: 14.25pt" data-type="math" data-latex="$\angle AOC$" width="49" height="19" />,求<img src="http://zxhx-pro-1302712961.cos.ap-beijing.myqcloud.com/teacher/uploadfiles/wording/0/2022/07/28/d2d28e3733589e90a99a8391c54171ff.png" style="width: 38.25pt; height: 14.25pt" data-type="math" data-latex="$\angle COD$" width="51" height="19" />的度数.</p><p>解:<img src="http://zxhx-pro-1302712961.cos.ap-beijing.myqcloud.com/teacher/uploadfiles/wording/0/2022/07/28/e18901783e1f03c948bd1a1f617912cf.png" style="width: 72.75pt; height: 14.25pt" data-type="math" data-latex="$\because \angle BOC = 3\angle $" width="97" height="19" />______,<img src="http://zxhx-pro-1302712961.cos.ap-beijing.myqcloud.com/teacher/uploadfiles/wording/0/2022/07/28/47a2d20361dc34d3d372d485f4f8a9f7.png" style="width: 65.25pt; height: 14.25pt" data-type="math" data-latex="$\angle AOB = 40^\circ $" width="87" height="19" />,</p><p><img 
src="http://zxhx-pro-1302712961.cos.ap-beijing.myqcloud.com/teacher/uploadfiles/wording/0/2022/07/28/92a7bd71a095ca92577acd42e4cab402.png" style="width: 56.25pt; height: 14.25pt" data-type="math" data-latex="$\therefore \angle BOC = $" width="75" height="19" />______<img src="http://zxhx-pro-1302712961.cos.ap-beijing.myqcloud.com/teacher/uploadfiles/wording/0/2022/07/28/414575fac3ad1e5fb552f948f76c54ac.png" style="width: 8.25pt; height: 12.75pt" data-type="math" data-latex="$^\circ $" width="11" height="17" />,</p><p><img src="http://zxhx-pro-1302712961.cos.ap-beijing.myqcloud.com/teacher/uploadfiles/wording/0/2022/07/28/c35b0bafb0d67ef84b9d8cc60afdc724.png" style="width: 56.25pt; height: 14.25pt" data-type="math" data-latex="$\therefore \angle AOC = $" width="75" height="19" />______<img src="http://zxhx-pro-1302712961.cos.ap-beijing.myqcloud.com/teacher/uploadfiles/wording/0/2022/07/28/eef861150281f1cf1a5ce7cd013a7995.png" style="width: 11.25pt; height: 11.25pt" data-type="math" data-latex="$ + $" width="15" height="15" />______,</p><p><img src="http://zxhx-pro-1302712961.cos.ap-beijing.myqcloud.com/teacher/uploadfiles/wording/0/2022/07/28/de740a6139e9045a39fe4820ca34fa2d.png" style="width: 80.25pt; height: 14.25pt" data-type="math" data-latex="$\therefore \angle AOC = 160^\circ $" width="107" height="19" /></p><p><img src="http://zxhx-pro-1302712961.cos.ap-beijing.myqcloud.com/teacher/uploadfiles/wording/0/2022/07/28/2af5afb7e01160519ad998abe0993af1.png" style="width: 30.75pt; height: 14.25pt" data-type="math" data-latex="$\because OD$" width="41" height="19" />平分<img src="http://zxhx-pro-1302712961.cos.ap-beijing.myqcloud.com/teacher/uploadfiles/wording/0/2022/07/28/b81f079120974bc8ef9736e4c574353c.png" style="width: 36.75pt; height: 14.25pt" data-type="math" data-latex="$\angle AOC$" width="49" height="19" /></p><p><img src="http://zxhx-pro-1302712961.cos.ap-beijing.myqcloud.com/teacher/uploadfiles/wording/0/2022/07/28/c3551758644fd7c37d9e127b0936448f.png" 
style="width: 57pt; height: 14.25pt" data-type="math" data-latex="$\therefore \angle COD = $" width="76" height="19" />______<img src="http://zxhx-pro-1302712961.cos.ap-beijing.myqcloud.com/teacher/uploadfiles/wording/0/2022/07/28/ca6337eef25ff570fb1de161ef162948.png" style="width: 9.75pt; height: 9pt" data-type="math" data-latex="$ = $" width="13" height="12" />______<img src="http://zxhx-pro-1302712961.cos.ap-beijing.myqcloud.com/teacher/uploadfiles/wording/0/2022/07/28/7c4334acc724beee1f283a162be5e1d6.png" style="width: 8.25pt; height: 12.75pt" data-type="math" data-latex="$^\circ $" width="11" height="17" />.</p><p><img src="http://zxhx-pro-1302712961.cos.ap-beijing.myqcloud.com/teacher/uploadfiles/wording/0/2022/07/28/edf70a69a16fe0a1e9e69467f4a90c7e.png" style="width: 2.270833in; height: 1.246731in" width="218" height="120" /></p>""",
        """<p><span style="font-variant-ligatures: normal; orphans: 2; widows: 2;">为确保信息安全,信息需加密传输,发送方由明文</span><img width="15" height="9" src="http://zxhx-1302712961.cos.ap-shanghai.myqcloud.com/zsytk/topic/image/2022/08/05/1659694400394379.gif?%20\to" class="gsImgLatex mathType" style="font-variant-ligatures: normal; orphans: 2; white-space: normal; widows: 2; vertical-align: middle;"/><span style="font-variant-ligatures: normal; orphans: 2; widows: 2;">密文(加密),接收方由密文</span><img width="15" height="9" src="http://zxhx-1302712961.cos.ap-shanghai.myqcloud.com/zsytk/topic/image/2022/08/05/1659694400394379.gif?%20\to" class="gsImgLatex mathType" style="font-variant-ligatures: normal; orphans: 2; white-space: normal; widows: 2; vertical-align: middle;"/><span style="font-variant-ligatures: normal; orphans: 2; widows: 2;">明文(解密),已知加密规则为:明文</span><img width="54" height="15" src="http://zxhx-1302712961.cos.ap-shanghai.myqcloud.com/zsytk/topic/image/2022/08/05/1659694401424587.gif?a,b,c,d" class="gsImgLatex mathType" style="font-variant-ligatures: normal; orphans: 2; white-space: normal; widows: 2; vertical-align: middle;"/><span style="font-variant-ligatures: normal; orphans: 2; widows: 2;">对应密文</span><img width="179" height="15" src="http://zxhx-1302712961.cos.ap-shanghai.myqcloud.com/zsytk/topic/image/2022/08/05/1659694401118594.gif?a%20+%202b,2b%20+%20c,2c%20+%203d,4d" class="gsImgLatex mathType" style="font-variant-ligatures: normal; orphans: 2; white-space: normal; widows: 2; vertical-align: middle;"/><span style="font-variant-ligatures: normal; orphans: 2; widows: 2;">.例如,明文</span><img width="54" height="15" src="http://zxhx-1302712961.cos.ap-shanghai.myqcloud.com/zsytk/topic/image/2022/08/05/1659694401275521.gif?1,2,3,4" class="gsImgLatex mathType" style="font-variant-ligatures: normal; orphans: 2; white-space: normal; widows: 2; vertical-align: middle;"/><span style="font-variant-ligatures: normal; orphans: 2; widows: 2;">对应密文</span><img width="71" height="15" 
src="http://zxhx-1302712961.cos.ap-shanghai.myqcloud.com/zsytk/topic/image/2022/08/05/1659694401650019.gif?5,7,18,16" class="gsImgLatex mathType" style="font-variant-ligatures: normal; orphans: 2; white-space: normal; widows: 2; vertical-align: middle;"/><span style="font-variant-ligatures: normal; orphans: 2; widows: 2;">.当接收文收到密文</span><img width="79" height="15" src="http://zxhx-1302712961.cos.ap-shanghai.myqcloud.com/zsytk/topic/image/2022/08/05/1659694401264806.gif?14,9,23,28" class="gsImgLatex mathType" style="font-variant-ligatures: normal; orphans: 2; white-space: normal; widows: 2; vertical-align: middle;"/><span style="font-variant-ligatures: normal; orphans: 2; widows: 2;">,解密得到的明文为( )</span></p>""",
        """<p><img src=\"http://tkimgs.zhixinhuixue.net/image/word/2021/05/05/1620215474707966.png\" data-latex=\"${G\\dfrac{{m}_{1}{m}_{2}}{{r}^{2}}}$\" width=\"60\" height=\"29\"/></p>""",
        """图1是放置在水平地面上的落地式话筒架实物图,图2是其示意图.支撑杆AB垂直于地面l,活动杆CD固定在支撑杆上的点E处.若∠AED=48°,BE=110*cm,DE=80*cm,求活动杆端点D离地面的高度DF.(结果精确到1*cm,参考数据:sin(48)°≈0.74,cos(48)°≈0.67,tan(48)°≈1.11)""",
        """一辆汽车在平直的公路上以某一初速度运动,运动过程中保持恒定的牵引功率,其加速度<img src="http://192.168.1.140:8888/ser_static/360/files/image40.png" style="width: 9.65pt; height: 11.3pt" data-type="math" width="13" height="15" />和速度的倒数<img src="http://192.168.1.140:8888/ser_static/360/files/image41.png" style="width: 14.5pt; height: 30.65pt" data-type="math" width="19" height="41" />的关系图象如图所示。若已知汽车的质量<img src="http://192.168.1.140:8888/ser_static/360/files/image42.png" style="width: 68.25pt; height: 18.25pt" data-type="math" width="91" height="24" />,则根据图象所给的信息,能求出的物理量是(<img src="http://192.168.1.140:8888/ser_static/360/files/image43.png" style="width: 60.2pt; height: 18.25pt" data-type="math" width="80" height="24" />)( )<br/><img src="http://192.168.1.140:8888/ser_static/360/files/image44.png" style="width: 1.802083in; height: 1.260417in" width="173" height="121" />""",
    )
    selected = samples[-1]
    print(get_maplef_items(selected, 1))
|