# encoding=utf-8
"""Turn question HTML into a single cleaned text string.

Formulas found in ``<img>`` tags are either kept as ``$...$`` LaTeX (when a
``data-latex`` attribute is present) or passed through ``structured`` from the
project's ``latex2maple`` package (when the formula is carried in the image
``src`` query string).

NOTE(review): this module was recovered from a copy in which line breaks and
every ``<...>`` span had been stripped. Places where a literal had to be
reconstructed are marked ``NOTE(review)`` below and must be confirmed against
the authoritative source.
"""
import os
import re
import sys
import unicodedata

# Both paths must be registered before the ``Utils`` import below can resolve.
sys.path.append(os.getcwd())
sys.path.append("/home/cv/workspace/tujintao/document_segmentation")

import requests
from bs4 import BeautifulSoup

from Utils.main_clear.latex2maple.latex2maple import structured

# Matches one ``$...$`` span (non-greedy, does not cross newlines).
_INLINE_MATH_RE = re.compile(r"\$.*?\$")

# NOTE(review): the tag names replaced by "$" in escape_func were lost in the
# recovered source (only empty strings remained). "<latex>"/"</latex>" is an
# assumption -- confirm the real tag names.
_LATEX_OPEN_TAG = "<latex>"
_LATEX_CLOSE_TAG = "</latex>"

# NOTE(review): the alternatives of this pattern were lost in the recovered
# source (only the "|" survived). Closing paragraph / <br> tags are an
# assumption -- confirm which tags should emit a line-break marker.
_LINE_BREAK_TAG_RE = re.compile(r"</p>|<br\s*/?>")


def _decode_url_escapes(text):
    """Expand the three percent-escapes this module handles: %20, %3C, %3E."""
    return text.replace('%20', ' ').replace('%3C', '<').replace('%3E', '>')


def _normalize_math_delimiters(s):
    r"""Rewrite the ``\(`` and ``\)`` delimiters as ``$``."""
    return s.replace("\\(", "$").replace("\\)", "$")


# The three non_data_latex_* functions were added to handle cleaning errors
# that occur when data-latex is taken directly from the rendered page.
# Each is an alternative way of handling the "$" delimiters.
def non_data_latex_replace(s):
    """Convert ``$...$`` spans with ``structured`` (variant 1: index walk).

    Every "$" is doubled and the string is split on "$", so a formula shows
    up as a non-empty item with an empty string on each side, e.g.
    ``"a$x$b"`` -> ``['a', '', 'x', '', 'b']``. Such items are URL-unescaped
    and replaced by ``structured(item)``; the "$" delimiters themselves are
    dropped when the pieces are joined back together.

    :param s: text that may contain ``$...$`` or ``\\(...\\)`` formulas.
    :return: ``s`` unchanged when it holds no delimiter, otherwise the
        rebuilt string.
    """
    s = _normalize_math_delimiters(s)
    parts = s.replace("$", "$$").split('$')
    total = len(parts)
    if total == 1:
        return s
    i = 0
    while i < total:
        is_formula = (
            parts[i] == ''
            and i + 2 < total
            and parts[i + 1] != ''
            and parts[i + 2] == ''
        )
        if is_formula:
            parts[i + 1] = structured(_decode_url_escapes(parts[i + 1]))
            # Jump past the formula and its closing empty marker so the
            # closing marker is not mistaken for the next opening one.
            i += 2
        i += 1
    return ''.join(parts)


def non_data_latex_iter(s):
    """Convert ``$...$`` spans with ``structured`` (variant 2: comprehension).

    Uses the same doubling-and-splitting trick as ``non_data_latex_replace``:
    any non-empty item whose two neighbours are both empty strings is
    URL-unescaped and replaced by ``structured(item)``.

    NOTE(review): unlike ``non_data_latex_replace`` this inspects every item
    independently, so plain text sitting between two formulas
    (``"$x$text$y$"``) also has empty neighbours and is converted too.

    :param s: text that may contain ``$...$`` or ``\\(...\\)`` formulas.
    :return: ``s`` unchanged when it holds no delimiter, otherwise the
        rebuilt string.
    """
    s = _normalize_math_delimiters(s)
    parts = s.replace("$", "$$").split('$')
    total = len(parts)
    if total == 1:
        return s
    # The index range is checked first so both neighbour lookups stay in
    # bounds (and index -1 never wraps around to the last item).
    converted = [
        structured(_decode_url_escapes(piece))
        if 0 < idx < total - 1
        and piece != ''
        and parts[idx - 1] == ''
        and parts[idx + 1] == ''
        else piece
        for idx, piece in enumerate(parts)
    ]
    return ''.join(converted)


def non_data_latex_regexp(s):
    """Convert ``$...$`` spans with ``structured`` (variant 3: regex).

    Each non-greedy ``$...$`` match, delimiters included, is URL-unescaped
    and replaced in place by ``structured(match)``. A single ``re.sub`` pass
    is used so a replacement can never land on text produced by an earlier
    replacement or on the wrong one of two identical formulas.

    :param s: text that may contain ``$...$`` or ``\\(...\\)`` formulas.
    :return: the text with every match replaced.
    """
    s = _normalize_math_delimiters(s)
    return _INLINE_MATH_RE.sub(
        lambda m: structured(_decode_url_escapes(m.group(0))), s
    )


def escape_func(s):
    r"""Apply the literal pre-parse substitutions to the raw HTML.

    Replaces the literal text ``\a*rg`` with ``arg`` and turns the LaTeX
    open/close tags into ``$`` delimiters.

    :param s: raw HTML string.
    :return: the HTML with the substitutions applied.
    """
    # The raw-string prefix is required; without it the replacement was
    # observed not to take effect.
    s = s.replace(r'\a*rg', 'arg')
    # Tag handling (see NOTE(review) on the tag constants).
    return s.replace(_LATEX_OPEN_TAG, '$').replace(_LATEX_CLOSE_TAG, '$')


def _img_line_to_text(line):
    """Return the text contributed by one prettified ``<img ...>`` line.

    NOTE(review): the head of this branch was lost in the recovered source.
    Reading ``data-latex`` first and falling back to ``src`` is reconstructed
    from the surviving statements and the module's comments. One statement
    that sat between the attribute lookup and the display-math rewrite could
    not be recovered and is omitted.
    """
    img = BeautifulSoup(line, 'lxml').img
    latex = img.get('data-latex')
    if latex:
        # Rewrite display math "\[ ... \]" as "$ ... $"; the LaTeX is kept
        # as-is (not passed through structured) on this path.
        latex = re.sub(r"^\\\[(.*?)\\\]$", r"$\1$", latex)
        return _decode_url_escapes(latex).strip()

    src = img.get('src')
    if not src:
        return ''
    prefix = ''
    pieces = src.split('?')
    if len(pieces) == 2:
        # The formula travels in the query string of the image URL.
        payload = pieces[-1]
    elif 'class="tiankong"' in line:
        # Fill-in-the-blank placeholder image.
        prefix, payload = '____', ''
    else:
        payload = 'img'
    return prefix + structured(_decode_url_escapes(payload))


def get_maplef_items(html):
    r"""Flatten question HTML into one cleaned line of text.

    Attributes must be written as data-latex="xxx". Do not pass
    backslash-escaped quotes (data-latex=\"xxx\" or data-latex=\'xxx\'),
    otherwise BeautifulSoup drops content.

    The HTML is parsed with lxml and walked line by line over
    ``soup.prettify()``: ``<img>`` lines become formula text, line-break
    tags become the two-character marker ``\n`` (backslash + "n"), other
    tags are dropped and text lines are kept. The ``non_data_latex_*``
    helpers are not called from here.

    :param html: HTML fragment of one question.
    :return: the cleaned, stripped text.
    """
    # Newlines inside a data-latex attribute would split it across
    # prettified lines, so they are removed up front.
    html = re.sub('(data-latex=".*?")',
                  lambda x: x.group(1).replace("\n", ""), html, flags=re.S)
    html = escape_func(html)
    soup = BeautifulSoup(html, features="lxml")

    chunks = []
    for raw_line in soup.prettify().split('\n'):
        line = raw_line.strip()
        if line.startswith('<img'):
            chunks.append(_img_line_to_text(raw_line))
        elif line.startswith('<'):
            if _LINE_BREAK_TAG_RE.match(line):
                chunks.append("\\n")
        else:
            chunks.append(line)
    s = ''.join(chunks)

    # Collapse every whitespace run (real newlines included) to one space;
    # the backslash-n markers added above are not whitespace and survive.
    s = re.sub(r'\s+', " ", s)
    # NOTE(review): the "&nbsp;", "&lt;" and "&gt;" literals were
    # entity-decoded in the recovered source and are reconstructed here.
    s = s.replace('#', '').replace("&nbsp;", " ")
    s = s.replace("&lt;", "<")
    s = s.replace("&gt;", ">")
    s = s.replace('%20', '')
    s = s.replace('%3E', '>')
    s = s.replace('%3C', '<')
    return s.strip()


if __name__ == "__main__":
    # Manual smoke test. Only the last sample of the original script was ever
    # used; the earlier ones were overwritten before the call.
    # NOTE(review): markup inside the sample was lost in the recovered
    # source; line breaks are kept where the recovered text shows them.
    sample = (
        ' 1 . 1cm3纯水的质量是1g,1 cm3水中有3.34×1022个水分子,试计算:\n'
        '(1)1个水分子的质量约为多少千克?       \n'
        '(2)若水分子的直径约为40nm,将1cm3水中的水分子紧密排列成一条直线,共有多长?'
    )
    print(non_data_latex_iter(sample))