import re
# import sys
from PointerNet import config
from bs4 import BeautifulSoup
from Utils.main_clear.sci_clear import non_data_latex_iter, non_data_latex_regexp
# Module-level logger built through the project's ``config.myLog`` helper,
# using log category "tmp_clear" and subject "clear_log".
logger = config.myLog(
    __name__,
    log_cate="tmp_clear",
    subject="clear_log",
).getlog()
# Maps the decimal strings "1".."20" to the circled-number glyphs ①..⑳.
# The glyphs are contiguous in Unicode (U+2460 for ① up to U+2473 for ⑳),
# so the table is generated rather than typed out by hand.
num2circle = {
    str(number): chr(0x2460 + number - 1)
    for number in range(1, 21)
}
def ltx_wash(ss):
    """Normalise a LaTeX snippet and wrap it in a ``【...】`` formula marker.

    The snippet is first passed through ``non_data_latex_iter``.  If that
    raises, ``non_data_latex_regexp`` is tried on the *original* input.  After
    a successful conversion, a ``*`` sitting directly between two ASCII
    letters/digits is removed.  If both converters fail, the original text is
    used unchanged (best effort).

    Args:
        ss: LaTeX source text, with or without surrounding ``$`` delimiters.

    Returns:
        ``"【$<converted>$###<original>】"`` when the converted text contains
        anything besides ``$`` and whitespace, otherwise ``"【公式】"``.
    """
    raw_ss = ss
    # NOTE(review): matches do not overlap, so "a*b*c" becomes "ab*c" --
    # confirm whether every such star is meant to be removed.
    star_between_alnum = r'([a-zA-Z\d])\*([a-zA-Z\d])'

    # Per the original comment, the converters pick up the LaTeX in the string
    # and convert it to Maple form (behaviour defined outside this file).
    # ``ss`` is only rebound once a whole conversion step has succeeded, so a
    # half-finished attempt can never leak into the fallback or the result.
    # ``Exception`` (not a bare ``except``) keeps the best-effort behaviour
    # while letting KeyboardInterrupt / SystemExit propagate.
    try:
        ss = re.sub(star_between_alnum, r"\1\2", non_data_latex_iter(raw_ss))
    except Exception:
        try:
            ss = re.sub(star_between_alnum, r"\1\2",
                        non_data_latex_regexp(raw_ss))
        except Exception:
            # Both converters failed: keep the untouched input.
            ss = raw_ss

    # Nothing but "$" and whitespace left: emit the generic placeholder.
    if not ss.replace("$", "").strip():
        return "【公式】"

    ss = "${}$".format(ss)
    # The input may already have been delimited; collapse "$$...$$" (with
    # optional whitespace between the dollars) back to a single "$...$".
    ss = re.sub(r'\$\s*\$(.+?)\$\s*\$', r"$\1$", ss.strip())
    return f"【{ss}###{raw_ss}】"
def simpwash(html, paper_id, need_latex=0):
"""
# data-latex="xxx", 引号里面不要再有",否则BeautifulSoup会掉内容
"""
imgs = []
formula_without_ltx = 0
html = re.sub(r'data-latex="(.*?)(?])', lambda x: 'data-latex="{}"'.format(
x.group(1).replace("\"", "'")), html, flags=re.S) # .replace("\n", "")
html = html.replace(r'*-*', '').replace(r'\a*rg', 'arg').replace(' \s*
', "
", html)
html = html.replace("'", "'").replace(" ", " ").replace("\xa0", " ")
html = re.sub(r'(
', "
", html) s = re.sub('
', "
", s)
s = re.sub('
', "
", s)
# ltx处理
def sub4(ss):
if ss.group(2).replace("$", "").strip():
new_ltx = ltx_wash(ss.group(2))
if new_ltx != "【公式】":
return new_ltx
return f"【公式{ss.group(0)}】"
s = re.sub(r' data-latex="(\\\\\[|\\\[)(.*?)(\\\]|\\\\\])"', r' data-latex="$\2$"', s)
s = re.sub(r').)+?\$?)".*? />', sub4, s, flags=re.S)
else:
# print("paper_id:::", paper_id)
soup = BeautifulSoup(html, features="lxml")
s = ''
# print(soup.prettify())
quan_begin_with_zero = 0
all_parts = soup.prettify().split('\n') # 这里必须是\n
for nn, i in enumerate(all_parts):
# print(i)
if i.strip().startswith(''))
if re.sub(r"^\\\[|\\\]$|\$|\s+", "", s3):
s3 = re.sub(r"^\\\[(.*?)\\\]$", r"$\1$", s3)
if re.match("\$.*?\$$", s3.strip()) is None:
s3 = "${}$".format(s3)
# print("latex:::", s3)
s3 = ltx_wash(s3)
if s3 == "【公式】":
s3 = f"【公式{s2.img}】"
# print("latex_washed:::", s3)
s += s3.replace('%20', ' ').replace('%3C', '<').replace('%3E', '>').strip()
else:
if need_latex:
formula_without_ltx = 1
break
s += f"【公式{s2.img}】"
else:
s3 = s2.img.get('src')
if not s3:
continue
elif 'class="gsImgLatex mathType"' in i:
if len(s3.split('?')) == 2:
# http://tkimgs.zhixinhuixue.net/image/word/2021/11/11/1636638682578739.gif?%20-%20{e^{%20-%20x}}%20-%203x
s3 = ltx_wash("${}$".format(s3.split('?')[-1]))
if s3 == "【公式】":
s3 = f"【公式{s2.img}】"
else:
if re.search('.gif("|$)', s3) is None:
logger.info("【{}】特殊公式格式{}".format(paper_id, str(i)))
if need_latex:
formula_without_ltx = 1
break
s3 = f"【公式{s2.img}】"
else:
if 'data-type="math"' in i:
if need_latex:
formula_without_ltx = 1
break
s3 = f"【公式{s2.img}】"
elif 'class="tiankong"' in i: # ①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳
serial_num = int(s2.img.get('data-num'))
if serial_num == 0:
quan_begin_with_zero = 1
if quan_begin_with_zero:
serial_num = serial_num + 1
if 1 <= serial_num <= 20:
s3 = '__{}__'.format(num2circle[str(serial_num)])
elif int(serial_num) + 1 > 20:
s3 = '__({})__'.format(serial_num)
else:
s3 = '____'
logger.info("【{}】特殊带圈符号:{}".format(paper_id, str(i)))
else:
s3 = s2.img.__str__()
# imgs.append(str(s2.img))
# if s2.img.get('height'):
# img_h = s2.img.get('height')
# else:
# try:
# img_h = Image.open(io.BytesIO(requests.get(s3).content)).size[1]
# except:
# logger.info("【{}】图片有问题:{}".format(subject, paper_id))
# img_h = 0 # 默认不要
# raw_img = re.findall(']*?src="'+s3+'"[^<>]*?/?>', html)
# print(str(s2.img))
# imgs.extend(raw_img)
# img_h = str(img_h).replace("px", "").replace("pt", "").replace("in", "")
# s3 = '【图片】'
# if need_latex == 0 and float(img_h) <= 50:
# s3 = ''
s += s3
elif i.strip().startswith("
|
', '', ' |