123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253 |
- import re
- # import sys
- from PointerNet import config
- from bs4 import BeautifulSoup
- from Utils.main_clear.sci_clear import non_data_latex_iter, non_data_latex_regexp
# Module-level logger, built by the project's ``config.myLog`` helper.
logger = config.myLog(
    __name__,
    log_cate="tmp_clear",
    subject="clear_log",
).getlog()
# Maps the decimal strings "1".."20" to the matching circled-number glyph.
num2circle = dict(zip(
    (str(number) for number in range(1, 21)),
    "①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳",
))
def ltx_wash(ss):
    """Turn one LaTeX fragment into a ``【washed##&##raw】`` marker.

    The fragment is handed to ``non_data_latex_iter``; if that raises,
    ``non_data_latex_regexp`` is tried on the same original text.  After a
    successful conversion, a ``*`` sitting directly between two ASCII
    letters/digits is dropped.  If both converters fail, the fragment is used
    unchanged (best effort, no error is raised).

    NOTE(review): the original comment says the converters "extract the LaTeX
    from the string and convert it to Maple"; their behaviour is not visible
    in this file.

    Args:
        ss: LaTeX text, with or without surrounding ``$``.

    Returns:
        ``"【公式】"`` when nothing but ``$`` and whitespace remains after
        conversion, otherwise ``"【$<washed>$##&##<original ss>】"``.
    """
    raw_ss = ss
    star_between_alnum = r'([a-zA-Z\d])\*([a-zA-Z\d])'
    # ``except Exception`` (not a bare ``except``) keeps the best-effort
    # fallback but lets KeyboardInterrupt / SystemExit propagate.
    try:
        washed = re.sub(star_between_alnum, r"\1\2", non_data_latex_iter(raw_ss))
    except Exception:
        try:
            # Always retry from the untouched input, never from a half-converted value.
            washed = re.sub(star_between_alnum, r"\1\2", non_data_latex_regexp(raw_ss))
        except Exception:
            washed = raw_ss

    if not washed.replace("$", "").strip():
        return "【公式】"

    wrapped = "${}$".format(washed)
    # Collapse the doubled delimiters that appear when the input already had ``$``.
    wrapped = re.sub(r'\$\s*\$(.+?)\$\s*\$', r"$\1$", wrapped.strip())
    return f"【{wrapped}##&##{raw_ss}】"
def simpwash(html, paper_id, need_latex=0):
    """Flatten question HTML into text with ``<br>`` breaks and formula markers.

    Markup without span/font/article/ul/ol/div containers and without
    ``style``/``class``/``type`` attributes is handled with regular
    expressions only.  Anything else is parsed with BeautifulSoup and walked
    line by line over ``prettify()`` output.

    Inside ``data-latex="..."`` there must be no further ``"``, otherwise
    BeautifulSoup drops content; inner double quotes are therefore rewritten
    to single quotes before anything else happens.

    Args:
        html: HTML source of the question.
        paper_id: Identifier used only in log messages.
        need_latex: When truthy (parser path only), stop at the first formula
            image that has no usable LaTeX instead of emitting a placeholder.

    Returns:
        A tuple ``(text, imgs, formula_without_ltx)``: the cleaned string, a
        list that is always empty in this implementation, and ``1`` if
        processing stopped early because of ``need_latex`` (``0`` otherwise).
    """
    imgs = []
    formula_without_ltx = 0

    html = re.sub(
        r'data-latex="(.*?)(?<!\\)"(?=[\s/>])',
        lambda m: 'data-latex="{}"'.format(m.group(1).replace("\"", "'")),
        html, flags=re.S)
    html = html.replace(r'*-*', '').replace(r'\a*rg', 'arg').replace('<latex>', '$').replace('</latex>', '$')
    html = re.sub(r'<br/\s*>', "<br>", html)
    # The entity names in these replacements had been lost (each call replaced
    # a character with itself); both spellings of the apostrophe are handled.
    html = (html.replace("&#39;", "'").replace("&apos;", "'")
            .replace("&nbsp;", " ").replace("\xa0", " "))
    # Strip presentational attributes from <td>/<tr>, in the original order.
    for attr in ("style", "valign", "align", "class", "width", "height"):
        html = re.sub(r'(<t[dr] [^<>]*?)' + attr + r'="[^<>]*?"', r"\1", html)
    html = re.sub(r'<p style="[^<>]*?">\s*</p>', "", html)

    needs_parser = re.search(
        r'</?(span|font|article|ul|ol|div)(\s*|\s+style=.*?"|\s+class=.*?")>'
        r'|text\s*-\s*decoration: underline|border\s*-\s*bottom:'
        r'|class="gsImgLatex| type="| style="| class="', str(html)) is not None

    if not needs_parser:
        s = re.sub(r'<p(\s*|\s+style=.*?")>', "<p>", html)
        s = re.sub('</p><p>', "<br>", s)
        s = re.sub('</p>|<p>', "<br>", s)

        # LaTeX handling: swap each <img ... data-latex="..."> for its marker.
        def sub4(ss):
            if ss.group(2).replace("$", "").strip():
                new_ltx = ltx_wash(ss.group(2))
                if new_ltx != "【公式】":
                    return new_ltx
            return f"【公式{ss.group(0)}】"

        s = re.sub(r' data-latex="(\\\\\[|\\\[)(.*?)(\\\]|\\\\\])"', r' data-latex="$\2$"', s)
        s = re.sub(r'<img src=((?!src).)+?data-latex="(\$?((?!["/]>).)+?\$?)".*? />', sub4, s, flags=re.S)
    else:
        soup = BeautifulSoup(html, features="lxml")
        s = ''
        # Set once a blank numbered 0 is seen; later blank numbers are shifted by one.
        quan_begin_with_zero = 0
        for line in soup.prettify().split('\n'):  # must split on "\n"
            stripped = line.strip()
            if stripped.startswith('<img'):
                img = BeautifulSoup(line, features="lxml").img
                if not img:
                    continue
                latex = img.get('data-latex')
                if latex:
                    if re.sub(r"^\\\[|\\\]$|\$|\s+", "", latex):
                        latex = re.sub(r"^\\\[(.*?)\\\]$", r"$\1$", latex)
                        if re.match(r"\$.*?\$$", latex.strip()) is None:
                            latex = "${}$".format(latex)
                        piece = ltx_wash(latex)
                        if piece == "【公式】":
                            piece = f"【公式{img}】"
                        s += piece.replace('%20', ' ').replace('%3C', '<').replace('%3E', '>').strip()
                    else:
                        # data-latex is present but holds only delimiters/whitespace.
                        if need_latex:
                            formula_without_ltx = 1
                            break
                        s += f"【公式{img}】"
                    continue

                src = img.get('src')
                if not src:
                    continue
                if 'class="gsImgLatex mathType"' in line:
                    if len(src.split('?')) == 2:
                        # The LaTeX travels in the query string, e.g.
                        # http://tkimgs.zhixinhuixue.net/image/word/2021/11/11/1636638682578739.gif?%20-%20{e^{%20-%20x}}%20-%203x
                        piece = ltx_wash("${}$".format(src.split('?')[-1]))
                        if piece == "【公式】":
                            piece = f"【公式{img}】"
                    else:
                        # Dot escaped: the old pattern '.gif' matched any character before "gif".
                        if re.search(r'\.gif("|$)', src) is None:
                            logger.info("【{}】特殊公式格式{}".format(paper_id, str(line)))
                        if need_latex:
                            formula_without_ltx = 1
                            break
                        piece = f"【公式{img}】"
                elif 'data-type="math"' in line:
                    if need_latex:
                        formula_without_ltx = 1
                        break
                    piece = f"【公式{img}】"
                elif 'class="tiankong"' in line:
                    # Fill-in-the-blank marker, rendered as a circled number ①..⑳.
                    try:
                        serial_num = int(img.get('data-num'))
                    except (TypeError, ValueError):
                        # Missing or non-numeric data-num: use the plain blank below.
                        serial_num = None
                    if serial_num == 0:
                        quan_begin_with_zero = 1
                    if serial_num is not None and quan_begin_with_zero:
                        serial_num = serial_num + 1
                    if serial_num is not None and 1 <= serial_num <= 20:
                        piece = '__{}__'.format(num2circle[str(serial_num)])
                    elif serial_num is not None and serial_num > 20:
                        piece = '__({})__'.format(serial_num)
                    else:
                        piece = '____'
                        logger.info("【{}】特殊带圈符号:{}".format(paper_id, str(line)))
                else:
                    piece = str(img)
                s += piece
            elif stripped.startswith("<table"):
                s += "<table>"
            elif re.match(r"</?t[rd][\s>]|</tbody>|</table>", stripped):
                if stripped.startswith("</") or re.search(r"[\s>]$", s):
                    s += stripped
                else:
                    s += " " + stripped
            elif stripped.startswith('<span '):
                if "underline" in stripped:
                    s += "_______"
            elif stripped.startswith('<'):
                # Every other tag is dropped; only line breaks are kept.
                if re.match(r"<br\s*/?>|</p>", stripped):
                    s += "<br>"
            else:
                s += stripped

    s = re.sub(' +', " ", s)
    s = re.sub(r'<br>\s*[;;]\s*<br>', ";<br>", s)
    s = re.sub(r'(<br>)+', "<br>", s)
    s = re.sub(r'<br>(<br>|\s)+', "<br>", s)
    s = re.sub(r'(<br>)+(</t[dr]>)', r"\2", s)
    # Decode entities before '#' is stripped, otherwise "&#39;" could never match.
    s = (s.replace("&#39;", "'").replace("&apos;", "'")
         .replace("&nbsp;", " ").replace("\xa0", " "))
    # NOTE(review): stripping '#' also removes the "##&##" separator that
    # ltx_wash puts inside its markers - confirm this is intended.
    s = s.replace('#', '')
    s = s.replace("&lt;", "<")
    s = s.replace("&gt;", ">")
    s = s.replace('%20', '').replace('%22', '"').replace('%40', '@')
    s = s.replace('%25', '%').replace('%26', '&').replace('%23', '#').replace('%28', '(').replace('%29', ')')
    s = s.replace('%2B', '+').replace('%2C', ',').replace('%2F', '/')
    s = s.replace('%3E', '>').replace('%3F', '?').replace('%5C', '\\').replace('%7C', '|')
    s = s.replace('%3C', '<').replace('%3D', '=').replace('%3A', ':').replace('%3B', ';')
    return s.strip(), imgs, formula_without_ltx
def again_wash(item, paper_id):
    """Clean a question string a second time and split it on ``<br>``.

    Two parallel sentence lists are produced from the same cleaned text:

    * ``sents_with_imginfo`` keeps the original ``<img>`` tags and, for
      ``【washed##&##raw】`` markers, the raw part.
    * ``simply_sents`` uses ``【图片】`` / ``【公式】`` placeholders, the washed
      part of formula markers, ``_{...}`` / ``^{...}`` for sub/sup tags, and
      flattened tables.  If flattening the tables changes the number of
      sentences, the version without table flattening is used instead.

    Args:
        item: Question text, possibly containing HTML and formula markers.
        paper_id: Identifier used only in the log message.

    Returns:
        ``(sents_with_imginfo, simply_sents)``, two lists of stripped,
        non-empty strings.  A mismatch in their lengths is reported (print
        plus log) but still returned.
    """
    item = re.sub('</table>', '</table><br>', item)
    item = re.sub(r"【(<img .*?\"\s*/?>公式latex提取失败)】", r"【公式\1】", item)
    # Wrap every image that is not already part of a formula marker.
    item = re.sub(r"(?<!公式)(<img .*?[\"']\s*/?>)", r"【图片\1】", item)
    item = re.sub(r"<span style=\"color: red\">(.*?)</span>", r"\1", item)
    # Drop single-attribute opening tags, except <img>.
    item = re.sub(r'<([a-z]+) [a-z]+="[^<>]*?"\s*/?>',
                  lambda m: "" if m.group(1) != "img" else m.group(0), item)
    item = re.sub(r'</?body>|</?head>|</?html>|<p>|</p>', '<br>', item)
    item = re.sub(r'<b\*?r\s*/?>', '<br>', item)
    item = re.sub(r'(<br>)+', "<br>", item)
    # Raw string: the former "\s{3,}" literal was an invalid escape sequence.
    item = re.sub(r"\s{3,}", " ", item)

    # Variant that keeps the original formulas and images.
    text_with_imginfo = re.sub(r"【图片([^】]+?)】", r"\1", item)
    text_with_imginfo = re.sub(r"【([^】]+?)##&##([^】]+?)】", r"\2", text_with_imginfo)
    text_with_imginfo = re.sub(
        r"【公式([^】]+?)】",
        lambda m: f"【{m.group(1)}】" if "latex提取失败" in m.group(1) else m.group(1),
        text_with_imginfo)

    # Variant with simplified formulas, images and tables.
    simply_text = re.sub("【图片[^】]+?】", "【图片】", item)
    simply_text = re.sub(r"【([^】]+?)##&##([^】]+?)】", r"\1", simply_text)
    simply_text = re.sub("【公式[^】]+?】", "【公式】", simply_text)
    simply_text = re.sub(r'<sub>(.+?)</sub>', r'_{\1}', simply_text)
    simply_text = re.sub(r'<sup>(.+?)</sup>', r'^{\1}', simply_text)

    # Table flattening.
    simply_text2 = re.sub(r'<sub>|</sub>|<td>\s*</td>', '', simply_text)
    simply_text2 = re.sub(r'<td .+?["\']>\s*</td>', '', simply_text2)
    simply_text2 = re.sub(r'</td>\s*<td( .+?["\'])?>', ' ', simply_text2)
    simply_text2 = re.sub(r'<tr .+?["\']>\s*</tr>|<table>|</?tbody>|<table .+?["\']>', '', simply_text2)
    simply_text2 = re.sub(r'<tr( .+?["\'])?>\s*<td( .+?["\'])?>', '<tr>', simply_text2)
    simply_text2 = re.sub('</td></tr>', '', simply_text2)
    simply_text2 = re.sub(" {3,}", " ", simply_text2)

    sents_with_imginfo = [part.strip() for part in text_with_imginfo.split("<br>") if part.strip()]
    simply_sents = [part.strip() for part in simply_text2.split("<br>") if part.strip()]
    if len(simply_sents) != len(sents_with_imginfo):
        # Table flattening changed the sentence count; retry without it.
        simply_sents = [part.strip() for part in simply_text.split("<br>") if part.strip()]
        if len(simply_sents) != len(sents_with_imginfo):
            print("清洗有重大问题!!!!!")
            logger.info(f"【{paper_id}】清洗有重大问题")

    return sents_with_imginfo, simply_sents
|