# NOTE(review): this copy of the module appears to be extraction-damaged:
# statements from many source lines are run together on a few physical lines,
# and several string/regex literals look like they lost HTML-like tags
# (e.g. replace('', '$'), patterns beginning with "(]*?)", an `else:` whose
# `if` is not present). It is not runnable as shown. Restore the file from
# version control before changing any code. Everything below this header is
# reproduced unchanged.
#
# What the visible code does:
# - num2circle: maps the strings "1".."20" to circled-number characters.
# - ltx_wash(ss): passes ss through non_data_latex_iter; on any exception
#   falls back to non_data_latex_regexp; on a second exception keeps ss as is.
#   After a successful helper call it removes "*" between two alphanumerics.
#   If anything other than "$"/whitespace remains, the text is wrapped in
#   "$...$", doubled "$ $...$ $" delimiters are collapsed, and the result is
#   returned as "【<cleaned>##&##<raw input>】"; otherwise "【公式】" is returned.
#   NOTE(review): both handlers are bare `except:`; consider `except Exception`.
# - simpwash(html, paper_id, need_latex=0): cleans an HTML string and returns
#   the tuple (s.strip(), imgs, formula_without_ltx). The visible `else` branch
#   parses with BeautifulSoup(html, features="lxml") and walks
#   soup.prettify().split('\n'). imgs receives str(s2.img) values.
#   formula_without_ltx is set to 1 (and the loop is left) in the branches
#   where need_latex is truthy and no LaTeX could be taken from a formula
#   image. Images with class="tiankong" become "__<circled number>__" via
#   num2circle for serial numbers 1..20. The tail of the function replaces a
#   fixed list of percent-escapes (%22, %40, %25, ...) with their characters.
#   NOTE(review): the final pass replaces '%20' with '' while an earlier branch
#   replaces it with ' ' — confirm which is intended.
# - again_wash(item, paper_id): second cleaning pass over a question string.
#   Builds text_with_imginfo (keeps the raw formula/image payloads) and
#   simply_text / simply_text2 (payloads reduced to "【图片】" / "【公式】"), splits
#   each into stripped, non-empty segments (the delimiter literal is not
#   recoverable from this copy), falls back from simply_text2 to simply_text
#   when the segment counts differ, prints and logs "清洗有重大问题" if they
#   still differ, and returns (sents_with_imginfo, simply_sents).
import re # import sys from PointerNet import config from bs4 import BeautifulSoup from Utils.main_clear.sci_clear import non_data_latex_iter, non_data_latex_regexp logger = config.myLog(__name__, log_cate="tmp_clear", subject="clear_log").getlog() num2circle = {"1": "①", "2": "②", "3": "③", "4": "④", "5": "⑤", "6": "⑥", "7": "⑦", "8": "⑧", "9": "⑨", "10": "⑩", "11": "⑪", "12": "⑫", "13": "⑬", "14": "⑭", "15": "⑮", "16": "⑯", "17": "⑰", "18": "⑱", "19": "⑲", "20": "⑳"} def ltx_wash(ss): raw_ss = ss try: ss = non_data_latex_iter(ss) # 拿到字符串中的latex再转maple ss = re.sub(r'([a-zA-Z\d])\*([a-zA-Z\d])', r"\1\2", ss) except: try: ss = non_data_latex_regexp(ss) # 拿到字符串中的latex再转maple ss = re.sub(r'([a-zA-Z\d])\*([a-zA-Z\d])', r"\1\2", ss) except: pass if ss.replace("$", "").strip(): ss = "${}$".format(ss) ss = re.sub(r'\$\s*\$(.+?)\$\s*\$', r"$\1$", ss.strip()) ss = f"【{ss}##&##{raw_ss}】" else: ss = "【公式】" return ss def simpwash(html, paper_id, need_latex=0): """ # data-latex="xxx", 引号里面不要再有",否则BeautifulSoup会掉内容 """ imgs = [] formula_without_ltx = 0 html = re.sub(r'data-latex="(.*?)(?])', lambda x: 'data-latex="{}"'.format( x.group(1).replace("\"", "'")), html, flags=re.S) # .replace("\n", "") html = html.replace(r'*-*', '').replace(r'\a*rg', 'arg').replace('', '$').replace('', '$') html = re.sub(r'
', "
", html) html = html.replace("'", "'").replace(" ", " ").replace("\xa0", " ") html = re.sub(r'(]*?)style="[^<>]*?"', r"\1", html) html = re.sub(r'(]*?)valign="[^<>]*?"', r"\1", html) html = re.sub(r'(]*?)align="[^<>]*?"', r"\1", html) html = re.sub(r'(]*?)class="[^<>]*?"', r"\1", html) html = re.sub(r'(]*?)width="[^<>]*?"', r"\1", html) html = re.sub(r'(]*?)height="[^<>]*?"', r"\1", html) html = re.sub(r'

\s*

", html) s = re.sub('

', "
", s) s = re.sub('

', "
", s) # ltx处理 def sub4(ss): if ss.group(2).replace("$", "").strip(): new_ltx = ltx_wash(ss.group(2)) if new_ltx != "【公式】": return new_ltx return f"【公式{ss.group(0)}】" s = re.sub(r' data-latex="(\\\\\[|\\\[)(.*?)(\\\]|\\\\\])"', r' data-latex="$\2$"', s) s = re.sub(r').)+?\$?)".*? />', sub4, s, flags=re.S) else: # print("paper_id:::", paper_id) soup = BeautifulSoup(html, features="lxml") s = '' # print(soup.prettify()) quan_begin_with_zero = 0 all_parts = soup.prettify().split('\n') # 这里必须是\n for nn, i in enumerate(all_parts): # print(i) if i.strip().startswith('')) if re.sub(r"^\\\[|\\\]$|\$|\s+", "", s3): s3 = re.sub(r"^\\\[(.*?)\\\]$", r"$\1$", s3) if re.match("\$.*?\$$", s3.strip()) is None: s3 = "${}$".format(s3) # print("latex:::", s3) s3 = ltx_wash(s3) if s3 == "【公式】": s3 = f"【公式{s2.img}】" # print("latex_washed:::", s3) s += s3.replace('%20', ' ').replace('%3C', '<').replace('%3E', '>').strip() else: if need_latex: formula_without_ltx = 1 break s += f"【公式{s2.img}】" else: s3 = s2.img.get('src') if not s3: continue elif 'class="gsImgLatex mathType"' in i: if len(s3.split('?')) == 2: # http://tkimgs.zhixinhuixue.net/image/word/2021/11/11/1636638682578739.gif?%20-%20{e^{%20-%20x}}%20-%203x s3 = ltx_wash("${}$".format(s3.split('?')[-1])) if s3 == "【公式】": s3 = f"【公式{s2.img}】" else: if re.search('.gif("|$)', s3) is None: logger.info("【{}】特殊公式格式{}".format(paper_id, str(i))) if need_latex: formula_without_ltx = 1 break s3 = f"【公式{s2.img}】" else: if 'data-type="math"' in i: if need_latex: formula_without_ltx = 1 break s3 = f"【公式{s2.img}】" elif 'class="tiankong"' in i: # ①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳ serial_num = int(s2.img.get('data-num')) if serial_num == 0: quan_begin_with_zero = 1 if quan_begin_with_zero: serial_num = serial_num + 1 if 1 <= serial_num <= 20: s3 = '__{}__'.format(num2circle[str(serial_num)]) elif int(serial_num) + 1 > 20: s3 = '__({})__'.format(serial_num) else: s3 = '____' logger.info("【{}】特殊带圈符号:{}".format(paper_id, str(i))) else: s3 = s2.img.__str__() # 
imgs.append(str(s2.img)) # if s2.img.get('height'): # img_h = s2.img.get('height') # else: # try: # img_h = Image.open(io.BytesIO(requests.get(s3).content)).size[1] # except: # logger.info("【{}】图片有问题：{}".format(subject, paper_id)) # img_h = 0 # 默认不要 # raw_img = re.findall(']*?src="'+s3+'"[^<>]*?/?>', html) # print(str(s2.img)) # imgs.extend(raw_img) # img_h = str(img_h).replace("px", "").replace("pt", "").replace("in", "") # s3 = '【图片】' # if need_latex == 0 and float(img_h) <= 50: # s3 = '' s += s3 elif i.strip().startswith("" + all_parts[nn + 1].strip()) s += "" elif re.match("]||
", i.strip()): if re.match("]$", s) else " " + i.strip() elif i.strip().startswith('|

", i.strip()): s += "
" pass else: s += i.strip() # print(s) # print("****************************") s = re.sub(' +', " ", s) # \s匹配任何空白字符，包括空格、制表符、换页符、换行符等 # s = re.sub('
', "\n", s) # s = re.sub(r'\\n\s*[;；]\s*\\n', "；\n", s) # s = re.sub(r'\\n+', "\n", s) # s = re.sub('\\n+', "\n", s) # s = re.sub('\n[\n\s]+', "\n", s) # s = re.sub(r'\n+()', r"\1", s) s = re.sub(r'
\s*[;；]\s*
', "；
", s) s = re.sub(r'(
)+', "
", s) s = re.sub('
(
|\s)+', "
", s) s = re.sub(r'(
)+()', r"\2", s) s = s.replace('#', '').replace("'", "'").replace(" ", " ").replace("\xa0", " ") s = s.replace("<", "<") s = s.replace(">", ">") s = s.replace('%20', '').replace('%22', '"').replace('%40', '@') s = s.replace('%25', '%').replace('%26', '&').replace('%23', '#').replace('%28', '(').replace('%29', ')') s = s.replace('%2B', '+').replace('%2C', ',').replace('%2F', '/') s = s.replace('%3E', '>').replace('%3F', '?').replace('%5C', '\\').replace('%7C', '|') s = s.replace('%3C', '<').replace('%3D', '=').replace('%3A', ':').replace('%3B', ';') return s.strip(), imgs, formula_without_ltx def again_wash(item, paper_id): """ 对试题str再次清洗 """ item = re.sub('', '
', item) item = re.sub(r"【(公式latex提取失败)】", r"【公式\1】", item) item = re.sub(r"(?)", r"【图片\1】", item) item = re.sub(r"(.*?)", r"\1", item) item = re.sub(r'<([a-z]+) [a-z]+="[^<>]*?"\s*/?>', lambda x: "" if x.group(1) != "img" else x.group(0), item) # item_str = re.sub(r"<([a-z]+)>(.+?)", lambda x: x.group(2) if x.group(1) not in ["sub", "sup"] else x.group(0), item_str) item = re.sub(r'|||

', '
', item) item = re.sub(r'', '
', item) item = re.sub(r'(
)+', "
", item) # item = re.sub(r'\n+', "\n", item) # item = re.sub(r'\\n+', "\n", item) item = re.sub("\s{3,}", " ", item) # 保留原始公式和图片 text_with_imginfo = re.sub(r"【图片([^】]+?)】", r"\1", item) text_with_imginfo = re.sub(r"【([^】]+?)##&##([^】]+?)】", r"\2", text_with_imginfo) text_with_imginfo = re.sub(r"【公式([^】]+?)】", lambda x: f"【{x.group(1)}】" if "latex提取失败" in x.group(1) else x.group(1), text_with_imginfo) # 简化了公式、图片和表格 simply_text = re.sub("【图片[^】]+?】", "【图片】", item) simply_text = re.sub(r"【([^】]+?)##&##([^】]+?)】", r"\1", simply_text) simply_text = re.sub("【公式[^】]+?】", "【公式】", simply_text) simply_text = re.sub(r'_(.+?)', r'_{\1}', simply_text) simply_text = re.sub(r'^(.+?)', r'^{\1}', simply_text) # 表格简化 simply_text2 = re.sub(r'_||\s*', '', simply_text) simply_text2 = re.sub(r'\s*', '', simply_text2) simply_text2 = re.sub(r'\s*', ' ', simply_text2) simply_text2 = re.sub(r'\s*|||

', '', simply_text2) simply_text2 = re.sub(r'\s*', '', simply_text2) simply_text2 = re.sub('', '', simply_text2) simply_text2 = re.sub(" {3,}", " ", simply_text2) sents_with_imginfo = [i.strip() for i in text_with_imginfo.split("
") if i.strip()] simply_sents = [i.strip() for i in simply_text2.split("
") if i.strip()] if len(simply_sents) != len(sents_with_imginfo): simply_sents = [i.strip() for i in simply_text.split("
") if i.strip()] if len(simply_sents) != len(sents_with_imginfo): print("清洗有重大问题!!!!!") logger.info(f"【{paper_id}】清洗有重大问题") return sents_with_imginfo, simply_sents