# cdZWj / PaperText_Segmentation
import re
# import sys

from bs4 import BeautifulSoup

from PointerNet import config
from Utils.main_clear.sci_clear import non_data_latex_iter, non_data_latex_regexp


# Module-level logger built through the project's config.myLog factory
# (log category "tmp_clear", subject "clear_log"); used by the washing
# functions below to record unusual markup.
logger = config.myLog(__name__, log_cate="tmp_clear",
                         subject="clear_log").getlog()

# Maps the decimal strings "1".."20" to the matching circled-number glyph.
num2circle = {
    str(position): glyph
    for position, glyph in enumerate("①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳", start=1)
}


def ltx_wash(ss):
    """Normalise one formula string and wrap it in a marker.

    The string is first passed to ``non_data_latex_iter``; if that raises,
    ``non_data_latex_regexp`` is tried instead; if both raise, the input is
    used as it is.  After a successful conversion, a ``*`` sitting between
    two ASCII letters/digits is dropped.

    Args:
        ss: the formula text (typically already wrapped in ``$...$``).

    Returns:
        ``"【<converted>##&##<original>】"`` when something other than ``$``
        and whitespace is left, with the converted part wrapped in exactly
        one pair of ``$``; otherwise the placeholder ``"【公式】"``.
    """
    raw_ss = ss
    # Conversion is deliberately best-effort: a formula that cannot be
    # converted is kept verbatim.  Only ``Exception`` is caught so that
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    try:
        # Per the original note: take the LaTeX in the string and convert it
        # to Maple form.
        ss = non_data_latex_iter(ss)
        ss = re.sub(r'([a-zA-Z\d])\*([a-zA-Z\d])', r"\1\2", ss)
    except Exception:
        try:
            # Same conversion through the regexp-based helper.
            ss = non_data_latex_regexp(ss)
            ss = re.sub(r'([a-zA-Z\d])\*([a-zA-Z\d])', r"\1\2", ss)
        except Exception:
            pass
    if ss.replace("$", "").strip():
        ss = "${}$".format(ss)
        # Collapse a doubled "$$...$$" wrapper into a single "$...$".
        ss = re.sub(r'\$\s*\$(.+?)\$\s*\$', r"$\1$", ss.strip())
        ss = f"【{ss}##&##{raw_ss}】"
    else:
        ss = "【公式】"
    return ss


def simpwash(html, paper_id, need_latex=0):
    """Flatten a question's HTML into text with <br>/table tags and markers.

    Formula images become ``【...】`` markers (via ``ltx_wash`` when a
    ``data-latex`` attribute is usable, otherwise ``【公式<img ...>】``),
    "tiankong" blank images become ``__①__``-style placeholders, underlined
    spans become ``_______``, and table tags are kept in a reduced form.

    Simple HTML (no span/font/div/... tags and no type/style/class
    attributes) is handled with regular expressions only; everything else is
    parsed with BeautifulSoup and walked line by line over ``prettify()``.

    Note: inside ``data-latex="..."`` there must be no further double quote,
    otherwise BeautifulSoup drops content; inner quotes are therefore turned
    into single quotes up front.

    Args:
        html: the HTML fragment to clean.
        paper_id: identifier used only in log messages.
        need_latex: when truthy, processing stops at the first formula image
            that carries no usable LaTeX and the flag below is set.

    Returns:
        A tuple ``(text, imgs, formula_without_ltx)``: the cleaned text,
        a list that is currently always empty, and 1 if processing was
        stopped because of a formula without LaTeX (else 0).
    """
    imgs = []
    formula_without_ltx = 0

    # Replace double quotes inside a data-latex value by single quotes.
    html = re.sub(r'data-latex="(.*?)(?<!\\)"(?=[\s/>])', lambda x: 'data-latex="{}"'.format(
        x.group(1).replace("\"", "'")), html, flags=re.S)
    html = html.replace(r'*-*', '').replace(r'\a*rg', 'arg').replace('<latex>', '$').replace('</latex>', '$')
    html = re.sub(r'<br/\s*>', "<br>", html)
    html = html.replace("&#39;", "'").replace("&nbsp;", " ").replace("\xa0", " ")
    # Drop presentational attributes from <td>/<tr> tags (same order as the
    # original one-substitution-per-attribute sequence).
    for attr in ("style", "valign", "align", "class", "width", "height"):
        html = re.sub(r'(<t[dr] [^<>]*?)' + attr + r'="[^<>]*?"', r"\1", html)
    html = re.sub(r'<p style="[^<>]*?">\s*</p>', "", html)

    if re.search(r'</?(span|font|article|ul|ol|div)(\s*|\s+style=.*?"|\s+class=.*?")>|text\s*-\s*decoration: underline|border\s*-\s*bottom:'
                 r'|class="gsImgLatex| type="| style="| class="', str(html)) is None:
        # Fast path: nothing that needs a real parser.
        s = re.sub(r'<p(\s*|\s+style=.*?")>', "<p>", html)
        s = re.sub('</p><p>', "<br>", s)
        s = re.sub('</p>|<p>', "<br>", s)

        # LaTeX handling for <img ... data-latex="..."> tags.
        def sub4(ss):
            if ss.group(2).replace("$", "").strip():
                new_ltx = ltx_wash(ss.group(2))
                if new_ltx != "【公式】":
                    return new_ltx
            return f"【公式{ss.group(0)}】"
        s = re.sub(r' data-latex="(\\\\\[|\\\[)(.*?)(\\\]|\\\\\])"', r' data-latex="$\2$"', s)
        s = re.sub(r'<img src=((?!src).)+?data-latex="(\$?((?!["/]>).)+?\$?)".*? />', sub4, s, flags=re.S)

    else:
        soup = BeautifulSoup(html, features="lxml")
        s = ''
        # Set once a blank numbered 0 is seen; from then on (including that
        # blank) every blank number is shifted up by one.
        quan_begin_with_zero = 0
        all_parts = soup.prettify().split('\n')  # the separator must be \n here
        for i in all_parts:
            stripped = i.strip()
            if stripped.startswith('<img'):
                s2 = BeautifulSoup(i, features="lxml")
                if s2.img:
                    s3 = s2.img.get('data-latex')
                    if s3:
                        # Is anything left once the \[ \] / $ wrappers and
                        # whitespace are removed?
                        if re.sub(r"^\\\[|\\\]$|\$|\s+", "", s3):
                            s3 = re.sub(r"^\\\[(.*?)\\\]$", r"$\1$", s3)
                            if re.match(r"\$.*?\$$", s3.strip()) is None:
                                s3 = "${}$".format(s3)
                            s3 = ltx_wash(s3)
                            if s3 == "【公式】":
                                s3 = f"【公式{s2.img}】"
                            s += s3.replace('%20', ' ').replace('%3C', '<').replace('%3E', '>').strip()
                        else:
                            if need_latex:
                                formula_without_ltx = 1
                                break
                            s += f"【公式{s2.img}】"
                    else:
                        s3 = s2.img.get('src')
                        if not s3:
                            continue
                        elif 'class="gsImgLatex mathType"' in i:
                            if len(s3.split('?')) == 2:
                                # The LaTeX is carried in the query string, e.g.
                                # http://tkimgs.zhixinhuixue.net/image/word/2021/11/11/1636638682578739.gif?%20-%20{e^{%20-%20x}}%20-%203x
                                s3 = ltx_wash("${}$".format(s3.split('?')[-1]))
                                if s3 == "【公式】":
                                    s3 = f"【公式{s2.img}】"
                            else:
                                # The dot is escaped so that only a real
                                # ".gif" suffix counts as the usual format.
                                if re.search(r'\.gif("|$)', s3) is None:
                                    logger.info("【{}】特殊公式格式{}".format(paper_id, str(i)))
                                if need_latex:
                                    formula_without_ltx = 1
                                    break
                                s3 = f"【公式{s2.img}】"
                        else:
                            if 'data-type="math"' in i:
                                if need_latex:
                                    formula_without_ltx = 1
                                    break
                                s3 = f"【公式{s2.img}】"
                            elif 'class="tiankong"' in i:  # ①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳
                                # A missing or non-numeric data-num is treated
                                # like any other unusable number instead of
                                # aborting the whole wash.
                                try:
                                    serial_num = int(s2.img.get('data-num'))
                                except (TypeError, ValueError):
                                    serial_num = None
                                if serial_num == 0:
                                    quan_begin_with_zero = 1
                                if serial_num is not None and quan_begin_with_zero:
                                    serial_num = serial_num + 1
                                if serial_num is None or serial_num < 1:
                                    s3 = '____'
                                    logger.info("【{}】特殊带圈符号:{}".format(paper_id, str(i)))
                                elif serial_num <= 20:
                                    s3 = '__{}__'.format(num2circle[str(serial_num)])
                                else:
                                    s3 = '__({})__'.format(serial_num)
                            else:
                                # Ordinary picture: keep the tag itself.
                                s3 = str(s2.img)
                        s += s3
            elif stripped.startswith("<table"):
                # Any attributes on <table> are dropped.
                s += "<table>"
            elif re.match(r"</?t[rd][\s>]|</tbody>|</table>", stripped):
                if re.match("</", stripped):
                    s += stripped
                else:
                    # Separate an opening <tr>/<td> from preceding text.
                    s += stripped if re.search(r"[\s>]$", s) else " " + stripped
            elif stripped.startswith('<span '):
                if "underline" in stripped:
                    s += "_______"
            elif stripped.startswith('<'):
                # Of the remaining tags only line breaks / paragraph ends
                # leave a trace.
                if re.match(r"<br\s*/?>|</p>", stripped):
                    s += "<br>"
            else:
                s += stripped

    # Only runs of spaces are collapsed (\s would also match tabs, form
    # feeds, newlines, ...).
    s = re.sub(' +', " ", s)
    s = re.sub(r'<br>\s*[;；]\s*<br>', "；<br>", s)
    s = re.sub(r'(<br>)+', "<br>", s)
    s = re.sub(r'<br>(<br>|\s)+', "<br>", s)
    s = re.sub(r'(<br>)+(</t[dr]>)', r"\2", s)

    # NOTE(review): this strips every '#', including the '##&##' separator
    # produced by ltx_wash, and makes the "&#39;" replacement right after it
    # unreachable -- confirm that this is intended.
    s = s.replace('#', '').replace("&#39;", "'").replace("&nbsp;", " ").replace("\xa0", " ")
    s = s.replace("&lt;", "<")
    s = s.replace("&gt;", ">")
    # Hand-rolled decoding of a fixed set of percent escapes ("%20" is
    # removed rather than turned into a space).
    s = s.replace('%20', '').replace('%22', '"').replace('%40', '@')
    s = s.replace('%25', '%').replace('%26', '&').replace('%23', '#').replace('%28', '(').replace('%29', ')')
    s = s.replace('%2B', '+').replace('%2C', ',').replace('%2F', '/')
    s = s.replace('%3E', '>').replace('%3F', '?').replace('%5C', '\\').replace('%7C', '|')
    s = s.replace('%3C', '<').replace('%3D', '=').replace('%3A', ':').replace('%3B', ';')
    return s.strip(), imgs, formula_without_ltx


def again_wash(item, paper_id):
    """Clean a question string a second time and split it into sentences.

    Args:
        item: question text, typically with ``【...】`` formula markers,
            ``<img>`` tags, ``<br>`` separators and table tags.
        paper_id: identifier used only in the log message.

    Returns:
        A tuple ``(sents_with_imginfo, simply_sents)`` of two lists obtained
        by splitting on ``<br>``: the first keeps the original formula text
        and image tags, the second replaces them by ``【图片】`` /
        ``【公式】`` / the converted formula and flattens tables.  When the
        two lists differ in length, the version without table flattening is
        used for the second list; if they still differ, the problem is
        printed and logged.
    """
    item = re.sub('</table>', '</table><br>', item)
    item = re.sub(r"【(<img .*?\"\s*/?>公式latex提取失败)】", r"【公式\1】", item)
    # Wrap every <img> that is not already part of a formula marker.
    item = re.sub(r"(?<!公式)(<img .*?[\"']\s*/?>)", r"【图片\1】", item)
    item = re.sub(r"<span style=\"color: red\">(.*?)</span>", r"\1", item)
    # Remove single-attribute tags, except <img>.
    item = re.sub(r'<([a-z]+) [a-z]+="[^<>]*?"\s*/?>', lambda x: "" if x.group(1) != "img" else x.group(0), item)
    item = re.sub(r'</?body>|</?head>|</?html>|<p>|</p>', '<br>', item)
    item = re.sub(r'<b\*?r\s*/?>', '<br>', item)
    item = re.sub(r'(<br>)+', "<br>", item)
    item = re.sub(r"\s{3,}", "  ", item)

    # Variant that keeps the original formulas and images.
    text_with_imginfo = re.sub(r"【图片([^】]+?)】", r"\1", item)
    text_with_imginfo = re.sub(r"【([^】]+?)##&##([^】]+?)】", r"\2", text_with_imginfo)
    text_with_imginfo = re.sub(r"【公式([^】]+?)】", lambda x: f"【{x.group(1)}】"
                                    if "latex提取失败" in x.group(1) else x.group(1), text_with_imginfo)

    # Variant with simplified formulas, images and tables.
    simply_text = re.sub("【图片[^】]+?】", "【图片】", item)
    simply_text = re.sub(r"【([^】]+?)##&##([^】]+?)】", r"\1", simply_text)
    simply_text = re.sub("【公式[^】]+?】", "【公式】", simply_text)
    simply_text = re.sub(r'<sub>(.+?)</sub>', r'_{\1}', simply_text)
    simply_text = re.sub(r'<sup>(.+?)</sup>', r'^{\1}', simply_text)
    # Table simplification: drop empty cells/rows, join cells with a space.
    simply_text2 = re.sub(r'<sub>|</sub>|<td>\s*</td>', '', simply_text)
    simply_text2 = re.sub(r'<td .+?["\']>\s*</td>', '', simply_text2)
    simply_text2 = re.sub(r'</td>\s*<td( .+?["\'])?>', ' ', simply_text2)
    simply_text2 = re.sub(r'<tr .+?["\']>\s*</tr>|<table>|</?tbody>|<table .+?["\']>', '', simply_text2)
    simply_text2 = re.sub(r'<tr( .+?["\'])?>\s*<td( .+?["\'])?>', '<tr>', simply_text2)
    simply_text2 = re.sub('</td></tr>', '', simply_text2)
    simply_text2 = re.sub(" {3,}", "  ", simply_text2)

    sents_with_imginfo = [i.strip() for i in text_with_imginfo.split("<br>") if i.strip()]
    simply_sents = [i.strip() for i in simply_text2.split("<br>") if i.strip()]
    if len(simply_sents) != len(sents_with_imginfo):
        # Table flattening changed the sentence count: fall back to the
        # variant without it.
        simply_sents = [i.strip() for i in simply_text.split("<br>") if i.strip()]
        if len(simply_sents) != len(sents_with_imginfo):
            print("清洗有重大问题!!!!!")
            logger.info(f"【{paper_id}】清洗有重大问题")

    return sents_with_imginfo, simply_sents