|
@@ -0,0 +1,496 @@
|
|
|
+#!/usr/bin/env python
|
|
|
+# -*- coding:utf-8 -*-
|
|
|
+
|
|
|
+import re
|
|
|
+import base64, os, random
|
|
|
+import time
|
|
|
+from pprint import pprint
|
|
|
+import configs
|
|
|
+from utils.field_eq2latex import get_latex
|
|
|
+from utils.html_again_parse import css_label_wash
|
|
|
+
|
|
|
+
|
|
|
def table_label_cleal(con):
    """
    Remove the line breaks that sit between table-related tags.

    Steps, in order:
      1. collapse a newline followed by any run of whitespace into one newline;
      2. up to ten times, while a tag is still directly followed by a newline
         and another tag, glue table/div tags that are separated by a newline;
      3. drop tables whose cells only hold empty ``<p></p>`` paragraphs
         (together with the ``<p>`` that opens the following paragraph);
      4. put a newline before a "(digit)" marker that directly follows
         ``</table><p>``.

    :param con: HTML text.
    :return: the cleaned HTML text.
    """
    cleaned = re.sub(r"\n(\s|\n|\t)+", "\n", con)

    # Bounded number of passes: one pass may expose new tag/newline/tag pairs.
    for _ in range(10):
        if not re.search(r"</?[a-z]+>\n(</?[a-z]+>|<td\s+\n*[a-z=\"\d]+>)", cleaned, re.S):
            break
        cleaned = re.sub("(</?t[dr]>|</?table>|</?tbody>|</?div>)\n(</?t[dr]>|</div>|</?table>|</?tbody>|<p>)",
                         r"\1\2", cleaned, flags=re.S)
        cleaned = re.sub(r'(</?t[rd]>)\n(<td\s.+?>)', r'\1\2', cleaned, flags=re.S)

    # Remove tables that contain nothing but empty paragraphs.
    empty_table = (r'<table>[\s\n\t]*?<tbody>[\s\n\t]*?(<tr>[\s\n\t]*?<td[^<>]*?>[\s\n\t]*?<p>[\s\n\t]*?</p>'
                   r'[\s\n\t]*?</td>[\s\n\t]*?</tr>[\s\n\t]*?)+</tbody>[\s\n\t]*?</table>[\s\n\t]*?<p>')
    cleaned = re.sub(empty_table, "", cleaned, flags=re.S)

    return re.sub(r'(</table><p>)\s*([((]\s*\d\s*[))])', r'\1\n\2', cleaned)
|
|
|
+
|
|
|
+
|
|
|
def base642img(html_data, wordid):
    """
    Save base64-encoded inline images to disk and point their tags at the files.

    Every ``<img ... src="data:image/<type>;base64,<payload>" ...>`` tag found
    in ``html_data`` is decoded and written to
    ``configs.IMG_FOLDER/<wordid>/new_image<unix-seconds><random>.<type>``.
    The tag is then replaced by ``<img src="<configs.new_img_ip>/<wordid>/<name>" />``;
    ``width``/``height`` attributes are carried over only when both are
    present as plain integers. Other attributes (alt, style, ...) are dropped.
    Tags whose data URI has no ``data:image/<type>;base64`` header are left
    untouched.

    :param html_data: HTML text (mathjax output rendered as css-html).
    :param wordid: document id, used as the sub-folder name.
    :return: ``html_data`` with the handled tags replaced; returned unchanged
             when it contains no base64 image.
    :raises binascii.Error: if a payload is not valid base64.
    """
    all_base64_image = re.findall(r'(<img ([a-z]+="[^"]*?" )?src="(data:image[^>"]+?)"(.*?)\s*/?>)',
                                  str(html_data), flags=re.S)
    if not all_base64_image:
        return html_data

    file_path = configs.IMG_FOLDER + '/' + str(wordid)
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(file_path, exist_ok=True)

    # Distinct random suffixes so images saved within the same second do not collide.
    name_list = random.sample(range(100000, 999999), len(all_base64_image))
    for n, img in enumerate(all_base64_image):
        img1 = img[2].split(",", maxsplit=1)
        img_type_info = re.search("data:image/(.+?);base64", img1[0])
        if not img_type_info:
            # Unknown header: keep the tag as it is and do not decode anything.
            continue
        img_type = img_type_info.group(1)

        # Only width/height are preserved; alt and style are dropped for now.
        w_info = re.search(r'( width="\d+")', img[3])
        h_info = re.search(r'( height="\d+")', img[3])

        img_name = "new_image" + str(int(time.time())) + str(name_list[n]) + "." + img_type
        save_path = os.path.join(file_path, img_name)
        with open(save_path, 'wb') as f:
            f.write(base64.b64decode(str(img1[-1])))

        flag_behind = '" />'
        if w_info and h_info:
            flag_behind = '"' + w_info.group(1) + h_info.group(1) + ' />'
        temp_img = '<img src="' + configs.new_img_ip + '/' + str(wordid) + '/' + img_name + flag_behind
        html_data = html_data.replace(img[0], temp_img)
    return html_data
|
|
|
+
|
|
|
+
|
|
|
class HtmlWash_2:
    """
    Regex-based cleaner that turns exam-paper HTML into a list of text rows.

    During batch re-parsing the HTML in which new (base64) images have been
    replaced by file links is kept in ``new_html`` so it can be stored as the
    OCR text, while the text that keeps being cleaned is handed on to the
    structured-parsing logic.
    """

    def __init__(self, html, wordid, is_reparse=0, img_url="", must_latex=0):
        """
        :param html: raw HTML text to clean.
        :param wordid: document id (used as the image sub-folder name and
                       passed on to ``get_latex``).
        :param is_reparse: truthy when the document is being re-parsed; enables
                           CSS-label washing and base64 image extraction.
        :param img_url: stored on the instance; not read by the methods below.
        :param must_latex: passed on to ``get_latex``.
        """
        self.html = html
        self.img_url = img_url
        self.wordid = wordid
        self.is_reparse = is_reparse
        self.must_latex = must_latex
        # Always defined, even before new_pic_sub()/html_cleal() have run.
        self.new_html = html
        # Tags that are simply deleted from the text (joined into one regex).
        # NOTE(review): the trailing "" adds an empty alternative, which only
        # produces empty matches replaced by "" (harmless) — presumably a
        # character or entity that got lost; confirm.
        self.sub_list = ["</?div>", "</?b>", "</?caption>", "</?center>", "</?cite>", "</?code>", "</?colgroup>",
                         "</?menu>", "</?dd>", "</?dir>", "</?li>", "</?em>", "</?article>", "</?header>", "</?ruby>",
                         "</?summary>", "</?details>", "</?strong>", "</?strike>", "</?small>", "</?select>",
                         "</?section>", "</?script>", "</?[su]>", "</?var>", "</?ul>", "</?tt>", "</?title>",
                         "</?thead>",
                         "</?tfoot>", "<hr />", "<hr>", ""]
        # Literal substitutions applied in one pass (keys are joined into a
        # regex alternation, so order matters: earlier keys win).
        # NOTE(review): many entries map a character to what looks like
        # itself; presumably the keys are HTML entities or look-alike
        # (full-width / Cyrillic / Greek) characters — keep them verbatim.
        self.sub_dd = {'×': '×',
                       '÷': '÷',
                       '°': '°',
                       '·': '·',
                       '±': '±',
                       'º': 'º',
                       '¹': '¹',
                       '²': '²',
                       '³': '³',
                       '½': '1/2',
                       '¼': '¼',
                       '¾': '¾',
                       '¥': '¥',
                       'm³': 'm³',
                       '£': '£',
                       '>': '>',
                       "A": "A",
                       "А": "A",
                       "Α": "A",
                       "B": "B",
                       "В": "B",
                       "в": "B",
                       "Β": "B",
                       "C": "C",
                       "С": "C",
                       "c": "c",
                       "с": "c",
                       "D": "D",
                       "Ε": "E",
                       "E": "E",
                       "F": "F",
                       "G": "G",
                       "g": "g",
                       "m": "m",
                       "N": "N",
                       "s": "s",
                       "t": "t",
                       "/": "/",
                       "=": "=",
                       "-": "-",
                       "2": "2", "3": "3", "4": "4", "5": "5", "6": "6",
                       "7": "7", "8": "8", "9": "9", "1": "1", "0": "0",
                       ' ': ' ',
                       ' ': ' ',
                       "〖": '【',
                       "〗": '】',
                       "題": '题',
                       "单项选择": '单选',
                       "多项选择": '多选',
                       "双项选择": '多选',
                       "实验与探究题": '实验',
                       "原理综合题": '原理题',
                       }

    def new_pic_sub(self):
        """
        Handle newly added images when re-parsing.

        When ``is_reparse`` is truthy, the HTML is first passed through
        ``css_label_wash`` and its base64 images are saved locally in a
        folder named after ``wordid`` (``base642img``). The resulting HTML is
        remembered in ``self.new_html`` in either case.

        :return: None
        """
        if self.is_reparse:
            # Wash CSS labels, then store base64-encoded images on disk.
            self.html = css_label_wash(self.html)
            self.html = base642img(self.html, self.wordid)
        self.new_html = self.html

    def html_cleal(self):
        """
        Clean ``self.html`` and split it into rows.

        The method is one ordered pipeline of regex substitutions; later
        steps rely on the output of earlier ones, so the order must be kept.

        :return: tuple ``(html2txt, con_list, new_html)`` — the cleaned HTML
                 string, the list of row strings obtained by splitting it, and
                 the HTML to keep as the stored source text.
        """
        # ======= strip MathJax rendering tags =======
        if "MathJax" in self.html:  # re-parsed documents may carry rendered MathJax markup
            all_mathjax = re.findall('(<span class="MathJax_Preview".*?</script>(</span>)*)', self.html)
            for jax in all_mathjax:
                latex = re.findall('<script .+?">(((?!(</)).)*?)</script>(</span>)*', jax[0])
                if latex:
                    self.html = self.html.replace(jax[0], "${}$".format(latex[0][0]))
                else:
                    self.html = self.html.replace(jax[0], "")

        # ====== new images that arrive while re-parsing ======
        self.new_pic_sub()

        # ===== special symbols =====
        html2txt = re.sub(r"|".join(self.sub_list), "", str(self.html))
        html2txt = re.sub("|".join(self.sub_dd.keys()), lambda x: self.sub_dd[x.group()], html2txt)
        html2txt = re.sub("[不非]定[向项]选择", "不定选择", html2txt)
        # NOTE: a former `.replace('', "γ")` step was removed from this chain:
        # replacing the empty string inserts "γ" between every character.
        # Presumably it targeted a private-use glyph that was lost — if so,
        # re-add it with an explicit \uXXXX escape.
        html2txt = html2txt.replace(r"\\[{\\text{V}}V\]", "Ⓥ").replace(r"\\[{\\text{A}}A\]", "Ⓐ") \
            .replace(r"\\[{\\text{W}}W\]", "Ⓦ").replace(r"\\[{\\text{X}}X\]", "Ⓧ").replace(r"\\[{\\text{G}}G\]", "Ⓖ") \
            .replace("\uf067", "γ").replace('\uf020', "").replace("\u3000", " ") \
            .replace("\u2003", " ").replace("\x7f", " ").replace("\xa0", "")
        html2txt = re.sub(r"(<p>\s*)【例题(\d+)】", r"\1\2、", html2txt)
        html2txt = re.sub(r"\\\(|\\\)", "$", html2txt)

        # Field-formula conversion; <sub>/<sup> can be shown by the front end
        # and need no LaTeX rendering. Conversion is best-effort: on any
        # failure the text is kept as it is.
        newhml = None
        try:
            html2txt, newhml = get_latex(html2txt, self.is_reparse, self.wordid, self.must_latex)
        except Exception:
            newhml = None
        if newhml:
            # Field formulas were turned into images: the stored source text
            # must carry the same image information.
            self.new_html = newhml
        html2txt = html2txt.replace("【omml-latex】", "")

        # String formulas such as Fe<sub>2</sub>O<sub>3</sub> are better handled after structuring.
        # <br/> handling
        html2txt = re.sub(r"<br\s*/?>", "\n", html2txt)
        html2txt = re.sub(r"[((]\s*(\d)\s*\$分\s*[))]", r"$(\1分)", html2txt)

        # ===== normalise question-type heading lines =====
        # Heading lines may sit inside tables; the ">= 8 tables" threshold is a rough heuristic.
        if len(re.findall("</table>", html2txt)) >= 8:
            for tt in re.finditer('<tr>(((?!(</?tr>)).)*)</tr>', html2txt, re.S):
                tt_list = re.split(r'^\s*<td[^<>]*?>|</p></td>|</td>[\n\s]*?<td[^<>]*?>'
                                   r'|</td>\s*\n|</td>\s*$|\n\s*<td[^<>]*?>|<td[^<>]*?><p>',
                                   tt.group(1).strip())
                tt_list = [col for col in tt_list if col.strip()]
                # Drop "score / grader" rows.
                if " ".join(tt_list).replace(" ", "") in ['得分评卷人', '评卷人得分']:
                    html2txt = html2txt.replace(tt.group(0), "")
        html2txt = re.sub(r"(</table>)\s*([一二三四五六七八九十]\s*[、..、::]?.{2,6}题)", r"\1</p>\2", html2txt)
        html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,::]\s*(论述|填空|探究)题?[与和、、,,\s]*?(计算题|实验题)', r"\1、\3", html2txt)
        html2txt = re.sub(r'<td[^<>]*?><p>(([一二三四五六七八九十])\s*[、..、,,::]\s*(.{2,4}题)\s*</p>)</td>[^p]*?<p>', r"\1",
                          str(html2txt), flags=re.S)
        html2txt = re.sub(r"<p>\s*([一二三四五六七八九十])\s*[、..、,,::]?\s*(计算|[解简]答|实验|作图)题?[与和、、,,\s]*?(计算|[解简]答|实验|作图)",
                          r"<p>\1、\2题", html2txt)
        html2txt = re.sub(r'<p>\s*[((]\s*[一二三四五六]\s*[))]\s*必考题\s*(.?|.+?分\s*[.。.]?)\s*</p>', "", html2txt)
        html2txt = re.sub(r'<p>\s*[((]\s*[一二三四五六]\s*[))]\s*选考题\s*.?\s*.{,4}(?<!\d)(\d+分)\s*[,,。].{,50}</p>',
                          r"<p>【选做题】:'\1'</p>", html2txt)
        html2txt = re.sub(r'<p>\s*[((]\s*[一二三四五六]\s*[))]\s*选考题\s*(.?|.+?分\s*[.。.]?)\s*</p>', "<p>【选做题】</p>", html2txt)
        html2txt = re.sub(r'<p>\s*([一二三四五六七八九十])\s*[、..、,,::]?\s*(单项?选择?|非?选择|多项?选择?|不定选择|填空|计算|[解简]答|实验|作图)题?\s*</p>',
                          r"<p>\1、\2题</p>", html2txt)
        html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,::]?\s*(单选|单项选择|选择|不定选择|多选|多项选择|填空|计算|[解简]答|实验|作图)\s*(?!题)'
                          r'([((]\s*本题|.*?\d分)', r"\1、\2题\3", html2txt)
        html2txt = re.sub(r'([一二三四五六])\s*[、..、,,::]?\s*(单选|单项选择|非?选择|不定选择|多选|多项选择|填空|计算|[解简]答|实验|作图)题',
                          r"\1、\2题", html2txt)
        html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,::]?\s*([((]\s*(每小题|本大?题)((?!(选项)).)+?[))]|综合题)',
                          r"\1、解答题", html2txt)
        html2txt = re.sub(r'(?<!<p>)\s*([一二三四五六七八九十]\s*[、..、,,::]?\s*(单项?选择?|选择|不定选择|多项?选择?|填空|计算|[解简]答|实验|作图)题)',
                          r'</p>\n<p>\1', html2txt)
        html2txt = re.sub(r'<p>\s*([一二三四五六七八九十])\s*[、..、,,::]?\s*[((]?本?大?题((?!(选项)).)+?[))]?\s*</p>', r"<p>\1、本大题</p>",
                          html2txt)

        # ===== images =====
        # 1) delete hidden images, recognised by an abnormally small width and height
        def sub1(ss):
            try:
                tiny = float(ss.group(1)) <= 3 and float(ss.group(2)) <= 3
            except ValueError:
                # Malformed size such as "1.2.3": keep the tag untouched.
                return ss.group(0)
            return "" if tiny else ss.group(0)

        html2txt = re.sub(r'<img src=.*? width="([\d.]+)p[xt]" height="([\d.]+)p[xt]"\s*/?>', sub1, html2txt)

        # 2) drop the alt text carried by images
        html2txt = re.sub(r'(<img src=.*?) alt=".+?"', r"\1", html2txt)
        html2txt = re.sub(r'(<img src=(?!\sstyle=)+?(?<!\\)\")>', r"\1 />", html2txt)  # turn "> into " />

        # ===== normalise answer / analysis keywords =====
        html2txt = re.sub(r'【\s*(<img src=((?!/>).)+?/>\s*)*?([解答])\s*(<img src=((?!/>).)+?/>\s*)*?([析案])\s*'
                          r'(<img src=((?!/>).)+?/>\s*)*?】', r"【\3\6】", str(html2txt))
        html2txt = re.sub(r'<p>\s*(解\s*[::])', r"<p>【解答】", str(html2txt))
        html2txt = re.sub(r'【[^【】]*?(答案|[解分][析答]|详解|点[评睛])[^【】]*?】', r"【\1】", str(html2txt))
        html2txt = re.sub(r'(\n\s*|<p>\s*|\s{2,}|\n\s*\d{,2}\s*[、..、]\s*)(答案|解析|解答|详解|点评|点睛|考点|专题)\s*[::]', r"\1【\2】",
                          str(html2txt))
        html2txt = re.sub(r'(\n|^|<p>)\s*(([1-9]|[1-9][0-9])\s*[..、、])?\s*\[\s*(答案|解析|解答|详解|点评|点睛|考点|专题)\s*\]',
                          r"\1\2【\4】", str(html2txt))
        html2txt = re.sub(r'([A-D])\s*\[\s*(解析|解答|详解|点评|点睛|考点|专题)\s*\]', r"\1\n【\2】", str(html2txt))
        html2txt = re.sub(r'(\n|^|<p>)\s*(分析)\s*[::]', r"【\2】", str(html2txt))
        if "【解析】" not in html2txt and "【解答】" in html2txt and "【分析】" not in html2txt:
            html2txt = re.sub(r'【解答】', "【解析】", str(html2txt))

        # ===== other keywords =====
        html2txt = re.sub(r'<p>\s*(类型|知识点|考查角度|拔尖角度)[一二三四五六七八九十\d+][^p]*?</p>', "", str(html2txt))
        html2txt = re.sub(r'<p>\s*(选修[\d-]*?[::].{2,15})\s*</p>', r"<p>【章节】\1</p>", html2txt)
        html2txt = re.sub(r'<p>\s*([一二三四五六]\s*[、..、]?)?\s*(\[.{2}-*?选修[\d-]*?.*?\])\s*([((]\d+分[))])?\s*</p>',
                          r"<p>【章节】\2</p>", html2txt)
        html2txt = re.sub(r'<p>\s*(基础|中档|综合)题[^p题]*?</p>|<p>\s*【(考点|专题)】[^p]*?</p>', "", str(html2txt))
        html2txt = re.sub(r'<p>\s*(基础训练|提升训练|探究培优)</p>', "", str(html2txt))
        html2txt = re.sub(r'<p>注意事项[::]\s*</p>(\n+\s*<p>\s*\d\s*[、..、][^/]+?</p>){1,}', "", html2txt, flags=re.S)
        html2txt = re.sub(r'<p>注意事项[::]\s*\d\s*[、..、][^/]+?</p>(\n+\s*<p>\s*\d\s*[、..、][^/]+?</p>){1,}', "", html2txt,
                          flags=re.S)
        # NOTE(review): the two templates below used to carry `.replace(...)`
        # calls that acted on the template text, not on the matched text, and
        # therefore did nothing. They are dropped here (same behaviour);
        # spaces inside the brackets are still not stripped — confirm intent.
        html2txt = re.sub(r'[((]\s*([A-Z\dⅠⅡⅢⅣⅤ]+|IV)\s*[))]', r"(\1)", html2txt)
        html2txt = re.sub(r'[((](\s*\d\s*\d?\s*分?\s*)[))]', r"(\1)", html2txt)
        html2txt = re.sub(r'\[来源:.*?\]', "", html2txt)
        html2txt = re.sub('<p>欢迎访问.*?</p>', '', html2txt)
        # Site watermarks such as "ww w.gkstk.c om".
        html2txt = re.sub(r'w\s*w\s*w\..*?(\.\s*c\s*o\s*m|\.cn)+|(?<!["“=\'])http:.*?\.(com|cn|org)', "",
                          html2txt)
        html2txt = re.sub(r'<(table|tr) [a-z]+="\d+">', r'<\1>', html2txt)  # <td rowspan="2"> is kept
        html2txt = re.sub(r'<(table)( [a-z]+=".*?")+>', r'<\1>', html2txt)
        html2txt = re.sub(r'<p>\s*第\s*[二三四ⅡⅢⅣ]\s*(卷|部分)\s*([((].*?[))]|非?选择题.{,8})?\s*</p>', "<p>【非选择题】</p>", html2txt)
        # ===== possible question-type line after the "non-choice" marker =====
        html2txt = re.sub(r"<p>【非选择题】</p>((\s|\n|<p>|</p>)*\d{1,2}\s*[..、、].+?)", r"<p>二、解答题</p>\1", html2txt) \
            .replace("【非选择题】", "")

        # ===== options =====
        html2txt = re.sub(r'(<p>\s*([1-9]|[1-9][0-9])\s*[..、、].+?[((]\s*[))])\s*(A\s*[..、、][^/]*?</p>)',
                          r"\1</p>\n<p>\3", str(html2txt))

        # ===== question numbers =====
        html2txt = re.sub(r'([ED]\s*[、..、].*?((?<![::])\s+|</su[pb]>\s*))(([1-9]|[1-9][0-9])\s*[、..、])',
                          r"\1</p>\n<p>\3", html2txt)
        html2txt = re.sub(r'((</?p>|\n)\s*(<img src=.*?"\s*/?>\s*)?([1-9]|[1-9][0-9]))\s*'
                          r'([((]\s*(\d{1,2}[.\s\d]*?分|.{2,3}题?)\s*[))]|解析?\s*[::]|【解析】)', r"</p>\1、\5", html2txt)
        html2txt = re.sub(r"<p>\s*([1-9]|[1-9][0-9])\s*([((]20\d{2}\s*[\u4e00-\u9fa5、、]{2,9}[))])", r"<p>\1、\2",
                          html2txt)
        html2txt = re.sub(r"<p>\s*([1-9]|[1-9][0-9])\s*(【(解析?|答案?)】|(解析?|答案?)\s*[::]|\[(答案|解析)\])", r"<p>\1、\2",
                          html2txt)
        html2txt = re.sub(r"<p>\s*([1-9]|[1-9][0-9])\s*([((]\s*\d+\s*分?\s*[))])?(【(解析?|答案?)】|(解析?|答案?)\s*[::]"
                          r"|\[(答案|解析)\])", r"<p>\1、\2\3", html2txt)
        # an image glued to a question number
        html2txt = re.sub(r"<p>\s*(<img src=((?!/>).)+?/>)\s*([1-9]|[1-9][0-9])\s*"
                          r"([((]20\d{2}\s*[\u4e00-\u9fa5、、]{2,9}[))])", r"<p>\3、\1\4", html2txt)
        html2txt = re.sub(r'<p>((\s*<su[bp]>\s*)?<img src=.*? height="[\d.]+p[tx]"\s*/?>(\s*</su[bp]>)?\s*)'
                          r'(([1-9]|[1-9][0-9])\s*[、..、])', r"<p>\4\1", html2txt)
        html2txt = re.sub(r"(</p>|\n)\s*(<img src=((?!/>).)+?/>)\s*([1-9]|[1-9][0-9])\s*"
                          r"([((]20\d{2}\s*[\u4e00-\u9fa5、、]{2,9}[))])", r"<p>\2</p>" + "\n" + r"<p>\4、\5",
                          html2txt)
        html2txt = re.sub(r'(</p>|\n)((\s*<su[bp]>\s*)?<img src=.*? height="[\d.]+p[tx]"\s*/?>(\s*</su[bp]>)?\s*)'
                          r'(([1-9]|[1-9][0-9])\s*[、..、])', r"</p>\2</p>" + "\n" + r"\5", html2txt)
        html2txt = re.sub(r"(<p>((?!<p>).)+?(\s|[/\"]>))(([1-9]|[1-9][0-9])\s*[、..、].{,20}本[大小]?题\d+分)",
                          r"\1</p>" + "\n<p>" + r"\4", html2txt)
        # several images glued to a question number
        html2txt = re.sub(r"</?p>((\s*<su[bp]>\s*)?<img src=.*?/>(\s*</su[bp]>)?"
                          r"((\s*<su[bp]>\s*)?<img src=((?!/>).)+?/>(\s*</su[bp]>)?)*?\s*)\s*(([1-9]|[1-9][0-9])\s*[、..、])",
                          r"</p>\1</p>" + "\n<p>" + r"\8", html2txt, flags=re.S)
        # Applied twice on purpose, exactly as before: the second pass sees the output of the first.
        html2txt = re.sub(r'(<p>\s*[一二三四五六七八九十].*?题\s*\(.+?分.*?\))\s*(([1-9]|[1-9][0-9])\s*[、..、].*?)</p>',
                          r"\1</p>\n<p>\2</p>", html2txt)
        html2txt = re.sub(r'(<p>\s*[一二三四五六七八九十].*?题\s*\(.+?分.*?\))\s*(([1-9]|[1-9][0-9])\s*[、..、].*?)</p>',
                          r"\1</p>\n<p>\2</p>", html2txt)
        html2txt = re.sub(r'(<p>.*?[..]{6,}\s*\d+分)\s*(([1-9]|[1-9][0-9])\s*[、..、].*?)</p>', r"\1</p>\n<p>\2</p>",
                          html2txt)
        html2txt = re.sub(r'([1-9]|[1-9][0-9])\s*([((]\s*\d{1,2}[.\s\d]*?分\s*[))])\s*[、..、]', r"\1、\2",
                          html2txt)

        # 3) normalise the path separator in image sources
        html2txt = re.sub(r'( src=".*?files)\\image', r"\1/image", html2txt)

        # ======== html -> list ========
        html2txt = re.sub(r'(</?div>|</table>|</?body>)(\n\s*)*?<p>', r"\1</p>" + "\n<p>", html2txt, flags=re.S)
        # Tables may contain line breaks, so the text cannot simply be split
        # on "\n": each table is swapped for a "<t{k}b>" placeholder first and
        # restored after the stray-"<" pass below.
        subs2table = {}
        all_table = re.findall(r'<table>.*?</table>', html2txt, flags=re.S)
        for k, v in enumerate(all_table):
            html2txt = html2txt.replace(v, "<t{}b>".format(k))
            # remove line breaks / paragraph wrappers inside the table
            v = re.sub(r'<p>\s*(</?t[drh]( .*?")?>|</?table>|</?tbody>)\s*</p>', r"\1", v)
            v = re.sub(r'</td></p>[\n\s]*<p><td>', "</td><td>", v)
            v = re.sub(r'<td>(<p>|\s|</p>|\n)*</td>', "<td> </td>", v)
            v = re.sub(r'</tbody></?p></table>', "</tbody></table>", v)
            v = re.sub(r'(</?t[drh]( .*?")?>|</?table>|</?tbody>)(\s*<p>\s*</p>)[\s\n]*?(<br\s*/?>|\n)+', r"\1", v,
                       flags=re.S)
            v = re.sub(r'(</?t[drh]( .*?")?>|</?table>|</?tbody>)(<br\s*/?>|\n|</p>|\s)+', r"\1", v, flags=re.S)
            v = re.sub(r'(</t[drh]( .*?")?>|</table>|</tbody>)(<br\s*/?>|\n|<p>|\s)+', r"\1", v, flags=re.S)
            # Line breaks right at the start/end of the table are not removed yet.
            subs2table["<t{}b>".format(str(k))] = v

        # Handling of "<" characters that clash with markup.
        def sub2(ss):
            # NOTE(review): both branches return the same text, so this pass
            # currently changes nothing; presumably the "not a known tag"
            # branch was meant to escape the "<" — confirm before changing.
            if re.search(r'^(img|/?h[123456]|/?su[bp]>|t\d+b>|br\s*/?>'
                         r'|/?(p|span|font|article|ul|ol|div|table|t?body|html|head|t[drh])(\s*|\s+style=.*?")>'
                         r'|/?[a-z]+ style=.*?">)', ss.group(1)) is None:
                return "<{}".format(ss.group(1))
            else:
                return "<{}".format(ss.group(1))

        html2txt = re.sub("<([^<]{1,30})", sub2, html2txt)
        if subs2table:
            html2txt = re.sub(r"|".join(subs2table.keys()), lambda x: subs2table[x.group()], html2txt)

        # >>>>>> split the html into rows
        con_list = []
        for piece in re.split(r"\n+|</p>(?!</td>)|</h[12345]>", html2txt):
            if len(re.findall("<p>|<h[12345]>", piece)) > 1:
                con_list.extend(re.split('<p>|<h[12345]>', piece))
            else:
                con_list.append(piece)
        con_list = [re.sub(r"^\n*\s*(<p>|<h[12345]>)+", "", ii) for ii in con_list]
        # leftover empty tag pairs
        con_list = [re.sub(r"^<([a-z]+)>[\s\t\n]*</\1>$", "", i.strip()) for i in con_list]
        con_list = [re.sub(r"^(<table>|</td>|<td[^<>]*?>|</?tr>)+?(.|\n)+?([一二三四五六七八九十])\s*[、..、]\s*(.{2,4}题)(.|\n)+?</table>",
                           r"\3、\4", i.strip())
                    for i in con_list]
        # remove remaining </?p> tags, seal lines and candidate-info lines
        con_list = [re.sub(r"</?p>|[…O•.\s]*?密[…O•.\s]*?封[….O•\s]*?装?[…O•.\s]*?订?[….O•\s]*?线?[….O•\s]*?$"
                           r"|((学校|班级|姓名|座位号|准考号|[学考]号)[\s::_]*?){2,}$", "", i.strip()) for i in con_list]

        # ===== answer-title rows: split leading image placeholders from the title =====
        img_prefix = r"^((\s*<imgsrcw_h=[^/\"]*?(data-latex=.*?)?\s*[/\"]>\s*)+)"
        answer_title = (r'(\s*<imgsrcw_h=[^/\"]*?(data-latex=.*?)?\s*[/\"]>\s*)+?(参考|考试|试[题卷]|物理|理综|数学|化学|生物)(答案|解析|答案[及与和]评分(标准|意见|细则))\s*$'
                        r'|(\s*<imgsrcw_h=[^/\"]*?(data-latex=.*?)?\s*[/\"]>\s*)+?评分标准'
                        r'|(\s*<imgsrcw_h=[^/\"]*?(data-latex=.*?)?\s*[/\"]>\s*)+?(参考|考试|试[题卷])(答案|解析|答案[及与和]评分(标准|意见|细则))\s*(物理|理综|数学|化学|生物)?\s*$')
        title_noise = r"[上下]?学[年期]|[\d—【】..、、::(())年\s]|[中大]学|模拟|[中高]考|年级|[学期][末中]|[高初][一二三]"
        split_rows = []
        for v in con_list:
            if re.match(answer_title, re.sub(title_noise, "", v.strip())):
                # re.split yields [before, g1, g2, g3, rest]; [1::3] keeps g1 and rest.
                split_rows.extend(re.split(img_prefix, v.strip(), maxsplit=1)[1::3])
            else:
                split_rows.append(v)
        con_list = split_rows

        # ===== rows that are only a run of numbers ("2、3、4、5、") are tagged with 【fei】 =====
        con_list = [re.sub(r"^\s*([1-9][0-9]?\s*[..、、])", r"【fei】\1", i.strip())
                    if (len(re.findall(r"(^|\s*[..、、])\s*[1-9][0-9]?\s*[..、、]", i)) >= 3
                        and len(re.sub(r"[\d..、、\s]", "", i)) < 2) else i for i in con_list]

        # ===== trim useless rows at the head and tail =====
        if con_list and re.search(r"[\u4e00-\u9fa5]|<img ", con_list[0]) is None:
            con_list = con_list[1:]
        while con_list and re.search(r"声明[::].*?著作权属.*?所有|(邮箱|用户|日期|QQ)\s*[::].+?", con_list[-1]):
            con_list = con_list[:-1]
        return html2txt, con_list, self.new_html
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Manual smoke test: clean one sample HTML file and print the resulting rows.
    p1 = r"/home/cv/workspace/tujintao/document_segmentation/Data/samples/真实样例/6264fa25f84c0e279ac643ef.html"
    with open(p1, 'r', encoding="utf8") as f:
        t1 = f.read()
    # html_cleal() returns (cleaned_html, rows, new_html); the cleaned HTML is not needed here.
    _, row_list, new_html = HtmlWash_2(t1, '11111111', is_reparse=1, must_latex=1).html_cleal()
    row_list = [row for row in row_list if row.strip() != ""]
    pprint(row_list)
    print(len(row_list))
|
|
|
+
|