#!/usr/bin/env/python # -*- coding:utf-8 -*- # 本文件包含以下函数 # table_label_cleal:去掉表格中的换行符 # html_cleal :html文件清洗 # wash_after: 处理最终结果多余的换行符 import datetime import re import shutil # from operator import itemgetter # from itertools import groupby # from PIL import Image import base64, os, random import time import hashlib from pprint import pprint # from bs4 import BeautifulSoup import Utils.train_configs as train_configs from Utils.field_eq2latex import get_latex from Utils.html_again_parse import css_label_wash # from structure.structure_main import WordParseStructure def table_label_cleal(con): """ 去掉表格中的【换行符】 """ # print(con) # print('------------------------------------------') con = re.sub(r"\n(\s|\n|\t)+", "\n", con) count = 1 while re.search(r"?[a-z]+>\n(?[a-z]+>|
)", r"\1\2", con, flags=re.S) con = re.sub(r'(?t[rd]>)\n(
]*?>[\s\n\t]*? [\s\n\t]*? ' r'[\s\n\t]*? | [\s\n\t]*?
', "", con, flags=re.S) con = re.sub(r'(
)\s*([((]\s*\d\s*[))])', r'\1\n\2', con) return con def base642img(html_data, wordid): """ 【基于mathjax渲染输出是css-html格式】 将base64编码的图片保存到本地 :return: """ # 二进制图片进行转化, 按“word_id”建立文件夹 # time_str = datetime.datetime.strftime(datetime.datetime.now(), '%Y_%m_%d') # file_path = configs.IMG_FOLDER + '/' + str(self.wordid) # if not os.path.exists(file_path): # os.makedirs(file_path) # else: # 思路1:删除图片,重建文件夹,【所有的新图片都是以base64格式传过来的】 # shutil.rmtree(file_path) # os.makedirs(file_path) # 思路2:每一次再解析都将base64图片保存到本地再以路径形式返回 # st = len(os.listdir(file_path)) # 不要以序号索引的形式命名 # 统计所有base64编码 all_base64_image = re.findall(r'("]+?)"(.*?)\s*/?>)', str(html_data), flags=re.S) if all_base64_image: file_path = train_configs.IMG_FOLDER + '/' + str(wordid) if not os.path.exists(file_path): os.makedirs(file_path) # 新图片命名 name_list = random.sample(range(100000, 999999), len(all_base64_image)) for n, img in enumerate(all_base64_image): img1 = img[2].split(",", maxsplit=1) img_type_info = re.search("data:image/(.+?);base64", img1[0]) img_type = img_type_info.group(1) if img_type_info else "" # 可能还有alt和style的属性,暂时先不要 w_info = re.search('( width="\d+")', img[3]) h_info = re.search('( height="\d+")', img[3]) img_data = base64.b64decode(str(img1[-1])) if img_type: # save_path = os.path.join(configs.new_img_ip, get_md5(n)+"."+img_tape) img_name = "new_image" + str(int(time.time())) + str(name_list[n]) + "." 
+ img_type save_path = os.path.join(file_path, img_name) with open(save_path, 'wb') as f: f.write(img_data) # self.localnewpic_list.append(save_path) # put_key = "/zyk/uploadfiles/wording/" + str(self.wordid) + "/{}".format(img_name) # self.put_key_list.append(save_path) flag_behind = '" />' if w_info and h_info: flag_behind = '"' + w_info.group(1) + h_info.group(1) + ' />' temp_img = '' html_data = html_data.replace(img[0], temp_img) return html_data class HtmlWash(): def __init__(self, html, wordid, is_reparse=0, img_url="", must_latex=0): """ html文本清洗 批量再解析中,新增图片信息替换的文本返回作为ocr保存文本, 继续往下清洗的文本,则进入结构化解析逻辑中 """ # super().__init__(html, wordid, is_reparse, must_latex) self.html = html self.img_url = img_url self.wordid = wordid self.is_reparse = is_reparse self.must_latex = must_latex # self.put_key_list = [] # self.localnewpic_list =[] self.sub_list = ["?div>", "?b>", "?caption>", "?center>", "?cite>", "?code>", "?colgroup>", "?menu>", "?dd>", "?dir>", "?li>", "?em>", "?article>", "?header>", "?ruby>", "?summary>", "?details>", "?strong>", "?strike>", "?small>", "?select>", "?section>", "?script>", "?[su]>", "?var>", "?ul>", "?tt>", "?title>", "?thead>", "?tfoot>", "
\s*)【例题(\d+)】", r"\1\2、", html2txt)
# Replace the LaTeX inline-math delimiters \( and \) with a plain "$".
html2txt = re.sub(r"\\\(|\\\)", "$", html2txt)
# Field-formula conversion; "\" can be displayed by the front end and does not need LaTeX rendering.
# NOTE(review): the original comment appears to have lost some text before the "\" - confirm against the upstream file.
try:
# get_latex is called with the text, the reparse flag, the word id and the must_latex flag;
# its result is unpacked into the converted text and a second value (newhml).
html2txt, newhml = get_latex(html2txt, self.is_reparse, self.wordid, self.must_latex)
if newhml: # When field formulas were converted to images, the original text's field formulas must be converted to image info as well.
self.new_html = newhml
html2txt = html2txt.replace("【omml-latex】", "")
except:
# NOTE(review): a bare except also catches KeyboardInterrupt/SystemExit and hides the failure reason;
# consider `except Exception:` plus logging. The fallback only strips the "【omml-latex】" marker.
html2txt = html2txt.replace("【omml-latex】", "")
# String-form formulas (e.g. Fe2O3) are better handled after structuring.
#
处理
html2txt = re.sub(r"
", "\n", html2txt)
html2txt = re.sub(r"[((]\s*(\d)\s*\$分\s*[))]", r"$(\1分)", html2txt)
# =====题型行的统一处理=====
# ---->>>>>题型行可能放在表格中
if len(re.findall("", html2txt)) >= 8: # 这个限制还不太严谨
for tt in re.finditer('(((?!(?tr>)).)*) ', html2txt, re.S):
tt_list = re.split(r'^\s*]*?>| |
', tt.group(1).strip()) #
" + " ".join(tt_list) + "
") # html2txt = re.sub(r"?tbody>|?table>|?div>", "", html2txt) # ---->>>>>end html2txt = re.sub(r"()\s*([一二三四五六七八九十]\s*[、..、::]?.{2,6}题)", r"\1\2", html2txt) html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,::]\s*(论述|填空|探究)题?[与和、、,,\s]*?(计算题|实验题)', r"\1、\3", html2txt) html2txt = re.sub(r'(([一二三四五六七八九十])\s*[、..、,,::]\s*(.{2,4}题)\s*
)', r"\1", str(html2txt), flags=re.S) html2txt = re.sub(r"
\s*([一二三四五六七八九十])\s*[、..、,,::]?\s*(计算|[解简]答|实验|作图)题?[与和、、,,\s]*?(计算|[解简]答|实验|作图)", r"
\1、\2题", html2txt) html2txt = re.sub(r'
\s*[((]\s*[一二三四五六]\s*[))]\s*必考题\s*(.?|.+?分\s*[.。.]?)\s*
', "", html2txt) html2txt = re.sub(r'\s*[((]\s*[一二三四五六]\s*[))]\s*选考题\s*.?\s*.{,4}(?', r"
【选做题】:'\1'
", html2txt) html2txt = re.sub(r'\s*[((]\s*[一二三四五六]\s*[))]\s*选考题\s*(.?|.+?分\s*[.。.]?)\s*
', "【选做题】
", html2txt) html2txt = re.sub(r'\s*([一二三四五六七八九十])\s*[、..、,,::]?\s*(单项?选择?|非?选择|多项?选择?|不定选择|填空|计算|[解简]答|实验|作图)题?\s*
', r"\1、\2题
", html2txt) html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,::]?\s*(单选|单项选择|选择|不定选择|多选|多项选择|填空|计算|[解简]答|实验|作图)\s*(?!题)' r'([((]\s*本题|.*?\d分)', r"\1" + "、" + r'\2' + "题" + r"\3", html2txt) html2txt = re.sub(r'([一二三四五六])\s*[、..、,,::]?\s*(单选|单项选择|非?选择|不定选择|多选|多项选择|填空|计算|[解简]答|实验|作图)题', r"\1" + "、" + r'\2' + "题", html2txt) # html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,::]?\s*[((]\s*本大题(.*?选项中)', r"\1" + "、" + "选择题", html2txt) # + r"\2" # html2txt = re.sub(r'\s*([一二三四五六七八九十])\s*[、..、,,]?\s*[((本大题]*?(.*?选项中)', r"\1" + "、" + "选择题", html2txt) html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,::]?\s*([((]\s*(每小题|本大?题)((?!(选项)).)+?[))]|综合题)', r"\1" + "、" + "解答题", html2txt) html2txt = re.sub(r'(?)\s*([一二三四五六七八九十]\s*[、..、,,::]?\s*(单项?选择?|选择|不定选择|多项?选择?|填空|计算|[解简]答|实验|作图)题)', r'
\n\1', html2txt) html2txt = re.sub(r'
\s*([一二三四五六七八九十])\s*[、..、,,::]?\s*[((]?本?大?题((?!(选项)).)+?[))]?\s*
', r"\1、本大题
", html2txt) # html2txt = re.sub(r'\s*[^一二三四五六七八九十]{,3}\s*[、..、]\s*(选择|不定选择|单选|多选|计算|[解简]答|实验|作图)题', r"
一、\1题", html2txt) # =====图片的处理===== # 1>>根据图片宽高的异常值判断删除隐藏图片 def sub1(ss): if float(ss.group(1)) <= 3 and float(ss.group(2)) <= 3: return "" else: return ss.group(0) html2txt = re.sub(r'', sub1, html2txt) # 2>>将图片中带有的汉字去掉 html2txt = re.sub(r'(', r"\1 />", html2txt) # 将">换为" /> html2txt = re.sub(r'(', r"\1 />", html2txt) # 将">换为" /> # =====答案解析关键字的统一处理===== html2txt = re.sub(r'【\s*().)+?/>\s*)*?([解答])\s*().)+?/>\s*)*?([析案])\s*' r'().)+?/>\s*)*?】', r"【\3\6】", str(html2txt)) # 2022/4/28 html2txt = re.sub(r'
\s*(解\s*[::])', r"
【解答】", str(html2txt)) html2txt = re.sub(r'【[^【】]*?(答案|[解分][析答]|详解|点[评睛])[^【】]*?】', r"【\1】", str(html2txt)) html2txt = re.sub(r'(\n\s*|
\s*|\s{2,}|\n\s*\d{,2}\s*[、..、]\s*)(答案|解析|解答|详解|点评|点睛|考点|专题)\s*[::]', r"\1【\2】", str(html2txt)) html2txt = re.sub(r'(\n|^|
)\s*(([1-9]|[1-9][0-9])\s*[..、、])?\s*\[\s*(答案|解析|解答|详解|点评|点睛|考点|专题)\s*\]', r"\1\2【\4】", str(html2txt)) html2txt = re.sub(r'([A-D])\s*\[\s*(解析|解答|详解|点评|点睛|考点|专题)\s*\]', r"\1\n【\2】", str(html2txt)) html2txt = re.sub(r'(\n|^|
)\s*(分析)\s*[::]', r"【\2】", str(html2txt)) if "【解析】" not in html2txt and "【解答】" in html2txt and "【分析】" not in html2txt: html2txt = re.sub(r'【解答】', "【解析】", str(html2txt)) # =====其他关键字的处理===== html2txt = re.sub(r'
\s*(类型|知识点|考查角度|拔尖角度)[一二三四五六七八九十\d+][^p]*?
', "", str(html2txt)) html2txt = re.sub(r'\s*(选修[\d-]*?[::].{2,15})\s*
', r"【章节】\1
", html2txt) html2txt = re.sub(r'\s*([一二三四五六]\s*[、..、]?)?\s*(\[.{2}-*?选修[\d-]*?.*?\])\s*([((]\d+分[))])?\s*
', r"【章节】\2
", html2txt) html2txt = re.sub(r'\s*(基础|中档|综合)题[^p题]*?
|\s*【(考点|专题)】[^p]*?
', "", str(html2txt)) html2txt = re.sub(r'\s*(基础训练|提升训练|探究培优)
', "", str(html2txt)) html2txt = re.sub(r'注意事项[::]\s*
(\n+\s*\s*\d\s*[、..、][^/]+?
){1,}', "", html2txt, flags=re.S) html2txt = re.sub(r'注意事项[::]\s*\d\s*[、..、][^/]+?
(\n+\s*\s*\d\s*[、..、][^/]+?
){1,}', "", html2txt, flags=re.S) html2txt = re.sub(r'[((]\s*([A-Z\dⅠⅡⅢⅣⅤ]+|IV)\s*[))]', r"(\1)".replace(" ", "").replace("(IV)", "Ⅳ"), html2txt) html2txt = re.sub(r'[((](\s*\d\s*\d?\s*分?\s*)[))]', "(" + r'\1'.replace(" ", "") + ")", html2txt) html2txt = re.sub(r'\[来源:.*?\]', "", html2txt) html2txt = re.sub('欢迎访问.*?
', '', html2txt) html2txt = re.sub('w\s*w\s*w\..*?(\.\s*c\s*o\s*m|\.cn)+|(?', r'<\1>', html2txt) #\s*第\s*[二三四ⅡⅢⅣ]\s*(卷|部分)\s*([((].*?[))]|非?选择题.{,8})?\s*
', "【非选择题】
", html2txt) # == == =对可能的题型行的处理 == == html2txt = re.sub("【非选择题】
((\s|\n||
)*\d{1,2}\s*[..、、].+?)", r"二、解答题
\1", html2txt)\ .replace("【非选择题】", "") # =====选项的处理===== html2txt = re.sub(r'(\s*([1-9]|[1-9][0-9])\s*[..、、].+?[((]\s*[))])\s*(A\s*[..、、][^/]*?
)', r"\1\n\3", str(html2txt)) # =====题号的处理===== html2txt = re.sub(r'([ED]\s*[、..、].*?(\s|\s*))(([1-9]|[1-9][0-9])\s*[、..、])', r"\1
\n\3", html2txt) html2txt = re.sub(r'((?p>|\n)\s*(\s*)?([1-9]|[1-9][0-9]))\s*' r'([((]\s*(\d{1,2}[.\s\d]*?分|.{2,3}题?)\s*[))]|解析?\s*[::]|【解析】)', r"
\1、\5", html2txt) html2txt = re.sub(r"\s*([1-9]|[1-9][0-9])\s*([((]20\d{2}\s*[\u4e00-\u9fa5、、]{2,9}[))])", r"
\1、\2", html2txt) html2txt = re.sub(r"
\s*([1-9]|[1-9][0-9])\s*(【(解析?|答案?)】|(解析?|答案?)\s*[::]|\[(答案|解析)\])", r"
\1、\2", html2txt) html2txt = re.sub(r"
\s*([1-9]|[1-9][0-9])\s*([((]\s*\d+\s*分?\s*[))])?(【(解析?|答案?)】|(解析?|答案?)\s*[::]" r"|\[(答案|解析)\])", r"
\1、\2\3", html2txt) # 图片和题号相连情况 html2txt = re.sub(r"
\s*().)+?/>)\s*([1-9]|[1-9][0-9])\s*" r"([((]20\d{2}\s*[\u4e00-\u9fa5、、]{2,9}[))])", r"
\3、\1\4", html2txt) # 2024.5.6 html2txt = re.sub(r'
((\s*
\4\1", html2txt) # 2024.5.6 html2txt = re.sub(r"(
|\n)\s*().)+?/>)\s*([1-9]|[1-9][0-9])\s*" r"([((]20\d{2}\s*[\u4e00-\u9fa5、、]{2,9}[))])", r"\2
" + "\n" + r"\4、\5", html2txt) # 【susp_img】 html2txt = re.sub(r'(
|\n)((\s*((?!
).)+?(\s|[/\"]>))(([1-9]|[1-9][0-9])\s*[、..、].{,20}本[大小]?题\d+分)", r"\1
" + "\n" + r"\4", html2txt)
# 多张图片和题号相连情况
html2txt = re.sub(r"?p>((\s*
" + r"\8", html2txt, flags=re.S) html2txt = re.sub(r'(
\s*[一二三四五六七八九十].*?题\s*\(.+?分.*?\))\s*(([1-9]|[1-9][0-9])\s*[、..、].*?)
', r"\1\n\2
", html2txt) html2txt = re.sub(r'(\s*[一二三四五六七八九十].*?题\s*\(.+?分.*?\))\s*(([1-9]|[1-9][0-9])\s*[、..、].*?)
', r"\1\n\2
", html2txt) html2txt = re.sub(r'(.*?[..]{6,}\s*\d+分)\s*(([1-9]|[1-9][0-9])\s*[、..、].*?)
', r"\1\n\2
", html2txt) html2txt = re.sub(r'([1-9]|[1-9][0-9])\s*([((]\s*\d{1,2}[.\s\d]*?分\s*[))])\s*[、..、]', r"\1" + "、" + r"\2", html2txt) # 3>>建立图片id字典,对原图片信息第一次替换 html2txt = re.sub(r'( src=".*?files)\\image', r"\1/image", html2txt) # all_image = re.findall(r'', html2txt) # src2subs = {} # subs2src = {} # for src in all_image: # # 校本题库上传的图片名称是随机数,故设置映射 # # kk = re.search('( 20: # mathpix = "" # w_h_info = re.search(r' 10: # image_id = image_id[-10:] # src2subs[src] = '', r"\1
" + "\n", html2txt, flags=re.S) # >>>>>>
\s*(?t[drh]( .*?")?>|?table>|?tbody>)\s*
', r"\1", v) v = re.sub(r'[\n\s]*|\s|
|\n)*\s*
)[\s\n]*?(|\s)+', r"\1", v, flags=re.S)
# 暂时还有table标签首尾的换行没去掉
subs2table[" | |
|