#!/usr/bin/env/python # -*- coding:utf-8 -*- # 本文件包含以下函数 # table_label_cleal:去掉表格中的换行符 # html_cleal :html文件清洗 # wash_after: 处理最终结果多余的换行符 import re import shutil # from operator import itemgetter # from itertools import groupby # from PIL import Image import base64, os, random import time import hashlib from pprint import pprint # from bs4 import BeautifulSoup # UPLOAD_FOLDER = config.UPLOAD_FOLDER import configs from structure.stems_to_groups import parse_split2group, suojin from utils.equation_extract import get_equation_instr, get_simpstr2eqn from utils.field_eq2latex import get_latex from utils.html_again_parse import css_label_wash from structure import stems_to_groups def table_label_cleal(con): """ 去掉表格中的【换行符】 """ # print(con) # print('------------------------------------------') con = re.sub(r"\n(\n|\t)+", "\n", con) count = 1 while re.search(r"?[a-z]+>\n(?[a-z]+>|
)", r"\1\2", con, flags=re.S) con = re.sub(r'(?t[rd]>)\n(
]*?>[\s\n\t]*? [\s\n\t]*? ' r'[\s\n\t]*? | [\s\n\t]*?
', "", con, flags=re.S) con = re.sub(r'(
)\s*([((]\s*\d\s*[))])', r'\1\n\2', con) return con def base642img(html_data, wordid): """ 【基于mathjax渲染输出是css-html格式】 将base64编码的图片保存到本地 :return: """ # 二进制图片进行转化, 按“word_id”建立文件夹 # time_str = datetime.datetime.strftime(datetime.datetime.now(), '%Y_%m_%d') # file_path = configs.IMG_FOLDER + '/' + str(self.wordid) # if not os.path.exists(file_path): # os.makedirs(file_path) # else: # 思路1:删除图片,重建文件夹,【所有的新图片都是以base64格式传过来的】 # shutil.rmtree(file_path) # os.makedirs(file_path) # 思路2:每一次再解析都将base64图片保存到本地再以路径形式返回 # st = len(os.listdir(file_path)) # 不要以序号索引的形式命名 # 统计所有base64编码 all_base64_image = re.findall(r'("]+?)"(.*?)\s*/?>)', str(html_data), flags=re.S) if all_base64_image: file_path = configs.IMG_FOLDER + '/' + str(wordid) if not os.path.exists(file_path): os.makedirs(file_path) # 新图片命名1、时间戳+随机数;2、md5值命名 # name_list = random.sample(range(100000, 999999), len(all_base64_image)) for n, img in enumerate(all_base64_image): img1 = img[2].split(",", maxsplit=1) img_type_info = re.search("data:image/(.+?);base64", img1[0]) img_type = img_type_info.group(1) if img_type_info else "" # 图片格式 # 可能还有alt和style的属性,暂时先不要 w_info = re.search('( width="\d+")', img[3]) h_info = re.search('( height="\d+")', img[3]) img_data = base64.b64decode(str(img1[-1])) if img_type: # save_path = os.path.join(configs.new_img_ip, get_md5(n)+"."+img_tape) pmd5 = hashlib.md5(img_data) # img_name = "new_image" + str(int(time.time())) + str(name_list[n]) + "." + img_type # 时间戳+随机数 img_name = "new_image" + pmd5.hexdigest() + "." 
+ img_type save_path = os.path.join(file_path, img_name) with open(save_path, 'wb') as f: f.write(img_data) flag_behind = '" />' if w_info and h_info: flag_behind = '"' + w_info.group(1) + h_info.group(1) + ' />' temp_img = '", "?b>", "?caption>", "?center>", "?cite>", "?code>", "?colgroup>", "?menu>", "?dd>", "?dir>", "?li>", "?article>", "?header>", "?ruby>", "?summary>", "?details>", "?strike>", "?small>", "?select>", "?section>", "?script>", "?[su]>", "?var>", "?ul>", "?tt>", "?title>", "?thead>", "?tfoot>", "
\s*)【例题(\d+)】", r"\1\2、", html2txt) html2txt = re.sub(r"\\\(|\\\)", "$", html2txt) # 把格式标签单独拆分为一行,再在题目切分为小题完了之后再组起来!!!!! html2txt = re.sub(r'(
)', r"\1
", html2txt)
# 域公式的转化处理;\可以在前端显示,不需要用latex渲染
try:
html2txt, newhml = get_latex(html2txt, self.is_reparse, self.wordid)
if newhml: # 存在域公式转图片时,需要将原文本的域公式也转为图片信息
self.new_html = newhml
html2txt = html2txt.replace("【omml-latex】", "")
except:
html2txt = html2txt.replace("【omml-latex】", "")
# 字符串公式的处理:如Fe2O3, 在结构化之后处理比较好
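# Handle line-break markup (presumably <br> tags; the tag text in this comment was lost): the substitution below maps it to "\n".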
html2txt = re.sub(r"
", "\n", html2txt)
# =====题型行的统一处理=====
# ---->>>>>题型行可能放在表格中
if len(re.findall("", html2txt)) >= 8: # 这个限制还不太严谨
for tt in re.finditer('(((?!(?tr>)).)*) ', html2txt, re.S):
tt_list = re.split(r'^\s*]*?>| |
', tt.group(1).strip()) #
" + " ".join(tt_list) + "
") # html2txt = re.sub(r"?tbody>|?table>|?div>", "", html2txt) # ---->>>>>end html2txt = re.sub(r"()\s*([一二三四五六七八九十]\s*[、..、::]?.{2,6}题)", r"\1\2", html2txt) html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,::]\s*(论述|填空|探究)题?[与和、、,,\s]*?(计算题|实验题)', r"\1、\3", html2txt) html2txt = re.sub(r'(([一二三四五六七八九十])\s*[、..、,,::]\s*(.{2,4}题)\s*
)', r"\1", str(html2txt), flags=re.S) html2txt = re.sub(r'
\s*([一二三四五六七八九十])\s*[、..、,,::]?\s*(计算|[解简]答|实验|作图)题?[与和、、,,\s]*?(计算|[解简]答|实验|作图)", r"
\1、\2题", html2txt) html2txt = re.sub(r'
\s*[((]\s*[一二三四五六]\s*[))]\s*必考题\s*(.?|.+?分\s*[.。.]?)\s*
', "", html2txt) html2txt = re.sub(r'\s*[((]\s*[一二三四五六]\s*[))]\s*选考题\s*.?\s*.{,4}(?', r"
【选做题】:'\1'
", html2txt) html2txt = re.sub(r'\s*[((]\s*[一二三四五六]\s*[))]\s*选考题\s*(.?|.+?分\s*[.。.]?)\s*
', "【选做题】
", html2txt) html2txt = re.sub(r'\s*([一二三四五六七八九十])\s*[、..、,,::]?\s*(单项?选择?|非?选择|多项?选择?|不定选择|填空|计算|[解简]答|实验|作图)题?\s*
', r"\1、\2题
", html2txt) html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,::]?\s*(单选|单项选择|选择|不定选择|多选|多项选择|填空|计算|[解简]答|实验|作图)\s*(?!题)' r'([((]\s*本题|.*?\d分)', r"\1" + "、" + r'\2' + "题" + r"\3", html2txt) html2txt = re.sub(r'([一二三四五六])\s*[、..、,,::]?\s*(单选|单项选择|非?选择|不定选择|多选|多项选择|填空|计算|[解简]答|实验|作图)题', r"\1" + "、" + r'\2' + "题", html2txt) html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,::]?\s*[((]\s*本大题(.*?选项中)', r"\1" + "、" + "选择题", html2txt) # + r"\2" html2txt = re.sub(r'\s*([一二三四五六七八九十])\s*[、..、,,]?\s*[((本大题]*?(.*?选项中)', r"\1" + "、" + "选择题", html2txt) if self.subject != "语文": html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,::]?\s*([((]\s*(每小题|本大?题)((?!(选项)).)+?[))]|综合题)', r"\1" + "、" + "解答题", html2txt) else: html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,::]?\s*([((]\s*(每小题|本大?题)((?!(选项)).)+?[))]|综合题)', r"\1" + "、" + "综合题", html2txt) html2txt = re.sub(r'(?)\s*([一二三四五六七八九十]\s*[、..、,,::]?\s*(单项?选择?|选择|不定选择|多项?选择?|填空|计算|[解简]答|实验|作图)题)', r'
\n\1', html2txt) # html2txt = re.sub(r'
\s*第[Ⅰ1I]卷\s*
\n?\s*本卷共\d{1,2}个?小?题.*?四个选项.*?\n?
\s*(\d\s*[、..、])' # r'|
第[Ⅰ1I]卷\s*[((]选择题[))]
\n?本卷共\d{1,2}个?小?题.*?\n?
\s*(\d\s*[、..、])', # r"
一、选择题
\n\1\2", html2txt) # html2txt = re.sub(r'
\s*[^一二三四五六七八九十]{,3}\s*[、..、]\s*(选择|不定选择|单选|多选|计算|[解简]答|实验|作图)题', r"
一、\1题", html2txt) # =====答案解析关键字的统一处理===== html2txt = re.sub(r'【\s*().)+?/>\s*)*?([解答])\s*().)+?/>\s*)*?([析案])\s*' r'().)+?/>\s*)*?】', r"【\3\6】", str(html2txt)) # 2022/4/28 html2txt = re.sub(r'
\s*(解\s*[::])', r"
【解答】", str(html2txt)) html2txt = re.sub(r'(
|
|\n)\s*(参考译文\s*[::])', r"\1【参考译文】", str(html2txt))
html2txt = re.sub(r'【[^【】]*?(答案|[解分][析答]|详解|点[评睛])[^【】]*?】', r"【\1】", str(html2txt))
html2txt = re.sub(r'(\n\s*|\s{2,})(答案|解析|解答|详解|点评|点睛|考点|专题)\s*[::]', r"\1【\2】", str(html2txt))
html2txt = re.sub(r'(\n|^|
)\s*(([1-9]|[1-4][0-9])\s*[..、、])?\s*\[\s*(答案|解析|解答|详解|点评|点睛|考点|专题)\s*\]', r"\1\2【\4】", str(html2txt)) html2txt = re.sub(r'(\n|^)\s*(分析)\s*[::]', r"【\2】", str(html2txt)) if "【解析】" not in html2txt and "【解答】" in html2txt and "【分析】" not in html2txt: html2txt = re.sub(r'【解答】', "【解析】", str(html2txt)) # =====其他关键字的处理===== html2txt = re.sub(r'
\s*(类型|知识点|考查角度|拔尖角度)[一二三四五六七八九十\d+][^p]*?
', "", str(html2txt)) html2txt = re.sub(r'\s*(选修[\d-]*?[::].{2,15})\s*
', r"【章节】\1
", html2txt) html2txt = re.sub(r'\s*([一二三四五六]\s*[、..、]?)?\s*(\[.{2}-*?选修[\d-]*?.*?\])\s*([((]\d+分[))])?\s*
', r"【章节】\2
", html2txt) html2txt = re.sub(r'\s*(基础|中档|综合)题[^p题]*?
|\s*【(考点|专题)】[^p]*?
', "", str(html2txt)) html2txt = re.sub(r'\s*(基础训练|提升训练|探究培优)
', "", str(html2txt)) html2txt = re.sub(r'注意事项[::]\s*
(\n+\s*\s*\d\s*[、..、][^/]+?
)+', "", html2txt, flags=re.S) html2txt = re.sub(r'注意事项[::]\s*\d\s*[、..、][^/]+?
(\n+\s*\s*\d\s*[、..、][^/]+?
)+', "", html2txt, flags=re.S) html2txt = re.sub(r'[((]\s*([A-Z\dⅠⅡⅢⅣⅤ]+|IV)\s*[))]', r"(\1)".replace(" ", "").replace("(IV)", "Ⅳ"), html2txt) html2txt = re.sub(r'[((](\s*\d\s*\d?\s*分?\s*)[))]', "(" + r'\1'.replace(" ", "") + ")", html2txt) html2txt = re.sub(r'\[来源:.*?\]', "", html2txt) html2txt = re.sub('欢迎访问.*?
', '', html2txt) html2txt = re.sub('w\s*w\s*w\..*?(\.\s*c\s*o\s*m|\.cn)+|(?', r'<\1>', html2txt) html2txt = re.sub(r'', r"\1
" + "\n", html2txt, flags=re.S) # >>>>>>