#!/usr/bin/env/python # -*- coding:utf-8 -*- # 本文件包含以下函数 # table_label_cleal：去掉表格中的换行符 # html_cleal ：html文件清洗 # wash_after: 处理最终结果多余的换行符 import datetime import re import shutil # from operator import itemgetter # from itertools import groupby # from PIL import Image import base64, os, random import time import requests import hashlib from pprint import pprint # from bs4 import BeautifulSoup # UPLOAD_FOLDER = config.UPLOAD_FOLDER import configs from utils.equation_extract import get_equation_instr, get_simpstr2eqn from utils.field_eq2latex import get_latex from utils.html_again_parse import css_label_wash # from structure.structure_main import WordParseStructure logger = configs.myLog(__name__, log_cate="ruku_log").getlog() def table_label_cleal(con): """ 去掉表格中的【换行符】 """ # print(con) # print('------------------------------------------') con = re.sub(r"\n(\s|\n|\t)+", "\n", con) count = 1 while re.search(r"\n(|)", con, re.S) and count <= 10: con = re.sub("(|||)\n(||||

)", r"\1\2", con, flags=re.S) con = re.sub(r'()\n()', r'\1\2', con, flags=re.S) count += 1 # if re.search(r"(.|\n)+?
", con, re.S|re.M): # aa = re.search(r"((.|\n)+?
)", con, re.S|re.M) # con = con.replace(aa.group(1),aa.group(1).replace("\n","")) # 将空表格的情况去掉 con = re.sub(r'[\s\n\t]*?[\s\n\t]*?([\s\n\t]*?]*?>[\s\n\t]*?
[\s\n\t]*?
' r'[\s\n\t]*?[\s\n\t]*?[\s\n\t]*?)+[\s\n\t]*?
[\s\n\t]*?

', "", con, flags=re.S) con = re.sub(r'(

)\s*([(（]\s*\d\s*[)）])', r'\1\n\2', con) return con def base642img(html_data, wordid): """ 【基于mathjax渲染输出是css-html格式】将base64编码的图片保存到本地 :return: """ # 二进制图片进行转化，按“word_id”建立文件夹 # time_str = datetime.datetime.strftime(datetime.datetime.now(), '%Y_%m_%d') # file_path = configs.IMG_FOLDER + '/' + str(self.wordid) # if not os.path.exists(file_path): # os.makedirs(file_path) # else: # 思路1：删除图片,重建文件夹，【所有的新图片都是以base64格式传过来的】 # shutil.rmtree(file_path) # os.makedirs(file_path) # 思路2：每一次再解析都将base64图片保存到本地再以路径形式返回 # st = len(os.listdir(file_path)) # 不要以序号索引的形式命名 # 统计所有base64编码 all_base64_image = re.findall(r'()', str(html_data), flags=re.S) if all_base64_image: file_path = configs.IMG_FOLDER + '/' + str(wordid) if not os.path.exists(file_path): os.makedirs(file_path) # 新图片命名 name_list = random.sample(range(100000, 999999), len(all_base64_image)) for n, img in enumerate(all_base64_image): img1 = img[2].split(",", maxsplit=1) img_type_info = re.search("data:image/(.+?);base64", img1[0]) img_type = img_type_info.group(1) if img_type_info else "" # 可能还有alt和style的属性，暂时先不要 w_info = re.search('( width="\d+")', img[3]) h_info = re.search('( height="\d+")', img[3]) img_data = base64.b64decode(str(img1[-1])) if img_type: # save_path = os.path.join(configs.new_img_ip, get_md5(n)+"."+img_tape) img_name = "new_image" + str(int(time.time())) + str(name_list[n]) + "." + img_type save_path = os.path.join(file_path, img_name) with open(save_path, 'wb') as f: f.write(img_data) # self.localnewpic_list.append(save_path) # put_key = "/zyk/uploadfiles/wording/" + str(self.wordid) + "/{}".format(img_name) # self.put_key_list.append(save_path) flag_behind = '" />' if w_info and h_info: flag_behind = '"' + w_info.group(1) + h_info.group(1) + ' />' temp_img = '' html_data = html_data.replace(img[0], temp_img) return html_data class HtmlWash(): def __init__(self, html, wordid, is_reparse=0, img_url="", must_latex=0): """ html文本清洗批量再解析中，新增图片信息替换的文本返回作为ocr保存文本，继续往下清洗的文本，则进入结构化解析逻辑中 """ # super().__init__(html, wordid, is_reparse, must_latex) self.html = html self.img_url = img_url self.wordid = wordid self.is_reparse = is_reparse self.must_latex = must_latex # self.put_key_list = [] # self.localnewpic_list =[] self.sub_list = ["", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "

", "

", ""] self.sub_dd = {'×': '×', '÷': '÷', '°': '°', '·': '·', '±': '±', 'º': 'º', '¹': '¹', '²': '²', '³': '³', '½': '1/2', '¼': '¼', '¾': '¾', '¥': '¥', 'm³': 'm³', # '<': '<', '£': '£', # '∠<': '<', '>': '>', "Ａ": "A", "А": "A", "Α": "A", "Ｂ": "B", "В": "B", "в": "B", "Β": "B", "Ｃ": "C", "С": "C", "ｃ": "c", "с": "c", "Ｄ": "D", "Ε": "E", "Ｅ": "E", "Ｆ": "F", "Ｇ": "G", "ｇ": "g", "ｍ": "m", "Ｎ": "N", "ｓ": "s", "ｔ": "t", "／": "/", "＝": "=", "－": "-", "２": "2", "３": "3", "４":"4", "５":"5", "６":"6", "７": "7", "８": "8", "９":"9", "１":"1", "０":"0", ' ': ' ', ' ': ' ', "〖": '【', "〗": '】', "題": '题', "单项选择": '单选', "多项选择": '多选', # "不定项选择": '选择', "双项选择": '多选', "实验与探究题": '实验', "原理综合题": '原理题', } def new_pic_sub(self): """ 针对base64图片先保存到本地，入库时再换成腾讯云线上地址 # 第一版：再解析中，将二进制图片进行转化,图片怎么保存比较好，先再“天数”建立文件夹第一版：再解析中，根据“word_id”建立文件夹 :return: """ if self.is_reparse: # css 标签清洗 self.html = css_label_wash(self.html) # 保存base64编码的图片 self.html = base642img(self.html, self.wordid) self.new_html = self.html def html_cleal(self): # =======清洗mathjax标签======== if "MathJax" in self.html: # 再解析中存在mathjax公式渲染的标签 all_mathjax = re.findall('(

)*)', self.html) for jax in all_mathjax: latex = re.findall('()*', jax[0]) if latex: latex = "${}$".format(latex[0][0]) self.html = self.html.replace(jax[0], latex) else: self.html = self.html.replace(jax[0], "") # ======再解析中的新图片处理===== self.new_pic_sub() # =====特殊符号处理===== html2txt = re.sub(r"|".join(self.sub_list), "", str(self.html)) # ("", " ") #2020/4/7 html2txt = re.sub("|".join(self.sub_dd.keys()), lambda x: self.sub_dd[x.group()], html2txt) # 2020/4/1,4/7,4/20 html2txt = re.sub("[不非]定[向项]选择", "不定选择", html2txt) html2txt = html2txt.replace(r"\\[{\\text{V}}V\]", "Ⓥ").replace(r"\\[{\\text{A}}A\]", "Ⓐ") \ .replace(r"\\[{\\text{W}}W\]", "Ⓦ").replace(r"\\[{\\text{X}}X\]", "Ⓧ").replace(r"\\[{\\text{G}}G\]", "Ⓖ") \ .replace("\uf067", "γ").replace('', "γ").replace('\uf020', "").replace("\u3000", " ")\ .replace("\u2003", " ").replace("\x7f", " ").replace("\xa0", "") html2txt = re.sub(r"(

\s*)【例题(\d+)】", r"\1\2、", html2txt) html2txt = re.sub(r"\\$|\\$", "$", html2txt) # 域公式的转化处理；_{\^{可以在前端显示，不需要用latex渲染
try:
html2txt, newhml = get_latex(html2txt, self.is_reparse, self.wordid, self.must_latex)
if newhml: # 存在域公式转图片时，需要将原文本的域公式也转为图片信息
self.new_html = newhml
html2txt = html2txt.replace("【omml-latex】", "")
except:
html2txt = html2txt.replace("【omml-latex】", "")

# 字符串公式的处理：如Fe₂O₃, 在结构化之后处理比较好
#
处理
html2txt = re.sub(r"", "\n", html2txt)
html2txt = re.sub(r"[（(]\s*(\d)\s*\$分\s*[)）]", r"$(\1分)", html2txt)

# =====题型行的统一处理=====
# ---->>>>>题型行可能放在表格中
if len(re.findall("", html2txt)) >= 8: # 这个限制还不太严谨
for tt in re.finditer('(((?!()).)*)', html2txt, re.S):
tt_list = re.split(r'^\s*]*?>|}}

|[\n\s]*?]*?>' r'|\s*\n|\s*$|\n\s*]*?>|]*?>

', tt.group(1).strip()) # \s*[$\n]这样无效 tt_list = [col for col in tt_list if col.strip()] if " ".join(tt_list).replace(" ", "") in ['得分评卷人', '评卷人得分']: html2txt = html2txt.replace(tt.group(0), "") else: pass # html2txt = html2txt.replace(tt.group(0), "

" + " ".join(tt_list) + "

") # html2txt = re.sub(r"||", "", html2txt) # ---->>>>>end html2txt = re.sub(r"()\s*([一二三四五六七八九十]\s*[、.．､：:]?.{2,6}题)", r"\1

\2", html2txt) html2txt = re.sub(r'([一二三四五六七八九十])\s*[、.．､，,：:]\s*(论述|填空|探究)题?[与和､、，,\s]*?(计算题|实验题)', r"\1､\3", html2txt) html2txt = re.sub(r']*?>

(([一二三四五六七八九十])\s*[、.．､，,：:]\s*(.{2,4}题)\s*

)[^p]*?

', r"\1", str(html2txt), flags=re.S) html2txt = re.sub(r"

\s*([一二三四五六七八九十])\s*[、.．､,，：:]?\s*(计算|[解简]答|实验|作图)题?[与和、､，,\s]*?(计算|[解简]答|实验|作图)", r"

\1､\2题", html2txt) html2txt = re.sub(r'

\s*[(（]\s*[一二三四五六]\s*[)）]\s*必考题\s*(.?|.+?分\s*[.。．]?)\s*

', "", html2txt) html2txt = re.sub(r'

\s*[(（]\s*[一二三四五六]\s*[)）]\s*选考题\s*.?\s*.{,4}(?', r"

【选做题】:'\1'

", html2txt) html2txt = re.sub(r'

\s*[(（]\s*[一二三四五六]\s*[)）]\s*选考题\s*(.?|.+?分\s*[.。．]?)\s*

', "

【选做题】

", html2txt) html2txt = re.sub(r'

\s*([一二三四五六七八九十])\s*[、.．､，,：:]?\s*(单项?选择?|非?选择|多项?选择?|不定选择|填空|计算|[解简]答|实验|作图)题?\s*

', r"

\1､\2题

", html2txt) html2txt = re.sub(r'([一二三四五六七八九十])\s*[、.．､，,：:]?\s*(单选|单项选择|选择|不定选择|多选|多项选择|填空|计算|[解简]答|实验|作图)\s*(?!题)' r'([(（]\s*本题|.*?\d分)', r"\1" + "､" + r'\2' + "题" + r"\3", html2txt) html2txt = re.sub(r'([一二三四五六])\s*[、.．､，,:：]?\s*(单选|单项选择|非?选择|不定选择|多选|多项选择|填空|计算|[解简]答|实验|作图)题', r"\1" + "､" + r'\2' + "题", html2txt) html2txt = re.sub(r'([一二三四五六七八九十])\s*[、.．､，,：:]?\s*[(（]\s*本大题(.*?选项中)', r"\1" + "､" + "选择题", html2txt) # + r"\2" html2txt = re.sub(r'

\s*([一二三四五六七八九十])\s*[、.．､，,]?\s*[(（本大题]*?(.*?选项中)', r"\1" + "､" + "选择题", html2txt) html2txt = re.sub(r'([一二三四五六七八九十])\s*[、.．､，,：:]?\s*([(（]\s*(每小题|本大?题)((?!(选项)).)+?[）)]|综合题)', r"\1" + "､" + "解答题", html2txt) html2txt = re.sub(r'(?)\s*([一二三四五六七八九十]\s*[、.．､，,：:]?\s*(单项?选择?|选择|不定选择|多项?选择?|填空|计算|[解简]答|实验|作图)题)', r'

\1', html2txt) html2txt = re.sub(r'

\s*([一二三四五六七八九十])\s*[、.．､，,：:]?\s*[(（]?本?大?题((?!(选项)).)+?[)）]?\s*

', r"

\1､本大题

", html2txt) # html2txt = re.sub(r'

\s*[^一二三四五六七八九十]{,3}\s*[、.．､]\s*(选择|不定选择|单选|多选|计算|[解简]答|实验|作图)题', r"

一､\1题", html2txt) # =====答案解析关键字的统一处理===== html2txt = re.sub(r'【\s*().)+?/>\s*)*?([解答])\s*().)+?/>\s*)*?([析案])\s*' r'().)+?/>\s*)*?】', r"【\3\6】", str(html2txt)) # 2022/4/28 html2txt = re.sub(r'

\s*(解\s*[：:])', r"

【解答】", str(html2txt)) html2txt = re.sub(r'【[^【】]*?(答案|[解分][析答]|详解|点[评睛])[^【】]*?】', r"【\1】", str(html2txt)) html2txt = re.sub(r'(\n\s*|

\s*|\s{2,}|\n\s*\d{,2}\s*[、.．､]\s*)(答案|解析|解答|详解|点评|点睛|考点|专题)\s*[：:]', r"\1【\2】", str(html2txt)) html2txt = re.sub(r'(\n|^|

)\s*(([1-9]|[1-9][0-9])\s*[.．、､])?\s*\[\s*(答案|解析|解答|详解|点评|点睛|考点|专题)\s*\]', r"\1\2【\4】", str(html2txt)) html2txt = re.sub(r'([A-D])\s*\[\s*(解析|解答|详解|点评|点睛|考点|专题)\s*\]', r"\1\n【\2】", str(html2txt)) html2txt = re.sub(r'(\n|^|

)\s*(分析)\s*[：:]', r"【\2】", str(html2txt)) if "【解析】" not in html2txt and "【解答】" in html2txt and "【分析】" not in html2txt: html2txt = re.sub(r'【解答】', "【解析】", str(html2txt)) # =====其他关键字的处理===== html2txt = re.sub(r'

\s*(类型|知识点|考查角度|拔尖角度)[一二三四五六七八九十\d+][^p]*?

', "", str(html2txt)) html2txt = re.sub(r'

\s*(选修[\d-]*?[：:].{2,15})\s*

', r"

【章节】\1

", html2txt) html2txt = re.sub(r'

\s*([一二三四五六]\s*[、.．､]?)?\s*(\[.{2}-*?选修[\d-]*?.*?\])\s*([(（]\d+分[)）])?\s*

', r"

【章节】\2

", html2txt) html2txt = re.sub(r'

\s*(基础|中档|综合)题[^p题]*?

\s*【(考点|专题)】[^p]*?

', "", str(html2txt)) html2txt = re.sub(r'

\s*(基础训练|提升训练|探究培优)

', "", str(html2txt)) html2txt = re.sub(r'

注意事项[:：]\s*

(\n+\s*

\s*\d\s*[、.．､][^/]+?

){1,}', "", html2txt, flags=re.S) html2txt = re.sub(r'

注意事项[:：]\s*\d\s*[、.．､][^/]+?

(\n+\s*

\s*\d\s*[、.．､][^/]+?

){1,}', "", html2txt, flags=re.S) html2txt = re.sub(r'[(（]\s*([A-Z\dⅠⅡⅢⅣⅤ]+|IV)\s*[)）]', r"(\1)".replace(" ", "").replace("(IV)", "Ⅳ"), html2txt) html2txt = re.sub(r'[(（](\s*\d\s*\d?\s*分?\s*)[)）]', "(" + r'\1'.replace(" ", "") + ")", html2txt) html2txt = re.sub(r'\[来源:.*?\]', "", html2txt) html2txt = re.sub('

欢迎访问.*?

', '', html2txt) html2txt = re.sub('w\s*w\s*w\..*?(\.\s*c\s*o\s*m|\.cn)+|(?', r'<\1>', html2txt) # 保留 html2txt = re.sub(r'<(table)( [a-z]+=".*?")+>', r'<\1>', html2txt) html2txt = re.sub(r'

\s*第\s*[二三四ⅡⅢⅣ]\s*(卷|部分)\s*([(（].*?[)）]|非?选择题.{,8})?\s*

', "

【非选择题】

", html2txt) # == == =对可能的题型行的处理 == == html2txt = re.sub("

【非选择题】

((\s|\n|

)*\d{1,2}\s*[.．、､].+?)", r"

二、解答题

\1", html2txt)\ .replace("【非选择题】", "") # =====选项的处理===== html2txt = re.sub(r'(

\s*([1-9]|[1-9][0-9])\s*[.．、､].+?[(（]\s*[）)])\s*(A\s*[.．、､][^/]*?

)', r"\1

\3", str(html2txt)) # =====题号的处理===== html2txt = re.sub(r'([ED]\s*[、.．､].*?(\s|\s*))(([1-9]|[1-9][0-9])\s*[、.．､])', r"\1

\3", html2txt) html2txt = re.sub(r'((|\n)\s*(\s*)?([1-9]|[1-9][0-9]))\s*' r'([（(]\s*(\d{1,2}[.\s\d]*?分|.{2,3}题?)\s*[)）]|解析?\s*[:：]|【解析】)', r"

\1､\5", html2txt) html2txt = re.sub(r"

\s*([1-9]|[1-9][0-9])\s*([(（]20\d{2}\s*[\u4e00-\u9fa5、､]{2,9}[)）])", r"

\1､\2", html2txt) html2txt = re.sub(r"

\s*([1-9]|[1-9][0-9])\s*(【(解析?|答案?)】|(解析?|答案?)\s*[:：]|\[(答案|解析)\])", r"

\1､\2", html2txt) html2txt = re.sub(r"

\s*([1-9]|[1-9][0-9])\s*([(（]\s*\d+\s*分?\s*[)）])?(【(解析?|答案?)】|(解析?|答案?)\s*[:：]" r"|\[(答案|解析)\])", r"

\1､\2\3", html2txt) html2txt = re.sub(r"(|\n)\s*().)+?/>)\s*([1-9]|[1-9][0-9])\s*" r"([(（]20\d{2}\s*[\u4e00-\u9fa5、､]{2,9}[)）])", r"

" + "\n" + r"

\4､\5", html2txt) # 【susp_img】 html2txt = re.sub(r'(|\n)((\s*\s*)?(\s*)?\s*)' r'(([1-9]|[1-9][0-9])\s*[、.．､])', r"

" + "\n" + r"\5", html2txt) html2txt = re.sub(r"(

((?!

).)+?(\s|[/\"]>))(([1-9]|[1-9][0-9])\s*[、.．､].{,20}本[大小]?题\d+分)", r"\1

" + "\n

" + r"\4", html2txt) html2txt = re.sub(r"((\s*\s*)?(\s*)?" r"((\s*\s*)?).)+?/>(\s*)?)*?\s*)\s*(([1-9]|[1-9][0-9])\s*[、.．､])", r"

" + "\n

" + r"\8", html2txt, flags=re.S) html2txt = re.sub(r'(

\s*[一二三四五六七八九十].*?题\s*$.+?分.*?$)\s*(([1-9]|[1-9][0-9])\s*[、.．､].*?)

', r"\1

", html2txt) html2txt = re.sub(r'(

\s*[一二三四五六七八九十].*?题\s*$.+?分.*?$)\s*(([1-9]|[1-9][0-9])\s*[、.．､].*?)

', r"\1

", html2txt) html2txt = re.sub(r'(

.*?[.．]{6,}\s*\d+分)\s*(([1-9]|[1-9][0-9])\s*[、.．､].*?)

', r"\1

", html2txt) html2txt = re.sub(r'([1-9]|[1-9][0-9])\s*([（(]\s*\d{1,2}[.\s\d]*?分\s*[)）])\s*[、.．､]', r"\1" + "､" + r"\2", html2txt) # =====图片的处理===== # 1>>根据图片宽高的异常值判断删除隐藏图片 def sub1(ss): if float(ss.group(1)) <= 3 and float(ss.group(2)) <= 3: return "" else: return ss.group(0) html2txt = re.sub(r'

', sub1, html2txt) # 2>>将图片中带有的汉字去掉 html2txt = re.sub(r'( .+?

', r"\1 />", html2txt) # 将">换为" /> html2txt = re.sub(r'(

', r"\1 />", html2txt) # 将">换为" /> # 3>>建立图片id字典,对原图片信息第一次替换 html2txt = re.sub(r'( src=".*?files)\\image', r"\1/image", html2txt) all_image = re.findall(r'

', html2txt) src2subs = {} subs2src = {} for src in all_image: # 校本题库上传的图片名称是随机数,故设置映射 # kk = re.search('( $\$[^$ 20: mathpix = "" w_h_info = re.search(r'

10: image_id = image_id[-10:] src2subs[src] = '" subs2src['"] = new_src for k, v in src2subs.items(): html2txt = html2txt.replace(k, v) # ------------------------------------------------------------------------ # ========html 转 list========= html2txt = re.sub(r'(||)(\n\s*)*?

', r"\1

" + "\n

", html2txt, flags=re.S) # >>>>>> 先替换后再切割 # 不能简单按 \n 切割，表格里面也可能有换行，应该先替换后再切割 subs2table = {} all_table = re.findall(r'
.*?
', html2txt, flags=re.S) for k, v in enumerate(all_table): html2txt = html2txt.replace(v, "".format(k)) # 将表格中的换行去掉 v = re.sub(r'

\s*(||)\s*

', r"\1", v) v = re.sub(r'

[\n\s]*

', "", v) v = re.sub(r'(

|\s|

|\n)*', " ", v) v = re.sub(r'', "", v) v = re.sub(r'(||)(\s*

\s*

)[\s\n]*?(|\n)+', r"\1", v, flags=re.S) v = re.sub(r'(||)(|\n|

|\s)+', r"\1", v, flags=re.S) v = re.sub(r'(||)(|\n|

|\s)+', r"\1", v, flags=re.S) # 暂时还有table标签首尾的换行没去掉 subs2table["".format(str(k))] = v # <造成的css标签冲突处理 2021-10-13 def sub2(ss): if re.search(r'^(img|/?h[123456]|/?su[bp]>|t\d+b>|br\s*/?>' r'|/?(p|span|font|article|ul|ol|div|table|t?body|html|head|t[drh])(\s*|\s+style=.*?")>' r'|/?[a-z]+ style=.*?">)', ss.group(1)) is None: return "<{}".format(ss.group(1)) else: return "<{}".format(ss.group(1)) html2txt = re.sub("<([^<]{1,30})", sub2, html2txt) # print(html2txt) # >>>>>> html 切割 con_list = sum([re.split('

|', i) if len(re.findall("

|", i)) > 1 else [i] for i in re.split(r"\n+|

(?!)|", html2txt)], []) # html2txt)[:-1] con_list = [re.sub(r"^\n*\s*(

|)+", "", ii) for ii in con_list] # >>>>>> 替换回去 if subs2table: con_list = [re.sub(r"|".join(subs2table.keys()), lambda x: subs2table[x.group()], ii) for ii in con_list] # 剩余个别标签处理 con_list = [re.sub(r"^<([a-z]+)>[\s\t\n]*$", "", i.strip()) for i in con_list] # 2020/4/7,14 con_list = [re.sub(r"^(
||]*?>|)+?(.|\n)+?([一二三四五六七八九十])\s*[、.．､]\s*(.{2,4}题)(.|\n)+?
", r"\3､\4", i.strip()) for i in con_list] # 把最后可能还存在的或考号信息去掉 con_list = [re.sub("|[…O•.\s]*?密[…O•.\s]*?封[….O•\s]*?装?[…O•.\s]*?订?[….O•\s]*?线?[….O•\s]*?$" "|((学校|班级|姓名|座位号|准考号|[学考]号)[\s：:_]*?){2,}$", "", i.strip()) for i in con_list] # =====答案行格式处理==== temp_list = [re.split(r"^((\s*\s*)+)", v.strip(), maxsplit=1)[1::3] if re.match(r'(\s*\s*)+?(参考|考试|试[题卷]|物理|理综|数学|化学|生物)(答案|解析|答案[及与和]评分(标准|意见|细则))\s*$' r'|(\s*\s*)+?评分标准' r'|(\s*\s*)+?(参考|考试|试[题卷])(答案|解析|答案[及与和]评分(标准|意见|细则))\s*(物理|理综|数学|化学|生物)?\s*$', re.sub(r"[上下]?学[年期]|[\d—【】.．、､：:(（）)年\s]|[中大]学|模拟|[中高]考|年级|[学期][末中]|[高初][一二三]", "", v.strip())) else [v] for v in con_list] con_list = sum(temp_list, []) # =====对可能的题号的处理==== 如2、3、4、5、加了【fei】 # 重新修改！！！！！！！！！！ con_list = [re.sub(r"^\s*([1-9][0-9]?\s*[.．、､])", r"【fei】\1", i.strip()) if (len(re.findall(r"(^|\s*[.．、､])\s*[1-9][0-9]?\s*[.．、､]", i)) >= 3 and len(re.sub(r"[\d.．、､\s]", "", i)) < 2) else i for i in con_list] # =====头尾清除没用的信息===== if con_list and re.search(r"[\u4e00-\u9fa5]|", "", ii) for ii in html2txt.split("

")[:-1]] # pprint(con_list) if re.search(r"[\u4e00-\u9fa5]", con_list[0]) is None: con_list = con_list[1:] return con_list def get_md5(image_id): """ 由于hash不处理unicode编码的字符串（python3默认字符串是unicode）所以这里判断是否字符串，如果是则进行转码初始化md5、将image_name进行加密、然后返回加密字串 """ image_name = str(image_id) + str(time.time()) + str(random.random()) image_name = image_name.encode("utf-8") md = hashlib.md5() md.update(image_name) return str(md.hexdigest()) def wash_after(res_dict, paperid,subject="数学"): """ 1.处理最终结果多余的换行符;2.对题文中已给答案的选择填空进行替换；3.选择题的细分 :param res_dict: :return: """ pattern1 = re.compile( r"([是为点]|等于|=|＝|有|存在)\s*_+((

).)+?[/\"]>|[^_;；。?!，\n])+?)(?| 1000 and ( len(re.findall(r"\n\s*([1-9]|1[0-9])\s*[.．、､].+?", end_con)) > 4 or len(re.findall(r"[(（]\s*[）)]|_{2,}", end_con)) > 6): sr['errmsgs'].append("原试卷格式有问题，导致本题可能包含了很多非本题的题文") if not re.sub(r"[(（)）\n\s]", "", sr["stem"]): sr['errmsgs'].append("本题没有题干，请检查题干格式是否正确") if "-" in str(sr["item_id"]) and sr['type'] in ["选择题", "填空题"]: if (not sr["key"] or sr["key"]=="见解析") and re.search("[A-H]+", re.sub("[;；、､\n(（)）\s]|\d+分", "", sr["parse"])): sr["key"] = re.sub("[;；、､\n(（)）\s]|\d+分", "", sr["parse"]) sr["parse"] = "" # 把首尾的换行都去掉 # sr["stem"] = table_label_cleal(re.sub(r"\n\s*","
",sr.get("stem", "").lstrip())) # 将选择题和填空题中的题干中出现答案的情况去掉答案 kuo_con1 = re.search(r'([是为]|等于|[＝=有]|表示)\s*[(（]\s*([A-Zc][A-Zc;；和与、､\s]*?)[）)]\s*(.?($|\n|
|))", sr["stem"]) if sr['type'].replace("题", "") in ["单选", "多选", "选择", "不定选择"]: # sr["type"] = "选择" # 针对选择题在题文中已给出答案的处理 if kuo_con1: sr["stem"] = sr["stem"].replace(kuo_con1.group(0), kuo_con1.group(1) + "( )" + kuo_con1.group(3)) sr["key"] = kuo_con1.group(2).replace("c", "C") if not sr["key"] else sr["key"] elif kuo_con2: sr["stem"] = sr["stem"].replace(kuo_con2.group(0), "( )" + kuo_con2.group(2)) sr["key"] = kuo_con2.group(1).replace("c", "C") if not sr["key"] else sr["key"] # sr['options_text'] = "" elif sr['type'] == '填空题': # sr["type"] = "填空" ans_list = [] # 针对填空题在题文中已给出答案的处理 sub_n = 0 while re.search(pattern1, sr["stem"]): blank_con1 = re.search(pattern1, sr["stem"]) sr["stem"] = sr["stem"].replace(blank_con1.group(0), blank_con1.group(1) + "____" + blank_con1.group(5)) ans_list.append(blank_con1.group(2)) sub_n += 1 if sub_n > 5: break while re.search(pattern2, sr["stem"]): blank_con2 = re.search(pattern2, sr["stem"]) # 这里的限制条件易出错，可以再判断一下 sr["stem"] = sr["stem"].replace(blank_con2.group(0), blank_con2.group(1) + "____" + blank_con2.group(4)) ans_list.append(blank_con2.group(2)) if re.findall(r"_{2,}", sr["stem"]): sr["blank_num"] = len(re.findall(r"_{2,}", sr["stem"])) if not sr["key"] and ans_list: sr["key"] = "; ".join(ans_list) # 已知题型是错误的情况，如解答题，放在填空题中 if 'blank_num' not in sr and re.search("_+([^_]*?)_+", sr['stem']) is None: sr['errmsgs'].append("填空题题干中没有下划线(__)，与题型(填空题)不符") # stem_c = re.sub("

|[,，.。．、､]", "", sr["stem"]) # if len(stem_c) > 2: # 不自动纠错 # sr["type"] = "解答题" # sr["type"] = "解答" # else: # 大题题型先不做范围判断 # if sr['type'] and sr['type'].replace("题", "") not in ["解答", "计算", "实验", "作图"]: # sr["type1"] = "解答" # else: # sr["type1"] = sr['type'].replace("题", "") # if "is_optional" not in sr: # sr["is_optional"] = is_optional # sr["option_str"] = "" # 换行符处理！ sr["stem"] = sr.get("stem", "").strip().replace("\n\n", "\n").replace("\n", "
") # 2020/4/10 gai # sr["stem"] = get_equation_instr(sr["stem"]) if "options" in sr: # 对选项部分进行格式处理 for i in range(len(sr['options'])): sr['options'][i] = get_simpstr2eqn(sr['options'][i].strip()).replace("\n\n", "\n").replace("\n", "
") # sr['options'][i] = get_equation_instr(sr['options'][i].strip()).replace("\n\n", "\n").replace("\n", "
") if "slave" in sr and sr["slave"]: # 带小题的大题，格式处理，高中数学没有这一功能 for s in sr["slave"]: s["stem"] = s.get("stem", "").strip().replace("\n\n", "\n").replace("\n", "
") # 已分小问了的题号，是不会带小题号的，故不需要替换 # s["stem"] = re.sub(r"[(（]\s*(\d|ⅰⅱⅲⅳ|i{1,3})\s*[)）]|[①②③④]\s*(?![+-])", "", s["stem"][:5]) + s["stem"][5:] s["parse"] = s.get("parse", "").strip().replace("\n\n", "\n").replace("\n", "
")\ .replace("解答:解：", "解答:").replace("解答:解:", "解答:") s["key"] = s.get("key", "").strip().replace("\n\n", "\n").replace("\n", "
") # sr["slave"] = sr.get("slave", "").replace("\n", "
") if "answer_type" in s: s["answer_type"] = configs.answer_type[s["answer_type"]] else: # s["parse"] = css_conflict_deal(s["parse"]) # "css 冲突标签处理" sr["parse"] = sr.get("parse", "").lstrip().replace("\n\n", "\n").replace("\n", "
") sr["parse"] = re.sub("^【解[答析]】\s*", "", sr["parse"]) # sr["parse"] = get_equation_instr(sr["parse"]) sr["key"] = sr.get("key", "").lstrip().replace("\n\n", "\n").replace("\n", "
") # sr["key"] = get_equation_instr(sr["key"]) if "answer_type" in sr: sr["answer_type"] = configs.answer_type[sr["answer_type"]] if not sr["parse"] and not sr["key"]: # 答案和解析都没有 # sr["parse"] = "略" # sr["key"] = "略" sr['errmsgs'].append("本题缺少答案和解析") elif not sr["key"] and sr["parse"]: sr["key"] = "" # 见解析 elif re.sub("见解析|略|空|无|没有|答案", "", sr["key"]) and not sr["parse"]: sr["parse"] = "略" # if "本选做题缺少解析" not in sr['errmsgs'] and "本题缺少解析" not in sr['errmsgs']: # sr['errmsgs'].append("本题缺少解析") # 辅助标签处理 # sr["analysis"] = "" if "analy" in sr: # 存在题目分析时，将其放在解析里 sr["analy"] = sr.get("analy", "").strip().replace("\n\n", "\n") if len(sr["analy"].replace(" ", "")) >= 10: sr["parse"] = "【分析】"+sr["analy"].replace("\n", "
") + "
【详解】" + sr["parse"] del sr["analy"] if "chapter" in sr: # 如选修4－5：不等式选讲 if sr['item_id'] + 1 <= len(res_dict): chapter_no[sr['item_id']] = sr["chapter"] del sr["chapter"] # 是否为选做题"is_optional"，两种形式不会同时出现 if "option_st" in sr: # 带有此标签的后面的题目都是选做题option_score # option_st = sr['item_id'] # is_optional = True # if "," in sr["option_st"]: # option_score = int(sr["option_st"].split(",")[-1]) del sr["option_st"] # elif sr['type'] == '选做题': # 题型是选做题如五、选做题 # select_type_id.append(sr['item_id']) # sr['is_optional'] = 'true' # sr['score'] = option_score # elif "type1" in sr and sr["type1"] == "解答" and "is_optional" not in sr: # sr["is_optional"] = is_optional # if is_optional: # sr['score'] = option_score # if "type1" in sr: # del sr["type1"] # 题型纠正 # 将选择题改为单选或多选,"is_multiple_choice" sr['type'] = re.sub("([单多])项选择题?", r"\1选题", sr['type']) sr['type'] = sr['type'].replace("题题", "题") # .replace("简答", "解答") # sr['type'] = re.sub("(计算|简答)题?", "解答题", sr['type']) if sr['type'] in ["选择", "选择题"]: # 有的科目只有选择题，不分单选和多选 if len(re.findall("[A-Z]", sr["key"])) > 1: sr['type'] = '多选题' elif len(re.findall("[A-Z]", sr["key"])) == 1: sr['type'] = '单选题' elif "数学" in subject or "物理" in subject: sr['type'] = '单选题' info_x = re.search("^[（(](多)选题?[）)]", sr["stem"].replace(" ", "")) if info_x: sr['type'] = '{}选题'.format(info_x.group(1)) if sr['type'] == '多选题': if len(re.findall("[A-Z]", sr["key"])) == 1: sr['errmsgs'].append("本题答案个数与题型(多选题)不符") # sr["is_multiple_choice"] = 'true' elif sr['type'] == '单选题': # sr["is_multiple_choice"] = 'false' if "options" in sr and len(sr["options"]) > 4: sr['errmsgs'].append("选项个数多于4个，与题型(单选题)不符") if len(re.findall("[A-Z]", sr["key"])) > 1: sr['errmsgs'].append("本题答案个数与题型(单选题)不符") elif sr['type'] == '不定选择题': if len(re.findall("[A-Z]", sr["key"])) > 1: sr['type'] = '多选题' elif len(re.findall("[A-Z]", sr["key"])) == 1: sr['type'] = '单选题' elif "数学" in subject or "物理" in subject: sr['type'] = '单选题' else: sr['type'] = '选择题' if "缺少答案" not in "".join(sr['errmsgs']): sr['errmsgs'].append("本题缺少答案") elif "数学" in subject: if sr['type'].replace("题", "") == "填空": if sr['blank_num'] > 1: sr['type'] = "多空题" else: sr['type'] = "单空题" elif sr['type'].replace("题", "") not in ["单空", "多空"]: sr['type'] = "解答题" # elif "物理" in subject: # # 用第一版模型预测 # content = sr['stem'] # if "options" in sr and sr["options"]: # content+= "\n" + "\n".join(["{}、{}".format(chr(ord('@') + idm + 1), option) # for idm, option in enumerate(sr["options"])]) # try: # r = requests.post(url=configs.phy_topicType_ip, # json={"content": content, "period": "高中", # "topic_type": sr['type']}) # sr['type'] = r.json()["res"] # if sr['type'] == "简答题": # sr['type'] = "解答题" # except Exception as e: # print(e) # if sr['type'].replace("题", "") in ["单空", "多空", "填空"]: # sr['type'] = "填空题" # else: # sr['type'] = "解答题" elif sr['type'].replace("题", "") in ["单空", "多空", "填空"]: sr['type'] = "填空题" elif sr['type'] not in ["选择", "选择题"]: sr['type'] = "解答题" content = sr['stem'] if "options" in sr and sr["options"]: content += "\n" + "\n".join(["{}、{}".format(chr(ord('@') + idm + 1), option) for idm, option in enumerate(sr["options"])]) all_content_str_list.append(content) topic_type_list.append(sr['type']) # """按照原先高中数学解析的最后输出格式整理输出""" # sr["type"] = sr['type'].replace("非选择", "解答").replace("题题", "题") # sr["topic_num"] = sr['item_id'] sr['errmsgs'] = "；".join(sr['errmsgs']) sr["parse"] = re.sub(r"试题【([分解]析)】", r"试题\1:", sr["parse"]) # 解析 sr["key"] = re.sub("([;；]|
)\s*$", "", sr["key"]) if 'susp_pic' in sr: del sr['susp_pic'] if 'is_optional' in sr: del sr['is_optional'] if 'spliterr_point' in sr: del sr['spliterr_point'] if 'score' in sr: del sr['score'] del sr['item_id'] # ---------------------字符串公式处理-------------------------------- # sr["stem"] = get_equation_instr(sr["stem"]) # sr["key"] = get_equation_instr(sr["key"]) # sr["parse"] = get_equation_instr(sr["parse"]) # if "options" in sr: # sr["options"] = list(map(get_equation_instr, sr["options"])) # ---------------------------------------------------------------- # 物理题型批量调接口:节约时间 if "物理" in subject: t1 = time.time() epoches = int(len(all_content_str_list) / 10) pred_topic_types = [] if epoches > 0: last = 0 for epoch in range(epoches): input_data = {"content": all_content_str_list[last:(epoch+1)*10], "period": "高中", "topic_type": topic_type_list[last:(epoch+1)*10]} last = (epoch+1)*10 try: r = requests.post(url=configs.phy_topicType_ip, json=input_data) pred_topic_types.extend(r.json()["res"]) except Exception as e: print(e) pred_topic_types.extend([""]*10) rest_con = all_content_str_list[last:] rest_topic_type = topic_type_list[last:] else: rest_con = all_content_str_list rest_topic_type = topic_type_list if rest_con: input_data = {"content": rest_con, "period": "高中", "topic_type": rest_topic_type} try: r = requests.post(url=configs.phy_topicType_ip, json=input_data) pred_topic_types.extend(r.json()["res"]) except Exception as e: print(e) pred_topic_types.extend([""] * len(rest_con)) # 将预测题型替换到res_dict中 if any([True for i in pred_topic_types if i]) and len(pred_topic_types) == len(res_dict): for idx, pred_type in enumerate(pred_topic_types): if pred_type and res_dict[idx]['type'] in ["填空题", "解答题"]: if pred_type == "简答题": pred_type = "解答题" res_dict[idx]['type'] = pred_type logger.info("----【paper_id:{}】采用题型预测服务花费time:{}".format(paperid, time.time() - t1)) # -------------------------------------------------------------- # 换行符替换 convert_huanhang(res_dict) # ------------------------------------------------------------------------ # if chapter_no: # 章节标签下移一位 # for c, v in chapter_no.items(): # res_dict[c]["chapter"] = v # 选做题"option_str"处理 # if select_type_id: # for s in select_type_id: # if len(select_type_id) == 2: # res_dict[s - 1]['option_str'] = "2选1" # elif len(select_type_id) == 4: # res_dict[s - 1]['option_str'] = "4选2" # else: # res_dict[s - 1]['errmsgs'] += ";
选做题不是“2选1”和“4选2”类型" # if option_st: # print("option_st:", option_st) # for s in range(option_st, len(res_dict)): # if (len(res_dict) - option_st) == 2: # res_dict[s]['option_str'] = "2选1" # elif (len(res_dict) - option_st) == 4: # res_dict[s]['option_str'] = "4选2" # else: # res_dict[s]['errmsgs'] += ";
选做题不是“2选1”和“4选2”类型" # 再解析中的新图片上传腾讯云 # 再设置一个入库接口，点击入库，才开始从本地上传图片 return res_dict def convert_huanhang(items_list): """ 递归换行符替换：\n -->
:param items_list: :return: """ if isinstance(items_list, list): for k, one_i in enumerate(items_list): items_list[k] = convert_huanhang(one_i) elif isinstance(items_list, dict): for k, v in items_list.items(): if k == "answer_type" and type(v) == str: items_list[k] = configs.answer_type[v] else: items_list[k] = convert_huanhang(v) if "answer_type" in items_list and items_list["answer_type"] == 2: if ("slave" not in items_list or not items_list["slave"]) and "stem" in items_list: items_list["stem"] = re.sub(r"(__{2,})", r'\1', items_list["stem"]) elif isinstance(items_list, str): item_str = items_list.strip().replace("\n\n", "\n") item_str = re.sub(r'()(|\n)+', r"\1", item_str) return item_str.replace("\n", "
") else: return items_list return items_list def css_conflict_deal(item): """ 针对", ">") # 2021-8-24 # item = re.sub("<(?!img src)", "<", item) # 还有表格 item = item.replace("$<$", "【*_*】") # 多次单题解析时会出现$<$ item = re.sub(r"<(/?su[bp]|br\s*/?|/?table( .*?)?|/?tbody( .*?)?|/?t[rhd]( .*?)?)>", r"【\1】", item) if re.search(r"(?", item) item = re.sub(r"(\s*|\n\s*)+<(/?table( .*?)?|/?tbody( .*?)?|/?t[rhd]( .*?)?)>\s*(\s*|\n\s*)+", r"<\2>", item) item = item.replace("$<$span class=", " item_list[i]: add_n += 1 else: break return add_n # def find_seq_num(num_list): # """ # 针对切分题号时切错的序号进行纠正，考虑序号是连续且正常的情况下 # 将连续的数字进行分组 # :param num_list:输入[3, 4, 8, 9, 12, 13, 14] # :return: [[3, 4],[8, 9],[12, 13, 14]] # """ # seq_ranges = [] # for k, g in groupby(enumerate(num_list), lambda x: x[0] - x[1]): # group = (map(itemgetter(1), g)) # group = list(map(int, group)) # seq_ranges.append(group) # return seq_ranges # def del_exception_value(item_list): # """ # 去列表中的异常值,题目越多，越容易突出异常值 # :return: # """ # import numpy as np # max_v = max(item_list) # arr_mean = np.mean(item_list) # 均值 # arr_var = np.var(item_list) # 方差 # while max_v > len(item_list)+4: # item_list.remove(max_v) # print(item_list) # arr_mean = np.mean(item_list) # 去最大值后的均值 # arr_var = np.var(item_list) # 去最大值后的方差 # max_v = max(item_list) # # print("均值与方差：",arr_mean,arr_var) # if abs((item_list[-1] - item_list[0] + 1) - len(item_list)) <= 3: # return item_list # else: # exception_value = [] # for i in item_list: # # print(abs((i - arr_mean) / arr_var), i) # if(abs((i - arr_mean)/arr_var)) > 0.3: # exception_value.append(i) # right_seq = [i for i in item_list if i not in exception_value] # return right_seq def pic_transfer(con_list): aft_opt = [] # 针对选项后是题目图片的情况,进行移位 if "\n" in con_list[-1]: ccon = re.split("\n+", con_list[-1]) while re.match("

0 and v['item_id'] - item_list[k-1]['item_id']>1: # if if __name__ == '__main__': # -------------生成requirements.txt--------------- # pip freeze > requirements.txt # import os, sys # # project_root = os.path.dirname(os.path.realpath(__file__)) # 找到当前目录 # print(project_root) # # # 找到解释器，虚拟环境目录 # python_root = sys.exec_prefix # print(python_root) # # # 拼接生成requirements命令 # command = python_root + '\Scripts\pip freeze > ' + project_root + '\\requirements.txt' # print(command) # # # 执行命令。 # os.system(command) # ----------------一键安装 requirements.txt------------ # pip install -r requirement.txt # python_root + '\Scripts\' + pip install -r requirements.txt # import os # rrr=os.path.basename(r"http:/pstatic.dev.xueping.com/data/word/2020/08/12/5f338d18e2cce.docx") # print(rrr) # item = "