#!/usr/bin/env/python # -*- coding:utf-8 -*- # 本文件包含以下函数 # table_label_cleal:去掉表格中的换行符 # html_cleal :html文件清洗 # wash_after: 处理最终结果多余的换行符 import datetime import re import shutil # from operator import itemgetter # from itertools import groupby # from PIL import Image import base64, os, random import time import requests import hashlib from pprint import pprint # from bs4 import BeautifulSoup # UPLOAD_FOLDER = config.UPLOAD_FOLDER import configs from utils.equation_extract import get_equation_instr, get_simpstr2eqn from utils.field_eq2latex import get_latex from utils.html_again_parse import css_label_wash # from structure.structure_main import WordParseStructure logger = configs.myLog(__name__, log_cate="ruku_log").getlog() def table_label_cleal(con): """ 去掉表格中的【换行符】 """ # print(con) # print('------------------------------------------') con = re.sub(r"\n(\s|\n|\t)+", "\n", con) count = 1 while re.search(r"?[a-z]+>\n(?[a-z]+>|
)", r"\1\2", con, flags=re.S) con = re.sub(r'(?t[rd]>)\n(
]*?>[\s\n\t]*? [\s\n\t]*? ' r'[\s\n\t]*? | [\s\n\t]*?
', "", con, flags=re.S) con = re.sub(r'(
)\s*([((]\s*\d\s*[))])', r'\1\n\2', con) return con def base642img(html_data, wordid): """ 【基于mathjax渲染输出是css-html格式】 将base64编码的图片保存到本地 :return: """ # 二进制图片进行转化, 按“word_id”建立文件夹 # time_str = datetime.datetime.strftime(datetime.datetime.now(), '%Y_%m_%d') # file_path = configs.IMG_FOLDER + '/' + str(self.wordid) # if not os.path.exists(file_path): # os.makedirs(file_path) # else: # 思路1:删除图片,重建文件夹,【所有的新图片都是以base64格式传过来的】 # shutil.rmtree(file_path) # os.makedirs(file_path) # 思路2:每一次再解析都将base64图片保存到本地再以路径形式返回 # st = len(os.listdir(file_path)) # 不要以序号索引的形式命名 # 统计所有base64编码 all_base64_image = re.findall(r'("]+?)"(.*?)\s*/?>)', str(html_data), flags=re.S) if all_base64_image: file_path = configs.IMG_FOLDER + '/' + str(wordid) if not os.path.exists(file_path): os.makedirs(file_path) # 新图片命名 name_list = random.sample(range(100000, 999999), len(all_base64_image)) for n, img in enumerate(all_base64_image): img1 = img[2].split(",", maxsplit=1) img_type_info = re.search("data:image/(.+?);base64", img1[0]) img_type = img_type_info.group(1) if img_type_info else "" # 可能还有alt和style的属性,暂时先不要 w_info = re.search('( width="\d+")', img[3]) h_info = re.search('( height="\d+")', img[3]) img_data = base64.b64decode(str(img1[-1])) if img_type: # save_path = os.path.join(configs.new_img_ip, get_md5(n)+"."+img_tape) img_name = "new_image" + str(int(time.time())) + str(name_list[n]) + "." 
+ img_type save_path = os.path.join(file_path, img_name) with open(save_path, 'wb') as f: f.write(img_data) # self.localnewpic_list.append(save_path) # put_key = "/zyk/uploadfiles/wording/" + str(self.wordid) + "/{}".format(img_name) # self.put_key_list.append(save_path) flag_behind = '" />' if w_info and h_info: flag_behind = '"' + w_info.group(1) + h_info.group(1) + ' />' temp_img = '' html_data = html_data.replace(img[0], temp_img) return html_data class HtmlWash(): def __init__(self, html, wordid, is_reparse=0, img_url="", must_latex=0): """ html文本清洗 批量再解析中,新增图片信息替换的文本返回作为ocr保存文本, 继续往下清洗的文本,则进入结构化解析逻辑中 """ # super().__init__(html, wordid, is_reparse, must_latex) self.html = html self.img_url = img_url self.wordid = wordid self.is_reparse = is_reparse self.must_latex = must_latex # self.put_key_list = [] # self.localnewpic_list =[] self.sub_list = ["?div>", "?b>", "?caption>", "?center>", "?cite>", "?code>", "?colgroup>", "?menu>", "?dd>", "?dir>", "?li>", "?em>", "?article>", "?header>", "?ruby>", "?summary>", "?details>", "?strong>", "?strike>", "?small>", "?select>", "?section>", "?script>", "?[su]>", "?var>", "?ul>", "?tt>", "?title>", "?thead>", "?tfoot>", "
\s*)【例题(\d+)】", r"\1\2、", html2txt)
html2txt = re.sub(r"\\\(|\\\)", "$", html2txt)
# 域公式的转化处理;\可以在前端显示,不需要用latex渲染
try:
html2txt, newhml = get_latex(html2txt, self.is_reparse, self.wordid, self.must_latex)
if newhml: # 存在域公式转图片时,需要将原文本的域公式也转为图片信息
self.new_html = newhml
html2txt = html2txt.replace("【omml-latex】", "")
except:
html2txt = html2txt.replace("【omml-latex】", "")
# 字符串公式的处理:如Fe2O3, 在结构化之后处理比较好
#
处理
html2txt = re.sub(r"
", "\n", html2txt)
html2txt = re.sub(r"[((]\s*(\d)\s*\$分\s*[))]", r"$(\1分)", html2txt)
# =====题型行的统一处理=====
# ---->>>>>题型行可能放在表格中
if len(re.findall("", html2txt)) >= 8: # 这个限制还不太严谨
for tt in re.finditer('(((?!(?tr>)).)*) ', html2txt, re.S):
tt_list = re.split(r'^\s*]*?>| |
', tt.group(1).strip()) #
" + " ".join(tt_list) + "
") # html2txt = re.sub(r"?tbody>|?table>|?div>", "", html2txt) # ---->>>>>end html2txt = re.sub(r"()\s*([一二三四五六七八九十]\s*[、..、::]?.{2,6}题)", r"\1\2", html2txt) html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,::]\s*(论述|填空|探究)题?[与和、、,,\s]*?(计算题|实验题)', r"\1、\3", html2txt) html2txt = re.sub(r'(([一二三四五六七八九十])\s*[、..、,,::]\s*(.{2,4}题)\s*
)', r"\1", str(html2txt), flags=re.S) html2txt = re.sub(r"
\s*([一二三四五六七八九十])\s*[、..、,,::]?\s*(计算|[解简]答|实验|作图)题?[与和、、,,\s]*?(计算|[解简]答|实验|作图)", r"
\1、\2题", html2txt) html2txt = re.sub(r'
\s*[((]\s*[一二三四五六]\s*[))]\s*必考题\s*(.?|.+?分\s*[.。.]?)\s*
', "", html2txt) html2txt = re.sub(r'\s*[((]\s*[一二三四五六]\s*[))]\s*选考题\s*.?\s*.{,4}(?', r"
【选做题】:'\1'
", html2txt) html2txt = re.sub(r'\s*[((]\s*[一二三四五六]\s*[))]\s*选考题\s*(.?|.+?分\s*[.。.]?)\s*
', "【选做题】
", html2txt) html2txt = re.sub(r'\s*([一二三四五六七八九十])\s*[、..、,,::]?\s*(单项?选择?|非?选择|多项?选择?|不定选择|填空|计算|[解简]答|实验|作图)题?\s*
', r"\1、\2题
", html2txt) html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,::]?\s*(单选|单项选择|选择|不定选择|多选|多项选择|填空|计算|[解简]答|实验|作图)\s*(?!题)' r'([((]\s*本题|.*?\d分)', r"\1" + "、" + r'\2' + "题" + r"\3", html2txt) html2txt = re.sub(r'([一二三四五六])\s*[、..、,,::]?\s*(单选|单项选择|非?选择|不定选择|多选|多项选择|填空|计算|[解简]答|实验|作图)题', r"\1" + "、" + r'\2' + "题", html2txt) html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,::]?\s*[((]\s*本大题(.*?选项中)', r"\1" + "、" + "选择题", html2txt) # + r"\2" html2txt = re.sub(r'\s*([一二三四五六七八九十])\s*[、..、,,]?\s*[((本大题]*?(.*?选项中)', r"\1" + "、" + "选择题", html2txt) html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,::]?\s*([((]\s*(每小题|本大?题)((?!(选项)).)+?[))]|综合题)', r"\1" + "、" + "解答题", html2txt) html2txt = re.sub(r'(?)\s*([一二三四五六七八九十]\s*[、..、,,::]?\s*(单项?选择?|选择|不定选择|多项?选择?|填空|计算|[解简]答|实验|作图)题)', r'
\n\1', html2txt) html2txt = re.sub(r'
\s*([一二三四五六七八九十])\s*[、..、,,::]?\s*[((]?本?大?题((?!(选项)).)+?[))]?\s*
', r"\1、本大题
", html2txt) # html2txt = re.sub(r'\s*[^一二三四五六七八九十]{,3}\s*[、..、]\s*(选择|不定选择|单选|多选|计算|[解简]答|实验|作图)题', r"
一、\1题", html2txt) # =====答案解析关键字的统一处理===== html2txt = re.sub(r'【\s*().)+?/>\s*)*?([解答])\s*().)+?/>\s*)*?([析案])\s*' r'().)+?/>\s*)*?】', r"【\3\6】", str(html2txt)) # 2022/4/28 html2txt = re.sub(r'
\s*(解\s*[::])', r"
【解答】", str(html2txt)) html2txt = re.sub(r'【[^【】]*?(答案|[解分][析答]|详解|点[评睛])[^【】]*?】', r"【\1】", str(html2txt)) html2txt = re.sub(r'(\n\s*|
\s*|\s{2,}|\n\s*\d{,2}\s*[、..、]\s*)(答案|解析|解答|详解|点评|点睛|考点|专题)\s*[::]', r"\1【\2】", str(html2txt)) html2txt = re.sub(r'(\n|^|
)\s*(([1-9]|[1-9][0-9])\s*[..、、])?\s*\[\s*(答案|解析|解答|详解|点评|点睛|考点|专题)\s*\]', r"\1\2【\4】", str(html2txt)) html2txt = re.sub(r'([A-D])\s*\[\s*(解析|解答|详解|点评|点睛|考点|专题)\s*\]', r"\1\n【\2】", str(html2txt)) html2txt = re.sub(r'(\n|^|
)\s*(分析)\s*[::]', r"【\2】", str(html2txt)) if "【解析】" not in html2txt and "【解答】" in html2txt and "【分析】" not in html2txt: html2txt = re.sub(r'【解答】', "【解析】", str(html2txt)) # =====其他关键字的处理===== html2txt = re.sub(r'
\s*(类型|知识点|考查角度|拔尖角度)[一二三四五六七八九十\d+][^p]*?
', "", str(html2txt)) html2txt = re.sub(r'\s*(选修[\d-]*?[::].{2,15})\s*
', r"【章节】\1
", html2txt) html2txt = re.sub(r'\s*([一二三四五六]\s*[、..、]?)?\s*(\[.{2}-*?选修[\d-]*?.*?\])\s*([((]\d+分[))])?\s*
', r"【章节】\2
", html2txt) html2txt = re.sub(r'\s*(基础|中档|综合)题[^p题]*?
|\s*【(考点|专题)】[^p]*?
', "", str(html2txt)) html2txt = re.sub(r'\s*(基础训练|提升训练|探究培优)
', "", str(html2txt)) html2txt = re.sub(r'注意事项[::]\s*
(\n+\s*\s*\d\s*[、..、][^/]+?
){1,}', "", html2txt, flags=re.S) html2txt = re.sub(r'注意事项[::]\s*\d\s*[、..、][^/]+?
(\n+\s*\s*\d\s*[、..、][^/]+?
){1,}', "", html2txt, flags=re.S) html2txt = re.sub(r'[((]\s*([A-Z\dⅠⅡⅢⅣⅤ]+|IV)\s*[))]', r"(\1)".replace(" ", "").replace("(IV)", "Ⅳ"), html2txt) html2txt = re.sub(r'[((](\s*\d\s*\d?\s*分?\s*)[))]', "(" + r'\1'.replace(" ", "") + ")", html2txt) html2txt = re.sub(r'\[来源:.*?\]', "", html2txt) html2txt = re.sub('欢迎访问.*?
', '', html2txt) html2txt = re.sub('w\s*w\s*w\..*?(\.\s*c\s*o\s*m|\.cn)+|(?', r'<\1>', html2txt) #\s*第\s*[二三四ⅡⅢⅣ]\s*(卷|部分)\s*([((].*?[))]|非?选择题.{,8})?\s*
', "【非选择题】
", html2txt) # == == =对可能的题型行的处理 == == html2txt = re.sub("【非选择题】
((\s|\n||
)*\d{1,2}\s*[..、、].+?)", r"二、解答题
\1", html2txt)\ .replace("【非选择题】", "") # =====选项的处理===== html2txt = re.sub(r'(\s*([1-9]|[1-9][0-9])\s*[..、、].+?[((]\s*[))])\s*(A\s*[..、、][^/]*?
)', r"\1\n\3", str(html2txt)) # =====题号的处理===== html2txt = re.sub(r'([ED]\s*[、..、].*?(\s|\s*))(([1-9]|[1-9][0-9])\s*[、..、])', r"\1
\n\3", html2txt) html2txt = re.sub(r'((?p>|\n)\s*(\s*)?([1-9]|[1-9][0-9]))\s*' r'([((]\s*(\d{1,2}[.\s\d]*?分|.{2,3}题?)\s*[))]|解析?\s*[::]|【解析】)', r"
\1、\5", html2txt) html2txt = re.sub(r"\s*([1-9]|[1-9][0-9])\s*([((]20\d{2}\s*[\u4e00-\u9fa5、、]{2,9}[))])", r"
\1、\2", html2txt) html2txt = re.sub(r"
\s*([1-9]|[1-9][0-9])\s*(【(解析?|答案?)】|(解析?|答案?)\s*[::]|\[(答案|解析)\])", r"
\1、\2", html2txt) html2txt = re.sub(r"
\s*([1-9]|[1-9][0-9])\s*([((]\s*\d+\s*分?\s*[))])?(【(解析?|答案?)】|(解析?|答案?)\s*[::]" r"|\[(答案|解析)\])", r"
\1、\2\3", html2txt) html2txt = re.sub(r"(?p>|\n)\s*().)+?/>)\s*([1-9]|[1-9][0-9])\s*" r"([((]20\d{2}\s*[\u4e00-\u9fa5、、]{2,9}[))])", r"
\2
" + "\n" + r"\4、\5", html2txt) # 【susp_img】
html2txt = re.sub(r'(?p>|\n)((\s*
((?!
).)+?(\s|[/\"]>))(([1-9]|[1-9][0-9])\s*[、..、].{,20}本[大小]?题\d+分)", r"\1
" + "\n" + r"\4", html2txt)
html2txt = re.sub(r"?p>((\s*
" + r"\8", html2txt, flags=re.S) html2txt = re.sub(r'(
\s*[一二三四五六七八九十].*?题\s*\(.+?分.*?\))\s*(([1-9]|[1-9][0-9])\s*[、..、].*?)
', r"\1\n\2
", html2txt) html2txt = re.sub(r'(\s*[一二三四五六七八九十].*?题\s*\(.+?分.*?\))\s*(([1-9]|[1-9][0-9])\s*[、..、].*?)
', r"\1\n\2
", html2txt) html2txt = re.sub(r'(.*?[..]{6,}\s*\d+分)\s*(([1-9]|[1-9][0-9])\s*[、..、].*?)
', r"\1\n\2
", html2txt) html2txt = re.sub(r'([1-9]|[1-9][0-9])\s*([((]\s*\d{1,2}[.\s\d]*?分\s*[))])\s*[、..、]', r"\1" + "、" + r"\2", html2txt) # =====图片的处理===== # 1>>根据图片宽高的异常值判断删除隐藏图片 def sub1(ss): if float(ss.group(1)) <= 3 and float(ss.group(2)) <= 3: return "" else: return ss.group(0) html2txt = re.sub(r'', sub1, html2txt) # 2>>将图片中带有的汉字去掉 html2txt = re.sub(r'(', r"\1 />", html2txt) # 将">换为" /> html2txt = re.sub(r'(', r"\1 />", html2txt) # 将">换为" /> # 3>>建立图片id字典,对原图片信息第一次替换 html2txt = re.sub(r'( src=".*?files)\\image', r"\1/image", html2txt) all_image = re.findall(r'', html2txt) src2subs = {} subs2src = {} for src in all_image: # 校本题库上传的图片名称是随机数,故设置映射 # kk = re.search('( 20: mathpix = "" w_h_info = re.search(r' 10: image_id = image_id[-10:] src2subs[src] = '', r"\1
" + "\n", html2txt, flags=re.S) # >>>>>>
\s*(?t[drh]( .*?")?>|?table>|?tbody>)\s*
', r"\1", v) v = re.sub(r'[\n\s]*|\s|
|\n)*\s*
)[\s\n]*?(|\s)+', r"\1", v, flags=re.S)
# 暂时还有table标签首尾的换行没去掉
subs2table[" | |
| 替换回去
if subs2table:
con_list = [re.sub(r"|".join(subs2table.keys()), lambda x: subs2table[x.group()], ii) for ii in con_list]
# 剩余个别标签处理
con_list = [re.sub(r"^<([a-z]+)>[\s\t\n]*\1>$", "", i.strip()) for i in con_list] # 2020/4/7,14
con_list = [re.sub(r"^(
)(||
",
r"\3、\4", i.strip())
for i in con_list]
# 把最后可能还存在的?p>或考号信息去掉
con_list = [re.sub("?p>|[…O•.\s]*?密[…O•.\s]*?封[….O•\s]*?装?[…O•.\s]*?订?[….O•\s]*?线?[….O•\s]*?$"
"|((学校|班级|姓名|座位号|准考号|[学考]号)[\s::_]*?){2,}$", "", i.strip()) for i in con_list]
# =====答案行格式处理====
temp_list = [re.split(r"^((\s*]*?>|?tr>)+?(.|\n)+?([一二三四五六七八九十])\s*[、..、]\s*(.{2,4}题)(.|\n)+?
",sr.get("stem", "").lstrip()))
# 将选择题和填空题中的题干中出现答案的情况 去掉答案
kuo_con1 = re.search(r'([是为]|等于|[==有]|表示)\s*[((]\s*([A-Zc][A-Zc;;和与、、\s]*?)[))]\s*(.?($|\n|
|))", sr["stem"])
if sr['type'].replace("题", "") in ["单选", "多选", "选择", "不定选择"]:
# sr["type"] = "选择"
# 针对选择题在题文中已给出答案的处理
if kuo_con1:
sr["stem"] = sr["stem"].replace(kuo_con1.group(0), kuo_con1.group(1) + "( )" + kuo_con1.group(3))
sr["key"] = kuo_con1.group(2).replace("c", "C") if not sr["key"] else sr["key"]
elif kuo_con2:
sr["stem"] = sr["stem"].replace(kuo_con2.group(0), "( )" + kuo_con2.group(2))
sr["key"] = kuo_con2.group(1).replace("c", "C") if not sr["key"] else sr["key"]
# sr['options_text'] = ""
elif sr['type'] == '填空题':
# sr["type"] = "填空"
ans_list = []
# 针对填空题在题文中已给出答案的处理
sub_n = 0
while re.search(pattern1, sr["stem"]):
blank_con1 = re.search(pattern1, sr["stem"])
sr["stem"] = sr["stem"].replace(blank_con1.group(0),
blank_con1.group(1) + "____" + blank_con1.group(5))
ans_list.append(blank_con1.group(2))
sub_n += 1
if sub_n > 5:
break
while re.search(pattern2, sr["stem"]):
blank_con2 = re.search(pattern2, sr["stem"])
# 这里的限制条件易出错,可以再判断一下
sr["stem"] = sr["stem"].replace(blank_con2.group(0),
blank_con2.group(1) + "____" + blank_con2.group(4))
ans_list.append(blank_con2.group(2))
if re.findall(r"_{2,}", sr["stem"]):
sr["blank_num"] = len(re.findall(r"_{2,}", sr["stem"]))
if not sr["key"] and ans_list:
sr["key"] = "; ".join(ans_list)
# 已知题型是错误的情况,如解答题,放在填空题中
if 'blank_num' not in sr and re.search("_+([^_]*?)_+", sr['stem']) is None:
sr['errmsgs'].append("填空题题干中没有下划线(__),与题型(填空题)不符")
# stem_c = re.sub("|[,,.。.、、]", "", sr["stem"])
# if len(stem_c) > 2: # 不自动纠错
# sr["type"] = "解答题"
# sr["type"] = "解答"
# else: # 大题题型先不做范围判断
# if sr['type'] and sr['type'].replace("题", "") not in ["解答", "计算", "实验", "作图"]:
# sr["type1"] = "解答"
# else:
# sr["type1"] = sr['type'].replace("题", "")
# if "is_optional" not in sr:
# sr["is_optional"] = is_optional
# sr["option_str"] = ""
# 换行符处理!
sr["stem"] = sr.get("stem", "").strip().replace("\n\n", "\n").replace("\n", "
") # 2020/4/10 gai
# sr["stem"] = get_equation_instr(sr["stem"])
if "options" in sr: # 对选项部分进行格式处理
for i in range(len(sr['options'])):
sr['options'][i] = get_simpstr2eqn(sr['options'][i].strip()).replace("\n\n", "\n").replace("\n", "
")
# sr['options'][i] = get_equation_instr(sr['options'][i].strip()).replace("\n\n", "\n").replace("\n", "
")
if "slave" in sr and sr["slave"]:
# 带小题的大题,格式处理,高中数学没有这一功能
for s in sr["slave"]:
s["stem"] = s.get("stem", "").strip().replace("\n\n", "\n").replace("\n", "
")
# 已分小问了的题号,是不会带小题号的,故不需要替换
# s["stem"] = re.sub(r"[((]\s*(\d|ⅰⅱⅲⅳ|i{1,3})\s*[))]|[①②③④]\s*(?![+-])", "", s["stem"][:5]) + s["stem"][5:]
s["parse"] = s.get("parse", "").strip().replace("\n\n", "\n").replace("\n", "
")\
.replace("解答:解:", "解答:").replace("解答:解:", "解答:")
s["key"] = s.get("key", "").strip().replace("\n\n", "\n").replace("\n", "
")
# sr["slave"] = sr.get("slave", "").replace("\n", "
")
if "answer_type" in s:
s["answer_type"] = configs.answer_type[s["answer_type"]]
else:
# s["parse"] = css_conflict_deal(s["parse"]) # "css 冲突标签处理"
sr["parse"] = sr.get("parse", "").lstrip().replace("\n\n", "\n").replace("\n", "
")
sr["parse"] = re.sub("^【解[答析]】\s*", "", sr["parse"])
# sr["parse"] = get_equation_instr(sr["parse"])
sr["key"] = sr.get("key", "").lstrip().replace("\n\n", "\n").replace("\n", "
")
# sr["key"] = get_equation_instr(sr["key"])
if "answer_type" in sr:
sr["answer_type"] = configs.answer_type[sr["answer_type"]]
if not sr["parse"] and not sr["key"]: # 答案和解析都没有
# sr["parse"] = "略"
# sr["key"] = "略"
sr['errmsgs'].append("本题缺少答案和解析")
elif not sr["key"] and sr["parse"]:
sr["key"] = "" # 见解析
elif re.sub("见解析|略|空|无|没有|答案", "", sr["key"]) and not sr["parse"]:
sr["parse"] = "略"
# if "本选做题缺少解析" not in sr['errmsgs'] and "本题缺少解析" not in sr['errmsgs']:
# sr['errmsgs'].append("本题缺少解析")
# 辅助标签处理
# sr["analysis"] = ""
if "analy" in sr: # 存在题目分析时,将其放在解析里
sr["analy"] = sr.get("analy", "").strip().replace("\n\n", "\n")
if len(sr["analy"].replace(" ", "")) >= 10:
sr["parse"] = "【分析】"+sr["analy"].replace("\n", "
") + "
【详解】" + sr["parse"]
del sr["analy"]
if "chapter" in sr: # 如选修4-5:不等式选讲
if sr['item_id'] + 1 <= len(res_dict):
chapter_no[sr['item_id']] = sr["chapter"]
del sr["chapter"]
# 是否为选做题"is_optional",两种形式不会同时出现
if "option_st" in sr: # 带有此标签的后面的题目都是选做题option_score
# option_st = sr['item_id']
# is_optional = True
# if "," in sr["option_st"]:
# option_score = int(sr["option_st"].split(",")[-1])
del sr["option_st"]
# elif sr['type'] == '选做题': # 题型是选做题 如五、选做题
# select_type_id.append(sr['item_id'])
# sr['is_optional'] = 'true'
# sr['score'] = option_score
# elif "type1" in sr and sr["type1"] == "解答" and "is_optional" not in sr:
# sr["is_optional"] = is_optional
# if is_optional:
# sr['score'] = option_score
# if "type1" in sr:
# del sr["type1"]
# 题型纠正
# 将选择题改为单选或多选,"is_multiple_choice"
sr['type'] = re.sub("([单多])项选择题?", r"\1选题", sr['type'])
sr['type'] = sr['type'].replace("题题", "题") # .replace("简答", "解答")
# sr['type'] = re.sub("(计算|简答)题?", "解答题", sr['type'])
if sr['type'] in ["选择", "选择题"]: # 有的科目只有选择题,不分单选和多选
if len(re.findall("[A-Z]", sr["key"])) > 1:
sr['type'] = '多选题'
elif len(re.findall("[A-Z]", sr["key"])) == 1:
sr['type'] = '单选题'
elif "数学" in subject or "物理" in subject:
sr['type'] = '单选题'
info_x = re.search("^[((](多)选题?[))]", sr["stem"].replace(" ", ""))
if info_x:
sr['type'] = '{}选题'.format(info_x.group(1))
if sr['type'] == '多选题':
if len(re.findall("[A-Z]", sr["key"])) == 1:
sr['errmsgs'].append("本题答案个数与题型(多选题)不符")
# sr["is_multiple_choice"] = 'true'
elif sr['type'] == '单选题':
# sr["is_multiple_choice"] = 'false'
if "options" in sr and len(sr["options"]) > 4:
sr['errmsgs'].append("选项个数多于4个,与题型(单选题)不符")
if len(re.findall("[A-Z]", sr["key"])) > 1:
sr['errmsgs'].append("本题答案个数与题型(单选题)不符")
elif sr['type'] == '不定选择题':
if len(re.findall("[A-Z]", sr["key"])) > 1:
sr['type'] = '多选题'
elif len(re.findall("[A-Z]", sr["key"])) == 1:
sr['type'] = '单选题'
elif "数学" in subject or "物理" in subject:
sr['type'] = '单选题'
else:
sr['type'] = '选择题'
if "缺少答案" not in "".join(sr['errmsgs']):
sr['errmsgs'].append("本题缺少答案")
elif "数学" in subject:
if sr['type'].replace("题", "") == "填空":
if sr['blank_num'] > 1:
sr['type'] = "多空题"
else:
sr['type'] = "单空题"
elif sr['type'].replace("题", "") not in ["单空", "多空"]:
sr['type'] = "解答题"
# elif "物理" in subject:
# # 用第一版模型预测
# content = sr['stem']
# if "options" in sr and sr["options"]:
# content+= "\n" + "\n".join(["{}、{}".format(chr(ord('@') + idm + 1), option)
# for idm, option in enumerate(sr["options"])])
# try:
# r = requests.post(url=configs.phy_topicType_ip,
# json={"content": content, "period": "高中",
# "topic_type": sr['type']})
# sr['type'] = r.json()["res"]
# if sr['type'] == "简答题":
# sr['type'] = "解答题"
# except Exception as e:
# print(e)
# if sr['type'].replace("题", "") in ["单空", "多空", "填空"]:
# sr['type'] = "填空题"
# else:
# sr['type'] = "解答题"
elif sr['type'].replace("题", "") in ["单空", "多空", "填空"]:
sr['type'] = "填空题"
elif sr['type'] not in ["选择", "选择题"]:
sr['type'] = "解答题"
content = sr['stem']
if "options" in sr and sr["options"]:
content += "\n" + "\n".join(["{}、{}".format(chr(ord('@') + idm + 1), option)
for idm, option in enumerate(sr["options"])])
all_content_str_list.append(content)
topic_type_list.append(sr['type'])
# """按照原先高中数学解析的最后输出格式整理输出"""
# sr["type"] = sr['type'].replace("非选择", "解答").replace("题题", "题") #
sr["topic_num"] = sr['item_id']
sr['errmsgs'] = ";".join(sr['errmsgs'])
sr["parse"] = re.sub(r"试题【([分解]析)】", r"试题\1:", sr["parse"]) # 解析
sr["key"] = re.sub("([;;]|
)\s*$", "", sr["key"])
if 'susp_pic' in sr:
del sr['susp_pic']
if 'is_optional' in sr:
del sr['is_optional']
if 'spliterr_point' in sr:
del sr['spliterr_point']
if 'score' in sr:
del sr['score']
del sr['item_id']
# ---------------------字符串公式处理--------------------------------
# sr["stem"] = get_equation_instr(sr["stem"])
# sr["key"] = get_equation_instr(sr["key"])
# sr["parse"] = get_equation_instr(sr["parse"])
# if "options" in sr:
# sr["options"] = list(map(get_equation_instr, sr["options"]))
# ----------------------------------------------------------------
# 物理题型批量调接口:节约时间
if "物理" in subject:
t1 = time.time()
epoches = int(len(all_content_str_list) / 10)
pred_topic_types = []
if epoches > 0:
last = 0
for epoch in range(epoches):
input_data = {"content": all_content_str_list[last:(epoch+1)*10], "period": "高中",
"topic_type": topic_type_list[last:(epoch+1)*10]}
last = (epoch+1)*10
try:
r = requests.post(url=configs.phy_topicType_ip, json=input_data)
pred_topic_types.extend(r.json()["res"])
except Exception as e:
print(e)
pred_topic_types.extend([""]*10)
rest_con = all_content_str_list[last:]
rest_topic_type = topic_type_list[last:]
else:
rest_con = all_content_str_list
rest_topic_type = topic_type_list
if rest_con:
input_data = {"content": rest_con, "period": "高中", "topic_type": rest_topic_type}
try:
r = requests.post(url=configs.phy_topicType_ip, json=input_data)
pred_topic_types.extend(r.json()["res"])
except Exception as e:
print(e)
pred_topic_types.extend([""] * len(rest_con))
# 将预测题型替换到res_dict中
if any([True for i in pred_topic_types if i]) and len(pred_topic_types) == len(res_dict):
for idx, pred_type in enumerate(pred_topic_types):
if pred_type and res_dict[idx]['type'] in ["填空题", "解答题"]:
if pred_type == "简答题":
pred_type = "解答题"
res_dict[idx]['type'] = pred_type
logger.info("----【paper_id:{}】采用题型预测服务花费time:{}".format(paperid, time.time() - t1))
# --------------------------------------------------------------
# 换行符替换
convert_huanhang(res_dict)
# ------------------------------------------------------------------------
# if chapter_no: # 章节标签下移一位
# for c, v in chapter_no.items():
# res_dict[c]["chapter"] = v
# 选做题"option_str"处理
# if select_type_id:
# for s in select_type_id:
# if len(select_type_id) == 2:
# res_dict[s - 1]['option_str'] = "2选1"
# elif len(select_type_id) == 4:
# res_dict[s - 1]['option_str'] = "4选2"
# else:
# res_dict[s - 1]['errmsgs'] += ";
选做题不是“2选1”和“4选2”类型"
# if option_st:
# print("option_st:", option_st)
# for s in range(option_st, len(res_dict)):
# if (len(res_dict) - option_st) == 2:
# res_dict[s]['option_str'] = "2选1"
# elif (len(res_dict) - option_st) == 4:
# res_dict[s]['option_str'] = "4选2"
# else:
# res_dict[s]['errmsgs'] += ";
选做题不是“2选1”和“4选2”类型"
# 再解析中的新图片上传腾讯云
# 再设置一个入库接口,点击入库,才开始从本地上传图片
return res_dict
# Tag written in place of every "\n" in the finished text.
# NOTE(review): the inline markup in this file was stripped at some point,
# which left the string literals of this function split across physical
# lines (a syntax error).  "<br/>" is the presumed original spelling of the
# lost tag -- it may also have been "<br>" or "<br />"; confirm against the
# front end before relying on it.
_LINE_BREAK_TAG = "<br/>"
# A run of line-break tags and/or newlines, in any mix.
_LINE_BREAK_RUN = re.compile(r"(%s|\n)+" % re.escape(_LINE_BREAK_TAG))


def convert_huanhang(items_list):
    """Recursively replace newlines with the HTML line-break tag.

    The argument is walked depth first:

    * ``list``  - every element is replaced, in place, by its converted value.
    * ``dict``  - every value is replaced, in place, by its converted value.
      The one exception is a ``str`` stored under ``"answer_type"``, which is
      looked up in ``configs.answer_type`` instead of being converted.
    * ``str``   - surrounding whitespace is stripped, every run of newlines
      and/or line-break tags is collapsed to a single separator, and the
      remaining newlines become the line-break tag.
    * anything else is returned untouched.

    :param items_list: a string, list, dict or any other value.
    :return: the converted string for ``str`` input; otherwise the same
        (mutated) container, or the unchanged value.
    """
    if isinstance(items_list, str):
        text = items_list.strip().replace("\n\n", "\n")
        # Collapse each run to its last separator so no blank lines survive.
        text = _LINE_BREAK_RUN.sub(r"\1", text)
        return text.replace("\n", _LINE_BREAK_TAG)

    if isinstance(items_list, list):
        for idx, element in enumerate(items_list):
            items_list[idx] = convert_huanhang(element)
        return items_list

    if isinstance(items_list, dict):
        # Only existing keys are re-assigned, so iterating items() is safe.
        for key, value in items_list.items():
            if key == "answer_type" and isinstance(value, str):
                items_list[key] = configs.answer_type[value]
            else:
                items_list[key] = convert_huanhang(value)

        has_sub_items = bool(items_list.get("slave"))
        if (items_list.get("answer_type") == 2
                and not has_sub_items and "stem" in items_list):
            # NOTE(review): as it stands this substitution rewrites every run
            # of three or more underscores to itself.  The replacement string
            # presumably wrapped the blank in markup that was stripped from
            # this file; restore it from version control if available.
            items_list["stem"] = re.sub(r"(__{2,})", r'\1',
                                        items_list["stem"])
        return items_list

    return items_list
def css_conflict_deal(item):
"""
针对", ">") # 2021-8-24
# item = re.sub("<(?!img src)", "<", item) # 还有表格
item = item.replace("$<$", "【*_*】") # 多次单题解析时会出现$<$
item = re.sub(r"<(/?su[bp]|br\s*/?|/?table( .*?)?|/?tbody( .*?)?|/?t[rhd]( .*?)?)>", r"【\1】", item)
if re.search(r"(?", item)
item = re.sub(r"(
\s*|\n\s*)+<(/?table( .*?)?|/?tbody( .*?)?|/?t[rhd]( .*?)?)>\s*(
\s*|\n\s*)+",
r"<\2>", item)
item = item.replace("$<$span class=", " item_list[i]:
add_n += 1
else:
break
return add_n
# def find_seq_num(num_list):
# """
# 针对切分题号时切错的序号进行纠正,考虑序号是连续且正常的情况下
# 将连续的数字进行分组
# :param num_list:输入[3, 4, 8, 9, 12, 13, 14]
# :return: [[3, 4],[8, 9],[12, 13, 14]]
# """
# seq_ranges = []
# for k, g in groupby(enumerate(num_list), lambda x: x[0] - x[1]):
# group = (map(itemgetter(1), g))
# group = list(map(int, group))
# seq_ranges.append(group)
# return seq_ranges
# def del_exception_value(item_list):
# """
# 去列表中的异常值,题目越多,越容易突出异常值
# :return:
# """
# import numpy as np
# max_v = max(item_list)
# arr_mean = np.mean(item_list) # 均值
# arr_var = np.var(item_list) # 方差
# while max_v > len(item_list)+4:
# item_list.remove(max_v)
# print(item_list)
# arr_mean = np.mean(item_list) # 去最大值后的均值
# arr_var = np.var(item_list) # 去最大值后的方差
# max_v = max(item_list)
# # print("均值与方差:",arr_mean,arr_var)
# if abs((item_list[-1] - item_list[0] + 1) - len(item_list)) <= 3:
# return item_list
# else:
# exception_value = []
# for i in item_list:
# # print(abs((i - arr_mean) / arr_var), i)
# if(abs((i - arr_mean)/arr_var)) > 0.3:
# exception_value.append(i)
# right_seq = [i for i in item_list if i not in exception_value]
# return right_seq
def pic_transfer(con_list):
aft_opt = [] # 针对选项后是题目图片的情况,进行移位
if "\n" in con_list[-1]:
ccon = re.split("\n+", con_list[-1])
while re.match("0 and v['item_id'] - item_list[k-1]['item_id']>1:
# if
if __name__ == '__main__':
# -------------生成requirements.txt---------------
# pip freeze > requirements.txt
# import os, sys
#
# project_root = os.path.dirname(os.path.realpath(__file__)) # 找到当前目录
# print(project_root)
#
# # 找到解释器,虚拟环境目录
# python_root = sys.exec_prefix
# print(python_root)
#
# # 拼接生成requirements命令
# command = python_root + '\Scripts\pip freeze > ' + project_root + '\\requirements.txt'
# print(command)
#
# # 执行命令。
# os.system(command)
# ----------------一键安装 requirements.txt------------
# pip install -r requirement.txt
# python_root + '\Scripts\' + pip install -r requirements.txt
# import os
# rrr=os.path.basename(r"http:/pstatic.dev.xueping.com/data/word/2020/08/12/5f338d18e2cce.docx")
# print(rrr)
# item = "