#!/usr/bin/env python
# -*- coding:utf-8 -*-
# 本文件包含以下函数
# table_label_cleal:去掉表格中的换行符
# html_cleal :html文件清洗
# huanhang_wash_after: 处理最终结果多余的换行符
import datetime
import random
import re
from operator import itemgetter
from itertools import groupby
from PIL import Image
# import ps_configs as config
from pprint import pprint
import base64, os, random
import time, hashlib
# UPLOAD_FOLDER = config.UPLOAD_FOLDER
import configs
from utils.field_eq2latex import get_latex
def table_label_cleal(con):
"""
去掉表格中的【换行符】
"""
# print(con)
# print('------------------------------------------')
con = re.sub(r"\n(\s|\n|\t)+", "\n", con)
count = 1
while re.search(r"?[a-z]+>\n(?[a-z]+>|
)", con, re.S) and count <= 10:
con = re.sub("(?t[dr]>|?table>|?tbody>|?div>)\n(?t[dr]>||?table>|?tbody>| )",
r"\1\2", con, flags=re.S)
con = re.sub(r'(?t[rd]>)\n( | )', r'\1\2', con, flags=re.S)
count += 1
# if re.search(r"", con, re.S|re.M):
# aa = re.search(r"()", con, re.S|re.M)
# con = con.replace(aa.group(1),aa.group(1).replace("\n",""))
# 将空表格的情况去掉
con = re.sub(r'[\s\n\t]*?[\s\n\t]*?([\s\n\t]*?[\s\n\t]*? [\s\n\t]*? '
r'[\s\n\t]*? | [\s\n\t]*? [\s\n\t]*?)+[\s\n\t]*? [\s\n\t]*?', "", con, flags=re.S)
con = re.sub(r'( )\s*([((]\s*\d\s*[))])', r'\1\n\2', con)
return con
# 标签清洗
def html_cleal(html, img_url, is_reparse):
sub_list = ["?div>", "?b>", "?caption>", "?center>", "?cite>", "?code>", "?colgroup>",
"?menu>", "?dd>", "?dir>", "?li>", "?em>", "?article>", "?header>", "?ruby>",
"?summary>", "?details>", "?strong>", "?strike>", "?small>", "?select>",
"?section>", "?script>", "?[su]>", "?var>", "?ul>", "?tt>", "?title>", "?thead>",
"?tfoot>", " ", " ",""]
sub_dd = {'×': '×',
'÷': '÷',
'°': '°',
'·': '·',
'±': '±',
'º': 'º',
'¹': '¹',
'²': '²',
'³': '³',
'½': '1/2',
'¼': '¼',
'¾': '¾',
'¥': '¥',
'm³': 'm³',
'<': '<',
'£': '£',
'∠<': '<',
'>': '>',
"A": "A",
"А": "A",
"Α": "A",
"B": "B",
"В": "B",
"в": "B",
"Β": "B",
"C": "C",
"С": "C",
"c": "c",
"с": "c",
"D": "D",
"Ε": "E",
"E": "E",
"F": "F",
"G": "G",
"g": "g",
"m": "m",
"N": "N",
"s": "s",
"t": "t",
"/": "/",
"=": "=",
"-": "-",
"2": "2",
' ': ' ',
' ': ' ',
"〖": '【',
"〗": '】',
"題": '题',
"单项选择": '单选',
"多项选择": '多选',
"不定项选择": '选择',
"双项选择": '选择',
}
# 再解析中,将二进制图片进行转化,图片怎么保存比较好,先再“天数”建立文件夹
if is_reparse:
# 按“天数”建立文件夹
time_str = datetime.datetime.strftime(datetime.datetime.now(), '%Y_%m_%d')
file_path = os.path.join(configs.IMG_FOLDER, time_str)
if not os.path.exists(file_path):
os.makedirs(file_path)
# 统计所有base64编码
all_base64_image = re.findall('', str(html))
for n, img in enumerate(all_base64_image):
img1 = img.split(",")
img_tape_info = re.search("data:image/(.+?);base64", img1[0])
img_tape = img_tape_info.group(1) if img_tape_info else ""
img_data = base64.b64decode(str(img1[-1]))
if img_tape:
# save_path = os.path.join(configs.new_img_ip, get_md5(n)+"."+img_tape)
img_name = "image" + get_md5(n) + "." + img_tape
save_path = os.path.join(file_path, img_name)
img_path = configs.new_img_ip + '/' + time_str + '/' + img_name
# img_file_count = 0
# if os.listdir(configs.IMG_FOLDER):
# img_file_count = max([int(i) for i in os.listdir(UPLOAD_FOLDER)]) + 1
with open(save_path, 'wb') as f:
f.write(img_data)
new_img = ''
html = html.replace(img, new_img)
# -------------------------------------------------------------------------------------
# 特殊符号处理
html2txt = re.sub(r"|".join(sub_list), "", str(html)) # ("", " ") #2020/4/7
html2txt = re.sub("|".join(sub_dd.keys()), lambda x: sub_dd[x.group()], html2txt) # 2020/4/1,4/7,4/20
html2txt = html2txt.replace(r"\\[{\\text{V}}V\]", "Ⓥ").replace(r"\\[{\\text{A}}A\]", "Ⓐ") \
.replace(r"\\[{\\text{W}}W\]", "Ⓦ").replace(r"\\[{\\text{X}}X\]", "Ⓧ").replace(r"\\[{\\text{G}}G\]", "Ⓖ") \
.replace("\uf067", "γ").replace('', "γ").replace('\uf020', "").replace("\u3000", " ").replace("\u2003", " ") \
.replace("\x7f", " ").replace("\xa0", "")
# 域公式的转化处理
html2txt = get_latex(html2txt).replace("【域公式】", "")
# \可以在前端显示,不需要用latex渲染
# 处理
html2txt = re.sub(" ", "\n", html2txt)
# 题型行的统一处理
# ---->>>>>题型行可能放在表格中
if len(re.findall("", html2txt)) >= 6: # 这个限制还不太严谨
for tt in re.finditer('(((?!(?tr>)).)*) ', html2txt, re.S):
tt_list = re.split(r'|', tt.group(1))
tt_list = [col for col in tt_list if col.strip()]
if " ".join(tt_list).replace(" ", "") == '得分评卷人':
html2txt = html2txt.replace(tt.group(0), "")
else:
html2txt = html2txt.replace(tt.group(0), " " + " ".join(tt_list) + " ")
html2txt = re.sub(r"?tbody>|?table>|?div>", "", html2txt)
html2txt = re.sub(r"( | )\s*([一二三四五六七八九十]\s*[、..、]?.{2,6}题)", r"\1\2", html2txt)
html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,]\s*(论述|填空|探究)题?[与和、、,,\s]*?(计算题|实验题)', r"\1、\3", html2txt)
html2txt = re.sub(r"\s*([一二三四五六七八九十])\s*[、..、,,]?\s*(计算|[解简]答|实验|作图)题?[与和、、,,\s]*?(计算|[解简]答|实验|作图)", r" \1、\2题", html2txt)
html2txt = re.sub(r' (([一二三四五六七八九十])\s*[、..、,,]\s*(.{2,4}题)\s* ) | [^p]*?', r"\1", str(html2txt), flags=re.S)
html2txt = re.sub(r' \s*[((]\s*[一二三四五六]\s*[))]\s*必考题\s*(.?|.+?分\s*[.。.]?)\s* ', "", html2txt)
html2txt = re.sub(r'\s*[((]\s*[一二三四五六]\s*[))]\s*选考题\s*.?\s*.{,4}(?', r" 【选做题】:'\1' ", html2txt)
html2txt = re.sub(r'\s*[((]\s*[一二三四五六]\s*[))]\s*选考题\s*(.?|.+?分\s*[.。.]?)\s* ', "【选做题】 ", html2txt)
html2txt = re.sub(r'\s*([一二三四五六七八九十])\s*[、..、,,]?\s*(单项?选择?|选择|多项?选择?|填空|计算|[解简]答|实验|作图)题?\s* ',
r"\1、\2题 ", html2txt)
html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,]?\s*(单选|单项选择|选择|多选|多项选择|填空|计算|[解简]答|实验|作图)\s*(?!题)([((]\s*本题|.*?\d分)',
r"\1" + "、" + r'\2' + "题" + r"\3", html2txt)
html2txt = re.sub(r'([一二三四五六])\s*[、..、,,]?\s*(单选|单项选择|选择|多选|多项选择|填空|计算|[解简]答|实验|作图)题',
r"\1" + "、" + r'\2' + "题", html2txt)
html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,]?\s*[((]\s*本大题(.*?选项中)', r"\1" + "、" + "选择题", html2txt) # + r"\2"
html2txt = re.sub(r'\s*([一二三四五六七八九十])\s*[、..、,,]?\s*[((本大题]*?(.*?选项中)', r"\1" + "、" + "选择题", html2txt)
html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,]?\s*([((]\s*(每小题|本大?题)((?!(选项)).)+?[))]|综合题)',
r"\1" + "、" + "解答题", html2txt)
html2txt = re.sub(r'(?)\s*([一二三四五六七八九十]\s*[、..、,,]?\s*(单项?选择?|选择|多项?选择?|填空|计算|[解简]答|实验|作图)题)',
r' \n\1', html2txt)
html2txt = re.sub(r' \s*([一二三四五六七八九十])\s*[、..、,,]?\s*[((]?本?大?题((?!(选项)).)+?[))]?\s* ', r"\1、本大题 ", html2txt)
# html2txt = re.sub(r'\s*[^一二三四五六七八九十]{,3}\s*[、..、]\s*(选择|单选|多选|计算|[解简]答|实验|作图)题', r" 一、\1题", html2txt)
# 答案解析关键字的统一处理
html2txt = re.sub(r'【\s*(\s*(解\s*[::])', r" 【解答】", str(html2txt))
html2txt = re.sub(r'【[^【】]*?(答案|[解分][析答]|详解|点[评睛])[^【】]*?】】', r"【\1】", str(html2txt))
html2txt = re.sub(r'(答案|解析|解答|详解|点评|点睛|考点|专题)\s*[::]', r"【\1】", str(html2txt))
html2txt = re.sub(r'(\n|^)\s*(分析)\s*[::]', r"【\2】", str(html2txt))
if "【解析】" not in html2txt and "【解答】" in html2txt and "【分析】" not in html2txt:
html2txt = re.sub(r'【解答】', "【解析】", str(html2txt))
# 其他关键字的处理
html2txt = re.sub(r' \s*(类型|知识点|考查角度|拔尖角度)[一二三四五六七八九十\d+][^p]*? ', "", str(html2txt))
html2txt = re.sub(r'\s*(选修[\d-]*?[::].{2,15})\s* ', r"【章节】\1 ", html2txt)
html2txt = re.sub(r'\s*([一二三四五六]\s*[、..、]?)?\s*(\[.{2}-*?选修[\d-]*?.*?\])\s*([((]\d+分[))])?\s* ',
r"【章节】\2 ", html2txt)
html2txt = re.sub(r'\s*(基础|中档|综合)题[^p题]*? |\s*【(考点|专题)】[^p]*? ', "", str(html2txt))
html2txt = re.sub(r'\s*(基础训练|提升训练|探究培优) ', "", str(html2txt))
html2txt = re.sub(r'注意事项[::]\s* (\n+\s*\s*\d\s*[、..、][^/]+? ){1,}', "", html2txt, flags=re.S)
html2txt = re.sub(r'注意事项[::]\s*\d\s*[、..、][^/]+? (\n+\s*\s*\d\s*[、..、][^/]+? ){1,}', "", html2txt, flags=re.S)
html2txt = re.sub(r'[((]\s*([A-Z\dⅠⅡⅢⅣⅤ]+|IV)\s*[))]', r"(\1)".replace(" ", "").replace("(IV)", "Ⅳ"), html2txt)
html2txt = re.sub(r'[((](\s*\d\s*\d?\s*分?\s*)[))]', "(" + r'\1'.replace(" ", "") + ")", html2txt)
html2txt = re.sub(r'\[来源:.*?\]', "", html2txt)
html2txt = re.sub('欢迎访问.*? ', '', html2txt)
html2txt = re.sub('w\s*w\s*w\..*?(\.\s*c\s*o\s*m|\.cn)+|(?', r'<\1>', html2txt)
html2txt = re.sub('\s*第\s*[二三四ⅡⅢⅣ]\s*(卷|部分)\s*([((].*?[))]|非?选择题.{,8})?\s* ', "\n", html2txt)
# 选项的处理
html2txt = re.sub(r'(\s*([1-9]|[1-4][0-9])\s*[..、、].+?[((]\s*[))])\s*(A\s*[..、、][^/]*? )',
r"\1\n\3", str(html2txt))
# 根据图片宽高的异常值判断删除隐藏图片---------------------------------------
# a = re.search(r'', html2txt, re.S)
# while a and float(a.group(1)) <= 2 and float(a.group(2)) <= 2:
# print(a.group(1))
# html2txt = html2txt.replace(a.group(0), "")
# a = re.search(r'', html2txt, re.S)
def sub1(ss):
if float(ss.group(1)) <= 2 and float(ss.group(2)) <= 2:
return ""
else:
return ss.group(0)
html2txt = re.sub(r'',sub1, html2txt)
# -------------------------------------------------------------
# 将图片中带有的汉字去掉
html2txt = re.sub(r'(\s*))(([1-9]|[1-4][0-9])\s*[、..、])', r"\1 \n\3", html2txt)
html2txt = re.sub(r'(?p>\s*(\s*)?([1-9]|[1-4][0-9]))\s*([((]\s*(\d{1,2}[.\s\d]*?分|.{2,3}题?)\s*[))]|解析?\s*[::]|【解析】)',
r" \1、\4", html2txt)
html2txt = re.sub(r"\s*([1-9]|[1-4][0-9])\s*([((]20\d{2}\s*[\u4e00-\u9fa5、、]{2,9}[))])", r" \1、\2", html2txt)
html2txt = re.sub(r" \s*([1-9]|[1-4][0-9])\s*(【(解析?|答案?)】|(解析?|答案?)\s*[::])", r" \1、\2", html2txt)
html2txt = re.sub(r" \s*().)+?[/\"]>)\s*([1-9]|[1-4][0-9])\s*([((]20\d{2}\s*[\u4e00-\u9fa5、、]{2,9}[))])",
r" \1 "+"\n"+r"\3、\4", html2txt) # 【susp_img】
html2txt = re.sub(r'?p>((\s*\s*)?(\s*)?\s*)(([1-9]|[1-4][0-9])\s*[、..、])',
r" \1" + "\n" + r"\4", html2txt)
html2txt = re.sub(r"(((?! ).)+?(\s|[/\"]>))(([1-9]|[1-4][0-9])\s*[、..、].{,20}本[大小]?题\d+分)", r"\1 " + "\n" + r"\4",
html2txt)
html2txt = re.sub(r"?p>((\s*\s*)?(\s*)?((\s*\s*)?(\s*)?)*?\s*)"
r"\s*(([1-9]|[1-4][0-9])\s*[、..、])", r" \1" + "\n" + r"\7", html2txt)
html2txt = re.sub(r'( \s*[一二三四五六七八九十].*?题\s*\(.+?分.*?\))\s*(([1-9]|[1-4][0-9])\s*[、..、].*?) ',
r"\1\n\2 ", html2txt)
html2txt = re.sub(r'(\s*[一二三四五六七八九十].*?题\s*\(.+?分.*?\))\s*(([1-9]|[1-4][0-9])\s*[、..、].*?) ',
r"\1\n\2 ", html2txt)
html2txt = re.sub(r'(.*?[..]{6,}\s*\d+分)\s*(([1-9]|[1-4][0-9])\s*[、..、].*?) ', r"\1\n\2 ", html2txt)
html2txt = re.sub(r'([1-9]|[1-4][0-9])\s*([((]\s*\d{1,2}[.\s\d]*?分\s*[))])\s*[、..、]', r"\1" + "、" + r"\2", html2txt)
# 建立图片id字典,对原图片信息第一次替换
all_image = re.findall(r'', html2txt)
src2subs = {}
subs2src = {}
for src in all_image:
kk = re.search('(', src)
# if re.search(" data-latex=", src) and px_info and type(img_url) == str:
# if int(px_info.group(4)) < ref_v - 2: # 图片太小
# h_pt = (ref_v - 1) * 72 / 96
# w_px = int(px_info.group(3)) / int(px_info.group(4)) * (ref_v - 1)
# w_pt = w_px * 72 / 96
# new_src = new_src.replace('height="' + px_info.group(4), 'height="15') \
# .replace('width="' + px_info.group(3), 'width="' + str(w_px)) \
# .replace(px_info.group(2), 'style="width: ' + str(w_pt) + 'pt; height: ' + str(h_pt) + 'pt"')
# # .replace("<", "<").replace(">", ">") # replace(""", '"')
# elif int(px_info.group(4)) > ref_v + 2 and type(img_url) == 'str': # 公式图片太大或公式图片原本就大但被缩小的情况
# 第二种修改图片的方法:读取原图,获取大小
# ----------------------------------------------------------------------------------
# 图片信息简化替换
src_info = re.search(r' 20:
mathpix = ""
w_h_info = re.search('', src)
w_h = " w_h=" + w_h_info.group(2).split('.')[0] + "*" + w_h_info.group(3).split('.')[0] \
if w_h_info and not mathpix else "" # w_h 和 mathpix只存在一个
src2subs[src] = '"
subs2src['"] = new_src
for k, v in src2subs.items():
html2txt = html2txt.replace(k, v)
# print(src2subs)
# ------------------------------------------------------------------------
# html 转 list
html2txt = re.sub(r'(?div>||)(\n\s*)*?', r"\1 "+"\n", html2txt, flags=re.S)
con_list = sum([re.split(' |', i) if len(re.findall("|", i))>1 else [i] for i in
re.split(r" (?! | )|", html2txt)[:-1]], [])
con_list = [re.sub(r"^\n*\s*(|)+", "", ii) for ii in con_list]
# con_list = [re.sub(r"^\n*\s*(|)+", "", ii) for ii in
# re.split(r"
(?!)|", html2txt)[:-1]]
con_list = [re.sub(r"^<([a-z]+)>[\s\t\n]*\1>$", "", i.strip()) for i in con_list] # 2020/4/7,14
con_list = [re.sub(r"^
(.|\n)+?([一二三四五六七八九十])\s*[、..、]\s*(.{2,4}题)(.|\n)+?
", r"\2、\3", i.strip())
for i in con_list]
# 把最后可能还存在的?p>或考号信息去掉
con_list = [re.sub("?p>|[…O•.\s]*?密[…O•.\s]*?封[….O•\s]*?装?[…O•.\s]*?订?[….O•\s]*?线?[….O•\s]*?$"
"|((学校|班级|姓名|座位号|准考号|[学考]号)[\s::_]*?){2,}$", "", i.strip()) for i in con_list]
# 答案行格式处理
temp_list = [re.split("^((\s*\s*)+)", v.strip(), maxsplit=1)
if re.match(r'(\s*\s*)+?(参考|考试|试[题卷]|物理|理综|数学|化学|生物)(答案|解析|答案[及与和]评分(标准|意见|细则))\s*$'
r'|(\s*\s*)+?评分标准'
r'|(\s*\s*)+?(参考|考试|试[题卷])(答案|解析|答案[及与和]评分(标准|意见|细则))\s*(物理|理综|数学|化学|生物)?\s*$',
re.sub(r"[上下]?学[年期]|[\d—【】..、、::(())年\s]|[中大]学|模拟|[中高]考|年级|[学期][末中]|[高初][一二三]", "",
v.strip())) else [v] for v in con_list]
con_list = sum(temp_list, [])
# 对可能的题号的处理 如2、3、4、5、 加了【fei】 # 重新修改!!!!!!!!!!
con_list = [re.sub(r"^\s*([1-9][0-9]?\s*[..、、])", r"【fei】\1", i.strip())
if (len(re.findall(r"(^|\s*[..、、])\s*[1-9][0-9]?\s*[..、、]", i)) >= 3
and len(re.sub(r"[\d..、、\s]", "", i)) < 2) else i for i in con_list]
# print(con_list)
if con_list and re.search(r"[\u4e00-\u9fa5]", con_list[0]) is None:
con_list = con_list[1:]
while con_list and re.search(r"声明[::].*?著作权属.*?所有|(邮箱|用户|日期|QQ)\s*[::].+?", con_list[-1]):
con_list = con_list[:-1]
return con_list, subs2src
def del_no(item, item_no_type=1):
    """Strip the leading question number from ``item``.

    Args:
        item: Question text, optionally starting with newlines/whitespace
            followed by its number.
        item_no_type: Numbering style to remove. ``2`` removes a
            parenthesised number such as ``(3)`` together with one optional
            trailing separator. Any other value (default ``1``) removes a
            bare number that must be followed by a separator, e.g. ``3.``.

    Returns:
        ``item`` with the leading number removed, or ``item`` unchanged when
        no number in the range 1-49 is found at the start.
    """
    # Raw strings keep ``\s`` from being an invalid string escape. Half- and
    # fullwidth punctuation are both accepted; the fullwidth forms are written
    # as \u escapes so they survive text normalisation of this file:
    #   \uff08 \uff09  fullwidth parentheses
    #   \uff0e         fullwidth full stop
    #   \u3001 \uff64  ideographic comma / halfwidth ideographic comma
    #   \uff1a         fullwidth colon
    separators = r'[.\uff0e\u3001\uff64:\uff1a]'
    number = r'([1-9]|[1-4][0-9])'
    if item_no_type == 2:
        pattern = (r'^\n*\s*[(\uff08]\s*' + number
                   + r'\s*[)\uff09]\s*' + separators + r'?')
    else:
        pattern = r'^\n*\s*' + number + r'\s*' + separators
    return re.sub(pattern, "", item)
def html_cleal_test(htmlf): # 不用
html2txt = re.sub(r" ", "", htmlf.read()) # ("", " ")
# html2txt.replace("①", "(1).").replace("②", "(2).").replace("③", "(3).")
con_list = [re.sub(r"^\n+\s+", "", ii) for ii in html2txt.split("
")[:-1]]
# pprint(con_list)
if re.search(r"[\u4e00-\u9fa5]", con_list[0]) is None:
con_list = con_list[1:]
return con_list
def get_md5(image_id):
    """
    Build a 32-character hexadecimal MD5 name for an image.

    The hashed text is ``image_id`` followed by the current timestamp and a
    random float, so repeated calls with the same ``image_id`` yield
    different names. hashlib only accepts bytes, hence the text is encoded
    as UTF-8 before hashing.
    """
    seed_parts = (str(image_id), str(time.time()), str(random.random()))
    raw_name = "".join(seed_parts).encode("utf-8")
    return hashlib.md5(raw_name).hexdigest()
def huanhang_wash_after(res_dict):
"""
1.处理最终结果多余的换行符;2.对题文中已给答案的选择填空进行替换;3.选择题的细分
:param res_dict:
:struc_type:试卷类型,struc_type=1时为教师卷
:return:
"""
pattern1 = re.compile(r"([是为点]|等于|=|=|有|存在)\s*_+((|[^_;;。?!,])+?)(?|") # 2020/4/10 gai
if num == len(res_dict)-1:
end_con = sr["content"] + sr["parse"]
if len(re.findall(r"[\u4e00-\u9fa5]", end_con))>1000 and (len(re.findall("\n\s*([1-9]|1[0-9])\s*[..、、].+?",
end_con))>4 or len(re.findall("[((]\s*[))]|_{2,}", end_con))>6):
sr['errmsgs'].append("原试卷格式有问题,导致本题可能包含了很多非本题的题文")
if not re.sub("[(())\n\s]", "", sr["content"]):
sr['errmsgs'].append("本题没有题干,请检查题干格式是否正确")
# 把首尾的换行都去掉,php接收时会用换行来拼接
# sr["content"] = table_label_cleal(re.sub(r"\n\s*","
",sr.get("content", "").lstrip()))
# 将选择题和填空题中的题干中出现答案的情况 去掉答案
kuo_con1 = re.search('([是为]|等于|[==有]|表示)[((]([A-Z][A-Z;;和与、、]*?)[))](.?($|
|))", sr["content"].replace(" ", ""))
if sr['item_topic_name'].replace("题", "") in ["单选", "多选", "选择", "单项选择", "多项选择"]:
# sr["type"] = "选择"
# 针对选择题在题文中已给出答案的处理
if kuo_con1:
sr["content"] = sr["content"].replace(kuo_con1.group(0), kuo_con1.group(1)+"( )" + kuo_con1.group(3))
sr["answer"] = kuo_con1.group(2) if not sr["answer"] else sr["answer"]
elif kuo_con2:
sr["content"] = sr["content"].replace(kuo_con2.group(0), "( )" + kuo_con2.group(2))
sr["answer"] = kuo_con2.group(1) if not sr["answer"] else sr["answer"]
if "options" in sr: # 对选项部分进行格式处理
for i in range(len(sr['options'])):
sr['options'][i] = sr['options'][i].lstrip().replace("\n\n", "\n").replace("\n", "
")
# sr['options_text'] = ""
elif sr['item_topic_name'] == '填空题':
# sr["type"] = "填空"
ans_list = []
# 针对填空题在题文中已给出答案的处理
while re.search(pattern1, sr["content"]):
blank_con1 = re.search(pattern1, sr["content"])
sr["content"] = sr["content"].replace(blank_con1.group(0), blank_con1.group(1)+"____" + blank_con1.group(4))
ans_list.append(blank_con1.group(2))
while re.search(pattern2, sr["content"]):
blank_con2 = re.search(pattern2, sr["content"])
# 这里的限制条件易出错,可以再判断一下
sr["content"] = sr["content"].replace(blank_con2.group(0), blank_con2.group(1) + "____" + blank_con2.group(4))
ans_list.append(blank_con2.group(2))
if re.findall(r"_{2,}", sr["content"]):
sr["blank_num"] = len(re.findall(r"_{2,}", sr["content"]))
if not sr["answer"] and ans_list:
sr["answer"] = "; ".join(ans_list)
# 已知题型是错误的情况,如解答题,放在填空题中
if 'blank_num' not in sr and re.search("_+([^_]*?)_+", sr['content']) is None:
sr['errmsgs'].append("填空题题干中没有下划线(__),与题型(填空题)不符")
# stem_c = re.sub("|[,,.。.、、]", "", sr["content"])
# if len(stem_c) > 2: # 不自动纠错
# sr["item_topic_name"] = "解答题"
# sr["type"] = "解答"
else: # 大题题型先不做范围判断
if sr['item_topic_name'] and sr['item_topic_name'].replace("题", "") not in ["解答", "计算", "实验","作图"]:
sr["type1"] = "解答"
else:
sr["type1"] = sr['item_topic_name'].replace("题", "")
# if "is_optional" not in sr:
# sr["is_optional"] = is_optional
sr["option_str"] = ""
if "slave" in sr and sr["slave"]:
# 带小题的大题,格式处理,高中数学没有这一功能
for s in sr["slave"]:
s["content"] = s.get("content", "").strip().replace("\n\n", "\n").replace("\n", "
")
# 已分小问了的题号,是不会带小题号的,故不需要替换
# s["content"] = re.sub(r"[((]\s*(\d|ⅰⅱⅲⅳ|i{1,3})\s*[))]|[①②③④]\s*(?![+-])", "", s["content"][:5]) + s["content"][5:]
s["parse"] = s.get("parse", "").strip().replace("\n\n", "\n").replace("\n", "
")\
.replace("解答:解:", "解答:").replace("解答:解:", "解答:")
s["answer"] = s.get("answer", "").strip().replace("\n\n", "\n").replace("\n", "
")
# sr["slave"] = sr.get("slave", "").replace("\n", "
")
else:
sr["parse"] = sr.get("parse", "").lstrip().replace("\n\n", "\n").replace("\n", "
")
sr["parse"] = re.sub("^【解[答析]】\s*", "", sr["parse"])
sr["answer"] = sr.get("answer", "").lstrip().replace("\n\n", "\n").replace("\n", "
")
if not sr["parse"] and not sr["answer"]: # 答案和解析都没有
sr["parse"] = "略"
sr["answer"] = "略"
sr['errmsgs'].append("本题缺少答案和解析")
elif not sr["answer"] and sr["parse"]:
sr["answer"] = "见解析"
elif sr["answer"] and not sr["parse"]:
sr["parse"] = "略"
sr['errmsgs'].append("本题缺少解析")
# 辅助标签处理
sr["analysis"] = ""
if "analy" in sr: # 存在题目分析时,将其放在解析里
sr["analysis"] = sr.get("analy", "").strip().replace("\n\n", "\n").replace("\n", "
")
# if len(sr["analy"].replace(" ", "")) >= 10:
# sr["parse"] = sr["analy"] + "
" + sr["parse"]
del sr["analy"]
if "chapter" in sr: # 如选修4-5:不等式选讲
if sr['item_id'] + 1 <= len(res_dict):
chapter_no[sr['item_id']] = sr["chapter"]
del sr["chapter"]
# 是否为选做题"is_optional",两种形式不会同时出现
if "option_st" in sr: # 带有此标签的后面的题目都是选做题option_score
option_st = sr['item_id']
is_optional = True
if "," in sr["option_st"]:
option_score = int(sr["option_st"].split(",")[-1])
del sr["option_st"]
elif sr['item_topic_name'] == '选做题': # 题型是选做题 如五、选做题
select_type_id.append(sr['item_id'])
sr['is_optional'] = 'true'
sr['score'] = option_score
elif "type1" in sr and sr["type1"] == "解答" and "is_optional" not in sr:
sr["is_optional"] = is_optional
if is_optional:
sr['score'] = option_score
if "type1" in sr:
del sr["type1"]
# 将选择题改为单选或多选,"is_multiple_choice"
sr['item_topic_name'] = re.sub("([单多])项选择题?", r"\1选题", sr['item_topic_name'])
sr['item_topic_name'] = sr['item_topic_name'].replace("简答", "解答")
# sr['item_topic_name'] = re.sub("(计算|简答)题?", "解答题", sr['item_topic_name'])
# if sr['item_topic_name'] in ["选择", "选择题"]: # 有的科目只有选择题,不分单选和多选
# if len(re.findall("[A-Z]", sr["answer"])) > 1:
# sr['item_topic_name'] = '多选题'
# else:
# sr['item_topic_name'] = '单选题'
if sr['item_topic_name'] == '多选题':
if len(re.findall("[A-Z]", sr["answer"])) == 1:
sr['errmsgs'].append("本题答案个数与题型(多选题)不符")
# sr["is_multiple_choice"] = 'true'
elif sr['item_topic_name'] == '单选题':
# sr["is_multiple_choice"] = 'false'
if "options" in sr and len(sr["options"]) > 4:
sr['errmsgs'].append("选项个数多于4个,与题型(单选题)不符")
if len(re.findall("[A-Z]", sr["answer"])) > 1:
sr['errmsgs'].append("本题答案个数与题型(单选题)不符")
# """按照原先高中数学解析的最后输出格式整理输出"""
sr["stem"] = sr["content"]
sr["type"] = sr['item_topic_name'].replace("非选择", "解答")
sr["topic_num"] = sr['item_id']
sr['errmsgs'] = ";".join(sr['errmsgs'])
sr["parse"] = re.sub(r"试题【([分解]析)】", r"试题\1:", sr["parse"]) # 解析
sr["key"] = re.sub("([;;]|
)\s*$", "", sr["answer"])
sr["slave_img"] = ""
sr["parse_img"] = ""
sr["stem_img"] = ""
if 'susp_pic' in sr:
del sr['susp_pic']
if 'is_optional' in sr:
del sr['is_optional']
if 'spliterr_point' in sr:
del sr['spliterr_point']
del sr["content"], sr["answer"], sr['item_topic_name'], sr['score'],sr['item_id']
# ------------------------------------------------------------------------
# if chapter_no: # 章节标签下移一位
# for c, v in chapter_no.items():
# res_dict[c]["chapter"] = v
# 选做题"option_str"处理
if select_type_id:
for s in select_type_id:
if len(select_type_id) == 2:
res_dict[s-1]['option_str'] = "2选1"
elif len(select_type_id) == 4:
res_dict[s - 1]['option_str'] = "4选2"
else:
res_dict[s-1]['text_errmsgs'] += ";
选做题不是“2选1”和“4选2”类型"
if option_st:
print("option_st:", option_st)
for s in range(option_st, len(res_dict)):
if (len(res_dict) - option_st) == 2:
res_dict[s]['option_str'] = "2选1"
elif (len(res_dict) - option_st) == 4:
res_dict[s]['option_str'] = "4选2"
else:
res_dict[s]['text_errmsgs'] += ";
选做题不是“2选1”和“4选2”类型"
return res_dict
def insert_sort2get_idx(item_list, num):
    """
    Find where ``num`` would be inserted into an ordered list.

    :param item_list: list that is already sorted in ascending order
    :param num: value to be inserted
    :return: index of the first element that ``num`` is not greater than,
             or ``len(item_list)`` when ``num`` exceeds every element
    """
    for position, current in enumerate(item_list):
        # Stop at the first element that num does not exceed.
        if not num > current:
            return position
    return len(item_list)
# def find_seq_num(num_list):
# """
# 针对切分题号时切错的序号进行纠正,考虑序号是连续且正常的情况下
# 将连续的数字进行分组
# :param num_list:输入[3, 4, 8, 9, 12, 13, 14]
# :return: [[3, 4],[8, 9],[12, 13, 14]]
# """
# seq_ranges = []
# for k, g in groupby(enumerate(num_list), lambda x: x[0] - x[1]):
# group = (map(itemgetter(1), g))
# group = list(map(int, group))
# seq_ranges.append(group)
# return seq_ranges
# def del_exception_value(item_list):
# """
# 去列表中的异常值,题目越多,越容易突出异常值
# :return:
# """
# import numpy as np
# max_v = max(item_list)
# arr_mean = np.mean(item_list) # 均值
# arr_var = np.var(item_list) # 方差
# while max_v > len(item_list)+4:
# item_list.remove(max_v)
# print(item_list)
# arr_mean = np.mean(item_list) # 去最大值后的均值
# arr_var = np.var(item_list) # 去最大值后的方差
# max_v = max(item_list)
# # print("均值与方差:",arr_mean,arr_var)
# if abs((item_list[-1] - item_list[0] + 1) - len(item_list)) <= 3:
# return item_list
# else:
# exception_value = []
# for i in item_list:
# # print(abs((i - arr_mean) / arr_var), i)
# if(abs((i - arr_mean)/arr_var)) > 0.3:
# exception_value.append(i)
# right_seq = [i for i in item_list if i not in exception_value]
# return right_seq
def pic_transfer(con_list):
aft_opt = [] # 针对选项后是题目图片的情况,进行移位
if "\n" in con_list[-1]:
ccon = re.split("\n+", con_list[-1])
while re.match("0 and v['item_id'] - item_list[k-1]['item_id']>1:
# if
if __name__ == '__main__':
    # Ad-hoc scratch area used while developing this module.
    # ------------- Generate requirements.txt ---------------
    # pip freeze > requirements.txt
    # import os, sys
    #
    # project_root = os.path.dirname(os.path.realpath(__file__))  # current directory
    # print(project_root)
    #
    # # Locate the interpreter / virtual-environment directory
    # python_root = sys.exec_prefix
    # print(python_root)
    #
    # # Build the command that generates requirements.txt
    # command = python_root + '\Scripts\pip freeze > ' + project_root + '\\requirements.txt'
    # print(command)
    #
    # # Run the command.
    # os.system(command)
    # ---------------- One-step install of requirements.txt ------------
    # pip install -r requirement.txt
    # python_root + '\Scripts\' + pip install -r requirements.txt
    ans_no0 = [16, 17, 18, 19, 20]
    start_no = 1
    # list.index raises ValueError when the value is absent (1 is not in the
    # sample list above), so check membership before slicing.
    if start_no in ans_no0:
        print(ans_no0[ans_no0.index(start_no):])
    else:
        print("{} not found in {}".format(start_no, ans_no0))
    # # b = del_exception_value(a)
    # print(b)
    # import os
    # rrr=os.path.basename(r"http:/pstatic.dev.xueping.com/data/word/2020/08/12/5f338d18e2cce.docx")
    # print(rrr)