#!/usr/bin/env/python # -*- coding:utf-8 -*- # 本文件包含以下函数 # table_label_cleal:去掉表格中的换行符 # html_cleal :html文件清洗 # huanhang_wash_after: 处理最终结果多余的换行符 import datetime import random import re from operator import itemgetter from itertools import groupby from PIL import Image # import ps_configs as config from pprint import pprint import base64, os, random import time, hashlib # UPLOAD_FOLDER = config.UPLOAD_FOLDER import configs from utils.field_eq2latex import get_latex def table_label_cleal(con): """ 去掉表格中的【换行符】 """ # print(con) # print('------------------------------------------') con = re.sub(r"\n(\s|\n|\t)+", "\n", con) count = 1 while re.search(r"\n(|)", con, re.S) and count <= 10: con = re.sub("(|||)\n(||||

)", r"\1\2", con, flags=re.S) con = re.sub(r'()\n()', r'\1\2', con, flags=re.S) count += 1 # if re.search(r"(.|\n)+?
", con, re.S|re.M): # aa = re.search(r"((.|\n)+?
)", con, re.S|re.M) # con = con.replace(aa.group(1),aa.group(1).replace("\n","")) # 将空表格的情况去掉 con = re.sub(r'[\s\n\t]*?[\s\n\t]*?([\s\n\t]*?[\s\n\t]*?[\s\n\t]*?)+[\s\n\t]*?
[\s\n\t]*?

[\s\n\t]*?

' r'[\s\n\t]*?
[\s\n\t]*?

', "", con, flags=re.S) con = re.sub(r'(

)\s*([((]\s*\d\s*[))])', r'\1\n\2', con) return con # 标签清洗 def html_cleal(html, img_url, is_reparse): sub_list = ["", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "


", "
",""] sub_dd = {'×': '×', '÷': '÷', '°': '°', '·': '·', '±': '±', 'º': 'º', '¹': '¹', '²': '²', '³': '³', '½': '1/2', '¼': '¼', '¾': '¾', '¥': '¥', 'm³': 'm³', '<': '<', '£': '£', '∠<': '<', '>': '>', "A": "A", "А": "A", "Α": "A", "B": "B", "В": "B", "в": "B", "Β": "B", "C": "C", "С": "C", "c": "c", "с": "c", "D": "D", "Ε": "E", "E": "E", "F": "F", "G": "G", "g": "g", "m": "m", "N": "N", "s": "s", "t": "t", "/": "/", "=": "=", "-": "-", "2": "2", '  ': ' ', ' ': ' ', "〖": '【', "〗": '】', "題": '题', "单项选择": '单选', "多项选择": '多选', "不定项选择": '选择', "双项选择": '选择', } # 再解析中,将二进制图片进行转化,图片怎么保存比较好,先再“天数”建立文件夹 if is_reparse: # 按“天数”建立文件夹 time_str = datetime.datetime.strftime(datetime.datetime.now(), '%Y_%m_%d') file_path = os.path.join(configs.IMG_FOLDER, time_str) if not os.path.exists(file_path): os.makedirs(file_path) # 统计所有base64编码 all_base64_image = re.findall('', str(html)) for n, img in enumerate(all_base64_image): img1 = img.split(",") img_tape_info = re.search("data:image/(.+?);base64", img1[0]) img_tape = img_tape_info.group(1) if img_tape_info else "" img_data = base64.b64decode(str(img1[-1])) if img_tape: # save_path = os.path.join(configs.new_img_ip, get_md5(n)+"."+img_tape) img_name = "image" + get_md5(n) + "." 
+ img_tape save_path = os.path.join(file_path, img_name) img_path = configs.new_img_ip + '/' + time_str + '/' + img_name # img_file_count = 0 # if os.listdir(configs.IMG_FOLDER): # img_file_count = max([int(i) for i in os.listdir(UPLOAD_FOLDER)]) + 1 with open(save_path, 'wb') as f: f.write(img_data) new_img = '' html = html.replace(img, new_img) # ------------------------------------------------------------------------------------- # 特殊符号处理 html2txt = re.sub(r"|".join(sub_list), "", str(html)) # ("", " ") #2020/4/7 html2txt = re.sub("|".join(sub_dd.keys()), lambda x: sub_dd[x.group()], html2txt) # 2020/4/1,4/7,4/20 html2txt = html2txt.replace(r"\\[{\\text{V}}V\]", "Ⓥ").replace(r"\\[{\\text{A}}A\]", "Ⓐ") \ .replace(r"\\[{\\text{W}}W\]", "Ⓦ").replace(r"\\[{\\text{X}}X\]", "Ⓧ").replace(r"\\[{\\text{G}}G\]", "Ⓖ") \ .replace("\uf067", "γ").replace('', "γ").replace('\uf020', "").replace("\u3000", " ").replace("\u2003", " ") \ .replace("\x7f", " ").replace("\xa0", "") # 域公式的转化处理 html2txt = get_latex(html2txt).replace("【域公式】", "") # \可以在前端显示,不需要用latex渲染 #
处理 html2txt = re.sub("", "\n", html2txt) # 题型行的统一处理 # ---->>>>>题型行可能放在表格中 if len(re.findall("", html2txt)) >= 6: # 这个限制还不太严谨 for tt in re.finditer('(((?!()).)*)', html2txt, re.S): tt_list = re.split(r'

|

', tt.group(1)) tt_list = [col for col in tt_list if col.strip()] if " ".join(tt_list).replace(" ", "") == '得分评卷人': html2txt = html2txt.replace(tt.group(0), "") else: html2txt = html2txt.replace(tt.group(0), "

" + " ".join(tt_list) + "

") html2txt = re.sub(r"||", "", html2txt) html2txt = re.sub(r"(
)\s*([一二三四五六七八九十]\s*[、..、]?.{2,6}题)", r"\1

\2", html2txt) html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,]\s*(论述|填空|探究)题?[与和、、,,\s]*?(计算题|实验题)', r"\1、\3", html2txt) html2txt = re.sub(r"

\s*([一二三四五六七八九十])\s*[、..、,,]?\s*(计算|[解简]答|实验|作图)题?[与和、、,,\s]*?(计算|[解简]答|实验|作图)", r"

\1、\2题", html2txt) html2txt = re.sub(r'

(([一二三四五六七八九十])\s*[、..、,,]\s*(.{2,4}题)\s*

)[^p]*?

', r"\1", str(html2txt), flags=re.S) html2txt = re.sub(r'

\s*[((]\s*[一二三四五六]\s*[))]\s*必考题\s*(.?|.+?分\s*[.。.]?)\s*

', "", html2txt) html2txt = re.sub(r'

\s*[((]\s*[一二三四五六]\s*[))]\s*选考题\s*.?\s*.{,4}(?', r"

【选做题】:'\1'

", html2txt) html2txt = re.sub(r'

\s*[((]\s*[一二三四五六]\s*[))]\s*选考题\s*(.?|.+?分\s*[.。.]?)\s*

', "

【选做题】

", html2txt) html2txt = re.sub(r'

\s*([一二三四五六七八九十])\s*[、..、,,]?\s*(单项?选择?|选择|多项?选择?|填空|计算|[解简]答|实验|作图)题?\s*

', r"

\1、\2题

", html2txt) html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,]?\s*(单选|单项选择|选择|多选|多项选择|填空|计算|[解简]答|实验|作图)\s*(?!题)([((]\s*本题|.*?\d分)', r"\1" + "、" + r'\2' + "题" + r"\3", html2txt) html2txt = re.sub(r'([一二三四五六])\s*[、..、,,]?\s*(单选|单项选择|选择|多选|多项选择|填空|计算|[解简]答|实验|作图)题', r"\1" + "、" + r'\2' + "题", html2txt) html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,]?\s*[((]\s*本大题(.*?选项中)', r"\1" + "、" + "选择题", html2txt) # + r"\2" html2txt = re.sub(r'

\s*([一二三四五六七八九十])\s*[、..、,,]?\s*[((本大题]*?(.*?选项中)', r"\1" + "、" + "选择题", html2txt) html2txt = re.sub(r'([一二三四五六七八九十])\s*[、..、,,]?\s*([((]\s*(每小题|本大?题)((?!(选项)).)+?[))]|综合题)', r"\1" + "、" + "解答题", html2txt) html2txt = re.sub(r'(?)\s*([一二三四五六七八九十]\s*[、..、,,]?\s*(单项?选择?|选择|多项?选择?|填空|计算|[解简]答|实验|作图)题)', r'

\n

\1', html2txt) html2txt = re.sub(r'

\s*([一二三四五六七八九十])\s*[、..、,,]?\s*[((]?本?大?题((?!(选项)).)+?[))]?\s*

', r"

\1、本大题

", html2txt) # html2txt = re.sub(r'

\s*[^一二三四五六七八九十]{,3}\s*[、..、]\s*(选择|单选|多选|计算|[解简]答|实验|作图)题', r"

一、\1题", html2txt) # 答案解析关键字的统一处理 html2txt = re.sub(r'【\s*(\s*(解\s*[::])', r"

【解答】", str(html2txt)) html2txt = re.sub(r'【[^【】]*?(答案|[解分][析答]|详解|点[评睛])[^【】]*?】】', r"【\1】", str(html2txt)) html2txt = re.sub(r'(答案|解析|解答|详解|点评|点睛|考点|专题)\s*[::]', r"【\1】", str(html2txt)) html2txt = re.sub(r'(\n|^)\s*(分析)\s*[::]', r"【\2】", str(html2txt)) if "【解析】" not in html2txt and "【解答】" in html2txt and "【分析】" not in html2txt: html2txt = re.sub(r'【解答】', "【解析】", str(html2txt)) # 其他关键字的处理 html2txt = re.sub(r'

\s*(类型|知识点|考查角度|拔尖角度)[一二三四五六七八九十\d+][^p]*?

', "", str(html2txt)) html2txt = re.sub(r'

\s*(选修[\d-]*?[::].{2,15})\s*

', r"

【章节】\1

", html2txt) html2txt = re.sub(r'

\s*([一二三四五六]\s*[、..、]?)?\s*(\[.{2}-*?选修[\d-]*?.*?\])\s*([((]\d+分[))])?\s*

', r"

【章节】\2

", html2txt) html2txt = re.sub(r'

\s*(基础|中档|综合)题[^p题]*?

|

\s*【(考点|专题)】[^p]*?

', "", str(html2txt)) html2txt = re.sub(r'

\s*(基础训练|提升训练|探究培优)

', "", str(html2txt)) html2txt = re.sub(r'

注意事项[::]\s*

(\n+\s*

\s*\d\s*[、..、][^/]+?

){1,}', "", html2txt, flags=re.S) html2txt = re.sub(r'

注意事项[::]\s*\d\s*[、..、][^/]+?

(\n+\s*

\s*\d\s*[、..、][^/]+?

){1,}', "", html2txt, flags=re.S) html2txt = re.sub(r'[((]\s*([A-Z\dⅠⅡⅢⅣⅤ]+|IV)\s*[))]', r"(\1)".replace(" ", "").replace("(IV)", "Ⅳ"), html2txt) html2txt = re.sub(r'[((](\s*\d\s*\d?\s*分?\s*)[))]', "(" + r'\1'.replace(" ", "") + ")", html2txt) html2txt = re.sub(r'\[来源:.*?\]', "", html2txt) html2txt = re.sub('

欢迎访问.*?

', '', html2txt) html2txt = re.sub('w\s*w\s*w\..*?(\.\s*c\s*o\s*m|\.cn)+|(?', r'<\1>', html2txt) html2txt = re.sub('

\s*第\s*[二三四ⅡⅢⅣ]\s*(卷|部分)\s*([((].*?[))]|非?选择题.{,8})?\s*

', "\n", html2txt) # 选项的处理 html2txt = re.sub(r'(

\s*([1-9]|[1-4][0-9])\s*[..、、].+?[((]\s*[))])\s*(A\s*[..、、][^/]*?

)', r"\1

\n

\3", str(html2txt)) # 根据图片宽高的异常值判断删除隐藏图片--------------------------------------- # a = re.search(r'', html2txt, re.S) # while a and float(a.group(1)) <= 2 and float(a.group(2)) <= 2: # print(a.group(1)) # html2txt = html2txt.replace(a.group(0), "") # a = re.search(r'', html2txt, re.S) def sub1(ss): if float(ss.group(1)) <= 2 and float(ss.group(2)) <= 2: return "" else: return ss.group(0) html2txt = re.sub(r'',sub1, html2txt) # ------------------------------------------------------------- # 将图片中带有的汉字去掉 html2txt = re.sub(r'(\s*))(([1-9]|[1-4][0-9])\s*[、..、])', r"\1

\n

\3", html2txt) html2txt = re.sub(r'(\s*(\s*)?([1-9]|[1-4][0-9]))\s*([((]\s*(\d{1,2}[.\s\d]*?分|.{2,3}题?)\s*[))]|解析?\s*[::]|【解析】)', r"

\1、\4", html2txt) html2txt = re.sub(r"

\s*([1-9]|[1-4][0-9])\s*([((]20\d{2}\s*[\u4e00-\u9fa5、、]{2,9}[))])", r"

\1、\2", html2txt) html2txt = re.sub(r"

\s*([1-9]|[1-4][0-9])\s*(【(解析?|答案?)】|(解析?|答案?)\s*[::])", r"

\1、\2", html2txt) html2txt = re.sub(r"

\s*().)+?[/\"]>)\s*([1-9]|[1-4][0-9])\s*([((]20\d{2}\s*[\u4e00-\u9fa5、、]{2,9}[))])", r"

\1

"+"\n"+r"

\3、\4", html2txt) # 【susp_img】 html2txt = re.sub(r'((\s*\s*)?(\s*)?\s*)(([1-9]|[1-4][0-9])\s*[、..、])', r"

\1

" + "\n" + r"\4", html2txt) html2txt = re.sub(r"(

((?!

).)+?(\s|[/\"]>))(([1-9]|[1-4][0-9])\s*[、..、].{,20}本[大小]?题\d+分)", r"\1

" + "\n

" + r"\4", html2txt) html2txt = re.sub(r"((\s*\s*)?(\s*)?((\s*\s*)?(\s*)?)*?\s*)" r"\s*(([1-9]|[1-4][0-9])\s*[、..、])", r"

\1

" + "\n

" + r"\7", html2txt) html2txt = re.sub(r'(

\s*[一二三四五六七八九十].*?题\s*\(.+?分.*?\))\s*(([1-9]|[1-4][0-9])\s*[、..、].*?)

', r"\1

\n

\2

", html2txt) html2txt = re.sub(r'(

\s*[一二三四五六七八九十].*?题\s*\(.+?分.*?\))\s*(([1-9]|[1-4][0-9])\s*[、..、].*?)

', r"\1

\n

\2

", html2txt) html2txt = re.sub(r'(

.*?[..]{6,}\s*\d+分)\s*(([1-9]|[1-4][0-9])\s*[、..、].*?)

', r"\1

\n

\2

", html2txt) html2txt = re.sub(r'([1-9]|[1-4][0-9])\s*([((]\s*\d{1,2}[.\s\d]*?分\s*[))])\s*[、..、]', r"\1" + "、" + r"\2", html2txt) # 建立图片id字典,对原图片信息第一次替换 all_image = re.findall(r'', html2txt) src2subs = {} subs2src = {} for src in all_image: kk = re.search('(', src) # if re.search(" data-latex=", src) and px_info and type(img_url) == str: # if int(px_info.group(4)) < ref_v - 2: # 图片太小 # h_pt = (ref_v - 1) * 72 / 96 # w_px = int(px_info.group(3)) / int(px_info.group(4)) * (ref_v - 1) # w_pt = w_px * 72 / 96 # new_src = new_src.replace('height="' + px_info.group(4), 'height="15') \ # .replace('width="' + px_info.group(3), 'width="' + str(w_px)) \ # .replace(px_info.group(2), 'style="width: ' + str(w_pt) + 'pt; height: ' + str(h_pt) + 'pt"') # # .replace("<", "<").replace(">", ">") # replace(""", '"') # elif int(px_info.group(4)) > ref_v + 2 and type(img_url) == 'str': # 公式图片太大或公式图片原本就大但被缩小的情况 # 第二种修改图片的方法:读取原图,获取大小 # ---------------------------------------------------------------------------------- # 图片信息简化替换 src_info = re.search(r' 20: mathpix = "" w_h_info = re.search('', src) w_h = " w_h=" + w_h_info.group(2).split('.')[0] + "*" + w_h_info.group(3).split('.')[0] \ if w_h_info and not mathpix else "" # w_h 和 mathpix只存在一个 src2subs[src] = '" subs2src['"] = new_src for k, v in src2subs.items(): html2txt = html2txt.replace(k, v) # print(src2subs) # ------------------------------------------------------------------------ # html 转 list html2txt = re.sub(r'(||)(\n\s*)*?

', r"\1

"+"\n

", html2txt, flags=re.S) con_list = sum([re.split('

|', i) if len(re.findall("

|", i))>1 else [i] for i in re.split(r"

(?!)|", html2txt)[:-1]], []) con_list = [re.sub(r"^\n*\s*(

|)+", "", ii) for ii in con_list] # con_list = [re.sub(r"^\n*\s*(

|)+", "", ii) for ii in # re.split(r"

(?!)|", html2txt)[:-1]] con_list = [re.sub(r"^<([a-z]+)>[\s\t\n]*$", "", i.strip()) for i in con_list] # 2020/4/7,14 con_list = [re.sub(r"^(.|\n)+?([一二三四五六七八九十])\s*[、..、]\s*(.{2,4}题)(.|\n)+?
", r"\2、\3", i.strip()) for i in con_list] # 把最后可能还存在的或考号信息去掉 con_list = [re.sub("|[…O•.\s]*?密[…O•.\s]*?封[….O•\s]*?装?[…O•.\s]*?订?[….O•\s]*?线?[….O•\s]*?$" "|((学校|班级|姓名|座位号|准考号|[学考]号)[\s::_]*?){2,}$", "", i.strip()) for i in con_list] # 答案行格式处理 temp_list = [re.split("^((\s*\s*)+)", v.strip(), maxsplit=1) if re.match(r'(\s*\s*)+?(参考|考试|试[题卷]|物理|理综|数学|化学|生物)(答案|解析|答案[及与和]评分(标准|意见|细则))\s*$' r'|(\s*\s*)+?评分标准' r'|(\s*\s*)+?(参考|考试|试[题卷])(答案|解析|答案[及与和]评分(标准|意见|细则))\s*(物理|理综|数学|化学|生物)?\s*$', re.sub(r"[上下]?学[年期]|[\d—【】..、、::(())年\s]|[中大]学|模拟|[中高]考|年级|[学期][末中]|[高初][一二三]", "", v.strip())) else [v] for v in con_list] con_list = sum(temp_list, []) # 对可能的题号的处理 如2、3、4、5、 加了【fei】 # 重新修改!!!!!!!!!! con_list = [re.sub(r"^\s*([1-9][0-9]?\s*[..、、])", r"【fei】\1", i.strip()) if (len(re.findall(r"(^|\s*[..、、])\s*[1-9][0-9]?\s*[..、、]", i)) >= 3 and len(re.sub(r"[\d..、、\s]", "", i)) < 2) else i for i in con_list] # print(con_list) if con_list and re.search(r"[\u4e00-\u9fa5]", con_list[0]) is None: con_list = con_list[1:] while con_list and re.search(r"声明[::].*?著作权属.*?所有|(邮箱|用户|日期|QQ)\s*[::].+?", con_list[-1]): con_list = con_list[:-1] return con_list, subs2src def del_no(item, item_no_type=1): """去开头的题号""" if item_no_type==2: item = re.sub('^\n*\s*[((]\s*([1-9]|[1-4][0-9])\s*[))]\s*[..、、::]?', "", item) return item item = re.sub('^\n*\s*([1-9]|[1-4][0-9])\s*[..、、::]', "", item) return item def html_cleal_test(htmlf): # 不用 html2txt = re.sub(r" ", "", htmlf.read()) # ("", " ") # html2txt.replace("①", "(1).").replace("②", "(2).").replace("③", "(3).") con_list = [re.sub(r"^\n+\s+

", "", ii) for ii in html2txt.split("

")[:-1]] # pprint(con_list) if re.search(r"[\u4e00-\u9fa5]", con_list[0]) is None: con_list = con_list[1:] return con_list def get_md5(image_id): """ 由于hash不处理unicode编码的字符串(python3默认字符串是unicode) 所以这里判断是否字符串,如果是则进行转码 初始化md5、将image_name进行加密、然后返回加密字串 """ image_name = str(image_id) + str(time.time()) + str(random.random()) image_name = image_name.encode("utf-8") md = hashlib.md5() md.update(image_name) return str(md.hexdigest()) def huanhang_wash_after(res_dict): """ 1.处理最终结果多余的换行符;2.对题文中已给答案的选择填空进行替换;3.选择题的细分 :param res_dict: :struc_type:试卷类型,struc_type=1时为教师卷 :return: """ pattern1 = re.compile(r"([是为点]|等于|=|=|有|存在)\s*_+((|[^_;;。?!,])+?)(?|") # 2020/4/10 gai if num == len(res_dict)-1: end_con = sr["content"] + sr["parse"] if len(re.findall(r"[\u4e00-\u9fa5]", end_con))>1000 and (len(re.findall("\n\s*([1-9]|1[0-9])\s*[..、、].+?", end_con))>4 or len(re.findall("[((]\s*[))]|_{2,}", end_con))>6): sr['errmsgs'].append("原试卷格式有问题,导致本题可能包含了很多非本题的题文") if not re.sub("[(())\n\s]", "", sr["content"]): sr['errmsgs'].append("本题没有题干,请检查题干格式是否正确") # 把首尾的换行都去掉,php接收时会用换行来拼接 # sr["content"] = table_label_cleal(re.sub(r"\n\s*","
",sr.get("content", "").lstrip())) # 将选择题和填空题中的题干中出现答案的情况 去掉答案 kuo_con1 = re.search('([是为]|等于|[==有]|表示)[((]([A-Z][A-Z;;和与、、]*?)[))](.?($|
|))", sr["content"].replace(" ", "")) if sr['item_topic_name'].replace("题", "") in ["单选", "多选", "选择", "单项选择", "多项选择"]: # sr["type"] = "选择" # 针对选择题在题文中已给出答案的处理 if kuo_con1: sr["content"] = sr["content"].replace(kuo_con1.group(0), kuo_con1.group(1)+"( )" + kuo_con1.group(3)) sr["answer"] = kuo_con1.group(2) if not sr["answer"] else sr["answer"] elif kuo_con2: sr["content"] = sr["content"].replace(kuo_con2.group(0), "( )" + kuo_con2.group(2)) sr["answer"] = kuo_con2.group(1) if not sr["answer"] else sr["answer"] if "options" in sr: # 对选项部分进行格式处理 for i in range(len(sr['options'])): sr['options'][i] = sr['options'][i].lstrip().replace("\n\n", "\n").replace("\n", "
") # sr['options_text'] = "" elif sr['item_topic_name'] == '填空题': # sr["type"] = "填空" ans_list = [] # 针对填空题在题文中已给出答案的处理 while re.search(pattern1, sr["content"]): blank_con1 = re.search(pattern1, sr["content"]) sr["content"] = sr["content"].replace(blank_con1.group(0), blank_con1.group(1)+"____" + blank_con1.group(4)) ans_list.append(blank_con1.group(2)) while re.search(pattern2, sr["content"]): blank_con2 = re.search(pattern2, sr["content"]) # 这里的限制条件易出错,可以再判断一下 sr["content"] = sr["content"].replace(blank_con2.group(0), blank_con2.group(1) + "____" + blank_con2.group(4)) ans_list.append(blank_con2.group(2)) if re.findall(r"_{2,}", sr["content"]): sr["blank_num"] = len(re.findall(r"_{2,}", sr["content"])) if not sr["answer"] and ans_list: sr["answer"] = "; ".join(ans_list) # 已知题型是错误的情况,如解答题,放在填空题中 if 'blank_num' not in sr and re.search("_+([^_]*?)_+", sr['content']) is None: sr['errmsgs'].append("填空题题干中没有下划线(__),与题型(填空题)不符") # stem_c = re.sub("|[,,.。.、、]", "", sr["content"]) # if len(stem_c) > 2: # 不自动纠错 # sr["item_topic_name"] = "解答题" # sr["type"] = "解答" else: # 大题题型先不做范围判断 if sr['item_topic_name'] and sr['item_topic_name'].replace("题", "") not in ["解答", "计算", "实验","作图"]: sr["type1"] = "解答" else: sr["type1"] = sr['item_topic_name'].replace("题", "") # if "is_optional" not in sr: # sr["is_optional"] = is_optional sr["option_str"] = "" if "slave" in sr and sr["slave"]: # 带小题的大题,格式处理,高中数学没有这一功能 for s in sr["slave"]: s["content"] = s.get("content", "").strip().replace("\n\n", "\n").replace("\n", "
") # 已分小问了的题号,是不会带小题号的,故不需要替换 # s["content"] = re.sub(r"[((]\s*(\d|ⅰⅱⅲⅳ|i{1,3})\s*[))]|[①②③④]\s*(?![+-])", "", s["content"][:5]) + s["content"][5:] s["parse"] = s.get("parse", "").strip().replace("\n\n", "\n").replace("\n", "
")\ .replace("解答:解:", "解答:").replace("解答:解:", "解答:") s["answer"] = s.get("answer", "").strip().replace("\n\n", "\n").replace("\n", "
") # sr["slave"] = sr.get("slave", "").replace("\n", "
") else: sr["parse"] = sr.get("parse", "").lstrip().replace("\n\n", "\n").replace("\n", "
") sr["parse"] = re.sub("^【解[答析]】\s*", "", sr["parse"]) sr["answer"] = sr.get("answer", "").lstrip().replace("\n\n", "\n").replace("\n", "
") if not sr["parse"] and not sr["answer"]: # 答案和解析都没有 sr["parse"] = "略" sr["answer"] = "略" sr['errmsgs'].append("本题缺少答案和解析") elif not sr["answer"] and sr["parse"]: sr["answer"] = "见解析" elif sr["answer"] and not sr["parse"]: sr["parse"] = "略" sr['errmsgs'].append("本题缺少解析") # 辅助标签处理 sr["analysis"] = "" if "analy" in sr: # 存在题目分析时,将其放在解析里 sr["analysis"] = sr.get("analy", "").strip().replace("\n\n", "\n").replace("\n", "
") # if len(sr["analy"].replace(" ", "")) >= 10: # sr["parse"] = sr["analy"] + "
" + sr["parse"] del sr["analy"] if "chapter" in sr: # 如选修4-5:不等式选讲 if sr['item_id'] + 1 <= len(res_dict): chapter_no[sr['item_id']] = sr["chapter"] del sr["chapter"] # 是否为选做题"is_optional",两种形式不会同时出现 if "option_st" in sr: # 带有此标签的后面的题目都是选做题option_score option_st = sr['item_id'] is_optional = True if "," in sr["option_st"]: option_score = int(sr["option_st"].split(",")[-1]) del sr["option_st"] elif sr['item_topic_name'] == '选做题': # 题型是选做题 如五、选做题 select_type_id.append(sr['item_id']) sr['is_optional'] = 'true' sr['score'] = option_score elif "type1" in sr and sr["type1"] == "解答" and "is_optional" not in sr: sr["is_optional"] = is_optional if is_optional: sr['score'] = option_score if "type1" in sr: del sr["type1"] # 将选择题改为单选或多选,"is_multiple_choice" sr['item_topic_name'] = re.sub("([单多])项选择题?", r"\1选题", sr['item_topic_name']) sr['item_topic_name'] = sr['item_topic_name'].replace("简答", "解答") # sr['item_topic_name'] = re.sub("(计算|简答)题?", "解答题", sr['item_topic_name']) # if sr['item_topic_name'] in ["选择", "选择题"]: # 有的科目只有选择题,不分单选和多选 # if len(re.findall("[A-Z]", sr["answer"])) > 1: # sr['item_topic_name'] = '多选题' # else: # sr['item_topic_name'] = '单选题' if sr['item_topic_name'] == '多选题': if len(re.findall("[A-Z]", sr["answer"])) == 1: sr['errmsgs'].append("本题答案个数与题型(多选题)不符") # sr["is_multiple_choice"] = 'true' elif sr['item_topic_name'] == '单选题': # sr["is_multiple_choice"] = 'false' if "options" in sr and len(sr["options"]) > 4: sr['errmsgs'].append("选项个数多于4个,与题型(单选题)不符") if len(re.findall("[A-Z]", sr["answer"])) > 1: sr['errmsgs'].append("本题答案个数与题型(单选题)不符") # """按照原先高中数学解析的最后输出格式整理输出""" sr["stem"] = sr["content"] sr["type"] = sr['item_topic_name'].replace("非选择", "解答") sr["topic_num"] = sr['item_id'] sr['errmsgs'] = ";".join(sr['errmsgs']) sr["parse"] = re.sub(r"试题【([分解]析)】", r"试题\1:", sr["parse"]) # 解析 sr["key"] = re.sub("([;;]|
)\s*$", "", sr["answer"]) sr["slave_img"] = "" sr["parse_img"] = "" sr["stem_img"] = "" if 'susp_pic' in sr: del sr['susp_pic'] if 'is_optional' in sr: del sr['is_optional'] if 'spliterr_point' in sr: del sr['spliterr_point'] del sr["content"], sr["answer"], sr['item_topic_name'], sr['score'],sr['item_id'] # ------------------------------------------------------------------------ # if chapter_no: # 章节标签下移一位 # for c, v in chapter_no.items(): # res_dict[c]["chapter"] = v # 选做题"option_str"处理 if select_type_id: for s in select_type_id: if len(select_type_id) == 2: res_dict[s-1]['option_str'] = "2选1" elif len(select_type_id) == 4: res_dict[s - 1]['option_str'] = "4选2" else: res_dict[s-1]['text_errmsgs'] += ";
选做题不是“2选1”和“4选2”类型" if option_st: print("option_st:", option_st) for s in range(option_st, len(res_dict)): if (len(res_dict) - option_st) == 2: res_dict[s]['option_str'] = "2选1" elif (len(res_dict) - option_st) == 4: res_dict[s]['option_str'] = "4选2" else: res_dict[s]['text_errmsgs'] += ";
def insert_sort2get_idx(item_list, num):
    """Return the position at which ``num`` would be inserted into ``item_list``.

    The scan counts the leading elements that are strictly smaller than
    ``num`` and stops at the first element that is not. For a list sorted in
    ascending order this is the leftmost insertion point (the same value
    ``bisect.bisect_left`` would give); for unsorted input it is simply the
    length of the leading run of smaller elements.

    Args:
        item_list: List of comparable values, expected to be sorted ascending.
        num: Value to locate.

    Returns:
        An int in ``0..len(item_list)``; ``0`` for an empty list.
    """
    for position, value in enumerate(item_list):
        # Same comparison as the original loop: stop at the first element
        # for which ``num > value`` does not hold.
        if not num > value:
            return position
    return len(item_list)
# # # 执行命令。 # os.system(command) # ----------------一键安装 requirements.txt------------ # pip install -r requirement.txt # python_root + '\Scripts\' + pip install -r requirements.txt ans_no0=[16, 17, 18, 19, 20] print(ans_no0[ans_no0.index(1):]) # # b = del_exception_value(a) # print(b) # import os # rrr=os.path.basename(r"http:/pstatic.dev.xueping.com/data/word/2020/08/12/5f338d18e2cce.docx") # print(rrr)