# -*- coding:utf-8 -*- import random, hashlib import requests import configs import shutil from utils.diffi_label import get_item_diff from utils.img2latex import get_ocrlatex_by_url from utils.qcloud_bucket import upload_img_to_qcloud, img_inbucket_count, client, filestream_upload from concurrent.futures import ProcessPoolExecutor from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor from utils.image_convert import svg2png from utils.label_data_Hphy import Label from bisect import bisect_left import re, time, os, json, datetime from copy import deepcopy logger = configs.myLog(__name__, log_cate="ruku_log").getlog() subject_id = { "高中数学": 3, "高中英语": 8, "高中物理": 12, "高中化学": 13, "高中生物": 14, "高中政治": 15, "高中历史": 16, "高中地理": 17, "初中数学": 41, "初中英语": 42, "初中物理": 43, "初中化学": 44, "初中生物": 45, "初中地理": 46, "初中政治": 47, "初中历史": 48, "高中语文": 9, "初中语文": 40, } class Ruku(): def __init__(self, items_list, htmlt, svg_data, wordid, callback_info, subject=""): self.items_list = items_list self.htmlt = htmlt self.svg_data = svg_data self.wordid = wordid self.callback_url = callback_info["callback_url"] # self.callback_url = "123456" self.source = callback_info["source"] # {"xue_guan": "1", "teacher": "2", "ai": "3", "qtk": 4,"school":5} self.subject = subject # items_list[0]["period"] + items_list[0]["subject"] self.callback_code = 0 self.callback_err = "" self.ltx2url = {} # latex 映射 线上可访问url def bucket_img_del(self): """ wordbin中图片上传腾讯云的原因:1.存在线上服务器的图片要定期删除,而有的题可能还没审核完;2.上传腾讯云比较便宜 删除腾讯云中的图片 根据【解析结果】文本中删除的图片信息,通过阙值判断是否从腾讯云中删除图片 也有限制:解析结果不能为空,结果中出现的图片在原试卷应该也要有! 有风险:对于已经校对过的试卷,如果后期有人在资源库里删除了大量文本,很可能导致正在使用的图片被删除 暂时第三方上传的试卷还是先不要删除比较稳妥!或新建图片用md5命名 :return: """ items_str = str(self.items_list) raw_imgs = [] # img_source = "" http://zxhx-1302712961.cos.ap-shanghai.myqcloud.com/imgpaper/lqy_upload/612f60307ddb8b2765e50512/img_23.png img_source = "" items_str = re.sub(r'( 5: # 多余图片超过5张才开始删除,包含mathjax生成的图片 bucket_imgs = [i['Key'] for i in bucket_imgs] # dict:[{'Key': put_key}, {}] to_del_imgs = [bimg for bimg in bucket_imgs if bimg not in raw_imgs] if to_del_imgs: objects = { "Quiet": "true", "Object": [{'Key': item} for item in to_del_imgs] } client.delete_objects(configs.public_bucket, objects) # 批量删除 logger.info("----【paper_id:{}】删除桶数据时间:{}".format(self.wordid, time.time() - time2)) def mathjx2png(self): """ 【基于mathjax渲染输出是svg格式】 将mathjax渲染的公式转化为图片格式 mathjax渲染的svg图片提取->保存->格式转化 :return: """ file_path = configs.IMG_FOLDER + "/" + str(self.wordid) svgp_ltx = {} # svg图片本地路径 映射 latex put_key_mjmath = [] # 桶中key local_mjmath = [] # 本地图片存储位置 self.ltx2url = {} # latex 映射 线上可访问url ltx2svgcss = {} # latex 映射 svg-css if self.callback_url and "MathJax" in str(self.svg_data["svg_html_data"]): # 再解析中存在mathjax公式渲染的标签 # 需要统计mathjax转png失败时的具体定位 初始思路:based on 索引而展开二分法查找 # 题号所在位置 topicinfo topicidx topicno # topicinfo = [[nm.end(), nm.group(1)] for nm in re.finditer('class="topic-number">(\d+)\.', # str(self.svg_data["svg_html_data"]))] # topicidx = [i[0] for i in topicinfo] # topicno = [i[1] for i in topicinfo] time3 = time.time() all_mathjax = [] for topic_n, one_svghtml in enumerate(self.svg_data["svg_html_data"]): # some_mathjax = [[a.start(), a.group(1)] for a in # re.finditer('(()*)', one_svghtml)] some_mathjax = [[topic_n+1, a.group(1)] for a in re.finditer('(()*)', one_svghtml)] all_mathjax.extend(some_mathjax) # all_mathjax = [[a.start(), a.group(1)] for a in # re.finditer('(()*)', # str(self.svg_data["svg_html_data"]))] all_linkdata = re.findall('()', self.svg_data["svg_path"]) link_dict = {a[1]: a[0] for a in all_linkdata if a} # all_svg, all_latex = [], [] for n, jax in enumerate(all_mathjax): svgs = re.findall("", jax[1]) latex = re.findall('()*', jax[1]) if latex and svgs: latex_sub = re.sub(r"\\text{\s*\}|[{\}]", "", latex[0][0]) if latex and not latex_sub.strip(): # id_idx = bisect_left(topicidx, jax[0]) # based on 索引而展开二分法查找 logger.info("----【paper_id:{}】第{}题存在mathjax公式为空:latex:{}\nsvg:{}".format( self.wordid, jax[0], latex, svgs)) # topicno[id_idx - 1] else: ltx2svgcss["{}".format(latex[0][0])] = svgs[0] # latex去重 else: # id_idx = bisect_left(topicidx, jax[0]) # based on 索引而展开二分法查找 logger.info("----【paper_id:{}】第{}题存在mathjax公式格式有问题:latex:{}\nsvg:{}".format( self.wordid, jax[0], latex, svgs)) if latex: return "第{}题的公式latex:{}不规范,请编辑正确!".format(jax[0], latex[0][0]) else: return "第{}题的存在不规范的公式,,请重新编辑!".format(jax[0]) if ltx2svgcss: if not os.path.exists(file_path): os.makedirs(file_path) svg_mjmath_path = os.path.join(file_path, "svg_mjmath") if not os.path.exists(svg_mjmath_path): os.makedirs(svg_mjmath_path) else: # 需要清空 shutil.rmtree(svg_mjmath_path) os.makedirs(svg_mjmath_path) n = 0 name_list = random.sample(range(100000, 999999), len(ltx2svgcss)) # 随机数 for ltx, svg in ltx2svgcss.items(): linkkeys = list(set(re.findall(' xlink:href="#(.*?)"', svg))) linkvalues = [link_dict[ld] for ld in linkkeys] svg_p = file_path + "/svg_mjmath/MJMATH-{}.svg".format(str(int(time.time())) + str(name_list[n])) fs = open(svg_p, 'w', encoding='utf8') svg_1, svg_2 = svg.split("\n' + "\n".join(linkvalues) + "\n") fs.write(">svg转png if svgp_ltx: try: with ThreadPoolExecutor(max_workers=6) as t: all_png_info = [t.submit(svg2png, arg) for arg in svgp_ltx.keys()] except Exception as e: logger.info(json.dumps({"log_level": "warn", "paper_id": self.wordid, "status": "svg2png失败", "errmsg": str(e)}, ensure_ascii=False)) all_png_info = [] all_png_info = [i.result() for i in all_png_info] for shape, img_path in all_png_info: local_mjmath.append(img_path) imgname_online = "/zyk/uploadfiles/wording/" + str(self.wordid) + "/{}".format(os.path.basename(img_path)) put_key_mjmath.append(imgname_online) mj_ltx = svgp_ltx[img_path.replace(".png", ".svg")] #.replace("\\\\", "\\") w = shape[0] / 4 * 1.2 h = shape[1] / 4 * 1.2 if re.search(r"^{?[A-Z、\s{_\}\d]+\}?$", mj_ltx): w = shape[0] / 4 h = shape[1] / 4 mjmath_online = ''.format( int(w), int(h), "${}$".format(mj_ltx)) # shape[0] / 2 self.ltx2url[mj_ltx] = mjmath_online logger.info(json.dumps({"log_level": "info", "paper_id": self.wordid, "svg2png_time": time.time() - time4, "ltx2url": self.ltx2url}, ensure_ascii=False)) return put_key_mjmath, local_mjmath def upload_img(self): """ items_list:结构化纯文本 htmlt:ocr或word解析后的html文本 svg_data:{"svg_html_data": "", "svg_path": ""} 含svg数据的结构化整体html文本,svg中的索引数据 wordid:试卷存储id callback_url: 回调地址 入库操作包含: 1、再解析后确认入库时,将image上传,htmlt中图片; 2、学管端可以是组合(文本+图片)的解析结果。学管可以选择和修改文本,但选择的文本解析时都会替换原先的img标签内容, 最后入库时,传回学管端的都是带img标签的结果 3、传回学管端和共享题库的题目中的$$公式要转换为图片; $$左右还有公式字符串的话,在结构化过程中一起并入!!! 4、调取查重关联功能,暂时不调自动标注; 5、传入校本题库(发送标注),将结构化后的每道题设置个与大数据资源库的关联标签,如"zyk_id":id+题号 5种图片:1>>ser_static/.*?/word/media(解析服务中最开始保存在线上服务器本地) 2>>/zyk/uploadfiles/wording/(解析服务中上传到腾讯云) 3、4>>ser_static/.*?/(new_image[^"]*?|eq_img_\d.png):批量再解析和单题再解析中, 用户在编辑页面新粘贴进来的base64图片,以及批量再解析中域公式转图片,临时存在结构化服务器本地 5、利用mathjax渲染latex的svg格式转为png,再上传到腾讯云 上传之前,将腾讯云桶里的new_image全部替换,raw_image判断下再替换 items_list:list 所有题目 :return: res_zyk = {"data":{"html": xxx, "items": xxx}, "errcode": 0, "errmsgs": ""} res_xbk = {"items": xxx, "errcode": 0, "errmsg":"ok"} """ # if not self.items_list: # return {"errcode":1, "errmsgs": "无结果,不能入库", "data":{}} # # # if any([True if "stem_img" not in i and "stem" not in i else False # # for i in self.items_list]): # 存在新增的空试题时,只有3个字段type,img_status,check_type # # return {"errcode":1, "errmsgs": "存在空试题,请检查!", "data":{}} # contain_id = [] # for i in self.items_list: # if "stem_img" not in i and "stem" not in i: # return {"errcode": 1, "errmsgs": "存在空试题,请检查!", "data": {}} # if contain_id: # return {"errcode": 1, "errmsgs": "存在报错试题【{}题】还没改正过来,请检查!".format("、".join(contain_id)), "data": {}} # elif "errmsgs" in i and i["errmsgs"]: # return {"errcode": 1, "errmsgs": "存在报错试题还没改正过来,请检查!", "data": {}} res_xbk = {"items": [], "errcode": 0, "errmsg": "ok", "callback_type": 2, "subject_id": self.subject} res_zyk = {"errcode": 0, "errmsgs": "", "data": {"html": self.htmlt, "items": self.items_list}} # 1>>判断删除腾讯云桶内图片 # self.bucket_img_del() # --------------------------------------------------------------------- # 1.2>>将zyk/uploadfiles/wording/ 路径的缺latex的公式图片,再调mathpix接口拿到latex填充 # 2>>mathjax渲染的svg图片提取->保存->格式转化 svg_convert_res = self.mathjx2png() if type(svg_convert_res) == str: res_xbk = {"items": [], "errcode": 1, "errmsg": svg_convert_res, "callback_type": 2, "subject_id": self.subject} res_zyk = {"errcode": 1, "errmsgs": svg_convert_res, "data": {}} if self.callback_url: self.callback_user(res_xbk, self.callback_url) if self.source == "xue_guan": self.callback_php(res_xbk) # 失败时就不发送数据库端了,但学管端时发送 return res_zyk put_key_mjmath, local_mjmath = svg_convert_res # -------------------------------------------------------------------- # 3>> 统计 ocr和结构化试题 中的新图片,以防结构化入库的试题少了而导致图片上传不足,左边页面无法显示 put_key_list = [] # 桶中key localnewpic_list = [] # 本地图片存储位置 imgs_url_list = [] # 远程服务器上图片存储位置 put_key_imgsurl = [] new_imgs = re.findall(r'([^"]+?)', r"\1", s) if old_img_local: return s.replace(old_img_local, new_img_online) return s def sub2(s): """将试题中的latex转为线上可访问图片地址""" if s: all_ltx = re.findall(r'\$.*?\$', s) # 查找试题结构中的公式 all_ltx.extend(re.findall('\\\\\(.*?\\\\\)', s)) all_ltx = list(set(all_ltx)) for ltx in all_ltx: new_ltx = ltx.replace("$", "").replace("\\(", "").replace("\\)", "")\ .replace("&", "&").replace("<", "<") #.replace(" ", "\\u200a") if new_ltx in self.ltx2url: # 将latex换为其渲染图片的线上可访问地址 s = s.replace(ltx, self.ltx2url[new_ltx]) else: logger.info("----【word_id:{}】latex替换为imgurl失败:{}".format(self.wordid, new_ltx)) s = re.sub(r' data-latex=""\s*/>', r" data-latex=\1 />", s) return s def ltx2url_repl(one_items): """ 将每道题结构中的各字段进行匹配替换 ltx2url """ keys_items = ["stem", "key", "parse", "options", "slave"] if self.ltx2url: for k in keys_items: if k in one_items: if k == "options": one_items[k] = list(map(sub2, one_items[k])) elif k == "slave" and one_items[k]: for slave_one in one_items[k]: ltx2url_repl(slave_one) else: one_items[k] = sub2(one_items[k]) # else: # if "topic_num" in one_items: # logger.info("----【word_id:{}】第{}道题{}字段有问题".format(self.wordid, one_items["topic_num"], k)) # else: # logger.info("----【word_id:{}】{}字段有问题".format(self.wordid, k)) # ------------------------------------------------------------- time6 = time.time() # 4>> 结构化题目中图片地址替换,需要区分下学管端还是云题库!!!!!一定会保存一份在资源库 items_res_to_zyk = self.items_list.copy() items_res_to_zyk = eval(sub1(items_res_to_zyk)) # for one_items in items_res_to_zyk: # for k in ["stem", "key", "parse", "options"]: # "analysis", # if k in one_items: # if k == "options": # one_items[k] = list(map(sub1, one_items[k])) # else: # one_items[k] = sub1(one_items[k]) # -----------难度和知识点自动标注------------------------ diffs_xbk, items_res_to_zyk = self.get_diff(items_res_to_zyk.copy()) # 难度 for nn, one_items in enumerate(deepcopy(items_res_to_zyk)): new_one_item = {} if self.subject == 3: # "高中数学" new_one_item["difficulty"] = diffs_xbk[nn] # if one_items["checkType"]["name"] == "填空题": # new_one_item["blank_num"] = one_items["blank_num"] # keys_items = ["stem", "key", "parse", "options"] # if one_items['img_status'] == 1 and ("stem_img" in one_items and one_items["stem_img"]): # logger.info("----【paper_id:{}】mathjax2svg所取的字段是带img的".format(self.wordid)) # keys_items = ["stem_img", "key_img", "parse_img", "options_img"] if self.callback_url: # 查重 if self.source != "xue_guan": repeat_r, repeat_time = self.repeat_check(nn, one_items, repeat_time) if type(repeat_r) == str: res_xbk = {"items": [], "errcode": 1, "errmsg": repeat_r, "callback_type": 2, "subject_id": self.subject} res_zyk = {"errcode": 1, "errmsgs": repeat_r, "data": {}} self.callback_user(res_xbk, self.callback_url) return res_zyk new_one_item["repeat_res"] = repeat_r new_one_item["topic_type_id"] = one_items["checkType"]["id"] if "options_rank" in one_items and one_items["options_rank"]: new_one_item["options_rank"] = one_items["options_rank"] ltx2url_repl(one_items) one_items = self.slave2regroup(one_items) new_one_item.update(one_items) # for k in keys_items: # if k in one_items: # if self.ltx2url: # if k == "options": # one_items[k] = list(map(sub2, one_items[k])) # else: # one_items[k] = sub2(one_items[k]) # new_one_item[k] = one_items[k] # else: # 有的题本来就没有options字段 # logger.info("----【paper_id:{}】第{}道题{}字段有问题".format(self.wordid, one_items["topic_num"], k)) items_res_to_xbk.append(new_one_item) # 5>> ocr-htmlt中图片地址替换成云上地址 self.htmlt = re.sub(r'(> new_image上传腾讯云 try: logger.info('----【paper_id:{}】再解析开始上传图片到cloud,并替换成线上地址----'.format(self.wordid)) stime_u = time.time() # TODO 一个进程解析,一个进程上传 executor1 = ThreadPoolExecutor(5) executor1.map(upload_img_to_qcloud, zip(put_key_list, localnewpic_list)) if imgs_url_list: executor1.map(filestream_upload, zip(put_key_imgsurl, imgs_url_list)) executor1.shutdown(wait=True) # 进程池内部的进程都执行完毕,才会关闭,然后执行后续代码 img_upload_time = time.time() - stime_u logger.info(json.dumps({"log_level": "info", "paper_id": self.wordid, "img_upload_time": img_upload_time}, ensure_ascii=False)) res_zyk = {"data": {"html": self.htmlt, "items": items_res_to_zyk}, "errcode": 0, "errmsgs": ""} res_xbk = {"items": items_res_to_xbk, "errcode": 0, "errmsg": "ok", "callback_type": 2, "subject_id": self.subject} except Exception as e: res_xbk = {"items": [], "errcode": 1, "errmsg": "公式或图片上传腾讯云失败", "callback_type":2, "subject_id": self.subject} res_zyk = {"data":{"html": self.htmlt, "items": self.items_list}, "errcode": 1, "errmsgs": "公式或图片上传腾讯云失败"} logger.info(json.dumps({"log_level": "warn", "paper_id": self.wordid, "status": "公式或图片上传腾讯云失败", "errmsg": str(e)}, ensure_ascii=False)) else: # 本地没有新图片时 # -----------难度、知识点自动标注------------------------ diffs_xbk, self.items_list = self.get_diff(self.items_list) # 难度 if self.callback_url: for nn, one_items in enumerate(self.items_list): new_one_item = {} if self.subject == 3: # "高中数学" new_one_item["difficulty"] = diffs_xbk[nn] # 查重 if self.source != "xue_guan": repeat_r, repeat_time = self.repeat_check(nn, one_items, repeat_time) if type(repeat_r) == str: res_xbk = {"items": [], "errcode": 1, "errmsg": repeat_r, "callback_type": 2, "subject_id": self.subject} res_zyk = {"errcode": 1, "errmsgs": repeat_r, "data": {}} self.callback_user(res_xbk, self.callback_url) return res_zyk new_one_item["repeat_res"] = repeat_r new_one_item["topic_type_id"] = one_items["checkType"]["id"] if "options_rank" in one_items and one_items["options_rank"]: new_one_item["options_rank"] = one_items["options_rank"] one_items = self.slave2regroup(one_items) new_one_item.update(one_items) # keys_items = ["stem", "key", "parse", "options"] # for k in keys_items: # if k in one_items: # new_one_item[k] = one_items[k] items_res_to_xbk.append(new_one_item) res_xbk = {"items": items_res_to_xbk, "errcode": 0, "errmsg": "ok", "callback_type": 2, "subject_id": self.subject} logger.info(json.dumps({"log_level": "info", "paper_id": self.wordid, "repeat_time": repeat_time}, ensure_ascii=False)) # 先传一份到校本题库,再返回结果 if self.callback_url: # 知识点自动标注--批量 if self.subject == 3 and self.source == "qtk": # "高中数学" try: res_xbk = self.get_kps_auto(res_xbk) except Exception as e: logger.info("----【paper_id:{}】高中数学标注异常:{}".format(self.wordid, e)) # 回调前,将结果保存本地一份 self.save_post_file(res_xbk) have_err = self.callback_user(res_xbk, self.callback_url) if self.source == "xue_guan" or not have_err: # 失败时就不发送数据库端了,但学管端时发 self.callback_php(res_xbk) return res_zyk def slave2regroup(self, one_items): """ 将拆分的小题根据是否为旧题型进行重组,如果保存时选择的旧题型就要组合成三段式 对判断题的答案进行转化:正确-->A;错误-->B :return: """ # 此阶段将每个题可能的报错信息去掉 if "errmsgs" in one_items and one_items["errmsgs"]: one_items["errmsgs"] = "" if "btt_id" not in one_items["checkType"] or one_items["checkType"]["btt_id"] != 13: if "slave" in one_items and one_items["slave"]: # 带小题的 for idn, s in enumerate(one_items["slave"]): if not one_items["stem"] and not idn and s["stem"].strip(): one_items["stem"] += "(1)" + s["stem"] elif s["stem"].strip(): one_items["stem"] += "({})".format(idn + 1) + s["stem"] if not one_items["key"] and not idn and s["key"].strip(): one_items["key"] += "(1)" + s["key"] elif s["key"].strip(): one_items["key"] += "({})".format(idn + 1) + s["key"] if not one_items["parse"] and not idn and s["parse"].strip(): one_items["parse"] += "(1)" + s["parse"] elif s["parse"].strip(): one_items["parse"] += "({})".format(idn + 1) + s["parse"] if "options" in s and s["options"]: one_items["stem"] += "" + "".join(["{}、{}".format(chr(ord('@') + idm + 1), option) for idm, option in enumerate(s["options"])]) one_items["slave"] = [] if "btt_id" in one_items["checkType"] and one_items["checkType"]["btt_id"] not in [5, 6, 7]: one_items["stem"] = re.sub(r'(__{2,})', r"\1", one_items["stem"]) else: # if "btt_id" in one_items["checkType"] and one_items["checkType"]["btt_id"] == 13: # 将小问答案没拆出来的部分放入公共解析字段中 if "slave" in one_items and one_items["slave"] and one_items["key"] and \ all([True if not s["key"].strip() else False for s in one_items["slave"]]): # 带小题的且小题答案都为空 if one_items["parse"].strip(): one_items["parse"] = "【答案】\n{}\n【解析】\n{}".format(one_items["key"], one_items["parse"]) else: one_items["parse"] = "【答案】\n{}".format(one_items["key"]) one_items["key"] = "" if "btt_id" in one_items["checkType"] and one_items["checkType"]["btt_id"] == 12: # 判断题 one_items["answer_type"] = 4 if re.match("(?"+ "".join(s["options"]) # if "options" in s and s["options"] else s["slave_no"] + s["stem"] for s in one_items["slave"]] # stems += "".join(stem_l) items = all_items["items"] items_info = {"subject_id": self.subject, "topics": [{"topic_id": "123", # one_items["id"] "topic_type_id": one_items["checkType"]["id"], "content": one_items["stem"], "parse": one_items["parse"] if "parse" in one_items else "", "option": one_items["options"] if "options" in one_items else [], "resource_type": 0, "slave": one_items["slave"] if "slave" in one_items else []} for one_items in items] } auto_kps = Hmath_kps(items_info) if not auto_kps or type(auto_kps) == str: pass else: for nn, one_items in enumerate(all_items["items"]): one_items.update({"auto_mark_result": auto_kps[nn]}) # all_items["items"] = [lambda x:one_items.update({"auto_mark_result": auto_kps[nn]}) # for nn, one_items in enumerate(all_items["items"])] # print("自动标注考点:", auto_kps) logger.info(json.dumps({"log_level": "info", "paper_id": self.wordid, "automark_time": time.time() - t1}, ensure_ascii=False)) return all_items def img2latex(self): # 3>> 统计结构化试题data-latex为空的图片,再ocr-latex处理并替换 time5 = time.time() items_str = str(self.items_list) all_imgs_no_latex = re.findall(r'()'.format(m), sub3, items_str) self.items_list = eval(items_str) logger.info("----【paper_id:{}】结构化试题中空latex掉mathpix转化时间:{}".format(self.wordid, time.time() - time5)) except: logger.info("----【paper_id:{}】latex-ocr转化过程失败".format(self.wordid)) # ------------------------------------------------------------------- def get_diff(self, items_res_to_zyk): # -----------难度自动标注------------------------ # t11 = time.time() diffs_xbk = [3] * len(items_res_to_zyk) # if self.subject == "高中数学": # 难度自动标注暂时不用了 # from multiprocessing.dummy import Pool as ThreadPool # pool = ThreadPool(2) # 比# pool = multiprocessing.Pool(3)速度快 # diff_info = list(pool.map(get_item_diff, items_res_to_zyk)) # items_res_to_zyk = [f[0] for f in diff_info] # diffs_xbk = [f[1] for f in diff_info] # logger.info("----【paper_id:{}】结构化试题中难度标注时间:{}".format(self.wordid, time.time() - t11)) return diffs_xbk, items_res_to_zyk def repeat_check(self, id, one_items, repeat_time): t22 = time.time() stems = one_items["stem"] # if "slave" in one_items and one_items["slave"]: # stem_l = [s["slave_no"] + s["stem"] + ""+ "".join(s["options"]) # if "options" in s and s["options"] else s["slave_no"] + s["stem"] for s in one_items["slave"]] # stems += "".join(stem_l) chachong_item_dict = {"topic_id": id + 1, "subject_id": self.subject, "topic_type_id": one_items["checkType"]["id"], "content": stems} if "slave" in one_items: chachong_item_dict["slave"] = one_items["slave"] if "options" in one_items: chachong_item_dict["options"] = one_items["options"] # print(chachong_item_dict) try: repeat_r = requests.post(url=configs.repeat_ip, json=[chachong_item_dict]).json() print("查重结果:", repeat_r) # [str(id + 1)] if repeat_r: repeat_r = repeat_r[str(id + 1)] else: repeat_r = [] except Exception as e: logger.info("----【paper_id:{}】查重服务异常:{}".format(self.wordid, e)) repeat_r = "保存入库查重服务异常" repeat_time += time.time() - t22 return repeat_r, repeat_time def callback_user(self, res_xbk, callback_url, err=""): """全学科题库、第三方上传试卷回调""" have_err = 0 if not err: if res_xbk["errcode"]: have_err = 1 logger.info(json.dumps({"log_level": "info", "paper_id": self.wordid, "status": "入库失败", "errmsg": res_xbk["errmsg"], "task_name": "保存入库"}, ensure_ascii=False)) else: logger.info(json.dumps({"log_level": "info", "paper_id": self.wordid, "status": "入库成功", "task_name": "保存入库"}, ensure_ascii=False)) try: r = requests.post(callback_url, # json=res, # 可以,但是会进行转义 headers={"Content-Type":"application/json"}, data=json.dumps(res_xbk, ensure_ascii=False).encode("utf-8"), # 文件开头有编码显示 ) print("------【paper_id:{}】,post 回调地址状态:{}===>回调结果: {} -------\n".format(self.wordid, r.status_code, r.text)) if r.status_code != 200: have_err = 1 if r.text != "解析完成!": have_err = 1 logger.info(json.dumps( {"log_level": "info", "paper_id": self.wordid, "callback_res": {"status": "回调成功" if r.status_code == 200 else "回调失败", "status_code": r.status_code, "text": str(r.text), "callback_url": callback_url}, "task_name": "入库回调"}, ensure_ascii=False)) # except TimeoutError: # # print("回调超时") # callback_err += "回调超时" # logger.info("------【paper_id:{}】回调超时-------".format(self.wordid)) except Exception as e: self.callback_err += "回调出错" self.callback_code = 1 have_err = 1 print("------【paper_id:{}】回调出错:{}-------".format(self.wordid, e)) logger.info(json.dumps( {"log_level": "warn", "paper_id": self.wordid, "callback_res": {"status": "回调失败", "callback_url": callback_url}, "errmsg": str(e), "task_name": "入库回调"}, ensure_ascii=False)) else: if r.status_code != 200: have_err = 1 self.callback_code = 1 self.callback_err = "回调出错:{}".format(r.status_code) logger.info(json.dumps( {"log_level": "warn", "paper_id": self.wordid, "callback_res": {"status": "回调失败", "status_code": r.status_code, "text": r.text, "callback_url": callback_url}, "task_name": "入库回调"}, ensure_ascii=False)) return have_err def callback_php(self, res_xbk): """ 将解析校对完的试题返回数据库端 :param res_xbk: 返回给其它端的数据 :return: """ if res_xbk["errmsg"] not in ["ok", "保存失败"]: #公式上传腾讯云失败等 self.callback_err += res_xbk["errmsg"] if self.callback_err: self.callback_code = 1 type_check = {"errcode": self.callback_code, "errmsg": self.callback_err, "parseId": self.wordid, "callback_type": 2, "data": [k["checkType"]["id"] for k in self.items_list], } try: r2 = requests.post(configs.callback_url_taskcheck, data=json.dumps(type_check, ensure_ascii=False).encode("utf-8")) print("------【paper_id:{}】taskcheck post数据:{} 回调地址状态:{}===>回调结果: {}-------".format( self.wordid, type_check, r2.status_code, r2.text)) logger.info(json.dumps({"log_level": "info", "paper_id": self.wordid, "callback_data": type_check, "callback_res": {"status": "回调成功" if r2.status_code == 200 else "回调失败", "status_code": r2.status_code, "text": r2.text, "callback_url": configs.callback_url_taskcheck}, "task_name": "入库回调"}, ensure_ascii=False)) except Exception as e: print("------【paper_id:{}】taskcheck回调出错:{}-------".format(self.wordid, e)) logger.info(json.dumps({"log_level": "warn", "paper_id": self.wordid, "callback_data": type_check, "callback_res": {"status": "回调失败", "callback_url": configs.callback_url_taskcheck}, "errmsg": str(e), "task_name": "入库回调"}, ensure_ascii=False)) else: if r2.status_code != 200: print("------【paper_id:{}】taskcheck回调出错:{}".format(self.wordid, r2.status_code)) logger.info(json.dumps({"log_level": "warn", "paper_id": self.wordid, "callback_data": type_check, "callback_res": {"status": "回调失败", "status_code": r2.status_code, "text": r2.text, "callback_url": configs.callback_url_taskcheck}, "task_name": "入库回调"}, ensure_ascii=False)) if __name__ == '__main__': from pprint import pprint import pickle # items_list = [{'id': '60bdcd734a5335001b0a73cf', 'type': '填空题', # 'stem': '在数列\\({\\lbrace a_{n}\\rbrace }\\)中,若\\({a_{1}=1,a_{n}-a_{n-1}=n(n\\geq 2),}\\)则该数列的通项\\({a_{n}=}\\)', # 'options': [], 'key': '\\( \\frac{1}{2} \\) \n ', # 'subject': '数学', 'errmsgs': '', 'topic_num': 0, 'parse_img': '略', 'analysis': '', 'slave_img': '', # 'parse': '略 \n \n ', 'category': ['月考'], 'grade': '高一', # 'stem_img': "", 'period': '高中', # 'province': '湖北', 'susp_pic': None, 'option_str': '', 'blank_num': None, 'year': 2020, 'difficulty': '中', # 'specials': [], 'upload_time': '2021-07-16T13:51:49.561000Z', # 'key_img': "", 'options_img': [], # 'options_rank': None, 'text_status': None, 'img_status': 1, # 'source': {'type': 's', # 'related_exampaper': [{'paper_id': '5fc0d256407550d0b7d9a43c', # 'file_name': '十堰市一中 2019 级高一下4月月考 数学试题包含答案 ', # 'item_id': None}]}}] # ocr_html = r'数学命题人:王旭辉一、选择题(本大题共12小题,每小题5分,请将正确答案填涂在答题卡相应的位置。)1.cos80°sin40°+sin50°cos10°的值为()A.${\quad \frac{1}{2}}$B.${\quad \frac{\sqrt {2}}{2}}${c.\quad \frac{\sqrt {3}}{2}}${b.\quad -\frac{\sqrt {3}}{2}}$2.已知在等比数列${\lbrace a_{n}\rbrace }$中,${a_{1}=1,\quad a_{5}=9,}${a_{3}=(\quad }$B.±5.±3D.33.已知a>b>0,则下列不等式成立的是()A.${\quad a>b>\frac{a+b}{2}>\sqrt {ab}}$B.${\quad a>\frac{a+b}{2}>\sqrt {ab}>b}$${c.\quad a>\frac{a+b}{2}>b>\sqrt {ab}}$D.${\quad a>\sqrt {ab}>\frac{a+b}{2}>b}$4.给出下列命题:①棱柱的侧棱都相等,侧面都是全等的平行四边形;②用一个平面去截棱锥,棱锥底面与截面之间的部分是棱台;③半圆绕着它的直径所在的直线旋转一周所形成的曲面叫做球面;④棱台的侧棱延长后交于一点,侧面是等腰梯形.其中正确命题的序号是()A.①②④B.①②③C.②③).③5.已知向量a${i=(1,2),\vec {b}=(2,-2),\quad \vec {c}=(\lambda ,-1)}$,若${\vec {c}//(2\vec {a}+\vec {b})}$则λ=()A.-2B.-1${c.\quad -\frac{1}{2}}$D.${\quad \frac{1}{2}}$6.已知△ABC中,a=1,${b=\sqrt {3},}$A=30°,则B等于()A.30°B.30°或150°C.60°D.60°或120°7.在△ABC中,a、b、c分别为角A、B、C的对边,若b=2,c=1,C=30°,则a=()()A.${\quad \sqrt {3}}$B.3C.${\quad \sqrt {5}}$D.18.若(a+b+c)(b+c-a)=3bc,且sinA=2sinBcosC,那么△ABC是()A.直角三角形B.等边三角形C.等腰三角形D.等腰直角三角形9.在等差数列${\lbrace a_{n}\rbrace }$中,${S_{15}>0,\quad S_{16}<0,}$则使${a_{n}>0}$成立的n的最大值为()A.6B.7C.8D.S10.已知等比数列${\lbrace a_{n}\rbrace }$的前n项和为${S_{n},}${S_{5}=2,\quad S_{10}=6,}${a_{16}+a_{17}+a_{18}+a_{19}+a_{20}=}$()A.54B.48C.32D.1611.设点D为△ABC中BC边上的中点,O为AD边上靠近点A的三等分点,则()A.${\quad \overrightarrow {BO}=-\frac{1}{6}\overrightarrow {AB}+\frac{1}{2}\overrightarrow {AC}}$B.${\quad \overrightarrow {BO}=\frac{1}{6}\overrightarrow {AB}-\frac{1}{2}\overrightarrow {AC}}$${\therefore \overrightarrow {BO}=\frac{5}{6}\overrightarrow {AB}-\frac{1}{6}\overrightarrow {AC}}$D.${\quad \overrightarrow {BO}=-\frac{5}{6}\overrightarrow {AB}+\frac{1}{6}\overrightarrow {AC}}$12.△ABC中,角A、B、C的对边分别为a、b、c,且2a+b=2ccosB,若△ABC的面积为${S=\sqrt {3}c,}$则ab的最小值为()A.12B.24C.28D.48二、填空题(本大题共4小题,每小题5分.请将正确答案填写在答题卡相应位置。)13.在数列${\lbrace a_{n}\rbrace }$中,若${a_{1}=1,a_{n}-a_{n-1}=n(n\geq 2),}$则该数列的通项${a_{n}=}$14.已知圆锥的侧面展开图是一个半径为6cm,圆心角为${\frac{2\pi}{3}}$的扇形,则此圆锥的体积为15.已知平面向量${\vec {a}}${\vec {b}}$的夹角为${\frac{\pi}{3},\quad \vec {a}=(\sqrt {3},-1),\quad |\vec {b}|=1,}$则|${2\vec {a}-\vec {b}|=}$16.已知α、β为锐角,sinα=2,tan(β-α)=-,则tanβ=______三.解答题(本大题共6小题,共70分.请将正确答案写在答题卡相应位置。解答应写出文字说明、证明过程或演算步骤.)17.(10分)(1)解不等式(x-1)(x-a)≥0(2)已知${f(x)=\frac{x^{2}+6x+9}{x+1},}$其中x>-1,求f(x)的最小值.18.(12分)已知函数j${{(x)=2\sin x\cos (x+\frac{\pi}{3})+\frac{\sqrt {3}}{2}}}$(1)求函数f(x)的最小正周期;(2)若f(x)+m≤0对${x\∈\lbrack 0,\frac{\pi}{2}\rbrack }$亘成立,求实数m的取值范围.19.(12分)已知等比数列${\lbrace a_{n}\rbrace }$的前n项和为${S_{n},}$且满足${S_{3}=7,\quad S_{6}=63,}$(1)求数列${\lbrace a_{n}\rbrace }$的通项公式;(2)若${b_{n}=a_{n}+\log _{2}a_{n},}$求数列${\lbrace b_{n}\rbrace }$的前n项和${T_{n}}$0.(12分)已知数列${\lbrace a_{n}\rbrace }$满足${a_{1}=\frac{3}{2},}${l_{n}=\frac{a_{n-1}}{2}+\frac{1}{2^{n-1}}(n\geq 2,n\∈N^{*})}$(1)求证:数列${\lbrace 2^{n}a_{n}\rbrace }$是等差数列,并求出数列${\lbrace a_{n}\rbrace }$的通项公式;(2)求数列${\lbrace a_{n}\rbrace }$的前n项和${S_{n}.}$21.(12分)如图,在△ABC中,${C=\frac{\pi}{4},\overrightarrow {CA}\cdot \overrightarrow {CB}=48,}$点D在BC边上,且${AD=5\sqrt {2},\cos \angle ADB=\frac{3}{5}}$(I)求AC,CD的长;(II)求cos∠BAD的值.22.(12分)数列{a,}中,a,=1,当n≥2时,其前n项和S,满足S,2=a,(S,--)${(u_{n})^{\prime }//t_{1},\quad u_{1}-}${4b_{n}//A_{n}-u_{n}\cdot (\beta _{n}-\bar {2})}$(1)求${S_{n}}$的表达式;(2)求数列${\lbrace a_{n}\rbrace }$的通项公式;(3)设${b_{n}=\frac{S_{n}}{2n+1}.}$求数列${\lbrace b_{n}\rbrace }$的前n项和${T_{n}.}$数学答案一、选择题二、填空题${13.\frac{n^{2}+n}{2}\quad 14.\frac{16\sqrt {2}}{3}\pi\quad 15.\sqrt {13}\quad 16.\frac{73}{9}}$三、解答题17.(1)当a>1时,原不等式解集是{x|x≥a,或x≤1};当a=1时,原不等式解集是R:当a<1时,原不等式解集是(x|x|1或X]a}_5分(2)∵x>-1,则x+1>0,由基本不等式得${f(x)=\frac{x^{2}+6x+9}{x+1}=}$+1x+1${(x+1)+\frac{4}{x+i}+4}$${=2\sqrt {(x+1)\cdot \frac{4}{x+1}}+4=8}$(当且仅当${x+1=\frac{4}{x+1}}$时,即当x=1时取得等号)因此,函数${f(x)=\frac{x^{2}+6x+9}{x+1}(x>-1}$)的最小值为810分18.解:(1)因为${f(x)=2\sin x\cos (x+}$${=2\sin x(\cos x\cos \frac{\pi}{3}-\sin x\sin x\sin }$${=2\sin x(\frac{1}{2}\cos x-\frac{\sqrt {3}}{2}\sin x}$${\div \sin x\cos x-\sqrt {3}\sin ^{2}x+\frac{\sqrt {3}}{2}}$${=\frac{1}{2}\sin 2x+\frac{\sqrt {3}}{2}\cos 2x}$${=\sin (2x+\frac{\pi}{3})}$所以f(x)的最小正周期为${f=\frac{2\pi}{2}=\pi}$(2)"f(x)+m≤0对${x\∈\lbrack 0,\frac{\pi}{2}\rbrack }$恒成立"等价于"${f(x)_{\max }+m\leq 0^{n}}$因为${x\∈\lbrack 0,\frac{\pi}{2}\rbrack }$所以${2x+\frac{\pi}{3}\∈\lbrack \frac{\pi}{3},\frac{4\pi}{3}\rbrack }$${2x+\frac{\pi}{3}=\frac{\pi}{2},}${x=\frac{\pi}{12}}$f(x)的最大值为${f(\frac{\pi}{12})=1.}$所以实数m的取值范围为${(-9O,-1\rbrack }$12分19.(1)由题意知S${\because _{6}\neq 2s_{3},q\neq 1\cdots }$${\therefore S_{3}=\frac{a_{1}(7-q^{3})}{1-q}=7}$........................3分${s_{6}=\frac{a_{1}(1-q^{6})}{1-q}}$解得${\left\lbrace \begin{array}{l}{a_{1}=7}\\{q=2}\end{array}\right.\cdots \cdots }$5分${\therefore a_{n}=2^{n-7}\cdots \cdots }$.6分2)由(1)知${b_{n}=2^{n-7}+n-1\cdots \cdots \cdots }$..7分∴T。=(1+2+-..+${y^{-x-3}_{n}}$+[1+2+--+(n-1)].................9分${=2^{n}+\frac{n^{2}-n}{2}-1\cdots }$.........12分20.(1)因为${a_{n}=\frac{a_{n-1}}{2}+\frac{1}{2^{n-1}}(n\geq 2,n\∈N^{*})}$,所以${2^{n}a_{n}=2^{n-1}a_{a}+2}$,即${2^{n}a_{n}-2^{n-1}a_{n-1}=2,}$所以数列${\lbrace 2^{n}a_{n}\rbrace }$是等差数列,且公差d=2,其首项${2a_{1}=3}$所以${2^{7}a_{n}=3+(n-1)\times 2=2n+7}$,解得${a_{n}=\frac{2n+i}{2^{n}}}$${2)\quad S_{n}=\frac{3}{2}+\frac{5}{2^{2}}+\frac{7}{2^{3}}+\cdots +\frac{2n-1}{2^{p-1}}+\frac{2n+1}{z^{\theta }},}$${\frac{S_{n}}{2}=\frac{3}{2^{2}}+\frac{5}{2^{3}}+\frac{7}{2^{4}}+\cdots +\frac{2n-1}{2^{n}}+\frac{2n+1}{2^{n}+1},}$①-②,得${\frac{S_{e}}{z}=\frac{3}{z}+}${+\frac{7}{2^{3}}+\cdots +\frac{1}{2^{s}}}${=\frac{3}{2}+}${z^{e}+x^{2}}$ ${1-\frac{7}{2}}$${=\frac{5}{x}-\frac{2n+5}{a+1},}$所以${S_{n}=5-\frac{2n+5}{2^{n}}}$12分21.(I)在△ABD中,∵${\cos \angle ADB=\frac{3}{5},\therefore \sin \angle ADB=\frac{4}{5}.}$sin∠CAD=sin(∠ADB-${\angle AcD,=\sin \angle ADB\cos \frac{\pi}{4}-\cos \angle ADB\sin \frac{\pi}{4}}$${=\frac{4}{5}\times \frac{\sqrt {2}}{2}-\frac{3}{5}\times \frac{\sqrt {2}}{2}=\frac{\sqrt {2}}{10}}$在△ADC中,由正弦定理得${\frac{Ac}{\sin \angle ADc}=\frac{cD}{\sin \angle CAD}=\frac{AD}{\sin \angle ACD}}${\frac{AC}{\frac{4}{5}}=\frac{cD}{\sqrt {2}}=\frac{5\sqrt {2}}{\sqrt {2}}}$解得${AC=8,CD=\sqrt {2},}$${\Pi)\because \overrightarrow {CA}\cdot \overrightarrow {CB}=48,}${8\cdot cB\cdot \frac{\sqrt {2}}{2}=48.}$解得${c_{B}=6\sqrt {2},}$∴BD=CB-CD=5√2在△ABC中,A${B=\sqrt {8^{2}+(6\sqrt {2})-2\times 8\times 6\sqrt {2}}=2\sqrt {10}}$,在△ABD中${\infty \angle BAD=\frac{(2\sqrt {10})^{2}+(5\sqrt {2})}{2\times 2\sqrt {10}\times 5\sqrt {2}}=\frac{\sqrt {5}}{5}}$22.(1)由${\sin ^{2}=5}=S_{n}-5_{5}}&{T_{6}^{z}=-7}}${s_{w}^{2}=(S_{w}-S_{6-1})(s_{w}-\frac{1}{2})=s_{e}^{\vec {x}}-\frac{1}{2}S_{\varphi }-S_{6}=+\frac{7}{2}s}$${\therefore \frac{1}{s_{n}}-\frac{1}{S_{n-1}}=2(n\supseteq 2)}$${\therefore \left\lbrace \begin{array}{l}{\frac{1}{s}\rbrace }$是以${\frac{1}{s_{7}}}$为首项,以2为公差的等差数列,${\therefore \frac{1}{s_{n}}=2n-1,}$${s_{n}=\frac{1}{2n-1}(n\∈N^{*}.}$${a_{n}=\left\lbrace \begin{array}{l}{7,n=7}\\{\frac{1}{2p-1}-\frac{7}{2n-3},n\geq }\end{array}\right.}$(3)${b_{n}=\frac{7}{(2n-7)(2n+i)}=}$${\therefore 7_{8}=\frac{1}{2}(1-\frac{1}{3}+\frac{1}{3}-\frac{1}{5}+\frac{1}{2\pi-1}-\frac{1}{2\pi+1})=\frac{1}{2}(-\frac{1}{2n+1})=\frac{\pi}{2\pi}}$12/' # svg_data = {'svg_html_data': ['1.在数列{an}中,若a1=1,an−an−1=n(n≥2),则该数列的通项an=【答案】12 \n 【解析】略 \n \n 编辑删除'], 'svg_path': ''} # # wordid = "5fc0d256407550d0b7d9a43c" # # res=Ruku(items_list, ocr_html, svg_data, wordid).upload_img() # pprint(res) f1 = open(r"F:\zwj\Text_Structure\new_tiku_structure_v3_sci\struct_items.pickle",'rb').read() items_list = pickle.loads(f1) # pprint(items_list) f2 = open(r"F:\zwj\Text_Structure\new_tiku_structure_v3_sci\svg_data.pickle", 'rb').read() svg_data = pickle.loads(f2) # svg_data = {"svg_path": "", "svg_html_data": ""} htmlt = open(r"F:\zwj\Text_Structure\accept_files\624802d612cd45a7836f342e.html", "r", encoding="utf8").read() res = Ruku(items_list, htmlt, svg_data, "20220408", {"callback_url": "1223", "source": ""}, "高中物理").upload_img()