123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145 |
import copy
import re
import time

import configs
# Sample input: one parsed document containing a single multiple-choice question.
wordid = "6195e6d034b3123bf9d1fbf7"

# Raw document text; formulas are delimited by `$...$`.
htmlt = """
一、选择题
1.随着$4G$网络和智能手机的普及,各种导航软件层出不穷,给人们的生活带来了极大的便利。
国庆放假期间,小明驾车从家里到济南大明湖游玩,某导航软件推荐的最佳驾车路线里程为
$220$公里,时间为3小时2分,下列说法正确的是
A.“3小时2分”指的是时间间隔
B."$220$公里”指的是位移大小
C.按照推荐的最佳路线走,位移一定最小
D.按照推荐的最佳路线的里程和时间推算,该车的平均速度为$20.1m/s$
"""

# Structured form of the question: stem, answer key, explanation and options.
items_list = [
    {
        "stem": "随着$4G$网络和智能手机的普及,各种导航软件层出不穷,给人们的生活带来了极大的便利。国庆放假期间,小明驾车从家里到济南大明湖游玩,某导航软件推荐的最佳驾车路线里程为$220$公里,时间为$3$小时$2$分,下列说法正确的是",
        "key": "见解析",
        "parse": "略",
        "options": [
            '“$3$小时$2$分”指的是时间间隔',
            '"$220$公里”指的是位移大小',
            "按照推荐的最佳路线走,位移一定最小",
            "按照推荐的最佳路线的里程和时间推算,该车的平均速度为$20.1m/s$",
        ],
        "options_rank": 1,
    },
]

# Rendered-formula pictures: key inside the bucket ...
put_key_mjmath = [
    "/zyk/uploadfiles/wording/6195e6d034b3123bf9d1fbf7/MJMATH-1637661579842495.png",
]
# ... and the matching local storage location.
local_mjmath = [
    "F:/zwj/Text_Structure/img_folder/6195e6d034b3123bf9d1fbf7/svg_mjmath/MJMATH-1637661579842495.png",
]

# LaTeX source (without delimiters) -> `<img>` tag of its rendered picture.
ltx2url = {
    "4G": (
        '<img src="http://zxhx-1302712961.cos.ap-shanghai.myqcloud.com/zyk/uploadfiles/wording/6195e6d034b3123bf9d1fbf7/MJMATH-1637661579842495.png'
        '" width="18.5" height="12.5" data-latex="$4G$" />'
    ),
}
# Collect every picture that has to be uploaded. `put_key_list` holds the keys
# inside the bucket and `localnewpic_list` the matching local file paths; both
# are extended in lockstep so that index i always refers to the same picture.
put_key_list = []  # keys inside the bucket
localnewpic_list = []  # local storage locations of the pictures

# Pictures named new_image* / eq_img_<n>.png under /ser_static/ can be referenced
# from the raw html as well as from the structured items, so both are scanned.
_new_img_pattern = re.compile(
    r'<img src=".*?/ser_static/.*?/(new_image[^"]*?|eq_img_\d+\.png)"')
new_imgs = _new_img_pattern.findall(str(htmlt))
new_imgs2 = _new_img_pattern.findall(str(items_list))
# De-duplicate in sorted order: plain set iteration would make the upload
# order differ from run to run.
new_imgs = sorted(set(new_imgs + new_imgs2))
if new_imgs:
    file_path = "F:/zwj/Text_Structure/img_folder/" + str(wordid)
    for img_name in new_imgs:
        put_key_list.append("/zyk/uploadfiles/wording/" + str(wordid) + "/" + img_name)
        # eq_img_* files are stored in the field_eq sub-folder
        sub_dir = "/field_eq/" if "eq_img_" in img_name else "/"
        localnewpic_list.append(file_path + sub_dir + img_name)

# Each match is a (directory ending in word/media, file name) pair.
old_imgs = re.findall(r'<img src=".*?/ser_static[/\\](.*?/word/media)/(image[^"]*?)"', str(htmlt))
if old_imgs:
    # Only relevant for documents from the earliest imports; later ones are
    # uploaded to Tencent Cloud already during the wordbin step.
    # Pictures added back then are still stored as base64 in the preview page
    # and are saved locally the new way if the document is parsed again; only
    # the pictures of the very first parse live on the server's disk, so they
    # are picked up here and uploaded together with the rest.
    # The directory of the first match is used for all old pictures.
    file_path = "F:/zwj/word_folder/" + old_imgs[0][0]
    for img in old_imgs:
        put_key_list.append("/zyk/uploadfiles/wording/" + str(wordid) + "/" + img[1])
        localnewpic_list.append(file_path + "/" + img[1])

# Rendered-formula pictures are always uploaded.
put_key_list.extend(put_key_mjmath)
localnewpic_list.extend(local_mjmath)
# ------------------------------------------------------------------------------------------
# Point every image reference at its cloud address and build `items_res_to_xbk`,
# the per-question payload in which LaTeX is replaced by its rendered picture.
# NOTE(review): the original indentation was lost; everything down to the final
# prints is assumed to belong to the `if localnewpic_list:` body because it
# reads names (`new_img_local`, `time6`) that are only bound inside it.
items_res_to_xbk = []
if localnewpic_list:  # there are local pictures to publish
    new_img_online = ('<img src="http://' + configs.public_bucket_addr
                      + "/zyk/uploadfiles/wording/" + str(wordid))
    # pictures produced during structuring
    new_img_local = '<img src="' + configs.new_img_ip + '/' + str(wordid)
    # pictures produced by the initial wordbin step; "" when there are none
    old_img_local = ('<img src="' + configs.old_img_ip + '/' + old_imgs[0][0]
                     if old_imgs else "")

    # A lone backslash inside an <img src="..."> value (one per tag is rewritten).
    _img_backslash = re.compile(r'(<img src="[^"]*?[a-z\d])\\(?!\\)([^"]*?")')

    def sub1(s):
        """Return *s* with local ``<img src`` prefixes replaced by the bucket URL.

        Falsy values (``None``, ``""``) are returned unchanged.
        """
        if not s:
            return s
        s = _img_backslash.sub(r"\1/\2", s)  # turn "\" in the path into "/"
        s = s.replace(new_img_local, new_img_online)
        if old_img_local:
            s = s.replace(old_img_local, new_img_online)
        return s

    def sub2(s):
        """Return *s* with each formula listed in ``ltx2url`` replaced by its value.

        Formulas are ``$...$`` or ``\\(...\\)`` spans; the delimiters are removed
        before the lookup. Falsy values are returned unchanged.
        """
        if not s:
            return s
        formulas = set(re.findall(r'\$.*?\$', s))
        formulas.update(re.findall(r'\\\(.*?\\\)', s))
        for ltx in formulas:
            bare_ltx = ltx.replace("$", "").replace("\\(", "").replace("\\)", "")
            if bare_ltx in ltx2url:
                s = s.replace(ltx, ltx2url[bare_ltx])
        return s

    time6 = time.time()
    # 4>> Replace the image addresses inside the structured questions. Deep copy:
    # a shallow list copy would rewrite the dicts of `items_list` in place.
    items_res_to_zyk = copy.deepcopy(items_list)
    for one_items in items_res_to_zyk:
        for k in ("stem", "key", "parse", "options"):
            if k not in one_items:
                continue
            if k == "options":
                one_items[k] = [sub1(option) for option in one_items[k]]
            else:
                one_items[k] = sub1(one_items[k])

    # ----------- difficulty and knowledge points ------------------------
    # Every question gets difficulty 3 and an empty knowledge mapping.
    # NOTE: the original carried disabled code here for copying topic_type_id /
    # blank_num from one_items["checkType"], for switching to the *_img fields
    # when img_status == 1, and for filling "knowledge" via get_phy_kps_auto.
    diffs_xbk = [3] * len(items_res_to_zyk)
    keys_items = ["stem", "key", "parse", "options", "options_rank"]
    for nn, one_items in enumerate(items_res_to_zyk):
        new_one_item = {"difficulty": diffs_xbk[nn], "knowledge": {}}
        for k in keys_items:
            if k not in one_items:
                if ltx2url:
                    # "topic_num" may be absent as well; fall back to the
                    # 1-based position instead of raising KeyError here.
                    print("----【word_id:{}】第{}道题{}字段有问题".format(
                        wordid, one_items.get("topic_num", nn + 1), k))
                continue
            value = one_items[k]
            # The result goes only into the new payload, so the entries of
            # `items_res_to_zyk` keep their LaTeX source.
            if ltx2url and k != "options_rank":
                if k == "options":
                    value = [sub2(option) for option in value]
                else:
                    value = sub2(value)
            new_one_item[k] = value
        items_res_to_xbk.append(new_one_item)

    # 5>> Replace the image addresses in the raw html with the cloud addresses.
    htmlt = sub1(htmlt)
    print(htmlt)
    print(items_res_to_xbk)
    print("----【word_id:{}】结构化试题中图片地址替换时间:{}".format(wordid, time.time() - time6))
|