ruku_test.py 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145
  1. import re
  2. import time
  3. import configs
  4. wordid = "6195e6d034b3123bf9d1fbf7"
  5. htmlt = """
  6. 一、选择题
  7. 1.随着$4G$网络和智能手机的普及,各种导航软件层出不穷,给人们的生活带来了极大的便利。
  8. 国庆放假期间,小明驾车从家里到济南大明湖游玩,某导航软件推荐的最佳驾车路线里程为
  9. $220$公里,时间为3小时2分,下列说法正确的是
  10. A.“3小时2分”指的是时间间隔
  11. B."$220$公里”指的是位移大小
  12. C.按照推荐的最佳路线走,位移一定最小
  13. D.按照推荐的最佳路线的里程和时间推算,该车的平均速度为$20.1m/s$
  14. """
  15. items_list = [{
  16. "stem":"随着$4G$网络和智能手机的普及,各种导航软件层出不穷,给人们的生活带来了极大的便利。国庆放假期间,小明驾车从家里到济南大明湖游玩,某导航软件推荐的最佳驾车路线里程为$220$公里,时间为$3$小时$2$分,下列说法正确的是",
  17. "key":"见解析",
  18. "parse": "略",
  19. "options": ['“$3$小时$2$分”指的是时间间隔', '"$220$公里”指的是位移大小', "按照推荐的最佳路线走,位移一定最小", "按照推荐的最佳路线的里程和时间推算,该车的平均速度为$20.1m/s$"],
  20. "options_rank": 1
  21. }]
  22. put_key_mjmath = ["/zyk/uploadfiles/wording/6195e6d034b3123bf9d1fbf7/MJMATH-1637661579842495.png"] # 桶中key
  23. local_mjmath = ["F:/zwj/Text_Structure/img_folder/6195e6d034b3123bf9d1fbf7/svg_mjmath/MJMATH-1637661579842495.png"] # 本地图片存储位置
  24. ltx2url = {"4G": '<img src="http://zxhx-1302712961.cos.ap-shanghai.myqcloud.com/zyk/uploadfiles/wording/6195e6d034b3123bf9d1fbf7/MJMATH-1637661579842495.png'
  25. + '" width="18.5" height="12.5" data-latex="$4G$" />'}
  26. #
  27. put_key_list = [] # 桶中key
  28. localnewpic_list = [] # 本地图片存储位置
  29. new_imgs = re.findall(r'<img src=".*?/ser_static/.*?/(new_image[^"]*?|eq_img_\d+\.png)"', str(htmlt))
  30. new_imgs2 = re.findall(r'<img src=".*?/ser_static/.*?/(new_image[^"]*?|eq_img_\d+\.png)"', str(items_list))
  31. new_imgs.extend(new_imgs2)
  32. # print(new_imgs)
  33. new_imgs = list(set(new_imgs))
  34. if new_imgs:
  35. put_key_list.extend(["/zyk/uploadfiles/wording/" + str(wordid) + "/{}".format(img_name)
  36. for img_name in new_imgs])
  37. file_path = "F:/zwj/Text_Structure/img_folder/" + str(wordid)
  38. localnewpic_list.extend([file_path + "/field_eq/" + img_name if "eq_img_" in img_name
  39. else file_path + "/" + img_name for img_name in new_imgs])
  40. old_imgs = re.findall(r'<img src=".*?/ser_static[/\\](.*?/word/media)/(image[^"]*?)"', str(htmlt))
  41. if old_imgs: # 针对最开始入库的情况,后期都直接在wordbin时就上传腾讯云了
  42. # 以前新添加的图片在左预览页仍以base64保存,若再解析,则会按新的方法保存到本地
  43. # 只是最开始解析的老图片是保存在服务器本地的,若入库时,则一起拿出来再上传腾讯云
  44. file_path = "F:/zwj/word_folder/" + old_imgs[0][0]
  45. put_key_list.extend(["/zyk/uploadfiles/wording/" + str(wordid) + "/{}".format(img[1])
  46. for img in old_imgs])
  47. localnewpic_list.extend([file_path + "/" + img[1] for img in old_imgs])
  48. if True:
  49. put_key_list.extend(put_key_mjmath)
  50. localnewpic_list.extend(local_mjmath)
  51. # ------------------------------------------------------------------------------------------
  52. items_res_to_xbk = []
  53. if localnewpic_list: # 本地有新图片时
  54. new_img_online = '<img src="http://' + configs.public_bucket_addr + "/zyk/uploadfiles/wording/" + str(wordid)
  55. new_img_local = '<img src="' + configs.new_img_ip + '/' + str(wordid) # 结构化中产生的图片
  56. old_img_local = '<img src="' + configs.old_img_ip + '/' + old_imgs[0][0] if old_imgs else "" # wordbin最初产生的图片
  57. def sub1(s):
  58. if s:
  59. s = re.sub(r'(<img src="[^"]*?[a-z\d])\\(?!\\)([^"]*?")', r"\1/\2", s) # 将路径中的\改为/
  60. s = s.replace(new_img_local, new_img_online)
  61. if old_img_local:
  62. return s.replace(old_img_local, new_img_online)
  63. return s
  64. def sub2(s):
  65. if s:
  66. all_ltx = re.findall(r'\$.*?\$', s) # 查找试题结构中的公式
  67. all_ltx.extend(re.findall('\\\\\(.*?\\\\\)', s))
  68. all_ltx = list(set(all_ltx))
  69. for ltx in all_ltx:
  70. new_ltx = ltx.replace("$", "").replace("\\(", "").replace("\\)", "")
  71. if new_ltx in ltx2url: # 将latex换为其渲染图片的线上可访问地址
  72. s = s.replace(ltx, ltx2url[new_ltx])
  73. return s
  74. time6 = time.time()
  75. # 4>> 结构化题目中图片地址替换,需要区分下学管端还是云题库!!!!!一定会保存一份在资源库
  76. items_res_to_zyk = items_list.copy()
  77. for one_items in items_res_to_zyk:
  78. for k in ["stem", "key", "parse", "options"]: # "analysis",
  79. if k in one_items:
  80. if k == "options":
  81. one_items[k] = list(map(sub1, one_items[k]))
  82. else:
  83. one_items[k] = sub1(one_items[k])
  84. # -----------难度和知识点自动标注------------------------
  85. t11 = time.time()
  86. diffs_xbk = [3] * len(items_res_to_zyk)
  87. # items_res_to_xbk = items_res_to_zyk.copy()
  88. # if self.callback_url and self.ltx2url:
  89. for nn, one_items in enumerate(items_res_to_zyk):
  90. new_one_item = {"difficulty": diffs_xbk[nn],
  91. "knowledge": {}}
  92. # if 1:
  93. # new_one_item["topic_type_id"] = one_items["checkType"]["id"]
  94. # if one_items["checkType"]["name"] == "填空题":
  95. # new_one_item["blank_num"] = one_items["blank_num"]
  96. keys_items = ["stem", "key", "parse", "options", "options_rank"]
  97. # if one_items['img_status'] == 1 and ("stem_img" in one_items and one_items["stem_img"]):
  98. # logger.info("----【word_id:{}】mathjax2svg所取的字段是带img的".format(self.wordid))
  99. # keys_items = ["stem_img", "key_img", "parse_img", "options_img"]
  100. if 1 and ltx2url:
  101. for k in keys_items:
  102. if k in one_items:
  103. if k == "options":
  104. one_items[k] = list(map(sub2, one_items[k]))
  105. elif k != "options_rank":
  106. one_items[k] = sub2(one_items[k])
  107. new_one_item[k] = one_items[k]
  108. else:
  109. print("----【word_id:{}】第{}道题{}字段有问题".format(wordid, one_items["topic_num"], k))
  110. else:
  111. for k in keys_items:
  112. if k in one_items:
  113. new_one_item[k] = one_items[k]
  114. # if self.subject == "高中物理":
  115. # temp_items = {"topic_id": one_items["topic_num"]}
  116. # temp_items["content"] = new_one_item["stem"]
  117. # temp_items["parse"] = str(new_one_item["key"]) + "<br/>" + new_one_item["parse"] + "<br/>" + \
  118. # new_one_item["analysis"]
  119. # temp_items["option"] = new_one_item["options"] if "options" in new_one_item else []
  120. # auto_kps = self.get_phy_kps_auto(temp_items)
  121. # # print("自动标注考点:", auto_kps)
  122. # new_one_item["knowledge"] = auto_kps
  123. items_res_to_xbk.append(new_one_item)
  124. # 5>> ocr-htmlt中图片地址替换成云上地址
  125. htmlt = re.sub(r'(<img src="[^"]*?[a-z\d])\\(?!\\)([^"]*?")', r"\1/\2", htmlt)
  126. htmlt = htmlt.replace(new_img_local, new_img_online)
  127. if old_img_local:
  128. htmlt = htmlt.replace(old_img_local, new_img_online)
  129. print(htmlt)
  130. print(items_res_to_xbk)
  131. print("----【word_id:{}】结构化试题中图片地址替换时间:{}".format(wordid, time.time() - time6))