cdZWj преди 2 месеца
родител
ревизия
1460b92170
променени са 4 файла, в които са добавени 24 реда и са изтрити 11 реда
  1. 7 4
      configs.py
  2. 2 2
      structure/final_structure.py
  3. 5 5
      structure/structure_main.py
  4. 10 0
      更改记录.txt

+ 7 - 4
configs.py

@@ -116,9 +116,10 @@ class ProductionCfg:  # production
     kps_Hmath_ip = "http://172.16.2.5:13356/auto_labels"
     repeat_ip = "http://10.19.1.18:8866/api/repeat/subject"
     # topic_segment_ip = "http://10.19.1.14:10622/math_phy_TopicSegment_predict"  #CPU
-    topic_segment_ip = "http://10.19.1.10:10622/math_phy_TopicSegment_predict"  #GPU
-    # topic_segment_ip = "http://49.232.72.198:10622/math_phy_TopicSegment_predict"
-    phy_topicType_ip = "http://10.19.1.6:10611/phy_topicType_predict"
+    # topic_segment_ip = "http://10.19.1.10:10622/math_phy_TopicSegment_predict"  #GPU
+    topic_segment_ip = "http://49.232.72.198:10622/math_phy_TopicSegment_predict"
+    # phy_topicType_ip = "http://10.19.1.6:10611/phy_topicType_predict"
+    phy_topicType_ip = "http://10.19.1.21:10611/phy_topicType_predict"  # 速度更快
     callback_url_taskcheck = "http://api.tk.zhixinhuixue.com/v1/interior-api/record"
 
 
@@ -161,7 +162,9 @@ RES_FOLDER = config_class.RES_FOLDER
 if not os.path.isdir(RES_FOLDER):
     os.makedirs(RES_FOLDER)
 
-new_img_ip = "http://{0}:{1}/ser_static".format(external_ip, server_port)
+# 对外地址要改为域名:zsycgi.zhixinhuixue.com/cgi/ai/nlp/ai_paper_parse/like
+# new_img_ip = "http://{0}:{1}/ser_static".format(external_ip, server_port)
+new_img_ip = "http://{0}/ser_static".format("zsycgi.zhixinhuixue.com/cgi/ai/nlp/ai_paper_parse/like")
 old_img_ip = config_class.old_img_ip
 RawImg_UploadFolder = config_class.raw_img_upload_folder
 mathpix_ip = config_class.mathpix_ip

+ 2 - 2
structure/final_structure.py

@@ -45,8 +45,8 @@ def one_item_structure(xyz):
         # return one_item
         if re.match(r"[A-Z][A-Z;;和与、、\s]*?$", ans.strip()):
             one_item["type"] = "单选题" if len(ans.strip()) == 1 else "多选题"
-        elif re.search(r"[((]\s*[))]", one_item["stem"]) or \
-                len(re.findall(r"[\n\s]\s*[A-D]\s*[..、、]", one_item["stem"])) >= 4:
+        elif (re.search(r"[((]\s*[))]", one_item["stem"]) or len(re.findall(r"[\n\s]\s*[A-D]\s*[..、、]", one_item["stem"])) >= 4)\
+                and (len(re.findall("\n[((]\s*[1-9][))]", one_item["stem"]))<=1 or re.search("回答下列问题[::]", one_item["stem"]) is None):
             one_item["type"] = "选择题"
         elif re.findall(r"_{2,}", one_item["stem"]):
             one_item["type"] = "填空题"

+ 5 - 5
structure/structure_main.py

@@ -1,7 +1,7 @@
 #!/usr/bin/env/python
 # -*- coding:utf-8 -*-
-# import sys
-# sys.path.append(r"F:\zwj\Text_Structure\new_tiku_structure_v3_sci")
+import sys
+sys.path.append(r"F:\zwj\Text_Structure\new_tiku_structure_v3_sci")
 
 from pprint import pprint
 from typing import Any
@@ -343,7 +343,7 @@ if __name__ == '__main__':
 
     # path2 = r"C:\Users\Python\Desktop\bug\5-9\663c90361ec1003b58557474.html"
     path2 = r"F:\zwj\Text_Structure\accept_files\667cb9c0c3c4da9e7009b8c4.html"
-    path2 = r"F:\zwj\Text_Structure\accept_files\668f4d57c3c4da9e7009bcd8.html"
+    path2 = r"F:\zwj\Text_Structure\accept_files\66e3ec74c3c4da9e7009cfb5.html"
     # path2 = r"C:\Users\Python\Desktop\bug\6419746d11a1cdad550f5502.html"
     # path2 = r"F:\zwj\Text_Structure\new_tiku_structure_v3_sci\data\620bbf7aa7d375f4518b98e1.html"
     # path2 = r"F:\zwj\new_word_text_extract_v2\data\地理\2\2020-2021学年广东省揭阳市揭西县五校九年级(下)第二次联考地理试卷-普通用卷.html"
@@ -361,9 +361,9 @@ if __name__ == '__main__':
     # print(html)
     # html = "\n1、已知集合M满足{1,2}≤M≤{1,2,5,6,7},则\n符合条件的集合M有__个."
     # html = html.replace('<img src="files', '<img src="/word/media')
-    res1 = WordParseStructure(html, "668f4d57c3c4da9e7009bcd8",
+    res1 = WordParseStructure(html, "66e3ec74c3c4da9e7009cfb5",
                               is_reparse=1, must_latex=1,
-                              source="qtk", subject="学")()
+                              source="qtk", subject="高中化学")()
     # new_fpath = os.path.join(r"F:\zwj\Text_Structure\fail_files", "res1.html")
     # re_f = open(new_fpath, 'a+', encoding='utf-8')
     # for i in res1[0]["items"]:

+ 10 - 0
更改记录.txt

@@ -1,3 +1,8 @@
+2024.6.6  将物理题型预测服务部署至10.19.1.21(GPU服务器),虽然只用了CPU,但推理速度与10.19.1.6(CPU服务器)相比明显快了很多。
+        初步对比两台服务器,10.19.1.6核数24,剩余内存很多,10.19.1.21核数8,剩余内存不多,
+        但10.19.1.6处理任务多、线程多,且CPU使用达到100%,而10.19.1.21的CPU使用率不高
+
+2024.7.8-15:
 1、再解析清洗中,遇到表格里面公式的情况,需要对公式的标签"math-tex"进行清洗--->修改html_again_parse.py
 2、入库保存记录中发现存在一些“latex替换为imgurl失败”的情况,
    发现是:从items_list中获取$xxx$公式时,经过查重替换的公式也被提取出来,但替换后的latex与原本的latex不一定写法完全一样
@@ -7,3 +12,8 @@
    最后保存入库时,将红色标记去掉--->在ruku_opera.py中修改sub1
 4、试题中的多个连续空格在前端显示时会被折叠成1个,需要将多个空格改为多个&nbsp;====>修改washutil.py中的convert_huanhang
 2024.7.11 html清洗时漏掉了一种情况:<meta charset="utf-8" />中的“<”不能被替换为&lt;  ====》 修改washutil.py中html_cleal 
+
+2024.9.5 解析工具校对过程中截图粘贴的图片对外不显示问题处理:增加对外域名===>修改文科、理科结构化解析中configs.py中的new_img_ip
+
+2024.9.12  解答题的小题为选择题格式且未给定题型时,单题结构化需要自行判断题型,此时容易把整道题误判为选择题,
+      =====>增加限制条件,修改final_structure.py第48-49行