2 달 전 · 1460b92170
--- a/configs.py
+++ b/configs.py
@@ -116,9 +116,10 @@ class ProductionCfg:  # production
 
				     kps_Hmath_ip = "http://172.16.2.5:13356/auto_labels"
			
 
				     repeat_ip = "http://10.19.1.18:8866/api/repeat/subject"
			
 
				     # topic_segment_ip = "http://10.19.1.14:10622/math_phy_TopicSegment_predict"  #CPU
			
 
				-    topic_segment_ip = "http://10.19.1.10:10622/math_phy_TopicSegment_predict"  #GPU
			
 
				-    # topic_segment_ip = "http://49.232.72.198:10622/math_phy_TopicSegment_predict"
			
 
				-    phy_topicType_ip = "http://10.19.1.6:10611/phy_topicType_predict"
			
 
				+    # topic_segment_ip = "http://10.19.1.10:10622/math_phy_TopicSegment_predict"  #GPU
			
 
				+    topic_segment_ip = "http://49.232.72.198:10622/math_phy_TopicSegment_predict"
			
 
				+    # phy_topicType_ip = "http://10.19.1.6:10611/phy_topicType_predict"
			
 
				+    phy_topicType_ip = "http://10.19.1.21:10611/phy_topicType_predict"  # 速度更快
			
 
				     callback_url_taskcheck = "http://api.tk.zhixinhuixue.com/v1/interior-api/record"
			
 
				 
			
 
				 
			
@@ -161,7 +162,9 @@ RES_FOLDER = config_class.RES_FOLDER
 
				 if not os.path.isdir(RES_FOLDER):
			
 
				     os.makedirs(RES_FOLDER)
			
 
				 
			
 
				-new_img_ip = "http://{0}:{1}/ser_static".format(external_ip, server_port)
			
 
				+# 对外地址要改为域名：zsycgi.zhixinhuixue.com/cgi/ai/nlp/ai_paper_parse/like
			
 
				+# new_img_ip = "http://{0}:{1}/ser_static".format(external_ip, server_port)
			
 
				+new_img_ip = "http://{0}/ser_static".format("zsycgi.zhixinhuixue.com/cgi/ai/nlp/ai_paper_parse/like")
			
 
				 old_img_ip = config_class.old_img_ip
			
 
				 RawImg_UploadFolder = config_class.raw_img_upload_folder
			
 
				 mathpix_ip = config_class.mathpix_ip
			
--- a/structure/final_structure.py
+++ b/structure/final_structure.py
@@ -45,8 +45,8 @@ def one_item_structure(xyz):
 
				         # return one_item
			
 
				         if re.match(r"[A-Z][A-Z;；和与、､\s]*?$", ans.strip()):
			
 
				             one_item["type"] = "单选题" if len(ans.strip()) == 1 else "多选题"
			
 
				-        elif re.search(r"[(（]\s*[)）]", one_item["stem"]) or \
			
 
				-                len(re.findall(r"[\n\s]\s*[A-D]\s*[.．、､]", one_item["stem"])) >= 4:
			
 
				+        elif (re.search(r"[(（]\s*[)）]", one_item["stem"]) or len(re.findall(r"[\n\s]\s*[A-D]\s*[.．、､]", one_item["stem"])) >= 4)\
			
 
				+                and (len(re.findall("\n[(（]\s*[1-9][)）]", one_item["stem"]))<=1 or re.search("回答下列问题[:：]", one_item["stem"]) is None):
			
 
				             one_item["type"] = "选择题"
			
 
				         elif re.findall(r"_{2,}", one_item["stem"]):
			
 
				             one_item["type"] = "填空题"
			
--- a/structure/structure_main.py
+++ b/structure/structure_main.py
@@ -1,7 +1,7 @@
 
				 #!/usr/bin/env/python
			
 
				 # -*- coding:utf-8 -*-
			
 
				-# import sys
			
 
				-# sys.path.append(r"F:\zwj\Text_Structure\new_tiku_structure_v3_sci")
			
 
				+import sys
			
 
				+sys.path.append(r"F:\zwj\Text_Structure\new_tiku_structure_v3_sci")
			
 
				 
			
 
				 from pprint import pprint
			
 
				 from typing import Any
			
@@ -343,7 +343,7 @@ if __name__ == '__main__':
 
				 
			
 
				     # path2 = r"C:\Users\Python\Desktop\bug\5-9\663c90361ec1003b58557474.html"
			
 
				     path2 = r"F:\zwj\Text_Structure\accept_files\667cb9c0c3c4da9e7009b8c4.html"
			
 
				-    path2 = r"F:\zwj\Text_Structure\accept_files\668f4d57c3c4da9e7009bcd8.html"
			
 
				+    path2 = r"F:\zwj\Text_Structure\accept_files\66e3ec74c3c4da9e7009cfb5.html"
			
 
				     # path2 = r"C:\Users\Python\Desktop\bug\6419746d11a1cdad550f5502.html"
			
 
				     # path2 = r"F:\zwj\Text_Structure\new_tiku_structure_v3_sci\data\620bbf7aa7d375f4518b98e1.html"
			
 
				     # path2 = r"F:\zwj\new_word_text_extract_v2\data\地理\2\2020-2021学年广东省揭阳市揭西县五校九年级（下）第二次联考地理试卷-普通用卷.html"
			
@@ -361,9 +361,9 @@ if __name__ == '__main__':
 
				     # print(html)
			
 
				     # html = "\n1、已知集合M满足{1，2}≤M≤{1，2，5，6，7}，则\n符合条件的集合M有__个."
			
 
				     # html = html.replace('<img src="files', '<img src="/word/media')
			
 
				-    res1 = WordParseStructure(html, "668f4d57c3c4da9e7009bcd8",
			
 
				+    res1 = WordParseStructure(html, "66e3ec74c3c4da9e7009cfb5",
			
 
				                               is_reparse=1, must_latex=1,
			
 
				-                              source="qtk", subject="数学")()
			
 
				+                              source="qtk", subject="高中化学")()
			
 
				     # new_fpath = os.path.join(r"F:\zwj\Text_Structure\fail_files", "res1.html")
			
 
				     # re_f = open(new_fpath, 'a+', encoding='utf-8')
			
 
				     # for i in res1[0]["items"]:
			
--- a/更改记录.txt
+++ b/更改记录.txt
@@ -1,3 +1,8 @@
 
				+2024.6.6  将物理题型预测服务部署至10.19.1.21(GPU服务器)，虽然只用了CPU，但推理速度与10.19.1.6(CPU服务器)相比明显快了很多。
			
 
				+        初步对比两台服务器，10.19.1.6核数24，剩余内存很多，10.19.1.21核数8，剩余内存不多，
			
 
				+        但10.19.1.6处理任务多、线程多，且CPU使用达到100%，而10.19.1.21的CPU使用率不高
			
 
				+
			
 
				+2024.7.8-15：
			
 
				 1、再解析清洗中，遇到表格里面公式的情况，需要对公式的标签"math-tex"进行清洗--->修改html_again_parse.py
			
 
				 2、入库保存记录中发现存在一些“latex替换为imgurl失败”的情况，
			
 
				    发现是：从items_list中获取$xxx$公式时，经过查重替换的公式也被提取出来，但替换后的latex与原本的latex不一定写法完全一样
			
@@ -7,3 +12,8 @@
 
				    最后保存入库时，将红色标记去掉--->在ruku_opera.py中修改sub1
			
 
				 4、试题中的多个连续空格在前端显示时会被折叠成1个，需要将多个空格改为多个&nbsp;====>修改washutil.py中的convert_huanhang
			
 
				 2024.7.11 html清洗时漏掉了<meta charset="utf-8" />的“<” 不能替换为&lt;  ====》 修改washutil.py中html_cleal 
			
 
				+
			
 
				+2024.9.5 解析工具校对过程中截图粘贴的图片对外不显示问题处理：增加对外域名===>修改文科、理科结构化解析中configs.py中的new_img_ip
			
 
				+
			
 
				+2024.9.12  解答题的小题为选择题格式且未给定题型时，单题结构化阶段需要自行判断题型，此时容易被误判为选择题，
			
 
				+      =====>增加限制条件，修改final_structure.py第48-49行