1 year ago · 6d60a081d1
--- a/__pycache__/config.cpython-38.pyc
+++ b/__pycache__/config.cpython-38.pyc
--- a/__pycache__/data_preprocessing.cpython-38.pyc
+++ b/__pycache__/data_preprocessing.cpython-38.pyc
--- a/__pycache__/formula_process.cpython-38.pyc
+++ b/__pycache__/formula_process.cpython-38.pyc
--- a/__pycache__/heap_sort.cpython-38.pyc
+++ b/__pycache__/heap_sort.cpython-38.pyc
--- a/__pycache__/hnsw_model_train.cpython-38.pyc
+++ b/__pycache__/hnsw_model_train.cpython-38.pyc
--- a/__pycache__/hnsw_retrieval.cpython-38.pyc
+++ b/__pycache__/hnsw_retrieval.cpython-38.pyc
--- a/__pycache__/info_retrieval.cpython-38.pyc
+++ b/__pycache__/info_retrieval.cpython-38.pyc
--- a/__pycache__/information_retrieval.cpython-38.pyc
+++ b/__pycache__/information_retrieval.cpython-38.pyc
--- a/__pycache__/log_config.cpython-38.pyc
+++ b/__pycache__/log_config.cpython-38.pyc
--- a/__pycache__/restart_server.cpython-38.pyc
+++ b/__pycache__/restart_server.cpython-38.pyc
--- a/__pycache__/restart_server_app.cpython-38.pyc
+++ b/__pycache__/restart_server_app.cpython-38.pyc
--- a/__pycache__/retrieve_algorithm.cpython-38.pyc
+++ b/__pycache__/retrieve_algorithm.cpython-38.pyc
--- a/__pycache__/word_segment.cpython-38.pyc
+++ b/__pycache__/word_segment.cpython-38.pyc
--- a/formula_process.py
+++ b/formula_process.py
@@ -136,13 +136,13 @@ def formula_init():
 
				         json.dump(formula_id_list, f, ensure_ascii=False)
			
 
				 
			
 
				 if __name__ == "__main__":
			
 
				-    text = '已知c水=4.2×103J/(kg·℃),求-10'
			
 
				-    text = "水的密度：ρ＝1.0×10kg/m3是"
			
 
				-    text = "'金宜高速'是经过河池市政府所在地的第一条高速公路,王华一家从金城江到刘三姐故里宜州自驾旅行,单程共消耗汽油5*kg.小车总重为1.5*10^4*N,静止在水平地面上时轮子与地面接触的总面积为0.15*m^2,(汽油的热值q=4.6*10^7J/kg).求:(1)小车静止时对水平地面的压强;(2)5*kg汽油完全燃烧放出的热量,"
			
 
				-    text = "p蜡=0.9*10^3Kq/m^3"
			
 
				-    text = "在一个案件中,公安人员在海滩案发现场发现了罪犯留下的清晰的双脚站立脚印,立即用蜡浇灌了一只鞋模.测量鞋模的平均厚度为3*cm,质量675*g,又经测试达到脚印同样深度的压强为1.5*10^4*Pa,请你帮助公安人员计算出罪犯的体重为多少?(ρ蜡=0.9*10^3*kg/m^3)"
			
 
				-    for i in range(1):
			
 
				-        print(formula_recognize(text))
			
 
				+    # text = '已知c水=4.2×103J/(kg·℃),求-10'
			
 
				+    # text = "水的密度：ρ＝1.0×10kg/m3是"
			
 
				+    # text = "'金宜高速'是经过河池市政府所在地的第一条高速公路,王华一家从金城江到刘三姐故里宜州自驾旅行,单程共消耗汽油5*kg.小车总重为1.5*10^4*N,静止在水平地面上时轮子与地面接触的总面积为0.15*m^2,(汽油的热值q=4.6*10^7J/kg).求:(1)小车静止时对水平地面的压强;(2)5*kg汽油完全燃烧放出的热量,"
			
 
				+    # text = "p蜡=0.9*10^3Kq/m^3"
			
 
				+    # text = "在一个案件中,公安人员在海滩案发现场发现了罪犯留下的清晰的双脚站立脚印,立即用蜡浇灌了一只鞋模.测量鞋模的平均厚度为3*cm,质量675*g,又经测试达到脚印同样深度的压强为1.5*10^4*Pa,请你帮助公安人员计算出罪犯的体重为多少?(ρ蜡=0.9*10^3*kg/m^3)"
			
 
				+    # for i in range(1):
			
 
				+    #     print(formula_recognize(text))
			
 
				     
			
 
				-    # # mongodb公式处理初始化
			
 
				-    # formula_init()
			
 
				+    # mongodb公式处理初始化
			
 
				+    formula_init()
			
--- a/hnsw_model_train.py
+++ b/hnsw_model_train.py
@@ -1,5 +1,6 @@
 
				 import os
			
 
				 import time
			
 
				+import numpy as np
			
 
				 import pickle
			
 
				 import hnswlib
			
 
				 
			
@@ -44,19 +45,7 @@ class HNSW_Model_Train():
 
				     # 训练HNSW模型
			
 
				     def hnsw_train(self, origin_dataset, hnsw_path):
			
 
				         start0 = time.time()
			
 
				-        idx_list = []
			
 
				-        vec_list = []
			
 
				 
			
 
				-        for data in origin_dataset:
			
 
				-            if "sentence_vec" not in data:
			
 
				-                continue
			
 
				-            sentence_vec = pickle.loads(data["sentence_vec"])
			
 
				-            if sentence_vec.size != config.vector_dim:
			
 
				-                continue
			
 
				-            # sentence_vec = (sentence_vec + self.bias).dot(self.kernel).reshape(-1)
			
 
				-            idx_list.append(data["id"])
			
 
				-            vec_list.append(sentence_vec)
			
 
				-        
			
 
				         # 初始化HNSW搜索图
			
 
				         # possible options are l2, cosine or ip
			
 
				         hnsw_p = hnswlib.Index(space = config.hnsw_metric, dim = config.vector_dim)
			
@@ -66,12 +55,30 @@ class HNSW_Model_Train():
 
				         hnsw_p.set_ef(config.hnsw_set_ef)
			
 
				         # 设置线程数量-during batch search/construction
			
 
				         hnsw_p.set_num_threads(4)
			
 
				+
			
 
				+        idx_list, vec_list = [], []
			
 
				+        for data_idx,data in enumerate(origin_dataset):
			
 
				+            if "sentence_vec" not in data:
			
 
				+                continue
			
 
				+            sentence_vec = pickle.loads(data["sentence_vec"]).astype(np.float32)
			
 
				+            if sentence_vec.size != config.vector_dim:
			
 
				+                continue
			
 
				+            idx_list.append(data["id"])
			
 
				+            vec_list.append(sentence_vec)
			
 
				+            # 设置批量处理长度,若满足条件则进行批量处理
			
 
				+            if (data_idx+1) % 300000 == 0:
			
 
				+                # 按数据对应顺序随机打乱数据
			
 
				+                idx_list, vec_list = shuffle_data_pair(idx_list, vec_list)
			
 
				+                # 将数据进行HNSW构图
			
 
				+                hnsw_p.add_items(vec_list, idx_list)
			
 
				+                idx_list, vec_list = [], []
			
 
				         # 将句向量加入到HNSW
			
 
				         if len(idx_list) > 0:
			
 
				             # 按数据对应顺序随机打乱数据
			
 
				             idx_list, vec_list = shuffle_data_pair(idx_list, vec_list)
			
 
				             # 将数据进行HNSW构图
			
 
				             hnsw_p.add_items(vec_list, idx_list)
			
 
				+
			
 
				         # 保存HNSW图模型
			
 
				         # 注意:HNSW需要进行路径管理-------------------------------------tjt
			
 
				         os.chdir(config.data_root_path)
			
--- a/hnsw_retrieval.py
+++ b/hnsw_retrieval.py
@@ -29,13 +29,28 @@ class HNSW():
 
				             self.formula_id_list = json.load(f)
			
 
				 
			
 
				     # 图片搜索查重功能
			
 
				-    def img_retrieve(self, retrieve_text, post_url, similar):
			
 
				+    def img_retrieve(self, retrieve_text, post_url, similar, topic_num):
			
 
				         try:
			
 
				             if post_url is not None:
			
 
				+                # 日志采集
			
 
				+                if self.logger is not None:
			
 
				+                    self.logger.info(self.log_msg.format(id="图片搜索查重",
			
 
				+                                                          type="{}图片搜索查重post".format(topic_num),
			
 
				+                                                          message=retrieve_text))
			
 
				                 img_dict = dict(img_url=retrieve_text, img_threshold=similar, img_max_num=40)
			
 
				-                img_res = requests.post(post_url, json=img_dict, timeout=20).json()
			
 
				+                img_res = requests.post(post_url, json=img_dict, timeout=30).json()
			
 
				+                # 日志采集
			
 
				+                if self.logger is not None:
			
 
				+                    self.logger.info(self.log_msg.format(id="图片搜索查重",
			
 
				+                                                          type="{}图片搜索查重success".format(topic_num),
			
 
				+                                                          message=img_res))
			
 
				                 return img_res
			
 
				         except Exception as e:
			
 
				+            # 日志采集
			
 
				+            if self.logger is not None:
			
 
				+                self.logger.error(self.log_msg.format(id="图片搜索查重",
			
 
				+                                                        type="{}图片搜索查重error".format(topic_num),
			
 
				+                                                        message=retrieve_text))
			
 
				             return []
			
 
				     
			
 
				     # 公式搜索查重功能
			
@@ -74,11 +89,12 @@ class HNSW():
 
				                 # 对余弦相似度进行折算
			
 
				                 cosine_score = formula_cos[0][idx]
			
 
				                 if 0.95 <= cosine_score < 0.98:
			
 
				+                    cosine_score = cosine_score * 0.98
			
 
				+                elif cosine_score < 0.95:
			
 
				                     cosine_score = cosine_score * 0.95
			
 
				-                elif 0.9 <= cosine_score < 0.95:
			
 
				-                    cosine_score = cosine_score * 0.93
			
 
				-                elif cosine_score < 0.9:
			
 
				-                    cosine_score = cosine_score * 0.91
			
 
				+                # 余弦相似度折算后阈值判断
			
 
				+                if cosine_score < similar:
			
 
				+                    continue
			
 
				                 res_list.append([self.formula_id_list[idx][1], int(cosine_score * 100) / 100])
			
 
				         # 根据分数对题目id排序并返回前50个
			
 
				         res_sort_list = sorted(res_list, key=lambda x: x[1], reverse=True)[:80]
			
@@ -93,24 +109,120 @@ class HNSW():
 
				 
			
 
				         return formula_res_list[:50]
			
 
				 
			
 
				+    # # HNSW查(支持多学科混合查重)
			
 
				+    # def retrieve(self, retrieve_list, post_url, similar, doc_flag, min_threshold=0.56):
			
 
				+    #     # 计算retrieve_list的vec值
			
 
				+    #     # 调用清洗分词函数和句向量计算函数
			
 
				+    #     sent_vec_list, cont_clear_list = self.dpp(retrieve_list, is_retrieve=True)
			
 
				+
			
 
				+    #     # HNSW查重
			
 
				+    #     def dup_search(retrieve_data, sent_vec, cont_clear):
			
 
				+    #         # 初始化返回数据类型
			
 
				+    #         retrieve_value_dict = dict(synthese=[], semantics=[], text=[], image=[])
			
 
				+    #         # 获取题目序号
			
 
				+    #         topic_num = retrieve_data["topic_num"] if "topic_num" in retrieve_data else 1
			
 
				+    #         # 图片搜索查重功能
			
 
				+    #         if doc_flag is True:
			
 
				+    #             retrieve_value_dict["image"] = self.img_retrieve(retrieve_data["stem"], post_url, similar, topic_num)
			
 
				+    #         else:
			
 
				+    #             retrieve_value_dict["image"] = []
			
 
				+    #         # 判断句向量维度
			
 
				+    #         if sent_vec.size != self.vector_dim:
			
 
				+    #             return retrieve_value_dict
			
 
				+    #         # 调用hnsw接口检索数据
			
 
				+    #         post_list = sent_vec.tolist()
			
 
				+    #         try:
			
 
				+    #             query_labels = requests.post(self.hnsw_retrieve_url, json=post_list, timeout=10).json()
			
 
				+    #         except Exception as e:
			
 
				+    #             query_labels = []
			
 
				+    #             # 日志采集
			
 
				+    #             if self.logger is not None:
			
 
				+    #                 self.logger.error(self.log_msg.format(id="HNSW检索error",
			
 
				+    #                                                       type="当前题目HNSW检索error",
			
 
				+    #                                                       message=cont_clear))
			
 
				+    #         if len(query_labels) == 0:
			
 
				+    #             return retrieve_value_dict
			
 
				+
			
 
				+    #         # 批量读取数据库
			
 
				+    #         mongo_find_dict = {"id": {"$in": query_labels}}
			
 
				+    #         query_dataset = self.mongo_coll.find(mongo_find_dict)
			
 
				+
			
 
				+    #         # 返回大于阈值的结果
			
 
				+    #         filter_threshold = similar
			
 
				+    #         for label_data in query_dataset:
			
 
				+    #             if "sentence_vec" not in label_data:
			
 
				+    #                 continue
			
 
				+    #             # 计算余弦相似度得分
			
 
				+    #             label_vec = pickle.loads(label_data["sentence_vec"])
			
 
				+    #             if label_vec.size != self.vector_dim:
			
 
				+    #                 continue
			
 
				+    #             cosine_score = util.cos_sim(sent_vec, label_vec)[0][0]
			
 
				+    #             # 阈值判断
			
 
				+    #             if cosine_score < filter_threshold:
			
 
				+    #                 continue
			
 
				+    #             # 计算编辑距离得分
			
 
				+    #             fuzz_score = fuzz.ratio(cont_clear, label_data["content_clear"]) / 100
			
 
				+    #             if fuzz_score < min_threshold:
			
 
				+    #                 continue
			
 
				+    #             # 对余弦相似度进行折算
			
 
				+    #             if cosine_score >= 0.91 and fuzz_score < min_threshold + 0.06:
			
 
				+    #                 cosine_score = cosine_score * 0.95
			
 
				+    #             elif cosine_score < 0.91 and fuzz_score < min_threshold + 0.06:
			
 
				+    #                 cosine_score = cosine_score * 0.94
			
 
				+    #             # 余弦相似度折算后阈值判断
			
 
				+    #             if cosine_score < filter_threshold:
			
 
				+    #                 continue
			
 
				+    #             retrieve_value = [label_data["id"], int(cosine_score * 100) / 100]
			
 
				+    #             retrieve_value_dict["semantics"].append(retrieve_value)
			
 
				+    #             # 进行编辑距离得分验证,若小于设定分则过滤
			
 
				+    #             if fuzz_score >= filter_threshold:
			
 
				+    #                 retrieve_value = [label_data["id"], fuzz_score]
			
 
				+    #                 retrieve_value_dict["text"].append(retrieve_value)
			
 
				+            
			
 
				+    #         # 将组合结果按照score降序排序并取得分前十个结果
			
 
				+    #         retrieve_sort_dict = {k: sorted(value, key=lambda x: x[1], reverse=True)
			
 
				+    #                               for k,value in retrieve_value_dict.items()}
			
 
				+
			
 
				+    #         # 综合排序
			
 
				+    #         synthese_list = sorted(sum(retrieve_sort_dict.values(), []), key=lambda x: x[1], reverse=True)
			
 
				+    #         synthese_set = set()
			
 
				+    #         for ele in synthese_list:
			
 
				+    #             if ele[0] not in synthese_set and len(retrieve_sort_dict["synthese"]) < 50:
			
 
				+    #                 synthese_set.add(ele[0])
			
 
				+    #                 retrieve_sort_dict["synthese"].append(ele)
			
 
				+    #         # 加入题目序号
			
 
				+    #         retrieve_sort_dict["topic_num"] = topic_num
			
 
				+            
			
 
				+    #         # 以字典形式返回最终查重结果
			
 
				+    #         return retrieve_sort_dict
			
 
				+
			
 
				+    #     # 多线程HNSW查重
			
 
				+    #     with ThreadPoolExecutor(max_workers=5) as executor:
			
 
				+    #         retrieve_res_list = list(executor.map(dup_search, retrieve_list, sent_vec_list, cont_clear_list))
			
 
				+
			
 
				+    #     return retrieve_res_list
			
 
				+
			
 
				     # HNSW查(支持多学科混合查重)
			
 
				-    def retrieve(self, retrieve_list, post_url, similar, doc_flag, min_threshold=0.4):
			
 
				+    def retrieve(self, retrieve_list, post_url, similar, doc_flag, min_threshold=0.56):
			
 
				         # 计算retrieve_list的vec值
			
 
				         # 调用清洗分词函数和句向量计算函数
			
 
				         sent_vec_list, cont_clear_list = self.dpp(retrieve_list, is_retrieve=True)
			
 
				-
			
 
				         # HNSW查重
			
 
				-        def dup_search(retrieve_data, sent_vec, cont_clear):
			
 
				+        retrieve_res_list = []
			
 
				+        for i,sent_vec in enumerate(sent_vec_list):
			
 
				             # 初始化返回数据类型
			
 
				             retrieve_value_dict = dict(synthese=[], semantics=[], text=[], image=[])
			
 
				+            # 获取题目序号
			
 
				+            topic_num = retrieve_list[i]["topic_num"] if "topic_num" in retrieve_list[i] else 1
			
 
				             # 图片搜索查重功能
			
 
				             if doc_flag is True:
			
 
				-                retrieve_value_dict["image"] = self.img_retrieve(retrieve_data["stem"], post_url, similar)
			
 
				+                retrieve_value_dict["image"] = self.img_retrieve(retrieve_list[i]["stem"], post_url, similar, topic_num)
			
 
				             else:
			
 
				                 retrieve_value_dict["image"] = []
			
 
				             # 判断句向量维度
			
 
				             if sent_vec.size != self.vector_dim:
			
 
				-                return retrieve_value_dict
			
 
				+                retrieve_res_list.append(retrieve_value_dict)
			
 
				+                continue
			
 
				             # 调用hnsw接口检索数据
			
 
				             post_list = sent_vec.tolist()
			
 
				             try:
			
@@ -121,9 +233,10 @@ class HNSW():
 
				                 if self.logger is not None:
			
 
				                     self.logger.error(self.log_msg.format(id="HNSW检索error",
			
 
				                                                           type="当前题目HNSW检索error",
			
 
				-                                                          message=cont_clear))
			
 
				+                                                          message=cont_clear_list[i]))
			
 
				             if len(query_labels) == 0:
			
 
				-                return retrieve_value_dict
			
 
				+                retrieve_res_list.append(retrieve_value_dict)
			
 
				+                continue
			
 
				 
			
 
				             # 批量读取数据库
			
 
				             mongo_find_dict = {"id": {"$in": query_labels}}
			
@@ -143,16 +256,17 @@ class HNSW():
 
				                 if cosine_score < filter_threshold:
			
 
				                     continue
			
 
				                 # 计算编辑距离得分
			
 
				-                fuzz_score = fuzz.ratio(cont_clear, label_data["content_clear"]) / 100
			
 
				+                fuzz_score = fuzz.ratio(cont_clear_list[i], label_data["content_clear"]) / 100
			
 
				                 if fuzz_score < min_threshold:
			
 
				                     continue
			
 
				                 # 对余弦相似度进行折算
			
 
				-                if 0.95 <= cosine_score < 0.98:
			
 
				-                    cosine_score = cosine_score * 0.93
			
 
				-                elif 0.9 <= cosine_score < 0.95:
			
 
				-                    cosine_score = cosine_score * 0.87
			
 
				-                elif cosine_score < 0.9:
			
 
				-                    cosine_score = cosine_score * 0.81
			
 
				+                if cosine_score >= 0.91 and fuzz_score < min_threshold + 0.06:
			
 
				+                    cosine_score = cosine_score * 0.95
			
 
				+                elif cosine_score < 0.91 and fuzz_score < min_threshold + 0.06:
			
 
				+                    cosine_score = cosine_score * 0.94
			
 
				+                # 余弦相似度折算后阈值判断
			
 
				+                if cosine_score < filter_threshold:
			
 
				+                    continue
			
 
				                 retrieve_value = [label_data["id"], int(cosine_score * 100) / 100]
			
 
				                 retrieve_value_dict["semantics"].append(retrieve_value)
			
 
				                 # 进行编辑距离得分验证,若小于设定分则过滤
			
@@ -171,31 +285,29 @@ class HNSW():
 
				                 if ele[0] not in synthese_set and len(retrieve_sort_dict["synthese"]) < 50:
			
 
				                     synthese_set.add(ele[0])
			
 
				                     retrieve_sort_dict["synthese"].append(ele)
			
 
				-
			
 
				+            # 加入题目序号
			
 
				+            retrieve_sort_dict["topic_num"] = topic_num
			
 
				+            
			
 
				             # 以字典形式返回最终查重结果
			
 
				-            retrieve_sort_dict["topic_num"] = retrieve_data["topic_num"]
			
 
				-            return retrieve_sort_dict
			
 
				+            retrieve_res_list.append(retrieve_sort_dict)
			
 
				 
			
 
				-        # 多线程HNSW查重
			
 
				-        with ThreadPoolExecutor(max_workers=5) as executor:
			
 
				-            retrieve_res_list = list(executor.map(dup_search, retrieve_list, sent_vec_list, cont_clear_list))
			
 
				-            
			
 
				         return retrieve_res_list
			
 
				 
			
 
				 
			
 
				 if __name__ == "__main__":
			
 
				     # 获取mongodb数据
			
 
				     mongo_coll = config.mongo_coll
			
 
				-    hnsw = HNSW()
			
 
				+    from data_preprocessing import DataPreProcessing
			
 
				+    hnsw = HNSW(DataPreProcessing())
			
 
				 
			
 
				-    # test_data = []
			
 
				-    # for idx in [15176736]:
			
 
				-    #     test_data.append(mongo_coll.find_one({"id": idx}))
			
 
				+    test_data = []
			
 
				+    for idx in [201511100736265]:
			
 
				+        test_data.append(mongo_coll.find_one({"id": idx}))
			
 
				 
			
 
				-    # res = hnsw.retrieve(test_data)
			
 
				-    # pprint(res)
			
 
				+    res = hnsw.retrieve(test_data, '', 0.8, False)
			
 
				+    pprint(res[0]["semantics"])
			
 
				 
			
 
				-    # 公式搜索查重功能
			
 
				-    formula_string = "ρ蜡=0.9*10^3Kg/m^3"
			
 
				-    formula_string = "p蜡=0.9*10^3Kq/m^3"
			
 
				-    print(hnsw.formula_retrieve(formula_string, 0.8))
			
 
				+    # # 公式搜索查重功能
			
 
				+    # formula_string = "ρ蜡=0.9*10^3Kg/m^3"
			
 
				+    # formula_string = "p蜡=0.9*10^3Kq/m^3"
			
 
				+    # print(hnsw.formula_retrieve(formula_string, 0.8))
			
--- a/info_retrieval.py
+++ b/info_retrieval.py
@@ -28,7 +28,7 @@ class Info_Retrieval():
 
				         # 将搜索语句进行标准化清洗
			
 
				         sentence = self.dpp.content_clear_func(sentence)
			
 
				         # 将搜索语句分词
			
 
				-        seg_list = self.word_seg(sentence)
			
 
				+        seg_list, seg_init_list = self.word_seg(sentence)
			
 
				         # 日志采集
			
 
				         self.logger.info(log_msg.format(id="文本查重",
			
 
				                                         type="info_retrieve分词",
			
@@ -64,7 +64,7 @@ class Info_Retrieval():
 
				         # 对检索结果进行判断并取出排序后的id
			
 
				         scores_sort_list = [ele[0] for ele in scores_list]
			
 
				         
			
 
				-        return scores_sort_list, seg_list
			
 
				+        return scores_sort_list, seg_init_list
			
 
				     
			
 
				     # bm25算法计算倒排索引关键词对应文档得分
			
 
				     def BM25(self, scores_dict, term_data, doc_param, recall_doc_set):
			
--- a/main_clear/__pycache__/sci_clear.cpython-38.pyc
+++ b/main_clear/__pycache__/sci_clear.cpython-38.pyc
--- a/main_clear/sci_clear.py
+++ b/main_clear/sci_clear.py
@@ -163,6 +163,12 @@ def get_maplef_items(html):
 
				     s = s.replace(r'②', '(2)、')
			
 
				     s = s.replace(r'③', '(3)、')
			
 
				     s = s.replace(r'④', '(4)、')
			
 
				+    s = s.replace(r'⑤', '(5)、')
			
 
				+    s = s.replace(r'⑥', '(6)、')
			
 
				+    s = s.replace(r'⑦', '(7)、')
			
 
				+    s = s.replace(r'⑧', '(8)、')
			
 
				+    s = s.replace(r'⑨', '(9)、')
			
 
				+    s = s.replace(r'⑩', '(10)、')
			
 
				 
			
 
				     s = re.sub(r'/images/1-50/[1-9].gif', '(   )', s)
			
 
				     s = re.sub(r'/([0-9a-z/*.]*?)(png|jpg|gif)', 'img', s)
			
@@ -196,6 +202,7 @@ def get_maplef_items(html):
 
				 
			
 
				     # tjt修改
			
 
				     s = s.replace("×", "*")
			
 
				+    s = s.replace("*", '')
			
 
				     s = s.replace("%%", '')
			
 
				     s = s.replace('\\n', '')
			
 
				     s = s.replace('\\r', '')
			
--- a/retrieval_app.py
+++ b/retrieval_app.py
@@ -40,7 +40,7 @@ def hnsw_retrieve():
 
				                                                     type="hnsw_retrieve接收",
			
 
				                                                     message=retrieve_dict))
			
 
				         # hnsw模型查重
			
 
				-        post_url = r"http://localhost:8068/topic_retrieval_http"
			
 
				+        post_url = r"http://192.168.1.209:8068/topic_retrieval_http"
			
 
				         res_list = hnsw_model.retrieve(retrieve_list, post_url, similar, doc_flag)
			
 
				         # 返回日志采集
			
 
				         retrieval_logger.info(config.log_msg.format(id=id_name,
			
@@ -59,10 +59,10 @@ def image_retrieve():
 
				         retrieve_img = retrieve_dict["content"]
			
 
				         similar = retrieve_dict["similar"] / 100
			
 
				         # 图片查重链接
			
 
				-        post_url = r"http://localhost:8068/img_retrieval_http"
			
 
				+        post_url = r"http://192.168.1.209:8068/img_retrieval_http"
			
 
				         img_dict = dict(img_url=retrieve_img, img_threshold=similar, img_max_num=30)
			
 
				         try:
			
 
				-            res_list = requests.post(post_url, json=img_dict, timeout=20).json()
			
 
				+            res_list = requests.post(post_url, json=img_dict, timeout=30).json()
			
 
				         except Exception as e:
			
 
				             res_list = []
			
 
				         # 返回日志采集
			
@@ -111,9 +111,8 @@ def info_retrieve():
 
				         id_list, seg_list = ir_model(sentence)
			
 
				         id_list = [int(idx) for idx in id_list]
			
 
				         # 语义相似度查重
			
 
				-        retrieve_list = [dict(stem=sentence, topic_num=1)]
			
 
				+        retrieve_list = [dict(stem=sentence)]
			
 
				         if len(sentence) > 30:
			
 
				-            retrieve_list = [dict(stem=sentence, topic_num=1)]
			
 
				             doc_list = hnsw_model.retrieve(retrieve_list, '', similar, False)[0]["semantics"]
			
 
				         else:
			
 
				             doc_list = hnsw_model.retrieve(retrieve_list, '', similar, False, 0.6)[0]["semantics"]
			
--- a/setup.py
+++ b/setup.py
@@ -68,14 +68,15 @@ if __name__ == "__main__":
 
				     exclude_list=["setup.py", "__init__.py", "config.py", "guc_conf.py", 
			
 
				                   "retrieval_monitor.py", "restart_server.py"]
			
 
				     # 获取当前目录下的所有"app.py"文件加入exclude_list列表
			
 
				-    file_list = os.listdir(root_path)
			
 
				+    os.chdir(copy_path)
			
 
				+    file_list = os.listdir(copy_path)
			
 
				     exclude_list.extend([f for f in file_list if f.endswith("app.py")])
			
 
				     
			
 
				     # 需要setup加密的".py"文件列表
			
 
				     pylist = []
			
 
				     # 遍历搜索目录下所有py文件并返回列表
			
 
				-    pylist = path_search(root_path, pylist, exclude_list)
			
 
				+    pylist = path_search(copy_path, pylist, exclude_list)
			
 
				     # 生成cython加密文件
			
 
				     setup_func(pylist, exclude_list)
			
 
				     # 清理setup生成的临时文件
			
 
				-    clean_setup_file(root_path, pylist)
			
 
				+    clean_setup_file(copy_path, pylist)
			
--- a/word_segment.py
+++ b/word_segment.py
@@ -15,31 +15,31 @@ class Word_Segment():
 
				     def __call__(self, sentence):
			
 
				         sentence = re.sub( r"[\-_—]", ' ', sentence)
			
 
				         # 统一将大写转化为小写
			
 
				-        seg_list = jieba.lcut(sentence.lower())
			
 
				+        seg_init_list = jieba.lcut(sentence.lower())
			
 
				         # 将分词列表中的数字变为空字符串
			
 
				-        seg_list = [re.sub(r'\d+\.?\d*%?', ' ', w) for w in seg_list]
			
 
				+        seg_list = [re.sub(r'\d+\.?\d*%?', ' ', w) for w in seg_init_list]
			
 
				         # 若词为停用词,则变为' '(方便后续进行n-grams组合)
			
 
				         seg_list = [w if w not in self.stop_words else ' ' for w in seg_list]
			
 
				         if self.n_grams_flag is True:
			
 
				             seg_list = self.n_grams(seg_list)
			
 
				-        
			
 
				-        return [w.strip() for w in seg_list if w.strip() != '']
			
 
				+        seg_list = [w.strip() for w in seg_list if w.strip() != '']
			
 
				+
			
 
				+        return seg_list, seg_init_list
			
 
				 
			
 
				     # 计算分词后的词语n-grams组合
			
 
				     def n_grams(self, seg_list):
			
 
				         length = len(seg_list)
			
 
				         for i in range(length):
			
 
				             if i+1 < length and self.is_Chinese(seg_list[i]) and self.is_Chinese(seg_list[i+1]):
			
 
				-                seg_list.append(seg_list[i]+seg_list[i+1])
			
 
				+                seg_list.append(seg_list[i] + seg_list[i+1])
			
 
				         return seg_list
			
 
				 
			
 
				     # 判断字符串是否为全为中文字符
			
 
				     def is_Chinese(self, string):
			
 
				         for char in string:
			
 
				-            if '\u4e00' <= char <= '\u9fff': 
			
 
				-                continue
			
 
				-            else: 
			
 
				-                return False
			
 
				+            if '\u4e00' <= char <= '\u9fff': continue
			
 
				+            else: return False
			
 
				+        
			
 
				         return True if string != '' else False
			
 
				 
			
 
				 if __name__ == "__main__":