tjt · 1 year ago · parent · commit 6d60a081d1

BIN
__pycache__/config.cpython-38.pyc


BIN
__pycache__/data_preprocessing.cpython-38.pyc


BIN
__pycache__/formula_process.cpython-38.pyc


BIN
__pycache__/heap_sort.cpython-38.pyc


BIN
__pycache__/hnsw_model_train.cpython-38.pyc


BIN
__pycache__/hnsw_retrieval.cpython-38.pyc


BIN
__pycache__/info_retrieval.cpython-38.pyc


BIN
__pycache__/information_retrieval.cpython-38.pyc


BIN
__pycache__/log_config.cpython-38.pyc


BIN
__pycache__/restart_server.cpython-38.pyc


BIN
__pycache__/restart_server_app.cpython-38.pyc


BIN
__pycache__/retrieve_algorithm.cpython-38.pyc


BIN
__pycache__/word_segment.cpython-38.pyc


+ 9 - 9
formula_process.py

@@ -136,13 +136,13 @@ def formula_init():
         json.dump(formula_id_list, f, ensure_ascii=False)
 
 if __name__ == "__main__":
-    text = '已知c水=4.2×103J/(kg·℃),求-10'
-    text = "水的密度:ρ=1.0×10kg/m3是"
-    text = "'金宜高速'是经过河池市政府所在地的第一条高速公路,王华一家从金城江到刘三姐故里宜州自驾旅行,单程共消耗汽油5*kg.小车总重为1.5*10^4*N,静止在水平地面上时轮子与地面接触的总面积为0.15*m^2,(汽油的热值q=4.6*10^7J/kg).求:(1)小车静止时对水平地面的压强;(2)5*kg汽油完全燃烧放出的热量,"
-    text = "p蜡=0.9*10^3Kq/m^3"
-    text = "在一个案件中,公安人员在海滩案发现场发现了罪犯留下的清晰的双脚站立脚印,立即用蜡浇灌了一只鞋模.测量鞋模的平均厚度为3*cm,质量675*g,又经测试达到脚印同样深度的压强为1.5*10^4*Pa,请你帮助公安人员计算出罪犯的体重为多少?(ρ蜡=0.9*10^3*kg/m^3)"
-    for i in range(1):
-        print(formula_recognize(text))
+    # text = '已知c水=4.2×103J/(kg·℃),求-10'
+    # text = "水的密度:ρ=1.0×10kg/m3是"
+    # text = "'金宜高速'是经过河池市政府所在地的第一条高速公路,王华一家从金城江到刘三姐故里宜州自驾旅行,单程共消耗汽油5*kg.小车总重为1.5*10^4*N,静止在水平地面上时轮子与地面接触的总面积为0.15*m^2,(汽油的热值q=4.6*10^7J/kg).求:(1)小车静止时对水平地面的压强;(2)5*kg汽油完全燃烧放出的热量,"
+    # text = "p蜡=0.9*10^3Kq/m^3"
+    # text = "在一个案件中,公安人员在海滩案发现场发现了罪犯留下的清晰的双脚站立脚印,立即用蜡浇灌了一只鞋模.测量鞋模的平均厚度为3*cm,质量675*g,又经测试达到脚印同样深度的压强为1.5*10^4*Pa,请你帮助公安人员计算出罪犯的体重为多少?(ρ蜡=0.9*10^3*kg/m^3)"
+    # for i in range(1):
+    #     print(formula_recognize(text))
     
-    # # initialize MongoDB formula processing
-    # formula_init()
+    # initialize MongoDB formula processing
+    formula_init()

+ 19 - 12
hnsw_model_train.py

@@ -1,5 +1,6 @@
 import os
 import time
+import numpy as np
 import pickle
 import hnswlib
 
@@ -44,19 +45,7 @@ class HNSW_Model_Train():
     # train the HNSW model
     def hnsw_train(self, origin_dataset, hnsw_path):
         start0 = time.time()
-        idx_list = []
-        vec_list = []
 
-        for data in origin_dataset:
-            if "sentence_vec" not in data:
-                continue
-            sentence_vec = pickle.loads(data["sentence_vec"])
-            if sentence_vec.size != config.vector_dim:
-                continue
-            # sentence_vec = (sentence_vec + self.bias).dot(self.kernel).reshape(-1)
-            idx_list.append(data["id"])
-            vec_list.append(sentence_vec)
-        
         # initialize the HNSW search graph
         # possible options are l2, cosine or ip
         hnsw_p = hnswlib.Index(space = config.hnsw_metric, dim = config.vector_dim)
@@ -66,12 +55,30 @@ class HNSW_Model_Train():
         hnsw_p.set_ef(config.hnsw_set_ef)
         # set the number of threads - during batch search/construction
         hnsw_p.set_num_threads(4)
+
+        idx_list, vec_list = [], []
+        for data_idx,data in enumerate(origin_dataset):
+            if "sentence_vec" not in data:
+                continue
+            sentence_vec = pickle.loads(data["sentence_vec"]).astype(np.float32)
+            if sentence_vec.size != config.vector_dim:
+                continue
+            idx_list.append(data["id"])
+            vec_list.append(sentence_vec)
+            # batch size check: flush the accumulated batch once it is full
+            if (data_idx+1) % 300000 == 0:
+                # shuffle ids and vectors in the same paired order
+                idx_list, vec_list = shuffle_data_pair(idx_list, vec_list)
+                # add the batch to the HNSW graph
+                hnsw_p.add_items(vec_list, idx_list)
+                idx_list, vec_list = [], []
         # add the remaining sentence vectors to HNSW
         if len(idx_list) > 0:
             # shuffle ids and vectors in the same paired order
             idx_list, vec_list = shuffle_data_pair(idx_list, vec_list)
             # add the batch to the HNSW graph
             hnsw_p.add_items(vec_list, idx_list)
+
         # save the HNSW graph model
         # note: HNSW requires path management -------------------------------------tjt
         os.chdir(config.data_root_path)
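
Note: the new batched build keeps memory bounded by flushing every 300,000 vectors instead of accumulating the whole corpus first. A minimal standalone sketch of the same pattern (dim=768, ef_construction=200, and M=16 are placeholder values here; the real ones come from config):

import numpy as np
import hnswlib

def build_hnsw_in_batches(vectors, ids, dim=768, batch_size=300_000):
    # batched insertion avoids holding every vector in Python lists at once
    index = hnswlib.Index(space="cosine", dim=dim)  # l2, cosine or ip
    index.init_index(max_elements=len(ids), ef_construction=200, M=16)
    index.set_ef(50)
    index.set_num_threads(4)
    for start in range(0, len(ids), batch_size):
        batch = np.asarray(vectors[start:start + batch_size], dtype=np.float32)
        index.add_items(batch, ids[start:start + batch_size])
    return index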

+ 149 - 37
hnsw_retrieval.py

@@ -29,13 +29,28 @@ class HNSW():
             self.formula_id_list = json.load(f)
 
     # image-search duplicate check
-    def img_retrieve(self, retrieve_text, post_url, similar):
+    def img_retrieve(self, retrieve_text, post_url, similar, topic_num):
         try:
             if post_url is not None:
+                # log collection
+                if self.logger is not None:
+                    self.logger.info(self.log_msg.format(id="图片搜索查重",
+                                                          type="{}图片搜索查重post".format(topic_num),
+                                                          message=retrieve_text))
                 img_dict = dict(img_url=retrieve_text, img_threshold=similar, img_max_num=40)
-                img_res = requests.post(post_url, json=img_dict, timeout=20).json()
+                img_res = requests.post(post_url, json=img_dict, timeout=30).json()
+                # log collection
+                if self.logger is not None:
+                    self.logger.info(self.log_msg.format(id="图片搜索查重",
+                                                          type="{}图片搜索查重success".format(topic_num),
+                                                          message=img_res))
                 return img_res
         except Exception as e:
+            # log collection
+            if self.logger is not None:
+                self.logger.error(self.log_msg.format(id="图片搜索查重",
+                                                        type="{}图片搜索查重error".format(topic_num),
+                                                        message=retrieve_text))
             return []
     
     # formula-search duplicate check
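
Note: img_retrieve now takes the question number for logging, logs the request and response, and raises the POST timeout from 20 s to 30 s. A condensed sketch of that request/logging pattern (payload keys come from the hunk above; the function name and logger wiring are illustrative):

import requests

def post_img_dedup(post_url, img_url, threshold, max_num=40, logger=None):
    # POST the image-dedup request; on any failure, log it and return []
    payload = dict(img_url=img_url, img_threshold=threshold, img_max_num=max_num)
    try:
        return requests.post(post_url, json=payload, timeout=30).json()
    except Exception:
        if logger is not None:
            logger.error("image retrieval failed for %s", img_url)
        return []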
@@ -74,11 +89,12 @@ class HNSW():
                 # discount the cosine similarity score
                 cosine_score = formula_cos[0][idx]
                 if 0.95 <= cosine_score < 0.98:
+                    cosine_score = cosine_score * 0.98
+                elif cosine_score < 0.95:
                     cosine_score = cosine_score * 0.95
-                elif 0.9 <= cosine_score < 0.95:
-                    cosine_score = cosine_score * 0.93
-                elif cosine_score < 0.9:
-                    cosine_score = cosine_score * 0.91
+                # threshold check after the cosine discount
+                if cosine_score < similar:
+                    continue
                 res_list.append([self.formula_id_list[idx][1], int(cosine_score * 100) / 100])
         # sort question ids by score in descending order (top 80 kept here, trimmed to 50 on return)
         res_sort_list = sorted(res_list, key=lambda x: x[1], reverse=True)[:80]
@@ -93,24 +109,120 @@ class HNSW():
 
         return formula_res_list[:50]
 
+    # # HNSW duplicate check (supports mixed multi-subject retrieval)
+    # def retrieve(self, retrieve_list, post_url, similar, doc_flag, min_threshold=0.56):
+    #     # compute the vectors for retrieve_list
+    #     # call the text-cleaning/segmentation and sentence-vector functions
+    #     sent_vec_list, cont_clear_list = self.dpp(retrieve_list, is_retrieve=True)
+
+    #     # HNSW duplicate check
+    #     def dup_search(retrieve_data, sent_vec, cont_clear):
+    #         # initialize the return structure
+    #         retrieve_value_dict = dict(synthese=[], semantics=[], text=[], image=[])
+    #         # get the question number
+    #         topic_num = retrieve_data["topic_num"] if "topic_num" in retrieve_data else 1
+    #         # image-search duplicate check
+    #         if doc_flag is True:
+    #             retrieve_value_dict["image"] = self.img_retrieve(retrieve_data["stem"], post_url, similar, topic_num)
+    #         else:
+    #             retrieve_value_dict["image"] = []
+    #         # check the sentence-vector dimension
+    #         if sent_vec.size != self.vector_dim:
+    #             return retrieve_value_dict
+    #         # call the hnsw service to retrieve candidates
+    #         post_list = sent_vec.tolist()
+    #         try:
+    #             query_labels = requests.post(self.hnsw_retrieve_url, json=post_list, timeout=10).json()
+    #         except Exception as e:
+    #             query_labels = []
+    #             # log collection
+    #             if self.logger is not None:
+    #                 self.logger.error(self.log_msg.format(id="HNSW检索error",
+    #                                                       type="当前题目HNSW检索error",
+    #                                                       message=cont_clear))
+    #         if len(query_labels) == 0:
+    #             return retrieve_value_dict
+
+    #         # batch-read from the database
+    #         mongo_find_dict = {"id": {"$in": query_labels}}
+    #         query_dataset = self.mongo_coll.find(mongo_find_dict)
+
+    #         # return the results above the threshold
+    #         filter_threshold = similar
+    #         for label_data in query_dataset:
+    #             if "sentence_vec" not in label_data:
+    #                 continue
+    #             # compute the cosine-similarity score
+    #             label_vec = pickle.loads(label_data["sentence_vec"])
+    #             if label_vec.size != self.vector_dim:
+    #                 continue
+    #             cosine_score = util.cos_sim(sent_vec, label_vec)[0][0]
+    #             # threshold check
+    #             if cosine_score < filter_threshold:
+    #                 continue
+    #             # compute the edit-distance score
+    #             fuzz_score = fuzz.ratio(cont_clear, label_data["content_clear"]) / 100
+    #             if fuzz_score < min_threshold:
+    #                 continue
+    #             # discount the cosine similarity score
+    #             if cosine_score >= 0.91 and fuzz_score < min_threshold + 0.06:
+    #                 cosine_score = cosine_score * 0.95
+    #             elif cosine_score < 0.91 and fuzz_score < min_threshold + 0.06:
+    #                 cosine_score = cosine_score * 0.94
+    #             # threshold check after the cosine discount
+    #             if cosine_score < filter_threshold:
+    #                 continue
+    #             retrieve_value = [label_data["id"], int(cosine_score * 100) / 100]
+    #             retrieve_value_dict["semantics"].append(retrieve_value)
+    #             # verify the edit-distance score; filter out entries below the set score
+    #             if fuzz_score >= filter_threshold:
+    #                 retrieve_value = [label_data["id"], fuzz_score]
+    #                 retrieve_value_dict["text"].append(retrieve_value)
+
+    #         # sort each group's results by score in descending order
+    #         retrieve_sort_dict = {k: sorted(value, key=lambda x: x[1], reverse=True)
+    #                               for k,value in retrieve_value_dict.items()}
+
+    #         # combined ranking
+    #         synthese_list = sorted(sum(retrieve_sort_dict.values(), []), key=lambda x: x[1], reverse=True)
+    #         synthese_set = set()
+    #         for ele in synthese_list:
+    #             if ele[0] not in synthese_set and len(retrieve_sort_dict["synthese"]) < 50:
+    #                 synthese_set.add(ele[0])
+    #                 retrieve_sort_dict["synthese"].append(ele)
+    #         # attach the question number
+    #         retrieve_sort_dict["topic_num"] = topic_num
+
+    #         # return the final duplicate-check result as a dict
+    #         return retrieve_sort_dict
+
+    #     # multi-threaded HNSW duplicate check
+    #     with ThreadPoolExecutor(max_workers=5) as executor:
+    #         retrieve_res_list = list(executor.map(dup_search, retrieve_list, sent_vec_list, cont_clear_list))
+
+    #     return retrieve_res_list
+
     # HNSW duplicate check (supports mixed multi-subject retrieval)
-    def retrieve(self, retrieve_list, post_url, similar, doc_flag, min_threshold=0.4):
+    def retrieve(self, retrieve_list, post_url, similar, doc_flag, min_threshold=0.56):
         # compute the vectors for retrieve_list
         # call the text-cleaning/segmentation and sentence-vector functions
         sent_vec_list, cont_clear_list = self.dpp(retrieve_list, is_retrieve=True)
-
         # HNSW duplicate check
-        def dup_search(retrieve_data, sent_vec, cont_clear):
+        retrieve_res_list = []
+        for i,sent_vec in enumerate(sent_vec_list):
             # initialize the return structure
             retrieve_value_dict = dict(synthese=[], semantics=[], text=[], image=[])
+            # get the question number
+            topic_num = retrieve_list[i]["topic_num"] if "topic_num" in retrieve_list[i] else 1
             # image-search duplicate check
             if doc_flag is True:
-                retrieve_value_dict["image"] = self.img_retrieve(retrieve_data["stem"], post_url, similar)
+                retrieve_value_dict["image"] = self.img_retrieve(retrieve_list[i]["stem"], post_url, similar, topic_num)
             else:
                 retrieve_value_dict["image"] = []
             # check the sentence-vector dimension
             if sent_vec.size != self.vector_dim:
-                return retrieve_value_dict
+                retrieve_res_list.append(retrieve_value_dict)
+                continue
             # call the hnsw service to retrieve candidates
             post_list = sent_vec.tolist()
             try:
@@ -121,9 +233,10 @@ class HNSW():
                 if self.logger is not None:
                     self.logger.error(self.log_msg.format(id="HNSW检索error",
                                                           type="当前题目HNSW检索error",
-                                                          message=cont_clear))
+                                                          message=cont_clear_list[i]))
             if len(query_labels) == 0:
-                return retrieve_value_dict
+                retrieve_res_list.append(retrieve_value_dict)
+                continue
 
             # batch-read from the database
             mongo_find_dict = {"id": {"$in": query_labels}}
@@ -143,16 +256,17 @@ class HNSW():
                 if cosine_score < filter_threshold:
                     continue
                 # compute the edit-distance score
-                fuzz_score = fuzz.ratio(cont_clear, label_data["content_clear"]) / 100
+                fuzz_score = fuzz.ratio(cont_clear_list[i], label_data["content_clear"]) / 100
                 if fuzz_score < min_threshold:
                     continue
                 # discount the cosine similarity score
-                if 0.95 <= cosine_score < 0.98:
-                    cosine_score = cosine_score * 0.93
-                elif 0.9 <= cosine_score < 0.95:
-                    cosine_score = cosine_score * 0.87
-                elif cosine_score < 0.9:
-                    cosine_score = cosine_score * 0.81
+                if cosine_score >= 0.91 and fuzz_score < min_threshold + 0.06:
+                    cosine_score = cosine_score * 0.95
+                elif cosine_score < 0.91 and fuzz_score < min_threshold + 0.06:
+                    cosine_score = cosine_score * 0.94
+                # threshold check after the cosine discount
+                if cosine_score < filter_threshold:
+                    continue
                 retrieve_value = [label_data["id"], int(cosine_score * 100) / 100]
                 retrieve_value_dict["semantics"].append(retrieve_value)
                 # verify the edit-distance score; filter out entries below the set score
@@ -171,31 +285,29 @@ class HNSW():
                 if ele[0] not in synthese_set and len(retrieve_sort_dict["synthese"]) < 50:
                     synthese_set.add(ele[0])
                     retrieve_sort_dict["synthese"].append(ele)
-
+            # attach the question number
+            retrieve_sort_dict["topic_num"] = topic_num
+            
             # collect the final duplicate-check result as a dict
-            retrieve_sort_dict["topic_num"] = retrieve_data["topic_num"]
-            return retrieve_sort_dict
+            retrieve_res_list.append(retrieve_sort_dict)
 
-        # multi-threaded HNSW duplicate check
-        with ThreadPoolExecutor(max_workers=5) as executor:
-            retrieve_res_list = list(executor.map(dup_search, retrieve_list, sent_vec_list, cont_clear_list))
-            
         return retrieve_res_list
 
 
 if __name__ == "__main__":
     # get the MongoDB collection
     mongo_coll = config.mongo_coll
-    hnsw = HNSW()
+    from data_preprocessing import DataPreProcessing
+    hnsw = HNSW(DataPreProcessing())
 
-    # test_data = []
-    # for idx in [15176736]:
-    #     test_data.append(mongo_coll.find_one({"id": idx}))
+    test_data = []
+    for idx in [201511100736265]:
+        test_data.append(mongo_coll.find_one({"id": idx}))
 
-    # res = hnsw.retrieve(test_data)
-    # pprint(res)
+    res = hnsw.retrieve(test_data, '', 0.8, False)
+    pprint(res[0]["semantics"])
 
-    # formula-search duplicate check
-    formula_string = "ρ蜡=0.9*10^3Kg/m^3"
-    formula_string = "p蜡=0.9*10^3Kq/m^3"
-    print(hnsw.formula_retrieve(formula_string, 0.8))
+    # # formula-search duplicate check
+    # formula_string = "ρ蜡=0.9*10^3Kg/m^3"
+    # formula_string = "p蜡=0.9*10^3Kq/m^3"
+    # print(hnsw.formula_retrieve(formula_string, 0.8))
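
Note: the reworked retrieve drops the ThreadPoolExecutor in favor of a plain loop, raises min_threshold from 0.4 to 0.56, and discounts cosine scores for pairs whose edit-distance score is low. A condensed sketch of that filter (util.cos_sim and fuzz.ratio as used in this file; the rapidfuzz and sentence-transformers imports are assumptions about the repo's dependencies):

from rapidfuzz import fuzz              # assumed; fuzzywuzzy exposes the same ratio API
from sentence_transformers import util  # assumed source of util.cos_sim

def score_candidate(sent_vec, cont_clear, label_vec, label_text, similar, min_threshold=0.56):
    # return the discounted cosine score, or None if the pair is filtered out
    cosine_score = float(util.cos_sim(sent_vec, label_vec)[0][0])
    if cosine_score < similar:
        return None
    fuzz_score = fuzz.ratio(cont_clear, label_text) / 100
    if fuzz_score < min_threshold:
        return None
    # semantically close but lexically distant pairs get discounted
    if fuzz_score < min_threshold + 0.06:
        cosine_score *= 0.95 if cosine_score >= 0.91 else 0.94
    return None if cosine_score < similar else int(cosine_score * 100) / 100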

+ 2 - 2
info_retrieval.py

@@ -28,7 +28,7 @@ class Info_Retrieval():
         # normalize and clean the query sentence
         sentence = self.dpp.content_clear_func(sentence)
         # segment the query sentence into words
-        seg_list = self.word_seg(sentence)
+        seg_list, seg_init_list = self.word_seg(sentence)
         # log collection
         self.logger.info(log_msg.format(id="文本查重",
                                         type="info_retrieve分词",
@@ -64,7 +64,7 @@ class Info_Retrieval():
         # validate the retrieval results and take the sorted ids
         scores_sort_list = [ele[0] for ele in scores_list]
         
-        return scores_sort_list, seg_list
+        return scores_sort_list, seg_init_list
     
     # BM25: score documents against inverted-index keywords
     def BM25(self, scores_dict, term_data, doc_param, recall_doc_set):
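
Note: word_seg now returns both the filtered tokens and the raw jieba segmentation, and the raw list is what gets returned to the caller. For reference, the scoring the BM25 method above implements has the standard form; a generic per-term sketch (k1 and b are the usual free parameters, not values read from this repo):

import math

def bm25_term_score(tf, df, doc_len, avg_doc_len, n_docs, k1=1.5, b=0.75):
    # idf with the usual +0.5 smoothing
    idf = math.log((n_docs - df + 0.5) / (df + 0.5) + 1)
    # term-frequency saturation, normalized by document length
    return idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_len / avg_doc_len))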

BIN
main_clear/__pycache__/sci_clear.cpython-38.pyc


+ 7 - 0
main_clear/sci_clear.py

@@ -163,6 +163,12 @@ def get_maplef_items(html):
     s = s.replace(r'②', '(2)、')
     s = s.replace(r'③', '(3)、')
     s = s.replace(r'④', '(4)、')
+    s = s.replace(r'⑤', '(5)、')
+    s = s.replace(r'⑥', '(6)、')
+    s = s.replace(r'⑦', '(7)、')
+    s = s.replace(r'⑧', '(8)、')
+    s = s.replace(r'⑨', '(9)、')
+    s = s.replace(r'⑩', '(10)、')
 
     s = re.sub(r'/images/1-50/[1-9].gif', '(   )', s)
     s = re.sub(r'/([0-9a-z/*.]*?)(png|jpg|gif)', 'img', s)
@@ -196,6 +202,7 @@ def get_maplef_items(html):
 
     # tjt edit
     s = s.replace("×", "*")
+    s = s.replace("*", '')
     s = s.replace("%%", '')
     s = s.replace('\\n', '')
     s = s.replace('\\r', '')
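
Note: the new replacements extend circled-digit handling from ①–④ to ①–⑩. The chain of replace calls could equally be table-driven; an equivalent single-pass sketch (not how the repo does it, just the same mapping):

import re

# ① is U+2460, so ①..⑩ map to (1)、 .. (10)、
CIRCLED = {chr(0x2460 + i): "({})、".format(i + 1) for i in range(10)}

def replace_circled_digits(s: str) -> str:
    # one regex pass over the circled-digit range
    return re.sub(r"[\u2460-\u2469]", lambda m: CIRCLED[m.group(0)], s)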

+ 4 - 5
retrieval_app.py

@@ -40,7 +40,7 @@ def hnsw_retrieve():
                                                     type="hnsw_retrieve接收",
                                                     message=retrieve_dict))
         # HNSW model duplicate check
-        post_url = r"http://localhost:8068/topic_retrieval_http"
+        post_url = r"http://192.168.1.209:8068/topic_retrieval_http"
         res_list = hnsw_model.retrieve(retrieve_list, post_url, similar, doc_flag)
         # log the response
         retrieval_logger.info(config.log_msg.format(id=id_name,
@@ -59,10 +59,10 @@ def image_retrieve():
         retrieve_img = retrieve_dict["content"]
         similar = retrieve_dict["similar"] / 100
         # image duplicate-check endpoint
-        post_url = r"http://localhost:8068/img_retrieval_http"
+        post_url = r"http://192.168.1.209:8068/img_retrieval_http"
         img_dict = dict(img_url=retrieve_img, img_threshold=similar, img_max_num=30)
         try:
-            res_list = requests.post(post_url, json=img_dict, timeout=20).json()
+            res_list = requests.post(post_url, json=img_dict, timeout=30).json()
         except Exception as e:
             res_list = []
         # log the response
@@ -111,9 +111,8 @@ def info_retrieve():
         id_list, seg_list = ir_model(sentence)
         id_list = [int(idx) for idx in id_list]
         # semantic-similarity duplicate check
-        retrieve_list = [dict(stem=sentence, topic_num=1)]
+        retrieve_list = [dict(stem=sentence)]
         if len(sentence) > 30:
-            retrieve_list = [dict(stem=sentence, topic_num=1)]
             doc_list = hnsw_model.retrieve(retrieve_list, '', similar, False)[0]["semantics"]
         else:
             doc_list = hnsw_model.retrieve(retrieve_list, '', similar, False, 0.6)[0]["semantics"]
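
Note: these hunks swap localhost for the hardcoded host 192.168.1.209 in two handlers and raise the timeout to 30 s. A config-driven alternative would avoid repeating the host (a sketch; RETRIEVAL_HOST is an invented name, not something this repo defines):

import os
import requests

# hypothetical: read the retrieval host from the environment once
RETRIEVAL_HOST = os.environ.get("RETRIEVAL_HOST", "192.168.1.209:8068")

def post_json(path, payload, timeout=30):
    # shared helper so host and timeout stay consistent across handlers
    url = "http://{}/{}".format(RETRIEVAL_HOST, path)
    try:
        return requests.post(url, json=payload, timeout=timeout).json()
    except requests.RequestException:
        return []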

+ 4 - 3
setup.py

@@ -68,14 +68,15 @@ if __name__ == "__main__":
     exclude_list=["setup.py", "__init__.py", "config.py", "guc_conf.py", 
                   "retrieval_monitor.py", "restart_server.py"]
     # collect every "app.py" file in the directory and add it to exclude_list
-    file_list = os.listdir(root_path)
+    os.chdir(copy_path)
+    file_list = os.listdir(copy_path)
     exclude_list.extend([f for f in file_list if f.endswith("app.py")])
     
     # list of ".py" files for setup to encrypt
     pylist = []
     # walk the search directory for all py files and return them as a list
-    pylist = path_search(root_path, pylist, exclude_list)
+    pylist = path_search(copy_path, pylist, exclude_list)
     # generate the cython-compiled (encrypted) files
     setup_func(pylist, exclude_list)
     # clean up the temporary files generated by setup
-    clean_setup_file(root_path, pylist)
+    clean_setup_file(copy_path, pylist)
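
Note: the hunk repoints the encryption pass from root_path to copy_path. setup_func itself is not shown in this diff; presumably it wraps a standard cythonize call, along these lines (a sketch under that assumption, with illustrative names):

from setuptools import setup
from Cython.Build import cythonize

def build_compiled_modules(py_files):
    # compiling .py sources to C extensions is the "encryption" the
    # comments refer to: the shipped extension modules hide the source text
    setup(ext_modules=cythonize(py_files, language_level="3"),
          script_args=["build_ext", "--inplace"])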

+ 9 - 9
word_segment.py

@@ -15,31 +15,31 @@ class Word_Segment():
     def __call__(self, sentence):
         sentence = re.sub( r"[\-_—]", ' ', sentence)
         # normalize everything to lowercase
-        seg_list = jieba.lcut(sentence.lower())
+        seg_init_list = jieba.lcut(sentence.lower())
         # replace numbers in the token list with spaces
-        seg_list = [re.sub(r'\d+\.?\d*%?', ' ', w) for w in seg_list]
+        seg_list = [re.sub(r'\d+\.?\d*%?', ' ', w) for w in seg_init_list]
         # replace stop words with ' ' (keeps the later n-grams combination simple)
         seg_list = [w if w not in self.stop_words else ' ' for w in seg_list]
         if self.n_grams_flag is True:
             seg_list = self.n_grams(seg_list)
-        
-        return [w.strip() for w in seg_list if w.strip() != '']
+        seg_list = [w.strip() for w in seg_list if w.strip() != '']
+
+        return seg_list, seg_init_list
 
     # build n-gram combinations from the segmented words
     def n_grams(self, seg_list):
         length = len(seg_list)
         for i in range(length):
             if i+1 < length and self.is_Chinese(seg_list[i]) and self.is_Chinese(seg_list[i+1]):
-                seg_list.append(seg_list[i]+seg_list[i+1])
+                seg_list.append(seg_list[i] + seg_list[i+1])
         return seg_list
 
     # check whether a string consists entirely of Chinese characters
     def is_Chinese(self, string):
         for char in string:
-            if '\u4e00' <= char <= '\u9fff': 
-                continue
-            else: 
-                return False
+            if '\u4e00' <= char <= '\u9fff': continue
+            else: return False
+        
         return True if string != '' else False
 
 if __name__ == "__main__":
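
Note: after this change Word_Segment.__call__ returns a pair instead of a single list. A hedged usage sketch (the constructor arguments are assumptions; only n_grams_flag and stop_words appear in the hunks):

# seg_list: cleaned tokens (digits and stop words stripped, plus Chinese bigrams)
# seg_init_list: the raw jieba segmentation, kept for logging and recall
word_seg = Word_Segment()
seg_list, seg_init_list = word_seg("水的密度是1.0×10^3kg/m^3")
print(seg_list, seg_init_list)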