
Improve README

tujintao 10 months ago
parent
commit
f7370b266f
8 changed files with 211 additions and 150 deletions
  1. 68 0
      README.md
  2. BIN
      __pycache__/config.cpython-38.pyc
  3. 51 67
      comparison.py
  4. 2 17
      comprehensive_score.py
  5. 5 3
      config.py
  6. 32 4
      db_train_app.py
  7. 26 29
      dim_classify.py
  8. 27 30
      hnsw_retrieval.py

+ 68 - 0
README.md

@@ -0,0 +1,68 @@
+Documentation for the Examination Board duplicate-checking service:
+
+Initialization:
+
+Note: if keyword_mapping.json does not exist, first run `python comparison.py  # compute knowledge-point / physical-quantity mapping IDs`
+
+`python db_train_app.py  # MongoDB data cleaning and vectorization / compute physical quantities / map knowledge points to IDs / compute solving types`
+
+`python hm_ir_train_app.py  # initialize the HNSW model / keyword retrieval / formula duplicate-checking models`
+
+Startup:
+
+1. Restart all services:
+
+```
+conda activate dup_search
+python restart_server.py
+```
+
+2. Restart a single service:
+
+```
+conda activate dup_search
+python restart_server.py 0/1/2/3/4/5
+```
+
+where the argument selects which service to restart:
+
+* 0: the Examination Board question-bank duplicate-checking service
+* 1: the Examination Board question-bank HNSW model retrieval service
+* 2: the multi-dimension (solving type / difficulty) classification service
+* 3: the service monitor
+
+The duplicate-checking functionality has four main parts (formula duplicate checking, keyword retrieval, text duplicate checking, and semantic duplicate checking):
+
+0. Data initialization
+
+* config.py: base configuration, including database addresses, API endpoints, logging setup, and similarity thresholds (excerpt below)
+* main_clear/sci_clear.py: data-cleaning module
+* data_preprocessing.py: data preprocessing, including data cleaning, field concatenation, and data vectorization
+* hnsw_model_train.py: initial training of the HNSW model
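+
+For orientation, a few of the settings config.py defines (excerpted from the file as of this commit):
+
+```
+import os
+
+# HNSW model retrieval endpoint
+hnsw_retrieve_url = r"http://127.0.0.1:8836/retrieve"
+# multi-dimension (solving type / difficulty) classification endpoint
+dim_classify_url = r"http://127.0.0.1:8837/dim_classify"
+# knowledge-point tagging endpoint
+knowledge_tagging_url = r"http://127.0.0.1:8840/generate"
+
+# root paths
+root_path = os.getcwd()
+data_root_path = os.path.join(root_path, "model_data")
+# Sentence-BERT model directory
+sbert_path = os.path.join(data_root_path, "all-MiniLM-L6-v2")
+```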
+
+1. Formula duplicate checking
+
+* formula_process: rule-based formula extraction, bag-of-words model training, and vector computation for every formula in the question bank (sketch below)
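+
+A minimal sketch of the bag-of-words step, assuming formulas have already been extracted as token strings (CountVectorizer stands in here for illustration; it is not the actual formula_process model):
+
+```
+from sklearn.feature_extraction.text import CountVectorizer
+
+# formulas as whitespace-separated token strings, e.g. "U = I R"
+formula_corpus = ["U = I R", "P = U I", "rho = m / V"]
+vectorizer = CountVectorizer(token_pattern=r"\S+")
+formula_vectors = vectorizer.fit_transform(formula_corpus)  # one sparse count vector per formula
+```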
+
+2. Keyword retrieval
+
+* word_segment.py: document word segmentation
+* ir_db_establish.py: builds the inverted index and stores it in a sqlite database
+* info_retrieval.py: BM25 scoring and ranking for keyword retrieval (sketch below)
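+
+For reference, a simplified, self-contained illustration of the Okapi BM25 scoring that info_retrieval.py is based on (the project code itself reads its inverted index from sqlite):
+
+```
+import math
+
+def bm25_score(query_terms, doc_terms, corpus, k1=1.5, b=0.75):
+    """Score one tokenized document against a tokenized query (Okapi BM25)."""
+    N = len(corpus)
+    avgdl = sum(len(doc) for doc in corpus) / N          # average document length
+    score = 0.0
+    for term in query_terms:
+        df = sum(1 for doc in corpus if term in doc)     # document frequency
+        idf = math.log((N - df + 0.5) / (df + 0.5) + 1)  # smoothed inverse document frequency
+        tf = doc_terms.count(term)                       # term frequency in this document
+        score += idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * len(doc_terms) / avgdl))
+    return score
+```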
+
+3. Text duplicate checking
+
+* hnsw_retrieval.py: HNSW recall, text-similarity duplicate checking, and semantic-similarity duplicate checking
+* hnsw_model.py: HNSW model loading, saving, and querying (sketch below)
+* hnsw_app: duplicate-checking API service for the full multi-subject question bank
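+
+A minimal sketch of HNSW index usage with hnswlib (parameters are illustrative; 384 is the all-MiniLM-L6-v2 embedding dimension):
+
+```
+import hnswlib
+import numpy as np
+
+dim = 384
+index = hnswlib.Index(space="cosine", dim=dim)
+index.init_index(max_elements=100000, ef_construction=200, M=16)
+vectors = np.random.rand(1000, dim).astype(np.float32)  # stand-in for question embeddings
+index.add_items(vectors, np.arange(1000))
+labels, distances = index.knn_query(vectors[:1], k=10)  # recall the 10 nearest questions
+```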
+
+4. Semantic duplicate checking
+
+* dim_classify.py: solving-type / difficulty classification
+* dim_classify_app.py: solving-type / difficulty classification service
+* physical_quantity_extract: rule-based physical-quantity extraction
+* comprehensive_score.py: scoring rules for question type / solving type / difficulty / physical quantities / knowledge points (sketch below)
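+
+comprehensive_score.py combines the per-dimension scores into a single weighted sum; a sketch of that computation (the weights in `scale` are illustrative; the real values arrive with the request):
+
+```
+# per-dimension similarity scores in [0, 1]
+score = {"quesType": 1.0, "knowledge": 0.8, "solving_type": 1.0,
+         "difficulty": 0.9, "physical_quantity": 0.7}
+scale = {"quesType": 0.2, "knowledge": 0.4, "solving_type": 0.2,
+         "difficulty": 0.1, "physical_quantity": 0.1}  # illustrative weights
+sum_score = sum(score[k] * scale[k] for k in scale)
+```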

BIN
__pycache__/config.cpython-38.pyc


+ 51 - 67
comparison.py

@@ -1,5 +1,4 @@
 import json
-import numpy as np
 import pandas as pd
 
 keyword2id_dict = dict()
@@ -19,90 +18,75 @@ for i in range(len(df)):
         quantity2id[knowledge] = sign_index
 keyword2id_dict["quantity2id"] = quantity2id
 
-# # physical scenes
-# excel_path = r"data/物理情景.xlsx"
+# # Fengxiangbiao knowledge points
+# excel_path = r"data/物理知识点.xlsx"
 # df = pd.read_excel(excel_path)
-# scene2id = dict()
+# knowledge2id = dict()
+# init_id2max_id = dict()
 # count_index = 0
 # for i in range(len(df)):
-#     if not pd.isna(df['知识点'][i]):
+#     if not pd.isna(df['2级知识点'][i]):
 #         count_index += 1
-#         sign_index = 10000 + count_index * 10
-#     knowledge = df['情景'][i]
+#     if not pd.isna(df['3级知识点'][i]):
+#         sign = df['3级知识点'][i].split(' ')[0].split('.')
+#         # sign_index = 10000 + int(sign[0]) * 100 + int(sign[1]) * 10
+#         sign_index = 10000 + count_index * 100 + int(sign[1]) * 10
+#         init_id = sign_index
+#         if init_id not in init_id2max_id:
+#             init_id2max_id[init_id] = []
+#         else:
+#             init_id2max_id[init_id].append(sign_index)
+#     knowledge = df['4级知识点'][i]
 #     if not pd.isna(knowledge):
 #         sign_index += 1
-#         scene2id[knowledge] = sign_index
-# keyword2id_dict["scene2id"] = scene2id
+#         knowledge2id[knowledge] = sign_index
+#         if init_id not in init_id2max_id:
+#             init_id2max_id[init_id] = []
+#         else:
+#             init_id2max_id[init_id].append(sign_index)
+# keyword2id_dict["knowledge2id"] = knowledge2id
+# keyword2id_dict["init_id2max_id"] = init_id2max_id
 
-# Fengxiangbiao knowledge points
-excel_path = r"data/物理知识点.xlsx"
+# Examination Board knowledge points
+excel_path = r"data/初中物理知识对应关系.xlsx"
 df = pd.read_excel(excel_path)
 knowledge2id = dict()
 init_id2max_id = dict()
 count_index = 0
 for i in range(len(df)):
-    if not pd.isna(df['2级知识点'][i]):
+    if not pd.isna(df.iloc[i][2]):
         count_index += 1
-    if not pd.isna(df['3级知识点'][i]):
-        sign = df['3级知识点'][i].split(' ')[0].split('.')
-        # sign_index = 10000 + int(sign[0]) * 100 + int(sign[1]) * 10
-        sign_index = 10000 + count_index * 100 + int(sign[1]) * 10
-        init_id = sign_index
-        if init_id not in init_id2max_id:
-            init_id2max_id[init_id] = []
-        else:
-            init_id2max_id[init_id].append(sign_index)
-    knowledge = df['4级知识点'][i]
-    if not pd.isna(knowledge):
+        sign_index = 100000000 + count_index * 1000000
+        if pd.isna(df.iloc[i+1][3]):
+            knowledge = df.iloc[i][2].split(' ')[1]
+            knowledge2id[knowledge] = sign_index
+            continue
+    if not pd.isna(df.iloc[i][3]):
+        sign_index = int(str(sign_index)[:-4]) * 10000
+        sign_index += 10000
+        relate_index = sign_index
+        init_id2max_id[relate_index] = []
+        if pd.isna(df.iloc[i+1][4]):
+            knowledge = df.iloc[i][3].split(' ')[1]
+            knowledge2id[knowledge] = sign_index
+            continue
+    if not pd.isna(df.iloc[i][4]):
+        sign_index = int(str(sign_index)[:-2]) * 100
+        sign_index += 100
+        if pd.isna(df.iloc[i+1][5]):
+            knowledge = df.iloc[i][4].split(' ')[1]
+            knowledge2id[knowledge] = sign_index
+            init_id2max_id[relate_index].append(sign_index)
+            continue
+    if not pd.isna(df.iloc[i][5]):
         sign_index += 1
+        knowledge = df.iloc[i][5].split(' ')[1]
         knowledge2id[knowledge] = sign_index
-        if init_id not in init_id2max_id:
-            init_id2max_id[init_id] = []
-        else:
-            init_id2max_id[init_id].append(sign_index)
+        init_id2max_id[relate_index].append(sign_index)
+
 keyword2id_dict["knowledge2id"] = knowledge2id
 keyword2id_dict["init_id2max_id"] = init_id2max_id
 
-# # Examination Board knowledge points
-# excel_path = r"data/初中物理知识对应关系.xlsx"
-# df = pd.read_excel(excel_path)
-# knowledge2id = dict()
-# init_id2max_id = dict()
-# count_index = 0
-# for i in range(len(df)):
-#     if not pd.isna(df.iloc[i][2]):
-#         count_index += 1
-#         sign_index = 100000000 + count_index * 1000000
-#         if  pd.isna(df.iloc[i+1][3]):
-#             knowledge = df.iloc[i][2].split(' ')[1]
-#             knowledge2id[knowledge] = sign_index
-#             continue
-#     if not pd.isna(df.iloc[i][3]):
-#         sign_index = int(str(sign_index)[:-4]) * 10000
-#         sign_index += 10000
-#         relate_index = sign_index
-#         init_id2max_id[relate_index] = []
-#         if pd.isna(df.iloc[i+1][4]):
-#             knowledge = df.iloc[i][3].split(' ')[1]
-#             knowledge2id[knowledge] = sign_index
-#             continue
-#     if not pd.isna(df.iloc[i][4]):
-#         sign_index = int(str(sign_index)[:-2]) * 100
-#         sign_index += 100
-#         if pd.isna(df.iloc[i+1][5]):
-#             knowledge = df.iloc[i][4].split(' ')[1]
-#             knowledge2id[knowledge] = sign_index
-#             init_id2max_id[relate_index].append(sign_index)
-#             continue
-#     if not pd.isna(df.iloc[i][5]):
-#         sign_index += 1
-#         knowledge = df.iloc[i][5].split(' ')[1]
-#         knowledge2id[knowledge] = sign_index
-#         init_id2max_id[relate_index].append(sign_index)
-
-# keyword2id_dict["knowledge2id"] = knowledge2id
-# keyword2id_dict["init_id2max_id"] = init_id2max_id
-
 # save the keyword mappings
 with open("model_data/keyword_mapping.json", 'w', encoding="utf8") as f:
     json.dump(keyword2id_dict, f, ensure_ascii=False, indent=2)
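
The Examination Board knowledge IDs built above encode the four-level hierarchy positionally: level 1 starts at 100000000 in steps of 1000000, level 2 adds steps of 10000, level 3 steps of 100, and level 4 increments of 1; init_id2max_id maps each level-2 ID to the level-3/4 IDs beneath it. A worked example of the arithmetic (counts are illustrative):

```
sign_index = 100000000 + 3 * 1000000                    # 3rd level-1 node    -> 103000000
sign_index = int(str(sign_index)[:-4]) * 10000 + 10000  # first level-2 child -> 103010000
sign_index = int(str(sign_index)[:-2]) * 100 + 100      # first level-3 child -> 103010100
sign_index += 1                                         # first level-4 leaf  -> 103010101
```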

+ 2 - 17
comprehensive_score.py

@@ -19,7 +19,6 @@ class Comprehensive_Score():
         solving_type = self.compute_solving_type(query["solving_type"], refer["solving_type"])
         difficulty = self.compute_difficulty(query["difficulty"], refer["difficulty"])
         physical_quantity = self.compute_physical_quantity(query["physical_quantity"], refer["physical_quantity"])
-        # image_semantics = self.compute_image_semantics(query["image_semantics"], refer["image_semantics"])
 
         sum_score = quesType * scale["quesType"] + knowledge * scale["knowledge"] + \
                     solving_type * scale["solving_type"] + difficulty * scale["difficulty"] + \
@@ -32,7 +31,6 @@ class Comprehensive_Score():
         score_dict["solving_type"] = solving_type
         score_dict["difficulty"] = difficulty
         score_dict["physical_quantity"] = physical_quantity
-        # score_dict["image_semantics"] = image_semantics
 
         return sum_score, score_dict
 
@@ -63,7 +61,7 @@ class Comprehensive_Score():
                     if abs(query_id - refer_id) < 100: query_score += 0.2
                     else: continue
                 fuzz_score = fuzz.ratio(query, refer)
-                if fuzz_score >= 0.4:
+                if mode == 1 and fuzz_score >= 0.4:
                     query_score += 0.1
             # special-case: refer list of length 1
             if max_length == 1:
@@ -106,17 +104,4 @@ class Comprehensive_Score():
     # physical-quantity similarity score
     def compute_physical_quantity(self, query_list, refer_list):
         score = self.compute_relate_score(query_list, refer_list, self.quantity2id, mode=2)
-        return int(score * 100) / 100
-
-    # # image-semantics similarity score
-    # def compute_image_semantics(self, query_list, refer_list):
-    #     query_set, refer_set = set(query_list), set(refer_list)
-    #     if len(query_set) == 0 and len(refer_set) == 0:
-    #         return 1
-    #     elif len(query_set) == 0 or len(refer_set) == 0:
-    #         return 0
-    #     elif len(query_set) > len(refer_set):
-    #         query_set, refer_set = refer_set, query_set
-    #     same_count = sum([1 for ele in query_set if ele in refer_set])
-    #     score = same_count / len(refer_set)
-    #     return int(score * 100) / 100
+        return int(score * 100) / 100

+ 5 - 3
config.py

@@ -37,16 +37,18 @@ illustration_url = dict(fxb="http://192.168.1.204:8068/topic_retrieval_http", ks
 image_url = dict(fxb="http://192.168.1.204:8068/img_retrieval_http", ksy="http://127.0.0.1:8068/img_retrieval_http")[dev_mode]
 # HNSW model retrieval endpoint
 hnsw_retrieve_url = r"http://127.0.0.1:8836/retrieve"
-# multi-dimension classification endpoint
+# multi-dimension (solving type / difficulty) classification endpoint
 dim_classify_url = r"http://127.0.0.1:8837/dim_classify"
+# knowledge-point tagging endpoint
+knowledge_tagging_url = r"http://127.0.0.1:8840/generate"
 
 # root paths
 root_path = os.getcwd()
 data_root_path = os.path.join(root_path, "model_data")
 # Sentence-BERT model path
 sbert_path = os.path.join(data_root_path, "all-MiniLM-L6-v2")
-# bert-whitening parameter path
-whitening_path = os.path.join(data_root_path, "whitening_param.pkl")
+# # bert-whitening parameter path
+# whitening_path = os.path.join(data_root_path, "whitening_param.pkl")
 # stop-words file path
 stop_words_path = os.path.join(data_root_path, "stop_words.txt")
 # sqlite database path

+ 32 - 4
db_train_app.py

@@ -1,8 +1,28 @@
 import sys
 import time
 import json
+
 import config
 from data_preprocessing import DataPreProcessing
+from dim_classify import Dimension_Classification
+from physical_quantity_extract import physical_quantity_extract
+
+"""
+Sample MongoDB document:
+{ 
+    "id" : 20231001, 
+    "quesType" : {
+        "quesType" : "单选题"
+    }, 
+    "quesBody" : "荔枝是一种岭南佳果,小明拿起一个荔枝,如题图所示,它的尺寸l大小约为(   )<br/><img src=\"Upload/QBM/20231001.png\" /><br/>\nA. 0.1cm B. 3cm C. 0.3m D. 1m", 
+    "quesParse" : "......", 
+    "quesAnswer" : "【答案】见解析", 
+    "difficulty" : "一般", 
+    "knowledge" : [
+        "长度的测量"
+    ]
+}
+"""
 
 # data cleaning and sentence-vector computation
 def clear_embedding_train(mongo_coll, mongo_find_dict, sup, sub):
@@ -12,19 +32,27 @@ def clear_embedding_train(mongo_coll, mongo_find_dict, sup, sub):
     dpp(origin_dataset[sup:sub])
     print("耗时:", time.time()-start)
 
-# map knowledge points to IDs for MongoDB retrieval
+# map knowledge points to IDs for MongoDB retrieval / compute physical quantities / compute solving types
 def convert_knowledge2id(mongo_coll, mongo_find_dict, sup, sub):
+    # load the knowledge-point-to-ID mapping
     with open("model_data/keyword_mapping.json", 'r', encoding="utf8") as f:
         knowledge2id = json.load(f)["knowledge2id"]
+    dim_classify = Dimension_Classification(dim_mode=0)
     origin_dataset = mongo_coll.find(mongo_find_dict, no_cursor_timeout=True, batch_size=5)
     start = time.time()
     for data in origin_dataset[sup:sub]:
-        print(data["knowledge"])
         condition = {"id": data["id"]}
-        # TODO: add a train_flag to avoid duplicate training after a crash
+        # compute physical quantities
+        physical_quantity_list = physical_quantity_extract(data["content_clear"])
+        # compute solving types
+        solution_list = dim_classify(data["content_clear"], data["quesType"])["solving_type"]
+        # map knowledge points to IDs
         knowledge_list = [knowledge2id[ele] for ele in data["knowledge"] if ele in knowledge2id]
-        update_elements = {"$set": {"knowledge_id": knowledge_list}}
+        update_elements = {"$set": {"physical_quantity": physical_quantity_list, 
+                                    "solving_type": solution_list,
+                                    "knowledge_id": knowledge_list}}
         mongo_coll.update_one(condition, update_elements)
+        print(physical_quantity_list, solution_list)
     print("耗时:", time.time()-start)
 
 if __name__ == "__main__":

+ 26 - 29
dim_classify.py

@@ -22,7 +22,7 @@ class Difficulty_Model(nn.Module):
         super(Difficulty_Model, self).__init__()
         self.bert_config = AutoConfig.from_pretrained(config.bert_path)
         self.bert = AutoModel.from_pretrained(config.bert_path)
-        self.fc = nn.Linear(in_features=self.bert_config.hidden_size, out_features=8)
+        self.fc = nn.Linear(in_features=self.bert_config.hidden_size, out_features=1)
 
     def forward(self, input_ids, attention_mask):
         x = self.bert(input_ids, attention_mask)[0][:, 0, :]
@@ -31,10 +31,14 @@ class Difficulty_Model(nn.Module):
         return x
 
 class Dimension_Classification():
-    def __init__(self, logger=None):
+    def __init__(self, dim_mode=2, logger=None):
+        self.dim_mode = dim_mode
         self.tokenizer = BertTokenizer.from_pretrained(config.bert_path)
-        self.solution_model = torch.load(config.solution_model_path)
-        self.difficulty_model = torch.load(config.difficulty_model_path)
+        self.solution_model, self.difficulty_model = None, None
+        if self.dim_mode in {0, 2}:
+            self.solution_model = torch.load(config.solution_model_path)
+        if self.dim_mode in {1, 2}:
+            self.difficulty_model = torch.load(config.difficulty_model_path)
         self.max_squence_length = 500
         self.solving_type_dict = {
             0: "实验操作", 
@@ -50,8 +54,11 @@ class Dimension_Classification():
         self.logger = logger
 
     def __call__(self, sentence, quesType):
-        solution_list = self.solution_classify(sentence, quesType)
-        difficulty_value = self.difficulty_classify(sentence)
+        solution_list, difficulty_value = [], 0.6
+        if self.dim_mode in {0, 2}:
+            solution_list = self.solution_classify(sentence, quesType)
+        if self.dim_mode in {1, 2}:
+            difficulty_value = self.difficulty_classify(sentence)
         res_dict = {
             "solving_type": solution_list, 
             "difficulty": difficulty_value, 
@@ -63,16 +70,17 @@ class Dimension_Classification():
         solution_tensor = self.model_calculate(self.solution_model, sentence)
         solution_tensor[solution_tensor >= 0.5] = 1
         solution_tensor[solution_tensor < 0.5] = 0
-        solution_list = [self.solving_type_dict[idx] for idx in solution_tensor[0].int().tolist() if idx == 1]
+        solution_list = solution_tensor[0].int().tolist()
+        solution_result = [self.solving_type_dict[i] for i,idx in enumerate(solution_list) if idx == 1]
         # question-type override
         if quesType == "计算题":
-            solution_list.append("计算分析")
+            solution_result.append("计算分析")
         elif quesType == "作图题":
-            solution_list.append("连线作图")
-        if len(solution_list) == 0:
-            solution_list.append("规律理解")
+            solution_result.append("连线作图")
+        if len(solution_result) == 0:
+            solution_result.append("规律理解")
             
-        return list(set(solution_list))
+        return list(set(solution_result))
 
     def difficulty_classify(self, sentence):
         difficulty_tensor = self.model_calculate(self.difficulty_model, sentence).item()
@@ -89,9 +97,9 @@ class Dimension_Classification():
     def model_calculate(self, model, sentence):
         model.eval()
         with torch.no_grad():
-            token_list = self.sentence_tokenize(sentence)
-            mask_list = self.attention_mask(token_list)
-            output_tensor = model(torch.tensor(token_list), attention_mask=torch.tensor(mask_list))
+            token_tensor = self.sentence_tokenize(sentence)
+            mask_tensor = torch.ones_like(token_tensor, dtype=torch.float)
+            output_tensor = model(token_tensor, attention_mask=mask_tensor)
             output_tensor = torch.sigmoid(output_tensor)
 
         return output_tensor
@@ -100,23 +108,12 @@ class Dimension_Classification():
         # truncate directly
         # encoding adds [CLS] -> 101 at the start and [SEP] -> 102 at the end; unknown words become [UNK] -> 100
         token_list = self.tokenizer.encode(sentence[:self.max_squence_length])
-        # pad to full length (the PAD index is 0)
-        if len(token_list) < self.max_squence_length + 2:
-            token_list.extend([0] * (self.max_squence_length + 2 - len(token_list)))
         
-        return [token_list]
-
-    def attention_mask(self, tokens_list):
-        # within a text, the mask is 0 for PAD tokens and 1 otherwise
-        mask_list = []
-        for tokens in tokens_list:
-            mask = [float(token > 0) for token in tokens]
-            mask_list.append(mask)
+        return torch.tensor([token_list])
 
-        return mask_list
 
 if __name__ == "__main__":
-    dc = Dimension_Classification()
-    sentence = "荆门市是国家循环经济试点市,目前正在沙洋建设全国最大的秸秆气化发电厂.电厂建成后每年可消化秸秆13万吨,发电9*10^7*kW*h.同时电厂所产生的灰渣将生成肥料返还农民,焦油用于精细化工,实现“农业--工业--农业”循环.(1)若秸秆电厂正常工作时,每小时可发电2.5*10^5*kW*h,按每户居民每天使用5只20*W的节能灯、1个800*W的电饭锅、1台100*W的电视机计算,该发电厂同时可供多少户居民正常用电?(2)与同等规模的火电厂相比,该电厂每年可减少6.4万吨二氧化碳的排放量,若火电厂煤燃烧的热利用率为20%,秸秆电厂每年可节约多少吨标准煤?(标准煤的热值按3.6*10^7J/k*g计算)"
+    dc = Dimension_Classification(dim_mode=0)
+    sentence = "请在图乙中的虚线框内画出与图甲中实物图对应的电路图。"
     res = dc(sentence, "")
     print(res)

+ 27 - 30
hnsw_retrieval.py

@@ -18,6 +18,7 @@ class HNSW():
         self.vector_dim = config.vector_dim
         self.hnsw_retrieve_url = config.hnsw_retrieve_url
         self.dim_classify_url = config.dim_classify_url
+        self.knowledge_tagging_url = config.knowledge_tagging_url
         # logging
         self.logger = logger
         self.log_msg = config.log_msg
@@ -115,6 +116,19 @@ class HNSW():
 
         return formula_res_list[:50]
 
+    def api_post(self, post_url, post_data, log_info):
+        try:
+            post_result = requests.post(post_url, json=post_data, timeout=10).json()
+        except Exception as e:
+            post_result = []
+            # log the failure
+            if self.logger is not None:
+                self.logger.error(self.log_msg.format(id="{}error".format(log_info[0]),
+                                                        type="当前题目{}error".format(log_info[0]),
+                                                        message=log_info[1]))
+        
+        return post_result
+
     # HNSW query (supports mixed multi-subject duplicate checking)
     def retrieve(self, retrieve_list, post_url, similar, scale, doc_flag):
         """
@@ -154,16 +168,7 @@ class HNSW():
                 retrieve_res_list.append(retrieve_value_dict)
                 continue
             # call the HNSW retrieval API
-            try:
-                hnsw_post_list = sent_vec.tolist()
-                query_labels = requests.post(self.hnsw_retrieve_url, json=hnsw_post_list, timeout=10).json()
-            except Exception as e:
-                query_labels = []
-                # log the failure
-                if self.logger is not None:
-                    self.logger.error(self.log_msg.format(id="HNSW检索error",
-                                                          type="当前题目HNSW检索error",
-                                                          message=cont_clear_list[i]))
+            query_labels = self.api_post(self.hnsw_retrieve_url, sent_vec.tolist(), ["HNSW检索", cont_clear_list[i]])
             if len(query_labels) == 0:
                 retrieve_res_list.append(retrieve_value_dict)
                 continue
@@ -202,25 +207,22 @@ class HNSW():
             special handling for semantic similarity
             """
             # initialize the label dict
-            label_dict = dict()
+            label_dict = dict()  # TODO: for very high-similarity hits, reuse stored labels instead of re-predicting
             # LLM knowledge-point tagging
-            # label_dict["knowledge"] = query_data["knowledge"] if query_data else []
             label_dict["knowledge"] = query_data["knowledge"] if query_data else []
+            ################################ knowledge-point tagging API call ################################
+            # knowledge_post_data = {"sentence": cont_clear_list[i]}
+            # label_dict["knowledge"] = self.api_post(self.knowledge_tagging_url, knowledge_post_data, ["知识点标注", cont_clear_list[i]])
+            ################################ knowledge-point tagging API call ################################
             tagging_id_list = [self.cph_score.knowledge2id[ele] for ele in label_dict["knowledge"] \
                                if ele in self.cph_score.knowledge2id]
             # fetch the question type
             label_dict["quesType"] = retrieve_list[i].get("quesType", "选择题")
             # multi-dimension classification API call
-            try:
-                dim_post_list = {"sentence": cont_clear_list[i], "quesType": label_dict["quesType"]}
-                dim_classify_dict = requests.post(self.dim_classify_url, json=dim_post_list, timeout=10).json()
-            except Exception as e:
+            dim_post_data = {"sentence": cont_clear_list[i], "quesType": label_dict["quesType"]}
+            dim_classify_dict = self.api_post(self.dim_classify_url, dim_post_data, ["多维分类", cont_clear_list[i]])
+            if len(dim_classify_dict) == 0:
                 dim_classify_dict = {"solving_type": ["规律理解"], "difficulty": 0.6}
-                # 日志采集
-                if self.logger is not None:
-                    self.logger.error(self.log_msg.format(id="多维分类error",
-                                                          type="当前题目多维分类error",
-                                                          message=cont_clear_list[i]))
             # solving-type model classification
             label_dict["solving_type"] = dim_classify_dict["solving_type"]
             # difficulty model classification
@@ -245,8 +247,11 @@ class HNSW():
                 knowledge_query_dataset = self.mongo_coll.find(mongo_find_dict)
             # return results above the threshold
             if knowledge_query_dataset:
+                # convert the query difficulty label to a numeric value
+                if label_dict["difficulty"] in self.difficulty_transfer:
+                    label_dict["difficulty"] = self.difficulty_transfer[label_dict["difficulty"]]
                 for refer_data in knowledge_query_dataset:
-                    # difficulty value conversion
+                    # convert the reference (question-bank) difficulty label to a numeric value
                     if refer_data["difficulty"] in self.difficulty_transfer:
                         refer_data["difficulty"] = self.difficulty_transfer[refer_data["difficulty"]]
                     sum_score, score_dict = self.cph_score(label_dict, refer_data, scale)
@@ -259,14 +264,6 @@ class HNSW():
             retrieve_sort_dict = {k: sorted(value, key=lambda x: x[1], reverse=True)
                                   for k,value in retrieve_value_dict.items()}
 
-            # # combined ranking
-            # synthese_list = sorted(sum(retrieve_sort_dict.values(), []), key=lambda x: x[1], reverse=True)
-            # synthese_set = set()
-            # for ele in synthese_list:
-            #     # keep the top 50 of the combined ranking
-            #     if ele[0] not in synthese_set and len(retrieve_sort_dict["synthese"]) < 50:
-            #         synthese_set.add(ele[0])
-            #         retrieve_sort_dict["synthese"].append(ele[:2])
             # attach the question number
             retrieve_sort_dict["label"] = label_dict
             retrieve_sort_dict["topic_num"] = topic_num