from fuzzywuzzy import fuzz

import config
from data_preprocessing import DataPreProcessing


# Knowledge-point based topic association
class Knowledge_Relate():
    """Find topics related to a query topic through shared knowledge points.

    Candidates are read from ``config.mongo_coll_list[0]``, filtered by the
    requested mode, scored against the query text with ``fuzz.ratio`` and
    returned ordered by how many knowledge points they share with the query.
    """

    # Candidates scoring above this similarity are skipped so that
    # (near-)duplicates of the query topic are not returned as "related".
    DUPLICATE_THRESHOLD = 0.99
    # Upper bound on the number of entries returned by ``relate``.
    MAX_RESULTS = 2000

    def __init__(self, logger=None):
        """Load the collections from ``config`` and build the pre-processor.

        Args:
            logger: Optional logger; stored and forwarded to
                ``DataPreProcessing``.
        """
        # Initial configuration data
        self.mongo_coll = config.mongo_coll_list
        # Log collection
        self.logger = logger
        self.log_msg = config.log_msg
        # Data pre-processing instance
        self.dpp = DataPreProcessing(self.mongo_coll, self.logger)

    def relate(self, key_data, filterType=1):
        """Return topics related to ``key_data``.

        Args:
            key_data: Mapping describing the query. Keys read here:
                ``keyword``, ``school_id``, ``topic_id``, ``subject_id`` and,
                optionally, ``special_id``, ``topic_type_id`` and
                ``examMethodIds``.
            filterType: ``1`` selects strict mode (a candidate's knowledge
                points must be a subset of the query's). ``2`` selects loose
                mode (the candidate must have at least one knowledge point
                outside the query's and, when exam methods are given, must
                stay inside the union of knowledge points and exam methods).
                Any other value applies no set filter.

        Returns:
            A list of ``[topic_id, fuzz_score, knowledge_point_count]``
            entries, at most ``MAX_RESULTS`` long. In strict mode, topics
            flagged ``old_exam == 1`` (gaokao questions) come first.
        """
        # Normalise once: the keyword is used both as a possible topic id and
        # as a substring filter, and may arrive as a non-string value.
        keyword = str(key_data["keyword"])

        # Keyword search: a purely numeric keyword is tried as a topic id.
        # ``isdecimal`` (not ``isdigit``) is used because ``isdigit`` accepts
        # characters such as superscript digits that ``int()`` rejects.
        if keyword.isdecimal():
            topic_id = int(keyword)
            find_data = self.mongo_coll[0].find_one({"topic_id": topic_id})
            if find_data is not None:
                return [[topic_id, 1, len(find_data["special_id"])]]

        # Only the second value returned by the pre-processor is kept; it is
        # used below as the query text for similarity scoring.
        _, key_clear = self.dpp([key_data], hnsw_index=0, is_retrieve=True)
        if key_data["school_id"] != 0:
            # Prefer the stored cleaned text from the second collection
            # when one exists for this topic.
            find_data = self.mongo_coll[1].find_one(
                {"topic_id": key_data["topic_id"]}
            )
            if find_data is not None:
                find_clear = find_data.get("content_clear", '')
                if find_clear != '':
                    key_clear = find_clear

        # special_id values of the current query
        if "special_id" in key_data:
            knowledge_list = [int(i) for i in key_data["special_id"]]
        else:
            knowledge_list = []

        # MongoDB query conditions
        mongo_find_dict = {
            "special_id": {"$in": knowledge_list},
            "is_stop": 0,
            "subject_id": key_data["subject_id"],
        }
        if "topic_type_id" in key_data:
            mongo_find_dict["topic_type_id"] = key_data["topic_type_id"]

        # Knowledge-point sets
        knowledge_set = set(knowledge_list)
        exam_set = {int(i) for i in (key_data.get("examMethodIds") or [])}
        exam_union_set = knowledge_set | exam_set

        # In strict mode priority_hits holds gaokao questions and other_hits
        # holds the rest; in every other mode only priority_hits is used.
        priority_hits = []
        other_hits = []

        relate_dataset = self.mongo_coll[0].find(
            mongo_find_dict, no_cursor_timeout=True, batch_size=5
        )
        try:
            for data in relate_dataset:
                # Skip the query topic itself and topics without cleaned text
                if (data["topic_id"] == key_data["topic_id"]
                        or "content_clear" not in data):
                    continue
                relate_clear = data["content_clear"]
                if keyword not in relate_clear:
                    continue

                # Knowledge points of the candidate itself
                relate_set = set(data["special_id"])
                if filterType == 1:
                    # Strict mode: relate_set must be a subset of knowledge_set
                    if not relate_set.issubset(knowledge_set):
                        continue
                elif filterType == 2:
                    # Loose mode (question-supplement and annotation modes)
                    if relate_set.issubset(knowledge_set):
                        continue
                    if exam_set and not relate_set.issubset(exam_union_set):
                        continue
                    # Only the shared knowledge points count for ranking
                    relate_set = relate_set & knowledge_set

                # Empty text on either side cannot be scored
                if key_clear == '' or relate_clear == '':
                    fuzz_score = 0
                else:
                    fuzz_score = fuzz.ratio(key_clear, relate_clear) / 100
                # Threshold that prevents associating duplicate topics
                if fuzz_score > self.DUPLICATE_THRESHOLD:
                    continue

                relate_value = [data["topic_id"], fuzz_score, len(relate_set)]
                # Strict mode separates gaokao questions from the others
                if filterType == 1 and data["old_exam"] != 1:
                    other_hits.append(relate_value)
                else:
                    priority_hits.append(relate_value)
        finally:
            # A no_cursor_timeout cursor is never reaped by the server, so it
            # must be closed explicitly even when iteration fails part-way.
            relate_dataset.close()

        # Order by knowledge-point count, then topic id, then score, all
        # descending.
        # NOTE(review): topic_id precedes the score in the key, so the score
        # only breaks ties between equal topic ids -- confirm this is intended.
        def sort_key(item):
            return (-item[2], -item[0], -item[1])

        priority_hits.sort(key=sort_key)
        other_hits.sort(key=sort_key)
        return (priority_hits + other_hits)[:self.MAX_RESULTS]


if __name__ == '__main__':
    # Initialise Knowledge_Relate
    kl_rlt = Knowledge_Relate()
    relate_data = {'topic_id': 177064120, 'topic_type_id': 7, 'school_id': 1, 'subject_id': 6, 'content': '', 'special': ['481', '480', '478', '479', '489'], 'filterType': 1, 'keyword': '838656', 'examMethodIds': []}
    res_list = kl_rlt.relate(relate_data)
    print(res_list)