123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103 |
- from fuzzywuzzy import fuzz
- import config
- from data_preprocessing import DataPreProcessing
# Knowledge-point association
class Knowledge_Relate():
    """Find topics related to a query topic through shared knowledge points.

    Candidates are read from ``self.mongo_coll[0]``, filtered by keyword and
    by how their ``special_id`` values overlap with those of the query, scored
    against the query text with ``fuzz.ratio`` and returned in ranked order.
    """

    # Candidates scoring above this similarity are treated as duplicates of
    # the query topic and dropped.
    DUPLICATE_THRESHOLD = 0.99
    # Maximum number of entries returned by relate().
    MAX_RESULTS = 2000

    def __init__(self, logger=None):
        """Bind the configured collections and build the pre-processor.

        Args:
            logger: Optional logger; stored and passed on to DataPreProcessing.
        """
        # Initial configuration data
        self.mongo_coll = config.mongo_coll_list
        # Logging
        self.logger = logger
        self.log_msg = config.log_msg
        # Data pre-processing instance
        self.dpp = DataPreProcessing(self.mongo_coll, self.logger)

    def relate(self, key_data: dict, filterType: int = 1) -> list:
        """Return topics related to ``key_data``.

        Args:
            key_data: Query description. Keys read here: ``keyword``,
                ``topic_id``, ``school_id``, ``subject_id`` (required) and
                ``special_id``, ``topic_type_id``, ``examMethodIds``
                (optional).
            filterType: ``1`` (strict) keeps candidates whose ``special_id``
                set is a subset of the query's set. ``2`` (loose) keeps
                candidates that are NOT such a subset and, when
                ``examMethodIds`` is non-empty, whose set lies inside the
                union of both id sets. Any other value applies no
                knowledge-point filter.
                NOTE(review): the examMethodIds check is assumed to belong
                to mode 2 only; modes 1 and 2 behave the same either way.

        Returns:
            A list of ``[topic_id, score, matched_count]`` entries, at most
            ``MAX_RESULTS`` long. When the keyword is all digits and matches
            a stored ``topic_id``, the result is the single entry
            ``[topic_id, 1, len(special_id)]``. In strict mode, entries with
            ``old_exam == 1`` come before all others.
        """
        # Normalise once: the keyword may arrive as a non-string (e.g. an int),
        # and the substring test below requires text. None means "no keyword".
        raw_keyword = key_data["keyword"]
        keyword = "" if raw_keyword is None else str(raw_keyword)

        # Keyword search: an all-digit keyword is first tried as a topic_id
        if keyword.isdigit():
            topic_id = int(keyword)
            find_data = self.mongo_coll[0].find_one({"topic_id": topic_id})
            if find_data is not None:
                return [[topic_id, 1, len(find_data["special_id"])]]

        key_clear = self._query_text(key_data)

        # special_id values of the current query (optional key)
        knowledge_list = [int(i) for i in key_data.get("special_id", [])]
        # Knowledge-point sets
        knowledge_set = set(knowledge_list)
        exam_set = {int(i) for i in key_data.get("examMethodIds") or []}
        exam_union_set = knowledge_set | exam_set

        # MongoDB query conditions
        mongo_find_dict = {"special_id": {"$in": knowledge_list}, "is_stop": 0,
                           "subject_id": key_data["subject_id"]}
        if "topic_type_id" in key_data:
            mongo_find_dict["topic_type_id"] = key_data["topic_type_id"]
        relate_dataset = self.mongo_coll[0].find(
            mongo_find_dict, no_cursor_timeout=True, batch_size=5)

        # In strict mode exam_topics holds old_exam == 1 entries and
        # other_topics holds the rest; other modes only fill exam_topics.
        exam_topics, other_topics = [], []
        try:
            for data in relate_dataset:
                # Skip the query topic itself and records without cleaned text
                if data["topic_id"] == key_data["topic_id"] or "content_clear" not in data:
                    continue
                relate_clear = data["content_clear"]
                if keyword not in relate_clear:
                    continue

                # The candidate's own knowledge-point set
                relate_set = set(data["special_id"])
                if filterType == 1:
                    # Strict mode: relate_set must be a subset of knowledge_set
                    if not relate_set <= knowledge_set:
                        continue
                elif filterType == 2:
                    # Loose mode: the candidate must go beyond knowledge_set ...
                    if relate_set <= knowledge_set:
                        continue
                    # ... but stay inside the union when exam ids are given
                    if exam_set and not relate_set <= exam_union_set:
                        continue
                    # Only the points shared with the query are counted
                    relate_set &= knowledge_set

                # An empty text on either side cannot be compared
                if key_clear == '' or relate_clear == '':
                    fuzz_score = 0
                else:
                    fuzz_score = fuzz.ratio(key_clear, relate_clear) / 100
                # Threshold that prevents relating to duplicate topics
                if fuzz_score > self.DUPLICATE_THRESHOLD:
                    continue

                relate_value = [data["topic_id"], fuzz_score, len(relate_set)]
                if filterType == 1 and data["old_exam"] != 1:
                    other_topics.append(relate_value)
                else:
                    exam_topics.append(relate_value)
        finally:
            # A no_cursor_timeout cursor is never reaped by the server, so it
            # must be closed explicitly even when iteration fails part-way.
            relate_dataset.close()

        exam_topics.sort(key=self._rank_key)
        other_topics.sort(key=self._rank_key)
        return (exam_topics + other_topics)[:self.MAX_RESULTS]

    def _query_text(self, key_data):
        """Return the cleaned text used to score candidates against the query."""
        # Pre-process in retrieve mode; only the second return value (used
        # here as the cleaned text) is kept.
        _, key_clear = self.dpp([key_data], hnsw_index=0, is_retrieve=True)
        if key_data["school_id"] != 0:
            # Prefer the non-empty content_clear stored in mongo_coll[1]
            find_data = self.mongo_coll[1].find_one({"topic_id": key_data["topic_id"]})
            if find_data is not None:
                find_clear = find_data.get("content_clear", '')
                if find_clear != '':
                    key_clear = find_clear
        return key_clear

    @staticmethod
    def _rank_key(entry):
        """Sort key: matched count, then topic_id, then score, all descending."""
        return (-entry[2], -entry[0], -entry[1])
if __name__ == '__main__':
    # Manual smoke run: build the relater and print the result for one
    # sample query.
    sample_query = {
        'topic_id': 177064120,
        'topic_type_id': 7,
        'school_id': 1,
        'subject_id': 6,
        'content': '',
        'special': ['481', '480', '478', '479', '489'],
        'filterType': 1,
        'keyword': '838656',
        'examMethodIds': [],
    }
    relater = Knowledge_Relate()
    print(relater.relate(sample_query))
|