# knowledge_relate.py (4.5 KB)
  1. from fuzzywuzzy import fuzz
  2. import config
  3. from data_preprocessing import DataPreProcessing
  4. # 知识点关联
  class Knowledge_Relate():
      """Relate a query topic to other topics through shared knowledge points.

      Attributes:
          mongo_coll: Sequence of MongoDB collections taken from
              ``config.mongo_coll_list``. Index 0 is searched for candidate
              topics; index 1 is looked up by ``topic_id`` for a stored
              ``content_clear`` text when ``school_id`` is non-zero.
          logger: Optional logger handed to ``DataPreProcessing``.
          log_msg: ``config.log_msg``; stored but not used by this class.
          dpp: ``DataPreProcessing`` instance; called once per ``relate`` call.
      """

      # Candidates scoring above this are treated as duplicates of the query
      # topic and dropped.
      DUPLICATE_THRESHOLD = 0.99
      # Maximum number of rows returned by relate().
      MAX_RESULTS = 2000

      def __init__(self, logger=None):
          """Load the collections from ``config`` and build the pre-processor.

          Args:
              logger: Optional logger object; stored and forwarded to
                  ``DataPreProcessing``.
          """
          # Initial configuration data.
          self.mongo_coll = config.mongo_coll_list
          # Log collection.
          self.logger = logger
          self.log_msg = config.log_msg
          # Data pre-processing instance.
          self.dpp = DataPreProcessing(self.mongo_coll, self.logger)

      @staticmethod
      def _rank_key(row):
          """Sort key for a ``[topic_id, score, matched_count]`` row.

          Orders by matched_count, then topic_id, then score, all descending.
          NOTE(review): topic_id precedes score, so score only breaks ties
          between rows with the same topic_id; confirm this order is intended.
          """
          return (-row[2], -row[0], -row[1])

      def relate(self, key_data, filterType=1):
          """Return topics related to ``key_data``, best match first.

          Args:
              key_data: Mapping describing the query. Reads ``keyword``,
                  ``school_id``, ``topic_id`` and ``subject_id``; optionally
                  ``special_id``, ``examMethodIds`` and ``topic_type_id``.
              filterType: 1 for strict mode (a candidate's ``special_id`` set
                  must be a subset of the query's); 2 for loose mode (the
                  candidate must NOT be a subset, and when exam-method ids are
                  given it must lie within knowledge ids | exam-method ids).
                  Any other value applies no subset filter.

          Returns:
              A list of ``[topic_id, score, matched_count]`` rows, at most
              ``MAX_RESULTS`` long. If the keyword is a decimal number that
              matches a stored ``topic_id``, the single row
              ``[topic_id, 1, len(special_id)]`` is returned instead. In
              strict mode rows whose ``old_exam`` equals 1 come before all
              others.
          """
          # Normalise once: the keyword is used both for a numeric test and
          # for a substring test, and the latter needs a str.
          keyword = str(key_data["keyword"])

          # Keyword search: a numeric keyword is first tried as a topic id.
          # isdecimal() (not isdigit()) so that int() below cannot fail on
          # characters such as superscript digits.
          if keyword.isdecimal():
              topic_id = int(keyword)
              hit = self.mongo_coll[0].find_one({"topic_id": topic_id})
              if hit is not None:
                  return [[topic_id, 1, len(hit["special_id"])]]

          # Only the second element of the pre-processor result is used; it
          # serves as the query text for fuzzy comparison (presumably the
          # cleaned content; confirm against DataPreProcessing).
          _, key_clear = self.dpp([key_data], hnsw_index=0, is_retrieve=True)
          if key_data["school_id"] != 0:
              stored = self.mongo_coll[1].find_one({"topic_id": key_data["topic_id"]})
              if stored is not None:
                  stored_clear = stored.get("content_clear", '')
                  if stored_clear != '':
                      key_clear = stored_clear

          # Knowledge-point ids of the query (optional in key_data).
          knowledge_list = [int(i) for i in key_data.get("special_id", [])]
          knowledge_set = set(knowledge_list)
          # Exam-method ids are treated as optional too, like special_id.
          exam_set = {int(i) for i in key_data.get("examMethodIds", [])}
          exam_union_set = knowledge_set | exam_set

          # MongoDB query condition.
          mongo_find_dict = {
              "special_id": {"$in": knowledge_list},
              "is_stop": 0,
              "subject_id": key_data["subject_id"],
          }
          if "topic_type_id" in key_data:
              mongo_find_dict["topic_type_id"] = key_data["topic_type_id"]

          # Loop-invariant: an empty query text always yields score 0.
          key_is_empty = key_clear == ''

          # In strict mode exam_rows holds college-entrance-exam topics and
          # other_rows the rest; in every other mode all rows go to exam_rows.
          exam_rows, other_rows = [], []

          relate_dataset = self.mongo_coll[0].find(
              mongo_find_dict, no_cursor_timeout=True, batch_size=5)
          try:
              for data in relate_dataset:
                  # Skip the query topic itself and documents without text.
                  if data["topic_id"] == key_data["topic_id"] or "content_clear" not in data:
                      continue
                  relate_clear = data["content_clear"]
                  if keyword not in relate_clear:
                      continue

                  # Knowledge-point ids of the candidate.
                  relate_set = set(data["special_id"])
                  is_subset = relate_set.issubset(knowledge_set)
                  if filterType == 1:
                      # Strict mode: candidate must be a subset of the query.
                      if not is_subset:
                          continue
                  elif filterType == 2:
                      # Loose mode (question-supplement and annotation modes).
                      if is_subset:
                          continue
                      if exam_set and not relate_set.issubset(exam_union_set):
                          continue
                      # Count only the ids shared with the query.
                      relate_set = relate_set & knowledge_set

                  # An empty text on either side scores 0 without comparison.
                  if key_is_empty or relate_clear == '':
                      fuzz_score = 0
                  else:
                      fuzz_score = fuzz.ratio(key_clear, relate_clear) / 100
                  # Threshold guards against relating to duplicate topics.
                  if fuzz_score > self.DUPLICATE_THRESHOLD:
                      continue

                  relate_value = [data["topic_id"], fuzz_score, len(relate_set)]
                  # Only strict mode distinguishes exam topics from the rest.
                  if filterType == 1 and data["old_exam"] != 1:
                      other_rows.append(relate_value)
                  else:
                      exam_rows.append(relate_value)
          finally:
              # A cursor opened with no_cursor_timeout=True is never reaped by
              # the server, so it must be closed explicitly.
              relate_dataset.close()

          exam_rows.sort(key=self._rank_key)
          other_rows.sort(key=self._rank_key)
          return (exam_rows + other_rows)[:self.MAX_RESULTS]
  if __name__ == '__main__':
      # Manual smoke run: build the relater and print the result for one
      # sample query.
      # NOTE(review): relate() reads "special_id" and takes filterType as an
      # argument; the 'special' and 'filterType' entries below are not read by
      # it. Confirm whether this sample mirrors a real request payload.
      sample_query = {
          'topic_id': 177064120,
          'topic_type_id': 7,
          'school_id': 1,
          'subject_id': 6,
          'content': '',
          'special': ['481', '480', '478', '479', '489'],
          'filterType': 1,
          'keyword': '838656',
          'examMethodIds': [],
      }
      print(Knowledge_Relate().relate(sample_query))