"""Central configuration for the retrieval service.

Importing this module creates a MongoDB client handle and defines the
model hyper-parameters, file-system locations and log settings that the
rest of the project reads as plain module-level names.
"""
import os

import pymongo

# --- MongoDB ---------------------------------------------------------------
# Client, database and collection handles.
myclient = pymongo.MongoClient("mongodb://192.168.1.140:27017/")
mongo_info_db = myclient["ksy"]
mongo_coll = mongo_info_db['test_topic']

# Flag marking sentence-vector training status in MongoDB.
sent_train_flag = 1

# Template (dict layout) used for batch processing of data.
# NOTE(review): the lists are mutable and shared by every importer; copy
# (e.g. copy.deepcopy) before filling if per-batch isolation is expected.
batch_processing_dict = {
    "id_list": [],
    "cont_clear_list": [],
    "cont_cut_list": [],
    "cut_idx_list": [0],
}

# --- Vector / HNSW parameters ------------------------------------------------
# Word-vector dimension.
vector_dim = 384

# HNSW metric; change the index to pick one of 'l2', 'cosine', 'ip'.
hnsw_metric = ('l2', 'cosine', 'ip')[0]

# Maximum number of elements in the HNSW index.
num_elements = 1000000

# HNSW recall-size ("ef") parameter.
hnsw_set_ef = 150

# URL of the HNSW model retrieval endpoint.
hnsw_retrieve_url = r"http://localhost:8836/retrieve"

# --- File-system locations ---------------------------------------------------
# Root directory: the process's current working directory at import time.
root_path = os.getcwd()
data_root_path = os.path.join(root_path, "model_data")

# Sentence-BERT model path.
sbert_path = os.path.join(data_root_path, "all-MiniLM-L6-v2")

# BERT-whitening parameter path.
whitening_path = os.path.join(data_root_path, "whitening_param.pkl")

# Stop-word list path.
stop_words_path = os.path.join(data_root_path, "stop_words.txt")

# SQLite database paths.
sqlite_path = os.path.join(data_root_path, "info_retrieval.db")
sqlite_copy_path = os.path.join(data_root_path, "info_retrieval_copy.db")

# HNSW model path.
# NOTE(review): unlike the other artefacts this is a bare filename, not joined
# onto data_root_path -- confirm that this is intentional.
hnsw_path = "hnsw_model.bin"

# Formula-processing data paths.
bow_model_path = os.path.join(data_root_path, "bow_model.pkl")
bow_vector_path = os.path.join(data_root_path, "bow_vector.npy")
formula_data_path = os.path.join(data_root_path, "formula_data.json")

# --- Logging -----------------------------------------------------------------
# Log directory.
log_root_path = os.path.join(root_path, "logs")

# Log file for the duplicate-check (retrieval) application.
retrieval_path = os.path.join(log_root_path, "retrieval_app.log")

# Log message format; placeholders are filled with str.format-style fields.
log_msg = "id : {id} -> {type} -> {message}"