1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556 |
- import os
- import pymongo
# MongoDB handles shared by the rest of the project: the client, the "ksy"
# database and its "topic" collection.
myclient = pymongo.MongoClient(host="mongodb://192.168.1.140:27017/")
mongo_info_db = myclient["ksy"]
mongo_coll = mongo_info_db["topic"]
# Flag marking MongoDB records for sentence-vector training.
sent_train_flag = 1

# Template of the dictionary used for batch processing.
# NOTE(review): this is a single mutable module-level object; presumably
# callers copy it before filling it in -- confirm against the call sites.
batch_processing_dict = dict(
    id_list=[],
    cont_clear_list=[],
    cont_cut_list=[],
    cut_idx_list=[0],
)

# Dimensionality of the embedding vectors.
vector_dim = 384

# HNSW distance metric; the candidates are "l2", "cosine" and "ip".
hnsw_metric = "l2"

# Maximum number of elements the HNSW index may hold.
num_elements = 10 ** 6

# HNSW "ef" parameter controlling how many candidates are recalled.
hnsw_set_ef = 150

# URL of the HNSW retrieval service.
hnsw_retrieve_url = "http://localhost:8836/retrieve"
# Root directory: the current working directory at import time.
root_path = os.getcwd()
data_root_path = os.path.join(root_path, "model_data")


def _data_file(name):
    """Return the path of *name* inside ``data_root_path``."""
    return os.path.join(data_root_path, name)


# Sentence-BERT model directory.
sbert_path = _data_file("all-MiniLM-L6-v2")

# BERT-whitening parameters.
whitening_path = _data_file("whitening_param.pkl")

# Stop-word list.
stop_words_path = _data_file("stop_words.txt")

# SQLite database and its copy.
sqlite_path = _data_file("info_retrieval.db")
sqlite_copy_path = _data_file("info_retrieval_copy.db")

# HNSW model file.
# NOTE: unlike the paths above, this is a bare file name that is not joined
# onto data_root_path. Alternative model file: "hnsw_model_norm.bin".
hnsw_path = "hnsw_model.bin"

# Data files used for formula processing.
bow_model_path = _data_file("bow_model.pkl")
bow_vector_path = _data_file("bow_vector.npy")
formula_data_path = _data_file("formula_data.json")

# Log directory.
log_root_path = os.path.join(root_path, "logs")

# Log file of the duplicate-checking (retrieval) application.
retrieval_path = os.path.join(log_root_path, "retrieval_app.log")

# Template for the message part of a log record.
log_msg = "id : {id} -> {type} -> {message}"
|