# tujintao/physics_repeat_check
							import os
import pymongo

# Set up the MongoDB handles: client -> "ksy" database -> "topic" collection.
myclient = pymongo.MongoClient(host="mongodb://192.168.1.140:27017/")
mongo_info_db = myclient.get_database("ksy")
mongo_coll = mongo_info_db.get_collection("topic")

# Flag for MongoDB sentence-vector training.
sent_train_flag = 1
# Layout of the dictionary used for batch processing: three empty lists plus
# "cut_idx_list", which starts out holding a single 0.
batch_processing_dict = dict(
    id_list=[],
    cont_clear_list=[],
    cont_cut_list=[],
    cut_idx_list=[0],
)

# Dimensionality of the word vectors.
vector_dim = 384
# HNSW metric; the available choices are 'l2', 'cosine' and 'ip'.
hnsw_metric = "l2"
# Maximum number of entries in the HNSW index.
num_elements = 10 ** 6
# HNSW recall-size parameter (ef).
hnsw_set_ef = 150

# URL of the HNSW model retrieval endpoint.
hnsw_retrieve_url = "http://localhost:8836/retrieve"

# Root directory: the process's current working directory at import time.
root_path = os.getcwd()
data_root_path = os.path.join(root_path, "model_data")


def _data_file(filename):
    """Return the path of *filename* inside ``data_root_path``."""
    return os.path.join(data_root_path, filename)


# Sentence-BERT model location.
sbert_path = _data_file("all-MiniLM-L6-v2")
# BERT-whitening parameter file.
whitening_path = _data_file("whitening_param.pkl")
# Stop-word list.
stop_words_path = _data_file("stop_words.txt")
# SQLite database and its copy.
sqlite_path = _data_file("info_retrieval.db")
sqlite_copy_path = _data_file("info_retrieval_copy.db")
# HNSW model file (alternative file name: "hnsw_model_norm.bin").
# NOTE(review): unlike the other paths this one is a bare file name, not
# joined onto data_root_path -- confirm that is intentional.
hnsw_path = "hnsw_model.bin"
# Files used for formula processing.
bow_model_path = _data_file("bow_model.pkl")
bow_vector_path = _data_file("bow_vector.npy")
formula_data_path = _data_file("formula_data.json")

# Log directory.
log_root_path = os.path.join(root_path, "logs")
# Log file for the duplicate-check (retrieval) application.
retrieval_path = os.path.join(log_root_path, "retrieval_app.log")
# Format template for log messages.
log_msg = "id : {id} -> {type} -> {message}"