import sys
import time

import config
from data_preprocessing import DataPreProcessing


def clear_embedding_train(mongo_coll, sup, sub):
    """Clean the data and compute sentence embeddings for a slice of records.

    Queries ``mongo_coll[0]`` for documents whose ``is_stop`` field is 0,
    slices the resulting cursor with ``[sup:sub]`` and hands the slice to a
    ``DataPreProcessing`` instance created with ``is_train=True``. The
    elapsed wall-clock time of the processing call is printed afterwards.

    Args:
        mongo_coll: Indexable container of collections; element 0 is queried
            here and the whole container is passed to ``DataPreProcessing``.
            (presumably pymongo collections -- confirm against ``config``)
        sup: Start bound of the slice applied to the cursor, or ``None``.
        sub: Stop bound of the slice applied to the cursor, or ``None``.
    """
    origin_dataset = mongo_coll[0].find(
        {"is_stop": 0}, no_cursor_timeout=True, batch_size=5
    )
    try:
        dpp = DataPreProcessing(mongo_coll, is_train=True)
        start = time.time()
        dpp(origin_dataset[sup:sub], hnsw_index=0)
        print("耗时:", time.time() - start)
    finally:
        # The cursor was opened with no_cursor_timeout=True, so it is not
        # reaped automatically when idle; close it explicitly even when
        # processing fails part-way through.
        origin_dataset.close()


def _parse_range(argv):
    """Return the ``(sup, sub)`` slice bounds described by the command line.

    Args:
        argv: Argument list in ``sys.argv`` layout (program name first).

    Returns:
        ``(None, None)`` when no argument is given; otherwise the two sides
        of a ``START:STOP`` argument, each converted with ``int`` or mapped
        to ``None`` when left empty.

    Raises:
        ValueError: If more than one argument is given, the argument does
            not contain exactly one ``:``, or a bound is not an integer.
    """
    if len(argv) == 1:
        return None, None
    if len(argv) != 2:
        # Previously this case left sup/sub unassigned and crashed later
        # with a NameError.
        raise ValueError(
            "expected at most one argument, got {}".format(len(argv) - 1)
        )

    parts = argv[1].split(':')
    if len(parts) != 2:
        raise ValueError(
            "range must look like START:STOP, got {!r}".format(argv[1])
        )

    sup, sub = parts
    sup = None if sup == '' else int(sup)
    sub = None if sub == '' else int(sub)
    return sup, sub


def main(argv=None):
    """Script entry point: parse the optional range and run the processing.

    Args:
        argv: Argument list in ``sys.argv`` layout; defaults to ``sys.argv``.
    """
    argv = sys.argv if argv is None else argv

    # Read the optional START:STOP range from the shell arguments.
    try:
        sup, sub = _parse_range(argv)
    except ValueError as err:
        sys.exit("usage: {} [START:STOP]\n{}".format(argv[0], err))

    # Fetch the MongoDB collections from the project configuration.
    mongo_coll = config.mongo_coll_list

    # Clean the text and compute the sentence embeddings.
    # NOTE(review): the original comment mentioned "train_mode=1"; no such
    # parameter exists here -- the code passes is_train=True instead.
    clear_embedding_train(mongo_coll, sup, sub)


if __name__ == "__main__":
    main()