123456789101112131415161718192021222324252627 |
- import sys
- import time
- import config
- from data_preprocessing import DataPreProcessing
# Data cleaning and sentence-embedding computation.
def clear_embedding_train(mongo_coll, sup, sub):
    """Run training-mode preprocessing over a slice of the stored documents.

    Queries ``mongo_coll[0]`` for documents whose ``is_stop`` field is 0,
    hands the ``[sup:sub]`` slice of the resulting cursor to a
    ``DataPreProcessing`` instance built with ``is_train=True``, and prints
    the elapsed time.

    Args:
        mongo_coll: Indexable container whose first element exposes ``find``;
            presumably a list of pymongo collections (the script passes
            ``config.mongo_coll_list``) -- confirm against ``config``.
        sup: Start offset of the slice, or ``None`` to start at the beginning.
        sub: End offset (exclusive) of the slice, or ``None`` for no upper
            bound.
    """
    origin_dataset = mongo_coll[0].find(
        {"is_stop": 0}, no_cursor_timeout=True, batch_size=5
    )
    try:
        dpp = DataPreProcessing(mongo_coll, is_train=True)
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by wall-clock adjustments during a long run.
        start = time.perf_counter()
        dpp(origin_dataset[sup:sub], hnsw_index=0)
        print("耗时:", time.perf_counter() - start)
    finally:
        # A cursor opened with no_cursor_timeout=True is exempt from the
        # server's idle-cursor timeout, so release it explicitly even when
        # preprocessing raises or stops before exhausting the cursor.
        origin_dataset.close()
- if __name__ == "__main__":
- # 获取shell输入参数
- argv_list = sys.argv
- if len(argv_list) == 1:
- sup, sub = None, None
- elif len(argv_list) == 2:
- sup, sub = argv_list[1].split(':')
- sup = None if sup == '' else int(sup)
- sub = None if sub == '' else int(sub)
- # 获取mongodb数据
- mongo_coll = config.mongo_coll_list
- # 清洗文本与计算句向量(train_mode=1表示需要进行文本清洗与句向量计算)
- clear_embedding_train(mongo_coll, sup, sub)
|