"""Batch-preprocess question documents stored in a MongoDB collection.

The script runs two passes over the collection exposed as
``config.mongo_coll``:

1. ``clear_embedding_train`` hands the documents to ``DataPreProcessing``
   (text cleaning and sentence-embedding computation, per the original
   comments).
2. ``convert_knowledge2id`` stores, for every document, the extracted
   physical quantities, the solving type and the knowledge-point ids.

Usage::

    python <script>             # process every matching document
    python <script> START:STOP  # process the cursor slice [START:STOP];
                                # either bound may be left empty

Example of a MongoDB document:
{
    "id" : 20231001,
    "quesType" : { "quesType" : "单选题" },
    "quesBody" : "荔枝是一种岭南佳果,小明拿起一个荔枝,如题图所示,它的尺寸l大小约为( )

\nA. 0.1cm B. 3cm C. 0.3m D. 1m",
    "quesParse" : "......",
    "quesAnswer" : "【答案】见解析",
    "difficulty" : "一般",
    "knowledge" : [ "长度的测量" ]
}
"""

import json
import sys
import time

import config
from data_preprocessing import DataPreProcessing
from dim_classify import Dimension_Classification
from physical_quantity_extract import physical_quantity_extract

_USAGE = (
    "usage: {prog} [START:STOP]\n"
    "  START and STOP are optional integers, e.g. '100:200', ':50' or '10:'"
)


def clear_embedding_train(mongo_coll, mongo_find_dict, sup, sub):
    """Run ``DataPreProcessing`` (training mode) over a slice of the collection.

    Args:
        mongo_coll: MongoDB collection to read from; it is also passed to
            ``DataPreProcessing``.
        mongo_find_dict: Filter document passed to ``mongo_coll.find``.
        sup: Lower bound of the cursor slice, or ``None`` for the beginning.
        sub: Upper bound of the cursor slice, or ``None`` for no limit.

    The elapsed wall-clock time is printed when processing finishes.
    """
    origin_dataset = mongo_coll.find(
        mongo_find_dict, no_cursor_timeout=True, batch_size=5
    )
    try:
        dpp = DataPreProcessing(mongo_coll, is_train=True)
        start = time.time()
        # NOTE(review): assumes the call consumes the cursor before returning,
        # since its result is discarded and the timing is printed right after.
        dpp(origin_dataset[sup:sub])
    finally:
        # A cursor opened with no_cursor_timeout=True is never reaped by the
        # server, so it has to be closed explicitly, even on failure.
        origin_dataset.close()
    print("耗时:", time.time() - start)


def convert_knowledge2id(mongo_coll, mongo_find_dict, sup, sub):
    """Annotate documents with physical quantities, solving type and knowledge ids.

    For every document in the selected slice the fields ``physical_quantity``,
    ``solving_type`` and ``knowledge_id`` are written back with ``$set``,
    matching the document by its ``id`` field.

    Args:
        mongo_coll: MongoDB collection to read from and update.
        mongo_find_dict: Filter document passed to ``mongo_coll.find``.
        sup: Lower bound of the cursor slice, or ``None`` for the beginning.
        sub: Upper bound of the cursor slice, or ``None`` for no limit.

    Raises:
        KeyError: If a document lacks ``id``, ``content_clear``, ``quesType``
            or ``knowledge``, or the mapping file lacks ``knowledge2id``.
    """
    # Load the knowledge-point -> id mapping.
    with open(config.keyword_mapping_path, 'r', encoding="utf8") as f:
        knowledge2id = json.load(f)["knowledge2id"]

    dim_classify = Dimension_Classification(dim_mode=0)
    origin_dataset = mongo_coll.find(
        mongo_find_dict, no_cursor_timeout=True, batch_size=5
    )
    start = time.time()
    try:
        for data in origin_dataset[sup:sub]:
            condition = {"id": data["id"]}
            # Physical quantities found in the cleaned text.
            # presumably "content_clear" is written by the preprocessing
            # pass -- verify against DataPreProcessing.
            physical_quantity_list = physical_quantity_extract(
                data["content_clear"]
            )
            # Solving type predicted by the dimension classifier.
            solution_list = dim_classify(
                data["content_clear"], data["quesType"]
            )["solving_type"]
            # Knowledge points without an entry in the mapping are skipped.
            knowledge_list = [
                knowledge2id[ele]
                for ele in data["knowledge"]
                if ele in knowledge2id
            ]
            update_elements = {
                "$set": {
                    "physical_quantity": physical_quantity_list,
                    "solving_type": solution_list,
                    "knowledge_id": knowledge_list,
                }
            }
            mongo_coll.update_one(condition, update_elements)
            print(physical_quantity_list, solution_list)
    finally:
        # Release the server-side cursor even if an iteration fails.
        origin_dataset.close()
    print("耗时:", time.time() - start)


def _parse_range(argv):
    """Return ``(sup, sub)`` parsed from an optional ``START:STOP`` argument.

    With no argument both bounds are ``None``. An empty bound is ``None``.
    Any other command-line shape terminates the script with a usage message
    instead of failing later with an unrelated error.
    """
    if len(argv) == 1:
        return None, None
    if len(argv) != 2 or argv[1].count(":") != 1:
        raise SystemExit(_USAGE.format(prog=argv[0]))
    lower, upper = argv[1].split(":")
    try:
        sup = int(lower) if lower else None
        sub = int(upper) if upper else None
    except ValueError:
        raise SystemExit(_USAGE.format(prog=argv[0])) from None
    return sup, sub


def main():
    """Script entry point: run both passes over the configured collection."""
    # Read the optional slice from the command line.
    sup, sub = _parse_range(sys.argv)

    # Collection handle comes from the project configuration.
    mongo_coll = config.mongo_coll
    # Alternative filter kept from the original: only documents that do not
    # have a "sent_train_flag" field.
    # mongo_find_dict = {"sent_train_flag": {"$exists": 0}}
    mongo_find_dict = {}

    # Pass 1: text cleaning and sentence-embedding computation.
    clear_embedding_train(mongo_coll, mongo_find_dict, sup, sub)
    # Pass 2: knowledge points -> ids (used for MongoDB retrieval), plus
    # physical quantities and solving type.
    convert_knowledge2id(mongo_coll, mongo_find_dict, sup, sub)


if __name__ == "__main__":
    main()