1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374 |
- import sys
- import time
- import json
- import config
- from data_preprocessing import DataPreProcessing
- from dim_classify import Dimension_Classification
- from physical_quantity_extract import physical_quantity_extract
- """
- MongoDB document format (example record):
- {
- "id" : 20231001,
- "quesType" : {
- "quesType" : "单选题"
- },
- "quesBody" : "荔枝是一种岭南佳果,小明拿起一个荔枝,如题图所示,它的尺寸l大小约为( )<br/><img src=\"Upload/QBM/20231001.png\" /><br/>\nA. 0.1cm B. 3cm C. 0.3m D. 1m",
- "quesParse" : "......",
- "quesAnswer" : "【答案】见解析",
- "difficulty" : "一般",
- "knowledge" : [
- "长度的测量"
- ]
- }
- """
def clear_embedding_train(mongo_coll, mongo_find_dict, sup, sub):
    """Run the preprocessing pipeline over a slice of the collection.

    Per the original comment this step performs data cleaning and
    sentence-vector computation; the actual work is done by
    ``DataPreProcessing`` (defined in ``data_preprocessing``), which is
    called with the sliced cursor.

    Args:
        mongo_coll: Collection object exposing ``find`` (presumably a
            pymongo ``Collection`` -- confirm against ``config``). It is
            also handed to ``DataPreProcessing``.
        mongo_find_dict: Query filter passed to ``mongo_coll.find``.
        sup: Slice start applied to the cursor, or ``None``.
        sub: Slice stop applied to the cursor, or ``None``.
    """
    origin_dataset = mongo_coll.find(mongo_find_dict, no_cursor_timeout=True, batch_size=5)
    try:
        dpp = DataPreProcessing(mongo_coll, is_train=True)
        start = time.time()
        dpp(origin_dataset[sup:sub])
        print("耗时:", time.time()-start)
    finally:
        # A cursor opened with no_cursor_timeout=True is never reaped by the
        # server, so it must be closed explicitly -- including when the
        # preprocessing step raises.
        origin_dataset.close()
def convert_knowledge2id(mongo_coll, mongo_find_dict, sup, sub):
    """Annotate each selected document with derived retrieval fields.

    For every document returned by ``mongo_coll.find(mongo_find_dict)``
    within the cursor slice ``[sup:sub]``, the following fields are computed
    and written back with ``update_one`` (matched on the document's ``id``):

    * ``physical_quantity`` -- result of ``physical_quantity_extract`` on
      the document's ``content_clear`` text.
    * ``solving_type`` -- the ``"solving_type"`` entry of the
      ``Dimension_Classification`` result for ``content_clear`` and
      ``quesType``.
    * ``knowledge_id`` -- ids for the document's ``knowledge`` labels,
      looked up in ``model_data/keyword_mapping.json``; labels missing from
      the mapping are skipped.

    Args:
        mongo_coll: Collection object exposing ``find`` and ``update_one``
            (presumably a pymongo ``Collection`` -- confirm against
            ``config``).
        mongo_find_dict: Query filter passed to ``mongo_coll.find``.
        sup: Slice start applied to the cursor, or ``None``.
        sub: Slice stop applied to the cursor, or ``None``.
    """
    # Load the knowledge-label -> id mapping.
    with open("model_data/keyword_mapping.json", 'r', encoding="utf8") as f:
        knowledge2id = json.load(f)["knowledge2id"]
    dim_classify = Dimension_Classification(dim_mode=0)
    origin_dataset = mongo_coll.find(mongo_find_dict, no_cursor_timeout=True, batch_size=5)
    try:
        start = time.time()
        for data in origin_dataset[sup:sub]:
            condition = {"id": data["id"]}
            # Extract physical quantities.
            # NOTE(review): "content_clear" is presumably written by the
            # preprocessing step that runs before this one -- confirm.
            physical_quantity_list = physical_quantity_extract(data["content_clear"])
            # Classify the solving type.
            solution_list = dim_classify(data["content_clear"], data["quesType"])["solving_type"]
            # Map knowledge labels to ids, dropping labels without a mapping.
            knowledge_list = [knowledge2id[ele] for ele in data["knowledge"] if ele in knowledge2id]
            update_elements = {"$set": {"physical_quantity": physical_quantity_list,
                                        "solving_type": solution_list,
                                        "knowledge_id": knowledge_list}}
            mongo_coll.update_one(condition, update_elements)
            print(physical_quantity_list, solution_list)
        print("耗时:", time.time()-start)
    finally:
        # A cursor opened with no_cursor_timeout=True is never reaped by the
        # server, so it must be closed explicitly -- including when a record
        # fails part-way through the loop.
        origin_dataset.close()
def parse_slice_arg(argv_list):
    """Parse the optional ``START:STOP`` command-line argument.

    Args:
        argv_list: Argument vector in ``sys.argv`` layout (program name
            first, then at most one ``START:STOP`` argument).

    Returns:
        A ``(sup, sub)`` tuple. Each element is an ``int``, or ``None`` when
        that side of the colon is empty or no argument was given at all.

    Raises:
        ValueError: If more than one argument is given, the argument does
            not contain exactly one ``:``, or a bound is not an integer.
    """
    if len(argv_list) <= 1:
        return None, None
    if len(argv_list) != 2:
        raise ValueError("expected at most one argument, got %d" % (len(argv_list) - 1))
    parts = argv_list[1].split(':')
    if len(parts) != 2:
        raise ValueError("range must look like START:STOP, got %r" % (argv_list[1],))
    sup, sub = parts
    sup = None if sup == '' else int(sup)
    sub = None if sub == '' else int(sub)
    return sup, sub


if __name__ == "__main__":
    # Read the optional record range from the shell arguments. Previously,
    # passing extra arguments left sup/sub unbound and crashed later with a
    # NameError; now any malformed invocation exits with a usage message.
    argv_list = sys.argv
    try:
        sup, sub = parse_slice_arg(argv_list)
    except ValueError as err:
        sys.exit("usage: %s [START:STOP]  (%s)" % (argv_list[0], err))
    # Collection to process.
    mongo_coll = config.mongo_coll
    # Alternative filter that was left disabled in the original:
    # mongo_find_dict = {"sent_train_flag": {"$exists": 0}}
    mongo_find_dict = dict()
    # Step 1: clean the text and compute sentence vectors.
    clear_embedding_train(mongo_coll, mongo_find_dict, sup, sub)
    # Step 2: convert knowledge points to ids for MongoDB retrieval.
    convert_knowledge2id(mongo_coll, mongo_find_dict, sup, sub)
|