import sys
import time
import json
import config
from data_preprocessing import DataPreProcessing
from dim_classify import Dimension_Classification
from physical_quantity_extract import physical_quantity_extract
"""
MongoDB document format (example):
{
"id" : 20231001,
"quesType" : {
"quesType" : "单选题"
},
"quesBody" : "荔枝是一种岭南佳果,小明拿起一个荔枝,如题图所示,它的尺寸l大小约为( )
\nA. 0.1cm B. 3cm C. 0.3m D. 1m",
"quesParse" : "......",
"quesAnswer" : "【答案】见解析",
"difficulty" : "一般",
"knowledge" : [
"长度的测量"
]
}
"""
# Data cleaning and sentence-embedding computation
def clear_embedding_train(mongo_coll, mongo_find_dict, sup, sub):
    """Feed a slice of the matching documents to ``DataPreProcessing``.

    According to the comments in this file, the pre-processing step performs
    text cleaning and sentence-embedding computation; the details live in
    ``data_preprocessing.DataPreProcessing``.

    Args:
        mongo_coll: MongoDB collection that is queried and also handed to
            ``DataPreProcessing``.
        mongo_find_dict: Filter document passed to ``mongo_coll.find``.
        sup: Start index of the cursor slice, or ``None`` to start at the
            first matching document.
        sub: Stop index of the cursor slice, or ``None`` for no upper bound.
    """
    origin_dataset = mongo_coll.find(mongo_find_dict, no_cursor_timeout=True, batch_size=5)
    try:
        dpp = DataPreProcessing(mongo_coll, is_train=True)
        # perf_counter is monotonic, so the elapsed time cannot be skewed by
        # wall-clock adjustments during a long run.
        start = time.perf_counter()
        dpp(origin_dataset[sup:sub])
        print("耗时:", time.perf_counter() - start)
    finally:
        # A cursor opened with no_cursor_timeout=True is never reaped by the
        # server, so it must be closed explicitly, even when processing fails.
        origin_dataset.close()
# Convert knowledge points to IDs for MongoDB retrieval / compute physical quantities / compute solving types
def convert_knowledge2id(mongo_coll, mongo_find_dict, sup, sub):
    """Annotate each selected document with knowledge IDs, physical quantities and solving types.

    For every document in the selected slice, three fields are computed and
    written back with ``update_one`` (matched on the document's ``id``):

    * ``physical_quantity`` - result of ``physical_quantity_extract`` on the
      document's ``content_clear`` text.
    * ``solving_type`` - the ``"solving_type"`` entry returned by the
      ``Dimension_Classification`` instance.
    * ``knowledge_id`` - IDs of the document's ``knowledge`` entries; entries
      missing from the mapping file are skipped.

    Args:
        mongo_coll: MongoDB collection that is both read and updated.
        mongo_find_dict: Filter document passed to ``mongo_coll.find``.
        sup: Start index of the cursor slice, or ``None`` to start at the
            first matching document.
        sub: Stop index of the cursor slice, or ``None`` for no upper bound.
    """
    # Load the knowledge-point -> ID mapping.
    with open(config.keyword_mapping_path, 'r', encoding="utf8") as f:
        knowledge2id = json.load(f)["knowledge2id"]
    dim_classify = Dimension_Classification(dim_mode=0)
    origin_dataset = mongo_coll.find(mongo_find_dict, no_cursor_timeout=True, batch_size=5)
    start = time.perf_counter()
    try:
        for data in origin_dataset[sup:sub]:
            condition = {"id": data["id"]}
            content = data["content_clear"]
            # Physical quantities mentioned in the cleaned text.
            physical_quantity_list = physical_quantity_extract(content)
            # Solving type(s) of the question.
            solution_list = dim_classify(content, data["quesType"])["solving_type"]
            # Map knowledge points to IDs, ignoring unknown ones.
            knowledge_list = [knowledge2id[ele] for ele in data["knowledge"] if ele in knowledge2id]
            update_elements = {"$set": {"physical_quantity": physical_quantity_list,
                                        "solving_type": solution_list,
                                        "knowledge_id": knowledge_list}}
            mongo_coll.update_one(condition, update_elements)
            print(physical_quantity_list, solution_list)
    finally:
        # A cursor opened with no_cursor_timeout=True is never reaped by the
        # server, so it must be closed explicitly, even when a document fails.
        origin_dataset.close()
    print("耗时:", time.perf_counter() - start)
def parse_range_arg(argv):
    """Parse the optional ``START:STOP`` command-line argument.

    Args:
        argv: Argument list in ``sys.argv`` layout (program name first).

    Returns:
        A ``(start, stop)`` tuple. Each element is an ``int``, or ``None``
        when that side of the colon is empty or no argument was given.

    Raises:
        ValueError: If more than one argument is supplied, the argument does
            not contain exactly one ``:``, or a bound is not an integer.
    """
    if len(argv) <= 1:
        return None, None
    if len(argv) != 2:
        raise ValueError(
            "expected at most one START:STOP argument, got {}".format(len(argv) - 1))
    bounds = argv[1].split(":")
    if len(bounds) != 2:
        raise ValueError(
            "range must look like START:STOP (either side may be empty), "
            "got {!r}".format(argv[1]))
    start, stop = (int(bound) if bound else None for bound in bounds)
    return start, stop


if __name__ == "__main__":
    # Read the optional document range from the shell arguments.
    try:
        sup, sub = parse_range_arg(sys.argv)
    except ValueError as err:
        sys.exit("usage: {} [START:STOP] ({})".format(sys.argv[0], err))
    # Collection holding the question documents.
    mongo_coll = config.mongo_coll
    # Alternative filter (only documents without the flag field):
    # mongo_find_dict = {"sent_train_flag": {"$exists": 0}}
    mongo_find_dict = {}
    # Clean the text and compute sentence embeddings.
    clear_embedding_train(mongo_coll, mongo_find_dict, sup, sub)
    # Convert knowledge points to IDs for MongoDB retrieval.
    convert_knowledge2id(mongo_coll, mongo_find_dict, sup, sub)