import torch import torch.nn as nn from transformers import AutoConfig, BertTokenizer, AutoModel import config class Solution_Model(nn.Module): def __init__(self): super(Solution_Model, self).__init__() self.bert_config = AutoConfig.from_pretrained(config.bert_path) self.bert = AutoModel.from_pretrained(config.bert_path) self.fc = nn.Linear(in_features=self.bert_config.hidden_size, out_features=8) def forward(self, input_ids, attention_mask): x = self.bert(input_ids, attention_mask)[0][:, 0, :] x = self.fc(x) return x class Difficulty_Model(nn.Module): def __init__(self): super(Difficulty_Model, self).__init__() self.bert_config = AutoConfig.from_pretrained(config.bert_path) self.bert = AutoModel.from_pretrained(config.bert_path) self.fc = nn.Linear(in_features=self.bert_config.hidden_size, out_features=8) def forward(self, input_ids, attention_mask): x = self.bert(input_ids, attention_mask)[0][:, 0, :] x = self.fc(x) return x class Dimension_Classification(): def __init__(self, logger=None): self.tokenizer = BertTokenizer.from_pretrained(config.bert_path) self.solution_model = torch.load(config.solution_model_path) self.difficulty_model = torch.load(config.difficulty_model_path) self.max_squence_length = 500 self.solving_type_dict = { 0: "实验操作", 1: "计算分析", 2: "连线作图", 3: "实验读数", 4: "现象解释", 5: "概念辨析", 6: "规律理解", 7: "物理学史" } # 日志采集 self.logger = logger def __call__(self, sentence, quesType): solution_list = self.solution_classify(sentence, quesType) difficulty_value = self.difficulty_classify(sentence) res_dict = { "solving_type": solution_list, "difficulty": difficulty_value, } return res_dict def solution_classify(self, sentence, quesType): solution_tensor = self.model_calculate(self.solution_model, sentence) solution_tensor[solution_tensor >= 0.5] = 1 solution_tensor[solution_tensor < 0.5] = 0 solution_list = [self.solving_type_dict[idx] for idx in solution_tensor[0].int().tolist() if idx == 1] # 题型判断 if quesType == "计算题": solution_list.append("计算分析") elif quesType == "作图题": solution_list.append("连线作图") if len(solution_list) == 0: solution_list.append("规律理解") return list(set(solution_list)) def difficulty_classify(self, sentence): difficulty_tensor = self.model_calculate(self.difficulty_model, sentence).item() difficulty_value = 0.6 if difficulty_tensor >= 0.8: difficulty_value = 0.8 elif difficulty_tensor <= 0.2: difficulty_value = 0.4 else: difficulty_value = 0.6 return difficulty_value def model_calculate(self, model, sentence): model.eval() with torch.no_grad(): token_list = self.sentence_tokenize(sentence) mask_list = self.attention_mask(token_list) output_tensor = model(torch.tensor(token_list), attention_mask=torch.tensor(mask_list)) output_tensor = torch.sigmoid(output_tensor) return output_tensor def sentence_tokenize(self, sentence): # 直接截断 # 编码时: 开头添加[LCS]->101, 结尾添加[SEP]->102, 未知的字或单词变为[UNK]->100 token_list = self.tokenizer.encode(sentence[:self.max_squence_length]) # 补齐(pad的索引号就是0) if len(token_list) < self.max_squence_length + 2: token_list.extend([0] * (self.max_squence_length + 2 - len(token_list))) return [token_list] def attention_mask(self, tokens_list): # 在一个文本中,如果是PAD符号则是0,否则就是1 mask_list = [] for tokens in tokens_list: mask = [float(token > 0) for token in tokens] mask_list.append(mask) return mask_list if __name__ == "__main__": dc = Dimension_Classification() sentence = "荆门市是国家循环经济试点市,目前正在沙洋建设全国最大的秸秆气化发电厂.电厂建成后每年可消化秸秆13万吨,发电9*10^7*kW*h.同时电厂所产生的灰渣将生成肥料返还农民,焦油用于精细化工,实现“农业--工业--农业”循环.(1)若秸秆电厂正常工作时,每小时可发电2.5*10^5*kW*h,按每户居民每天使用5只20*W的节能灯、1个800*W的电饭锅、1台100*W的电视机计算,该发电厂同时可供多少户居民正常用电?(2)与同等规模的火电厂相比,该电厂每年可减少6.4万吨二氧化碳的排放量,若火电厂煤燃烧的热利用率为20%,秸秆电厂每年可节约多少吨标准煤?(标准煤的热值按3.6*10^7J/k*g计算)" res = dc(sentence, "") print(res)