import sys
import numpy as np
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader, RandomSampler
from transformers import BertTokenizer, AdamW, get_linear_schedule_with_warmup
# from torch.nn import DataParallel
import matplotlib.pyplot as plt
from PointerNet.model import UIEModel
from PointerNet.config import NerArgs
from PointerNet.data_loader import NerDataset, NerCollate, load_and_split_dataset, get_paper_for_predict
from PointerNet.utils.decode import ner_decode, topic_ner_decode
from PointerNet.utils.metrics import calculate_metric, classification_report, get_p_r_f
from pprint import pprint
from PointerNet import config

logger = config.myLog(__name__, log_cate="train_log").getlog()
sigmoid = torch.nn.Sigmoid()


def _chunk_spans(n_items, chunk_size, min_tail=3):
    """Split ``range(n_items)`` into consecutive ``(left, right)`` slices.

    Every slice but the last holds exactly ``chunk_size`` items.  When the
    remainder after the last full chunk would contain fewer than ``min_tail``
    items, that last full chunk is merged with the remainder so no tiny
    trailing slice is produced.  Returns an empty list for ``n_items == 0``.
    """
    full_chunks = n_items // chunk_size
    if full_chunks > 0 and n_items - full_chunks * chunk_size < min_tail:
        # Fold the last full chunk into the tail instead of leaving a
        # remainder of fewer than ``min_tail`` items.
        full_chunks -= 1
    spans = [(i * chunk_size, (i + 1) * chunk_size) for i in range(full_chunks)]
    tail_start = full_chunks * chunk_size
    if n_items - tail_start > 0:
        spans.append((tail_start, n_items))
    return spans


class NerPipeline:
    """Train / evaluate / predict wrapper around a pointer-style NER model.

    ``args`` supplies every hyper-parameter and path read by the methods
    below (``device``, ``lr``, ``other_lr``, ``save_dir``, ``labels``, ...).
    """

    def __init__(self, model, args):
        """Store the model and config; load and split the data if configured.

        The train/valid/test splits are only loaded when ``args`` exposes a
        ``train_path`` attribute; otherwise they are left as ``None``.
        """
        self.model = model
        self.args = args
        self.train_data = None
        self.valid_data = None
        self.test_data = None
        if hasattr(self.args, "train_path"):
            self.train_data, self.valid_data, self.test_data = load_and_split_dataset(
                self.args.train_path, train_ratio=0.995, valid_ratio=0.003)

    def save_model(self):
        """Save model and optimizer state dicts to the paths given in ``args``.

        Requires ``self.optimizer`` to exist, i.e. ``train()`` must have
        started.
        """
        torch.save(self.model.state_dict(), self.args.save_dir)
        torch.save(self.optimizer.state_dict(), self.args.optimizer_save_dir)

    def load_model(self):
        """Load model weights from ``args.save_dir`` and move the model to the device."""
        state_dict = torch.load(self.args.save_dir, map_location="cpu")
        self.model.load_state_dict(state_dict)
        self.model.to(self.args.device)

    def build_optimizer_and_scheduler(self, t_total):
        """Build AdamW with per-module learning rates and a linear warmup schedule.

        Parameters whose top-level module name contains ``"bert"`` use
        ``args.lr``; all others use ``args.other_lr``.  Bias and
        ``LayerNorm.weight`` parameters get no weight decay.

        Args:
            t_total: total number of scheduler steps the schedule is built for.

        Returns:
            ``(optimizer, scheduler)``.
        """
        # Unwrap DataParallel-style wrappers that expose ``.module``.
        module = self.model.module if hasattr(self.model, "module") else self.model

        no_decay = ["bias", "LayerNorm.weight"]

        def skips_decay(param_name):
            return any(nd in param_name for nd in no_decay)

        bert_params, other_params = [], []
        for name, para in module.named_parameters():
            top_level = name.split('.')[0]
            if "bert" in top_level:
                bert_params.append((name, para))
            else:
                other_params.append((name, para))

        # Discriminative learning rates: one (decay, no-decay) pair per group.
        optimizer_grouped_parameters = []
        for named_params, lr in ((bert_params, self.args.lr), (other_params, self.args.other_lr)):
            optimizer_grouped_parameters.append(
                {"params": [p for n, p in named_params if not skips_decay(n)],
                 "weight_decay": self.args.weight_decay, 'lr': lr})
            optimizer_grouped_parameters.append(
                {"params": [p for n, p in named_params if skips_decay(n)],
                 "weight_decay": 0.0, 'lr': lr})

        optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.lr, eps=self.args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(self.args.warmup_proportion * t_total),
            num_training_steps=t_total
        )
        return optimizer, scheduler

    def eval_forward(self, data_loader):
        """Run the model over ``data_loader`` and collect start/end logits.

        Returns two lists; each element is a one-item list wrapping the
        logits of one sample.

        NOTE(review): inputs are passed to the model as loaded, without being
        moved to ``args.device`` -- confirm the expected batch layout before
        re-enabling this path (its call sites are currently commented out).
        """
        s_logits, e_logits = [], []
        self.model.eval()
        with torch.no_grad():
            for batch_data in data_loader:
                # Use the pipeline's own model, not a module-level global.
                output = self.model(batch_data['input_ids'], batch_data['attention_mask'])
                ner_output = output["ner_output"]
                s_logits.extend([logit] for logit in ner_output["ner_start_logits"])
                e_logits.extend([logit] for logit in ner_output["ner_end_logits"])
        return s_logits, e_logits

    def half_batch_eval_forward(self, data_loader):
        """Evaluate documents chunk by chunk and collect per-document logits.

        Each batch entry is one document made of many sentences; documents
        longer than ``args.max_input_sent_num`` sentences are split into
        chunks that are forwarded separately.  No gradients are tracked here,
        so memory use is lower than during training.

        Returns:
            ``(s_logits, e_logits, con_logits)``: three lists with one
            concatenated tensor per document.
        """
        s_logits, e_logits, con_logits = [], [], []
        max_input_len = self.args.max_input_sent_num
        device = self.args.device
        self.model.eval()
        # Gradients are never needed here; skipping them keeps dev evaluation
        # inside train() from building autograd graphs.
        with torch.no_grad():
            for batch_data in data_loader:
                for batch_n in range(len(batch_data['ner_end_labels'])):
                    input_ids = batch_data['input_ids'][batch_n]
                    attention_mask = batch_data['attention_mask'][batch_n]
                    doc_s, doc_e, doc_con = [], [], []
                    for left, right in _chunk_spans(len(input_ids), max_input_len):
                        output = self.model(input_ids[left:right].to(device),
                                            attention_mask[left:right].to(device))
                        ner_output = output["ner_output"]
                        doc_s.extend(ner_output["ner_start_logits"])
                        doc_e.extend(ner_output["ner_end_logits"])
                        doc_con.extend(ner_output["ner_content_logits"])
                    # One entry per document so results line up with callbacks.
                    s_logits.append(torch.cat(doc_s))
                    e_logits.append(torch.cat(doc_e))
                    con_logits.append(torch.cat(doc_con))
        return s_logits, e_logits, con_logits

    def get_metric(self, s_logits, e_logits, con_logits, callback):
        """Decode predictions and accumulate per-label metric counts.

        Args:
            s_logits, e_logits, con_logits: per-document logits, as returned
                by ``half_batch_eval_forward``.
            callback: per-document ``(text_list, gt_entities)`` pairs.

        Returns:
            ``(role_metric, total_count)``: an array of shape
            ``(len(id2label), 3)`` summing what ``calculate_metric`` returns
            per label, and the number of gold entities per label.
        """
        total_count = [0 for _ in range(len(self.args.id2label))]
        role_metric = np.zeros([len(self.args.id2label), 3])
        for s_logit, e_logit, con_logit, tmp_callback in zip(s_logits, e_logits, con_logits, callback):
            text_list, gt_entities = tmp_callback
            decoded = topic_ner_decode(sigmoid(s_logit), sigmoid(e_logit), sigmoid(con_logit),
                                       text_list, self.args.id2label)
            # predict() unpacks the decoder result as (entities, topic_items);
            # accept that shape here as well as a bare entity mapping.
            pred_entities = decoded[0] if isinstance(decoded, tuple) else decoded
            for idx, _type in enumerate(self.args.labels):
                if _type not in pred_entities:
                    pred_entities[_type] = []
                total_count[idx] += len(gt_entities[_type])
                role_metric[idx] += calculate_metric(pred_entities[_type], gt_entities[_type])
        return role_metric, total_count

    def _train_on_span(self, batch_data, batch_n, left, right):
        """Run one optimisation step on sentences ``[left:right]`` of a document.

        Returns the loss of the step as a Python float.
        """
        device = self.args.device
        output = self.model(
            batch_data['input_ids'][batch_n][left:right].to(device),
            batch_data['attention_mask'][batch_n][left:right].to(device),
            batch_data['ner_start_labels'][batch_n][left:right].to(device),
            batch_data['ner_end_labels'][batch_n][left:right].to(device),
            batch_data['ner_content_labels'][batch_n][left:right].to(device),
        )
        loss = output["ner_output"]["ner_loss"]
        loss.backward()  # memory-hungry
        # Clip after backward so the freshly computed gradients are clipped.
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)
        self.optimizer.step()
        self.scheduler.step()
        self.model.zero_grad()
        return loss.item()

    def half_batch_train(self, batch_data, epoch, global_step, t_total):
        """Train on one batch, splitting long documents into chunks.

        Each batch entry is one document; documents with more than
        ``args.max_input_sent_num`` sentences are truncated into chunks and
        each chunk gets its own optimisation step.

        Args:
            batch_data: collated batch (dict of per-document sequences).
            epoch: current epoch, used for progress output only.
            global_step: running document counter, incremented per document.
            t_total: total planned steps, used for progress output only.

        Returns:
            ``(global_step, mean_loss)``; ``mean_loss`` is 0.0 when no chunk
            was trained (e.g. an empty document).
        """
        y_loss = []
        max_input_len = self.args.max_input_sent_num
        for batch_n in range(len(batch_data['ner_end_labels'])):
            global_step += 1
            n_sentences = len(batch_data['input_ids'][batch_n])
            for left, right in _chunk_spans(n_sentences, max_input_len):
                loss_value = self._train_on_span(batch_data, batch_n, left, right)
                y_loss.append(loss_value)
                print('【train】 Epoch: %d/%d Step: %d/%d loss: %.5f' % (
                    epoch, self.args.train_epoch, global_step, t_total, loss_value))
        mean_loss = sum(y_loss) / len(y_loss) if y_loss else 0.0
        return global_step, mean_loss

    def train(self, dev=True):
        """Train the model, optionally evaluating on the dev split.

        When ``dev`` is true the model is evaluated every ``args.eval_step``
        documents and saved whenever the micro F1 improves.  A loss curve is
        written to ``loss_curve_<epoch>_epoch.png`` after every epoch.
        """
        train_dataset, train_callback = NerDataset(data=self.train_data,
                                                   tokenizer=self.args.tokenizer,
                                                   max_len=self.args.max_seq_len,
                                                   label_list=self.args.labels)
        collate = NerCollate(max_len=self.args.max_seq_len, label2id=self.args.label2id)
        train_sampler = RandomSampler(train_dataset)
        train_loader = DataLoader(dataset=train_dataset,
                                  batch_size=self.args.train_batch_size,
                                  sampler=train_sampler,
                                  num_workers=10,  # load data in worker processes
                                  collate_fn=collate.collate_fn)
        dev_loader = None
        dev_callback = None
        if dev:
            dev_dataset, dev_callback = NerDataset(data=self.valid_data,
                                                   tokenizer=self.args.tokenizer,
                                                   max_len=self.args.max_seq_len,
                                                   label_list=self.args.labels)
            dev_loader = DataLoader(dataset=dev_dataset,
                                    batch_size=self.args.eval_batch_size,
                                    shuffle=False,
                                    num_workers=10,
                                    collate_fn=collate.collate_fn)

        # NOTE(review): the scheduler is stepped once per chunk in
        # half_batch_train, while t_total counts documents; for multi-chunk
        # documents the schedule finishes before training does -- confirm
        # this is intended.
        t_total = len(train_loader) * self.args.train_epoch
        self.optimizer, self.scheduler = self.build_optimizer_and_scheduler(t_total)

        global_step = 0
        self.model.zero_grad()
        self.model.to(self.args.device)
        # if torch.cuda.device_count() > 1:  # multi-GPU: inputs must be on the GPU before model()
        #     self.model = DataParallel(self.model, device_ids=[0,1])
        eval_step = self.args.eval_step
        best_f1 = 0.
        y_loss = []
        for epoch in range(1, self.args.train_epoch + 1):
            y_loss_bt = []  # mean losses gathered since the last evaluation
            for batch_data in tqdm(train_loader):
                self.model.train()
                global_step, mean_loss = self.half_batch_train(batch_data, epoch, global_step, t_total)
                y_loss_bt.append(mean_loss)
                if dev and global_step % eval_step == 0:
                    s_logits, e_logits, con_logits = self.half_batch_eval_forward(dev_loader)
                    role_metric, _ = self.get_metric(s_logits, e_logits, con_logits, dev_callback)
                    micro_counts = np.sum(role_metric, axis=0)
                    micro_metrics = get_p_r_f(micro_counts[0], micro_counts[1], micro_counts[2])
                    eval_msg = '【eval】 precision={:.4f} recall={:.4f} f1_score={:.4f}'.format(
                        micro_metrics[0], micro_metrics[1], micro_metrics[2])
                    print(eval_msg)
                    logger.info(eval_msg)

                    y_loss.append(sum(y_loss_bt) / len(y_loss_bt))
                    y_loss_bt = []

                    if micro_metrics[2] > best_f1:
                        best_f1 = micro_metrics[2]
                        best_msg = "best_f1:{}".format(micro_metrics[2])
                        print(best_msg)
                        logger.info(best_msg)
                        self.save_model()

            # Plot after every epoch (a single epoch already takes a long time).
            fig = plt.figure()
            plt.plot(list(range(len(y_loss))), y_loss)
            plt.savefig(f'loss_curve_{epoch}_epoch.png')  # save the figure
            plt.show()
            plt.close(fig)  # do not leak one open figure per epoch

    def test(self):
        """Load the saved model, evaluate on the test split and print a report."""
        test_dataset, test_callback = NerDataset(data=self.test_data,
                                                 tokenizer=self.args.tokenizer,
                                                 max_len=self.args.max_seq_len,
                                                 label_list=self.args.labels)
        collate = NerCollate(max_len=self.args.max_seq_len, label2id=self.args.label2id)
        test_loader = DataLoader(dataset=test_dataset,
                                 batch_size=self.args.eval_batch_size,
                                 shuffle=False,
                                 num_workers=2,
                                 collate_fn=collate.collate_fn)
        self.load_model()
        self.model.to(self.args.device)
        with torch.no_grad():
            s_logits, e_logits, con_logits = self.half_batch_eval_forward(test_loader)
            role_metric, total_count = self.get_metric(s_logits, e_logits, con_logits, test_callback)
            micro_counts = np.sum(role_metric, axis=0)
            micro_metrics = get_p_r_f(micro_counts[0], micro_counts[1], micro_counts[2])
            print('[eval] precision={:.4f} recall={:.4f} f1_score={:.4f}'.format(
                micro_metrics[0], micro_metrics[1], micro_metrics[2]))
            print(classification_report(role_metric, self.args.labels, self.args.id2label, total_count))

    def predict(self, **kwargs):
        """Predict entities for a list of sentences.

        Keyword Args:
            text_list: sentences to run the model on.
            labels: optional labels, only printed for reference.

        When called without keyword arguments, the sentences and labels come
        from ``get_paper_for_predict()``.

        Returns:
            ``(entities, split_topic_idx, topic_item_pred)`` where
            ``split_topic_idx`` holds one ``(start, end)`` pair per predicted
            ``'TOPIC'`` entity.

        The caller is responsible for loading weights (``load_model``)
        beforehand.
        """
        # Make sure dropout and similar layers are disabled for inference.
        self.model.eval()
        with torch.no_grad():
            print("\n********************************************\n")
            if kwargs:
                sentences = kwargs["text_list"]
                labels = kwargs["labels"] if "labels" in kwargs else []
            else:
                sentences, labels = get_paper_for_predict()
            print(sentences)
            print(labels)

            inputs = self.args.tokenizer(sentences, padding='max_length', truncation=True,
                                         max_length=self.args.max_seq_len, return_tensors='pt')
            token_ids = inputs['input_ids'].to(self.args.device)
            attention_mask = inputs['attention_mask'].to(self.args.device)

            output = self.model(token_ids, attention_mask)
            ner_output = output["ner_output"]
            start_logits = sigmoid(ner_output["ner_start_logits"][0])
            end_logits = sigmoid(ner_output["ner_end_logits"][0])
            con_logits = sigmoid(ner_output["ner_content_logits"][0])

            pred_entities, topic_item_pred = topic_ner_decode(start_logits, end_logits, con_logits,
                                                              sentences, self.args.id2label)
            # Each entity is indexed as (text-like item, ..., start); the span
            # end is the start plus the length of the first element.
            split_topic_idx = [(ent[-1], ent[-1] + len(ent[0])) for ent in pred_entities['TOPIC']]
            return dict(pred_entities), split_topic_idx, topic_item_pred


if __name__ == '__main__':
    args = NerArgs()
    model = UIEModel(args)
    ner_pipeline = NerPipeline(model, args)

    ner_pipeline.train(dev=True)
    # # batch test
    ner_pipeline.test()

    # single-sample prediction example
    # txts = [
    #     '七、解答(本题共8小题,每小题4分,共32分,在给出的四个选项中,',
    #     '每小题中只有一个选项符合题目要求。)',
    #     '11、如图所示,在$x>0$的空间中,存在沿$x$轴方向的匀强电场$E$;在$x<0$的空间中,存在沿$x$轴负方向的匀强电场,场强大小也为$E$。一电子$(-e,m)$在$x=d$处的$P$点以沿$y$轴正方向的初速度$v__0$',
    #     '开始运动,不计电子重力。求:(1)电子的$x$方向分运动的周期。(2)电子运动的轨迹与$y$轴的各个交点中,任意两个交点的距离。【图片】',
    #     '【答案】(1)$4sqrt(((2m*d)/(eE)))$;(2)$2nv__0sqrt(((2m*d)/(eE)))(n=1,2,3…)$。',
    #     '如图,一导热性能良好、内壁光滑的汽缸水平放置,横截面积$S=1.010^^-3m^2$、质量$m=2kg$、厚度不计的活',
    #     '塞与汽缸底部之',
    #     '间封闭了一部分理想气体,此时活塞与汽缸底部之间的距离$l=36cm$,在活塞的右侧距离其$d=14cm$处有一对与汽缸固定连接的卡环。气体的温度$t=27*℃$,外界大气压强$p__0=1.010^5Pa$。现将汽缸开口向上竖直放置($g$取$10m/s^2$)。 (1)求此时活塞与汽缸底部之间的距离$h$; (2)如果将缸内气体加热到$600K$,求此时气体的压强$p$。 【图片】',
    #     '【解析】(1)汽缸水平放置时:封闭气体的压强$p__1=p__0=1.010^5Pa$,温度$T__1=300K$,体积$V__1=lS$;汽缸竖直放置时:封闭气体的压强$p__2=p__0+((mg)/S)=1.210^5Pa$,温度$T__2=T__1=300K$,体',
    #     '积$V__2=hS$;由玻意耳定律$p__1V__1=p__2V__2$,$h=((p__1V__1)/(p__2S))$,解得$h=0.3m$ (2)温度升高,活塞刚达到卡环,气体做等压变化,此时$p__3=p__2$,$V__2=hS$,$V__3=(l+d)S$,$T__2=300K$,$((V__2)/(T__2))=((V__3)/(T__3))$,$T__3=500K$;汽缸内气体温度继续升高,气体做等容变化$p__3=1.210^5Pa$,$T__3=500K$,$T__4=600K$,$((p__3)/(T__3))=(p/(T__4))$,$p=1.4410^5Pa$ 答案:(1)$0.3m$; (2)$1.4410^5Pa$',
    #     '5、中国首艘航空母舰“辽宁”号正式交接入列和歼$-15$舰载战斗机成功起降,对于有效维护国家主权、促进世界和平与共同发展,具有重要意义.据了解,该航母长$304m$,宽$70.5m$,',
    #     '满载时吃水深度$11m$;若歼$-15$战斗机的质量为$3.310^4$kg,轮胎与甲板总接触面积$6000cm^2$,战斗机在约为$2.510^5N$推力作用下滑跑$20m$用时$5s$.求:$(g=10N/kg$,$ρ__海水=1.0310^3kg/m^3)$ (1)战斗机停在甲板上对甲板的压强; (2)推力做功的功率; (3)满载时海水对舰底的压强.',
    #     '答案:(1)战斗机停在甲板上对甲板',
    #     '的压强为$3.310^5N$',
    #     '; (2)推力做功的功率为$10^6W$; (3)满载时海水对舰底的压强$1.13310^5Pa$.',
    #     '解析:已知:$h=11m$,$m=3.310^4kg$,$S=6000cm^2=0.6m^2$,$F=2.510^5N$,$s=20m$,$t=5s$,$g=10N/kg$,$ρ__海水=1.0310^3kg/m^3$ 求:(1)战',
    #     '斗机停在甲板上对甲板的压强$p__1$; (2)推力做功的功率P; (3)满载时海水对舰底的压强$p__2$. 解:(1)战斗机对甲板的压力: $F__1=G=mg=3.310^4kg10N/kg=3.310^5N$, 对甲板的压强: $p__1=F__1S=3.310^5N0.6m^2=5.510^5Pa$; (2)推力做的功: $W=Fs=2.510^5N20m=510^6J$, 推力所做的功率: $P=Wt=510^6J5s=10^6W$; (3)满载时海水对舰底的压强: $p__2=ρ__海水gh=1.0310^3kg/m^310N/kg11m=1.13310^5Pa$. 答:(1)战斗机',
    #     '停在甲板上对甲板的压强为$3.310^5N$; (2)推力做功的功率为$10^6W$; (3)满载时海水对舰底的压强$1.13310^5Pa$.',
    #     '64、已知地球半径为R,地面的重力加度为g,将地球视为均第匀球体,求: (1)距地面$12R$高度处的力加速度 (2)地球的第一宇宙速度 (3)地球的自转周期为T,',
    #     '地球同步卫星离地面物的高度',
    #     '[答案](1)距地面$12R$高度处的重力加速度是$49g$ (2)地球的步第一宇宙速度是$sqrt(gR)$ (3)若地球的自转周期为T,地球同',
    #     '步卫离地面的高度是$root3*(gR^2T^24pi^2)-R$',
    #     '解析:(1)由于地球表面物体知随地球转而做圆运动的向心加速度体很小,地面上物体所受重力近似等于地球的万有引力,对地面上的',
    #     "物体有$GMmR^2=mg$,同理,对距地高度$12R$处的物体有$GMm*(RR2)^2=mg'$,解得$g'=49g$ (2)对地卫星环绕地心的匀速圆周运动有$GMmR^2=mv^2R$,解地球宇的第速一宇速度$v=sqrt(gR)$ $GMm*(Rh)^2=m*(Rh)*(2*πT)^2$,$GMmR^2=mg$,解得$h=root3*(gR^2T^24pi^2)-R$",
    #     '25、重力的方向总是垂直于接触面向下。( )',
    #     'A.正确',
    #     'B,错误',
    #     '在实验中,补偿阻力时要把物悬挂在细绳的一端。 A:正确',
    #     'B,错误',
    #     '[答案]T',
    #     '4、任何情况下,物体的加速度方向始终与它所受的合力方向一致。( ) A 正确 B.错误',
    #     '【答案】不对的',
    #     '10.如果物体处于超重状态,它必然有向上的加速度。( )',
    #     'A.正确',
    #     'B、错误',
    #     '7.通过打出的纸带可以确定物体运动的时间和物体运动的位移。( ) A,正确',
    #     'B.错误',
    #     '答案:错的',
    #     '六:多空(本题共11小题,总分48分。其中,1-7题为单选,每小题4分;8-11题为多选,每小题5分,全部选对的得5',
    #     '分,选对但不全的得2分,有选错得0分)',
    #     '【图片】用烧杯盛某种液体,测得液体体积V和液体与烧杯的共同质量m的关系如图所示.请观察图象并根据图象,求:烧杯的质量__________g;液体的',
    #     '密度$ρ__液=$________$kg/m^3$;其物理意义是_____.',
    #     '汤现姆孙在研究________的实验中发现了',
    #     '电子,电子的发现打破了___________的旧观。',
    #     '答案:阴极射;原子不可再分 【解析】汤姆孙在研究阴极射线的实验中发现了电子,电子的发现打破了子不可现再分的旧观念。',
    #     '1.我国载人登月已完成关键技术攻关,可送3人',
    #     '环月、2人登月。某实验小组在完成用单摆测量重力加速度实验后,讨论在月球上用单摆测量月球表面重力加速度的实验方案。用与地球上相同的实验设备,在月球上进行相同的实验,会得到更精确的结果,试写出其中的两个原因:__',
    #     '_____。',
    #     '【解析】1.周期变长,累积法测量周期的相对误差减小;2.月球上没有空气阻力,消除了空气阻尼引起的系统误差答案:周期变长,累积法测量周期的相对误差减小;月球上没有空气阻力,消除了空气阻尼引起的系统误差',
    #     '26.【图片】电阻A和B的电流与其两端电压的关系如图所示.由图可知,电阻A的阻',
    #     '值为________$Ω$;将A和B并联后接在$2.5V$的电源上,电路中的总电流为_A.',
    #     '10.教室中未搬入桌凳前说话常有嗡嗡的尾声,摆了桌凳坐满了学生后这种现象减轻到似乎听不到了,这是因为',
    #     '___________。',
    #     '答案:搬入桌凳和坐满学生后吸声面积增大,混响时间变短。'
    # ]
    # labels = [(2, 5), (5, 10), (10, 18), (18, 24), (24, 27), (27, 30), (30, 32), (32, 35), (35, 38), (40, 42), (42, 45), (45, 49), (49, 51), (51, 54)]
    # ner_pipeline.predict(text_list=txts, labels=labels)
    # ner_pipeline.predict()
    # print()