import sys import torch from torch.utils.data import TensorDataset, DataLoader from torch.utils.data import DataLoader, Dataset from concurrent.futures import ThreadPoolExecutor from transformers import BertTokenizer sys.path.append("/home/cv/workspace/tujintao/document_segmentation") from Utils.read_data import read_data, split_dataset tokenizer = BertTokenizer.from_pretrained('Models/bert-base-chinese', use_fast=True, do_lower_case=True) class DataSetForSingleDocumet(Dataset): def __init__(self, datasets, llabels, max_workers=10, max_seq_len=512, max_sentences_num=128): """ :param bert_tokenizer: 分词器 :param max_workers: 包含列名comment和sentiment的data frame """ self.pool = ThreadPoolExecutor(max_workers=max_workers) self.documents = datasets self.llabels = llabels self.max_seq_len = max_seq_len # self.max_sentences_num = max_sentences_num def __len__(self): return len(self.documents) def __getitem__(self, index): """ 一个dataset包含多个样本,一个样本包含多个句子,每个句子编码需要考虑上下句; 若每个句子单独编码,可以考虑bert后接个双向的lstm dataset:[[sent1, sent2,...],....] """ sentences = self.documents[index] labels = self.llabels[index] inputs = tokenizer(sentences, padding='max_length', truncation=True, max_length=self.max_seq_len, return_tensors='pt') t_seqs = torch.tensor(inputs['input_ids'].squeeze(0), dtype=torch.long) t_seq_masks = torch.tensor(inputs['attention_mask'].squeeze(0), dtype=torch.long) t_labels = torch.tensor(labels, dtype=torch.long) return t_seqs, t_seq_masks, t_labels def load_data(filepath, batch_size=1): """ 加载excel文件,有train和test 的sheet :param filepath: 文件路径 :param pretrained_model_name_or_path: 使用什么样的bert模型 :param max_seq_len: bert最大尺寸,不能超过512 :param batch_size: 小批量训练的数据 :return: 返回训练和测试数据迭代器 DataLoader形式, 一条数据代表一个文件!!!,由于显存限制,一般一次性取一个文件,但这个文件的句子数目不定 ===>还是占显存太大 需要切割文档,对文档句子作限制!!!! """ # --------txt文件数据读取------------- all_documents, all_labels = read_data(filepath) train_data, valid_data, test_data = split_dataset(all_documents, all_labels) train_doc, train_seg_labels = train_data valid_doc, valid_seg_labels = valid_data test_doc, test_seg_labels = test_data print("Train size:", len(train_doc)) print("Valid size:", len(valid_doc)) print("Test size:", len(test_doc)) # ---------------统计句长----------------- # max_tl = 0 # max_token = [] # for sentences in all_documents: # for one_sent in sentences: # token_id = tokenizer.encode(one_sent) # if len(token_id) > max_tl: # max_tl = len(token_id) # max_token = token_id # print("最大句token长:", max_tl) # 473 # print("最大token:", tokenizer.decode(max_token)) # ----------------------------------------- train_dataset = DataSetForSingleDocumet(train_doc, train_seg_labels, max_seq_len=96) valid_dataset = DataSetForSingleDocumet(valid_doc, valid_seg_labels, max_seq_len=96) test_dataset = DataSetForSingleDocumet(test_doc, test_seg_labels, max_seq_len=96) train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False) valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size) test_dataloader = DataLoader(test_dataset, batch_size=batch_size) # for batch in tqdm(test_dataloader): # 测试用 # # batch_labels的shape:[1,74],这里的1是因为batch_size为1 # print(batch, batch[0].size()) return train_dataloader, valid_dataloader, test_dataloader # from Utils.configs import data_dir # from tqdm import tqdm # load_data(data_dir) sentences = ["1963年出生,工科学士,高级工程师,北京物资学院客座副教授。", "1985年8月—1993年在国家物资局、物资部、国内贸易部金属材料流通司从事国家统配钢材中特种钢材品种的全国调拔分配工作,先后任科员、副主任科员、主任科员。", "1993年5月—1999年5月受国内贸易部委派到国内贸易部、冶金部、天津市政府共同领导组建的北洋(天津)钢材批发交易市场任理事长助理、副总裁。", ] inputs = tokenizer(sentences, padding='max_length', truncation=True, max_length=96, return_tensors='pt') print(inputs)