I am trying to train a BERT model, but I haven't figured out how TensorFlow is structured yet. On the line x = self.bert_module(book) the following error occurs:
Exception encountered when calling layer 'Embedding-Token' (type TokenEmbedding).
'torch.dtype' object has no attribute 'base_dtype'
Call arguments received by layer 'Embedding-Token' (type TokenEmbedding):
• inputs=tensor([ 101, 10531, 18301, 10124, 10127, 38036, 10251, 102, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
The dictionary form is something I arrived at myself. Originally the call was x = self.bert_module(ids, mask, token_type_ids), but passing a dictionary keyed by the model's input layer names seemed more correct to me.
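If the problem is that the Keras layers cannot consume torch tensors, I suppose the inputs would first have to be converted to NumPy, something like this untested sketch:

# untested sketch: convert the torch tensors to NumPy before calling the Keras model
book = {
    "Input-Token": ids.cpu().numpy(),
    "Input-Masked": mask.cpu().numpy(),
    "Input-Segment": token_type_ids.cpu().numpy(),
}
x = self.bert_module(book)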
The full code is attached below.
import sys
sys.path.append(r'C:\Users\Demiurg\AppData\Local\Programs\Python\Python38\Lib\site-packages')
import pandas as pd
import time
import torch.nn as nn
import torch
import logging
import numpy as np
import argparse
from keras_bert import load_trained_model_from_checkpoint
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
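# NOTE: load_trained_model_from_checkpoint returns a TensorFlow/Keras model,
# while BertTokenizer and the rest of the training code are PyTorch-side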
logging.basicConfig(format='%(asctime)s [%(levelname)s]: %(message)s', level=logging.DEBUG)
logging.getLogger().setLevel(logging.INFO)
# --- CONSTANTS ---
BERT_MODEL_NAME = 'small_bert/bert_en_uncased_L-2_H-128_A-2'
if torch.cuda.is_available():
logging.info(f"GPU: {torch.cuda.get_device_name(0)} is available.")
DEVICE = torch.device('cuda')
else:
logging.info("No GPU available. Training will run on CPU.")
DEVICE = torch.device('cpu')
def my_collate_fn(data):
    # zip(data) wraps each (x, y) sample in a 1-tuple, so the batch comes back
    # as a list of 1-tuples; the training loop below unpacks it via sample[...][0]
    return list(zip(data))
# --- Data preparation and tokenization ---
class BertDataset(Dataset):
def __init__(self, df, tokenizer, max_length=100):
super(BertDataset, self).__init__()
self.batch_size = 1
        df.columns = ['bodyText', 'Target']  # assign column names
self.df=df
self.tokenizer=tokenizer
self.target=self.df['Target']
self.max_length=max_length
def __len__(self):
return len(self.df)
def __getitem__(self, idx):
X = self.df['bodyText'].values[idx]
y = self.target.values[idx]
inputs = self.tokenizer.encode_plus(
X,
            padding='max_length',  # pad_to_max_length=True is deprecated in recent transformers
            truncation=True,
add_special_tokens=True,
return_attention_mask=True,
max_length=self.max_length,
)
ids = inputs["input_ids"]
token_type_ids = inputs["token_type_ids"]
mask = inputs["attention_mask"]
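        # NOTE: these torch tensors are what eventually reach the Keras
        # 'Embedding-Token' layer shown in the traceback above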
x = {
'ids': torch.tensor(ids, dtype=torch.long).to(DEVICE),
'mask': torch.tensor(mask, dtype=torch.long).to(DEVICE),
'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long).to(DEVICE)
}
y = self.tokenizer(y)["input_ids"]
y = torch.tensor(y, dtype=torch.long).to(DEVICE)
return x, y
# --- Model definition ---
class SentimentBERT(nn.Module):
def __init__(self, bert_model):
super().__init__()
self.bert_module = bert_model
self.dropout = nn.Dropout(0.1)
self.final = nn.Linear(in_features=128, out_features=3, bias=True)
def forward(self, inputs):
ids, mask, token_type_ids = inputs['ids'], inputs['mask'], inputs['token_type_ids']
book = {"Input-Token" : ids, "Input-Masked" : mask, "Input-Segment" : token_type_ids}
x = self.bert_module(book)
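        # NOTE: 'pooler_output' follows the transformers BertModel output API;
        # see the pure-PyTorch sketch after the script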
x = self.dropout(x['pooler_output'])
out = self.final(x)
return out
# --- Training loop ---
def train(epoch, model, dataloader, loss_fn, optimizer, max_steps=None):
model.train()
total_acc, total_count = 0, 0
log_interval = 50
start_time = time.time()
for batch_idx, sample in enumerate(dataloader):
inputs = sample[batch_idx][0][0]
label = sample[batch_idx][0][1]
optimizer.zero_grad()
predicted_label = model(inputs)
loss = loss_fn(predicted_label, label)
loss.backward()
optimizer.step()
total_acc += (predicted_label.argmax(1) == label).sum().item()
total_count += label.size(0)
        if batch_idx % log_interval == 0:
elapsed = time.time() - start_time
print(
"Epoch {:3d} | {:5d}/{:5d} batches "
"| accuracy {:8.3f} | loss {:8.3f} ({:.3f}s)".format(
                epoch, batch_idx, len(dataloader), total_acc / total_count, loss.item(), elapsed
)
)
total_acc, total_count = 0, 0
start_time = time.time()
if max_steps is not None:
            if batch_idx == max_steps:
return {'loss': loss.item(), 'acc': total_acc / total_count}
return {'loss': loss.item(), 'acc': total_acc / total_count}
# --- Model evaluation loop ---
def evaluate(model, dataloader, loss_fn):
model.eval()
total_acc, total_count = 0, 0
with torch.no_grad():
for idx, (inputs, label) in enumerate(dataloader):
predicted_label = model(inputs)
loss = loss_fn(predicted_label, label)
total_acc += (predicted_label.argmax(1) == label).sum().item()
total_count += label.size(0)
return {'loss': loss.item(), 'acc': total_acc / total_count}
# --- Main function ---
def train_and_evaluate(**params):
logging.info("running with the following params :")
logging.info(params)
    # Load the pretrained BERT tokenizer and model
    # change the paths to the ones you use
folder = 'multi_cased_L-12_H-768_A-12'
config_path = folder+'/bert_config.json'
checkpoint_path = folder+'/bert_model.ckpt'
vocab_path = folder+'/vocab.txt'
model = load_trained_model_from_checkpoint(config_path, checkpoint_path, training=True)
tokenizer = BertTokenizer.from_pretrained('multi_cased_L-12_H-768_A-12/vocab.txt')
    # Training parameters
epochs = int(params.get('epochs'))
batch_size = int(params.get('batch_size'))
learning_rate = float(params.get('learning_rate'))
    # Load the data
df_train = pd.read_csv(params.get('training_file'))
df_eval = pd.read_csv(params.get('validation_file'))
df_test = pd.read_csv(params.get('testing_file'))
    # Create the data loaders
train_ds = BertDataset(df_train, tokenizer, max_length=100)
train_loader = DataLoader(dataset=train_ds,batch_size=batch_size, shuffle=True, collate_fn=my_collate_fn)
eval_ds = BertDataset(df_eval, tokenizer, max_length=100)
eval_loader = DataLoader(dataset=eval_ds,batch_size=batch_size, collate_fn=my_collate_fn)
test_ds = BertDataset(df_test, tokenizer, max_length=100)
test_loader = DataLoader(dataset=test_ds,batch_size=batch_size, collate_fn=my_collate_fn)
    # Create the model
classifier = SentimentBERT(bert_model=model).to(DEVICE)
total_parameters = sum([np.prod(p.size()) for p in classifier.parameters()])
model_parameters = filter(lambda p: p.requires_grad, classifier.parameters())
    # use a new name here: reassigning `params` would shadow the function's
    # keyword arguments, which are still needed below (params.get('job_dir'))
    trainable_params = sum([np.prod(p.size()) for p in model_parameters])
    logging.info(f"Total params : {total_parameters} - Trainable : {trainable_params} ({trainable_params/total_parameters*100:.1f}% of total)")
    # Optimizer and loss function
optimizer = torch.optim.Adam([p for p in classifier.parameters() if p.requires_grad], learning_rate)
loss_fn = nn.CrossEntropyLoss()
    # On a dry run, do only the following
logging.info(f'Training model with {BERT_MODEL_NAME}')
if args.dry_run:
logging.info("Dry run mode")
epochs = 1
steps_per_epoch = 1
else:
steps_per_epoch = None
    # Go!
for epoch in range(1, epochs + 1):
epoch_start_time = time.time()
train_metrics = train(epoch, classifier, train_loader, loss_fn=loss_fn, optimizer=optimizer, max_steps=steps_per_epoch)
eval_metrics = evaluate(classifier, eval_loader, loss_fn=loss_fn)
print("-" * 59)
print(
"End of epoch {:3d} - time: {:5.2f}s - loss: {:.4f} - accuracy: {:.4f} - valid_loss: {:.4f} - valid accuracy {:.4f} ".format(
epoch, time.time() - epoch_start_time, train_metrics['loss'], train_metrics['acc'], eval_metrics['loss'], eval_metrics['acc']
)
)
print("-" * 59)
if args.dry_run:
        # Do not run the test evaluation on a dry run
return None
test_metrics = evaluate(classifier, test_loader, loss_fn=loss_fn)
metrics = {
'train': train_metrics,
'val': eval_metrics,
'test': test_metrics,
}
logging.info(metrics)
    # Save the model and architecture in a single file
if params.get('job_dir') is None:
logging.warning("No job dir provided, model will not be saved")
else:
logging.info("Saving model to {} ".format(params.get('job_dir')))
torch.save(classifier.state_dict(), params.get('job_dir'))
logging.info("Bye bye")
if __name__ == '__main__':
    # Define the arguments
parser = argparse.ArgumentParser()
parser.add_argument('--training-file', type=str, default='Source/LTMV3/training-file.csv')
parser.add_argument('--validation-file', type=str, default='Source/LTMV2/validation-file.csv')
parser.add_argument('--testing-file', type=str, default='Source/LTMV2/testing-file.csv')
parser.add_argument('--job-dir', type=str, default='LTMV2/')
    parser.add_argument('--epochs', type=int, default=2)
    parser.add_argument('--batch-size', type=int, default=1024)
parser.add_argument('--learning-rate', type=float, default=0.01)
parser.add_argument('--dry-run', action="store_true", default=False)
    # Parse the arguments
args, _ = parser.parse_known_args()
    # Run training
train_and_evaluate(**vars(args))
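For comparison, a pure-PyTorch version of the module using the already-imported BertModel would, as far as I understand, look roughly like the sketch below (my assumption: 'bert-base-multilingual-cased', whose pooled output is 768-dimensional, so in_features changes from 128 to 768; untested):

class SentimentBERTTorch(nn.Module):
    def __init__(self):
        super().__init__()
        # transformers' PyTorch BERT instead of the keras_bert checkpoint
        self.bert_module = BertModel.from_pretrained('bert-base-multilingual-cased')
        self.dropout = nn.Dropout(0.1)
        self.final = nn.Linear(in_features=768, out_features=3, bias=True)

    def forward(self, inputs):
        out = self.bert_module(
            input_ids=inputs['ids'],
            attention_mask=inputs['mask'],
            token_type_ids=inputs['token_type_ids'],
        )
        # pooler_output is the pooled [CLS] representation
        x = self.dropout(out.pooler_output)
        return self.final(x)

Would that be the more idiomatic way to go, or can the keras_bert model be fed torch tensors somehow?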