Reputation: 333
I'm training an NLP model at work (e-commerce SEO) using a BERT variant for the Portuguese language (BERTimbau) through Hugging Face Transformers.
I didn't use the Trainer from the Transformers API. I set everything up in PyTorch, using torch.utils.data.DataLoader and AdamW, and trained my model with a loop based on run_glue.py.
I'm training on a GCP VM through JupyterLab. I know that I can use Weights & Biases with both PyTorch and Transformers, but I don't know exactly how to set it up with my run_glue.py-based loop. It's my first time using Weights & Biases.
After preprocessing and splitting into train and test sets with scikit-learn, my code is as follows:
from transformers import BertTokenizer
import torch
#import torchvision
from torch.utils.data import Dataset, TensorDataset
import collections.abc as container_abcs
# To feed our text to BERT, it must be split into tokens, and then these tokens must be mapped to their index in the tokenizer vocabulary.
# Constructs a BERT tokenizer. Based on WordPiece.
# The tokenization must be performed by the tokenizer included with BERT
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-large-portuguese-cased',
                                          do_lower_case=True)
# Tokenize all of the sentences and map the tokens to their word IDs, converting all the titles from text into encoded form.
# We will use padding and truncation because the training routine expects all tensors within a batch to have the same dimensions.
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].text.values,
    add_special_tokens=True,     # Add the '[CLS]' and '[SEP]' special tokens this model expects
    return_attention_mask=True,  # Return the attention masks for the padded sequences
    pad_to_max_length=True,      # Pad & truncate all titles to max_length
                                 # (padding='max_length', truncation=True in newer transformers versions)
    max_length=128,
    return_tensors='pt'          # Return PyTorch tensors
)
encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=128,
    return_tensors='pt'
)
# Split the encodings into input_ids, attention_masks and labels,
# converting the input data to tensors that can be fed to the model.
input_ids_train = encoded_data_train['input_ids']            # The encoded sentences.
attention_masks_train = encoded_data_train['attention_mask'] # And their attention masks (which simply differentiate padding from non-padding).
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)
# Create training data and validation data
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    "neuralmind/bert-large-portuguese-cased",  # Select your pretrained model
    num_labels=len(label_dict),                # Number of labels to predict
    output_attentions=False,                   # We don't need the attention weights
    output_hidden_states=False)                # We don't need all the hidden states
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32  # Set your batch size according to your GPU memory; if it is too high you will run out of GPU memory

dataloader_train = DataLoader(dataset_train,
                              sampler=RandomSampler(dataset_train),  # Sample training batches randomly
                              batch_size=batch_size)
                              # ,num_workers=4    # Number of CPU cores for data loading
                              # ,pin_memory=True  # Speed up host-to-GPU transfers

dataloader_validation = DataLoader(dataset_val,
                                   sampler=SequentialSampler(dataset_val),  # Order doesn't matter for validation, and SequentialSampler is cheaper
                                   batch_size=batch_size)
                                   # ,num_workers=4
                                   # ,pin_memory=True
from transformers import AdamW, get_linear_schedule_with_warmup

# Hyperparameters
# To construct an optimizer, we have to give it an iterable containing the parameters to optimize.
# Then we can specify optimizer-specific options such as the learning rate, epsilon, etc.
optimizer = AdamW(model.parameters(),  # This AdamW is the huggingface implementation (as opposed to pytorch's)
                  lr=2e-5,             # args.learning_rate - default is 5e-5
                  eps=1e-8)            # args.adam_epsilon - default is 1e-8
# Number of training epochs. The BERT authors recommend between 2 and 4.
epochs = 2
# Create the learning rate scheduler that decreases linearly from the initial learning rate set in the optimizer to 0,
# after a warmup period during which it increases linearly from 0 to the initial learning rate set in the optimizer.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,  # Default value in run_glue.py
                                            num_training_steps=len(dataloader_train)*epochs)
# Total number of training steps is [number of batches] x [number of epochs]
# (note that this is not the same as the number of training samples).
import numpy as np
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')
And here is my run_glue.py-based training code:
import random
from tqdm import tqdm
import torch
import numpy as np
# from tqdm.notebook import trange, tqdm
'''
This training code is based on the 'run_glue.py' script here:
https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
'''
# Just before the actual usage, select your hardware
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)  # Send your model to your hardware
# Set the seed value all over the place to make this reproducible.
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
# We'll store a number of quantities such as training loss, validation loss, and validation metrics.
def evaluate(dataloader_val):
    '''
    Put the model in evaluation mode--the dropout layers behave differently
    during evaluation.
    '''
    model.eval()

    loss_val_total = 0  # Tracking variables
    predictions, true_vals = [], []

    for batch in dataloader_val:
        '''
        Unpack this validation batch from our dataloader.
        As we unpack the batch, we'll also copy each tensor to the GPU using
        the `to` method.
        `batch` contains three pytorch tensors:
          [0]: input ids
          [1]: attention masks
          [2]: labels
        '''
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                  }

        '''
        Tell pytorch not to bother constructing the compute graph during
        the forward pass, since it is only needed for backprop (training).
        '''
        with torch.no_grad():
            outputs = model(**inputs)

        '''
        Perform a forward pass (evaluate the model on this validation batch).
        The documentation for this `model` function is here:
        https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        It returns different numbers of values depending on which arguments
        are given and which flags are set. For our usage here, it returns
        the loss (because we provided `labels`) and the "logits"--the model
        outputs prior to activation.
        '''
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()  # Accumulate the validation loss.

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total/len(dataloader_val)  # Average loss over all of the batches.
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals
# ========================================
# Training
# ========================================
# For each epoch...
for epoch in tqdm(range(1, epochs+1)):
    '''
    Put the model into training mode. Don't be misled--the call to
    `train` just changes the *mode*, it doesn't *perform* the training.
    `dropout` and `batchnorm` layers behave differently during training
    vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    '''
    model.train()  # Put the model into training mode.

    loss_train_total = 0  # Reset the total loss for this epoch.

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:
        '''
        Always clear any previously calculated gradients before performing a
        backward pass. PyTorch doesn't do this automatically because
        accumulating the gradients is "convenient while training RNNs".
        (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        '''
        model.zero_grad()

        '''
        Unpack this training batch from our dataloader.
        As we unpack the batch, we'll also copy each tensor to the GPU using
        the `to` method.
        `batch` contains three pytorch tensors:
          [0]: input ids
          [1]: attention masks
          [2]: labels
        '''
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                  }

        outputs = model(**inputs)

        loss = outputs[0]  # The call to `model` returns a tuple, so we pull the loss value out of it.
        loss_train_total += loss.item()  # Accumulate the training loss over all of the batches so that
                                         # we can calculate the average loss at the end. `loss` is a
                                         # Tensor containing a single value; `.item()` just returns the
                                         # Python value from the tensor.

        loss.backward()  # Perform a backward pass to calculate the gradients.

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Clip the norm of the gradients to 1.0.
                                                                 # This helps prevent the "exploding gradients" problem.

        optimizer.step()  # Update parameters and take a step using the computed gradients.
                          # The optimizer dictates the "update rule"--how the parameters are
                          # modified based on their gradients, the learning rate, etc.

        scheduler.step()  # Update the learning rate.

        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})  # Show the current batch loss.

    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')  # Save a checkpoint for this epoch.

    tqdm.write(f'\nEpoch {epoch}')  # Show the running epoch.

    loss_train_avg = loss_train_total/len(dataloader_train)  # Average loss over all of the batches.
    tqdm.write(f'Training loss: {loss_train_avg}')

    # ========================================
    # Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set and record all statistics from this epoch.
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
Upvotes: 0
Views: 1106
Reputation: 2035
Scott from W&B here. Although you're not using the HuggingFace WandbCallback, you can still take advantage of wandb easily using our Python API. All you need to do is call wandb.init once before training, and then call wandb.log({'val_loss': val_loss, 'train_loss': loss_train_avg}) with whatever you want to log. Here's an example training loop:
import wandb

wandb.init(
    # Set entity to specify your username or team name
    # ex: entity="carey",
    # Set the project where this run will be logged
    project="huggingface",
    # Track hyperparameters and run metadata
    config={
        "learning_rate": 0.02,
        "architecture": "BERT",
        "dataset": "my-dataset",
    })

# This simple block simulates a training loop logging metrics
for x in range(50):
    loss, acc = 1.0/(x + 1), x/50  # Simulated values -- calculate these yourself
    # Log metrics from your script to W&B
    wandb.log({"acc": acc, "loss": loss})

# Mark the run as finished
wandb.finish()
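In your case, a minimal sketch (reusing the variable names, model, and dataloaders from your question) of how these calls could slot into your per-epoch loop:

import wandb

wandb.init(project="huggingface",
           config={"learning_rate": 2e-5, "epochs": epochs, "batch_size": batch_size})
wandb.watch(model, log="gradients")  # Optionally log gradients and parameters too

for epoch in range(1, epochs+1):
    # ... your existing inner training loop over dataloader_train goes here ...
    loss_train_avg = loss_train_total/len(dataloader_train)

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)

    # Log once per epoch; each wandb.log call advances the step on the x-axis
    wandb.log({"train_loss": loss_train_avg,
               "val_loss": val_loss,
               "val_f1": val_f1,
               "epoch": epoch})

wandb.finish()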
If you want to log Datasets or Models, you can do that using wandb Artifacts.
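For example, a minimal sketch of logging one of the checkpoints you save with torch.save as a model Artifact (the artifact name "bertimbau-finetuned" is just illustrative):

# Log a saved checkpoint as a W&B Artifact
artifact = wandb.Artifact("bertimbau-finetuned", type="model")
artifact.add_file(f"finetuned_BERT_epoch_{epoch}.model")
wandb.log_artifact(artifact)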
This Quickstart guide is a good place to start for more information: W&B Quickstart
Upvotes: 1