Reputation: 11
I have been trying to train a torch.nn.TransformerEncoderLayer using the standard PyTorch Lightning Trainer class. Before the first epoch even starts, I get the following error:

MisconfigurationException: The closure hasn't been executed. HINT: did you call optimizer_closure() in your optimizer_step hook? It could also happen because the optimizer.step(optimizer_closure) call did not execute it internally.

I have defined the configure_optimizers() method in my LightningModule, and it works for every other model I have tried (e.g. LSTM, GRU, MultiHeadAttention). As soon as I swap in the TransformerEncoder, the error above appears.
Here is the model code I am using:
from math import sin, cos, sqrt

import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch.utils.data import DataLoader

# CRF, Ranger, get_cosine_schedule_with_warmup, f1score and the upper-case
# constants (TAG2IDX, LEARNING_RATE, WEIGHT_DECAY, BATCH_SIZE, N_JOBS, EPOCHS)
# are imported/defined elsewhere in the project.

class PositionalEncoder(nn.Module):
    def __init__(self, d_model=512, max_seq_len=512):
        super().__init__()
        self.d_model = d_model
        # Precompute the sinusoidal positional encoding table once.
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = sin(pos / (10000 ** ((2 * i) / d_model)))
                pe[pos, i + 1] = cos(pos / (10000 ** ((2 * (i + 1)) / d_model)))
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x *= sqrt(self.d_model)
        x += self.pe[:, :x.size(1)]
        return x
class TRANSFORMER(pl.LightningModule):
    def __init__(self,
                 input_dim,
                 d_model=512,
                 nhead=8,
                 num_layers=6,
                 dropout=0.5,
                 use_scheduler=True,
                 num_tags=len(TAG2IDX),
                 total_steps=1024,
                 train_dataset=None,
                 val_dataset=None,
                 test_dataset=None):
        super().__init__()
        self.crf = CRF(num_tags=num_tags, batch_first=True)
        self.fc = nn.Linear(d_model, num_tags)
        self.use_scheduler = use_scheduler
        self.embedding = nn.Embedding(num_embeddings=input_dim,
                                      embedding_dim=d_model,
                                      padding_idx=0)
        self.pos_encoder = PositionalEncoder(d_model=d_model)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model,
                                                        nhead=nhead,
                                                        dropout=dropout,
                                                        activation="gelu",
                                                        batch_first=True)
        self.encoder = nn.TransformerEncoder(encoder_layer=self.encoder_layer,
                                             num_layers=num_layers)
        ## Hyperparameters ##
        self.learning_rate = LEARNING_RATE
        self.weight_decay = WEIGHT_DECAY
        self.total_steps = total_steps
        self.batch_size = BATCH_SIZE
        ## Datasets ##
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.test_dataset = test_dataset
        ## steps ##
        if self.use_scheduler:
            self.total_steps = len(train_dataset) // self.batch_size
    # Create the dataloaders; shuffle only the train_dataloader.
    # num_workers is set appropriately and drop_last is False.
    def train_dataloader(self):
        return DataLoader(self.train_dataset,
                          batch_size=self.batch_size,
                          num_workers=N_JOBS,
                          shuffle=True,
                          drop_last=False)

    def val_dataloader(self):
        return DataLoader(self.val_dataset,
                          batch_size=self.batch_size,
                          num_workers=N_JOBS,
                          shuffle=False,
                          drop_last=False)

    def test_dataloader(self):
        return DataLoader(self.test_dataset,
                          batch_size=self.batch_size,
                          num_workers=N_JOBS,
                          shuffle=False,
                          drop_last=False)
    def forward(self, input_ids, masks):
        out = self.embedding(input_ids)
        out = self.pos_encoder(out)
        # ~masks: src_key_padding_mask expects True at padding positions
        out = self.encoder(out, src_key_padding_mask=~masks)
        out = self.fc(out)
        return out

    def _shared_evaluation_step(self, batch, batch_idx):
        ids, masks, lbls = batch
        emissions = self(ids, masks)
        loss = -self.crf(emissions, lbls, mask=masks)
        pred = self.crf.decode(emissions, mask=masks)
        r, p, f1 = f1score(lbls, pred)
        return loss, r, p, f1
    def training_step(self, batch, batch_idx):
        loss, r, p, f1 = self._shared_evaluation_step(batch, batch_idx)
        self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("train_recall", r, on_step=False, on_epoch=True, prog_bar=True)
        self.log("train_precision", p, on_step=False, on_epoch=True, prog_bar=True)
        self.log("train_f1score", f1, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, r, p, f1 = self._shared_evaluation_step(batch, batch_idx)
        self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("val_recall", r, on_step=False, on_epoch=True, prog_bar=True)
        self.log("val_precision", p, on_step=False, on_epoch=True, prog_bar=True)
        self.log("val_f1score", f1, on_step=False, on_epoch=True, prog_bar=True)

    def test_step(self, batch, batch_idx):
        loss, r, p, f1 = self._shared_evaluation_step(batch, batch_idx)
        self.log("test_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("test_recall", r, on_step=False, on_epoch=True, prog_bar=True)
        self.log("test_precision", p, on_step=False, on_epoch=True, prog_bar=True)
        self.log("test_f1score", f1, on_step=False, on_epoch=True, prog_bar=True)

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        ids, masks, _ = batch
        return self.crf.decode(self(ids, masks), mask=masks)
    def configure_optimizers(self):
        optimizer = Ranger(self.parameters(),
                           lr=self.learning_rate,
                           weight_decay=self.weight_decay)
        if self.use_scheduler:
            scheduler = get_cosine_schedule_with_warmup(optimizer=optimizer,
                                                        num_warmup_steps=1,
                                                        num_training_steps=self.total_steps)
            lr_scheduler = {
                'scheduler': scheduler,
                'interval': 'epoch',
                'frequency': 1
            }
            return [optimizer], [lr_scheduler]
        else:
            return [optimizer]
And here is how I am using the Trainer class:
trainer = pl.Trainer(accelerator="gpu",
                     max_epochs=EPOCHS,
                     precision=32,
                     log_every_n_steps=1,
                     callbacks=[earlystopping_callback,
                                checkpoint_callback])
Upvotes: 1
Views: 3657
Reputation: 886
You are right. This happens because your custom optimizer (Ranger) does not execute the closure that Lightning passes into its .step() method. Lightning relies on this contract, because it calls the step method like this:

optimizer.step(training_step_closure)

where training_step_closure essentially consists of executing LightningModule.training_step (plus the backward pass). It looks like Ranger does not follow the standard contract of calling the closure inside its step().
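For reference, here is a minimal sketch (not the actual Ranger or Lightning code) of what a closure-compatible optimizer's step() looks like; the key detail is that the closure it receives actually gets executed:

import torch

class ClosureAwareSGD(torch.optim.Optimizer):
    """Toy optimizer illustrating the closure contract Lightning expects."""

    def __init__(self, params, lr=1e-3):
        super().__init__(params, defaults=dict(lr=lr))

    @torch.no_grad()
    def step(self, closure=None):
        loss = None
        if closure is not None:
            # Lightning packs training_step + backward into this closure,
            # so a compliant optimizer must call it before updating weights.
            with torch.enable_grad():
                loss = closure()
        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is not None:
                    p.add_(p.grad, alpha=-group["lr"])
        return loss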
To overcome this issue, I recommend switching to manual optimization: set self.automatic_optimization = False in your LightningModule's __init__ and perform the optimization steps yourself in training_step, like so:
def training_step(self, batch, batch_idx):
    loss, r, p, f1 = self._shared_evaluation_step(batch, batch_idx)
    # Insert these lines:
    self.manual_backward(loss)
    optimizer = self.optimizers()
    scheduler = self.lr_schedulers()
    optimizer.step()
    optimizer.zero_grad()
    scheduler.step()
    ...
    return loss
No other changes should be necessary.
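For completeness, a minimal sketch of where that flag goes (assuming the imports and arguments from the question; everything after the flag stays as in the original __init__):

class TRANSFORMER(pl.LightningModule):
    def __init__(self, input_dim, d_model=512, **kwargs):
        super().__init__()
        # Disable automatic optimization so Lightning no longer calls
        # optimizer.step(closure); training_step above now drives Ranger itself.
        self.automatic_optimization = False
        # ... rest of the original __init__ unchanged ...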
Upvotes: 2