LazyCoder11011
LazyCoder11011

Reputation: 11

Pytorch-Lightning Misconfiguration Exception; The closure hasn't been executed

I have been trying to train a torch.nn.TransformerEncoderLayer using the standard Pytorch-Lightning Trainer class. Before the first epoch even starts, I face the following error:

MisconfigurationException: The closure hasn't been executed. HINT: did you call optimizer_closure() in your optimizer_step hook? It could also happen because the optimizer.step(optimizer_closure) call did not execute it internally.

I have defined the configure_optimizers() method properly in the LightningModule, and it works for every other model (say, LSTM, GRU, MultiHeadAttention). If I replace them with the TransformerEncoder, the aforementioned error pops up.


Here is the model code I am using:

class PositionalEncoder(nn.Module):
    """Scale embeddings by sqrt(d_model) and add sinusoidal position encodings.

    The table uses the layout from "Attention Is All You Need": for every even
    column ``i`` the entry is ``sin(pos / 10000 ** (i / d_model))`` and the
    following odd column holds the cosine of the *same* angle, so each
    sin/cos pair shares one frequency.

    Args:
        d_model: Size of the embedding dimension (odd values are supported).
        max_seq_len: Number of positions the precomputed table covers.
    """

    def __init__(self, d_model=512, max_seq_len=512):
        super().__init__()
        self.d_model = d_model

        # Build the angles in float64 (the precision of plain Python floats)
        # and let the assignment below cast them to the table's dtype.
        positions = torch.arange(max_seq_len, dtype=torch.float64).unsqueeze(1)
        even_dims = torch.arange(0, d_model, 2, dtype=torch.float64)
        angles = positions / torch.pow(10000.0, even_dims / d_model)

        pe = torch.zeros(max_seq_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even ones.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading axis of size 1 lets the table broadcast over the batch axis.
        # A buffer follows the module across devices without being trained.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x * sqrt(d_model) + pe`` for the first ``x.size(1)`` positions.

        The input tensor is left untouched; a new tensor is returned.
        """
        # Out-of-place on purpose: the caller's tensor must not be modified.
        scaled = x * sqrt(self.d_model)
        return scaled + self.pe[:, :x.size(1)]
class TRANSFORMER(pl.LightningModule):
    """Transformer-encoder sequence tagger with a CRF output layer.

    Token ids are embedded, passed through the positional encoder and a stack
    of ``nn.TransformerEncoderLayer`` blocks, and projected to per-tag
    emission scores. The CRF turns those emissions into a loss (training /
    evaluation) or into decoded tag sequences (prediction).

    Args:
        input_dim: ``num_embeddings`` of the embedding table.
        d_model: Embedding / encoder width.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout probability inside each encoder layer.
        use_scheduler: Whether ``configure_optimizers`` adds a cosine
            schedule with warmup.
        num_tags: Number of output tags (CRF states).
        total_steps: ``num_training_steps`` for the schedule. It is replaced
            by ``len(train_dataset) // BATCH_SIZE`` when the scheduler is
            enabled and a training dataset is supplied.
        train_dataset: Dataset served by ``train_dataloader``.
        val_dataset: Dataset served by ``val_dataloader``.
        test_dataset: Dataset served by ``test_dataloader``.
    """

    def __init__(self,
                 input_dim,
                 d_model=512,
                 nhead=8,
                 num_layers=6,
                 dropout=0.5,
                 use_scheduler=True,
                 num_tags=len(TAG2IDX),
                 total_steps=1024,
                 train_dataset=None,
                 val_dataset=None,
                 test_dataset=None):

        super().__init__()
        self.crf = CRF(num_tags=num_tags, batch_first=True)
        self.fc = nn.Linear(d_model, num_tags)
        self.use_scheduler = use_scheduler

        self.embedding = nn.Embedding(num_embeddings=input_dim,
                                      embedding_dim=d_model,
                                      padding_idx=0)

        self.pos_encoder = PositionalEncoder(d_model=d_model)

        # NOTE(review): nn.TransformerEncoder works on copies of the layer it
        # is given, so this attribute presumably only serves as a template.
        # It is kept as an attribute so existing state_dict keys stay the same.
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model,
                                                        nhead=nhead,
                                                        dropout=dropout,
                                                        activation="gelu",
                                                        batch_first=True)

        self.encoder = nn.TransformerEncoder(encoder_layer=self.encoder_layer,
                                             num_layers=num_layers)
        ## Hyperparameters ##
        self.learning_rate = LEARNING_RATE
        self.weight_decay = WEIGHT_DECAY
        self.total_steps = total_steps
        self.batch_size = BATCH_SIZE
        ## Datasets ##
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.test_dataset = test_dataset
        ## steps ##
        # Derive the step count only when a training dataset is available;
        # otherwise keep the `total_steps` argument instead of failing on
        # len(None) with the default arguments.
        if self.use_scheduler and train_dataset is not None:
            self.total_steps = len(train_dataset) // self.batch_size

    def _make_dataloader(self, dataset, shuffle):
        """Build a DataLoader with the settings shared by all three splits."""
        return DataLoader(dataset,
                          batch_size=self.batch_size,
                          num_workers=N_JOBS,
                          shuffle=shuffle,
                          drop_last=False)

    # create the dataloaders
    # add shuffle only for train_dataloader
    # make sure num_workers is set appropriately and drop_last is set to False
    def train_dataloader(self):
        """Return the shuffled loader over the training dataset."""
        return self._make_dataloader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Return the unshuffled loader over the validation dataset."""
        return self._make_dataloader(self.val_dataset, shuffle=False)

    def test_dataloader(self):
        """Return the unshuffled loader over the test dataset."""
        return self._make_dataloader(self.test_dataset, shuffle=False)

    def forward(self, input_ids, masks):
        """Return per-token emission scores for a batch of token ids.

        Args:
            input_ids: Token ids fed to the embedding table.
            masks: Mask whose inverse is passed as ``src_key_padding_mask``;
                presumably a boolean tensor that is True on real tokens and
                False on padding -- TODO confirm against the dataset.
        """
        out = self.embedding(input_ids)
        out = self.pos_encoder(out)
        # src_key_padding_mask marks positions to ignore, hence the inversion.
        out = self.encoder(out, src_key_padding_mask=~masks)
        out = self.fc(out)
        return out

    def _shared_evaluation_step(self, batch, batch_idx):
        """Compute the CRF loss and the f1score metrics for one batch.

        Returns:
            ``(loss, r, p, f1)`` where ``loss`` is the negated CRF score and
            the remaining values come from ``f1score(lbls, pred)``.
        """
        ids, masks, lbls = batch
        emissions = self(ids, masks)
        # The CRF call is negated to obtain a quantity to minimise.
        loss = -self.crf(emissions, lbls, mask=masks)
        pred = self.crf.decode(emissions, mask=masks)
        r, p, f1 = f1score(lbls, pred)
        return loss, r, p, f1

    def _log_metrics(self, stage, loss, r, p, f1):
        """Log the four epoch-level metrics under the ``<stage>_`` prefix."""
        metrics = (("loss", loss),
                   ("recall", r),
                   ("precision", p),
                   ("f1score", f1))
        for name, value in metrics:
            self.log(f"{stage}_{name}", value,
                     on_step=False, on_epoch=True, prog_bar=True)

    def training_step(self, batch, batch_idx):
        """Log the training metrics for one batch and return its loss."""
        loss, r, p, f1 = self._shared_evaluation_step(batch, batch_idx)
        self._log_metrics("train", loss, r, p, f1)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the validation metrics for one batch."""
        loss, r, p, f1 = self._shared_evaluation_step(batch, batch_idx)
        self._log_metrics("val", loss, r, p, f1)

    def test_step(self, batch, batch_idx):
        """Log the test metrics for one batch."""
        loss, r, p, f1 = self._shared_evaluation_step(batch, batch_idx)
        self._log_metrics("test", loss, r, p, f1)

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        """Return the CRF-decoded tag sequences for one batch."""
        ids, masks, _ = batch
        return self.crf.decode(self(ids, masks), mask=masks)

    def configure_optimizers(self):
        """Create the Ranger optimizer and, if enabled, the cosine schedule.

        Returns:
            ``[optimizer]`` alone, or ``([optimizer], [lr_scheduler_config])``
            when ``use_scheduler`` is set.
        """
        # NOTE(review): under automatic optimization Lightning calls
        # optimizer.step(closure); an optimizer whose step() never runs the
        # closure leads to "The closure hasn't been executed" -- verify that
        # the Ranger implementation in use honours the closure argument.
        optimizer = Ranger(self.parameters(),
                           lr=self.learning_rate,
                           weight_decay=self.weight_decay)

        if not self.use_scheduler:
            return [optimizer]

        scheduler = get_cosine_schedule_with_warmup(optimizer=optimizer,
                                                    num_warmup_steps=1,
                                                    num_training_steps=self.total_steps)
        # NOTE(review): the schedule is stepped once per epoch while
        # total_steps is derived from batches per epoch -- confirm this
        # combination is intended.
        lr_scheduler = {
            'scheduler': scheduler,
            'interval': 'epoch',
            'frequency': 1
        }
        return [optimizer], [lr_scheduler]

and here is how I am using the trainer class:

# Single-precision GPU training; metrics are logged on every step and the
# early-stopping / checkpoint callbacks are attached.
trainer = pl.Trainer(
    accelerator="gpu",
    max_epochs=EPOCHS,
    precision=32,
    log_every_n_steps=1,
    callbacks=[earlystopping_callback, checkpoint_callback],
)

Upvotes: 1

Views: 3657

Answers (1)

awaelchli
awaelchli

Reputation: 886

You are right. This happens because the special optimizer you are using does not call the closure when it is passed to the .step() method. Lightning relies on this, because it calls the step method like this:

# Lightning passes the training-step closure in; step() is expected to call it.
optimizer.step(training_step_closure)

where training_step_closure essentially consists of executing LightningModule.training_step.

It looks like Ranger does not follow the standard contract of calling the closure inside of itself.

To overcome this issue, I recommend switching to manual optimization:

  1. Set self.automatic_optimization = False in your LightningModule.
  2. Modify your training step by inserting manual backward, optimizer step and optionally the lr scheduler call:

Like so:

def training_step(self, batch, batch_idx):
    """Run one manually-optimized training step and return its loss.

    Intended for a LightningModule with ``self.automatic_optimization = False``:
    the backward pass, the optimizer step and the (optional) scheduler step
    are all triggered here instead of by the Trainer.
    """
    loss, r, p, f1 = self._shared_evaluation_step(batch, batch_idx)

    # Insert these lines:
    self.manual_backward(loss)

    optimizer = self.optimizers()
    scheduler = self.lr_schedulers()

    optimizer.step()
    optimizer.zero_grad()
    # lr_schedulers() yields None when configure_optimizers() returned no
    # scheduler, so only step when one exists.
    # NOTE(review): this steps the schedule once per batch; if the schedule
    # is meant to advance per epoch, gate the call accordingly.
    if scheduler is not None:
        scheduler.step()

    ...
    return loss

No other changes should be necessary.

Upvotes: 2

Related Questions