Reputation: 3
I am facing an issue while training my deep learning model using PyTorch Lightning. During the training process, I encountered the following error:
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
Context: I am working on a text classification task and have implemented a custom dataset class UCC_Dataset, as well as a LightningDataModule subclass UCC_Data_Module. My model is defined as a subclass of pl.LightningModule called UCC_Comment_Classifier. The error occurs when calling trainer.fit(model, ucc_data_module).
Here is the relevant code snippet where the error occurs:
```python
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from transformers import AutoModel, AdamW, get_cosine_schedule_with_warmup


class UCC_Comment_Classifier(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.config = config  # hyperparameter dict defined elsewhere in my script
        self.pretrained_model = AutoModel.from_pretrained(config['model_name'], return_dict=True)
        self.hidden = torch.nn.Linear(self.pretrained_model.config.hidden_size, self.pretrained_model.config.hidden_size)
        self.classifier = torch.nn.Linear(self.pretrained_model.config.hidden_size, self.config['n_labels'])
        torch.nn.init.xavier_uniform_(self.classifier.weight)
        self.loss_func = nn.BCEWithLogitsLoss(reduction='mean')
        self.dropout = nn.Dropout()
        # Enable gradients for all model parameters
        for param in self.parameters():
            param.requires_grad = True

    def forward(self, input_ids, attention_mask, labels=None):
        # roberta layer
        output = self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = torch.mean(output.last_hidden_state, 1)
        # final logits
        pooled_output = self.dropout(pooled_output)
        pooled_output = self.hidden(pooled_output)
        pooled_output = F.relu(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        # calculate loss
        loss = None
        if labels is not None:
            loss = self.loss_func(logits.view(-1, self.config['n_labels']), labels.view(-1, self.config['n_labels']))
        return loss, logits

    def training_step(self, batch, batch_index):
        loss, outputs = self(**batch)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": outputs, "labels": batch["labels"]}

    def validation_step(self, batch, batch_index):
        loss, outputs = self(**batch)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return {"val_loss": loss, "predictions": outputs, "labels": batch["labels"]}

    def predict_step(self, batch, batch_index, dataloader_idx: int = 0):
        loss, outputs = self(**batch)
        return outputs

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=self.config['lr'], weight_decay=self.config['weight_decay'], no_deprecation_warning=True)
        total_steps = self.config['train_size'] * self.config['n_epochs']
        warmup_steps = math.floor(total_steps * self.config['warmup'])
        scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
        return [optimizer], [scheduler]
```
I have verified that all the model parameters have requires_grad set to True, and I am unsure why this error is happening.
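The check I ran looks roughly like this (a minimal sketch; `model` is an instance of UCC_Comment_Classifier):

```python
# Sanity check: every parameter should report requires_grad=True, and it does.
model = UCC_Comment_Classifier()
print(all(p.requires_grad for p in model.parameters()))  # prints True
```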
Upvotes: 0
Views: 796
Reputation: 120
The Trainer disables gradient tracking in the validation/test loops (for speed). If you want to re-enable it, you can do so in your validation step:
```python
def validation_step(self, batch, batch_idx):
    torch.set_grad_enabled(True)
    ...
```
One more important step is to disable inference_mode in the Trainer:

```python
trainer = Trainer(inference_mode=False)  # True by default
```
Full example:
```python
import torch
from torch.utils.data import DataLoader, Dataset
from lightning.pytorch import LightningModule, Trainer


class RandomDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len


class BoringModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def test_step(self, batch, batch_idx):
        torch.set_grad_enabled(True)
        assert torch.is_grad_enabled()
        assert all(p.requires_grad for p in self.parameters())
        loss = self(batch).sum()
        loss.backward()  # works because inference_mode=False and grads are re-enabled
        self.log("test_loss", loss)

    def configure_optimizers(self):
        return torch.optim.SGD(self.layer.parameters(), lr=0.1)


def run():
    test_data = DataLoader(RandomDataset(32, 64), batch_size=2)
    model = BoringModel()
    trainer = Trainer(max_epochs=1, accelerator="cpu", inference_mode=False)
    trainer.test(model, dataloaders=test_data)


if __name__ == "__main__":
    run()
```
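Applied to the model in the question, the same two changes would look roughly like this (a sketch reusing the UCC_Comment_Classifier, config, and ucc_data_module names from the question):

```python
# Inside UCC_Comment_Classifier: re-enable gradient tracking for this step
def validation_step(self, batch, batch_index):
    torch.set_grad_enabled(True)  # the Trainer turns this off in eval loops
    loss, outputs = self(**batch)
    self.log("val_loss", loss, prog_bar=True, logger=True)
    return {"val_loss": loss, "predictions": outputs, "labels": batch["labels"]}

# When launching training: turn off inference_mode so set_grad_enabled takes effect
trainer = pl.Trainer(max_epochs=config['n_epochs'], inference_mode=False)
trainer.fit(model, ucc_data_module)
```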
Upvotes: 0