Reputation: 1
I'm using a TensorBoard logger via pytorch-lightning like this:
tb_logger = pl_loggers.TensorBoardLogger(args.dir)
trainer = pl.Trainer(...., logger=tb_logger, check_val_every_n_epoch=20, log_every_n_steps=500, num_sanity_val_steps=0)
In my training_epoch_end and validation_epoch_end hooks I log everything like this:
tensorboard = self.logger.experiment
tensorboard.add_scalar('Acc', acc, self.current_epoch)
However, when I monitor the training run, TensorBoard actually shows me two runs. One is called default/version_0 and has all my scalars, histograms, etc., which is exactly what I want and as intended. The other run, called GLOBAL, logs a scalar called nll_loss_output_0.
I'm merely calling
loss = torch.nn.functional.cross_entropy(logits, targets)
and don't understand where this second run comes from.
My local TensorBoard tells me there are too many files in the GLOBAL folder (2800+), and on SageMaker with TensorBoard monitoring I get an InternalServerError and the whole training run fails a third of the way in.
In training_step, I am returning:
log = {
    'train_loss': loss.detach(),
    'acc1': acc1,
    'acc10': acc10
}
return {'loss': loss, 'data': log}
But I'm not calling anything "nll_loss_output_0" anywhere... could someone please advise on how to get rid of the GLOBAL run altogether? Setting log_every_n_steps to 2000000 or so might not fix it, as I'm getting multiple files per logged training_step: (screenshot of GLOBAL folder content)
Or is there any implicit logging for learning rate schedulers?
def configure_optimizers(self):
    optimizer = torch.optim.SGD(self.parameters(), lr=self.lr)
    lr_schedule = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, self.trainer.max_epochs)
    self.optimizer_ = optimizer
    return [optimizer], [lr_schedule]
I'm also using a checkpoint callback:
checkpoint_callback = ModelCheckpoint(
    save_top_k=-1,
    dirpath=os.path.join(args.output_dir, 'checkpoints/'),
    filename='checkpoint{epoch:04d}',
    auto_insert_metric_name=False,
    every_n_epochs=20,
    save_on_train_epoch_end=True)
Would be great if someone could help me out!
All the best,
Jonas
Upvotes: 0
Views: 601
Reputation: 144
I don't know why the GLOBAL files are created, but I wanted to give you some general tips.
First, there is a logger option in the self.log() method, which means you don't need to explicitly use the TensorBoard logger to log metrics. (docs)
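For example, a minimal sketch of self.log() inside a training_step (the metric name 'train_loss' and the forward call are illustrative, not from the question):
def training_step(self, batch, batch_idx):
    x, y = batch
    logits = self(x)  # assumes the module's forward returns logits
    loss = torch.nn.functional.cross_entropy(logits, y)
    # logger=True routes the value to the attached logger (TensorBoard here);
    # prog_bar=True also shows it in the progress bar
    self.log('train_loss', loss, logger=True, prog_bar=True)
    return loss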
Second, there is pytorch_lightning.callbacks.LearningRateMonitor, which handles logging the learning rate to the logger. (docs)
The snippet below writes the learning rate, the train/valid loss, and the train/valid top-1 and top-5 accuracies to the TensorBoard logger.
import os

import torch
from torch import nn
from torch.optim import SGD
from torch.utils.data import DataLoader
from torchvision import models, transforms
from torchvision.datasets import CIFAR10
from pytorch_lightning import LightningModule, LightningDataModule, Trainer
from pytorch_lightning.callbacks import LearningRateMonitor
from torchmetrics import Accuracy, MetricCollection

os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'


class CIFAR(LightningDataModule):
    def __init__(self, img_size=32, batch_size=32):
        super().__init__()
        self.img_size = img_size if isinstance(img_size, tuple) else (img_size, img_size)
        self.batch_size = batch_size
        self.train_transforms = transforms.Compose([
            transforms.Resize(self.img_size),
            transforms.Pad(4, padding_mode='reflect'),
            transforms.RandomCrop(self.img_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
        ])
        self.test_transforms = transforms.Compose([
            transforms.Resize(self.img_size),
            transforms.CenterCrop(self.img_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
        ])

    def prepare_data(self) -> None:
        CIFAR10(root='data', train=True, download=True)
        CIFAR10(root='data', train=False, download=True)

    def setup(self, stage=None):
        self.train_ds = CIFAR10(root='data', train=True, download=False, transform=self.train_transforms)
        self.valid_ds = CIFAR10(root='data', train=False, download=False, transform=self.test_transforms)

    def train_dataloader(self):
        return DataLoader(self.train_ds, num_workers=4, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.valid_ds, num_workers=4, batch_size=self.batch_size, shuffle=False)


class BasicModule(LightningModule):
    def __init__(self, lr=0.01):
        super().__init__()
        self.model = models.resnet18(pretrained=False)
        self.criterion = nn.CrossEntropyLoss()
        self.lr = lr
        # clone() with a prefix yields separate 'train/...' and 'valid/...' tags in TensorBoard
        metric = MetricCollection({'top@1': Accuracy(top_k=1), 'top@5': Accuracy(top_k=5)})
        self.train_metric = metric.clone(prefix='train/')
        self.valid_metric = metric.clone(prefix='valid/')

    def training_step(self, batch, batch_idx, optimizer_idx=None):
        return self.shared_step(*batch, self.train_metric)

    def validation_step(self, batch, batch_idx):
        return self.shared_step(*batch, self.valid_metric)

    def shared_step(self, x, y, metric):
        y_hat = self.model(x)
        loss = self.criterion(y_hat, y)
        # logger=True sends the metrics to the attached (TensorBoard) logger
        self.log_dict(metric(y_hat, y), logger=True, prog_bar=True)
        # also log the loss under the same 'train/' or 'valid/' prefix
        self.log(f'{metric.prefix}loss', loss, logger=True)
        return loss

    def configure_optimizers(self):
        return SGD(self.model.parameters(), lr=self.lr)


if __name__ == '__main__':
    data = CIFAR(batch_size=512)
    model = BasicModule(lr=0.01)
    callbacks = [LearningRateMonitor()]  # logs the learning rate automatically
    trainer = Trainer(max_epochs=2, gpus='0,1', accelerator='gpu', strategy='ddp', precision=16, callbacks=callbacks)
    trainer.fit(model, data)
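The MetricCollection.clone(prefix=...) pattern is what lets a single shared_step serve both training and validation: the same metrics and loss land under separate train/ and valid/ tags in TensorBoard without any manual add_scalar calls.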
Upvotes: 0