Reputation: 793
Trying to implement a simple multi-label image classifier using Pytorch Lightning. Here's the model definition:
import torch
from torch import nn
# creates network class
class Net(pl.LightningModule):
def __init__(self):
super().__init__()
# defines conv layers
self.conv_layer_b1 = nn.Sequential(
nn.Conv2d(in_channels=3, out_channels=32,
kernel_size=3, padding=1),
nn.ReLU(),
nn.MaxPool2d(kernel_size=2, stride=2),
nn.Flatten(),
)
# passes dummy x matrix to find the input size of the fc layer
x = torch.randn(1, 3, 800, 600)
self._to_linear = None
self.forward(x)
# defines fc layer
self.fc_layer = nn.Sequential(
nn.Linear(in_features=self._to_linear,
out_features=256),
nn.ReLU(),
nn.Linear(256, 5),
)
# defines accuracy metric
self.accuracy = pl.metrics.Accuracy()
self.confusion_matrix = pl.metrics.ConfusionMatrix(num_classes=5)
def forward(self, x):
x = self.conv_layer_b1(x)
if self._to_linear is None:
# does not run fc layer if input size is not determined yet
self._to_linear = x.shape[1]
else:
x = self.fc_layer(x)
return x
def cross_entropy_loss(self, logits, y):
criterion = nn.CrossEntropyLoss()
return criterion(logits, y)
def training_step(self, train_batch, batch_idx):
x, y = train_batch
logits = self.forward(x)
train_loss = self.cross_entropy_loss(logits, y)
train_acc = self.accuracy(logits, y)
train_cm = self.confusion_matrix(logits, y)
self.log('train_loss', train_loss)
self.log('train_acc', train_acc)
self.log('train_cm', train_cm)
return train_loss
def validation_step(self, val_batch, batch_idx):
x, y = val_batch
logits = self.forward(x)
val_loss = self.cross_entropy_loss(logits, y)
val_acc = self.accuracy(logits, y)
return {'val_loss': val_loss, 'val_acc': val_acc}
def validation_epoch_end(self, outputs):
avg_val_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
avg_val_acc = torch.stack([x['val_acc'] for x in outputs]).mean()
self.log("val_loss", avg_val_loss)
self.log("val_acc", avg_val_acc)
def configure_optimizers(self):
optimizer = torch.optim.Adam(self.parameters(), lr=0.0008)
return optimizer
The issue is probably not the machine since I'm using a cloud instance with 60 GBs of RAM and 12 GBs of VRAM. Whenever I run this model even for a single epoch, I get an out of memory error. On the CPU it looks like this:
RuntimeError: [enforce fail at CPUAllocator.cpp:64] . DefaultCPUAllocator: can't allocate memory: you tried to allocate 1966080000 bytes. Error code 12 (Cannot allocate memory)
and on the GPU it looks like this:
RuntimeError: CUDA out of memory. Tried to allocate 7.32 GiB (GPU 0; 11.17 GiB total capacity; 4.00 KiB already allocated; 2.56 GiB free; 2.00 MiB reserved in total by PyTorch)
Clearing the cache and reducing the batch size did not work. I'm a novice so clearly something here is exploding but I can't tell what. Any help would be appreciated.
Thank you!
Upvotes: 2
Views: 3064
Reputation: 4171
Indeed, it's not a machine issue; the model itself is simply unreasonably big. Typically, if you take a look at common CNN models, the fc layers occur near the end, after the inputs already pass through quite a few convolutional blocks (and have their spatial resolutions reduced).
Assuming inputs are of shape (batch, 3, 800, 600)
, while passing the conv_layer_b1
layer, the feature map shape would be (batch, 32, 400, 300)
after the MaxPool
operation. After flattening, the inputs become (batch, 32 * 400 * 300)
, ie, (batch, 3840000)
.
The immediately following fc_layer
thus contains nn.Linear(3840000, 256)
, which is simply absurd. This single linear layer contains ~983 million trainable parameters! For reference, popular image classification CNNs roughly have 3 to 30 million parameters on average, with larger variants reaching 60 to 80 million. Few ever really cross the 100 million mark.
You can count your model params with this:
def count_params(model):
return sum(map(lambda p: p.data.numel(), model.parameters()))
My advice: 800 x 600 is really a massive input size. Reduce it to something like 400 x 300, if possible. Furthermore, add several convolutional blocks similar to conv_layer_b1
, before the FC layer. For example:
def get_conv_block(C_in, C_out):
return nn.Sequential(
nn.Conv2d(in_channels=C_in, out_channels=C_out,
kernel_size=3, padding=1),
nn.ReLU(),
nn.MaxPool2d(kernel_size=2, stride=2)
)
class Net(pl.LightningModule):
def __init__(self):
super().__init__()
# defines conv layers
self.conv_layer_b1 = get_conv_block(3, 16)
self.conv_layer_b2 = get_conv_block(16, 32)
self.conv_layer_b3 = get_conv_block(32, 64)
self.conv_layer_b4 = get_conv_block(64, 128)
self.conv_layer_b5 = get_conv_block(128, 256)
# passes dummy x matrix to find the input size of the fc layer
x = torch.randn(1, 3, 800, 600)
self._to_linear = None
self.forward(x)
# defines fc layer
self.fc_layer = nn.Sequential(
nn.Flatten(),
nn.Linear(in_features=self._to_linear,
out_features=256),
nn.ReLU(),
nn.Linear(256, 5)
)
# defines accuracy metric
self.accuracy = pl.metrics.Accuracy()
self.confusion_matrix = pl.metrics.ConfusionMatrix(num_classes=5)
def forward(self, x):
x = self.conv_layer_b1(x)
x = self.conv_layer_b2(x)
x = self.conv_layer_b3(x)
x = self.conv_layer_b4(x)
x = self.conv_layer_b5(x)
if self._to_linear is None:
# does not run fc layer if input size is not determined yet
self._to_linear = nn.Flatten()(x).shape[1]
else:
x = self.fc_layer(x)
return x
Here, because more conv-relu-pool layers are applied, the input is reduced to a feature map of a much smaller shape, (batch, 256, 25, 18)
, and the overall number of trainable parameters would be reduced to about ~30 million parameters.
Upvotes: 2