Reputation: 145
This is my first time writing a PyTorch-based CNN. I've finally gotten the code to run to the point of producing output for the first data batch, but on the second batch it produces `nan`s. I greatly simplified the model for debugging purposes, but it's still not working right. The model shown here is just a few fully connected layers with a linear output.
I am guessing that the problem is in the back-propagation step, but it's unclear to me where and why.
Here is a very simplified version of the model that still produces the error:
batch_size = 36
device = 'cuda'

# X_train is n input images x 70 width x 70 height x 3 channels; "rollaxis"
# moves the channel axis from the last to the first (post-batch) position.
train_images = torch.from_numpy(np.rollaxis(X_train, 3, 1)).float()
# Y_train is n doubles, converted here to 32-bit floats.
train_targets = torch.from_numpy(Y_train).float()

torch_train = utils.TensorDataset(train_images, train_targets)
train_loader = utils.DataLoader(torch_train, shuffle=True, batch_size=batch_size)
def MyCNN(**kwargs):
    """Build the model, forwarding all keyword arguments to ``MyCNN_model_simple``."""
    network = MyCNN_model_simple(**kwargs)
    return network
class MyCNN_model_simple(nn.Module):
    """Debugging model: two ``FullyConnected`` blocks and a linear output.

    The layers are applied one at a time in ``forward`` (instead of being
    wrapped in a single ``nn.Sequential``) so that the tensor shape can be
    printed after every stage.
    """

    def __init__(self, **kwargs):
        """Create the layers; ``kwargs`` is accepted but not used."""
        super().__init__()
        # Input is flattened in forward() to 3 * 70 * 70 features per sample.
        self.fc1 = FullyConnected(3 * 70 * 70, 100)
        self.fc2 = FullyConnected(100, 100)
        # Single linear output unit per sample.
        self.last = nn.Linear(100, 1)

    def forward(self, x):
        """Return one value per sample as a 1-D tensor, printing each intermediate shape."""
        print(f"x shape A: {x.shape}")

        # Keep the batch dimension, collapse everything after it.
        x = torch.flatten(x, 1)
        print(f"x shape B: {x.shape}")

        x = self.fc1(x)
        print(f"x shape C: {x.shape}")

        x = self.fc2(x)
        print(f"x shape D: {x.shape}")

        x = self.last(x)
        print(f"x shape E: {x.shape}")

        # Collapse the trailing size-1 dimension so the result is 1-D.
        x = torch.flatten(x)
        print(f"x shape F: {x.shape}")

        return x
class FullyConnected(nn.Module):
    """A linear layer followed by ReLU and, optionally, dropout."""

    def __init__(self, in_channels, out_channels, dropout=None):
        """Assemble the layer stack.

        Args:
            in_channels: Number of input features of the linear layer.
            out_channels: Number of output features of the linear layer.
            dropout: Dropout probability; ``None`` (the default) adds no
                dropout layer at all.
        """
        super().__init__()
        layers = [
            nn.Linear(in_channels, out_channels, bias=True),
            nn.ReLU(),
        ]
        # Identity test against None: any explicit probability, including
        # 0.0, still adds the dropout layer.
        if dropout is not None:
            layers.append(nn.Dropout(p=dropout))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return the result."""
        return self.net(x)
dev = torch.device('cuda')

model = MyCNN()
# convert to 16-bit half-precision to save memory
model.half()
model.to(dev)

loss_fn = nn.MSELoss()
# The optimizer is built from the parameters after the half() conversion above.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=1e-3,
    weight_decay=1e-4,
)

# Per-batch loss values appended by process_batch().
losses = []
max_batches = 2
def process_batch():
    """Run one training step on the current batch.

    Takes no arguments: the batch is read from the module-level ``images``
    and ``scores`` set by the training loop, and ``dev``, ``model``,
    ``loss_fn`` and ``optimizer`` are module-level as well. The scalar loss
    is appended to the module-level ``losses`` list.
    """
    # Cast the batch to half precision and move it to the target device.
    batch_inputs = images.half().to(dev)
    batch_targets = scores.half().to(dev)

    # clear accumulated gradients
    optimizer.zero_grad()

    # make predictions and flatten them to 1-D
    model_out = torch.flatten(model(batch_inputs))
    print(f"Outputs: {model_out}")

    # calculate and save the loss
    loss = loss_fn(model_out.half(), torch.flatten(batch_targets))
    losses.append(loss.item())

    # backpropagate the loss, then adjust parameters using the gradients
    loss.backward()
    optimizer.step()
model.train()

# ``i`` counts processed batches (1-based); it stays 0 if the loader is empty.
i = 0
for i, (images, scores) in enumerate(train_loader, start=1):
    # process_batch() reads the module-level ``images`` and ``scores``.
    process_batch()
    # The check runs after processing, so max_batches + 1 batches are run.
    if i > max_batches:
        break
x shape A: torch.Size([36, 3, 70, 70])
x shape B: torch.Size([36, 9800])
x shape C: torch.Size([36, 100])
x shape D: torch.Size([36, 100])
x shape E: torch.Size([36, 1])
x shape F: torch.Size([36])
Outputs: tensor([0.0406, 0.0367, 0.0446, 0.0529, 0.0406, 0.0391, 0.0397, 0.0391, 0.0415,
0.0443, 0.0410, 0.0406, 0.0349, 0.0396, 0.0368, 0.0401, 0.0343, 0.0419,
0.0428, 0.0385, 0.0345, 0.0431, 0.0287, 0.0328, 0.0309, 0.0416, 0.0473,
0.0352, 0.0422, 0.0375, 0.0428, 0.0345, 0.0368, 0.0319, 0.0365, 0.0382],
device='cuda:0', dtype=torch.float16, grad_fn=<AsStridedBackward>)
x shape A: torch.Size([36, 3, 70, 70])
x shape B: torch.Size([36, 9800])
x shape C: torch.Size([36, 100])
x shape D: torch.Size([36, 100])
x shape E: torch.Size([36, 1])
x shape F: torch.Size([36])
Outputs: tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
device='cuda:0', dtype=torch.float16, grad_fn=<AsStridedBackward>)
x shape A: torch.Size([36, 3, 70, 70])
x shape B: torch.Size([36, 9800])
x shape C: torch.Size([36, 100])
x shape D: torch.Size([36, 100])
x shape E: torch.Size([36, 1])
x shape F: torch.Size([36])
Outputs: tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
device='cuda:0', dtype=torch.float16, grad_fn=<AsStridedBackward>)
You can see the `nan`s that are coming out of the model starting with the second batch. Is there anything obviously wrong that I'm doing? If anyone has tips on best practices for debugging PyTorch module runs that I can use to track down the problem, that would be very helpful.
Thanks.
Upvotes: 4
Views: 10086
Reputation: 8527
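Keep the model in half precision for the forward and backward pass, but switch it to full precision before the optimizer updates the weights. With fp16 parameters the Adam update itself is computed in fp16, where the default `eps=1e-8` is too small to be represented and rounds to zero, so the division in the update can produce inf/NaN. Add `model.float()` between `loss.backward()` and `optimizer.step()`: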
# Compute the gradients first, then cast the model to full precision so the
# optimizer step below updates full-precision parameters.
loss.backward()
model.float() # add this here
optimizer.step()
Then switch back to half precision at the start of each batch:
for images, scores in train_loader:
    # Return to half precision before each batch is processed.
    model.half()  # add this here
    process_batch()
Upvotes: 3