Reputation: 1
I am fairly new to PyTorch and I am currently trying to implement the network described in this paper: https://arxiv.org/pdf/1811.06621.pdf?fbclid=IwAR3Ya9ZfBNN40UO0wct7dGupjlBFEpU47IRHK-wXmejI4U2UQGf03sXHMlw.
I have included the class for this network and some training code that uses dummy data. The code runs without errors, but the loss printed on every iteration is always the same (8.371), which leads me to believe that something is wrong with the way I implemented the network. Is there anything glaringly wrong with my implementation?
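One observation, purely my own back-of-the-envelope check and possibly a red herring: the constant loss is very close to what a model whose 4096-way output stays essentially uniform would produce, since the cross-entropy of a uniform prediction over 4096 classes is ln(4096):

import math
# ln(4096) ~= 8.3178 -- a model predicting a near-uniform distribution over
# the 4096 output classes gives a cross-entropy loss of about this value,
# which is suspiciously close to the constant 8.371 I am seeing.
print(math.log(4096))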
import torch
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable
torch.manual_seed(1)
# Hyper Parameters
sequence_length = 1
input_size = 320
hidden_size = 2048
recurrent_size = 640
num_layers = 8
num_classes = 10
batch_size = 10
num_epochs = 2
learning_rate = 0.01
# RNNT Model
class RNNTModel(nn.Module):
    def __init__(self, input_size, hidden_size, recurrent_size, bias=True):
        super(RNNTModel, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.recurrent_size = recurrent_size
        self.bias = bias
        self.downsample_fc = nn.Linear(self.recurrent_size * 2, self.recurrent_size)
        self.joint_fc = nn.Linear(self.recurrent_size * 2, self.recurrent_size)
        self.out_fc = nn.Linear(640, 4096)
        self.softmax = nn.LogSoftmax(dim=1)
        self.encoder_1 = nn.ModuleDict({
            'lstm1': nn.LSTM(self.input_size, self.hidden_size, num_layers=1, bias=bias, batch_first=True),
            'proj1': nn.Linear(self.hidden_size, self.recurrent_size),
            'lstm2': nn.LSTM(self.recurrent_size, self.hidden_size, num_layers=1, bias=bias, batch_first=True),
            'proj2': nn.Linear(self.hidden_size, self.recurrent_size)
        })
        self.encoder_2 = nn.ModuleDict({
            'lstm3': nn.LSTM(self.recurrent_size, self.hidden_size, num_layers=1, bias=bias, batch_first=True),
            'proj3': nn.Linear(self.hidden_size, self.recurrent_size),
            'lstm4': nn.LSTM(self.recurrent_size, self.hidden_size, num_layers=1, bias=bias, batch_first=True),
            'proj4': nn.Linear(self.hidden_size, self.recurrent_size),
            'lstm5': nn.LSTM(self.recurrent_size, self.hidden_size, num_layers=1, bias=bias, batch_first=True),
            'proj5': nn.Linear(self.hidden_size, self.recurrent_size),
            'lstm6': nn.LSTM(self.recurrent_size, self.hidden_size, num_layers=1, bias=bias, batch_first=True),
            'proj6': nn.Linear(self.hidden_size, self.recurrent_size),
            'lstm7': nn.LSTM(self.recurrent_size, self.hidden_size, num_layers=1, bias=bias, batch_first=True),
            'proj7': nn.Linear(self.hidden_size, self.recurrent_size),
            'lstm8': nn.LSTM(self.recurrent_size, self.hidden_size, num_layers=1, bias=bias, batch_first=True),
            'proj8': nn.Linear(self.hidden_size, self.recurrent_size)
        })
        self.prediction_net = nn.ModuleDict({
            'fc1': nn.Linear(4096, 76),
            'lstm1': nn.LSTM(76, self.hidden_size, num_layers=1, bias=bias, batch_first=True),
            'proj1': nn.Linear(self.hidden_size, self.recurrent_size),
            'lstm2': nn.LSTM(self.recurrent_size, self.hidden_size, num_layers=1, bias=bias, batch_first=True),
            'proj2': nn.Linear(self.hidden_size, self.recurrent_size)
        })

    def forward(self, x):
        y = [torch.zeros(1, x.size(1), 4096)]
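        # y[0] is an all-zero placeholder "previous output" that seeds the
        # prediction network on the first iteration; every later iteration
        # feeds back the joint output from the previous iteration.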
        for i in range(x.size(0) // 2):
            # Unrolled loop of encoder 1
            enc_out, (h1, c1) = self.encoder_1['lstm1'](torch.stack([x[2 * i], x[2 * i + 1]]))
            enc_out = self.encoder_1['proj1'](enc_out)
            enc_out, _ = self.encoder_1['lstm2'](enc_out)
            enc_out = self.encoder_1['proj2'](enc_out)
            # Downsample by halving the frame rate
            enc_out = enc_out.view(1, -1, 2 * self.recurrent_size)
            enc_out = self.downsample_fc(enc_out)
            # Unrolled loop of encoder 2
            enc_out, _ = self.encoder_2['lstm3'](enc_out)
            enc_out = self.encoder_2['proj3'](enc_out)
            enc_out, _ = self.encoder_2['lstm4'](enc_out)
            enc_out = self.encoder_2['proj4'](enc_out)
            enc_out, _ = self.encoder_2['lstm5'](enc_out)
            enc_out = self.encoder_2['proj5'](enc_out)
            enc_out, _ = self.encoder_2['lstm6'](enc_out)
            enc_out = self.encoder_2['proj6'](enc_out)
            enc_out, _ = self.encoder_2['lstm7'](enc_out)
            enc_out = self.encoder_2['proj3'](enc_out)
            enc_out, _ = self.encoder_2['lstm7'](enc_out)
            enc_out = self.encoder_2['proj3'](enc_out)
            enc_out, _ = self.encoder_2['lstm8'](enc_out)
            enc_out = self.encoder_2['proj8'](enc_out)
            # Unrolled loop of prediction net
            pred_out = self.prediction_net['fc1'](y[i])
            pred_out, _ = self.prediction_net['lstm1'](pred_out)
            pred_out = self.prediction_net['proj1'](pred_out)
            pred_out, _ = self.prediction_net['lstm2'](pred_out)
            pred_out = self.prediction_net['proj2'](pred_out)
            # Unrolled loop of joint layers
            joint_out = torch.cat([enc_out, pred_out], dim=-1)
            joint_out = self.joint_fc(joint_out)
            joint_out = self.out_fc(joint_out)
            joint_out = self.softmax(joint_out)
            y.append(joint_out)
        return torch.stack(y[1:])
rnnt = RNNTModel(input_size, hidden_size, recurrent_size, bias=True)
# y = rnnt(torch.rand(batch_size, sequence_length, input_size))
training_data = [(torch.rand(batch_size, sequence_length, input_size), torch.ones(batch_size//2, 1, 4096).long()) for _ in range(100)]
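# (Since the dummy targets are all ones, torch.max(y, 2)[1] in the training loop
# below always picks class index 0 -- I assume that is fine for a smoke test.)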
# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnnt.parameters(), lr=learning_rate)
# Train the model
for epoch in range(num_epochs):
    for i, (x, y) in enumerate(training_data):
        x = Variable(x)
        y = Variable(y)
        # Forward + Backward + Optimize
        optimizer.zero_grad()
        outputs = rnnt(x).view(-1, 4096)
        loss = criterion(outputs, torch.max(y, 2)[1].squeeze())
        loss.backward()
        optimizer.step
        if (i+1) % 1 == 0:
            print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
                  % (epoch+1, num_epochs, i+1, len(training_data)//batch_size, loss.item()))
Upvotes: 0
Views: 172