Reputation: 1
I am fairly new to PyTorch and I am currently trying to implement the network described in this paper: https://arxiv.org/pdf/1811.06621.pdf?fbclid=IwAR3Ya9ZfBNN40UO0wct7dGupjlBFEpU47IRHK-wXmejI4U2UQGf03sXHMlw.
I have included the class for this network and some training code that uses dummy data. The code runs without errors, but the loss printed on every iteration is always the same (8.371), which leads me to believe that something is wrong with the way I implemented the network. Is there anything glaringly wrong with my implementation?
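One observation, purely my own back-of-the-envelope check and possibly a red herring: the constant loss is very close to what a model whose 4096-way output stays essentially uniform would produce, since the cross-entropy of a uniform prediction over 4096 classes is ln(4096):

import math
# ln(4096) ~= 8.3178 -- a model predicting a near-uniform distribution over
# the 4096 output classes gives a cross-entropy loss of about this value,
# which is suspiciously close to the constant 8.371 I am seeing.
print(math.log(4096))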
import torch
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable
torch.manual_seed(1)
# Hyper Parameters
sequence_length = 1
input_size = 320
hidden_size = 2048
recurrent_size = 640
num_layers = 8
num_classes = 10
batch_size = 10
num_epochs = 2
learning_rate = 0.01
# RNNT Model
class RNNTModel(nn.Module):
    def __init__(self, input_size, hidden_size, recurrent_size, bias=True):
        super(RNNTModel, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.recurrent_size = recurrent_size
        self.bias = bias
        self.downsample_fc = nn.Linear(self.recurrent_size * 2, self.recurrent_size)
        self.joint_fc = nn.Linear(self.recurrent_size * 2, self.recurrent_size)
        self.out_fc = nn.Linear(640, 4096)
        self.softmax = nn.LogSoftmax(dim=1)
        self.encoder_1 = nn.ModuleDict({
            'lstm1': nn.LSTM(self.input_size, self.hidden_size, num_layers=1, bias=bias, batch_first=True),
            'proj1': nn.Linear(self.hidden_size, self.recurrent_size),
            'lstm2': nn.LSTM(self.recurrent_size, self.hidden_size, num_layers=1, bias=bias, batch_first=True),
            'proj2': nn.Linear(self.hidden_size, self.recurrent_size)
        })
        self.encoder_2 = nn.ModuleDict({
            'lstm3': nn.LSTM(self.recurrent_size, self.hidden_size, num_layers=1, bias=bias, batch_first=True),
            'proj3': nn.Linear(self.hidden_size, self.recurrent_size),
            'lstm4': nn.LSTM(self.recurrent_size, self.hidden_size, num_layers=1, bias=bias, batch_first=True),
            'proj4': nn.Linear(self.hidden_size, self.recurrent_size),
            'lstm5': nn.LSTM(self.recurrent_size, self.hidden_size, num_layers=1, bias=bias, batch_first=True),
            'proj5': nn.Linear(self.hidden_size, self.recurrent_size),
            'lstm6': nn.LSTM(self.recurrent_size, self.hidden_size, num_layers=1, bias=bias, batch_first=True),
            'proj6': nn.Linear(self.hidden_size, self.recurrent_size),
            'lstm7': nn.LSTM(self.recurrent_size, self.hidden_size, num_layers=1, bias=bias, batch_first=True),
            'proj7': nn.Linear(self.hidden_size, self.recurrent_size),
            'lstm8': nn.LSTM(self.recurrent_size, self.hidden_size, num_layers=1, bias=bias, batch_first=True),
            'proj8': nn.Linear(self.hidden_size, self.recurrent_size)
        })
        self.prediction_net = nn.ModuleDict({
            'fc1': nn.Linear(4096, 76),
            'lstm1': nn.LSTM(76, self.hidden_size, num_layers=1, bias=bias, batch_first=True),
            'proj1': nn.Linear(self.hidden_size, self.recurrent_size),
            'lstm2': nn.LSTM(self.recurrent_size, self.hidden_size, num_layers=1, bias=bias, batch_first=True),
            'proj2': nn.Linear(self.hidden_size, self.recurrent_size)
        })

    def forward(self, x):
        y = [torch.zeros(1, x.size(1), 4096)]
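        # y[0] is an all-zero placeholder "previous output" that seeds the
        # prediction network on the first iteration; every later iteration
        # feeds back the joint output from the previous iteration.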
        for i in range(x.size(0) // 2):
            # Unrolled loop of encoder 1
            enc_out, (h1, c1) = self.encoder_1['lstm1'](torch.stack([x[2 * i], x[2 * i + 1]]))
            enc_out = self.encoder_1['proj1'](enc_out)
            enc_out, _ = self.encoder_1['lstm2'](enc_out)
            enc_out = self.encoder_1['proj2'](enc_out)
            # Downsample by halving the frame rate
            enc_out = enc_out.view(1, -1, 2 * self.recurrent_size)
            enc_out = self.downsample_fc(enc_out)
            # Unrolled loop of encoder 2
            enc_out, _ = self.encoder_2['lstm3'](enc_out)
            enc_out = self.encoder_2['proj3'](enc_out)
            enc_out, _ = self.encoder_2['lstm4'](enc_out)
            enc_out = self.encoder_2['proj4'](enc_out)
            enc_out, _ = self.encoder_2['lstm5'](enc_out)
            enc_out = self.encoder_2['proj5'](enc_out)
            enc_out, _ = self.encoder_2['lstm6'](enc_out)
            enc_out = self.encoder_2['proj6'](enc_out)
            enc_out, _ = self.encoder_2['lstm7'](enc_out)
            enc_out = self.encoder_2['proj3'](enc_out)
            enc_out, _ = self.encoder_2['lstm7'](enc_out)
            enc_out = self.encoder_2['proj3'](enc_out)
            enc_out, _ = self.encoder_2['lstm8'](enc_out)
            enc_out = self.encoder_2['proj8'](enc_out)
            # Unrolled loop of prediction net
            pred_out = self.prediction_net['fc1'](y[i])
            pred_out, _ = self.prediction_net['lstm1'](pred_out)
            pred_out = self.prediction_net['proj1'](pred_out)
            pred_out, _ = self.prediction_net['lstm2'](pred_out)
            pred_out = self.prediction_net['proj2'](pred_out)
            # Unrolled loop of joint layers
            joint_out = torch.cat([enc_out, pred_out], dim=-1)
            joint_out = self.joint_fc(joint_out)
            joint_out = self.out_fc(joint_out)
            joint_out = self.softmax(joint_out)
            y.append(joint_out)
        return torch.stack(y[1:])
rnnt = RNNTModel(input_size, hidden_size, recurrent_size, bias=True)
# y = rnnt(torch.rand(batch_size, sequence_length, input_size))
training_data = [(torch.rand(batch_size, sequence_length, input_size), torch.ones(batch_size//2, 1, 4096).long()) for _ in range(100)]
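# (Since the dummy targets are all ones, torch.max(y, 2)[1] in the training loop
# below always picks class index 0 -- I assume that is fine for a smoke test.)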
# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnnt.parameters(), lr=learning_rate)
# Train the model
for epoch in range(num_epochs):
    for i, (x, y) in enumerate(training_data):
        x = Variable(x)
        y = Variable(y)
        # Forward + Backward + Optimize
        optimizer.zero_grad()
        outputs = rnnt(x).view(-1, 4096)
        loss = criterion(outputs, torch.max(y, 2)[1].squeeze())
        loss.backward()
        optimizer.step
        if (i+1) % 1 == 0:
            print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
                  % (epoch+1, num_epochs, i+1, len(training_data)//batch_size, loss.item()))
Upvotes: 0
Views: 172