Reputation: 43
I'm trying to train a bidirectional LSTM with pack_padded_sequence and pad_packed_sequence, but the accuracy keeps decreasing while the loss keeps increasing.
This is my data loader:
X1 (X[0]): tensor([[1408, 1413, 43, ..., 0, 0, 0],
[1452, 1415, 2443, ..., 0, 0, 0],
[1434, 1432, 2012, ..., 0, 0, 0],
...,
[1408, 3593, 1431, ..., 0, 0, 0],
[1408, 1413, 1402, ..., 0, 0, 0],
[1420, 1474, 2645, ..., 0, 0, 0]]), shape: torch.Size([64, 31])
len_X1 (X[3]): [9, 19, 12, 7, 7, 15, 4, 13, 9, 8, 14, 19, 7, 23, 7, 13, 7, 12, 10, 12, 13, 11, 31, 8, 20, 17, 8, 9, 9, 29, 8, 5, 5, 13, 9, 8, 10, 17, 13, 8, 8, 11, 7, 29, 15, 10, 6, 7, 10, 9, 10, 10, 4, 16, 11, 10, 16, 8, 13, 8, 8, 20, 7, 12]
X2 (X[1]): tensor([[1420, 1415, 51, ..., 0, 0, 0],
[1452, 1415, 2376, ..., 1523, 2770, 35],
[1420, 1415, 51, ..., 0, 0, 0],
...,
[1408, 3593, 1474, ..., 0, 0, 0],
[1408, 1428, 2950, ..., 0, 0, 0],
[1474, 1402, 3464, ..., 0, 0, 0]]), shape: torch.Size([64, 42])
len_X2 (X[4]): [14, 42, 13, 18, 12, 31, 8, 19, 5, 7, 15, 19, 7, 17, 6, 11, 12, 16, 8, 8, 19, 8, 12, 10, 11, 9, 9, 9, 9, 21, 7, 5, 8, 13, 14, 8, 15, 8, 8, 8, 12, 13, 7, 14, 4, 10, 6, 11, 12, 7, 8, 11, 9, 13, 30, 10, 15, 9, 9, 7, 9, 8, 7, 20]
t (X[2]): tensor([0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,
0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1]), shape: torch.Size([64])
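(The loader code itself isn't shown above; a minimal sketch of a collate_fn that would produce batches of this shape is given below. It assumes each dataset item is a (ids1, ids2, label) triple of tensors and that the '[PAD]' id is 0, matching the padded zeros in the printout; both are assumptions, not code from the question.)
import torch
from torch.nn.utils.rnn import pad_sequence

PAD_ID = 0  # assumption: matches tokenizer.get_vocab()['[PAD]'] used in the model below

def collate_fn(batch):
    # batch: list of (ids1, ids2, label) triples of 1-D LongTensors (assumed item format)
    ids1, ids2, labels = zip(*batch)
    len1 = [len(s) for s in ids1]
    len2 = [len(s) for s in ids2]
    X1 = pad_sequence(ids1, batch_first=True, padding_value=PAD_ID)  # (N, max_len1)
    X2 = pad_sequence(ids2, batch_first=True, padding_value=PAD_ID)  # (N, max_len2)
    t = torch.stack(labels)                                          # (N,)
    return X1, X2, t, len1, len2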
This is my model class:
class BiLSTM(nn.Module):
    def __init__(self, n_vocabs, embed_dims, n_lstm_units, n_lstm_layers, n_output_classes):
        super(BiLSTM, self).__init__()
        self.v = n_vocabs
        self.e = embed_dims
        self.u = n_lstm_units
        self.l = n_lstm_layers
        self.o = n_output_classes
        self.padd_idx = tokenizer.get_vocab()['[PAD]']
        self.embed = nn.Embedding(
            self.v,
            self.e,
            self.padd_idx
        )
        self.bilstm = nn.LSTM(
            self.e,
            self.u,
            self.l,
            batch_first = True,
            bidirectional = True,
            dropout = 0.5
        )
        self.linear = nn.Linear(
            self.u * 4,
            self.o
        )

    def forward(self, X):
        # initial hidden and cell states
        h0 = torch.zeros(self.l * 2, X[0].size(0), self.u).to(device)
        c0 = torch.zeros(self.l * 2, X[0].size(0), self.u).to(device)
        # embedding
        out1 = self.embed(X[0].to(device))
        out2 = self.embed(X[1].to(device))
        # pack_padded_sequence
        out1 = nn.utils.rnn.pack_padded_sequence(out1, X[3], batch_first=True, enforce_sorted=False)
        out2 = nn.utils.rnn.pack_padded_sequence(out2, X[4], batch_first=True, enforce_sorted=False)
        # NxTxh, lxNxh
        out1, _ = self.bilstm(out1, (h0, c0))
        out2, _ = self.bilstm(out2, (h0, c0))
        # pad_packed_sequence
        out1, _ = nn.utils.rnn.pad_packed_sequence(out1, batch_first=True)
        out2, _ = nn.utils.rnn.pad_packed_sequence(out2, batch_first=True)
        # take only the final time step
        out1 = out1[:, -1, :]
        out2 = out2[:, -1, :]
        # concatenate out1 & out2
        out = torch.cat((out1, out2), 1)
        # linear layer
        out = self.linear(out)
        iout = torch.max(out, 1)[1]
        return iout, out
If I remove pack_padded_sequence / pad_packed_sequence, the model trains just fine:
class BiLSTM(nn.Module):
    def __init__(self, n_vocabs, embed_dims, n_lstm_units, n_lstm_layers, n_output_classes):
        super(BiLSTM, self).__init__()
        self.v = n_vocabs
        self.e = embed_dims
        self.u = n_lstm_units
        self.l = n_lstm_layers
        self.o = n_output_classes
        self.padd_idx = tokenizer.get_vocab()['[PAD]']
        self.embed = nn.Embedding(
            self.v,
            self.e,
            self.padd_idx
        )
        self.bilstm = nn.LSTM(
            self.e,
            self.u,
            self.l,
            batch_first = True,
            bidirectional = True,
            dropout = 0.5
        )
        self.linear = nn.Linear(
            self.u * 4,
            self.o
        )

    def forward(self, X):
        # initial hidden and cell states
        h0 = torch.zeros(self.l * 2, X[0].size(0), self.u).to(device)
        c0 = torch.zeros(self.l * 2, X[0].size(0), self.u).to(device)
        # embedding
        out1 = self.embed(X[0].to(device))
        out2 = self.embed(X[1].to(device))
        # pack_padded_sequence
        # out1 = nn.utils.rnn.pack_padded_sequence(out1, X[3], batch_first=True, enforce_sorted=False)
        # out2 = nn.utils.rnn.pack_padded_sequence(out2, X[4], batch_first=True, enforce_sorted=False)
        # NxTxh, lxNxh
        out1, _ = self.bilstm(out1, (h0, c0))
        out2, _ = self.bilstm(out2, (h0, c0))
        # pad_packed_sequence
        # out1, _ = nn.utils.rnn.pad_packed_sequence(out1, batch_first=True)
        # out2, _ = nn.utils.rnn.pad_packed_sequence(out2, batch_first=True)
        # take only the final time step
        out1 = out1[:, -1, :]
        out2 = out2[:, -1, :]
        # concatenate out1 & out2
        out = torch.cat((out1, out2), 1)
        # linear layer
        out = self.linear(out)
        iout = torch.max(out, 1)[1]
        return iout, out
Upvotes: 1
Views: 467
Reputation: 1408
These lines of your code are wrong:
# take only the final time step
out1 = out1[:, -1, :]
out2 = out2[:, -1, :]
You say you are taking the final time step, but you are forgetting that each sequence has a different length.
nn.utils.rnn.pad_packed_sequence pads the output of each sequence until its length equals that of the longest, so that they all have the same length.
In other words, you are slicing out vectors of zeros (the padding) for most sequences.
This should do what you want.
# take only the final time step
out1 = out1[range(out1.shape[0]), X3 - 1, :]
out2 = out2[range(out2.shape[0]), X4 - 1, :]
This is assuming X3 and X4 are tensors.
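In the batch printed in the question, X[3] and X[4] are plain Python lists, so one way to apply this fix (a sketch, not part of the original answer) is to convert the lengths to tensors inside forward before indexing:
# assumption: X[3] / X[4] are the Python lists of lengths shown in the question
len1 = torch.as_tensor(X[3], device=out1.device)
len2 = torch.as_tensor(X[4], device=out2.device)
out1 = out1[torch.arange(out1.shape[0], device=out1.device), len1 - 1, :]
out2 = out2[torch.arange(out2.shape[0], device=out2.device), len2 - 1, :]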
Upvotes: 1