Reputation: 43
I'm trying to train a bidirectional LSTM with pack_padded_sequence and pad_packed_sequence, but the accuracy keeps decreasing while the loss keeps increasing.
This is my data loader:
X1 (X[0]): tensor([[1408, 1413, 43, ..., 0, 0, 0],
[1452, 1415, 2443, ..., 0, 0, 0],
[1434, 1432, 2012, ..., 0, 0, 0],
...,
[1408, 3593, 1431, ..., 0, 0, 0],
[1408, 1413, 1402, ..., 0, 0, 0],
[1420, 1474, 2645, ..., 0, 0, 0]]), shape: torch.Size([64, 31])
len_X1 (X[3]): [9, 19, 12, 7, 7, 15, 4, 13, 9, 8, 14, 19, 7, 23, 7, 13, 7, 12, 10, 12, 13, 11, 31, 8, 20, 17, 8, 9, 9, 29, 8, 5, 5, 13, 9, 8, 10, 17, 13, 8, 8, 11, 7, 29, 15, 10, 6, 7, 10, 9, 10, 10, 4, 16, 11, 10, 16, 8, 13, 8, 8, 20, 7, 12]
X2 (X[1]): tensor([[1420, 1415, 51, ..., 0, 0, 0],
[1452, 1415, 2376, ..., 1523, 2770, 35],
[1420, 1415, 51, ..., 0, 0, 0],
...,
[1408, 3593, 1474, ..., 0, 0, 0],
[1408, 1428, 2950, ..., 0, 0, 0],
[1474, 1402, 3464, ..., 0, 0, 0]]), shape: torch.Size([64, 42])
len_X2 (X[4]): [14, 42, 13, 18, 12, 31, 8, 19, 5, 7, 15, 19, 7, 17, 6, 11, 12, 16, 8, 8, 19, 8, 12, 10, 11, 9, 9, 9, 9, 21, 7, 5, 8, 13, 14, 8, 15, 8, 8, 8, 12, 13, 7, 14, 4, 10, 6, 11, 12, 7, 8, 11, 9, 13, 30, 10, 15, 9, 9, 7, 9, 8, 7, 20]
t (X[2]): tensor([0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,
0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1]), shape: torch.Size([64])
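(The loader code itself isn't shown above; a minimal sketch of a collate_fn that would produce batches of this shape is given below. It assumes each dataset item is a (ids1, ids2, label) triple of tensors and that the '[PAD]' id is 0, matching the padded zeros in the printout; both are assumptions, not code from the question.)
import torch
from torch.nn.utils.rnn import pad_sequence

PAD_ID = 0  # assumption: matches tokenizer.get_vocab()['[PAD]'] used in the model below

def collate_fn(batch):
    # batch: list of (ids1, ids2, label) triples of 1-D LongTensors (assumed item format)
    ids1, ids2, labels = zip(*batch)
    len1 = [len(s) for s in ids1]
    len2 = [len(s) for s in ids2]
    X1 = pad_sequence(ids1, batch_first=True, padding_value=PAD_ID)  # (N, max_len1)
    X2 = pad_sequence(ids2, batch_first=True, padding_value=PAD_ID)  # (N, max_len2)
    t = torch.stack(labels)                                          # (N,)
    return X1, X2, t, len1, len2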
This is my model class:
class BiLSTM(nn.Module):
    def __init__(self, n_vocabs, embed_dims, n_lstm_units, n_lstm_layers, n_output_classes):
        super(BiLSTM, self).__init__()
        self.v = n_vocabs
        self.e = embed_dims
        self.u = n_lstm_units
        self.l = n_lstm_layers
        self.o = n_output_classes
        self.padd_idx = tokenizer.get_vocab()['[PAD]']
        self.embed = nn.Embedding(
            self.v,
            self.e,
            self.padd_idx
        )
        self.bilstm = nn.LSTM(
            self.e,
            self.u,
            self.l,
            batch_first = True,
            bidirectional = True,
            dropout = 0.5
        )
        self.linear = nn.Linear(
            self.u * 4,
            self.o
        )

    def forward(self, X):
        # initial hidden and cell states
        h0 = torch.zeros(self.l * 2, X[0].size(0), self.u).to(device)
        c0 = torch.zeros(self.l * 2, X[0].size(0), self.u).to(device)
        # embedding
        out1 = self.embed(X[0].to(device))
        out2 = self.embed(X[1].to(device))
        # pack_padded_sequence
        out1 = nn.utils.rnn.pack_padded_sequence(out1, X[3], batch_first=True, enforce_sorted=False)
        out2 = nn.utils.rnn.pack_padded_sequence(out2, X[4], batch_first=True, enforce_sorted=False)
        # NxTxh, lxNxh
        out1, _ = self.bilstm(out1, (h0, c0))
        out2, _ = self.bilstm(out2, (h0, c0))
        # pad_packed_sequence
        out1, _ = nn.utils.rnn.pad_packed_sequence(out1, batch_first=True)
        out2, _ = nn.utils.rnn.pad_packed_sequence(out2, batch_first=True)
        # take only the final time step
        out1 = out1[:, -1, :]
        out2 = out2[:, -1, :]
        # concatenate out1 & out2
        out = torch.cat((out1, out2), 1)
        # linear layer
        out = self.linear(out)
        iout = torch.max(out, 1)[1]
        return iout, out
If I remove pack_padded_sequence / pad_packed_sequence, the model trains just fine:
class BiLSTM(nn.Module):
    def __init__(self, n_vocabs, embed_dims, n_lstm_units, n_lstm_layers, n_output_classes):
        super(BiLSTM, self).__init__()
        self.v = n_vocabs
        self.e = embed_dims
        self.u = n_lstm_units
        self.l = n_lstm_layers
        self.o = n_output_classes
        self.padd_idx = tokenizer.get_vocab()['[PAD]']
        self.embed = nn.Embedding(
            self.v,
            self.e,
            self.padd_idx
        )
        self.bilstm = nn.LSTM(
            self.e,
            self.u,
            self.l,
            batch_first = True,
            bidirectional = True,
            dropout = 0.5
        )
        self.linear = nn.Linear(
            self.u * 4,
            self.o
        )

    def forward(self, X):
        # initial hidden and cell states
        h0 = torch.zeros(self.l * 2, X[0].size(0), self.u).to(device)
        c0 = torch.zeros(self.l * 2, X[0].size(0), self.u).to(device)
        # embedding
        out1 = self.embed(X[0].to(device))
        out2 = self.embed(X[1].to(device))
        # pack_padded_sequence
        # out1 = nn.utils.rnn.pack_padded_sequence(out1, X[3], batch_first=True, enforce_sorted=False)
        # out2 = nn.utils.rnn.pack_padded_sequence(out2, X[4], batch_first=True, enforce_sorted=False)
        # NxTxh, lxNxh
        out1, _ = self.bilstm(out1, (h0, c0))
        out2, _ = self.bilstm(out2, (h0, c0))
        # pad_packed_sequence
        # out1, _ = nn.utils.rnn.pad_packed_sequence(out1, batch_first=True)
        # out2, _ = nn.utils.rnn.pad_packed_sequence(out2, batch_first=True)
        # take only the final time step
        out1 = out1[:, -1, :]
        out2 = out2[:, -1, :]
        # concatenate out1 & out2
        out = torch.cat((out1, out2), 1)
        # linear layer
        out = self.linear(out)
        iout = torch.max(out, 1)[1]
        return iout, out
Upvotes: 1
Views: 467
Reputation: 1408
These lines of your code are wrong:
# take only the final time step
out1 = out1[:, -1, :]
out2 = out2[:, -1, :]
You say you are taking the final time step, but you are forgetting that each sequence has a different length.
nn.utils.rnn.pad_packed_sequence pads the output of each sequence until its length equals that of the longest, so that they all have the same length.
In other words, you are slicing out vectors of zeros (the padding) for most sequences.
This should do what you want.
# take only the final time step
out1 = out1[range(out1.shape[0]), X3 - 1, :]
out2 = out2[range(out2.shape[0]), X4 - 1, :]
This is assuming X3 and X4 are tensors.
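In the batch printed in the question, X[3] and X[4] are plain Python lists, so one way to apply this fix (a sketch, not part of the original answer) is to convert the lengths to tensors inside forward before indexing:
# assumption: X[3] / X[4] are the Python lists of lengths shown in the question
len1 = torch.as_tensor(X[3], device=out1.device)
len2 = torch.as_tensor(X[4], device=out2.device)
out1 = out1[torch.arange(out1.shape[0], device=out1.device), len1 - 1, :]
out2 = out2[torch.arange(out2.shape[0], device=out2.device), len2 - 1, :]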
Upvotes: 1