youcloudsofdoom

Reputation: 65

Keras/tensorflow 'ValueError: output of generator should be a tuple...' error after first epoch

I'm trying to get the keras-based sequence to sequence example from here working: https://github.com/ml4a/ml4a-guides/blob/master/notebooks/sequence_to_sequence.ipynb

Here's the code I'm running, using keras 1.2.2/python 3.5.2:

import numpy as np
from keras.models import Model
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.layers.wrappers import TimeDistributed
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Activation, Dense, RepeatVector, Input, merge

import json

data = json.load(open('../data/en_de_corpus.json', 'r'))

# to deal with memory issues,
# limit the dataset
# we could also generate the training samples on-demand
# with a generator and use keras models' `fit_generator` method
max_len = 6
max_examples = 80000
max_vocab_size = 10000

def get_texts(source_texts, target_texts, max_len, max_examples):
    """extract texts
    training gets difficult with widely varying lengths
    since some sequences are mostly padding
    long sequences get difficult too, so we are going
    to cheat and just consider short-ish sequences.
    this assumes whitespace as a token delimiter
    and that the texts are already aligned.
    """
    sources, targets = [], []
    for i, source in enumerate(source_texts):
        # assume we split on whitespace
        if len(source.split(' ')) <= max_len:
            target = target_texts[i]
            if len(target.split(' ')) <= max_len:
                sources.append(source)
                targets.append(target)
    return sources[:max_examples], targets[:max_examples]

en_texts, de_texts = get_texts(data['en'], data['de'], max_len, max_examples)
n_examples = len(en_texts)

# add start and stop tokens
start_token = '^'
end_token = '$'
en_texts = [' '.join([start_token, text, end_token]) for text in en_texts]
de_texts = [' '.join([start_token, text, end_token]) for text in de_texts]

# characters for the tokenizers to filter out
# preserve start and stop tokens
filter_chars = '!"#$%&()*+,-./:;<=>?@[\\]^_{|}~\t\n\'`“”–'.replace(start_token, '').replace(end_token, '')

source_tokenizer = Tokenizer(max_vocab_size, filters=filter_chars)
source_tokenizer.fit_on_texts(en_texts)
target_tokenizer = Tokenizer(max_vocab_size, filters=filter_chars)
target_tokenizer.fit_on_texts(de_texts)

# vocab sizes
# idx 0 is reserved by keras (for padding)
# and not part of the word_index,
# so add 1 to account for it
source_vocab_size = len(source_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

# find max length (in tokens) of input and output sentences
max_input_length = max(len(seq) for seq in source_tokenizer.texts_to_sequences_generator(en_texts))
max_output_length = max(len(seq) for seq in target_tokenizer.texts_to_sequences_generator(de_texts))

sequences = pad_sequences(source_tokenizer.texts_to_sequences(en_texts[:1]), maxlen=max_input_length)
print(en_texts[0])
# >>> ^ I took the bus back. $
print(sequences[0])
# >>> [  0   0   0   2   4 223   3 461 114   1]

def build_one_hot_vecs(sequences):
    """generate one-hot vectors from token sequences"""
    # boolean to reduce memory footprint
    X = np.zeros((len(sequences), max_input_length, source_vocab_size), dtype=np.bool)
    for i, sent in enumerate(sequences):
        word_idxs = np.arange(max_input_length)
        X[i][[word_idxs, sent]] = True
    return X

def build_target_vecs():
    """encode words in the target sequences as one-hots"""
    y = np.zeros((n_examples, max_output_length, target_vocab_size), dtype=np.bool)
    for i, sent in enumerate(pad_sequences(target_tokenizer.texts_to_sequences(de_texts), maxlen=max_output_length)):
        word_idxs = np.arange(max_output_length)
        y[i][[word_idxs, sent]] = True
    return y


hidden_dim  = 128
embedding_dim = 128


def build_model(one_hot=False, bidirectional=False):
    """build a vanilla sequence-to-sequence model.
    specify `one_hot=True` to build it for one-hot encoded inputs,
    otherwise, pass in sequences directly and embeddings will be learned.
    specify `bidirectional=True` to use a bidirectional LSTM"""
    if one_hot:
        input = Input(shape=(max_input_length,source_vocab_size))
        input_ = input
    else:
        input = Input(shape=(max_input_length,), dtype='int32')
        input_ = Embedding(source_vocab_size, embedding_dim, input_length=max_input_length)(input)

    # encoder; don't return sequences, just give us one representation vector
    if bidirectional:
        forwards = LSTM(hidden_dim, return_sequences=False)(input_)
        backwards = LSTM(hidden_dim, return_sequences=False, go_backwards=True)(input_)
        encoder = merge([forwards, backwards], mode='concat', concat_axis=-1)
    else:
        encoder = LSTM(hidden_dim, return_sequences=False)(input_)

    # repeat encoder output for each desired output from the decoder
    encoder = RepeatVector(max_output_length)(encoder)

    # decoder; do return sequences (timesteps)
    decoder = LSTM(hidden_dim, return_sequences=True)(encoder)

    # apply the dense layer to each timestep
    # give output conforming to target vocab size
    decoder = TimeDistributed(Dense(target_vocab_size))(decoder)

    # convert to a proper distribution
    predictions = Activation('softmax')(decoder)
    return Model(input=input, output=predictions)




target_reverse_word_index  = {v:k for k,v in target_tokenizer.word_index.items()}

def decode_outputs(predictions):
    outputs = []
    for probs in predictions:
        preds = probs.argmax(axis=-1)
        tokens = []
        for idx in preds:
            tokens.append(target_reverse_word_index.get(idx))
        outputs.append(' '.join([t for t in tokens if t is not None]))
    return outputs


def build_seq_vecs (sequences):
    return np.array(sequences)

import math

def generate_batches(batch_size, one_hot=False):
    # each epoch
    n_batches = math.ceil(n_examples/batch_size)
    while True:
        sequences = pad_sequences(source_tokenizer.texts_to_sequences(en_texts), maxlen=max_input_length)

        if one_hot:
            X = build_one_hot_vecs(sequences)
        else:
            X = build_seq_vecs(sequences)
        y = build_target_vecs()

        # shuffle
        idx = np.random.permutation(len(sequences))
        X = X[idx]
        y = y[idx]

        for i in range(n_batches):
            start = batch_size * i
            end = start+batch_size
            yield X[start:end], y[start:end]
n_epochs = 100
batch_size = 128

model = build_model(one_hot=False, bidirectional=False)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit_generator(generator=generate_batches(batch_size, one_hot=False), samples_per_epoch=n_examples, nb_epoch=n_epochs, verbose=1)

def translate(model, sentences, one_hot=False):
    seqs = pad_sequences(source_tokenizer.texts_to_sequences(sentences), maxlen=max_input_length)
    if one_hot:
        input = build_one_hot_vecs(seqs)
    else:
        input = build_seq_vecs(seqs)
    preds = model.predict(input, verbose=0)
    return decode_outputs(preds)

print(en_texts[0])
print(de_texts[0])
print(translate(model, [en_texts[0]], one_hot=False))
# >>> ^ I took the bus back. $
# >>> ^ Ich nahm den Bus zurück. $
# >>> ^ ich ich die die verloren $

It seems to start up fine, but when it tries to move to the second epoch, I get this error:

Epoch 2/100
Exception in thread Thread-1:
Traceback (most recent call last):
  File "C:\Users\Tobias\AppData\Local\Programs\Python\Python35\lib\threading.py", line 914, in _bootstrap_inner
    self.run()
  File "C:\Users\Tobias\AppData\Local\Programs\Python\Python35\lib\threading.py", line 862, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\Tobias\AppData\Local\Programs\Python\Python35\lib\site-packages\keras-1.2.2-py3.5.egg\keras\engine\training.py", line 429, in data_generator_task
    generator_output = next(self._generator)
  File "C:\Users\Tobias\Desktop\Augury\seq2seq2.py", line 168, in generate_batches
    y = y[idx]
MemoryError

Traceback (most recent call last):
  File "C:\Users\Tobias\AppData\Local\Programs\Python\Python35\lib\runpy.py", line 174, in _run_module_as_main
    mod_name, mod_spec, code = _get_module_details(mod_name, _Error)
  File "C:\Users\Tobias\AppData\Local\Programs\Python\Python35\lib\runpy.py", line 109, in _get_module_details
    __import__(pkg_name)
  File "C:\Users\Tobias\Desktop\Augury\seq2seq2.py", line 179, in <module>
    model.fit_generator(generator=generate_batches(batch_size, one_hot=False), samples_per_epoch=n_examples, nb_epoch=n_epochs, verbose=1)
  File "C:\Users\Tobias\AppData\Local\Programs\Python\Python35\lib\site-packages\keras-1.2.2-py3.5.egg\keras\engine\training.py", line 1532, in fit_generator
    str(generator_output))
ValueError: output of generator should be a tuple (x, y, sample_weight) or (x, y). Found: None

Does anyone have any thoughts on what might be going wrong here?

Upvotes: 1

Views: 879

Answers (1)

ixeption

Reputation: 2050

You can test your generator with:

next(generate_batches(batch_size, one_hot=False))

If that works on its own, you should take a look at the memory consumption, because your seq2seq2.py throws a MemoryError, which is likely the root of the problem: your generator probably ends up returning None because of it.
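
If memory is the bottleneck, a rough sketch of a more memory-friendly generator (untested, reusing your own helpers and globals; the name generate_batches_lazy is just for illustration) would one-hot encode only the targets of the current batch instead of materialising the full y array every epoch:

import math
import numpy as np
from keras.preprocessing.sequence import pad_sequences

def generate_batches_lazy(batch_size, one_hot=False):
    # the integer-encoded sequences are comparatively small, so keep them in memory
    X_seqs = pad_sequences(source_tokenizer.texts_to_sequences(en_texts), maxlen=max_input_length)
    y_seqs = pad_sequences(target_tokenizer.texts_to_sequences(de_texts), maxlen=max_output_length)
    n_batches = math.ceil(n_examples / batch_size)
    while True:
        idx = np.random.permutation(len(X_seqs))  # reshuffle every epoch
        for i in range(n_batches):
            batch_idx = idx[batch_size * i:batch_size * (i + 1)]
            X_batch = build_one_hot_vecs(X_seqs[batch_idx]) if one_hot else X_seqs[batch_idx]
            # one-hot encode the targets for this batch only
            y_batch = np.zeros((len(batch_idx), max_output_length, target_vocab_size), dtype=np.bool)
            for j, sent in enumerate(y_seqs[batch_idx]):
                y_batch[j, np.arange(max_output_length), sent] = True
            yield X_batch, y_batch

That way the peak memory is one batch of one-hot targets rather than all examples at once.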

BTW, in Keras you can use the Bidirectional layer wrapper, which does what you are doing manually with the two LSTMs and the merge.
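
For instance (a sketch against Keras 1.2.2; this would replace the forwards/backwards/merge block in your build_model):

from keras.layers.wrappers import Bidirectional

# replaces the two manual LSTMs plus merge([...], mode='concat', concat_axis=-1)
encoder = Bidirectional(LSTM(hidden_dim, return_sequences=False), merge_mode='concat')(input_)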

Upvotes: 1
