Reputation: 65
I'm trying to get the Keras-based sequence-to-sequence example from here working: https://github.com/ml4a/ml4a-guides/blob/master/notebooks/sequence_to_sequence.ipynb
Here's the code I'm running, using Keras 1.2.2 / Python 3.5.2:
import numpy as np
from keras.models import Model
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.layers.wrappers import TimeDistributed
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Activation, Dense, RepeatVector, Input, merge
import json
data = json.load(open('../data/en_de_corpus.json', 'r'))
# to deal with memory issues,
# limit the dataset
# we could also generate the training samples on-demand
# with a generator and use keras models' `fit_generator` method
max_len = 6
max_examples = 80000
max_vocab_size = 10000
def get_texts(source_texts, target_texts, max_len, max_examples):
    """extract texts
    training gets difficult with widely varying lengths
    since some sequences are mostly padding
    long sequences get difficult too, so we are going
    to cheat and just consider short-ish sequences.
    this assumes whitespace as a token delimiter
    and that the texts are already aligned.
    """
    sources, targets = [], []
    for i, source in enumerate(source_texts):
        # assume we split on whitespace
        if len(source.split(' ')) <= max_len:
            target = target_texts[i]
            if len(target.split(' ')) <= max_len:
                sources.append(source)
                targets.append(target)
    return sources[:max_examples], targets[:max_examples]
en_texts, de_texts = get_texts(data['en'], data['de'], max_len, max_examples)
n_examples = len(en_texts)
# add start and stop tokens
start_token = '^'
end_token = '$'
en_texts = [' '.join([start_token, text, end_token]) for text in en_texts]
de_texts = [' '.join([start_token, text, end_token]) for text in de_texts]
# characters for the tokenizers to filter out
# preserve start and stop tokens
filter_chars = '!"#$%&()*+,-./:;<=>?@[\\]^_{|}~\t\n\'`“”–'.replace(start_token, '').replace(end_token, '')
source_tokenizer = Tokenizer(max_vocab_size, filters=filter_chars)
source_tokenizer.fit_on_texts(en_texts)
target_tokenizer = Tokenizer(max_vocab_size, filters=filter_chars)
target_tokenizer.fit_on_texts(de_texts)
# vocab sizes
# idx 0 is reserved by keras (for padding)
# and not part of the word_index,
# so add 1 to account for it
source_vocab_size = len(source_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1
# find max length (in tokens) of input and output sentences
max_input_length = max(len(seq) for seq in source_tokenizer.texts_to_sequences_generator(en_texts))
max_output_length = max(len(seq) for seq in target_tokenizer.texts_to_sequences_generator(de_texts))
sequences = pad_sequences(source_tokenizer.texts_to_sequences(en_texts[:1]), maxlen=max_input_length)
print(en_texts[0])
# >>> ^ I took the bus back. $
print(sequences[0])
# >>> [ 0 0 0 2 4 223 3 461 114 1]
def build_one_hot_vecs(sequences):
    """generate one-hot vectors from token sequences"""
    # boolean to reduce memory footprint
    X = np.zeros((len(sequences), max_input_length, source_vocab_size), dtype=np.bool)
    for i, sent in enumerate(sequences):
        word_idxs = np.arange(max_input_length)
        X[i][[word_idxs, sent]] = True
    return X
def build_target_vecs():
    """encode words in the target sequences as one-hots"""
    y = np.zeros((n_examples, max_output_length, target_vocab_size), dtype=np.bool)
    for i, sent in enumerate(pad_sequences(target_tokenizer.texts_to_sequences(de_texts), maxlen=max_output_length)):
        word_idxs = np.arange(max_output_length)
        y[i][[word_idxs, sent]] = True
    return y
hidden_dim = 128
embedding_dim = 128
def build_model(one_hot=False, bidirectional=False):
    """build a vanilla sequence-to-sequence model.
    specify `one_hot=True` to build it for one-hot encoded inputs,
    otherwise, pass in sequences directly and embeddings will be learned.
    specify `bidirectional=True` to use a bidirectional LSTM"""
    if one_hot:
        input = Input(shape=(max_input_length, source_vocab_size))
        input_ = input
    else:
        input = Input(shape=(max_input_length,), dtype='int32')
        input_ = Embedding(source_vocab_size, embedding_dim, input_length=max_input_length)(input)
    # encoder; don't return sequences, just give us one representation vector
    if bidirectional:
        forwards = LSTM(hidden_dim, return_sequences=False)(input_)
        backwards = LSTM(hidden_dim, return_sequences=False, go_backwards=True)(input_)
        encoder = merge([forwards, backwards], mode='concat', concat_axis=-1)
    else:
        encoder = LSTM(hidden_dim, return_sequences=False)(input_)
    # repeat encoder output for each desired output from the decoder
    encoder = RepeatVector(max_output_length)(encoder)
    # decoder; do return sequences (timesteps)
    decoder = LSTM(hidden_dim, return_sequences=True)(encoder)
    # apply the dense layer to each timestep
    # give output conforming to target vocab size
    decoder = TimeDistributed(Dense(target_vocab_size))(decoder)
    # convert to a proper distribution
    predictions = Activation('softmax')(decoder)
    return Model(input=input, output=predictions)
target_reverse_word_index = {v:k for k,v in target_tokenizer.word_index.items()}
def decode_outputs(predictions):
    outputs = []
    for probs in predictions:
        preds = probs.argmax(axis=-1)
        tokens = []
        for idx in preds:
            tokens.append(target_reverse_word_index.get(idx))
        outputs.append(' '.join([t for t in tokens if t is not None]))
    return outputs
def build_seq_vecs(sequences):
    return np.array(sequences)
import math
def generate_batches(batch_size, one_hot=False):
    # each epoch
    n_batches = math.ceil(n_examples / batch_size)
    while True:
        sequences = pad_sequences(source_tokenizer.texts_to_sequences(en_texts), maxlen=max_input_length)
        if one_hot:
            X = build_one_hot_vecs(sequences)
        else:
            X = build_seq_vecs(sequences)
        y = build_target_vecs()
        # shuffle
        idx = np.random.permutation(len(sequences))
        X = X[idx]
        y = y[idx]
        for i in range(n_batches):
            start = batch_size * i
            end = start + batch_size
            yield X[start:end], y[start:end]
n_epochs = 100
batch_size = 128
model = build_model(one_hot=False, bidirectional=False)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit_generator(generator=generate_batches(batch_size, one_hot=False), samples_per_epoch=n_examples, nb_epoch=n_epochs, verbose=1)
def translate(model, sentences, one_hot=False):
    seqs = pad_sequences(source_tokenizer.texts_to_sequences(sentences), maxlen=max_input_length)
    if one_hot:
        input = build_one_hot_vecs(seqs)
    else:
        input = build_seq_vecs(seqs)
    preds = model.predict(input, verbose=0)
    return decode_outputs(preds)
print(en_texts[0])
print(de_texts[0])
print(translate(model, [en_texts[0]], one_hot=True))
# >>> ^ I took the bus back. $
# >>> ^ Ich nahm den Bus zurück. $
# >>> ^ ich ich die die verloren $
It seems to start up fine, but when it tries to move to the second epoch, I get this error:
Epoch 2/100
Exception in thread Thread-1:
Traceback (most recent call last):
  File "C:\Users\Tobias\AppData\Local\Programs\Python\Python35\lib\threading.py", line 914, in _bootstrap_inner
    self.run()
  File "C:\Users\Tobias\AppData\Local\Programs\Python\Python35\lib\threading.py", line 862, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\Tobias\AppData\Local\Programs\Python\Python35\lib\site-packages\keras-1.2.2-py3.5.egg\keras\engine\training.py", line 429, in data_generator_task
    generator_output = next(self._generator)
  File "C:\Users\Tobias\Desktop\Augury\seq2seq2.py", line 168, in generate_batches
    y = y[idx]
MemoryError

Traceback (most recent call last):
  File "C:\Users\Tobias\AppData\Local\Programs\Python\Python35\lib\runpy.py", line 174, in _run_module_as_main
    mod_name, mod_spec, code = _get_module_details(mod_name, _Error)
  File "C:\Users\Tobias\AppData\Local\Programs\Python\Python35\lib\runpy.py", line 109, in _get_module_details
    __import__(pkg_name)
  File "C:\Users\Tobias\Desktop\Augury\seq2seq2.py", line 179, in <module>
    model.fit_generator(generator=generate_batches(batch_size, one_hot=False), samples_per_epoch=n_examples, nb_epoch=n_epochs, verbose=1)
  File "C:\Users\Tobias\AppData\Local\Programs\Python\Python35\lib\site-packages\keras-1.2.2-py3.5.egg\keras\engine\training.py", line 1532, in fit_generator
    str(generator_output))
ValueError: output of generator should be a tuple (x, y, sample_weight) or (x, y). Found: None
Does anyone have any thoughts on what might be going wrong here?
Upvotes: 1
Views: 879
Reputation: 2050
You can test your generator with:
next(generate_batches(batch_size, one_hot=False))
If that works, take a close look at your memory consumption: seq2seq2.py throws a MemoryError inside the generator, and that is most likely the root of the problem. In Keras 1.2.2 the generator runs in a background thread, so when it dies on the MemoryError, fit_generator receives None instead of a batch, which is exactly the ValueError you see at the start of epoch 2.
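One way to cut the memory footprint is to one-hot encode only one batch of targets at a time instead of building the full y array (and a second shuffled copy of it) every epoch. Below is a rough, untested sketch that reuses the tokenizers, helper functions, and globals from your script; the per-batch slicing is my own rearrangement, so adjust names as needed:

def generate_batches(batch_size, one_hot=False):
    # pad the integer sequences once; they are small compared to one-hot arrays
    sequences = pad_sequences(source_tokenizer.texts_to_sequences(en_texts), maxlen=max_input_length)
    target_seqs = pad_sequences(target_tokenizer.texts_to_sequences(de_texts), maxlen=max_output_length)
    n_batches = math.ceil(n_examples / batch_size)
    while True:
        # shuffle indices instead of copying the full arrays
        idx = np.random.permutation(len(sequences))
        for i in range(n_batches):
            batch_idx = idx[batch_size * i:batch_size * (i + 1)]
            if one_hot:
                X = build_one_hot_vecs(sequences[batch_idx])
            else:
                X = sequences[batch_idx]
            # one-hot encode only this batch's targets
            y = np.zeros((len(batch_idx), max_output_length, target_vocab_size), dtype=np.bool)
            for j, sent in enumerate(target_seqs[batch_idx]):
                y[j, np.arange(max_output_length), sent] = True
            yield X, y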
By the way, in Keras you can use the Bidirectional layer wrapper around an LSTM, which does what you are currently building by hand with two LSTMs and a merge.
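For example, something along these lines (a sketch against the Keras 1.x functional API; double-check the import path and the merge_mode default for your version):

from keras.layers.wrappers import Bidirectional

# inside build_model(), the bidirectional branch collapses to one line:
encoder = Bidirectional(LSTM(hidden_dim, return_sequences=False), merge_mode='concat')(input_)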
Upvotes: 1