Correctly structuring text data for text generation with Tensorflow model

Question

I am trying to train my model to generate sentences no longer that 210 characters. From what I have read I have only seen training on 'continuous' text. Like a book. However I am trying to train my model on single sentences.

I'm pretty new to tensorflow and ML so right now I am able to train my model but it generates garbage, seemingly random text. I have 10,000 sentences so I think I have sufficient data.

Overview of my data

Structure [['SENTENCE'], ['SENTENCE2']...]

Data Prep

tokenizer = keras.preprocessing.text.Tokenizer(num_words=209, lower=False, char_level=True, filters='#$%&()*+-<=>@[\]^_`{|}~	
')
tokenizer.fit_on_texts(df['title'].values)
df['encoded_with_keras'] = tokenizer.texts_to_sequences(df['title'].values)

dataset = df['encoded_with_keras'].values
dataset = tf.keras.preprocessing.sequence.pad_sequences(dataset, padding='post')

dataset = dataset.flatten()

dataset = tf.data.Dataset.from_tensor_slices(dataset)

sequences = dataset.batch(seq_len+1, drop_remainder=True)

def create_seq_targets(seq):
    input_txt = seq[:-1]
    target_txt = seq[1:]
    return input_txt, target_txt

dataset = sequences.map(create_seq_targets)

dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)

Model

def create_model(vocab_size, embed_dim, rnn_neurons, batch_size):
    model = Sequential()
    model.add(Embedding(vocab_size, embed_dim, batch_input_shape=[batch_size, None],input_length=209, mask_zero=True))
    model.add(LSTM(rnn_neurons, return_sequences=True, stateful=True,))
    model.add(Dropout(0.2))
    model.add(Dense(258, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(optimizer='adam', loss="sparse_categorical_crossentropy")
    return model

When I give the model a sequence to start from I get back absolute nonsense and eventually the model predicts a 0 which is not in the char_index mapping.

Edit

Text Generation


epochs = 2

# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)


model = create_model(vocab_size = vocab_size,
  embed_dim=embed_dim,
  rnn_neurons=rnn_neurons,
  batch_size=1)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]))

def generate_text(model, start_string):
  num_generate = 200

  input_eval = [char_2_index[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  text_generated = []

  temperature = 1

  # model.reset_states()
  for i in range(num_generate):
      print(text_generated)
      predictions = model(input_eval)

      predictions = tf.squeeze(predictions, 0)

      predictions = predictions / temperature

      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
      print(predicted_id)

      input_eval = tf.expand_dims([predicted_id], 0)

      text_generated.append(index_2_char[predicted_id])

  return (start_string + ''.join(text_generated))

Correctly structuring text data for text generation with Tensorflow model

Answers (1)

Generating new senteces

Related Questions