Reputation: 4349
I am currently looking to leverage the tf.data.Dataset functionality to perform scalable training on a text-like dataset, but I'm struggling to identify a way to use the built-in TF functions to generate a hierarchical 4D representation of multi-sentence strings. In the past I would have used something along the lines of:
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
max_sent_length = 50
max_sents = 5
max_nb_words = 100
min_freq = 0
text = ["This game is a bit hard to get the hang of, but when you do it's great.", "I played it a while but it was alright. The steam was a bit of trouble. The more they move these game to steam the more of a hard time I have activating and playing a game. But in spite of that it was fun, I liked it. Now I am looking forward to anno 2205 I really want to play my way to the moon."]
df = pd.DataFrame({"text":text})
tokenizer = Tokenizer(num_words=100, filters='.')
tokenizer.fit_on_texts(df['text'].values)
encoded_docs = tokenizer.texts_to_sequences(df['text'].values)
word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))
# limit vocabulary size by token frequency
vocab = [k for k in tokenizer.word_counts.keys() if tokenizer.word_counts[k] > min_freq]
print('Vocabulary size with frequency > %d = %d' % (min_freq, len(vocab)))
max_nb_words = min(max_nb_words, len(vocab)) + 1 # index 0 is not used
print('Max number of words = %d' % max_nb_words)
def create_array(input_text=text, max_sents=5, max_num_words=1000, max_sent_length=50, tokenizer=tokenizer):
    # one document -> (1, max_sents, max_sent_length) array of word indices
    data = np.zeros((1, max_sents, max_sent_length), dtype='float32')
    for j, sent in enumerate(sent_tokenize(input_text)):
        if j < max_sents:
            wordTokens = text_to_word_sequence(sent, filters='.', lower=True, split=' ')
            k = 0
            for _, word in enumerate(wordTokens):
                if k < max_sent_length:
                    # in-vocabulary words keep their index; everything else
                    # is bucketed to max_num_words
                    if (word in tokenizer.word_index) and (tokenizer.word_index[word] <= max_num_words):
                        data[0, j, k] = tokenizer.word_index[word]
                    else:
                        data[0, j, k] = max_num_words
                    k = k + 1
    return data
my_list = [create_array(i, tokenizer=tokenizer, max_sent_length=max_sent_length, max_sents=max_sents) for i in df['text'].tolist()]
my_list
The resultant output should be:
[array([[[14., 6., 15., 1., 10., 11., 2., 16., 3., 17., 18., 7.,
          19., 20., 21., 22., 23., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0.]]], dtype=float32),
 array([[[4., 24., 5., 1., 25., 7., 5., 8., 26., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0.],
         [3., 12., 8., 1., 10., 9., 27., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0.],
         [3., 13., 28., 29., 30., 6., 2., 12., 3., 13., 9., 1.,
          11., 31., 4., 32., 33., 34., 35., 1., 6., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0.],
         [7., 36., 37., 9., 38., 5., 8., 39., 4., 40., 5., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0.],
         [41., 4., 42., 43., 44., 2., 45., 46., 4., 47., 48., 2.,
          49., 50., 51., 2., 3., 52., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0.]]], dtype=float32)]
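Each element of my_list has shape (1, max_sents, max_sent_length), so the whole corpus can then be concatenated into the single hierarchical array I would feed to a model:

# using the np imported above: stack the per-document arrays into one batch
data = np.concatenate(my_list, axis=0)
data.shape  # (2, 5, 50), i.e. (num_docs, max_sents, max_sent_length)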
I tried to leverage the info from this post to create the 4D array within a py_function:
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
from collections import Counter
text = ["This game is a bit hard to get the hang of, but when you do it's great.", "I played it a while but it was alright. The steam was a bit of trouble. The more they move these game to steam the more of a hard time I have activating and playing a game. But in spite of that it was fun, I liked it. Now I am looking forward to anno 2205 I really want to play my way to the moon."]
df = pd.DataFrame({"text":text})
training_dataset = (
    tf.data.Dataset.from_tensor_slices(
        tf.cast(df.text.values, tf.string)))

for ex in training_dataset.take(5):
    print(ex)
# Create a tokenizer instance to tokenize text data.
tokenizer = tfds.features.text.Tokenizer()
# Find unique tokens in the dataset.
lowercase = True # set this to `False` if case-sensitivity is important.
vocabulary = Counter()
for text in training_dataset:
    if lowercase:
        text = tf.strings.lower(text)
    tokens = tokenizer.tokenize(text.numpy())
    vocabulary.update(tokens)
# Select the most common tokens as final vocabulary set.
# Note: if you want all the tokens to be included,
# set `vocab_size = len(vocabulary)` instead.
vocab_size = 5000
vocabulary, _ = zip(*vocabulary.most_common(vocab_size))
# Set this to a non-zero integer if you want the texts
# to be truncated when they have more than `max_len` tokens.
max_len = 50
max_sent = 5
def encode(text):
    sent_list = []
    sents = tf.strings.split(text, sep=". ").numpy()
    if max_sent:
        sents = sents[:max_sents]
    for sent in sents:
        text_encoded = encoder.encode(sent.numpy())
        if max_len:
            text_encoded = text_encoded[:max_len]
        sent_list.append[text_encoded]
    encoded_text = tf.stack(sent_list)
    return encoded_text
# Wrap the `encode` function inside `tf.py_function` so that
# it can be used with the `map` method.
def encode_pyfn(text):
    text_encoded = tf.py_function(encode,
                                  inp=text,
                                  Tout=tf.int32)
    # (optional) Set the shapes for efficiency.
    text_encoded.set_shape([None])
    return text_encoded
# Apply encoding and then padding.
# Note: if you want the sequences in all the batches
# to have the same length, set `padded_shapes` argument accordingly.
training_dataset = training_dataset.map(encode_pyfn)
but I get the following error:
TypeError: Expected list for 'input' argument to 'EagerPyFunc' Op, not Tensor("args_0:0", shape=(), dtype=string).
Is there a better way to encode and then stack each of the individual sentences to generate the necessary 4D array?
Upvotes: 1
Views: 281
Reputation: 36704
You had a few problems. I made a custom, bogus dataset and fixed the mistakes. The "dataset" is just random letters:
tf.Tensor(b'xf wl dy fp ke dj ye xp fs', shape=(), dtype=string)
tf.Tensor(b'ek xn ir yd jp pz cw', shape=(), dtype=string)
tf.Tensor(b'gu iz hp jl uf', shape=(), dtype=string)
tf.Tensor(b'nu kc ai zo du qo fu bj nn', shape=(), dtype=string)
tf.Tensor(b'xw zo az mn vf nu', shape=(), dtype=string)
Here's what I did:

- py_function expected lists for inp and Tout
- set_shape is optional, so I removed it
- the extra numpy() call: after .numpy() the split sentences are already bytes objects, so I call .decode() on them instead

import tensorflow as tf
import tensorflow_datasets as tfds
from collections import Counter
import pandas as pd
import numpy as np
from string import ascii_lowercase as letters
inputs = [' '.join([''.join(np.random.choice(list(letters), 2))
                    for i in range(np.random.randint(5, 10))])
          for ii in range(100)]
outputs = np.random.randint(0, 2, 100).astype(str)
df = pd.DataFrame(zip(inputs, outputs), columns=['text', 'string'])
training_dataset = (
    tf.data.Dataset.from_tensor_slices(
        tf.cast(df.text.values, tf.string)))

for ex in training_dataset.take(5):
    print(ex)
tokenizer = tfds.features.text.Tokenizer()
lowercase = True
vocabulary = Counter()
for text in training_dataset:
    if lowercase:
        text = tf.strings.lower(text)
    tokens = tokenizer.tokenize(text.numpy())
    vocabulary.update(tokens)
vocab_size = 128
vocabulary, _ = zip(*vocabulary.most_common(vocab_size))
max_len = 50
max_sent = 5
encoder = tfds.features.text.TokenTextEncoder(vocabulary,
                                              lowercase=True,
                                              tokenizer=tokenizer)
def encode(text):
    sent_list = []
    sents = tf.strings.split(text, sep=". ").numpy()
    sents = sents[:max_sent]
    for sent in sents:
        text_encoded = encoder.encode(sent.decode())
        if max_len:
            text_encoded = text_encoded[:max_len]
        sent_list.append(text_encoded)
    encoded_text = tf.stack(sent_list)
    return encoded_text
def encode_pyfn(text):
    [text_encoded] = tf.py_function(encode,
                                    inp=[text],
                                    Tout=[tf.int32])
    return text_encoded
training_dataset = training_dataset.map(encode_pyfn).\
    padded_batch(batch_size=3, padded_shapes=([1, max_len]))
next(iter(training_dataset))
Final result:
<tf.Tensor: shape=(3, 1, 50), dtype=int32, numpy=
array([[[129,   1,  14, 129,  56,  15,  57, 129, 129,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0]],

       [[129,  16, 129,  58,  59,  60, 129, 129,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0]],

       [[129,  61, 129,  17, 129, 129, 129,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0]]])>
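A note on the values: ids 1 through vocab_size are the vocabulary tokens, and with TokenTextEncoder's default oov_buckets=1 the repeated 129s are presumably out-of-vocabulary tokens mapped to vocab_size + 1. A quick check (the token 'zz' here is just assumed to fall outside the 128 kept tokens):

encoder.encode('zz')  # -> [129] if 'zz' isn't one of the 128-token vocabulary

The middle axis is 1 because the random two-letter "words" contain no ". " separator, so every example is a single sentence, and padded_shapes=[1, max_len] then pads each one out to 50 tokens.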
You posted an example dataset while I was writing my answer, so here's the same approach as above, applied to your example:
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
from collections import Counter
text = ["This game is a bit hard to get the hang of, but when you do it's great.",
"I played it a while but it was alright. The steam was a bit of trouble."
" The more they move these game to steam the more of a hard time I have"
" activating and playing a game. But in spite of that it was fun, I "
"liked it. Now I am looking forward to anno 2205 I really want to "
"play my way to the moon."]
df = pd.DataFrame({"text": text})
training_dataset = (
    tf.data.Dataset.from_tensor_slices(
        tf.cast(df.text.values, tf.string)))

for ex in training_dataset.take(5):
    print(ex)
tokenizer = tfds.features.text.Tokenizer()
lowercase = True
vocabulary = Counter()
for text in training_dataset:
    if lowercase:
        text = tf.strings.lower(text)
    tokens = tokenizer.tokenize(text.numpy())
    vocabulary.update(tokens)
vocab_size = 5000
vocabulary, _ = zip(*vocabulary.most_common(vocab_size))
max_len = 15
max_sent = 5
encoder = tfds.features.text.TokenTextEncoder(vocabulary,
                                              lowercase=True,
                                              tokenizer=tokenizer)
def encode(text):
    sent_list = []
    sents = tf.strings.split(text, sep=". ").numpy()
    if max_sent:
        sents = sents[:max_sent]
    for sent in sents:
        text_encoded = encoder.encode(sent.decode())
        if max_len:
            text_encoded = text_encoded[:max_len]
        sent_list.append(text_encoded)
    encoded_text = tf.concat(sent_list, axis=0)
    return encoded_text
def encode_pyfn(text):
    [text_encoded] = tf.py_function(encode, inp=[text], Tout=[tf.int32])
    return text_encoded
training_dataset = training_dataset.map(encode_pyfn).\
    padded_batch(batch_size=4, padded_shapes=([max_len*max_sent,]))
next(iter(training_dataset))
Output:
<tf.Tensor: shape=(2, 75), dtype=int32, numpy=
array([[14,  7, 15,  1, 10, 11,  2, 16,  3, 17,  6,  8, 18, 19, 20,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 5, 23,  4,  1, 24,  8,  4,  9, 25,  3, 12,  9,  1, 10,  6, 26,
         3, 13, 27, 28, 29,  7,  2, 12,  3, 13,  6,  1, 11, 30,  5,  8,
        35, 36,  6, 37,  4,  9, 38,  5, 39,  4, 40,  5, 41, 42, 43,  2,
        44, 45,  5, 46, 47,  2, 48, 49, 50,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])>
Here's an update based on your last comment: each sentence is now padded out to max_len with pad_sequences and each review is padded to max_sent sentences, so every example comes out as a proper (max_sent, max_len) matrix instead of one flat vector:
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
from collections import Counter
from tensorflow.keras.preprocessing.sequence import pad_sequences
text = ["I played it a while but it was alright. The steam was a bit of trouble."
" The more they move these game to steam the more of a hard time I have"
" activating and playing a game. But in spite of that it was fun, I "
"liked it. Now I am looking forward to anno 2205 I really want to "
"play my way to the moon.",
"This game is a bit hard to get the hang of, but when you do it's great."]
df = pd.DataFrame({"text": text})
training_dataset = (
    tf.data.Dataset.from_tensor_slices(
        tf.cast(df.text.values, tf.string)))

for ex in training_dataset.take(5):
    print(ex)
tokenizer = tfds.features.text.Tokenizer()
lowercase = True
vocabulary = Counter()
for text in training_dataset:
    if lowercase:
        text = tf.strings.lower(text)
    tokens = tokenizer.tokenize(text.numpy())
    vocabulary.update(tokens)
vocab_size = 5000
vocabulary, _ = zip(*vocabulary.most_common(vocab_size))
max_len = 15
max_sent = 5
encoder = tfds.features.text.TokenTextEncoder(vocabulary,
                                              lowercase=True,
                                              tokenizer=tokenizer)
def encode(text):
    sent_list = []
    sents = tf.strings.split(text, sep=". ").numpy()
    if max_sent:
        sents = sents[:max_sent]
    for sent in sents:
        text_encoded = encoder.encode(sent.decode())
        if max_len:
            text_encoded = text_encoded[:max_len]
        # pad_sequences pre-pads each sentence to max_len -> shape (1, max_len)
        sent_list.append(pad_sequences([text_encoded], max_len))
    if len(sent_list) < max_sent:
        # fill the sentence axis with all-zero rows; int32 so the dtype
        # matches pad_sequences' output in the concat below
        sent_list.append([tf.zeros(max_len, dtype=tf.int32)
                          for _ in range(max_sent - len(sent_list))])
    return tf.concat(sent_list, axis=0)
def encode_pyfn(text):
    [text_encoded] = tf.py_function(encode, inp=[text], Tout=[tf.int32])
    return text_encoded
training_dataset = training_dataset.map(encode_pyfn).batch(batch_size=4)
next(iter(training_dataset))
<tf.Tensor: shape=(2, 5, 15), dtype=int32, numpy=
array([[[14,  7, 15,  1, 10, 11,  2, 16,  3, 17,  6,  8, 18, 19, 20],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]],

       [[ 0,  0,  0,  0,  0,  0,  5, 23,  4,  1, 24,  8,  4,  9, 25],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  3, 12,  9,  1, 10,  6, 26],
        [ 3, 13, 27, 28, 29,  7,  2, 12,  3, 13,  6,  1, 11, 30,  5],
        [ 0,  0,  0,  0,  8, 35, 36,  6, 37,  4,  9, 38,  5, 39,  4],
        [40,  5, 41, 42, 43,  2, 44, 45,  5, 46, 47,  2, 48, 49, 50]]])>
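As an aside, if you're on a newer TF where tfds.features.text has been deprecated, roughly the same (batch, max_sent, max_len) encoding can be built with tf.keras.layers.TextVectorization and no py_function at all. A minimal sketch, assuming TF 2.6+ (TextVectorization tokenizes a bit differently than TokenTextEncoder, so the ids won't match the ones above):

import tensorflow as tf

max_len, max_sent = 15, 5
texts = ["This game is a bit hard to get the hang of, but when you do it's great.",
         "I played it a while but it was alright. The steam was a bit of trouble."]

# learn the vocabulary; each call pads/truncates the token ids to max_len
vectorizer = tf.keras.layers.TextVectorization(max_tokens=5000,
                                               output_sequence_length=max_len)
vectorizer.adapt(texts)

def encode(text):
    # split the scalar string into at most max_sent sentences
    sents = tf.strings.split(text, sep=". ")[:max_sent]
    encoded = vectorizer(sents)  # (n_sents, max_len)
    # zero-pad the sentence axis so every example is (max_sent, max_len)
    return tf.pad(encoded, [[0, max_sent - tf.shape(encoded)[0]], [0, 0]])

ds = tf.data.Dataset.from_tensor_slices(texts).map(encode).batch(2)
next(iter(ds)).shape  # (2, 5, 15)

Because every op here is graph-compatible, set_shape and the py_function wrapper aren't needed.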
Upvotes: 1