scribbles

Reputation: 4349

Generating hierarchical text representation within TensorFlow dataset

I am currently looking to leverage tf.data.Dataset to perform scalable training on a text-like dataset, but I am struggling to find a way to use the built-in TF functions to generate a hierarchical 4D representation of multi-sentence strings. In the past I would have used something along the lines of:

import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence

max_sent_length = 50        
max_sents       = 5
max_nb_words    = 100
min_freq        = 0

text = ["This game is a bit hard to get the hang of, but when you do it's great.", "I played it a while but it was alright. The steam was a bit of trouble. The more they move these game to steam the more of a hard time I have activating and playing a game. But in spite of that it was fun, I liked it. Now I am looking forward to anno 2205 I really want to play my way to the moon."]

df = pd.DataFrame({"text":text})

tokenizer = Tokenizer(num_words= 100, filters='.')
tokenizer.fit_on_texts(df['text'].values)

encoded_docs = tokenizer.texts_to_sequences(df['text'].values)

word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))

# limit vocabulary size by token frequency
vocab = [k for k in tokenizer.word_counts.keys() if tokenizer.word_counts[k] > min_freq]
print('Vocabulary size with frequency > %d = %d' % (min_freq, len(vocab)))

max_nb_words = min(max_nb_words, len(vocab)) + 1 # index 0 is not used

print('Max number of words = %d' % max_nb_words)

def create_array(input_text, max_sents=5, max_num_words=1000, max_sent_length=50, tokenizer=tokenizer):
    # one document -> a (1, max_sents, max_sent_length) array of word indices, zero-padded
    data = np.zeros((1, max_sents, max_sent_length), dtype='float32')

    for j, sent in enumerate(sent_tokenize(input_text)):
        if j < max_sents:
            wordTokens = text_to_word_sequence(sent, filters='.', lower=True, split=' ')
            k = 0
            for _, word in enumerate(wordTokens):
                if k < max_sent_length:
                    # words outside the vocabulary (or above the index cap) all map to max_num_words
                    if (word in tokenizer.word_index) and (tokenizer.word_index[word] <= max_num_words):
                        data[0, j, k] = tokenizer.word_index[word]
                    else:
                        data[0, j, k] = max_num_words
                    k = k + 1

    return data

my_list = [create_array(i, tokenizer=tokenizer, max_sent_length=max_sent_length, max_sents=max_sents) for i in df['text'].tolist()]
my_list

The resultant output should be:

[array([[[14.,  6., 15.,  1., 10., 11.,  2., 16.,  3., 17., 18.,  7.,
          19., 20., 21., 22., 23.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.],
         [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.],
         [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.],
         [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.],
         [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.]]], dtype=float32),
 array([[[ 4., 24.,  5.,  1., 25.,  7.,  5.,  8., 26.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.],
         [ 3., 12.,  8.,  1., 10.,  9., 27.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.],
         [ 3., 13., 28., 29., 30.,  6.,  2., 12.,  3., 13.,  9.,  1.,
          11., 31.,  4., 32., 33., 34., 35.,  1.,  6.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.],
         [ 7., 36., 37.,  9., 38.,  5.,  8., 39.,  4., 40.,  5.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.],
         [41.,  4., 42., 43., 44.,  2., 45., 46.,  4., 47., 48.,  2.,
          49., 50., 51.,  2.,  3., 52.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.]]], dtype=float32)]

I tried to leverage the info from this post to create the 4D array within a tf.py_function:

import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
from collections import Counter

text = ["This game is a bit hard to get the hang of, but when you do it's great.", "I played it a while but it was alright. The steam was a bit of trouble. The more they move these game to steam the more of a hard time I have activating and playing a game. But in spite of that it was fun, I liked it. Now I am looking forward to anno 2205 I really want to play my way to the moon."]

df = pd.DataFrame({"text":text})


training_dataset = (
    tf.data.Dataset.from_tensor_slices(
        tf.cast(df.text.values, tf.string)))

for ex in training_dataset.take(5):
    print(ex)
    
# Create a tokenizer instance to tokenize text data.
tokenizer = tfds.features.text.Tokenizer()

# Find unique tokens in the dataset.
lowercase = True  # set this to `False` if case-sensitivity is important.
vocabulary = Counter()
for text in training_dataset:
    if lowercase:
        text = tf.strings.lower(text)
    tokens = tokenizer.tokenize(text.numpy())
    vocabulary.update(tokens)

# Select the most common tokens as final vocabulary set.
# Note: if you want all the tokens to be included,
# set `vocab_size = len(vocabulary)` instead.
vocab_size = 5000
vocabulary, _ = zip(*vocabulary.most_common(vocab_size))
    
# Set this to a non-zero integer if you want the texts
# to be truncated when they have more than `max_len` tokens.
max_len = 50
max_sent = 5

def encode(text):
    sent_list = []
    sents = tf.strings.split(text, sep=". ").numpy()
    if max_sent:
        sents = sents[:max_sents]
    for sent in sents: 
        text_encoded = encoder.encode(sent.numpy())
        if max_len:
            text_encoded = text_encoded[:max_len]
            sent_list.append[text_encoded]
    encoded_text = tf.stack(sent_list)
    
    return encoded_text


# Wrap `encode` function inside `tf.py_function` so that
# it could be used with `map` method.
def encode_pyfn(text):
    text_encoded = tf.py_function(encode,
                                  inp=text,
                                  Tout=tf.int32)

    # (optional) Set the shapes for efficiency.
    text_encoded.set_shape([None])

    return text_encoded

# Apply encoding and then padding.
# Note: if you want the sequences in all the batches 
# to have the same length, set `padded_shapes` argument accordingly.
training_dataset = training_dataset.map(encode_pyfn)

but I get the following error:

TypeError: Expected list for 'input' argument to 'EagerPyFunc' Op, not Tensor("args_0:0", shape=(), dtype=string).

Is there a better way to encode and then stack each of the individual sentences to generate the necessary 4D array?

Upvotes: 1

Views: 281

Answers (1)

Nicolas Gervais

Reputation: 36704

You had a few problems. I made a custom, bogus dataset and fixed the mistakes. The "dataset" is just random letters:

tf.Tensor(b'xf wl dy fp ke dj ye xp fs', shape=(), dtype=string)
tf.Tensor(b'ek xn ir yd jp pz cw', shape=(), dtype=string)
tf.Tensor(b'gu iz hp jl uf', shape=(), dtype=string)
tf.Tensor(b'nu kc ai zo du qo fu bj nn', shape=(), dtype=string)
tf.Tensor(b'xw zo az mn vf nu', shape=(), dtype=string)

Here's what I did:

  • The inp argument of tf.py_function expects a list of tensors (and Tout a list of dtypes); see the minimal sketch below
  • You used brackets instead of parentheses when appending to a list (sent_list.append[...] instead of sent_list.append(...))
  • set_shape is optional, so I removed it
  • Removed the .numpy() call on each sentence: after splitting and calling .numpy(), every sentence is already a bytes object, so it just needs .decode()
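
In isolation, the first fix is just passing inp and Tout as lists and unpacking the one-element result (a minimal sketch; encode is the same function defined further down):

# before (raises the TypeError): tf.py_function(encode, inp=text, Tout=tf.int32)
# after: wrap both arguments in lists and unpack the single returned tensor
[text_encoded] = tf.py_function(encode, inp=[text], Tout=[tf.int32])

Here is the full, fixed example:
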
import tensorflow as tf
import tensorflow_datasets as tfds
from collections import Counter
import pandas as pd
import numpy as np
from string import ascii_lowercase as letters


inputs = [' '.join([''.join(np.random.choice(list(letters), 2))
                    for i in range(np.random.randint(5, 10))])
          for ii in range(100)]

outputs = np.random.randint(0, 2, 100).astype(str)

df = pd.DataFrame(zip(inputs, outputs), columns=['text', 'string'])

training_dataset = (
    tf.data.Dataset.from_tensor_slices(
        tf.cast(df.text.values, tf.string)))

for ex in training_dataset.take(5):
    print(ex)


tokenizer = tfds.features.text.Tokenizer()

lowercase = True
vocabulary = Counter()
for text in training_dataset:
    if lowercase:
        text = tf.strings.lower(text)
    tokens = tokenizer.tokenize(text.numpy())
    vocabulary.update(tokens)

vocab_size = 128
vocabulary, _ = zip(*vocabulary.most_common(vocab_size))

max_len = 50
max_sent = 5

encoder = tfds.features.text.TokenTextEncoder(vocabulary,
                                              lowercase=True,
                                              tokenizer=tokenizer)

def encode(text):
    sent_list = []
    sents = tf.strings.split(text, sep=". ").numpy()
    sents = sents[:max_sent]
    for sent in sents:
        text_encoded = encoder.encode(sent.decode())
        if max_len:
            text_encoded = text_encoded[:max_len]
            sent_list.append(text_encoded)
    encoded_text = tf.stack(sent_list)

    return encoded_text


def encode_pyfn(text):
    [text_encoded] = tf.py_function(encode,
                                  inp=[text],
                                  Tout=[tf.int32])
    return text_encoded

training_dataset = training_dataset.map(encode_pyfn).\
    padded_batch(batch_size=3, padded_shapes=([1, max_len]))

next(iter(training_dataset))

Final result:

<tf.Tensor: shape=(3, 1, 50), dtype=int32, numpy=
array([[[129,   1,  14, 129,  56,  15,  57, 129, 129,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0]],
       [[129,  16, 129,  58,  59,  60, 129, 129,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0]],
       [[129,  61, 129,  17, 129, 129, 129,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0]]])>

You posted another example dataset just as I was posting my answer, so here's the same thing I did above, but with your example:

import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
from collections import Counter


text = ["This game is a bit hard to get the hang of, but when you do it's great.",
        "I played it a while but it was alright. The steam was a bit of trouble."
        " The more they move these game to steam the more of a hard time I have"
        " activating and playing a game. But in spite of that it was fun, I "
        "liked it. Now I am looking forward to anno 2205 I really want to "
        "play my way to the moon."]


df = pd.DataFrame({"text": text})

training_dataset = (
    tf.data.Dataset.from_tensor_slices(
        tf.cast(df.text.values, tf.string)))

for ex in training_dataset.take(5):
    print(ex)

tokenizer = tfds.features.text.Tokenizer()

lowercase = True
vocabulary = Counter()
for text in training_dataset:
    if lowercase:
        text = tf.strings.lower(text)
    tokens = tokenizer.tokenize(text.numpy())
    vocabulary.update(tokens)


vocab_size = 5000
vocabulary, _ = zip(*vocabulary.most_common(vocab_size))

max_len = 15
max_sent = 5
encoder = tfds.features.text.TokenTextEncoder(vocabulary,
                                              lowercase=True,
                                              tokenizer=tokenizer)

def encode(text):
    sent_list = []
    sents = tf.strings.split(text, sep=". ").numpy()
    if max_sent:
        sents = sents[:max_sent]
    for sent in sents:
        text_encoded = encoder.encode(sent.decode())
        if max_len:
            text_encoded = text_encoded[:max_len]
            sent_list.append(text_encoded)
    encoded_text = tf.concat(sent_list, axis=0)

    return encoded_text


def encode_pyfn(text):
    [text_encoded] = tf.py_function(encode, inp=[text], Tout=[tf.int32])
    return text_encoded


training_dataset = training_dataset.map(encode_pyfn).\
    padded_batch(batch_size=4, padded_shapes=([max_len*max_sent,]))

next(iter(training_dataset))

Output:

<tf.Tensor: shape=(2, 75), dtype=int32, numpy=
array([[14,  7, 15,  1, 10, 11,  2, 16,  3, 17,  6,  8, 18, 19, 20,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 5, 23,  4,  1, 24,  8,  4,  9, 25,  3, 12,  9,  1, 10,  6, 26,
         3, 13, 27, 28, 29,  7,  2, 12,  3, 13,  6,  1, 11, 30,  5,  8,
        35, 36,  6, 37,  4,  9, 38,  5, 39,  4, 40,  5, 41, 42, 43,  2,
        44, 45,  5, 46, 47,  2, 48, 49, 50,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])>
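
Note that tf.concat flattens each review into a single vector of at most max_len * max_sent tokens, so the sentence boundaries are lost. A quick shape check on the dataset built just above:

for batch in training_dataset.take(1):
    print(batch.shape)  # (2, 75), i.e. (reviews in the batch, max_len * max_sent)

If you want the explicit (sentences, words) grid back, each sentence has to be padded individually, which is what the next version does.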

Here's an update based on your last comment:

import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
from collections import Counter
from tensorflow.keras.preprocessing.sequence import pad_sequences


text = ["I played it a while but it was alright. The steam was a bit of trouble."
        " The more they move these game to steam the more of a hard time I have"
        " activating and playing a game. But in spite of that it was fun, I "
        "liked it. Now I am looking forward to anno 2205 I really want to "
        "play my way to the moon.",
        "This game is a bit hard to get the hang of, but when you do it's great."]


df = pd.DataFrame({"text": text})

training_dataset = (
    tf.data.Dataset.from_tensor_slices(
        tf.cast(df.text.values, tf.string)))

for ex in training_dataset.take(5):
    print(ex)

tokenizer = tfds.features.text.Tokenizer()

lowercase = True
vocabulary = Counter()
for text in training_dataset:
    if lowercase:
        text = tf.strings.lower(text)
    tokens = tokenizer.tokenize(text.numpy())
    vocabulary.update(tokens)


vocab_size = 5000
vocabulary, _ = zip(*vocabulary.most_common(vocab_size))

max_len = 15
max_sent = 5
encoder = tfds.features.text.TokenTextEncoder(vocabulary,
                                              lowercase=True,
                                              tokenizer=tokenizer)

def encode(text):
    sent_list = []
    sents = tf.strings.split(text, sep=". ").numpy()
    if max_sent:
        sents = sents[:max_sent]
    for sent in sents:
        text_encoded = encoder.encode(sent.decode())
        if max_len:
            text_encoded = text_encoded[:max_len]
            # pad_sequences pads each sentence to max_len (zeros go at the front by default)
            sent_list.append(pad_sequences([text_encoded], max_len))
    if len(sent_list) < max_sent:
        # fill the remaining sentence slots with all-zero rows so every review comes out (max_sent, max_len)
        sent_list.append([tf.zeros(max_len, dtype=tf.int32)
                          for _ in range(max_sent - len(sent_list))])
    return tf.concat(sent_list, axis=0)


def encode_pyfn(text):
    [text_encoded] = tf.py_function(encode, inp=[text], Tout=[tf.int32])
    return text_encoded


training_dataset = training_dataset.map(encode_pyfn).batch(batch_size=4)

next(iter(training_dataset))

Output:

<tf.Tensor: shape=(2, 5, 15), dtype=int32, numpy=
array([[[14,  7, 15,  1, 10, 11,  2, 16,  3, 17,  6,  8, 18, 19, 20],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]],

       [[ 0,  0,  0,  0,  0,  0,  5, 23,  4,  1, 24,  8,  4,  9, 25],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  3, 12,  9,  1, 10,  6, 26],
        [ 3, 13, 27, 28, 29,  7,  2, 12,  3, 13,  6,  1, 11, 30,  5],
        [ 0,  0,  0,  0,  8, 35, 36,  6, 37,  4,  9, 38,  5, 39,  4],
        [40,  5, 41, 42, 43,  2, 44, 45,  5, 46, 47,  2, 48, 49, 50]]])>
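
One optional extra: tensors coming out of tf.py_function have no static shape, so if downstream Keras layers complain about unknown shapes you can declare the fixed (max_sent, max_len) grid yourself. A minimal sketch of that tweak to encode_pyfn, under the same setup as above:

def encode_pyfn(text):
    [text_encoded] = tf.py_function(encode, inp=[text], Tout=[tf.int32])
    # every example is exactly (max_sent, max_len) after the zero-row fill,
    # so the static shape can be declared for downstream layers
    text_encoded.set_shape([max_sent, max_len])
    return text_encoded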

Upvotes: 1
