gorjan
gorjan

Reputation: 5555

Tensorflow bucketing: Pad with a sequence

Given the following code:

def elements_gen():
    """Yield ``(sequence, label)`` tuples for four hard-coded examples.

    Each sequence is a variable-length list of two-element integer rows and
    each label is a single integer.
    """
    # Sequences and labels are kept side by side so each pair reads as a unit.
    samples = (
        ([[11, 22], [22, 22], [33, 22]], 1),
        ([[33, 22], [44, 22], [55, 22], [66, 22], [77, 22]], 2),
        ([[11, 22], [22, 22]], 1),
        ([[88, 22], [99, 22], [11, 22], [22, 22]], 2),
    )
    yield from samples


def element_length_fn(x, y):
    """Return the size of the first dimension of ``x``.

    ``y`` is accepted but not used.
    """
    dims = tf.shape(x)
    return dims[0]


# Build a dataset from the generator; each element is declared as a
# ([None, 2] int32, scalar int32) pair.
dataset = tf.data.Dataset.from_generator(
    generator=elements_gen,
    output_shapes=([None, 2], []),
    output_types=(tf.int32, tf.int32),
)

# Bucket elements by the length returned from element_length_fn.
dataset = dataset.apply(
    tf.data.experimental.bucket_by_sequence_length(
        element_length_func=element_length_fn,
        bucket_batch_sizes=[2, 2, 2],
        bucket_boundaries=[0, 5],
        # NOTE: this list is the argument that differs from the working
        # version in the answer below, which passes the tuple (0, 0) instead.
        padding_values=[0, 0],
    )
)

batch = dataset.make_one_shot_iterator().get_next()

# Fetch and print four batches.
with tf.Session() as sess:
    for _ in range(4):
        print("Get_next:")
        print(sess.run(batch))

I get the following error:

TypeError: If shallow structure is a sequence, input must also be a sequence. Input has type: <class 'list'>.

What I want to achieve is to pad the sequences with the element [0, 0].

Please let me know if the question is confusing, so I can include more information or express myself more clearly.

Upvotes: 2

Views: 483

Answers (1)

gorjan
gorjan

Reputation: 5555

The approach recommended by @jdehesa (passing padding_values as a tuple, (0, 0), instead of a list) works for what I needed and results in the following code:

def elements_gen():
    """Generate the four hard-coded ``(sequence, label)`` examples in order.

    Sequences are variable-length lists of two-element integer rows; labels
    are plain integers.
    """
    labels = [1, 2, 1, 2]
    sequences = [
        [[11, 22], [22, 22], [33, 22]],
        [[33, 22], [44, 22], [55, 22], [66, 22], [77, 22]],
        [[11, 22], [22, 22]],
        [[88, 22], [99, 22], [11, 22], [22, 22]],
    ]
    # zip already produces (sequence, label) tuples, so delegate to it directly.
    yield from zip(sequences, labels)


def element_length_fn(x, y):
    """Return the length of ``x`` along its first axis; ``y`` is ignored."""
    shape_of_x = tf.shape(x)
    return shape_of_x[0]


# Wrap the generator; each element is declared as a
# ([None, 2] int32, scalar int32) pair.
dataset = tf.data.Dataset.from_generator(
    generator=elements_gen,
    output_shapes=([None, 2], []),
    output_types=(tf.int32, tf.int32),
)

# The padding values are passed as a tuple here, not a list.
bucketing = tf.data.experimental.bucket_by_sequence_length(
    element_length_func=element_length_fn,
    bucket_batch_sizes=[2, 2, 2],
    bucket_boundaries=[0, 5],
    padding_values=(0, 0),
)
dataset = dataset.apply(bucketing)

iterator = dataset.make_one_shot_iterator()
batch = iterator.get_next()

# Fetch and print two batches.
with tf.Session() as sess:
    for _ in range(2):
        print("Get_next:")
        fetched = sess.run(batch)
        print(fetched)

Upvotes: 3

Related Questions