Reputation: 25
For some reason, the NumPy array (of shape (55, 290)) that I write to a TensorFlow record does not match what I get back when I read that same record in again.
Here is the code I used to write the tfrecord:
import numpy as np
import tensorflow as tf

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _float_feature(value):
    """Returns a float_list from a float / double."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))

def serialize_data(X, y):
    feature = {
        'n_wavelength_channels': _int64_feature(55),
        'n_time_steps': _int64_feature(290),
        'rel_radii': _float_feature(y),
        'rel_flux': _float_feature(X.flatten()),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature)).SerializeToString()

def tf_record_generator():
    X_file_chunk = ["E:/ml_data_challenge_database/noisy_train/0001_01_01.txt"]
    y_file_chunk = ["E:/ml_data_challenge_database/params_train/0001_01_01.txt"]
    for X_file, y_file in zip(X_file_chunk, y_file_chunk):
        X = np.genfromtxt(X_file, dtype=np.float32)[:, 10:]
        y = np.genfromtxt(y_file, dtype=np.float32)
        yield serialize_data(X, y)

n_splits = 1
tfrecord_filename = "training_record_{}.tfrecords"
for index in range(n_splits):  # number of splits
    writer = tf.data.experimental.TFRecordWriter(tfrecord_filename.format(index))
    serialized_features_dataset = tf.data.Dataset.from_generator(tf_record_generator, output_types=tf.string, output_shapes=())
    writer.write(serialized_features_dataset)
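For reference, I also inspected the first serialized example right after writing it (a quick sketch, not part of the pipeline above; the expected lengths just come from the shapes mentioned earlier):

# Hypothetical inspection of the written record (not part of the original writer code).
raw_dataset = tf.data.TFRecordDataset("training_record_0.tfrecords")
for raw_record in raw_dataset.take(1):
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    print(len(example.features.feature['rel_flux'].float_list.value))   # expect 15950 (= 55 * 290)
    print(len(example.features.feature['rel_radii'].float_list.value))  # expect 55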
and here is the code I use to read the record that was just written:
import matplotlib.pyplot as plt

def parse_record(record):
    name_to_features = {
        'n_wavelength_channels': tf.io.FixedLenFeature([], tf.int64),
        'n_time_steps': tf.io.FixedLenFeature([], tf.int64),
        'rel_radii': tf.io.FixedLenFeature([55], tf.float32),
        'rel_flux': tf.io.FixedLenFeature([55*290], tf.float32),
    }
    return tf.io.parse_single_example(record, name_to_features)

def decode_record(record):
    parsed_record = parse_record(record)
    flux = parsed_record['rel_flux']
    radii = parsed_record['rel_radii']
    return flux, radii

def get_batched_dataset(filenames):
    option_no_order = tf.data.Options()
    option_no_order.experimental_deterministic = False
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.with_options(option_no_order)
    dataset = dataset.interleave(tf.data.TFRecordDataset, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.map(decode_record, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.repeat()
    dataset = dataset.shuffle(2048)
    dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset

def get_training_dataset():
    return get_batched_dataset(training_filenames)

BATCH_SIZE = 1
training_filenames = tf.io.gfile.glob("training_record_*.tfrecords")
training_data = get_training_dataset()
X_batch, y_batch = next(iter(training_data))

def show_batch(X_batch, y_batch):
    for i in X_batch:
        plt.plot(i.reshape(290, 55))
    plt.show()

show_batch(X_batch.numpy(), y_batch.numpy())
This is part of the input for a neural network I'm working on, and I tried to modify it to create a tfrecord from a single training observation and then output that observation.
Here is what the output of the tfrecord looks like:
and here is what it should look like (the raw observation):
X = np.genfromtxt("E:/ml_data_challenge_database/noisy_train/0001_01_01.txt")
plt.plot(X.T[10:,:])
plt.show()
(plotting all 55 rows simultaneously).
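In case it's useful, this is roughly how I checked the y values against the raw file (a quick sketch; the np.allclose call isn't part of the pipeline above):

# Hypothetical check comparing the decoded radii to the raw params file.
y_raw = np.genfromtxt("E:/ml_data_challenge_database/params_train/0001_01_01.txt", dtype=np.float32)
print(np.allclose(y_batch.numpy()[0], y_raw))  # prints True here - the radii survive the round trip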
The y values read in from the tfrecord match the true y values, but I've got no idea why the X data comes back looking wrong. I've been following a number of guides closely but am very new to working with TF data. Could someone please take a look at my code and point out anything I may have done wrong? Thank you very much in advance!
Here's a Google Drive link to the X data (referenced in "X_file_chunk" inside tf_record_generator) and here's one to the y data (also inside tf_record_generator).
Upvotes: 0
Views: 138
Reputation: 702
When you reshape back to 2D you're mixing up the dimensions - it should be i.reshape(55, 290).T.
With that change, the plot is identical to the original data.
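For example (a minimal sketch using the same names as in the question):

def show_batch(X_batch, y_batch):
    # rel_flux was written with X.flatten() on a (55, 290) array (row-major C order),
    # so undo the flatten with reshape(55, 290), then transpose so each channel is one line
    for i in X_batch:
        plt.plot(i.reshape(55, 290).T)
    plt.show()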
BTW, your data is actually float64, so when you read and plot the original file you are working in float64, while the data coming out of the tf.Dataset is float32. That's not the reason your plots look different, though.
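You can see the dtype difference with a quick check (sketch; variable names taken from your code):

import numpy as np

X_raw = np.genfromtxt("E:/ml_data_challenge_database/noisy_train/0001_01_01.txt")
print(X_raw.dtype)    # float64 - genfromtxt's default
print(X_batch.dtype)  # float32 - tf.train.FloatList stores 32-bit floats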
Upvotes: 1