Azz

Reputation: 11

How to load .npy files from different directories in tensorflow data pipeline from a list containing filenames?

I am trying to load NumPy arrays of shape (x, 1, 768) and labels of shape (1, 768) into tf.data. My code is below:

import os, glob
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # set before importing TensorFlow so it takes effect

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, initializers
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.sequence import pad_sequences
from natsort import natsorted

#################################################################

#File Paths
text_path = 'data/featured/*'
tags_path = 'data/encoded_tags/*'

text_files = natsorted(glob.glob(text_path)) # Load the array filenames
tags_files = natsorted(glob.glob(tags_path)) # Load the label filenames

text_train = text_files[:round(0.9*len(text_files))]
tags_train = tags_files[:round(0.9*len(tags_files))]

#Parameters

AUTO = tf.data.experimental.AUTOTUNE
index = 0
PADDING_LENGTH = 768
BATCH_LENGTH = 1
LEARNING_RATE = 0.01
OPTIMISER = 'ADAM'

# Define the training parameters here.
#################################################################

#@tf.function
def load_files(filename1, filename2):
    tags = np.load(filename2, allow_pickle=True)
    arr = np.load(filename1, allow_pickle=True)

    # Perform padding and convert back to tensor

    return arr, tags

def load_dataset(text_files, tag_files):
    dataset = tf.data.Dataset.from_tensor_slices((text_files, tag_files))  # tuple, so each element is a (text, tag) filename pair
    print(dataset)
    #dataset = dataset.map(load_files)
    #dataset = dataset.map(lambda x: tf.py_function(load_files, [x], tf.float64))
    dataset = dataset.map(map_func=load_files, num_parallel_calls=AUTO)
    return dataset

def get_batch_dataset(filename1, filename2):
    dataset = load_dataset(filename1, filename2)
    dataset = dataset.batch(BATCH_LENGTH)
    dataset = dataset.prefetch(AUTO).repeat()
    return dataset

def get_training_dataset():
    return get_batch_dataset(text_train, tags_train) 


dataset = get_batch_dataset(text_train, tags_train)

When I try to read the arrays and their labels from the filenames, it throws the following error:

TypeError: expected str, bytes or os.PathLike object, not Tensor

Things I have tried:

filename1.numpy()  # doesn't work:
AttributeError: 'Tensor' object has no attribute 'numpy'
filename1.as_string()  # doesn't work either:
AttributeError: 'Tensor' object has no attribute 'as_string'

I just need to read the arrays as NumPy arrays because I need to pad them. I have tried reading the files with tf.io.read_file(), but somehow it messes up the arrays and the returned shape is (None,). Each array (for a unique filename) is of length x as mentioned earlier, and I need to perform padding and output a fixed-size array in order to feed it to a neural network.
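Roughly, this is the padding I am after, in plain NumPy (just a sketch; MAX_ROWS is a made-up target size for the variable x axis):

MAX_ROWS = 128  # hypothetical fixed size for the variable first axis

def pad_to_fixed(arr):
    # arr has shape (x, 1, 768); zero-pad the first axis up to MAX_ROWS
    pad_rows = max(0, MAX_ROWS - arr.shape[0])  # assumes x <= MAX_ROWS
    return np.pad(arr, ((0, pad_rows), (0, 0), (0, 0)), mode='constant')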

Thank you in advance for the help.

Upvotes: 1

Views: 2484

Answers (2)

DatenDenker

Reputation: 60

The answer given by @AAudibert is nearly perfect, but tf.py_function loses the static shape information, so the shapes of the tensors in the dataset will be unknown. If you know what the shape has to be, just map the dataset through a function like this:

def reshape(x, y):
  X = tf.reshape(x, [16, 160, 160])
  Y = tf.reshape(y, [])
  return X, Y
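
and apply it right after the loading map (the shapes above are just examples; substitute your own):

dataset = dataset.map(load_file)
dataset = dataset.map(reshape)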

In case you don't know the shape and have to determine it at runtime:

def load_files_py(filename1, filename2):
    arr = np.load(filename1.numpy(), allow_pickle=True)
    tags = np.load(filename2.numpy(), allow_pickle=True)
    # return the runtime shapes too, so they can be restored after py_function
    return arr, tags, arr.shape, tags.shape


def load_file(filename1, filename2):
    temp = tf.py_function(load_files_py, inp=[filename1, filename2], Tout=[arr_type, tags_type, tf.int32, tf.int32])
    arr = tf.reshape(temp[0], temp[2])
    tags = tf.reshape(temp[1], temp[3])
    return arr, tags

I know there might be some syntactical mistakes, but you'll get the idea.
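
For example, it could be wired into the question's pipeline like this (arr_type and tags_type are placeholders; tf.float64 is just my guess at the dtype of the saved arrays):

arr_type = tags_type = tf.float64  # assumed dtypes of the .npy files
dataset = tf.data.Dataset.from_tensor_slices((text_train, tags_train))
dataset = dataset.map(load_file, num_parallel_calls=AUTO)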

Upvotes: 0

AAudibert

Reputation: 1273

The function passed to dataset.map will be traced and executed as a TensorFlow graph, and the arguments passed to it will be symbolic Tensors rather than Python strings. That is why you get the error:

TypeError: expected str, bytes or os.PathLike object, not Tensor

If you want your function to operate on strings instead of Tensors, you can use the tf.py_function op:

def load_files_py(filename1, filename2):
    # the filenames arrive as eager tensors here, so .numpy() gives their byte strings
    arr = np.load(filename1.numpy(), allow_pickle=True)
    tags = np.load(filename2.numpy(), allow_pickle=True)

    # Perform padding and convert back to tensor

    return arr, tags

def load_file(filename1, filename2):
    # arr_type and tags_type are the dtypes of your arrays, e.g. tf.float64
    return tf.py_function(load_files_py, inp=[filename1, filename2], Tout=[arr_type, tags_type])
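
The padding from the placeholder comment can then happen inside load_files_py, where arr is still a plain NumPy array, e.g. (a sketch; MAX_ROWS is whatever fixed size you choose for the variable first axis):

arr = np.pad(arr, ((0, MAX_ROWS - arr.shape[0]), (0, 0), (0, 0)), mode='constant')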

The code below demonstrates the difference in the arguments passed to the function with and without py_function:

import tensorflow as tf

def load_py(a):
  # Arguments to py_function are eager tensors, so we can use `.numpy()` to get their string values.
  tf.print(type(a))  # <class 'tensorflow.python.framework.ops.EagerTensor'>
  tf.print(a.numpy())  # b'a'
  return a

def load(a):
  # `load` is executed in graph mode, so `a` is a non-eager Tensor.
  tf.print(type(a))  # <class 'tensorflow.python.framework.ops.Tensor'>
  return tf.py_function(load_py, inp=[a], Tout=[tf.string])

ds = tf.data.Dataset.from_tensor_slices(["a", "b"])
ds = ds.map(load)
next(iter(ds))

https://colab.research.google.com/drive/1Tr04ykdBGx01uCMUHdyBLXV4VQMi-6dU

Upvotes: 3
