Reputation: 11
I am trying to load numpy array (x, 1, 768) and labels (1, 768) into tf.data. my code is as below:
import pandas as pdb
import pdb
import numpy as np
import os, glob
import tensorflow as tf
#from tensorflow import keras
from tensorflow.keras import layers, initializers
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K
from tensorflow.keras import layers
#from tensorflow import keras
from keras.preprocessing.sequence import pad_sequences
from natsort import natsorted
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
#################################################################
#File Paths
text_path = 'data/featured/*'
tags_path ='data/encoded_tags/*'
text_files = natsorted(glob.glob(text_path)) # Load the array filenames
tags_files = natsorted(glob.glob(tags_path)) # Load the label filenames
text_train = text_files[:round(0.9*len(text_files))]
tags_train = tags_files[:round(0.9*len(tags_files))]
#Parameters
AUTO = tf.data.experimental.AUTOTUNE
index = 0
PADDING_LENGTH = 768
BATCH_LENGTH = 1
LEARNING_RATE = 0.01
OPTIMISER = 'ADAM'
#Define the training parameters here.
#################################################################
#@tf.function
def load_files(filename1, filename2):
tags = np.load(filename[1], allow_pickle=True)
arr = np.load(filename[0], allow_pickle=True)
# Perform padding and convert back to tensor
return arr, tags
def load_dataset(text_files, tag_files):
dataset = tf.data.Dataset.from_tensor_slices([text_files, tag_files])
print(dataset)
#dataset = dataset.map(load_files)
#dataset = dataset.map(lambda x: tf.py_function(load_files, [x], tf.float64))
dataset = dataset.map(map_func=load_files, num_parallel_calls=AUTO)
return dataset
def get_batch_dataset(filename1, filename2):
dataset = load_dataset(filename1, filename2)
dataset = dataset.batch(BATCH_LENGTH)
dataset = dataset.prefetch(AUTO).repeat()
return dataset
def get_training_dataset():
return get_batch_dataset(text_train, tags_train)
dataset = get_batch_dataset(text_train, tags_train)
When I try to read the numpy array by its filenames of arrays and its label it throws the following error:
TypeError: expected str, bytes or os.PathLike object, not Tensor
Things I have tried:
filename1.numpy() # doesnt work:
AttributeError: 'Tensor' object has no attribute 'numpy'
filename.as_string() # doesnt work either:
AttributeError: 'Tensor' object has no attribute 'as_string'
i just need to read the arrays as numpy array because I need to pad them, I have tried reading them as tf.io.read_file() but somehow it messes us with the arrays and the returned shape is (None,). Each array (for a unique filename) is of length x as mentioned earlier and I need to perform padding and output a fixed size array in order to feed it a neural network.
Thank you in advance for the help.
Upvotes: 1
Views: 2484
Reputation: 60
The answer given by @AAudibert is nearly perfect. You might encounter problems with losing the shape (shape of tensors in dataset will be unknown). If you know what the shape has to be, just map the dataset to a function like this:
def reshape(x, y):
X = tf.reshape(x, [16, 160, 160])
Y = tf.reshape(y, [])
return X, Y
In case you don't know the shape and have to determine it at runtime:
def load_files_py(filename1, filename2):
tags = np.load(filename1.numpy(), allow_pickle=True)
arr = np.load(filename2.numpy(), allow_pickle=True)
return arr, tags, arr.shape(), tags.shape()
def load_file(filename1, filename2):
temp= tf.py_function(load_files_py, inp=[filename1, filename2], Tout=[arr_type, tags_type, int32, int32])
arr = tf.reshape(temp[0], [temp[2])
tags = tf.reshape(temp[1], [temp[2])
return arr, tags
I know there might be some syntactical mistakes but you'll get the idea
Upvotes: 0
Reputation: 1273
The function passed to dataset.map
will be traced and executed as a Tensorflow graph. The arguments passed to the function will be Tensor
s. That is why you get the
TypeError: expected str, bytes or os.PathLike object, not Tensor
If you want your function to operate on strings instead of Tensors, you can use the tf.py_function op:
def load_files_py(filename1, filename2):
tags = np.load(filename1.numpy(), allow_pickle=True)
arr = np.load(filename2.numpy(), allow_pickle=True)
# Perform padding and convert back to tensor
return arr, tags
def load_file(filename1, filename2):
return tf.py_function(load_files_py, inp=[filename1, filename2], Tout=[arr_type, tags_type])
The code below demonstrates the difference in the arguments passed to the function with and without py_function
:
import tensorflow as tf
def load_py(a):
# Arguments to py_function are eager tensors, so we can use `.numpy()` to get their string values.
tf.print(type(a)) # <class 'tensorflow.python.framework.ops.EagerTensor'>
tf.print(a.numpy()) # b'a'
return a
def load(a):
# `load` is executed in graph mode, so `a` and `b` are non-eager Tensors.
tf.print(type(a)) # <class 'tensorflow.python.framework.ops.Tensor'>
return tf.py_function(load_py, inp=[a], Tout=[tf.string])
ds = tf.data.Dataset.from_tensor_slices((["a", "b"]))
ds = ds.map(load)
next(iter(ds))
https://colab.research.google.com/drive/1Tr04ykdBGx01uCMUHdyBLXV4VQMi-6dU
Upvotes: 3