Reputation: 63
I tried the approach from "Split .tfrecords file into many .tfrecords files", but it behaves strangely.
That code creates too many TFRecord files (each one is about 10 MB).
Is there a way to split a TFRecord file into a specific number of files that I choose?
Upvotes: 0
Views: 448
Reputation: 502
You have to define how many TFRecord files you want and how many items go into each file.
If the code is hard to follow at first, try commenting out the convert function calls and replacing the number of items, the number of paths, etc. with small test values to see how the code behaves.
# Split the list of data paths into consecutive chunks of at most n_items and
# write each chunk to its own TFRecords file.
# NOTE: convert() has to be defined before this loop executes.
path_list = paths.values  # List of the data paths
n_paths = len(path_list)  # Total number of items to distribute
n_items = 10000  # Maximum number of items written to each TFRecords file
# Number of output files, computed with ceiling division: an extra file is
# created only when the items cannot be spread evenly over the full files,
# so an evenly divisible input no longer yields a trailing empty file.
n_files = (n_paths + n_items - 1) // n_items
rest = n_paths % n_items  # Size of the last, partial file (0 when the split is even)
file_path = DATA_DIR+'TFRecords/train/train_{}.tfrecords'  # Format the output path
sample_index = 0  # Index of the first item that has not been written yet
for record in range(n_files):
    print('Record: '+ str(record)+' from: ', n_files)
    fmt_path = file_path.format(record)
    # Clamp the end of the chunk to the data length; the final file simply
    # receives whatever remains.
    limit = min(sample_index + n_items, n_paths)
    print('converting from: ' + str(sample_index)+' to: ' + str(limit-1))
    # Slice ends are exclusive, so [sample_index:limit] covers items
    # sample_index .. limit-1 without dropping the last one of each chunk.
    path_subset = path_list[sample_index:limit]
    convert(path_subset, None, fmt_path)
    sample_index = limit
def convert(image_paths, labels, out_path):
    """Write the given images and their labels to one TFRecords file.

    Args:
        image_paths: List of file-paths for the images.
        labels: Class-labels for the images, indexed like image_paths, or
            None to read each label from the fifth '/'-separated component
            of the corresponding path.
        out_path: File-path for the TFRecords output file.
    """
    print("Converting: " + out_path)

    # Number of images. Used when printing the progress.
    num_images = len(image_paths)

    # Open a TFRecordWriter for the output-file.
    # NOTE(review): tf.python_io is the TensorFlow 1.x module name; on
    # TensorFlow 2.x the writer is presumably tf.io.TFRecordWriter - confirm
    # against the installed version before changing it.
    with tf.python_io.TFRecordWriter(out_path) as writer:
        # Iterate over all the image-paths and class-labels.
        for i, path in enumerate(image_paths):
            # Print the percentage-progress.
            print_progress(count=i, total=num_images-1)

            # Load the image-file using matplotlib's imread function.
            img = imread(path)

            # Convert the image to raw bytes. tobytes() replaces the
            # deprecated tostring() alias (removed in NumPy 2.0) and yields
            # the same bytes - assumes imread returns a NumPy array.
            img_bytes = img.tobytes()

            # Get the label index: prefer the explicit labels argument and
            # fall back to the directory component of the path.
            if labels is None:
                label = int(path.split('/')[4])
            else:
                label = int(labels[i])

            # Create a dict with the data we want to save in the
            # TFRecords file. You can add more relevant data here.
            data = {
                'image': wrap_bytes(img_bytes),
                'label': wrap_int64(label),
            }

            # Wrap the data as TensorFlow Features.
            feature = tf.train.Features(feature=data)

            # Wrap again as a TensorFlow Example.
            example = tf.train.Example(features=feature)

            # Serialize the data.
            serialized = example.SerializeToString()

            # Write the serialized data to the TFRecords file.
            writer.write(serialized)
Upvotes: 1