openloop

Reputation: 91

Tensorflow: How to set up a CheckpointSaverHook

When I start training on my tf.estimator.Estimator object, Tensorflow automatically creates a CheckpointSaverHook and prints INFO:tensorflow:Create CheckpointSaverHook.

This automatically created SaverHook saves my model only at the very start and at the end of training.

What I want though is to create a checkpoint every n training steps. For this I created my own saving hook and passed it to my estimator when training.

saver_hook = tf.train.CheckpointSaverHook(
        checkpoint_dir = model_dir,
        save_steps = 100
)

model.train(input_fn,steps=1500,hooks=[saver_hook])

This works in principle, but my own CheckpointSaverHook only saves *.meta files, while the automatically created one saves *.meta, *.index and *.data-XXXXX-of-XXXXX files.

How can I configure my own SaverHook to do that as well?

EDIT: Added my whole network definition
network.py

import pickle
import random
import numpy as np
import tensorflow as tf

LEARNING_RATE = 0.002

class TFDotNet:
    def __init__(self,model_dir):
        # model def
        self.model_dir = model_dir
        self.model = tf.estimator.Estimator(model_fn=model_fn,model_dir=model_dir)

        # hooks
        self.summary_hook = tf.train.SummarySaverHook(
            save_steps=50,
            output_dir=model_dir,
            scaffold=tf.train.Scaffold()
        )
        self.saver_hook = tf.train.CheckpointSaverHook(
            checkpoint_dir=model_dir,
            save_steps=100,
        )

    def train(self,x_train,y_train,steps=1500,batch_size=128):
        """ train the neuralnetwork """
        tf.logging.set_verbosity(tf.logging.INFO)
        input_fn = tf.estimator.inputs.numpy_input_fn(
            x={'x': x_train}, y=y_train,batch_size=batch_size, num_epochs=None, shuffle=True
        )
        self.model.train(input_fn,steps=steps,hooks=[self.summary_hook,self.saver_hook])

    def predict(self,x_predict):
        """ predict some inputs """
        input_fn = tf.estimator.inputs.numpy_input_fn(
            x={'x':x_predict}, y=None, batch_size=1, shuffle=False
        )
        return list(self.model.predict(input_fn))

    def evaluate(self,x_test,y_test):
        """ evaluate network on testset """
        input_fn = tf.estimator.inputs.numpy_input_fn(
            x={'x': x_test}, y=y_test,batch_size=1, shuffle=False
        )
        return self.model.evaluate(input_fn)

    def load_dataset(self,dataset_path):
        """ loads a dataset from a serialized data file """
        with open(dataset_path,'rb') as f:
            return pickle.load(f)

    def split_dataset(self,dataset,ratio,random_state=42):
        """ splits a loaded dataset into training and testset """
        random.seed(random_state)
        random.shuffle(dataset)
        length = int(ratio * len(dataset))
        test_data = dataset[:length]
        training_data = dataset[length:]
        x_train = np.hstack([x for (x, y) in training_data]).transpose().astype('float32')
        y_train = np.asarray([y for (x, y) in training_data]).reshape(-1, 1).astype('float32')
        x_test = np.hstack([x for (x, y) in test_data]).transpose().astype('float32')
        y_test = np.asarray([y for (x, y) in test_data]).reshape(-1, 1).astype('float32')
        return x_train, y_train, x_test, y_test

    def export(self):
        """ exports the conv net """
        def serving_input_receiver_fn():
            # The outer dimension (None) allows us to batch up inputs for
            # efficiency. However, it also means that if we want a prediction
            # for a single instance, we'll need to wrap it in an outer list.
            inputs = {"x": tf.placeholder(shape=[None, 900], dtype=tf.float32)}
            return tf.estimator.export.ServingInputReceiver(inputs, inputs)

        self.model.export_savedmodel(
            export_dir_base=self.model_dir,
            serving_input_receiver_fn=serving_input_receiver_fn)


def cnn_layout(features,reuse,is_training):
    with tf.variable_scope('cnn',reuse=reuse):
        # reshape input to [batch_size, height, width, channels]
        x = tf.reshape(features['x'], shape=[-1,30,30,1])
        # conv1, 32 filter, 5 kernel
        conv1 = tf.layers.conv2d(x, 32, 5, activation=tf.nn.relu, name='conv1')
        # pool1, 2 stride, 2 kernel
        pool1 = tf.layers.max_pooling2d(conv1, 2, 2, name='pool1')
        # conv2, 64 filter, 3 kernel
        conv2 = tf.layers.conv2d(pool1, 64, 3, activation=tf.nn.relu, name='conv2')
        # pool2, 2 stride, 2 kernel
        pool2 = tf.layers.max_pooling2d(conv2, 2, 2, name='pool2')
        # flatten pool2
        flatten = tf.contrib.layers.flatten(pool2)
        # fc1 with 1024 neurons
        fc1 = tf.layers.dense(flatten, 1024, name='fc1')
        # 75% dropout
        drop = tf.layers.dropout(fc1, rate=0.75, training=is_training, name='dropout')
        # output logits
        output = tf.layers.dense(drop, 1, name='output_logits')
        return output


def model_fn(features, labels, mode):
    # set up two networks, one for training and one for prediction, that share weights
    logits_train = cnn_layout(features=features,reuse=False,is_training=True)
    logits_test = cnn_layout(features=features,reuse=True,is_training=False)

    # predictions
    probabilities = tf.sigmoid(logits_test, name='probabilities')
    predictions = tf.round(probabilities, name='predictions')
    export_outputs = tf.estimator.export.PredictOutput(outputs={'predictions': predictions, 'probabilities': probabilities})
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions=predictions, export_outputs={'outputs':export_outputs})

    # define loss and optimizer
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits_train,labels=labels),name='loss')
    optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE, name='optimizer')
    train = optimizer.minimize(loss, global_step=tf.train.get_global_step(),name='train')

    # accuracy for evaluation
    accuracy = tf.metrics.accuracy(labels=labels,predictions=predictions,name='accuracy')

    # summaries for tensorboard
    tf.summary.scalar('loss',loss)

    # return training and evaluation spec
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        loss=loss,
        train_op=train,
        eval_metric_ops={'accuracy':accuracy}
    )

training.py

from network import TFDotNet
from time import time

# settings
training_steps = 10000
mini_batch_size = 128
model_dir = 'neuralnet_data/02_networks/network01'
dataset_path = 'neuralnet_data/01_datasets/dataset.data'

# init dotnet
dotnet = TFDotNet(model_dir=model_dir)

# load dataset
print('loading dataset ...')
dataset = dotnet.load_dataset(dataset_path)

# split dataset
x_train, y_train, x_test, y_test = dotnet.split_dataset(dataset,0.1)

# train network
print('starting training ...')
t0 = time()
dotnet.train(x_train,y_train,steps=training_steps,batch_size=mini_batch_size)
print('Training took {}s'.format(time()-t0))

Upvotes: 2

Views: 3471

Answers (2)

scott huang

Reputation: 2678

Here is my code. It works fine; the complete code is on my GitHub.

start_time = datetime.datetime.now()
saver_hook = tf.train.CheckpointSaverHook(
    checkpoint_dir=FLAGS.train_dir,
    save_steps=100,
)
config = tf.estimator.RunConfig()
config = config.replace(session_config=sess_config)
per_example_hook = ExamplesPerSecondHook(FLAGS.train_batch_size, every_n_steps=100)
hooks = [per_example_hook,saver_hook]
classifier = tf.estimator.Estimator(
    model_fn=model_fn_cnn,
    model_dir=FLAGS.train_dir,
    config=config,
)
classifier.train(input_fn=functools.partial(input_fn,subset="training"),
                 steps=FLAGS.train_steps,
                 hooks=hooks
                 )

train_time = datetime.datetime.now() - start_time
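Alternatively, since a tf.estimator.RunConfig is built anyway, the checkpoint interval can be set directly on it; a minimal sketch, assuming the same model_fn_cnn and FLAGS as above:

config = tf.estimator.RunConfig(save_checkpoints_steps=100)  # default saver hook fires every 100 steps
classifier = tf.estimator.Estimator(
    model_fn=model_fn_cnn,
    model_dir=FLAGS.train_dir,
    config=config,
)

This keeps the default CheckpointSaverHook, which writes the full *.meta, *.index and *.data files.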

Upvotes: 0

ngc92
ngc92

Reputation: 91

The problem here is that, when no Saver is specified (either directly or via the scaffold), CheckpointSaverHook will create a new Saver in its constructor. If the __init__ is not run in the same Graph as your model, the Saver won't find any variables, so nothing gets saved (see https://github.com/tensorflow/tensorflow/issues/13265).

Assuming you are using the tf.estimator framework, the Graph you want simply does not exist yet before the call to train. You should be able to work around that by creating the saver inside your model_fn and passing the hook to the EstimatorSpec, as sketched below.
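A minimal sketch of that workaround, assuming the model_fn from the question (checkpoint_dir has to point at the same directory as the Estimator's model_dir; max_to_keep is illustrative):

import tensorflow as tf

def model_fn(features, labels, mode):
    # ... build logits, predictions, loss, train op and accuracy as in the question ...

    # The Saver is created inside model_fn, i.e. inside the Graph that the
    # Estimator builds for training, so it actually sees the model variables.
    saver = tf.train.Saver(max_to_keep=5)
    saver_hook = tf.train.CheckpointSaverHook(
        checkpoint_dir='neuralnet_data/02_networks/network01',
        save_steps=100,
        saver=saver,
    )

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        loss=loss,
        train_op=train,
        eval_metric_ops={'accuracy': accuracy},
        # checkpoints are written by the chief worker, so attach the hook here
        training_chief_hooks=[saver_hook],
    )

With the hook attached this way, it no longer needs to be passed to model.train.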

Upvotes: 1
