Reputation: 1375
I use tensorflow 0.9.
I want to save my model and then restore it.
I simply add tf.train.Saver()
to save and restore my training variables.
This is my code:
import tensorflow as tf
import input_data
import os
# Directory that checkpoints are written to and looked up in on the next run.
checkpoint_dir='./ckpt_dir/'
# MNIST data loaded through the input_data helper module, with one-hot labels.
mnist = input_data.read_data_sets("MNIST_data", one_hot = True)
# Input placeholder: 784 float features per example (reshaped to 28x28x1 below).
x = tf.placeholder(tf.float32, shape = [None , 784])
# Label placeholder: 10 floats per example (presumably one slot per digit class).
y_ = tf.placeholder(tf.float32, [None, 10])
# NOTE: nothing here resets the default graph, so running this script twice in
# the same interpreter (Spyder / IPython "run file") adds a second copy of every
# op to the same graph under a suffixed name.
sess = tf.InteractiveSession()
def load_model(sess, saver, checkpoint_dir):
    """Restore the latest checkpoint into ``sess`` or start from fresh variables.

    Looks up the checkpoint state recorded in ``checkpoint_dir``. When it names
    a checkpoint file, that path is printed and restored through ``saver``.
    Otherwise the directory is created if it is missing and the module-level
    ``init`` op is run in ``sess``.

    Args:
        sess: session the variables are restored or initialised in.
        saver: ``tf.train.Saver`` used for the restore.
        checkpoint_dir: directory searched for checkpoint state.

    Returns:
        None.
    """
    state = tf.train.get_checkpoint_state(checkpoint_dir)

    if not (state and state.model_checkpoint_path):
        # No usable checkpoint: make sure the target directory exists so that
        # a later save has somewhere to write, then initialise from scratch.
        # NOTE(review): the original indentation was lost; running `init` in
        # this branch only is assumed -- confirm against the author's intent.
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        # `init` is a module-level op defined later in the script.
        sess.run(init)
        return

    latest_path = state.model_checkpoint_path
    print(latest_path)
    saver.restore(sess, latest_path)
def weight_variable(shape):
    """Return a ``tf.Variable`` of ``shape`` drawn from a truncated normal (stddev 0.1)."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))
def bias_variable(shape):
    """Return a ``tf.Variable`` of ``shape`` with every element set to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))
def conv2d(x, W):
    """Convolve ``x`` with filter ``W`` using stride 1 in every dimension and SAME padding."""
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding="SAME")
def max_pool_2x2(x):
    """Max-pool ``x`` with a 2x2 window and a 2x2 stride, SAME padding."""
    window = [1, 2, 2, 1]
    step = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=step, padding="SAME")
# --- First convolutional layer: 5x5 filters, 1 input channel -> 32 feature maps.
W_conv1 = weight_variable([5, 5, 1, 32])
b_conv1 = bias_variable([32])
# Flat 784-vectors become 28x28 single-channel images.
x_image = tf.reshape(x, [-1, 28, 28, 1])

# The bias was created but never applied in the original; add it before ReLU.
# Variables are still created in the same order, so their auto-generated names
# (and therefore existing checkpoints) are unaffected.
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)  # 28x28 -> 14x14

# --- Second convolutional layer: 5x5 filters, 32 -> 64 feature maps.
W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)  # 14x14 -> 7x7

# --- Fully connected layer.
# After two stride-2 poolings the feature map is 7x7 with 64 channels, so each
# example flattens to 7 * 7 * 64 = 3136 values (the "7764" in the original was
# this product with the asterisks lost).
flat_size = 7 * 7 * 64
W_fc1 = weight_variable([flat_size, 1024])
b_fc1 = bias_variable([1024])
h_pool2_flat = tf.reshape(h_pool2, [-1, flat_size])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

# --- Dropout; keep_prob is fed at run time (0.5 for training, 1.0 for evaluation).
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

# --- Readout layer: 1024 -> 10 softmax outputs.
W_fc2 = weight_variable([1024, 10])
b_fc2 = bias_variable([10])
y_conv = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)

# --- Loss, optimiser and accuracy metric.
# Cross-entropy summed over the class axis, then averaged over the batch.
cross_entropy = tf.reduce_mean(
    -tf.reduce_sum(y_ * tf.log(y_conv), reduction_indices=[1]))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# Op that initialises every variable; load_model() runs it when there is no
# checkpoint to restore from.
init = tf.initialize_all_variables()
saver = tf.train.Saver()
load_model(sess, saver, checkpoint_dir)

# Training loop (a single step here).
for i in range(1):
    images, labels = mnist.train.next_batch(50)
    if i % 10 == 0:
        # Report accuracy on the current batch with dropout disabled.
        eval_feed = {x: images, y_: labels, keep_prob: 1.0}
        train_accuracy = accuracy.eval(feed_dict=eval_feed)
        print("step %d, training accuracy %g" % (i, train_accuracy))
    train_feed = {x: images, y_: labels, keep_prob: 0.5}
    train_step.run(feed_dict=train_feed)

# Evaluate on the full test set with dropout disabled.
test_feed = {x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}
print("test accuracy %g" % accuracy.eval(feed_dict=test_feed))

tf.scalar_summary("accuracy", accuracy)

# Persist the variables so the next run can restore them.
checkpoint_path = checkpoint_dir + 'model.ckpt'
saver.save(sess, checkpoint_path)
When I restore the checkpoint:
saver.restore(sess, ckpt.model_checkpoint_path)
TensorFlow throws this error:
Traceback (most recent call last):
.
.
.
NotFoundError: Tensor name "global_step_7" not found in checkpoint files ./ckpt_dir/model.ckpt-0
[[Node: save_18/restore_slice_438 = RestoreSlice[dt=DT_INT32, preferred_shard=-1, _device="/job:localhost/replica:0/task:0/cpu:0"](...)]]
Caused by op 'save_18/restore_slice_438', defined at:
File "/home/m/anaconda3/lib/python3.5/site-packages/spyderlib/widgets/externalshell/start_ipython_kernel.py", line 205, in <module>
ipythonkernel.start()
.
.
.
File "/home/m/anaconda3/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1224, in __init__
raise TypeError("Control input must be an Operation, "
I use anaconda. The first time I run this code in spyder or ipython with "run filename.py", it saves the model in the checkpoint, but when I run this code again, it throws the error.
But when I close spyder or ipython, open it again and run the code it restores the checkpoint correctly.
Also when I run in terminal "python filename.py" it always runs and doesn't throw any error.
Upvotes: 4
Views: 4280
Reputation: 28198
You need to reset the default graph at the beginning of your script each time you run the file again.
If you don't reset the default graph and run the following lines twice:
# Creating a variable called 'x' in the default graph and printing the name
# TensorFlow actually assigned to it.
x = tf.Variable(1, name='x')
print(x.name)  # print() call form: valid on Python 3 (used in the question) and Python 2
You will see that the first time x has the name "x:0", and the second time its name is "x_1:0". This is what confuses tf.train.Saver: on the first run it saves the value of x under the name "x:0"; on the second run you ask it to restore x, but now the name of the variable is "x_1:0", so the saver tries to load a saved value under the name "x_1:0", cannot find it, and raises an error.
However, you can reset the default graph at the beginning using tf.reset_default_graph(). This creates an empty graph and uses it as the default graph.
Here the name of x
can be the same in those two graphs:
# First run: start from an empty default graph, so 'x' is free.
tf.reset_default_graph()
x = tf.Variable(1, name='x')
print(x.name)  # prints 'x:0'

# Next run: the reset discards the previous graph, so 'x' is free again.
tf.reset_default_graph()
x = tf.Variable(1, name='x')
print(x.name)  # prints 'x:0'
The two Variables can now have the same name because they are no longer in the same graph.
Another way of doing it is to create a graph at the beginning and use it as default graph:
# Build everything inside a freshly created graph instead of the shared default
# one; each run gets its own graph, so the name 'x' is not already taken.
graph = tf.Graph()
with graph.as_default():
    x = tf.Variable(1, name='x')
Upvotes: 5