Reputation: 11
I want to train a network using TensorFlow, and I chose "Inception_resnet_v2" as the net (from here). Here is my training code:
def train(train_dir, annotations, max_step, checkpoint_dir='./checkpoint2/'):
    # train the model
    features = tf.placeholder("float32", shape=[None, IMAGE_SIZE, IMAGE_SIZE, IMAGE_CHANNEL], name="features")
    labels = tf.placeholder("float32", [None], name="labels")
    one_hot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=80)
    keep_prob = tf.placeholder("float32")
    isTraining = tf.placeholder("bool")
    # train_step, cross_entropy, logits, keep_prob = network.inference(features, one_hot_labels)
    logits, _ = inception_resnet_v2.inception_resnet_v2(features, 80, isTraining, keep_prob)

    # calculate loss
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=one_hot_labels, logits=logits))
    train_step = tf.train.AdamOptimizer(LEARNINGRATE).minimize(cross_entropy)
    correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(one_hot_labels, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

    image_list, label_list = scene_input2.get_files(train_dir, annotations)
    image_batch, label_batch = scene_input2.get_batch(image_list, label_list, IMAGE_SIZE, IMAGE_SIZE, BATCH_SIZE)

    with tf.Session() as sess:
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
        if ckpt and ckpt.model_checkpoint_path:
            print('Restore the model from checkpoint %s' % ckpt.model_checkpoint_path)
            # Restores from checkpoint
            saver.restore(sess, ckpt.model_checkpoint_path)
            start_step = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
        else:
            sess.run(tf.global_variables_initializer())
            start_step = 0
            print('start training from new state')
        logger = scene_input.train_log(LOGNAME)

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        try:
            # Check if stop was requested.
            step = start_step
            while not coord.should_stop() and step < start_step + max_step:
                start_time = time.time()
                x, y = sess.run([image_batch, label_batch])
                # y = tf.one_hot(indices=tf.cast(y, tf.int32), depth=80)
                # y = sess.run(y)
                sess.run(train_step, feed_dict={features: x, labels: y, isTraining: True, keep_prob: 0.5})
                if step % 50 == 0:
                    train_accuracy = sess.run(accuracy, feed_dict={features: x, labels: y, isTraining: False, keep_prob: 1})
                    train_loss = sess.run(cross_entropy, feed_dict={features: x, labels: y, isTraining: False, keep_prob: 1})
                    duration = time.time() - start_time
                    logger.info("step %d: training accuracy %g, loss is %g (%0.3f sec)" % (step, train_accuracy, train_loss, duration))
                if step % 1000 == 1:
                    saver.save(sess, CHECKFILE, global_step=step)
                    print('writing checkpoint at step %s' % step)
                step = step + 1
        except tf.errors.OutOfRangeError:
            print('done!')
        finally:
            # Request that the threads stop. After this is called, calls to should_stop() will return True.
            coord.request_stop()
            coord.join(threads)
But when I run the training (CUDA_VISIBLE_DEVICES=0 python3 scene2.py --mode train), I get this error:
Traceback (most recent call last):
  File "scene2.py", line 245, in <module>
    train(FLAGS.train_dir, FLAGS. annotations, FLAGS.max_step)
  File "scene2.py", line 82, in train
    logits, _=inception_resnet_v2.inception_resnet_v2(features,80,isTraining,keep_prob)
  File "/home/vision/inception_resnet_v2.py", line 357, in inception_resnet_v2
    scope='Dropout')
  File "/usr/local/lib/python3.4/dist-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 181, in func_with_args
    return func(*args, **current_args)
  File "/usr/local/lib/python3.4/dist-packages/tensorflow/contrib/layers/python/layers/layers.py", line 1216, in dropout
    _scope=sc)
  File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/layers/core.py", line 247, in __init__
    self.rate = min(1., max(0., rate))
  File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/framework/ops.py", line 564, in __bool__
    raise TypeError("Using a `tf.Tensor` as a Python `bool` is not allowed. "
TypeError: Using a `tf.Tensor` as a Python `bool` is not allowed. Use `if t is not None:` instead of `if t:` to test if a tensor is defined, and use TensorFlow ops such as tf.cond to execute subgraphs conditioned on the value of a tensor.
The error occurs when I pass isTraining and keep_prob to inception_resnet_v2.inception_resnet_v2(features, 80, isTraining, keep_prob). How can I solve this problem?
When I train the network I want to set keep_prob = 0.5 and isTraining = True, but at the same time, every 50 steps, I want to check the model's train_accuracy and train_loss, for which I should set keep_prob = 1.0 and isTraining = False. Am I right? How can I implement this?
Upvotes: 0
Views: 3052
Reputation: 56
If your ultimate objective is to perform training and evaluation concurrently, and you are using the neural network implementations provided with the tf-slim library, then it may be easiest to follow the methodology prescribed by tf-slim co-author Nathan Silberman.
In short, training and evaluation are performed by two separate processes, with the evaluation process pointing to a checkpoint directory, waiting (infinitely) for a new checkpoint to be written to that directory by the training process, and then automatically performing evaluation on the newly written checkpoint and writing summaries to a specified eval output directory.
To get started, you should take a look at the train_image_classifier.py and eval_image_classifier.py scripts provided in the TensorFlow-Slim image classification model library.
In eval_image_classifier.py, you will want to replace the code:
if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
    checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
else:
    checkpoint_path = FLAGS.checkpoint_path

tf.logging.info('Evaluating %s' % checkpoint_path)

slim.evaluation.evaluate_once(
    master=FLAGS.master,
    checkpoint_path=checkpoint_path,
    logdir=FLAGS.eval_dir,
    num_evals=num_batches,
    eval_op=list(names_to_updates.values()),
    variables_to_restore=variables_to_restore)
with the code:
tf.logging.info('Evaluating %s' % FLAGS.checkpoint_path)

slim.evaluation.evaluation_loop(
    master=FLAGS.master,
    checkpoint_dir=FLAGS.checkpoint_path,
    logdir=FLAGS.eval_dir,
    num_evals=num_batches,
    eval_op=list(names_to_updates.values()),
    variables_to_restore=variables_to_restore)
If you want both processes to utilize your GPU(s) without running into OOM errors, you can allocate a fraction of GPU memory to each process by creating a ConfigProto object and passing it as the argument of the session_config parameter of slim.learning.train() or slim.evaluation.evaluation_loop(). See the "Allowing GPU memory growth" section of this tensorflow.org article for reference.
Regarding the parameterization of is_training, you will notice that the train and eval scripts pass True and False respectively as their arguments to the is_training parameter of nets_factory.get_network_fn().
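Roughly, those calls look like the following in the two scripts (paraphrased, so treat the exact flag and dataset names as approximations of what train_image_classifier.py and eval_image_classifier.py actually use):

from nets import nets_factory

# train_image_classifier.py builds the net in training mode ...
network_fn = nets_factory.get_network_fn(
    FLAGS.model_name,
    num_classes=dataset.num_classes,
    weight_decay=FLAGS.weight_decay,
    is_training=True)

# ... while eval_image_classifier.py builds it in inference mode.
network_fn = nets_factory.get_network_fn(
    FLAGS.model_name,
    num_classes=dataset.num_classes,
    is_training=False)

logits, end_points = network_fn(images)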
Regarding the parameterization of keep_prob, nets_factory does not expose the dropout_keep_prob parameter of the slim nets. Instead, slim.dropout() accepts is_training as a parameter and, when it is False, replaces the computations that constitute dropout with the identity function. In other words, tf-slim is so awesome that it automatically "disables" dropout when you pass is_training=False to nets_factory.get_network_fn(), as is the case in eval_image_classifier.py.
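As a minimal illustration of that pattern (not the actual inception_resnet_v2 code, just the shape of the logits head the slim nets use):

import tensorflow as tf

slim = tf.contrib.slim

def logits_head(net, num_classes, is_training=True, dropout_keep_prob=0.8):
    # When is_training=False, slim.dropout reduces to the identity,
    # so dropout_keep_prob has no effect at evaluation time.
    net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                       scope='Dropout')
    return slim.fully_connected(net, num_classes, activation_fn=None,
                                scope='Logits')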
If you want to expose dropout_keep_prob to train_image_classifier.py directly (e.g. for hyperparameter tuning purposes), you will have to fiddle with the implementation of nets_factory.get_network_fn().
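One possible way to do that, sketched from memory of how nets_factory wraps the nets (networks_map, arg_scopes_map and the functools import already live in nets_factory.py, so double-check the wrapper against your copy of the file):

# Hypothetical modification: thread dropout_keep_prob through to nets
# that accept it (e.g. inception_resnet_v2).
def get_network_fn(name, num_classes, weight_decay=0.0, is_training=False,
                   dropout_keep_prob=0.8):
  func = networks_map[name]

  @functools.wraps(func)
  def network_fn(images, **kwargs):
    arg_scope = arg_scopes_map[name](weight_decay=weight_decay)
    with slim.arg_scope(arg_scope):
      return func(images, num_classes, is_training=is_training,
                  dropout_keep_prob=dropout_keep_prob, **kwargs)

  if hasattr(func, 'default_image_size'):
    network_fn.default_image_size = func.default_image_size
  return network_fn

You would then add a matching dropout_keep_prob flag to train_image_classifier.py and pass it into get_network_fn().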
Upvotes: 1
Reputation: 3743
If you are using this method, then it expects a Python boolean and a Python float value, not tensors. So you need to pass values like

keep_prob = 0.5
isTraining = True

instead of

keep_prob = tf.placeholder("float32")
isTraining = tf.placeholder("bool")
But if you need to feed them at training time, I think the easiest way will be editing the inception_resnet_v2 method parameters at this line as below (remove the default param values; note that parameters without defaults must come before those with defaults, so the argument order changes and the call sites need to match),
def inception_resnet_v2(inputs, is_training, dropout_keep_prob,
                        num_classes=1001,
                        reuse=None,
                        scope='InceptionResnetV2',
                        create_aux_logits=True,
                        activation_fn=tf.nn.relu):
Then you will be able to pass your keep_prob and isTraining. Hope it helps.
Upvotes: 0