Reputation: 171
In the following code, I have modified the Deep MNIST example from the tensorflow tutorials (official).
Modifications -- Added weight decay into the loss function and also modifying the weights as well. (If its incorrect please do let me know).
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import sys
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
from hyperopt import STATUS_OK, STATUS_FAIL
Flags2=None
def build_and_optimize(hp_space):
global Flags2
Flags2 = {}
Flags2['dp'] = hp_space['dropout_global']
Flags2['wd'] = hp_space['wd']
res = main(Flags2)
results = {
'loss': res,
'status': STATUS_OK
}
return results
def deepnn(x):
"""deepnn builds the graph for a deep net for classifying digits.
args:
x: an input tensor with the dimensions (N_examples, 784), where 784 is the number of piexs in a standard MNIST image.
returns:
a tuple (y, keep_prob). y is a tensor of shape (N_examples, 10), with values equal to the logits of classifying the digit into one of classes (the digits 0-9). keep_prob is a scalar placeholder for the probability of dropout.
"""
# reshape to use within a convolutional neural net
# last dimension is for "features" - there is only one here, since images are
# grayscale -- it would be 3 for RGB, 4 for RGBA, etc.
x_image = tf.reshape(x, [-1, 28, 28, 1])
wd = tf.placeholder(tf.float32)
# first convolutional layer - maps one grayscale image to 32 feature maps
W_conv1 = weight_variable([5, 5, 1, 32], wd)
b_conv1 = bias_variable([32])
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
# pooling layer - downsamples by 2X
h_pool1 = max_pool_2X2(h_conv1)
# second convolutional layer --maps 32 feature maps to 64
W_conv2 = weight_variable([5, 5, 32, 64], wd)
b_conv2 = bias_variable([64])
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
# second pooling layer - downsamples by 2X
h_pool2 = max_pool_2X2(h_conv2)
# fully connected layer 1 -- after 2 round of downsampleing, our 28x28 image
# is done to 7x7x64 feature maps --maps this to 1025 features.
W_fc1 = weight_variable([7*7*64, 1024], wd)
b_fc1 = bias_variable([1024])
h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
# dropout - controls the complexity of the model, prevents co-adaptation of features.
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
# map the 1024 features to 10 classes, one for each digit
W_fc2 = weight_variable([1024, 10], wd)
b_fc2 = bias_variable([10])
y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
return y_conv, keep_prob, wd
def conv2d(x, W):
"""conv2d returns a 2d convolution layer with full stride."""
return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
def max_pool_2X2(x):
"""max_pool_2x2 downsamples a feature map by 2X."""
return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1], padding='SAME')
def weight_variable(shape, wd = None):
"""weight_variable generates a weight variable of a given shape."""
initial = tf.truncated_normal(shape, stddev=0.1)
# weight decay
if wd is not None:
weight_decay = tf.multiply(tf.nn.l2_loss(initial), wd, name = 'weight_loss')
tf.add_to_collection('losses', weight_decay)
return tf.Variable(initial)
def bias_variable(shape):
"""bias_variable generates a bias variable of a given shape."""
initial = tf.constant(0.1, shape=shape)
return tf.Variable(initial)
def main(_):
global Flags2
if Flags2 is None:
Flags2 = {}
if 'keep_prob' not in Flags2:
Flags2 = {}
Flags2['dp'] = 1.0
Flags2['wd'] = 0.0
print(Flags2)
# import data
mnist = input_data.read_data_sets('/tmp/tensorflow/mnist/input_data', one_hot=True)
# create the model
x = tf.placeholder(tf.float32, [None, 784])
y_ = tf.placeholder(tf.float32, [None, 10])
# build the graph for the deep net
y_conv, keep_prob, wd = deepnn(x)
cross_entropy = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))
# adding weight decay
tf.add_to_collection('losses', cross_entropy)
total_loss = tf.add_n(tf.get_collection('losses'), name='total_loss')
train_step = tf.train.AdamOptimizer(1e-4).minimize(total_loss)
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for i in range(1000):
batch =mnist.train.next_batch(200)
if i % 100 == 0:
train_accuracy = accuracy.eval(feed_dict={
x: batch[0], y_:batch[1], keep_prob: Flags2['dp'], wd: Flags2['wd']})
print('step %d, training accuracy %g' %(i, train_accuracy))
train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: Flags2['dp'], wd: Flags2['wd']})
test_accuracy = accuracy.eval(feed_dict={x:mnist.test.images, y_:mnist.test.labels, keep_prob:1.0, wd: Flags2['wd']})
print('test accuracy %g' % test_accuracy)
return test_accuracy
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', type=str,
default='/tmp/tensorflow/mnist/input_data',
help='directory for storing input data')
FLAGS, unparsed = parser.parse_known_args()
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
Hyperopt is used to tune the hyper-parameters (weight decay factor and dropout probability).
from hyperopt import fmin, tpe, hp, Trials
import pickle
import traceback
from my_mnist_convnet import build_and_optimize
space = {
'dropout_global': hp.uniform('conv_dropout_prob', 0.4, 0.6),
'wd': hp.uniform('wd', 0.0, 0.01)
}
def run_a_trail():
"""Run one TPE meta optimisation step and save its results."""
max_evals = nb_evals = 3
print("Attempt to resume a past training if it exists:")
try:
trials = pickle.load(open("results.pkl", "rb"))
print("Found saved Trials! Loading...")
max_evals = len(trials.trials) + nb_evals
print("Rerunning from {} trials to add another one.".format(
len(trials.trials)))
except:
trials = Trials()
print("Starting from scratch: new trials.")
best = fmin(
build_and_optimize,
space,
algo=tpe.suggest,
trials=trials,
max_evals=max_evals
)
pickle.dump(trials, open("results.pkl", "wb"))
print(best)
return
def plot_base_and_best_models():
return
if __name__ == "__main__":
"""plot the model and run the optimisation forever (and save results)."""
run_a_trail()
When hyperopt code is used, the code runs fine for only one TPE run, however, if the number of trails is increased then it reports the following error.
self._traceback = _extract_stack()
InvalidArgumentError (see above for traceback): Shape [-1,784] has negative dimensions
[[Node: Placeholder = Placeholder[dtype=DT_FLOAT, shape=[?,784], _device="/job:localhost/replica:0/task:0/gpu:0"]()]]
Upvotes: 2
Views: 1632
Reputation: 126184
This problem is most likely arising because each call to build_and_optimize()
is adding nodes to the same TensorFlow graph, and the tf.train.AdamOptimizer
is attempting to optimize variables from all of the previous graphs in addition to the current graph. To work around this problem, modify build_and_optimize()
so that it runs main()
in a different TensorFlow graph, using the following change:
def build_and_optimize(hp_space):
global Flags2
Flags2 = {}
Flags2['dp'] = hp_space['dropout_global']
Flags2['wd'] = hp_space['wd']
# Create a new, empty graph for each trial to avoid interference from
# previous trials.
with tf.Graph().as_default():
res = main(Flags2)
results = {
'loss': res,
'status': STATUS_OK
}
return results
Upvotes: 6