Su JK

Incorrect usage of hyperopt with tensorflow

In the following code, I have modified the Deep MNIST example from the official TensorFlow tutorials.

Modifications: added weight decay to the loss function, which in turn also modifies the weights. (If this is incorrect, please do let me know.)

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import sys

from tensorflow.examples.tutorials.mnist import input_data

import tensorflow as tf

from hyperopt import STATUS_OK, STATUS_FAIL

Flags2 = None

def build_and_optimize(hp_space):
    global Flags2
    Flags2 = {}
    Flags2['dp'] = hp_space['dropout_global']
    Flags2['wd'] = hp_space['wd']

    res = main(Flags2)

    # fmin minimises the loss, so report the negated test accuracy
    results = {
        'loss': -res,
        'status': STATUS_OK
    }
    return results

def deepnn(x):
    """deepnn builds the graph for a deep net for classifying digits.

    Args:
        x: an input tensor with the dimensions (N_examples, 784), where 784 is
            the number of pixels in a standard MNIST image.

    Returns:
        A tuple (y, keep_prob, wd). y is a tensor of shape (N_examples, 10),
        with values equal to the logits of classifying the digit into one of
        10 classes (the digits 0-9). keep_prob is a scalar placeholder for the
        dropout probability, and wd is a scalar placeholder for the weight
        decay factor.
    """

    # reshape to use within a convolutional neural net
    # last dimension is for "features" - there is only one here, since images are
    # grayscale -- it would be 3 for RGB, 4 for RGBA, etc.
    x_image = tf.reshape(x, [-1, 28, 28, 1])
    wd = tf.placeholder(tf.float32)

    # first convolutional layer - maps one grayscale image to 32 feature maps
    W_conv1 = weight_variable([5, 5, 1, 32], wd)
    b_conv1 = bias_variable([32])
    h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)

    # pooling layer - downsamples by 2X
    h_pool1 = max_pool_2x2(h_conv1)

    # second convolutional layer -- maps 32 feature maps to 64
    W_conv2 = weight_variable([5, 5, 32, 64], wd)
    b_conv2 = bias_variable([64])
    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)

    # second pooling layer - downsamples by 2X
    h_pool2 = max_pool_2x2(h_conv2)

    # fully connected layer 1 -- after 2 rounds of downsampling, our 28x28
    # image is down to 7x7x64 feature maps -- maps this to 1024 features
    W_fc1 = weight_variable([7*7*64, 1024], wd)
    b_fc1 = bias_variable([1024])

    h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

    # dropout - controls the complexity of the model, prevents co-adaptation of features.
    keep_prob = tf.placeholder(tf.float32)
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

    # map the 1024 features to 10 classes, one for each digit
    W_fc2 = weight_variable([1024, 10], wd)
    b_fc2 = bias_variable([10])

    y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
    return y_conv, keep_prob, wd

def conv2d(x, W):
    """conv2d returns a 2d convolution layer with full stride."""
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(x):
    """max_pool_2x2 downsamples a feature map by 2X."""
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
                          strides=[1, 2, 2, 1], padding='SAME')

def weight_variable(shape, wd=None):
    """weight_variable generates a weight variable of a given shape."""
    initial = tf.truncated_normal(shape, stddev=0.1)
    var = tf.Variable(initial)
    # weight decay: penalise the L2 norm of the variable itself (not of the
    # initializer tensor) and add the term to the 'losses' collection
    if wd is not None:
        weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
        tf.add_to_collection('losses', weight_decay)
    return var

def bias_variable(shape):
    """bias_variable generates a bias variable of a given shape."""
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)


def main(_):
    global Flags2
    if Flags2 is None:
        Flags2 = {}
    # fall back to defaults when running directly rather than via hyperopt
    # (the keys set by build_and_optimize are 'dp' and 'wd')
    if 'dp' not in Flags2:
        Flags2['dp'] = 1.0
        Flags2['wd'] = 0.0

    print(Flags2)

    # import data
    mnist = input_data.read_data_sets('/tmp/tensorflow/mnist/input_data', one_hot=True)

    # create the model
    x = tf.placeholder(tf.float32, [None, 784])
    y_ = tf.placeholder(tf.float32, [None, 10])

    # build the graph for the deep net
    y_conv, keep_prob, wd = deepnn(x)

    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))
    # adding weight decay
    tf.add_to_collection('losses', cross_entropy)
    total_loss = tf.add_n(tf.get_collection('losses'), name='total_loss')

    train_step = tf.train.AdamOptimizer(1e-4).minimize(total_loss)
    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))


    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for i in range(1000):
            batch = mnist.train.next_batch(200)

            if i % 100 == 0:
                # disable dropout when evaluating the training accuracy
                train_accuracy = accuracy.eval(feed_dict={
                    x: batch[0], y_: batch[1], keep_prob: 1.0, wd: Flags2['wd']})
                print('step %d, training accuracy %g' % (i, train_accuracy))
            train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: Flags2['dp'], wd: Flags2['wd']})

        test_accuracy = accuracy.eval(feed_dict={
            x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0, wd: Flags2['wd']})
        print('test accuracy %g' % test_accuracy)

    return test_accuracy

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str,
                        default='/tmp/tensorflow/mnist/input_data',
                        help='directory for storing input data')
    FLAGS, unparsed = parser.parse_known_args()
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)

Hyperopt is used to tune the hyper-parameters (weight decay factor and dropout probability).

from hyperopt import fmin, tpe, hp, Trials

import pickle

from my_mnist_convnet import build_and_optimize

space = {
    'dropout_global': hp.uniform('conv_dropout_prob', 0.4, 0.6),
    'wd': hp.uniform('wd', 0.0, 0.01)
}

def run_a_trial():
    """Run one TPE meta-optimisation step and save its results."""
    max_evals = nb_evals = 3

    print("Attempt to resume a past training if it exists:")

    try:
        trials = pickle.load(open("results.pkl", "rb"))
        print("Found saved Trials! Loading...")
        max_evals = len(trials.trials) + nb_evals
        print("Rerunning from {} trials to add another one.".format(
            len(trials.trials)))
    except (IOError, EOFError):
        trials = Trials()
        print("Starting from scratch: new trials.")

    best = fmin(
        build_and_optimize,
        space,
        algo=tpe.suggest,
        trials=trials,
        max_evals=max_evals
    )
    pickle.dump(trials, open("results.pkl", "wb"))

    print(best)

    return

def plot_base_and_best_models():
    return

if __name__ == "__main__":
    """plot the model and run the optimisation forever (and save results)."""
    run_a_trail()
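
For reference, fmin returns a dictionary of the best values found for each dimension of the search space, keyed by the labels given to hp.uniform (here conv_dropout_prob and wd), so print(best) shows the winning hyper-parameters.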

When the hyperopt code is used, it runs fine for a single TPE run; however, if the number of trials is increased, it reports the following error.

self._traceback = _extract_stack()

InvalidArgumentError (see above for traceback): Shape [-1,784] has negative dimensions
         [[Node: Placeholder = Placeholder[dtype=DT_FLOAT, shape=[?,784], _device="/job:localhost/replica:0/task:0/gpu:0"]()]]


Answers (1)

mrry

This problem is most likely arising because each call to build_and_optimize() adds nodes to the same TensorFlow graph, so the tf.train.AdamOptimizer attempts to optimize the variables created in all of the previous trials in addition to the current one. To work around this, modify build_and_optimize() so that it runs main() in a separate TensorFlow graph for each trial, using the following change:

def build_and_optimize(hp_space):
    global Flags2
    Flags2 = {}
    Flags2['dp'] = hp_space['dropout_global']
    Flags2['wd'] = hp_space['wd']

    # Create a new, empty graph for each trial to avoid interference from
    # previous trials.
    with tf.Graph().as_default():
        res = main(Flags2)

    results = {
        'loss': -res,  # fmin minimises the loss, so negate the accuracy
        'status': STATUS_OK
    }
    return results
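
As a side note, the same per-trial isolation can be achieved by clearing the default graph instead of opening a new one. A minimal sketch, assuming the same imports and main() as above (tf.reset_default_graph() is the standard TensorFlow 1.x call for this):

def build_and_optimize(hp_space):
    global Flags2
    Flags2 = {'dp': hp_space['dropout_global'], 'wd': hp_space['wd']}

    # Alternative to the explicit `with tf.Graph().as_default():` block:
    # wipe the default graph so nodes and variables left over from earlier
    # trials cannot leak into this one. This is only safe when nothing else
    # still holds references to tensors from the old graph.
    tf.reset_default_graph()
    res = main(Flags2)

    return {'loss': -res, 'status': STATUS_OK}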
