Reputation: 788
Caffe has a multistep decay policy. The learning rate is calculated as base_lr * gamma ^ (floor(step)), where step is incremented each time one of your decay steps is passed. For example, with decay steps [100, 200] and global step = 101 I want to get base_lr * gamma ^ 1, for global step = 201 and above I want to get base_lr * gamma ^ 2, and so on.
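In plain Python the schedule I have in mind looks roughly like this (just an illustration, with a made-up base_lr and gamma):

def multistep_lr(global_step, base_lr=0.01, gamma=0.5, boundaries=(100, 200)):
    # step = how many decay boundaries the global step has already passed
    step = sum(1 for b in boundaries if global_step > b)
    return base_lr * gamma ** step

print(multistep_lr(50))   # 0.01   -> gamma ** 0
print(multistep_lr(101))  # 0.005  -> gamma ** 1
print(multistep_lr(201))  # 0.0025 -> gamma ** 2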
I tried to implement it based on the exponential decay source, but I couldn't get anywhere. Here is the code of the exponential decay (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/training/learning_rate_decay.py#L27):
def exponential_decay(learning_rate, global_step, decay_steps, decay_rate,
                      staircase=False, name=None):
  with ops.name_scope(name, "ExponentialDecay",
                      [learning_rate, global_step,
                       decay_steps, decay_rate]) as name:
    learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    dtype = learning_rate.dtype
    global_step = math_ops.cast(global_step, dtype)
    decay_steps = math_ops.cast(decay_steps, dtype)
    decay_rate = math_ops.cast(decay_rate, dtype)
    p = global_step / decay_steps
    if staircase:
      p = math_ops.floor(p)
    return math_ops.mul(learning_rate, math_ops.pow(decay_rate, p), name=name)
I need to pass decay_steps as some kind of array, either a Python list or a Tensor. It also seems I need to pass current_decay_step (the step in the formula above).
First option: In pure Python, without tensors, it is very simple:

decay_steps.append(global_step)
p = sorted(decay_steps).index(global_step)  # maybe there should be a `+1` or `-1`; I hope the main idea is clear

I can't do it this way because there is no sort in TF, and I don't know how long it would take to implement one.
Second option: something like the code below. It doesn't work, for several reasons. First, I don't know how to pass arguments to a function in tf.cond. Second, it may not work even if I do pass them: Can cond support TF ops with side effects?

def new_decay_step(decay_steps):
    decay_steps = decay_steps[1:]
    current_decay_step.assign(current_decay_step + 1)
    return tf.no_op()

tf.cond(tf.greater(tf.shape(decay_steps)[0], 0),
        tf.cond(tf.greater(global_step, decay_steps[0]), new_decay_step, tf.no_op()),
        tf.no_op())

p = current_decay_step
Third option: It will not work because I can't index a tensor with another tensor (tensor[another_tensor]).

# if len(decay_steps) > (current_step + 1):
#     if global_step > decay_steps[current_step + 1]:
#         current_step += 1
current_decay_step = tf.cond(tf.greater(tf.shape(current_decay_step)[0], tf.add(current_decay_step, 1)),
                             tf.cond(tf.greater(global_step, decay_steps[tf.add(current_decay_step + 1]), tf.add(current_decay_step, 1), tf.add(current_decay_step, 0)),
                             tf.add(current_decay_step, 0)
What can I do?
UPD: I can almost make it work with the second option. I can write:

def nothing(): return tf.no_op()

tf.cond(tf.greater(global_step, decay_steps[0]),
        functools.partial(new_decay_step, decay_steps),
        nothing)

But for some reason the inner tf.cond doesn't work.
For this code I get the error fn1 must be callable:

def nothing(): return tf.no_op()

tf.cond(tf.greater(tf.shape(decay_steps)[0], 0),
        tf.cond(tf.greater(global_step, decay_steps[0]),
                functools.partial(new_decay_step, decay_steps),
                nothing),
        nothing)
UPD2: The inner tf.cond won't work because it returns a tensor, while the arguments of tf.cond must be callables.
I haven't verified it, but the following seems to work (at least it doesn't crash with errors):

tf.cond(tf.logical_and(tf.greater(tf.shape(decay_steps)[0], 0), tf.greater(global_step, decay_steps[0])),
        functools.partial(new_decay_step, decay_steps),
        nothing)

UPD3: I realized the code in UPD2 won't work because I can't change the list inside the function. Also, I don't know which parts of tf.logical_and are actually executed.
I wrote the following code:
class ohmy:
    def __init__(self, decay_steps):
        self.decay_steps = decay_steps

    def multistep_decay(self, learning_rate, global_step, current_decay_step, decay_steps, decay_rate,
                        staircase=False, name=None):
        learning_rate = tf.convert_to_tensor(learning_rate, name="learning_rate")
        dtype = learning_rate.dtype
        global_step = tf.cast(global_step, dtype)
        decay_rate = tf.cast(decay_rate, dtype)

        def new_step():
            self.decay_steps = self.decay_steps[1:]
            current_decay_step.assign(current_decay_step + 1)
            return current_decay_step

        def curr_step():
            return current_decay_step

        current_decay_step = tf.cond(tf.logical_and(tf.greater(tf.shape(self.decay_steps)[0], 0),
                                                    tf.greater(global_step, self.decay_steps[0])),
                                     new_step,
                                     curr_step)

        a = tf.Print(global_step, [global_step], "global")
        b = tf.Print(self.decay_steps, [self.decay_steps], "decay_steps")
        c = tf.Print(current_decay_step, [current_decay_step], "step")

        with tf.control_dependencies([a, b, c, current_decay_step]):
            p = current_decay_step
            if staircase:
                p = tf.floor(p)
            return tf.mul(learning_rate, tf.pow(decay_rate, p), name=name)


decay_steps = [3, 4, 5, 6, 7]
decay_steps = tf.convert_to_tensor(decay_steps, dtype=tf.float32)

current_decay_step = tf.Variable(0.0, trainable=False)
global_step = tf.Variable(0, trainable=False)
decay_rate = 0.5

c = ohmy(decay_steps)
lr = ohmy.multistep_decay(c, 0.010, global_step, current_decay_step, decay_steps, decay_rate)
#lr = tf.train.exponential_decay(0.001, global_step=global_step, decay_steps=2, decay_rate=0.5, staircase=True)

tf.scalar_summary('learning_rate', lr)

opt = tf.train.AdamOptimizer(lr)
#...train loop and so on
It doesn't work at all. Here is the output:
I tensorflow/core/kernels/logging_ops.cc:79] step[0]
I tensorflow/core/kernels/logging_ops.cc:79] global[0]
E tensorflow/core/client/tensor_c_api.cc:485] The tensor returned for MergeSummary/MergeSummary:0 was not valid.
Traceback (most recent call last):
  File "flownet_new.py", line 528, in <module>
    summary_str = sess.run(summary_op)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 382, in run
    run_metadata_ptr)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 655, in _run
    feed_dict_string, options, run_metadata)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 723, in _do_run
    target_list, options, run_metadata)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 743, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors.InvalidArgumentError: The tensor returned for MergeSummary/MergeSummary:0 was not valid.
As you can see, there is no output of the decay steps. I can't even debug it!
Now I definitely don't know how to do it with one function.
Btw, either I am doing something wrong, or tf.contrib.slim doesn't work with learning rate decay.
For now the simplest solution is to do what you want in the train loop, as cleros said.
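A minimal sketch of that workaround: compute the multistep rate in plain Python every iteration and feed it through a placeholder (the dummy variable and loss below are only there to make the snippet self-contained, and base_lr/gamma/boundaries are made-up values):

import tensorflow as tf

lr_placeholder = tf.placeholder(tf.float32, shape=[], name="learning_rate")
w = tf.Variable(1.0)      # dummy variable standing in for the real model
loss = tf.square(w)       # dummy loss standing in for the real loss
train_op = tf.train.AdamOptimizer(lr_placeholder).minimize(loss)

base_lr, gamma, boundaries = 0.01, 0.5, [100, 200]
with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    for step in range(300):
        # count how many decay boundaries were already passed
        lr = base_lr * gamma ** sum(1 for b in boundaries if step > b)
        sess.run(train_op, feed_dict={lr_placeholder: lr})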
Upvotes: 1
Views: 2290
Reputation: 11895
Use tf.train.exponential_decay(); it's exactly what you're looking for. The decayed learning rate is computed as follows:
decayed_learning_rate = learning_rate *
decay_rate ^ (global_step / decay_steps)
Note that the decay_steps parameter is an integer (not an array or a tensor) holding the number of iterations after which the learning rate changes. In your example, decay_steps=100.
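A minimal usage sketch (with staircase=True so the rate drops in discrete steps; the 0.01 starting rate and the factor 0.5 are just example values):

import tensorflow as tf

global_step = tf.Variable(0, trainable=False)
learning_rate = tf.train.exponential_decay(
    learning_rate=0.01, global_step=global_step,
    decay_steps=100, decay_rate=0.5, staircase=True)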
Upvotes: 1
Reputation: 21
I was looking for this feature in TensorFlow and found out it can be easily implemented with the tf.train.piecewise_constant function. Here is an example from the TensorFlow API docs: Piece-wise constant
Example: use a learning rate that's 1.0 for the first 100000 steps, 0.5 for steps 100001 to 110000, and 0.1 for any additional steps.
global_step = tf.Variable(0, trainable=False)
boundaries = [100000, 110000]
values = [1.0, 0.5, 0.1]
learning_rate = tf.train.piecewise_constant(global_step, boundaries, values)
Later, whenever we perform an optimization step, we increment global_step.
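For example (a small sketch continuing the snippet above; the dummy variable and loss are only there for illustration):

w = tf.Variable(1.0)   # dummy variable standing in for the model parameters
loss = tf.square(w)    # dummy loss standing in for the real loss
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
# Passing global_step makes the optimizer increment it on every step,
# which moves learning_rate through the piecewise schedule.
train_op = optimizer.minimize(loss, global_step=global_step)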
Upvotes: 2
Reputation: 1
You can try case, switch, and merge.
For example, suppose base_lr is 0.1 and gamma is 0.1; then you can use:
import tensorflow as tf
from tensorflow.python.ops import control_flow_ops

global_step = tf.placeholder(dtype=tf.int64)

learning_rate = tf.case(
    [(tf.less(global_step, 100), lambda: tf.constant(0.1)),
     (tf.less(global_step, 200), lambda: tf.constant(0.01))],
    default=lambda: tf.constant(0.001))

with tf.Session() as sess:
    print(sess.run(learning_rate, {global_step: 0}))    # 0.1
    print(sess.run(learning_rate, {global_step: 1}))    # 0.1
    print(sess.run(learning_rate, {global_step: 99}))   # 0.1
    print(sess.run(learning_rate, {global_step: 100}))  # 0.01
    print(sess.run(learning_rate, {global_step: 101}))  # 0.01
    print(sess.run(learning_rate, {global_step: 199}))  # 0.01
    print(sess.run(learning_rate, {global_step: 200}))  # 0.001
    print(sess.run(learning_rate, {global_step: 201}))  # 0.001
or
import tensorflow as tf
from tensorflow.python.ops import control_flow_ops

global_step = tf.placeholder(dtype=tf.int64)

learning_rate = control_flow_ops.merge(
    [control_flow_ops.switch(tf.constant(0.1),
                             tf.less(global_step, 100))[1],
     control_flow_ops.switch(tf.constant(0.01),
                             tf.logical_and(tf.greater_equal(global_step, 100),
                                            tf.less(global_step, 200)))[1],
     control_flow_ops.switch(tf.constant(0.001),
                             tf.greater_equal(global_step, 200))[1]])[0]

with tf.Session() as sess:
    print(sess.run(learning_rate, {global_step: 0}))    # 0.1
    print(sess.run(learning_rate, {global_step: 1}))    # 0.1
    print(sess.run(learning_rate, {global_step: 99}))   # 0.1
    print(sess.run(learning_rate, {global_step: 100}))  # 0.01
    print(sess.run(learning_rate, {global_step: 101}))  # 0.01
    print(sess.run(learning_rate, {global_step: 199}))  # 0.01
    print(sess.run(learning_rate, {global_step: 200}))  # 0.001
    print(sess.run(learning_rate, {global_step: 201}))  # 0.001
The code was tested with tensorflow 0.12.1.
Upvotes: 0