Mas

Reputation: 351

Adam optimizer updates momentum and velocity differently in eager and non-eager execution (Tensorflow)

I'm doing reinforcement learning on Atari games with TensorFlow and Gymnasium, and I noticed that training takes more iterations to converge when using tf.function with lazy (graph) execution than with eager execution. Lazy execution is still faster overall, though, even with the additional iterations.

Lazy execution seems to need more iterations because of how the optimizer's momentum and velocity variables are updated. Below is a minimal example showing that the momentum and velocity variables are different after five training iterations (the first two defs are just setting up some functions).

import tensorflow as tf
import numpy as np
import gymnasium as gym

### FOR FORUM

@tf.function
def discounted_cumulative_sums( #from https://www.tensorflow.org/tutorials/reinforcement_learning/actor_critic
    rewards: tf.Tensor,
    gamma: float,
    standardize: bool = True) -> tf.Tensor:
    """Compute expected returns per timestep."""

    n = tf.shape(rewards)[0]
    returns = tf.TensorArray(dtype=tf.float32, size=n)

    # Start from the end of `rewards` and accumulate reward sums
    # into the `returns` array
    rewards = tf.cast(rewards[::-1], dtype=tf.float32)
    discounted_sum = tf.constant(0.0)
    discounted_sum_shape = discounted_sum.shape
    for i in tf.range(n):
        reward = rewards[i]
        discounted_sum = reward + gamma * discounted_sum
        discounted_sum.set_shape(discounted_sum_shape)
        returns = returns.write(i, discounted_sum)
    returns = returns.stack()[::-1]

    if standardize:
        returns = ((returns - tf.math.reduce_mean(returns)) /
                   tf.math.reduce_std(returns))

    return returns

def make_model(input_shape,output_nodes) : 
    tf.keras.utils.set_random_seed(1956) #set seed so it always returns identical model    
    input_layer = tf.keras.layers.Input(input_shape)
    x = tf.keras.layers.Dense(64,activation = 'relu')(input_layer)
    x = tf.keras.layers.Dense(64,activation = 'relu')(x)
    output_layer = tf.keras.layers.Dense(output_nodes,activation = 'linear')(x)
    model = tf.keras.Model(input_layer,output_layer)
    return model

@tf.function
def train_model(old_states, A, model, optimizer) : 
    for i in tf.range(5) : 
        with tf.GradientTape() as tape : 
            loss = tf.reduce_mean(tf.squeeze(model(old_states),1) * A)

        grads = tape.gradient(loss,model.trainable_variables)
        optimizer.apply_gradients(zip(grads,model.trainable_variables))

    return optimizer.variables

# create data
td_error = tf.random.normal(shape=(100,))
old_states = tf.random.normal(shape=(100,8))

# create models and optimizers
eager_model = make_model((8,),1)
non_eager_model = make_model((8,),1)
eager_optimizer = tf.keras.optimizers.Adam(learning_rate = 0.0005)
non_eager_optimizer = tf.keras.optimizers.Adam(learning_rate = 0.0005)

tf.config.run_functions_eagerly(True) # EAGER EXECUTION
A = discounted_cumulative_sums(td_error,.99)
eager_opt_variables = train_model(old_states,A,eager_model,eager_optimizer)

tf.config.run_functions_eagerly(False) # NON-EAGER EXECUTION
A = discounted_cumulative_sums(td_error,.99) # if this is commented out, there will be no differences between optimizer variables
non_eager_opt_variables = train_model(old_states,A,non_eager_model,non_eager_optimizer)

# Show differences between optimizer variables after training
for x in range(22) : 
    print(eager_opt_variables[x].numpy()==non_eager_opt_variables[x].numpy())

Running this code shows that eager_opt_variables and non_eager_opt_variables differ in many places. While the differences are small, they must either grow over time or have a magnified impact on convergence speed.
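
As an extra check, here is a quick sketch (assuming the variables from the script above are still in scope) that prints the largest absolute difference per optimizer variable instead of an element-wise equality comparison:

# Hypothetical follow-up check: largest absolute difference per optimizer variable
for x in range(len(eager_opt_variables)):
    diff = np.abs(eager_opt_variables[x].numpy() - non_eager_opt_variables[x].numpy())
    print(x, np.max(diff))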

The interesting thing is that this seems to be caused by the A = discounted_cumulative_sums(td_error, .99) call. If we don't run the discounted_cumulative_sums function (for example by setting A = td_error), then there are no differences in optimizer variables between eager and lazy execution; see the sketch below.
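
For reference, a minimal sketch of that control experiment (the ctrl_* names are mine; it reuses make_model and train_model from above with fresh models and optimizers):

# Control experiment: skip discounted_cumulative_sums and feed the raw
# td_error as the advantage on both execution paths.
ctrl_eager_model = make_model((8,), 1)
ctrl_graph_model = make_model((8,), 1)
ctrl_eager_opt = tf.keras.optimizers.Adam(learning_rate=0.0005)
ctrl_graph_opt = tf.keras.optimizers.Adam(learning_rate=0.0005)

tf.config.run_functions_eagerly(True)
ctrl_eager_vars = train_model(old_states, td_error, ctrl_eager_model, ctrl_eager_opt)

tf.config.run_functions_eagerly(False)
ctrl_graph_vars = train_model(old_states, td_error, ctrl_graph_model, ctrl_graph_opt)

# With discounted_cumulative_sums out of the picture, the optimizer variables
# should come out element-wise identical on both paths.
for x in range(len(ctrl_eager_vars)):
    print(x, np.array_equal(ctrl_eager_vars[x].numpy(), ctrl_graph_vars[x].numpy()))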

My question is: why does this happen, and how can I get lazy execution to produce the same results as eager execution?

Upvotes: 0

Views: 74

Answers (1)

mhenning

Reputation: 1833

For me the optimizer variables are not all exactly the same even if I comment out the second A. But they are almost the same, as you can test with NumPy's allclose:

for x in range(len(eager_opt_variables)):
  print(np.allclose(eager_opt_variables[x].numpy(), non_eager_opt_variables[x].numpy()), end=', ')
print()
for x in range(len(eager_opt_variables)):
  print(np.allclose(non_eager_opt_variables[x].numpy(), eager_opt_variables[x].numpy()), end=', ')

These looped tests all evaluate to True for me. Note that np.allclose is not symmetrical (np.allclose(a, b) can give a different result than np.allclose(b, a)), so I tested both ways. This means that the variables are close to each other, while not exactly the same numbers; a small illustration of the asymmetry follows.
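
To make the asymmetry concrete, here is a tiny illustration with made-up numbers (the tolerance is exaggerated so the effect is visible). np.isclose(a, b) tests |a - b| <= atol + rtol * |b|, so the relative tolerance is scaled by the second argument:

# Made-up example: the same pair of values passes in one argument order and
# fails in the other, because rtol is scaled by the second argument.
a = np.array([0.905])
b = np.array([1.0])
print(np.allclose(a, b, rtol=0.1, atol=0.0))  # 0.095 <= 0.1 * 1.0   -> True
print(np.allclose(b, a, rtol=0.1, atol=0.0))  # 0.095 <= 0.1 * 0.905 -> False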
If you, for example, relax the relative tolerance while making the absolute tolerance stricter, you can see that not all variables are "close enough" anymore (the defaults are rtol=1e-5 and atol=1e-8):

for x in range(len(eager_opt_variables)):
  print(np.allclose(eager_opt_variables[x].numpy(), non_eager_opt_variables[x].numpy(), rtol=1e-4, atol=0.0), end=', ')
print()
for x in range(len(eager_opt_variables)):
  print(np.allclose(non_eager_opt_variables[x].numpy(), eager_opt_variables[x].numpy(), rtol=1e-4, atol=0.0), end=', ')

True, True, True, True, True, True, True, True, True, True, True, False, False,
True, True, True, True, True, True, True, True, True, True, True, False, False,

The interpretation is that for all variables except those of the last layer, the error is smaller than 0.01% of the reference values.
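
If you want to see the actual relative deviations instead of a pass/fail test, a sketch along these lines (reusing the variable lists from the question) reports the worst case per optimizer variable:

# Worst-case relative deviation per optimizer variable; a tiny epsilon in the
# denominator guards against division by zero.
for x in range(len(eager_opt_variables)):
    e = eager_opt_variables[x].numpy().astype(np.float64)
    g = non_eager_opt_variables[x].numpy().astype(np.float64)
    rel = np.abs(e - g) / (np.abs(g) + 1e-12)
    print(x, np.max(rel))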


The answer to the "why" is only a presumption of mine, but I think TensorFlow can order its floating-point math differently in graph mode (lazy mode in your question) and eager mode. I believe that in graph mode some operations are optimized, which leads to small rounding differences. These differences accumulate over the iterations and through the layers into the optimizer's moment variables. I'm not deep enough into the mechanisms of TensorFlow to prove my assumption, though.
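
The kind of effect I mean can be reproduced with plain float32 sums: mathematically equivalent operations, evaluated in a different order, usually round to slightly different results (a toy illustration, not the exact rewrite TensorFlow performs):

# Toy illustration of floating-point non-associativity: summing the same
# float32 values in a different order typically gives a slightly different
# result, because every intermediate sum is rounded.
vals = np.random.default_rng(0).normal(size=100_000).astype(np.float32)
print(np.sum(vals))        # one summation order
print(np.sum(vals[::-1]))  # same values, reversed order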

Upvotes: 1
