Minjae Kwon

Reputation: 31

Why do tf_agent variables not change even after training?

TF version: 2.3.0

import numpy as np
import tensorflow as tf

from tf_agents.agents.reinforce import reinforce_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_gym, tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import actor_distribution_network
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common

tf.compat.v1.enable_v2_behavior()

env_name='CartPole-v0'
num_iterations=1
collect_episodes_per_iteration=2
replay_buffer_capacity=2000

fc_layer_params=(100, )

learning_rate=1e-3
log_interval=5
num_eval_episodes=10
eval_interval=10

env=suite_gym.load(env_name)

env.reset()

time_step=env.reset()

train_py_env=suite_gym.load(env_name)

train_env=tf_py_environment.TFPyEnvironment(train_py_env)
actor_net=actor_distribution_network.ActorDistributionNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=fc_layer_params)

optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
train_step_counter=tf.compat.v2.Variable(0)

tf_agent=reinforce_agent.ReinforceAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    actor_network=actor_net,
    optimizer=optimizer,
    normalize_returns=True,
    train_step_counter=train_step_counter)

tf_agent.initialize()

eval_policy=tf_agent.policy
collect_policy=tf_agent.collect_policy

replay_buffer=tf_uniform_replay_buffer.TFUniformReplayBuffer(
data_spec=tf_agent.collect_data_spec,
batch_size=train_env.batch_size,
max_length=replay_buffer_capacity
)

tf_agent.train=common.function(tf_agent.train)

def collect_episode(environment, policy, num_episodes):
    episode_counter=0
    environment.reset()

    while episode_counter<num_episodes:
        time_step=environment.current_time_step()
        action_step=policy.action(time_step)
        next_time_step=environment.step(action_step.action)
        traj=trajectory.from_transition(time_step, action_step, next_time_step)
      
        replay_buffer.add_batch(traj)

        if traj.is_boundary():
            episode_counter+=1

collect_episode(train_env, tf_agent.collect_policy, 1)
experience=replay_buffer.gather_all()

for _ in range(num_iterations):
    collect_episode(train_env, tf_agent.collect_policy, collect_episodes_per_iteration)

    from copy import copy

    before=copy(tf_agent.trainable_variables)
    experience=replay_buffer.gather_all()
    train_loss=tf_agent.train(experience)
    replay_buffer.clear()

    after=copy(tf_agent.trainable_variables)
    print('before==after?', before==after)
   

https://www.tensorflow.org/agents/tutorials/6_reinforce_tutorial

I was following the TF-Agents REINFORCE tutorial (linked above), but I found that if I run

before=copy(tf_agent.trainable_variables)

tf_agent.train(experience)     

after=copy(tf_agent.trainable_variables)

then 'before' should be different from 'after'. But (before == after) always evaluates to True.

I am very confused about this. I thought the gradients might be zero.

However, that seems unreasonable, because the model's loss keeps decreasing over the training steps.

In the reinforce_agent module, the gradient-tape step appears to be written correctly.

I can't find what the problem is... even tf_agent.policy.trainable_variables stays the same regardless of the training step.

Upvotes: 3

Views: 170

Answers (1)

Federico Malerba

Reputation: 815

The reason for this is that tf_agent.trainable_variables is a tuple of tf.Variable objects. Copying it with copy is only a shallow copy: the entries in before are still the very same variables, so their values get updated as the agent trains. To actually see a difference, snapshot the values with tf.identity:

before = []

# tf.identity copies the current value of each variable, so 'before' holds a
# snapshot rather than references to the live variables.
for element in tf_agent.trainable_variables:
    before.append(tf.identity(element))

tf_agent.train(experience)

after = []

# Take a second snapshot after the train step.
for element in tf_agent.trainable_variables:
    after.append(tf.identity(element))

print(before == after)
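
Note that comparing two lists of non-scalar tensors with == can be awkward in TF 2.x, where tensor equality is elementwise. If you want a single boolean, it may be clearer to reduce the comparison yourself. A minimal sketch, assuming before and after hold the tf.identity snapshots built above:

# Reduce each elementwise comparison to a single bool, then combine across variables.
unchanged = all(
    tf.reduce_all(tf.equal(b, a)).numpy()
    for b, a in zip(before, after)
)
print('unchanged?', unchanged)  # should be False once training actually updates the weights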

Upvotes: 1
