Reputation: 31
import numpy as np
import tensorflow as tf
from tf_agents.agents.reinforce import reinforce_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_gym, tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import actor_distribution_network
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
tf.compat.v1.enable_v2_behavior()
env_name='CartPole-v0'
num_iterations=1
collect_episodes_per_iteration=2
replay_buffer_capacity=2000
fc_layer_params=(100, )
learning_rate=1e-3
log_interval=5
num_eval_episodes=10
eval_interval=10
env=suite_gym.load(env_name)
env.reset()
time_step=env.reset()
train_py_env=suite_gym.load(env_name)
train_env=tf_py_environment.TFPyEnvironment(train_py_env)
actor_net=actor_distribution_network.ActorDistributionNetwork(train_env.observation_spec(), train_env.action_spec(), fc_layer_params=fc_layer_params)
optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
train_step_counter=tf.compat.v2.Variable(0)
tf_agent=reinforce_agent.ReinforceAgent(train_env.time_step_spec(),
train_env.action_spec(),
actor_network=actor_net,
optimizer=optimizer,
normalize_returns=True,
train_step_counter=train_step_counter)
tf_agent.initialize()
eval_policy=tf_agent.policy
collect_policy=tf_agent.collect_policy
replay_buffer=tf_uniform_replay_buffer.TFUniformReplayBuffer(
data_spec=tf_agent.collect_data_spec,
batch_size=train_env.batch_size,
max_length=replay_buffer_capacity
)
tf_agent.train=common.function(tf_agent.train)
def collect_episode(environment, policy, num_episodes):
  episode_counter=0
  environment.reset()
  while episode_counter<num_episodes:
    time_step=environment.current_time_step()
    action_step=policy.action(time_step)
    next_time_step=environment.step(action_step.action)
    traj=trajectory.from_transition(time_step, action_step, next_time_step)
    replay_buffer.add_batch(traj)
    if traj.is_boundary():
      episode_counter+=1
collect_episode(train_env, tf_agent.collect_policy, 1)
experience=replay_buffer.gather_all()
from copy import copy
for _ in range(num_iterations):
  collect_episode(train_env, tf_agent.collect_policy, collect_episodes_per_iteration)
  before=copy(tf_agent.trainable_variables)
  experience=replay_buffer.gather_all()
  train_loss=tf_agent.train(experience)
  replay_buffer.clear()
  after=copy(tf_agent.trainable_variables)
  print('before==after?', before==after)
https://www.tensorflow.org/agents/tutorials/6_reinforce_tutorial
I was following the TF-Agents REINFORCE tutorial (linked above), and I noticed that after running
before=copy(tf_agent.trainable_variables)
tf_agent.train(experience)
after=copy(tf_agent.trainable_variables)
'before' should be different from 'after', but (before == after) always evaluates to True.
I am very confused about this. I thought the gradients might be zero, but that seems unreasonable because the loss keeps decreasing from one training step to the next.
The gradient-tape step in the reinforce_agent module looks correct to me.
I can't find the problem... even tf_agent.policy.trainable_variables stays the same before and after a training step.
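Here is a minimal, self-contained sketch of the comparison I am doing (the single variable is just a stand-in for the agent's weights):
import tensorflow as tf
from copy import copy

v=tf.Variable([1.0, 2.0])
before=copy((v,))        # copy of a tuple of variables, like tf_agent.trainable_variables
v.assign([3.0, 4.0])     # stand-in for a training update that changes the weights
print(before==(v,))      # still prints True, even though the values changed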
Upvotes: 3
Views: 170
Reputation: 815
The reason for this is that tf_agent.trainable_variables
is a tuple of tf.Variable objects. copy only makes a shallow copy, so before
still holds references to the very same live variables, and those variables
are updated in place as the agent trains; you never snapshot their values.
To actually see the difference, take value snapshots with tf.identity:
before = []
for element in tf_agent.trainable_variables:
  before.append(tf.identity(element))

tf_agent.train(experience)

after = []
for element in tf_agent.trainable_variables:
  after.append(tf.identity(element))

# Compare the snapshots value by value; an elementwise tensor comparison
# can't be collapsed to a single Python bool with ==.
print(all(tf.reduce_all(tf.equal(b, a)).numpy() for b, a in zip(before, after)))
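If you prefer to compare on the NumPy side, an equivalent sketch (assuming eager execution is enabled, as in the tutorial) is to copy the values out with .numpy() and compare the arrays:
import numpy as np

before = [v.numpy() for v in tf_agent.trainable_variables]   # value snapshots, not references
tf_agent.train(experience)
after = [v.numpy() for v in tf_agent.trainable_variables]

# False as soon as any weight tensor changed during the train step
print(all(np.array_equal(b, a) for b, a in zip(before, after)))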
Upvotes: 1