I successfully followed this official TensorFlow tutorial for training an agent to solve the 'CartPole-v0' gym environment. I diverged from the tutorial only in that I did not use Reverb, because it is not supported on Windows. I tried to modify the example to train the agent to solve my own (extremely simple) environment, but it fails to converge on a solution after 10,000 iterations, which I feel should be more than plenty.
I tried adjusting training iterations, learning rates, batch sizes, discounts, and everything else I could think of. Nothing had an effect on the result.
I would like the agent to converge on a policy that always gets the +1 reward (ideally within only a few hundred iterations, since this environment is so simple), rather than one that occasionally dips to -1. Instead, here's a graph of the actual outcome:
(The text is small so I will say that orange is episode length in steps, and blue is the average reward. The X axis is the number of training iterations, from 0 to 10,000.)
Imports
import numpy as np
import tf_agents as tfa
import tensorflow as tf
# for reproducibility
np.random.seed(100)
tf.random.set_seed(100)
Environment. You can probably skip reading the whole class; it passes validate_py_environment just fine, so I don't believe there is a problem here.
# This environment is very simple. You start at position = 0
# and at every step you can either move left, right, or do nothing
# if you move right 3 times (and get to position = 3) then you win.
# otherwise, if you go left to position = -3, then you lose.
# you also lose if it takes more than 10 steps.
# losing gives you -1 reward, winning gives you +1
class SimpleGame(tfa.environments.py_environment.PyEnvironment):
    def __init__(self):
        # 0 - move left
        # 1 - do nothing
        # 2 - move right
        self._action_spec = tfa.specs.array_spec.BoundedArraySpec(
            shape = (),
            dtype = np.int32,
            minimum = 0,
            maximum = 2,
            name = 'action'
        )
        self._observation_spec = tfa.specs.array_spec.BoundedArraySpec(
            shape = (1,),
            dtype = np.int32,
            minimum = -3,
            maximum = 3,
            name = 'observation'
        )
        self._position = 0
        self._step_counter = 0

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def _observe(self):
        return np.array([self._position], dtype = np.int32)

    def _reset(self):
        self._position = 0
        self._step_counter = 0
        return tfa.trajectories.time_step.restart(self._observe())

    def _step(self, action):
        if abs(self._position) >= 3 or self._step_counter >= 10:
            return self.reset()
        self._step_counter += 1
        if action == 0:
            self._position -= 1
        elif action == 1:
            pass
        elif action == 2:
            self._position += 1
        else:
            raise ValueError('`action` should be 0 (left), 1 (do nothing) or 2 (right). You gave `%s`' % action)
        reward = 0
        if self._position >= 3:
            reward = 1
        elif self._position <= -3 or self._step_counter >= 10:
            reward = -1
        if reward != 0:
            return tfa.trajectories.time_step.termination(
                self._observe(),
                reward
            )
        else: # this game isn't over yet
            return tfa.trajectories.time_step.transition(
                self._observe(),
                reward = 0,
                discount = 1.0
            )
# no issue here:
tfa.environments.utils.validate_py_environment(SimpleGame(), episodes=10)
Environment Instances
train_py_env = SimpleGame()
test_py_env = SimpleGame()
train_env = tfa.environments.tf_py_environment.TFPyEnvironment(train_py_env)
test_env = tfa.environments.tf_py_environment.TFPyEnvironment(test_py_env)
Agent Creation
q_network = tfa.networks.sequential.Sequential([
    tf.keras.layers.Dense(16, activation = 'relu'),
    tf.keras.layers.Dense(3, activation = None)
])
agent = tfa.agents.dqn.dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network = q_network,
    optimizer = tf.keras.optimizers.Adam(),
    td_errors_loss_fn = tfa.utils.common.element_wise_squared_loss,
    n_step_update = 1
)
agent.initialize()
agent.train = tfa.utils.common.function(agent.train)
Policy Evaluator. You can probably skip reading this. I believe it is correct.
# simulate some episodes by following the given policy
# return the average reward and episode length
def evaluate_policy(env, policy, episodes = 10):
    total_reward = 0.0
    total_steps = 0
    for ep in range(episodes):
        time_step = env.reset()
        # this will always just add 0, but kept for completeness
        total_reward += time_step.reward.numpy()[0]
        while not time_step.is_last():
            action_step = policy.action(time_step)
            action_tensor = action_step.action
            action = action_tensor.numpy()[0]
            time_step = env.step(action)
            total_reward += time_step.reward.numpy()[0]
            total_steps += 1
    average_reward = total_reward / episodes
    average_ep_length = total_steps / episodes
    return average_reward, average_ep_length
# evaluate policy before any training
avg_reward, avg_length = evaluate_policy(test_env, agent.policy)
print("initial policy gives average reward of %.2f after an average %d steps" % (avg_reward, avg_length))
#> initial policy gives average reward of -1.00 after an average 10 steps
Replay Buffer
replay_buffer = tfa.replay_buffers.tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec = agent.collect_data_spec,
    batch_size = train_env.batch_size,
    max_length = 10000
)
replay_dataset = replay_buffer.as_dataset(
    num_parallel_calls = 3,
    sample_batch_size = 64,
    num_steps = 2
).prefetch(3)

def record_experience(buffer, time_step, action_step, next_time_step):
    buffer.add_batch(
        tfa.trajectories.trajectory.from_transition(time_step, action_step, next_time_step)
    )
replay_dataset_iterator = iter(replay_dataset)
Training Process
time_step = train_env.reset()
episode_length_history = []
reward_history = []
for step in range(10000 + 1): # +1 just so that 10000 is included in the plot
    for _ in range(10):
        action_step = agent.collect_policy.action(time_step)
        action_tensor = action_step.action
        action = action_tensor.numpy()[0]
        new_time_step = train_env.step(action)
        reward = new_time_step.reward.numpy()[0]
        record_experience(replay_buffer, time_step, action_step, new_time_step)
        time_step = new_time_step
    training_experience, unused_diagnostics_info = next(replay_dataset_iterator)
    train_step = agent.train(training_experience)
    loss = train_step.loss
    print("step: %d, loss: %f" % (step, loss))
    if step % 100 == 0:
        avg_reward, avg_length = evaluate_policy(test_env, agent.policy)
        print("average reward: %.2f average steps: %d" % (avg_reward, avg_length))
        # record for the plot
        reward_history.append(avg_reward)
        episode_length_history.append(avg_length)
Plotting (not relevant to the issue, just included for completeness)
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.set_xlabel('train iterations')
fig.subplots_adjust(right=0.8)
reward_ax = ax
length_ax = ax.twinx()
length_ax.set_frame_on(True)
length_ax.patch.set_visible(False)
length_ax.set_ylabel('episode length', color = "orange")
length_ax.tick_params(axis = 'y', colors = "orange")
reward_ax.set_ylabel('reward', color = "blue")
reward_ax.tick_params(axis = 'y', colors = "blue")
train_iterations = [i * 100 for i in range(len(reward_history))]
reward_ax.plot(train_iterations, reward_history, color = "blue")
length_ax.plot(train_iterations, episode_length_history, color = "orange")
plt.show()
Answer
The cause of the issue was that the agent had no incentive to solve the problem quickly: winning after 9 steps and winning after 3 steps both result in the same reward. Because the step counter was not observed, the agent could not possibly correlate taking too long with losing, so it would occasionally run over the 10-step limit, lose, and be unable to learn from the experience.
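To make that concrete: the only way the agent could learn the time limit from observations alone would be to expose the step counter to it. This is a hypothetical sketch of that alternative (not the fix I used), shown only to illustrate what the agent could not see; the actual fix follows below.
# hypothetical alternative, NOT the fix I used: include the step counter in the observation
# in SimpleGame.__init__:
self._observation_spec = tfa.specs.array_spec.BoundedArraySpec(
    shape = (2,),          # [position, steps taken so far]
    dtype = np.int32,
    minimum = [-3, 0],
    maximum = [3, 10],
    name = 'observation'
)
# and the matching SimpleGame._observe:
def _observe(self):
    return np.array([self._position, self._step_counter], dtype = np.int32)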
I solved this by giving a -0.1 reward on every step, which incentivized the agent to solve the environment in as few steps as possible (so it never runs into the 10-step loss rule).
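The reward logic at the end of SimpleGame._step then becomes something like the sketch below (only the -0.1 per-step penalty is the real change; note the episode now has to end based on position and step count, since the reward is no longer 0 on ordinary steps).
# sketch of the reshaped reward at the end of SimpleGame._step, after the action is applied
reward = -0.1                                          # small penalty on every step
if self._position >= 3:
    reward = 1                                         # won
elif self._position <= -3 or self._step_counter >= 10:
    reward = -1                                        # lost
game_over = abs(self._position) >= 3 or self._step_counter >= 10
if game_over:
    return tfa.trajectories.time_step.termination(self._observe(), reward)
else: # this game isn't over yet
    return tfa.trajectories.time_step.transition(self._observe(), reward = reward, discount = 1.0)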
I also sped up the learning process by increasing the epsilon_greedy parameter of the DqnAgent constructor to 0.5 (from its default of 0.1), allowing the agent to explore the entire environment more quickly.
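With the agent construction from the question, that just means passing the extra keyword argument:
agent = tfa.agents.dqn.dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network = q_network,
    optimizer = tf.keras.optimizers.Adam(),
    td_errors_loss_fn = tfa.utils.common.element_wise_squared_loss,
    n_step_update = 1,
    epsilon_greedy = 0.5 # default is 0.1; the collect policy now picks a random action half the time
)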