Reputation: 3
I am trying to use the EpisodicReplayBuffer in TensorFlow TF-Agents with a simple custom environment, together with a DqnAgent and a QRnnNetwork. My code is adapted from this tutorial: https://www.tensorflow.org/agents/tutorials/1_dqn_tutorial, and I use the iterator produced by as_dataset() to grab episodes one at a time.
When I run the code, agent.train raises the following error when I pass it the experience from a single episode:
ValueError: All of the Tensors in `value` must have two outer dimensions: batch size and time. Specifically, tensors must have shape `[B, T] + spec.shape`.
Full shapes of value tensors:
Trajectory(step_type=TensorShape([101]), observation=TensorShape([101, 1]), action=TensorShape([101]), policy_info=(), next_step_type=TensorShape([101]), reward=TensorShape([101]), discount=TensorShape([101])).
Expected shapes (excluding the two outer dimensions):
Trajectory(step_type=TensorShape([]), observation=TensorShape([1]), action=TensorShape([]), policy_info=(), next_step_type=TensorShape([]), reward=TensorShape([]), discount=TensorShape([])).
The 101 in the error is the number of steps in this particular episode, and in general it will differ between episodes. When I used the UniformReplayBuffer, there was an extra outer dimension (the batch size), so an observation would have had a shape like [50, 50, 1], where the dimensions correspond to batch size, time steps, and then the observation spec. So, if I had to guess, I would think the EpisodicReplayBuffer should have given an observation shape of [1, 101, 1] (a batch containing one episode), but instead it just gave [101, 1]. I don't know how to make it do that, though.
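For reference, a quick way to see the mismatch is to print the shapes of one sampled trajectory next to the agent's data spec (just a debugging sketch; experience is one element drawn from the iterator created in the code below):
import tensorflow as tf
# Shapes actually yielded by the iterator, e.g. observation: (101, 1)
print(tf.nest.map_structure(lambda t: t.shape, experience))
# Per-step shapes the agent expects (excluding the [B, T] outer dims), e.g. observation: (1,)
print(tf.nest.map_structure(lambda s: s.shape, agent.collect_data_spec))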
Here is some code that may help make more sense of things:
from __future__ import absolute_import, division, print_function
import base64
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import q_network
from tf_agents.networks import q_rnn_network
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.replay_buffers import episodic_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
from SimpleMemEnv import Environment as Env
from tf_agents.environments import wrappers
from tf_agents.policies import policy_saver
def compute_avg_return(environment, policy, num_episodes=10, policy_state=()):
    total_return = 0.0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0.0
        while not time_step.is_last():
            if policy_state:
                action_step = policy.action(time_step, policy_state)
                policy_state = action_step.state
            else:
                action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return
    avg_return = total_return / num_episodes
    return avg_return.numpy()[0]
def collect_step(environment, policy, buffer, id, policy_state):
    time_step = environment.current_time_step()
    if policy_state:
        action_step = policy.action(time_step, policy_state)
        policy_state = action_step.state
    else:
        action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    id_tensor = tf.constant(id, dtype=tf.int64)
    buffer.add_batch(traj, id_tensor)
    if time_step.is_last():
        id[0] += 1
    return policy_state
def collect_data(env, policy, buffer, steps, id, policy_state=()):
    for _ in range(steps):
        policy_state = collect_step(env, policy, buffer, id, policy_state)
    return policy_state
tf.compat.v1.enable_v2_behavior()
num_iterations = 100000
collect_steps_per_iteration = 1
initial_collect_steps = 100
replay_buffer_max_length = 100000
batch_size = 55
learning_rate = 1e-4
log_interval = 200
num_eval_episodes = 1
eval_interval = 1000
train_py_env = wrappers.TimeLimit(Env(), duration=1000)
eval_py_env = wrappers.TimeLimit(Env(), duration=1000)
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)
fc_layer_params = (100,)
q_net = q_network.QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=fc_layer_params)
rq_net = q_rnn_network.QRnnNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    input_fc_layer_params=fc_layer_params,
    output_fc_layer_params=fc_layer_params,
    lstm_size=fc_layer_params)
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
train_step_counter = tf.Variable(0)
agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=rq_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)
agent.initialize()
random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                train_env.action_spec())
replay_buffer = episodic_replay_buffer.EpisodicReplayBuffer(
    data_spec=agent.collect_data_spec,
    capacity=1000,
    completed_only=True)
episode_id = [0]
collect_data(train_env, random_policy, replay_buffer, initial_collect_steps, episode_id)
# Dataset yields one complete episode at a time: trajectories shaped [T] + spec.shape.
dataset = replay_buffer.as_dataset(
    sample_batch_size=None,
    num_steps=None)
iterator = iter(dataset)
# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)
# Reset the train step
agent.train_step_counter.assign(0)
# Evaluate the agent's policy once before training.
policy_state = agent.policy.get_initial_state(batch_size=train_env.batch_size)
avg_return = 0
returns = [avg_return]
print('step = {0}: Average Return = {1}'.format(0, avg_return))
collect_policy_state = agent.collect_policy.get_initial_state(batch_size=train_env.batch_size)
step = 0
train_loss = None
for _ in range(num_iterations):
    # Collect a few steps using collect_policy and save to the replay buffer.
    collect_policy_state = collect_data(train_env,
                                        agent.collect_policy,
                                        replay_buffer,
                                        collect_steps_per_iteration,
                                        episode_id,
                                        collect_policy_state)
    # Sample a batch of data from the buffer and update the agent's network.
    step += 1
    if step % 150 == 0:
        experience, unused_info = next(iterator)
        train_loss = agent.train(experience).loss
    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss))
    if step % eval_interval == 0:
        policy_state = agent.policy.get_initial_state(batch_size=train_env.batch_size)
        avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes, policy_state)
        print('step = {0}: Average Return = {1}'.format(step, avg_return))
        returns.append(avg_return)
iterations = range(0, num_iterations + 1, eval_interval)
plt.plot(iterations, returns)
plt.ylabel('Average Return')
plt.xlabel('Iterations')
plt.show()
A simple environment (in SimpleMemEnv.py) is:
import abc
import tensorflow as tf
import numpy as np
from matplotlib import colors
import random
from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts
from tf_agents.environments import wrappers
from enum import Enum
import matplotlib.pyplot as plt
tf.compat.v1.enable_v2_behavior()
class Environment(py_environment.PyEnvironment):
    def __init__(self):
        self._action_spec = array_spec.BoundedArraySpec(shape=(),
                                                        dtype=np.int32,
                                                        minimum=0,
                                                        maximum=1,
                                                        name='action')
        self._observation_spec = array_spec.BoundedArraySpec(shape=(1,),
                                                             dtype=np.int32,
                                                             minimum=[0],
                                                             maximum=[1],
                                                             name='observation')
        self.state = np.zeros((1,), dtype=np.int32)
        self.stateCounter = 0
        self.count = 0
        self.maxSteps = 100

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def _reset(self):
        self._episode_ended = False
        self.count = 0
        self.state = np.zeros((1,), dtype=np.int32)
        self.stateCounter = 0
        return ts.restart(self.state)

    def _step(self, action):
        if self._episode_ended:
            return self.reset()
        if self.stateCounter == 1:
            if self.state[0] == 1:
                self.state[0] = 0
            else:
                self.state[0] = 1
        self.stateCounter = not self.stateCounter
        if action == self.state[0]:
            reward = 10
        else:
            reward = -10
        self.count += 1
        if self.count < self.maxSteps:
            return ts.transition(self.state, reward=reward, discount=0.9)
        else:
            self._episode_ended = True
            return ts.termination(self.state, reward=reward)
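The environment can also be sanity-checked with TF-Agents' validate_py_environment (a quick standalone check, separate from the script above):
from tf_agents.environments import utils
from SimpleMemEnv import Environment as Env
# Runs a few random episodes and verifies observations and rewards match the specs.
utils.validate_py_environment(Env(), episodes=5)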
Thanks
Upvotes: 0
Views: 1056
Reputation: 38
As you said, there should be another outer dimension (the batch dimension). You can add it to every tensor in the trajectory using tf.nest.map_structure(...):
Replace this:
experience, unused_info = next(iterator)
train_loss = agent.train(experience).loss
With:
experience, unused_info = next(iterator)
batched_exp = tf.nest.map_structure(
    lambda t: tf.expand_dims(t, axis=0),
    experience
)
train_loss = agent.train(batched_exp).loss
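If you prefer to keep the training loop unchanged, the same expand_dims can instead be applied once inside the dataset pipeline (a sketch of the same idea):
# Add the batch dimension in the pipeline, so every sampled episode
# already comes out shaped [1, T] + spec.shape.
dataset = replay_buffer.as_dataset(sample_batch_size=None, num_steps=None)
dataset = dataset.map(
    lambda traj, info: (
        tf.nest.map_structure(lambda t: tf.expand_dims(t, axis=0), traj),
        info))
iterator = iter(dataset)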
Upvotes: 1