AskerOfQuestions

Using EpisodicReplayBuffer in TF-Agents

I am trying to use the EpisodicReplayBuffer in TensorFlow TF-Agents with a simple custom TF-Agents environment, together with a DqnAgent and a QRnnNetwork. My code is adapted from this tutorial: https://www.tensorflow.org/agents/tutorials/1_dqn_tutorial, and I'm using the iterator generated by as_dataset() to grab episodes one at a time.

When I try to run my code, I get the following error when trying to use agent.train on the experience from a single episode:

ValueError: All of the Tensors in `value` must have two outer dimensions: batch size and time. Specifically, tensors must have shape `[B, T] + spec.shape.
Full shapes of value tensors:
  Trajectory(step_type=TensorShape([101]), observation=TensorShape([101, 1]), action=TensorShape([101]), policy_info=(), next_step_type=TensorShape([101]), reward=TensorShape([101]), discount=TensorShape([101])).
Expected shapes (excluding the two outer dimensions):
  Trajectory(step_type=TensorShape([]), observation=TensorShape([1]), action=TensorShape([]), policy_info=(), next_step_type=TensorShape([]), reward=TensorShape([]), discount=TensorShape([])).

The 101 mentioned in the error is the number of steps in this particular episode, and in general it changes from episode to episode. When I used the UniformReplayBuffer there was an extra outer dimension, the batch size, so that the observation had a shape like [50, 50, 1]; those dimensions correspond to batch, time steps, and then the observation spec. So, if I had to guess, the EpisodicReplayBuffer should have given an observation shape like [1, 101, 1] (a batch containing one episode), but it just gave [101, 1]. I don't know how to make it do this, though.
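
For reference, the shapes can be inspected directly with something like the following (my own debugging snippet, not part of the tutorial, using the iterator defined in the code below); the error message above already shows the same shapes, i.e. no batch dimension anywhere:

# Debugging only: print the shape of every tensor in one sampled episode.
experience, unused_info = next(iterator)
print(tf.nest.map_structure(lambda t: t.shape, experience))
# e.g. Trajectory(step_type=TensorShape([101]), observation=TensorShape([101, 1]), ...),
# whereas agent.train wants [B, T] + spec.shape, e.g. observation [1, 101, 1].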

Here is some code that may help make more sense of things:

from __future__ import absolute_import, division, print_function

import base64
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import q_network
from tf_agents.networks import q_rnn_network
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.replay_buffers import episodic_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common

from SimpleMemEnv import Environment as Env
from tf_agents.environments import wrappers

from tf_agents.policies import policy_saver

def compute_avg_return(environment, policy, num_episodes=10, policy_state=()):

  total_return = 0.0
  for _ in range(num_episodes):

    time_step = environment.reset()
    episode_return = 0.0


    while not time_step.is_last():
      if policy_state:
          action_step = policy.action(time_step, policy_state)
          policy_state = action_step.state
      else:
          action_step = policy.action(time_step)

      time_step = environment.step(action_step.action)
      episode_return += time_step.reward
    total_return += episode_return

  avg_return = total_return / num_episodes
  return avg_return.numpy()[0]


def collect_step(environment, policy, buffer, id, policy_state):
  time_step = environment.current_time_step()
  if policy_state:
      action_step = policy.action(time_step, policy_state)
      policy_state = action_step.state
  else:
      action_step = policy.action(time_step)

  next_time_step = environment.step(action_step.action)
  traj = trajectory.from_transition(time_step, action_step, next_time_step)

  id_tensor = tf.constant(id, dtype=tf.int64)
  buffer.add_batch(traj, id_tensor)
  if time_step.is_last():
      id[0] += 1

  return policy_state

def collect_data(env, policy, buffer, steps, id, policy_state=()):
  for _ in range(steps):
    policy_state = collect_step(env, policy, buffer, id, policy_state)

  return policy_state
    

tf.compat.v1.enable_v2_behavior()

num_iterations = 100000 
collect_steps_per_iteration = 1  
initial_collect_steps = 100  
replay_buffer_max_length = 100000  
batch_size = 55 
learning_rate = 1e-4 


log_interval = 200 

num_eval_episodes = 1  
eval_interval = 1000  

train_py_env  = wrappers.TimeLimit(Env(), duration=1000)
eval_py_env  = wrappers.TimeLimit(Env(), duration=1000)

train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

fc_layer_params = (100,)

q_net = q_network.QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=fc_layer_params)

rq_net = q_rnn_network.QRnnNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    input_fc_layer_params=fc_layer_params,
    output_fc_layer_params=fc_layer_params,
    lstm_size= fc_layer_params)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

train_step_counter = tf.Variable(0)
    
agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=rq_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)


agent.initialize()


random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                train_env.action_spec())


replay_buffer = episodic_replay_buffer.EpisodicReplayBuffer(
    data_spec=agent.collect_data_spec,
    capacity = 1000,
    completed_only = True)

episode_id = [0]
collect_data(train_env, random_policy, replay_buffer, initial_collect_steps, episode_id)

# Dataset yields one full (variable-length) episode at a time
dataset = replay_buffer.as_dataset(
    sample_batch_size=None,
    num_steps=None)

iterator = iter(dataset)

# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)

# Reset the train step
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
policy_state = agent.policy.get_initial_state(batch_size=train_env.batch_size)
avg_return = 0
returns = [avg_return]

print('step = {0}: Average Return = {1}'.format(0, avg_return))

collect_policy_state = agent.collect_policy.get_initial_state(batch_size=train_env.batch_size)

step = 0
train_loss = None
for _ in range(num_iterations):

  # Collect a few steps using collect_policy and save to the replay buffer.
  collect_policy_state = collect_data(train_env,
                                   agent.collect_policy,
                                   replay_buffer,
                                   collect_steps_per_iteration,
                                   episode_id,
                                   collect_policy_state)


  # Sample a batch of data from the buffer and update the agent's network.

  step += 1
      
  if step % 150 == 0:
    experience, unused_info = next(iterator)
    train_loss = agent.train(experience).loss


  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss))

  if step % eval_interval == 0:
    policy_state = agent.policy.get_initial_state(batch_size=train_env.batch_size)
    avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes, policy_state)
    print('step = {0}: Average Return = {1}'.format(step, avg_return))
    returns.append(avg_return)
  
    
iterations = range(0, num_iterations + 1, eval_interval)
plt.plot(iterations, returns)

plt.ylabel('Average Return')
plt.xlabel('Iterations')
plt.show()

Here is the simple environment (in SimpleMemEnv.py):

import abc
import tensorflow as tf
import numpy as np
from matplotlib import colors
import random

from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts
from tf_agents.environments import wrappers
from enum import Enum

import matplotlib.pyplot as plt

tf.compat.v1.enable_v2_behavior()

class Environment(py_environment.PyEnvironment):
    def __init__(self):

        self._action_spec = array_spec.BoundedArraySpec(shape =  (),
                                                        dtype = np.int32,
                                                        minimum = 0,
                                                        maximum = 1,
                                                        name = 'action')

        self._observation_spec = array_spec.BoundedArraySpec(shape = (1,),
                                                             dtype = np.int32,
                                                             minimum = [0],
                                                             maximum = [1],
                                                             name = 'observation')

        self.state = np.zeros((1,), dtype = np.int32)
        self.stateCounter = 0
        self.count = 0
        self.maxSteps = 100
        self._episode_ended = False

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def _reset(self):
        self._episode_ended = False
        self.count = 0
        self.state = np.zeros((1,), dtype=np.int32)
        self.stateCounter = 0
        return ts.restart(self.state)

    def _step(self,action):
        if self._episode_ended:
            return self.reset()

        if self.stateCounter == 1:
            if self.state[0] == 1:
                self.state[0] = 0
            else:
                self.state[0] = 1
        self.stateCounter = not self.stateCounter
        if action == self.state[0]:
            reward = 10
        else:
            reward = -10
        self.count += 1
        if self.count<self.maxSteps:
            return ts.transition(self.state, reward=reward, discount=0.9)
        else:
            self._episode_ended = True
            return ts.termination(self.state, reward=reward)
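
As a sanity check on the environment itself (separate from the replay buffer question), something like the snippet below can be used with the validation helper that TF-Agents provides (utils is already imported above); I include it only as a sketch:

# Optional sanity check: validate the environment's specs and dynamics.
if __name__ == '__main__':
    utils.validate_py_environment(Environment(), episodes=2)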

Thanks

Answers (1)

Diaco

As you said, there should be another outer dimension (the batch dimension). You can add it to every tensor in the sampled trajectory with tf.nest.map_structure(...) and tf.expand_dims:

Replace this:

experience, unused_info = next(iterator)
train_loss = agent.train(experience).loss

With:

experience, unused_info = next(iterator)
batched_exp = tf.nest.map_structure(
    lambda t: tf.expand_dims(t, axis=0),
    experience
)
train_loss = agent.train(batched_exp).loss
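
Alternatively (an untested sketch of the same idea), you can let tf.data add the batch dimension once when building the dataset instead of expanding dims on every training step. Since each dataset element is one complete episode, batching with size 1 avoids any padding issue even though episodes have different lengths:

# Each dataset element is one full episode, so batch(1) just prepends the
# batch dimension that agent.train expects ([1, T] + spec.shape).
dataset = replay_buffer.as_dataset(
    sample_batch_size=None,
    num_steps=None).batch(1)
iterator = iter(dataset)

# Later, in the training loop:
experience, unused_info = next(iterator)
train_loss = agent.train(experience).loss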
