Reputation: 42
Short version: I can't find the variable(s) that control either:
A) the second dimension of a variable in a trajectory, e.g. the 3 in
Trajectory({'action': <tf.Tensor: shape=(64, 3),
or B) the number of dimensions a qnet expects during training.
I'm following this tutorial:
https://www.tensorflow.org/agents/tutorials/1_dqn_tutorial
By inserting print statements into the tutorial, as shown below, I can see examples of the data as it is passed into training:
for _ in range(num_iterations):
    # Collect a few steps using collect_policy and save to the replay buffer.
    for _ in range(collect_steps_per_iteration):
        collect_step(train_env, agent.collect_policy)
    # Sample a batch of data from the buffer and update the agent's network.
    experience, unused_info = next(iterator)
    print(experience)
    print(type(experience))
    train_loss = agent.train(experience)
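(For reference, a quick way to see just the shape rather than the whole tensor: experience is a Trajectory namedtuple, so its action field can be inspected directly.)
print(experience.action.shape)   # prints (64, 3): the batch size, then the dimension I'm asking about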
Where does the second dimension of shape = (64, 3) in the following output come from?
Trajectory(
{'action': <tf.Tensor: shape=(64, 3), dtype=int64, numpy=
array([[1, 0, 0],
[0, 0, 0],
[1, 1, 0],
[0, 0, 0],
[1, 0, 0],
[1, 0, 1], .....
Below is the only relevant part of the code that contains the number 3; changing it, however, does nothing:
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, sample_batch_size=batch_size,
    num_steps=n_step_update + 1).prefetch(3)
I've been building my own system, similar to this, which has (seemingly at random) decided to use a dimension of 2 instead of 3:
Trajectory(
{'action': <tf.Tensor: shape=(15, 2), dtype=int32, numpy=
array([[4, 0],
[3, 0],
[3, 3],
[5, 5],
[0, 0],
[0, 0],
[0, 0],
This personal example is for a system playing a game of Connect 4.
In [4, 0], the first term is what I pass in to my system: it is the x index of the slot chosen by the machine learning system, i.e. my action. The second term is the slot chosen by the opponent (a simple Connect 4 playing script).
I'm having a problem with the dimensions, because in my own example the agent expects 3 while my data has 2:
ValueError: The agent was configured to expect a sequence_length of '3' ... but at least one of the Tensors in value has a time axis dim value '2'
In this example my system does produce a dimension of 2, and I believe my qnet (somehow) expects a dimension of 3.
Code for my qnet is below.
# this is in my environment's initializer (I just thought you needed to see this)
self._action_spec = tf_agents.specs.BoundedArraySpec(
    shape=(), dtype=np.int32, minimum=0, maximum=game.x_rows - 1, name='action')
self._observation_spec = tf_agents.specs.BoundedArraySpec(
    shape=(1, game.x_rows, game.y_rows), dtype=np.float32, minimum=0, name='observation')
###### my qnet (how does it get the demand of 3 dimensions?)
q_net = tf_agents.networks.q_network.QNetwork(
    input_tensor_spec=the_env.observation_spec(),
    action_spec=the_env.action_spec(),
    preprocessing_layers=None,
    preprocessing_combiner=None,
    conv_layer_params=None,
    fc_layer_params=(75, 40),
    dropout_layer_params=None,
    activation_fn=tf.keras.activations.relu,
    kernel_initializer=None,
    batch_squash=True,
    dtype=tf.float32,
    q_layer_activation_fn=None,
    name='QNetwork')
Here is the only thing that specifies layer sizes, which in my mind is related to the dimensions of the input data, and it is nothing like 3, so why does my qnet want 3?
fc_layer_params=(75, 40)
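For what it's worth, my understanding (an assumption on my part, not verified against the TF-Agents source) is that fc_layer_params only sets the widths of the hidden layers, something roughly like the sketch below, so nothing in it should produce a 3:
import tensorflow as tf

# Rough sketch of what I assume fc_layer_params=(75, 40) builds inside QNetwork:
# two dense hidden layers plus a final layer with one Q value per action.
# num_actions here is assumed from my action spec (0 .. game.x_rows-1 with x_rows=20).
num_actions = 20
assumed_equivalent = tf.keras.Sequential([
    tf.keras.layers.Flatten(),                     # flatten the (1, 20, 20) observation
    tf.keras.layers.Dense(75, activation='relu'),  # fc_layer_params[0]
    tf.keras.layers.Dense(40, activation='relu'),  # fc_layer_params[1]
    tf.keras.layers.Dense(num_actions),            # Q values, one per action
])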
###############################################
Full reproducible code is below.
import tensorflow as tf
from tf_agents.networks import q_network
from tf_agents.agents.dqn import dqn_agent
import tf_agents
import tf_agents.environments.py_environment as PyEnvironment
from tf_agents.trajectories import time_step as ts
import numpy as np
import keras
import tf_agents.policies.random_tf_policy as random_tf_policy
import tf_agents.environments as tf_py_environment
import math
import random
import copy
class simple_con_4_game():
    def __init__(self, x, y):
        self.x_rows = x
        self.y_rows = y
        self.slots = []
        for i in range(x):
            ys = []
            for j in range(y):
                ys.append(0)
            self.slots.append(ys)

    def new_game(self):
        for xs in self.slots:
            for slot in xs:
                slot = 0

    def check_for_this_line_of_4(self, slot1_indexes, slot2_indexes):
        x1, y1, x2, y2 = slot1_indexes[0], slot1_indexes[1], slot2_indexes[0], slot2_indexes[1]
        x_dif = x1 - x2
        y_dif = y1 - y2
        players = [1, 2]  # 1 = ml, 2 = simple script, 0 = empty
        for p in players:
            all_good = True
            if self.slots[x1][y1] != p:
                all_good = False
            for i in range(3):
                try:
                    x_ind = (x_dif * i) + x1
                    y_ind = (y_dif * i) + y1
                    if self.slots[x_ind][y_ind] != p:
                        all_good = False
                except IndexError:
                    pass
            if all_good:
                return p
        return False

    def check_for_any_line_of_4(self):
        a = [-1, 0, 1]
        for xs, x in zip(self.slots, range(len(self.slots))):
            for slot, y in zip(xs, range(len(xs))):
                for extra_x in a:
                    for extra_y in a:
                        if extra_x != 0 and extra_y != 0:
                            worked = self.check_for_this_line_of_4([x, y], [x + extra_x, y + extra_y])
                            if type(worked) == type(False):
                                pass
                            #elif worked == 1: # ml system won
                            else:
                                return worked
        return False

    def find_lowest_slot(self, x):
        lowest_y = 9999
        best_slot = "none"
        for slot, y_ind in zip(self.slots[x], range(len(self.slots[x]))):
            if slot == 0:
                if y_ind < lowest_y:
                    lowest_y = y_ind
        if lowest_y != 9999:
            return lowest_y
        return False

    def ml_plays_turn(self, action):
        y = self.find_lowest_slot(action)
        self.slots[action][y] = 1

    def script_plays_turn(self, action=5):
        y = self.find_lowest_slot(action)
        self.slots[action][y] = 1
class Con4Env(PyEnvironment.PyEnvironment):
    def __init__(self, game):
        self.game = game
        self._action_spec = tf_agents.specs.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=game.x_rows - 1, name='action')
        self._observation_spec = tf_agents.specs.BoundedArraySpec(
            shape=(1, game.x_rows, game.y_rows), dtype=np.float32, minimum=0, name='observation')
        self._state = np.zeros((game.x_rows, game.y_rows), dtype=np.float32)
        self._time_step_spec = ts.time_step_spec(self._observation_spec)
        self._episode_ended = False

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def _reset(self):
        self._state = np.zeros((game.x_rows, game.y_rows), dtype=np.float32)
        self._episode_ended = False
        return ts.restart(np.array([self._state], dtype=np.float32))

    def copy_gameboard_to_state(self):
        for ys, yind in zip(self.game.slots, range(len(self.game.slots))):
            for x, xind in zip(ys, range(len(ys))):
                if x.occupied_by_player:
                    self._state[xind][yind] = 1
                elif x.occupied_by_computer:
                    self._state[xind][yind] = 2
                else:
                    self._state[xind][yind] = 0

    def _step(self, action):
        if self._episode_ended:
            return self.reset()
        lin_found = self.game.check_for_any_line_of_4()
        if type(lin_found) == type(1):
            if lin_found == 1:
                reward = 1
            elif lin_found == 2:
                reward = -1
            self._episode_ended = True
        elif self.game.ml_plays_turn(action):
            self.game.script_plays_turn()
            self.copy_gameboard_to_state()
        else:
            reward = -0.05  #### column full, call it a draw
            self._episode_ended = True
        if self._episode_ended:  #### if the game ended last round we go in here one last time
            lin_found = self.game.check_for_any_line_of_4()
            if type(lin_found) == type(1):
                if lin_found == 1:
                    reward = 1
                elif lin_found == 2:
                    reward = -1
                self._episode_ended = True
            else:
                reward = -0.05  #### column full, call it a draw
                self._episode_ended = True
            self.game.new_game()
            return ts.termination(np.array([self._state], dtype=np.float32), reward)
        else:
            return ts.transition(np.array([self._state], dtype=np.float32), reward=0.0, discount=0.0)
game = simple_con_4_game(20 , 20)
the_env = Con4Env(game)
eval_env = Con4Env(game)
the_env = tf_py_environment.TFPyEnvironment(the_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_env)
from tf_agents.utils import common
step_type_spec = tf.TensorSpec(shape=(), dtype=tf.dtypes.int32, name='step_type')
reward_spec = tf.TensorSpec(shape=(), dtype=tf.dtypes.float32, name='reward_spec')
discount_spec = tf.TensorSpec(shape=(), dtype=tf.dtypes.float32, name='discount_spec')
time_step_spec = tf_agents.trajectories.TimeStep(step_type_spec, reward_spec, discount_spec, the_env.observation_spec())
q_net = tf_agents.networks.q_network.QNetwork(
    input_tensor_spec=the_env.observation_spec(),
    action_spec=the_env.action_spec(),
    preprocessing_layers=None,
    preprocessing_combiner=None,
    conv_layer_params=None,
    fc_layer_params=(75, 40),
    dropout_layer_params=None,
    activation_fn=tf.keras.activations.relu,
    kernel_initializer=None,
    batch_squash=True,
    dtype=tf.float32,
    q_layer_activation_fn=None,
    name='QNetwork'
)
train_step_counter = tf.Variable(0)
gamma = 0.99
min_q_value = -20 # @param {type:"integer"}
max_q_value = 20 # @param {type:"integer"}
n_step_update = 2 # @param {type:"integer"}
agent = dqn_agent.DqnAgent(
    time_step_spec,
    the_env.action_spec(),
    q_net,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.000001),
    n_step_update=n_step_update,
    td_errors_loss_fn=common.element_wise_squared_loss,
    gamma=gamma,
    train_step_counter=train_step_counter
)
random_policy = random_tf_policy.RandomTFPolicy(time_step_spec, the_env.action_spec())
def compute_avg_return(environment, policy, num_episodes=10):
    total_return = 0.0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0.0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return
    avg_return = total_return / num_episodes
    return avg_return.numpy()[0]
compute_avg_return(the_env, random_policy, num_episodes=3)
# data collector
from tf_agents.utils import common
import copy
#@test {"skip": true}
replay_buffer_capacity = 999
initial_collect_steps = 100
batch_size = 15
n_step_update = 1
replay_buffer = tf_agents.replay_buffers.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=the_env.batch_size,
    max_length=replay_buffer_capacity
)
def collect_step(environment, policy):
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = tf_agents.trajectories.from_transition(time_step, action_step, next_time_step)
    replay_buffer.add_batch(traj)

for _ in range(initial_collect_steps):
    collect_step(the_env, random_policy)
dataset = replay_buffer.as_dataset(
    num_parallel_calls=2, sample_batch_size=batch_size,
    num_steps=n_step_update + 1).prefetch(2)
iterator = iter(dataset)
# this bugga needs to work
# Training the agent
from tf_agents.utils import common
import copy
num_eval_episodes = 50
num_iterations = 30
collect_steps_per_iteration = 200
# log_interval and eval_interval are used in the loop below but were missing here; values assumed from the tutorial
log_interval = 200
eval_interval = 1000
# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)
agent.train_step_counter.assign(0)
# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]
for _ in range(num_iterations):
    for __ in range(collect_steps_per_iteration):
        collect_step(the_env, agent.collect_policy)
    # Sample a batch of data from the buffer and update the agent's network.
    experience, unused_info = next(iterator)
    print("exp below &*&*&*&*&*&* ")
    print(experience)  #################### <<<=========== THIS OBJECT IS THE PROBLEM !!!!!!!!!!!!!!
    print(type(experience))
    print("exp above, num below ")
    print(_)
    train_loss = agent.train(experience)  # takes tf_agents.typing.types.NestedTensor but I'm passing a Trajectory
    step = agent.train_step_counter.numpy()
    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss.loss))
    if step % eval_interval == 0:
        avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
        print('step = {0}: Average Return = {1:.2f}'.format(step, avg_return))
        returns.append(avg_return)
Upvotes: 1
Views: 111
Reputation: 10463
The second dimension (the 3 in (64, 3)) comes from num_steps=n_step_update + 1:
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, sample_batch_size=batch_size,
    num_steps=n_step_update + 1).prefetch(3)
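A minimal sketch of that relationship, using the names from the question (illustration only): DqnAgent trains on windows of n_step_update + 1 consecutive time steps, so as_dataset must be called with a matching num_steps.
n_step_update = 2                  # value passed to DqnAgent(...)
expected_len = n_step_update + 1   # the agent trains on windows of 3 consecutive steps

dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=64,          # -> the 64 in shape=(64, 3)
    num_steps=expected_len)        # -> the 3 in shape=(64, 3)

# In the question's code the agent was built while n_step_update was 2, but the dataset
# was built after n_step_update had been reassigned to 1 (so num_steps=2), which is
# exactly the "sequence_length of '3' ... time axis dim value '2'" ValueError.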
Upvotes: 1