Reputation: 151
I am trying to write my own DQN algorithm in Python, using Tensorflow following the paper(Mnih et al., 2015). In train_DQN
function, I have defined the training procedure, and DQN_CartPole
is for defining the function approximation(simple 3-layered Neural Network). For loss function, Huber loss or MSE is implemented followed by the gradient clipping(between -1 and 1). Then, I have implemented soft-update method instead of hard-update of the target network by copying the weights in the main network.
I am trying it on the CartPole environment(OpenAI gym), but the rewards does not improve as it does in other people's algorithms, such as keras-rl. Any help will be appreciated.
If possible, could you have a look at the source code?
class Parameters:
def __init__(self, mode=None):
assert mode != None
print("Loading Params for {} Environment".format(mode))
if mode == "Atari":
self.state_reshape = (1, 84, 84, 1)
self.num_frames = 1000000
self.memory_size = 10000
self.learning_start = 10000
self.sync_freq = 1000
self.batch_size = 32
self.gamma = 0.99
self.update_hard_or_soft = "soft"
self.soft_update_tau = 1e-2
self.epsilon_start = 1.0
self.epsilon_end = 0.01
self.decay_steps = 1000
self.prioritized_replay_alpha = 0.6
self.prioritized_replay_beta_start = 0.4
self.prioritized_replay_beta_end = 1.0
self.prioritized_replay_noise = 1e-6
elif mode == "CartPole":
self.state_reshape = (1, 4)
self.num_frames = 10000
self.memory_size = 20000
self.learning_start = 100
self.sync_freq = 100
self.batch_size = 32
self.gamma = 0.99
self.update_hard_or_soft = "soft"
self.soft_update_tau = 1e-2
self.epsilon_start = 1.0
self.epsilon_end = 0.01
self.decay_steps = 500
self.prioritized_replay_alpha = 0.6
self.prioritized_replay_beta_start = 0.4
self.prioritized_replay_beta_end = 1.0
self.prioritized_replay_noise = 1e-6
class _DQN:
"""
Boilerplate for DQN Agent
"""
def __init__(self):
"""
define the deep learning model here!
"""
pass
def predict(self, sess, state):
"""
predict q-values given a state
:param sess:
:param state:
:return:
"""
return sess.run(self.pred, feed_dict={self.state: state})
def update(self, sess, state, action, Y):
feed_dict = {self.state: state, self.action: action, self.Y: Y}
_, loss = sess.run([self.train_op, self.loss], feed_dict=feed_dict)
# print(action, Y, sess.run(self.idx_flattened, feed_dict=feed_dict))
return loss
class DQN_CartPole(_DQN):
"""
DQN Agent for CartPole game
"""
def __init__(self, scope, env, loss_fn ="MSE"):
self.scope = scope
self.num_action = env.action_space.n
with tf.variable_scope(scope):
self.state = tf.placeholder(shape=[None, 4], dtype=tf.float32, name="X")
self.Y = tf.placeholder(shape=[None], dtype=tf.float32, name="Y")
self.action = tf.placeholder(shape=[None], dtype=tf.int32, name="action")
fc1 = tf.keras.layers.Dense(16, activation=tf.nn.relu)(self.state)
fc2 = tf.keras.layers.Dense(16, activation=tf.nn.relu)(fc1)
fc3 = tf.keras.layers.Dense(16, activation=tf.nn.relu)(fc2)
self.pred = tf.keras.layers.Dense(self.num_action, activation=tf.nn.relu)(fc3)
# indices of the executed actions
self.idx_flattened = tf.range(0, tf.shape(self.pred)[0]) * tf.shape(self.pred)[1] + self.action
# passing [-1] to tf.reshape means flatten the array
# using tf.gather, associate Q-values with the executed actions
self.action_probs = tf.gather(tf.reshape(self.pred, [-1]), self.idx_flattened)
if loss_fn == "huber_loss":
# use huber loss
self.losses = tf.subtract(self.Y, self.action_probs)
self.loss = huber_loss(self.losses)
elif loss_fn == "MSE":
# use MSE
self.losses = tf.squared_difference(self.Y, self.action_probs)
self.loss = tf.reduce_mean(self.losses)
else:
assert False
# you can choose whatever you want for the optimiser
# self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
self.optimizer = tf.train.AdamOptimizer()
# to apply Gradient Clipping, we have to directly operate on the optimiser
# check this: https://www.tensorflow.org/api_docs/python/tf/train/Optimizer#processing_gradients_before_applying_them
self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
self.clipped_grads_and_vars = [(ClipIfNotNone(grad, -1., 1.), var) for grad, var in self.grads_and_vars]
self.train_op = self.optimizer.apply_gradients(self.clipped_grads_and_vars)
def train_DQN(main_model, target_model, env, replay_buffer, policy, params):
"""
Train DQN agent which defined above
:param main_model:
:param target_model:
:param env:
:param params:
:return:
"""
# log purpose
losses, all_rewards, cnt_action = [], [], []
episode_reward, index_episode = 0, 0
with tf.Session() as sess:
# initialise all variables used in the model
sess.run(tf.global_variables_initializer())
state = env.reset()
start = time.time()
for frame_idx in range(1, params.num_frames + 1):
action = policy.select_action(sess, target_model, state.reshape(params.state_reshape))
cnt_action.append(action)
next_state, reward, done, _ = env.step(action)
replay_buffer.add(state, action, reward, next_state, done)
state = next_state
episode_reward += reward
if done:
index_episode += 1
state = env.reset()
all_rewards.append(episode_reward)
if frame_idx > params.learning_start and len(replay_buffer) > params.batch_size:
states, actions, rewards, next_states, dones = replay_buffer.sample(params.batch_size)
next_Q = target_model.predict(sess, next_states)
Y = rewards + params.gamma * np.max(next_Q, axis=1) * np.logical_not(dones)
loss = main_model.update(sess, states, actions, Y)
# Logging and refreshing log purpose values
losses.append(np.mean(loss))
logging(frame_idx, params.num_frames, index_episode, time.time()-start, episode_reward, np.mean(loss), cnt_action)
episode_reward = 0
cnt_action = []
start = time.time()
if frame_idx > params.learning_start and frame_idx % params.sync_freq == 0:
# soft update means we partially add the original weights of target model instead of completely
# sharing the weights among main and target models
if params.update_hard_or_soft == "hard":
sync_main_target(sess, main_model, target_model)
elif params.update_hard_or_soft == "soft":
soft_target_model_update(sess, main_model, target_model, tau=params.soft_update_tau)
return all_rewards, losses
np.logical_not(dones)
Upvotes: 0
Views: 2444
Reputation: 774
Briefly looking over, it seems that the dones
variable is a binary vector where 1
denotes done, and 0
denotes not-done.
You then use dones
here:
Y = rewards + params.gamma * np.argmax(next_Q, axis=1) * dones
So for all terminating transitions, you add the expected cumulative reward when following the policy for the rest of the episode (which is zero). For all non-terminating transitions, you do not add the expect cumulative reward.
I think you mean to do this the other way around, perhaps swap dones
in the above line of code with np.logical_not(dones)
?
Also, now that I look at it, it seems there is another major problem with this line. np.argmax(next_Q, axis=1)
returns the index of the maximum value in next_Q vector, not the actual maximum value. You need np.maximum(next_Q, axis=1)
(IIRC) to get the maximum expected reward of the next state's actions.
EDIT: The loss function is also very strangely defined. You are mixing Huber Loss with Mean-Squared-Error. If you want to use either huber_loss or MSE, you just compute them on the difference between the expected and predicted values. You appear to be doing both, which is certainly not a commonly defined loss function. For example, your model loss to use Huber Loss should just be:
self.loss = tf.reduce_mean(huber_loss(abs(self.Y - self.action_probs)))
Upvotes: 3