I was trying to implement a DQN to solve the CartPole-v0 task in the OpenAI Gym. Unfortunately, my implementation's performance does not seem to be improving.
Currently, as training progresses, the episode reward actually decreases, whereas the goal is to find better policies that increase it.
I am using experience replay and a separate target network to back up my Q-values. I tried adding/deleting layers and neurons in the agent; this did not work. I also altered the schedule for decaying the exploration rate; this did not work either. I've grown increasingly convinced that something is wrong with my loss function, but I'm not sure how I could change it to improve performance.
Here's my code for the loss function:
with tf.variable_scope('loss'):
    one_hot_mask = self.one_hot_actions
    eval = tf.reduce_max(self.q * one_hot_mask, axis=1)
    print(eval)
    trg = tf.reduce_max(self.q_targ, axis=1) * self.gamma
    print(trg)
    label = trg + self.rewards
    self.loss = tf.reduce_mean(tf.square(label - eval))
Where one_hot_actions is a placeholder defined as:
self.one_hot_actions = tf.placeholder(tf.float32, [None, self.env.action_space.n], 'one_hot_actions')
Here's my full code:
import tensorflow as tf
import numpy as np
import gym
import sys
import random
import math
import matplotlib.pyplot as plt


class Experience(object):
    """Experience buffer for experience replay"""
    def __init__(self, size):
        super(Experience, self).__init__()
        self.size = size
        self.memory = []

    def add(self, sample):
        self.memory.append(sample)
        if len(self.memory) > self.size:
            self.memory.pop(0)


class Agent(object):
    def __init__(self, env, ep_max, ep_len, gamma, lr, batch, epochs, s_dim, minibatch_size):
        super(Agent, self).__init__()
        self.ep_max = ep_max
        self.ep_len = ep_len
        self.gamma = gamma
        self.experience = Experience(100)
        self.lr = lr
        self.batch = batch
        self.minibatch_size = minibatch_size
        self.epochs = epochs
        self.s_dim = s_dim
        self.sess = tf.Session()
        self.env = gym.make(env).unwrapped
        self.state_0s = tf.placeholder(tf.float32, [None, self.s_dim], 'state_0s')
        self.actions = tf.placeholder(tf.int32, [None, 1], 'actions')
        self.rewards = tf.placeholder(tf.float32, [None, 1], 'rewards')
        self.states = tf.placeholder(tf.float32, [None, self.s_dim], 'states')
        self.one_hot_actions = tf.placeholder(tf.float32, [None, self.env.action_space.n], 'one_hot_actions')
        # q nets
        self.q, q_params = self.build_dqn('primary', trainable=True)
        self.q_targ, q_targ_params = self.build_dqn('target', trainable=False)
        with tf.variable_scope('update_target'):
            self.update_target_op = [targ_p.assign(p) for p, targ_p in zip(q_params, q_targ_params)]
        with tf.variable_scope('loss'):
            one_hot_mask = self.one_hot_actions
            eval = tf.reduce_max(self.q * one_hot_mask, axis=1)
            print(eval)
            trg = tf.reduce_max(self.q_targ, axis=1) * self.gamma
            print(trg)
            label = trg + self.rewards
            self.loss = tf.reduce_mean(tf.square(label - eval))
        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
        tf.summary.FileWriter("log/", self.sess.graph)
        self.sess.run(tf.global_variables_initializer())

    def build_dqn(self, name, trainable):
        with tf.variable_scope(name):
            if name == "primary":
                l1 = tf.layers.dense(self.state_0s, 100, tf.nn.relu, trainable=trainable)
            else:
                l1 = tf.layers.dense(self.states, 100, tf.nn.relu, trainable=trainable)
            l2 = tf.layers.dense(l1, 50, tf.nn.relu, trainable=trainable)
            q = tf.layers.dense(l2, self.env.action_space.n, trainable=trainable)
        params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
        return q, params

    def choose_action(self, s, t):
        s = s[np.newaxis, :]
        if random.uniform(0, 1) < self.get_explore_rate(t):
            a = self.env.action_space.sample()
        else:
            a = np.argmax(self.sess.run(self.q, {self.state_0s: s})[0])
        return a

    def get_explore_rate(self, t):
        return max(0.01, min(1, 1.0 - math.log10((t + 1) / 25)))

    def update(self):
        # experience is [ [s_0, a, r, s], [s_0, a, r, s], ... ]
        self.sess.run(self.update_target_op)
        indices = np.random.choice(range(len(self.experience.memory)), self.batch)
        # indices = range(len(experience))
        state_0 = [self.experience.memory[index][0] for index in indices]
        a = [self.experience.memory[index][1] for index in indices]
        rs = [self.experience.memory[index][2] for index in indices]
        state = [self.experience.memory[index][3] for index in indices]
        [self.sess.run(self.train_op, feed_dict={self.state_0s: state_0,
                                                 self.one_hot_actions: a,
                                                 self.rewards: np.asarray(rs).reshape([-1, 1]),
                                                 self.states: state}) for _ in range(self.epochs)]

    def run(self):
        all_ep_r = []
        for ep in range(self.ep_max):
            s_0 = self.env.reset()
            ep_r = 0
            for t in range(self.ep_len):
                fake_ac = [0.0, 0.0]  # used to make one hot actions
                # self.env.render()
                a = self.choose_action(s_0, ep)
                s, r, done, _ = self.env.step(a)
                if done:
                    s = np.zeros(np.shape(s_0))
                fake_ac[a] = 1.0
                print(fake_ac)
                self.experience.add([s_0, fake_ac, r, s])
                s_0 = s
                ep_r += r
                if done:
                    break
            all_ep_r.append(ep_r)
            print(
                'Ep: %i' % ep,
                "|Ep_r: %i" % ep_r,
            )
            if len(self.experience.memory) > self.batch - 1:
                self.update()
        return all_ep_r
agent = Agent("CartPole-v0", 200, 200, 0.99, 0.00025, 32, 10, 4, 16)
all_ep_r = agent.run()
plt.plot(range(len(all_ep_r)), all_ep_r)
plt.show()
Simon's comment is right. Your code for the loss function is not correct because you are not taking the terminal state into account.
The target, trg, should be reward + gamma * max Q(next state) if and only if the next state is non-terminal. If the next state is terminal (the pole falls and the episode is over), the target is just the reward.
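For concreteness, here is a minimal sketch of one way to wire that into the code above. It assumes a new self.dones placeholder (1.0 when the transition ended the episode, 0.0 otherwise), that each replay sample stores the done flag as a fifth element, and that the flag is fed in update(); the names and shapes are illustrative rather than a drop-in patch:

# in __init__, alongside the other placeholders (assumed addition)
self.dones = tf.placeholder(tf.float32, [None, 1], 'dones')

with tf.variable_scope('loss'):
    one_hot_mask = self.one_hot_actions
    # Q(s_0, a) for the action actually taken; keepdims keeps the shape [None, 1]
    eval = tf.reduce_max(self.q * one_hot_mask, axis=1, keepdims=True)
    # bootstrap term, zeroed out for terminal transitions
    trg = self.gamma * tf.reduce_max(self.q_targ, axis=1, keepdims=True) * (1.0 - self.dones)
    label = self.rewards + trg
    self.loss = tf.reduce_mean(tf.square(label - eval))

# in run(), store the done flag with each sample
self.experience.add([s_0, fake_ac, r, s, float(done)])

# in update(), pull the flags out of the sampled transitions and feed them
ds = [self.experience.memory[index][4] for index in indices]
self.sess.run(self.train_op, feed_dict={self.state_0s: state_0,
                                        self.one_hot_actions: a,
                                        self.rewards: np.asarray(rs).reshape([-1, 1]),
                                        self.states: state,
                                        self.dones: np.asarray(ds).reshape([-1, 1])})

The keepdims=True argument (keep_dims in older TF 1.x releases) is there so that eval, trg, and label are all shaped [None, 1]. In your original loss, trg and eval are shaped [None] while self.rewards is [None, 1], so trg + self.rewards broadcasts to a [None, None] matrix rather than a per-sample vector, which is worth checking separately.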