ashboy64

Reputation: 93

Deep Q Network not Solving OpenAI CartPole

I was trying to implement a DQN to solve the CartPole-v0 task in the OpenAI Gym. Unfortunately, my implementation's performance does not seem to be improving.

Currently, as training progresses, the episode reward actually decreases, whereas the goal is to find better policies that increase it.

I am using experience replay and a separate target network to compute the target Q-values. I have tried adding and removing layers and neurons in the agent; this did not help. I also altered the schedule for decaying the exploration rate; that did not help either. I've grown increasingly convinced that something is wrong with my loss function, but I'm not sure how to change it to improve performance.

Here's my code for the loss function:

with tf.variable_scope('loss'):
    one_hot_mask = self.one_hot_actions
    eval = tf.reduce_max(self.q * one_hot_mask, axis=1)
    print(eval)
    trg = tf.reduce_max(self.q_targ, axis=1) * self.gamma
    print(trg)
    label = trg + self.rewards
    self.loss = tf.reduce_mean(tf.square(label - eval))

Where one_hot_actions is a placeholder defined as:

self.one_hot_actions = tf.placeholder(tf.float32, [None, self.env.action_space.n], 'one_hot_actions')

Here's my full code:

import tensorflow as tf
import numpy as np
import gym
import sys
import random
import math
import matplotlib.pyplot as plt

class Experience(object):
    """Experience buffer for experience replay"""
    def __init__(self, size):
        super(Experience, self).__init__()
        self.size = size
        self.memory = []
    def add(self, sample):
        self.memory.append(sample)
        if len(self.memory) > self.size:
            self.memory.pop(0)

class Agent(object):
    def __init__(self, env, ep_max, ep_len, gamma, lr, batch, epochs, s_dim, minibatch_size):
        super(Agent, self).__init__()
        self.ep_max = ep_max
        self.ep_len = ep_len
        self.gamma = gamma
        self.experience = Experience(100)
        self.lr = lr
        self.batch = batch
        self.minibatch_size = minibatch_size
        self.epochs = epochs
        self.s_dim = s_dim
        self.sess = tf.Session()
        self.env = gym.make(env).unwrapped

        self.state_0s = tf.placeholder(tf.float32, [None, self.s_dim], 'state_0s')
        self.actions = tf.placeholder(tf.int32, [None, 1], 'actions')
        self.rewards = tf.placeholder(tf.float32, [None, 1], 'rewards')
        self.states = tf.placeholder(tf.float32, [None, self.s_dim], 'states')

        self.one_hot_actions = tf.placeholder(tf.float32, [None, self.env.action_space.n], 'one_hot_actions')

        # q nets
        self.q, q_params = self.build_dqn('primary', trainable=True)
        self.q_targ, q_targ_params = self.build_dqn('target', trainable=False)

        with tf.variable_scope('update_target'):
            self.update_target_op = [targ_p.assign(p) for p, targ_p in zip(q_params, q_targ_params)]

        with tf.variable_scope('loss'):
            one_hot_mask = self.one_hot_actions
            eval = tf.reduce_max(self.q * one_hot_mask, axis=1)
            print(eval)
            trg = tf.reduce_max(self.q_targ, axis=1) * self.gamma
            print(trg)
            label = trg + self.rewards
            self.loss = tf.reduce_mean(tf.square(label - eval))

        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)

        tf.summary.FileWriter("log/", self.sess.graph)

        self.sess.run(tf.global_variables_initializer())

    def build_dqn(self, name, trainable):
        with tf.variable_scope(name):
            if name == "primary":
                l1 = tf.layers.dense(self.state_0s, 100, tf.nn.relu, trainable=trainable)
            else:
                l1 = tf.layers.dense(self.states, 100, tf.nn.relu, trainable=trainable)
            l2 = tf.layers.dense(l1, 50, tf.nn.relu, trainable=trainable)
            q = tf.layers.dense(l2, self.env.action_space.n, trainable=trainable)
        params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
        return q, params

    def choose_action(self, s, t):
        s = s[np.newaxis, :]
        if random.uniform(0,1) < self.get_explore_rate(t):
            a = self.env.action_space.sample()
        else:
            a = np.argmax(self.sess.run(self.q, {self.state_0s: s})[0])
        return a

    def get_explore_rate(self, t):
        return max(0.01, min(1, 1.0 - math.log10((t+1)/25)))

    def update(self):
        # experience is [ [s_0, a, r, s], [s_0, a, r, s], ... ]
        self.sess.run(self.update_target_op)
        indices = np.random.choice(range(len(self.experience.memory)), self.batch)
        # indices = range(len(experience))
        state_0 = [self.experience.memory[index][0] for index in indices]
        a = [self.experience.memory[index][1] for index in indices]
        rs = [self.experience.memory[index][2] for index in indices]
        state = [self.experience.memory[index][3] for index in indices]

        [self.sess.run(self.train_op, feed_dict = {self.state_0s: state_0,
            self.one_hot_actions: a, self.rewards: np.asarray(rs).reshape([-1,1]), self.states: state}) for _ in range(self.epochs)]

    def run(self):
        all_ep_r = []
        for ep in range(self.ep_max):
            s_0 = self.env.reset()
            ep_r = 0
            for t in range(self.ep_len):
                fake_ac = [0.0, 0.0] # used to make one hot actions
                # self.env.render()
                a = self.choose_action(s_0, ep)
                s, r, done, _ = self.env.step(a)
                if done:
                    s = np.zeros(np.shape(s_0))
                fake_ac[a] = 1.0
                print(fake_ac)
                self.experience.add([s_0, fake_ac, r, s])
                s_0 = s
                ep_r += r

                if done:
                    break

            all_ep_r.append(ep_r)
            print(
                'Ep: %i' % ep,
                "|Ep_r: %i" % ep_r,
            )
            if len(self.experience.memory) > self.batch -1:
                self.update()
        return all_ep_r

agent = Agent("CartPole-v0", 200, 200, 0.99, 0.00025, 32, 10, 4, 16)
all_ep_r = agent.run()
plt.plot(range(len(all_ep_r)), all_ep_r)
plt.show()

Upvotes: -1

Views: 694

Answers (1)

R.F. Nelson

Reputation: 2312

Simon's comment is right. Your loss function is incorrect because it does not take the terminal state into account.

The target, trg, should be reward + gamma * max Q(s') only when the next state s' is non-terminal.

If the next state is terminal (the pole falls and the episode is over), the target is just the reward.

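One way to fold that into your graph is sketched below; this is a sketch, not your exact code. It assumes a new dones placeholder that marks terminal transitions, zeroes out the bootstrap term for those samples, and keeps every tensor at shape [None, 1] so the reward and Q terms line up instead of broadcasting. Note it also picks out the taken action's Q-value with reduce_sum over the one-hot mask rather than reduce_max, since Q-values can be negative.

# Sketch only: assumes a new placeholder self.dones (1.0 for a terminal
# transition, 0.0 otherwise) fed alongside the existing batch.
self.dones = tf.placeholder(tf.float32, [None, 1], 'dones')

with tf.variable_scope('loss'):
    # Q-value of the action actually taken (reduce_sum picks out the one-hot
    # entry even when Q-values are negative).
    eval = tf.reduce_sum(self.q * self.one_hot_actions, axis=1, keepdims=True)
    # Bootstrap term from the target network, zeroed for terminal transitions.
    max_q_next = tf.reduce_max(self.q_targ, axis=1, keepdims=True)
    label = self.rewards + self.gamma * max_q_next * (1.0 - self.dones)
    self.loss = tf.reduce_mean(tf.square(label - eval))

In run() you would store the done flag with each transition, for example self.experience.add([s_0, fake_ac, r, s, float(done)]), and feed it in update() reshaped to [-1, 1] just like the rewards. With that mask in place you also no longer need to replace s with zeros when the episode ends.
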
Upvotes: 1
