Reputation: 5
I've been learning TensorFlow and reinforcement learning for months, and for the past few days I've been trying to solve OpenAI's CartPole with my own code, but my Deep Q-Network can't seem to solve it. I've checked and compared my code against other implementations and I can't see where I'm going wrong. Can anyone look over my implementation and point out what I'm messing up? It would mean a lot, thanks.
My code:
import gym
import numpy as np
import tensorflow as tf
import math
import keras
import random

class cartpole:
    def __init__(self, sess, env):
        self.env = env
        self.state_size = env.observation_space.shape[0]
        self.num_actions = env.action_space.n
        self.sess = sess
        self.epsilon = 1.0
        self.return_loss = 0.0
        self.memory = []
        self.gamma = .95
        self.q_model()
        init = tf.global_variables_initializer()
        self.sess.run(init)

    def q_model(self):
        self.state_input = tf.placeholder(shape=[None, self.state_size], dtype=tf.float32)
        self.reward_labels = tf.placeholder(shape=[None, 1], dtype=tf.float32)
        self.hiddenlayer1_weights = tf.Variable(tf.random_normal([self.state_size, 32]))
        self.hiddenlayer1_bias = tf.Variable(tf.random_normal([32]))
        self.hiddenlayer1_output = tf.matmul(self.state_input, self.hiddenlayer1_weights) + self.hiddenlayer1_bias
        self.hiddenlayer1_output = tf.nn.relu(self.hiddenlayer1_output)
        self.hiddenlayer2_weights = tf.Variable(tf.random_normal([32, 16]))
        self.hiddenlayer2_bias = tf.Variable(tf.random_normal([16]))
        self.hiddenlayer2_output = tf.matmul(self.hiddenlayer1_output, self.hiddenlayer2_weights) + self.hiddenlayer2_bias
        self.hiddenlayer2_output = tf.nn.relu(self.hiddenlayer2_output)
        self.q_weights = tf.Variable(tf.random_normal([16, self.num_actions]))
        self.q_bias = tf.Variable(tf.random_normal([self.num_actions]))
        self.q_output = tf.matmul(self.hiddenlayer2_output, self.q_weights) + self.q_bias
        self.q_output = keras.activations.linear(self.q_output)
        self.max_q_value = tf.reshape(tf.reduce_max(self.q_output), (1,1))
        self.best_action = tf.squeeze(tf.argmax(self.q_output, axis=1))
        self.loss = tf.losses.mean_squared_error(self.max_q_value, self.reward_labels)
        self.train_model = tf.train.AdamOptimizer(learning_rate=0.001).minimize(self.loss)

    def predict_action(self, state):
        self.epsilon *= .995 + .01
        if (np.random.random() < self.epsilon):
            action = env.action_space.sample()
        else:
            action = self.sess.run(self.best_action, feed_dict={self.state_input: state})
        return action

    def predict_value(self, state):
        state = np.array(state).reshape((1, 4))
        max_q_value = self.sess.run(self.max_q_value, feed_dict={self.state_input: state})[0][0]
        return max_q_value

    def train_q_model(self, state, reward):
        q_values, _, loss = self.sess.run([self.q_output, self.train_model, self.loss], feed_dict={self.state_input: state, self.reward_labels: reward})
        self.return_loss = loss

    def get_loss(self):
        return self.return_loss

    def experience_replay(self):
        if len(self.memory) < 33:
            return
        del self.memory[0]
        batch = random.sample(self.memory, 32)
        for state, action, reward, new_state, done in self.memory:
            reward = reward if not done else - reward
            new_state = np.array(new_state).reshape((1, 4))
            if not done:
                reward = reward + (self.gamma * self.predict_value(new_state))
            reward = np.array(reward).reshape((1, 1))
            self.train_q_model(state, reward)

env = gym.make("CartPole-v0")
sess = tf.Session()
A2C = cartpole(sess, env)
episodes = 2000
reward_history = []
for i in range(episodes):
    state = env.reset()
    reward_total = 0
    while True:
        state = np.array(state).reshape((1, 4))
        average_best_reward = sum(reward_history[-100:]) / 100.0
        if (average_best_reward) > 195:
            env.render()
        action = A2C.predict_action(state)
        new_state, reward, done, _ = env.step(action)
        reward_total += reward
        A2C.memory.append([state, action, reward, new_state, done])
        A2C.experience_replay()
        state = new_state
        if done:
            if (average_best_reward >= 195):
                print("Finished! Episodes taken: ", i, "average reward: ", average_best_reward)
            print("average reward = ", average_best_reward, "reward total = ", reward_total, "loss = ", A2C.get_loss())
            reward_history.append(reward_total)
            break
Upvotes: 0
Views: 911
Reputation: 549
In addition to the correct points already mentioned: you are importing both TensorFlow and Keras, yet the only place you actually use Keras is keras.activations.linear. Everything else is done with TensorFlow directly.
Keras is a framework built on top of TensorFlow that simplifies the use of the underlying TensorFlow libraries. I suggest you read up on the differences between them and try to implement the above code using Keras only. Since you already know how to do this in plain TF, you will appreciate the simplicity Keras brings.
A simple starting point would be this CartPole agent from the OpenAI Leaderboard: https://gym.openai.com/evaluations/eval_GazXePIETsOvUaxmoILNHw/
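To give you an idea, here is a minimal sketch of how the same network could look in Keras alone (layer sizes copied from your TF graph; build_q_model is just an illustrative name, and you would still need your replay/training loop around it):

    from keras.models import Sequential
    from keras.layers import Dense
    from keras.optimizers import Adam

    def build_q_model(state_size, num_actions):
        # same architecture as your TF graph: state_size -> 32 -> 16 -> num_actions
        model = Sequential()
        model.add(Dense(32, input_dim=state_size, activation='relu'))
        model.add(Dense(16, activation='relu'))
        model.add(Dense(num_actions, activation='linear'))
        # MSE against the TD targets, Adam with the same learning rate you use
        model.compile(loss='mse', optimizer=Adam(lr=0.001))
        return model

    # usage sketch:
    # model = build_q_model(4, 2)
    # q_values = model.predict(state)       # replaces sess.run(self.q_output, ...)
    # model.fit(state, targets, verbose=0)  # replaces sess.run(self.train_model, ...)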
Upvotes: 0
Reputation: 697
Your initial epsilon is set to 1 (self.epsilon = 1.0). And yet, when you perform an action, instead of decaying it you increase it:
self.epsilon *= .995 + .01
With *=, the right-hand side is evaluated first, so this is epsilon = epsilon * (0.995 + 0.01) = 1.0 * 1.005 = 1.005. Because epsilon only ever grows, the agent never uses your trained network and instead keeps performing random actions.
The exploration factor (epsilon) should be decayed:
self.epsilon *= .995
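To make the difference concrete, here is a quick throwaway comparison of the two update rules (rough numbers):

    # illustration only: how epsilon evolves under the buggy rule vs. a plain decay
    eps_buggy, eps_fixed = 1.0, 1.0
    for _ in range(200):
        eps_buggy *= .995 + .01   # same as multiplying by 1.005, so it grows
        eps_fixed *= .995         # shrinks toward 0
    print(eps_buggy)  # roughly 2.71 -> the random branch is taken every time
    print(eps_fixed)  # roughly 0.37 -> the trained network gets used more and more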
Also, updating the exploration factor right before return action would be optimal, since you want it to use the initial epsilon for the first action as well.
def predict_action(self, state):
    if (np.random.random() < self.epsilon):
        action = env.action_space.sample()
    else:
        action = self.sess.run(self.best_action, feed_dict={self.state_input: state})
    self.epsilon *= .995
    return action
You can also change if len(self.memory) < 33: to if len(self.memory) < 32:, assuming you want 32 to be your batch size.
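For what it's worth, a minimal sketch of how that guard and the sampled batch could fit together (it reuses your method names; note it loops over batch, whereas your current code samples batch but then iterates the whole memory):

    def experience_replay(self):
        if len(self.memory) < 32:   # wait until a full batch is available
            return
        batch = random.sample(self.memory, 32)
        for state, action, reward, new_state, done in batch:
            reward = reward if not done else -reward
            new_state = np.array(new_state).reshape((1, 4))
            if not done:
                reward = reward + (self.gamma * self.predict_value(new_state))
            reward = np.array(reward).reshape((1, 1))
            self.train_q_model(state, reward)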
Upvotes: 1