Reputation: 253
I am trying to implement a DQN algorithm that trains an agent to play Breakout from the OpenAI Gym Atari environment by feeding it the RAM state of the game at each time step as input. I used the code from jaara's AI-blog repository (https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py#L102) and made some changes to it. Here is the code:
import random, numpy, math, gym
from SumTree import SumTree
import tensorflow as tf
import numpy as np
from tensorflow.keras import backend as K
import scipy.misc

# -----------------HYPER PARAMETERS--------------
# IMAGE_WIDTH = 84
# IMAGE_HEIGHT = 84
RAM_SIZE = 128
IMAGE_STACK = 2
HUBER_LOSS_DELTA = 2.0
LEARNING_RATE = 0.00025

MEMORY_CAPACITY = 200000
BATCH_SIZE = 32
GAMMA = 0.99
MAX_EPSILON = 1
MIN_EPSILON = 0.1
EXPLORATION_STOP = 500000  # at this step the decaying part of epsilon has shrunk to 1% of (MAX_EPSILON - MIN_EPSILON)
LAMBDA = - math.log(0.01) / EXPLORATION_STOP  # speed of decay
UPDATE_TARGET_FREQUENCY = 10000

#-------------------- UTILITIES -----------------------
def huber_loss(y_true, y_pred):
    err = y_true - y_pred

    cond = K.abs(err) < HUBER_LOSS_DELTA
    L2 = 0.5 * K.square(err)
    L1 = HUBER_LOSS_DELTA * (K.abs(err) - 0.5 * HUBER_LOSS_DELTA)

    loss = tf.where(cond, L2, L1)  # Keras does not cover the where function in tensorflow :-(

    return K.mean(loss)

# def processImage( ram ):
#     rgb = scipy.misc.imresize(ram, (IMAGE_WIDTH, IMAGE_HEIGHT), interp='bilinear')
#
#     r, g, b = rgb[:,:,0], rgb[:,:,1], rgb[:,:,2]
#     gray = 0.2989 * r + 0.5870 * g + 0.1140 * b  # extract luminance
#
#     o = gray.astype('float32') / 128 - 1  # normalize
#     return o

def save_model(agent, problem, algorithm_name=None):
    file_name = ("saved_models\\"
                 + problem +
                 "-" + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M"))
    if algorithm_name:
        file_name += "-" + algorithm_name + ".h5"
    else:
        file_name += ".h5"
    agent.brain.model.save(file_name)
#-------------------- BRAIN ---------------------------
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *

class Brain:
    def __init__(self, stateCnt, actionCnt, load_file=None):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt
        self.history = None

        self.model = self._createModel()
        self.model_ = self._createModel()  # target network
        if load_file:
            self.model.load_weights(load_file)
            self.model_.load_weights(load_file)  # also load the saved weights into the target network

    def _createModel(self):
        model = Sequential()
        model.add(Dense(units=128, activation="relu", input_dim=self.stateCnt))
        model.add(Dense(units=self.actionCnt, activation='linear'))

        opt = RMSprop(lr=LEARNING_RATE)
        model.compile(loss=huber_loss, optimizer=opt)

        return model

    def train(self, x, y, epochs=1, verbose=0):
        self.history = self.model.fit(x, y, batch_size=32, epochs=epochs, verbose=verbose)
        # print(history.history["val_loss"])

    def predict(self, s, target=False):
        if target:
            return self.model_.predict(s)
        else:
            return self.model.predict(s)

    def predictOne(self, s, target=False):
        return self.predict(s.reshape(1, IMAGE_STACK*RAM_SIZE), target).flatten()

    def updateTargetModel(self):
        self.model_.set_weights(self.model.get_weights())
#-------------------- MEMORY --------------------------
class Memory:  # stored as ( s, a, r, s_ ) in SumTree
    e = 0.01  # epsilon, keeps the priority above 0 even when the error is 0
    a = 0.6   # alpha, the degree of prioritization, with 0 meaning no prioritization at all

    def __init__(self, capacity):
        self.tree = SumTree(capacity)

    def _getPriority(self, error):
        return (error + self.e) ** self.a

    def add(self, error, sample):
        p = self._getPriority(error)
        self.tree.add(p, sample)

    def sample(self, n):
        batch = []
        segment = self.tree.total() / n

        for i in range(n):
            a = segment * i
            b = segment * (i + 1)

            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get(s)
            batch.append((idx, data))

        return batch

    def update(self, idx, error):
        """
        Update the priority value of a given entry
        :param idx: The tree index of the entry
        :param error: The new error value for that entry
        :return: None
        """
        p = self._getPriority(error)
        self.tree.update(idx, p)
#-------------------- AGENT ---------------------------
class Agent:
    steps = 0
    epsilon = MAX_EPSILON

    def __init__(self, stateCnt, actionCnt, file=None):
        """
        Initialize an agent, specifying the size of the states and the number of actions
        :param int stateCnt: the size of a state, i.e. RAM_SIZE * IMAGE_STACK
        :param actionCnt: the number of actions this agent can take
        :param file: the model file (e.g. a .h5 file) to load into the agent's brain
        """
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt

        self.brain = Brain(stateCnt, actionCnt, file)
        self.memory = Memory(MEMORY_CAPACITY)

    def act(self, s):
        """
        Choose an action epsilon-greedily for the current state
        :param numpyArray s: the current state
        :return: int: the chosen action
        """
        if random.random() < self.epsilon:
            return random.randint(0, self.actionCnt-1)
        else:
            return numpy.argmax(self.brain.predictOne(s))

    def observe(self, sample):  # in (s, a, r, s_) format
        """
        Add a sample to the agent's memory
        :param tuple sample: the (s, a, r, s_) sample to be added. s and s_ are arrays of size IMAGE_STACK*RAM_SIZE
        :return: None
        """
        x, y, errors = self._getTargets([(0, sample)])
        self.memory.add(errors[0], sample)

        if self.steps % UPDATE_TARGET_FREQUENCY == 0:
            self.brain.updateTargetModel()

        # slowly decrease epsilon based on our experience
        self.steps += 1
        self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * self.steps)

    def _getTargets(self, batch):
        """
        Get the training inputs, targets and TD errors for a given batch
        :param list batch: a list of (idx, (s, a, r, s_)) entries
        :return: tuple (x, y, errors):
            x: array of states
            y: array of target Q values, where the chosen action's entry is r + gamma * Q_target(s_, argmax_a Q(s_, a))
            errors: array of absolute TD errors between the old and updated Q(s, a) values
        """
        no_state = numpy.zeros(self.stateCnt)

        states = numpy.array([ sample[1][0] for sample in batch ])
        states_ = numpy.array([ (no_state if sample[1][3] is None else sample[1][3]) for sample in batch ])

        p = self.brain.predict(states)                        # Q values of the current states (online network)
        p_ = self.brain.predict(states_, target=False)        # Q values of the next states (online network, selects the action)
        pTarget_ = self.brain.predict(states_, target=True)   # Q values of the next states (target network, evaluates the action)

        x = numpy.zeros((len(batch), IMAGE_STACK*RAM_SIZE))
        y = numpy.zeros((len(batch), self.actionCnt))
        errors = numpy.zeros(len(batch))

        for i in range(len(batch)):
            sample = batch[i][1]  # batch[i][0] is the tree index, batch[i][1] is the actual sample
            s = sample[0]; a = sample[1]; r = sample[2]; s_ = sample[3]

            target = p[i]  # target Q values for the i-th state
            oldVal = target[a]
            if s_ is None:
                target[a] = r
            else:
                target[a] = r + GAMMA * pTarget_[i][ numpy.argmax(p_[i]) ]  # double DQN

            x[i] = s
            y[i] = target
            errors[i] = abs(oldVal - target[a])

        return (x, y, errors)

    def replay(self):
        """
        Sample a batch from the agent's memory, compute x and y, and train the brain on it.
        Also update the priorities of the sampled entries.
        :return: None
        """
        batch = self.memory.sample(BATCH_SIZE)
        x, y, errors = self._getTargets(batch)

        # update errors
        for i in range(len(batch)):
            idx = batch[i][0]
            self.memory.update(idx, errors[i])

        self.brain.train(x, y)
class RandomAgent:
    memory = Memory(MEMORY_CAPACITY)
    exp = 0

    def __init__(self, actionCnt):
        self.actionCnt = actionCnt

    def act(self, s):
        return random.randint(0, self.actionCnt-1)

    def observe(self, sample):  # in (s, a, r, s_) format
        """
        Add a sample to its memory
        :param tuple sample: the (s, a, r, s_) sample to be added
        :return: None
        """
        error = abs(sample[2])  # reward
        self.memory.add(error, sample)
        self.exp += 1

    def replay(self):
        pass
#-------------------- ENVIRONMENT ---------------------
class Environment:
    def __init__(self, problem):
        self.problem = problem
        self.env = gym.make(problem)
        self.frames = 0
        self.episodes = 0
        self.R_40epi = 0

    def run(self, agent):
        ram = self.env.reset()
        # w = processImage(ram)
        s = numpy.concatenate((ram, numpy.zeros(RAM_SIZE*(IMAGE_STACK-1))))
        R = 0
        last_action = 0

        while True:
            self.env.render()
            self.frames += 1

            # Frame skipping
            # if self.frames % IMAGE_STACK == 0:
            a = agent.act(s)
            # last_action = a
            # else:
            #     a = last_action

            r = 0
            ram, r, done, info = self.env.step(a)
            s_ = numpy.concatenate((s[RAM_SIZE:RAM_SIZE*IMAGE_STACK], ram))  # drop the oldest RAM snapshot, append the newest one
            r = np.clip(r, -1, 1)  # clip reward to [-1, 1]

            if done:  # terminal state
                s_ = None

            agent.observe( (s, a, r, s_) )
            agent.replay()

            s = s_
            R += r

            if done:
                self.R_40epi += R
                break

        info = ("Total reward: " + str(R) + " " +
                "Episode: " + str(self.episodes) + " " +
                "Frames: " + str(self.frames) + " " +
                datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))
        if type(agent) is not RandomAgent and agent.brain.history is not None:
            info = (info + " loss: " + str(agent.brain.history.history["loss"]))
        print(info)

        if self.episodes % 40 == 0:
            print("average in last 40 episodes:", self.R_40epi/40)
            self.R_40epi = 0
        self.episodes += 1

        # save roughly once per hour (whenever an episode ends while the minute reads 00)
        if datetime.datetime.now().strftime("%M") == "00" and type(agent) is not RandomAgent:
            save_model(agent, self.problem, "ddqn-ram")
#-------------------- MAIN ----------------------------
import datetime
import sys

PROBLEM = 'Breakout-ram-v0'
env = Environment(PROBLEM)
# file = "saved_models\Breakout-ram-v0-2018-08-17-16-46-ddqn-ram.h5"

stateCnt = IMAGE_STACK*RAM_SIZE
actionCnt = env.env.action_space.n

agent = Agent(stateCnt, actionCnt)
randomAgent = RandomAgent(actionCnt)

try:
    print("Initialization with random agent...")
    while randomAgent.exp < MEMORY_CAPACITY:
        env.run(randomAgent)
        print(randomAgent.exp, "/", MEMORY_CAPACITY)

    agent.memory = randomAgent.memory
    randomAgent = None

    print("Starting learning")
    env.frames = 0
    env.episodes = 0
    # S = env.env.step(env.env.action_space.sample)[0]
    while True:
        env.run(agent)
finally:
    save_model(agent, PROBLEM, "ddqn-ram-single128")
The problem I encountered is that when I train the agent with this code, the average reward per episode increases at first, but once it reaches around 3 to 4 (at roughly 1 million time steps) it begins to decrease and then stabilizes at 1, never increasing again no matter how much longer I train (most algorithms reach a reward of 60 to 100). The differences between the original code and my modified version are that I use the RAM state of the game instead of screen images, I use only a single dense hidden layer with 128 nodes, and I am playing Breakout instead of Seaquest, which is what the original code plays. The code also has double DQN, reward clipping and prioritized experience replay implemented. What could be the cause of the problem? Does reading RAM instead of game frames cause it?
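For context on what the network actually sees, here is a minimal sketch of the raw RAM observation (this assumes the classic Gym API used in the code above, where reset() returns the observation directly):

import gym

env = gym.make('Breakout-ram-v0')
ram = env.reset()                      # snapshot of the Atari 2600 RAM
print(ram.shape, ram.dtype)            # (128,) uint8 -> raw byte values in [0, 255]
ram, reward, done, info = env.step(env.action_space.sample())

My code above feeds these bytes into the network as-is, stacking the last IMAGE_STACK snapshots into one state vector.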
For reference, this is the implementation of the "Sum Tree" data structure I used:

import numpy

class SumTree:
    def __init__(self, capacity):
        """
        Initialize a sum tree structure
        :param capacity: the number of values the tree can store
        """
        self.capacity = capacity
        self.tree = numpy.zeros( 2*capacity - 1 )          # the numpy array representing the actual tree
        self.data = numpy.zeros( capacity, dtype=object )  # the array representing the data (leaves) of the tree
        self.write = 0

    def _propagate(self, idx, change):
        parent = (idx - 1) // 2

        self.tree[parent] += change

        if parent != 0:
            self._propagate(parent, change)

    def _retrieve(self, idx, s):
        left = 2 * idx + 1
        right = left + 1

        if left >= len(self.tree):
            return idx

        if s <= self.tree[left]:
            return self._retrieve(left, s)
        else:
            return self._retrieve(right, s-self.tree[left])

    def total(self):
        return self.tree[0]

    def add(self, p, data):
        idx = self.write + self.capacity - 1

        self.data[self.write] = data
        self.update(idx, p)

        self.write += 1
        if self.write >= self.capacity:
            self.write = 0

    def update(self, idx, p):
        change = p - self.tree[idx]

        self.tree[idx] = p
        self._propagate(idx, change)

    def get(self, s):
        idx = self._retrieve(0, s)
        dataIdx = idx - self.capacity + 1

        return (idx, self.tree[idx], self.data[dataIdx])
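To show how the Memory class drives this tree, here is a small usage sketch (the priorities and samples are made-up values, just to illustrate add/get/update/total):

tree = SumTree(capacity=4)
tree.add(1.0, "sample A")             # leaf priorities: A = 1.0
tree.add(3.0, "sample B")             # leaf priorities: A = 1.0, B = 3.0
print(tree.total())                   # 4.0, the sum of all priorities
idx, p, data = tree.get(2.5)          # any value in (1.0, 4.0] lands on "sample B"
tree.update(idx, 0.5)                 # lower that leaf's priority after it has been replayed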
Upvotes: 5
Views: 7050
Reputation: 2985
The main reason, in my opinion, is two-fold:
Your algorithm uses prioritized replay. Prioritized replay gives transitions with higher temporal-difference errors a higher probability of being selected, because a high error means the agent was not able to predict the correct Q-values for those states, so by picking those states more often your model trains to do better on them. The problem is that these states are only a subset of your whole state space, so your model becomes biased towards this subset and performs poorly on the remainder of the state space. This becomes more of a problem the longer you train, because only a small set of states will still have very large errors. To avoid this, you can anneal out the prioritization. Please see the original paper here: https://arxiv.org/abs/1511.05952
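One simple way to do this with the Memory class from your question is to decay the priority exponent a towards 0, so that sampling gradually approaches uniform replay. The schedule below is only illustrative (the constants are made up, and the paper itself instead anneals the importance-sampling exponent beta towards 1):

# decay the priority exponent `a` of the Memory class linearly towards 0
PRIORITY_ALPHA_START = 0.6
PRIORITY_ALPHA_END = 0.0
PRIORITY_ANNEAL_STEPS = 2000000       # made-up horizon, tune to your run length

def annealed_alpha(step):
    frac = min(1.0, step / PRIORITY_ANNEAL_STEPS)
    return PRIORITY_ALPHA_START + frac * (PRIORITY_ALPHA_END - PRIORITY_ALPHA_START)

# e.g. once per step inside Agent.observe(), before new priorities are computed:
#     self.memory.a = annealed_alpha(self.steps)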
You may also want to anneal your learning rate towards zero, or increase the batch size as training goes on; the two are apparently equivalent according to a paper published earlier this year by Google: https://openreview.net/forum?id=B1Yy1BxCZ. This lets the effective learning rate slowly approach 0, essentially stopping training after a while. If you never lower the learning rate, an unlucky batch of bad data can ruin the weights of your neural network.
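For example, with the Keras model from your question you could lower the optimizer's learning rate manually; the schedule and the place it is called from are just an illustration:

from tensorflow.keras import backend as K

LR_START = 0.00025
LR_END = 0.000025
LR_ANNEAL_STEPS = 5000000             # made-up horizon, tune to your run length

def annealed_lr(step):
    frac = min(1.0, step / LR_ANNEAL_STEPS)
    return LR_START + frac * (LR_END - LR_START)

# e.g. every few thousand steps, on the online network:
#     K.set_value(agent.brain.model.optimizer.lr, annealed_lr(agent.steps))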
Upvotes: 15