Reputation: 253
I am trying to implement a DQN algorithm that trains an agent to play Breakout from the OpenAI Gym Atari environment by feeding it the RAM state of the game at each time step as input. I used the code from jaara's AI-blog repository (https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py#L102) and made some changes to it. Here is the code:
import random, numpy, math, gym
from SumTree import SumTree
import tensorflow as tf
import numpy as np
from tensorflow.keras import backend as K
import scipy.misc

# -----------------HYPER PARAMETERS--------------
# IMAGE_WIDTH = 84
# IMAGE_HEIGHT = 84
RAM_SIZE = 128
IMAGE_STACK = 2
HUBER_LOSS_DELTA = 2.0
LEARNING_RATE = 0.00025

MEMORY_CAPACITY = 200000
BATCH_SIZE = 32
GAMMA = 0.99
MAX_EPSILON = 1
MIN_EPSILON = 0.1
EXPLORATION_STOP = 500000  # at this step the decaying part of epsilon has shrunk to 1% of (MAX_EPSILON - MIN_EPSILON)
LAMBDA = - math.log(0.01) / EXPLORATION_STOP  # speed of decay
UPDATE_TARGET_FREQUENCY = 10000

#-------------------- UTILITIES -----------------------
def huber_loss(y_true, y_pred):
    err = y_true - y_pred

    cond = K.abs(err) < HUBER_LOSS_DELTA
    L2 = 0.5 * K.square(err)
    L1 = HUBER_LOSS_DELTA * (K.abs(err) - 0.5 * HUBER_LOSS_DELTA)

    loss = tf.where(cond, L2, L1)  # Keras does not cover the where function in tensorflow :-(

    return K.mean(loss)

# def processImage( ram ):
#     rgb = scipy.misc.imresize(ram, (IMAGE_WIDTH, IMAGE_HEIGHT), interp='bilinear')
#
#     r, g, b = rgb[:,:,0], rgb[:,:,1], rgb[:,:,2]
#     gray = 0.2989 * r + 0.5870 * g + 0.1140 * b  # extract luminance
#
#     o = gray.astype('float32') / 128 - 1  # normalize
#     return o

def save_model(agent, problem, algorithm_name=None):
    file_name = ("saved_models\\"
                 + problem +
                 "-" + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M"))
    if algorithm_name:
        file_name += "-" + algorithm_name + ".h5"
    else:
        file_name += ".h5"
    agent.brain.model.save(file_name)
#-------------------- BRAIN ---------------------------
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *

class Brain:
    def __init__(self, stateCnt, actionCnt, load_file=None):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt
        self.history = None

        self.model = self._createModel()
        self.model_ = self._createModel()  # target network
        if load_file:
            self.model.load_weights(load_file)
            self.model_.load_weights(load_file)  # also load the saved weights into the target network

    def _createModel(self):
        model = Sequential()
        model.add(Dense(units=128, activation="relu", input_dim=self.stateCnt))
        model.add(Dense(units=self.actionCnt, activation='linear'))

        opt = RMSprop(lr=LEARNING_RATE)
        model.compile(loss=huber_loss, optimizer=opt)

        return model

    def train(self, x, y, epochs=1, verbose=0):
        self.history = self.model.fit(x, y, batch_size=32, epochs=epochs, verbose=verbose)
        # print(history.history["val_loss"])

    def predict(self, s, target=False):
        if target:
            return self.model_.predict(s)
        else:
            return self.model.predict(s)

    def predictOne(self, s, target=False):
        return self.predict(s.reshape(1, IMAGE_STACK*RAM_SIZE), target).flatten()

    def updateTargetModel(self):
        self.model_.set_weights(self.model.get_weights())
#-------------------- MEMORY --------------------------
class Memory:  # stored as ( s, a, r, s_ ) in SumTree
    e = 0.01  # epsilon, keeps the priority above 0 even when the error is 0
    a = 0.6   # alpha, the degree of prioritization, with 0 meaning no prioritization at all

    def __init__(self, capacity):
        self.tree = SumTree(capacity)

    def _getPriority(self, error):
        return (error + self.e) ** self.a

    def add(self, error, sample):
        p = self._getPriority(error)
        self.tree.add(p, sample)

    def sample(self, n):
        batch = []
        segment = self.tree.total() / n

        for i in range(n):
            a = segment * i
            b = segment * (i + 1)

            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get(s)
            batch.append((idx, data))

        return batch

    def update(self, idx, error):
        """
        Update the priority value of a given entry
        :param idx: The tree index of the entry
        :param error: The new error value for that entry
        :return: None
        """
        p = self._getPriority(error)
        self.tree.update(idx, p)
#-------------------- AGENT ---------------------------
class Agent:
    steps = 0
    epsilon = MAX_EPSILON

    def __init__(self, stateCnt, actionCnt, file=None):
        """
        Initialize an agent, specifying the size of the states and the number of actions
        :param int stateCnt: the size of a state, i.e. RAM_SIZE * IMAGE_STACK
        :param actionCnt: the number of actions this agent can take
        :param file: the model file (e.g. a .h5 file) to load into the agent's brain
        """
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt

        self.brain = Brain(stateCnt, actionCnt, file)
        self.memory = Memory(MEMORY_CAPACITY)

    def act(self, s):
        """
        Choose an action epsilon-greedily for the current state
        :param numpyArray s: the current state
        :return: int: the chosen action
        """
        if random.random() < self.epsilon:
            return random.randint(0, self.actionCnt-1)
        else:
            return numpy.argmax(self.brain.predictOne(s))

    def observe(self, sample):  # in (s, a, r, s_) format
        """
        Add a sample to the agent's memory
        :param tuple sample: the (s, a, r, s_) sample to be added. s and s_ are arrays of size IMAGE_STACK*RAM_SIZE
        :return: None
        """
        x, y, errors = self._getTargets([(0, sample)])
        self.memory.add(errors[0], sample)

        if self.steps % UPDATE_TARGET_FREQUENCY == 0:
            self.brain.updateTargetModel()

        # slowly decrease epsilon based on our experience
        self.steps += 1
        self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * self.steps)

    def _getTargets(self, batch):
        """
        Get the training inputs, targets and TD errors for a given batch
        :param list batch: a list of (idx, (s, a, r, s_)) entries
        :return: tuple (x, y, errors):
            x: array of states
            y: array of target Q values, where the chosen action's entry is r + gamma * Q_target(s_, argmax_a Q(s_, a))
            errors: array of absolute TD errors between the old and updated Q(s, a) values
        """
        no_state = numpy.zeros(self.stateCnt)

        states = numpy.array([ sample[1][0] for sample in batch ])
        states_ = numpy.array([ (no_state if sample[1][3] is None else sample[1][3]) for sample in batch ])

        p = self.brain.predict(states)                        # Q values of the current states (online network)
        p_ = self.brain.predict(states_, target=False)        # Q values of the next states (online network, selects the action)
        pTarget_ = self.brain.predict(states_, target=True)   # Q values of the next states (target network, evaluates the action)

        x = numpy.zeros((len(batch), IMAGE_STACK*RAM_SIZE))
        y = numpy.zeros((len(batch), self.actionCnt))
        errors = numpy.zeros(len(batch))

        for i in range(len(batch)):
            sample = batch[i][1]  # batch[i][0] is the tree index, batch[i][1] is the actual sample
            s = sample[0]; a = sample[1]; r = sample[2]; s_ = sample[3]

            target = p[i]  # target Q values for the i-th state
            oldVal = target[a]
            if s_ is None:
                target[a] = r
            else:
                target[a] = r + GAMMA * pTarget_[i][ numpy.argmax(p_[i]) ]  # double DQN

            x[i] = s
            y[i] = target
            errors[i] = abs(oldVal - target[a])

        return (x, y, errors)

    def replay(self):
        """
        Sample a batch from the agent's memory, compute x and y, and train the brain on it.
        Also update the priorities of the sampled entries.
        :return: None
        """
        batch = self.memory.sample(BATCH_SIZE)
        x, y, errors = self._getTargets(batch)

        # update errors
        for i in range(len(batch)):
            idx = batch[i][0]
            self.memory.update(idx, errors[i])

        self.brain.train(x, y)
class RandomAgent:
    memory = Memory(MEMORY_CAPACITY)
    exp = 0

    def __init__(self, actionCnt):
        self.actionCnt = actionCnt

    def act(self, s):
        return random.randint(0, self.actionCnt-1)

    def observe(self, sample):  # in (s, a, r, s_) format
        """
        Add a sample to its memory
        :param tuple sample: the (s, a, r, s_) sample to be added
        :return: None
        """
        error = abs(sample[2])  # reward
        self.memory.add(error, sample)
        self.exp += 1

    def replay(self):
        pass
#-------------------- ENVIRONMENT ---------------------
class Environment:
    def __init__(self, problem):
        self.problem = problem
        self.env = gym.make(problem)
        self.frames = 0
        self.episodes = 0
        self.R_40epi = 0

    def run(self, agent):
        ram = self.env.reset()
        # w = processImage(ram)
        s = numpy.concatenate((ram, numpy.zeros(RAM_SIZE*(IMAGE_STACK-1))))
        R = 0
        last_action = 0

        while True:
            self.env.render()
            self.frames += 1

            # Frame skipping
            # if self.frames % IMAGE_STACK == 0:
            a = agent.act(s)
            # last_action = a
            # else:
            #     a = last_action

            r = 0
            ram, r, done, info = self.env.step(a)
            s_ = numpy.concatenate((s[RAM_SIZE:RAM_SIZE*IMAGE_STACK], ram))  # drop the oldest RAM snapshot, append the newest one
            r = np.clip(r, -1, 1)  # clip reward to [-1, 1]

            if done:  # terminal state
                s_ = None

            agent.observe( (s, a, r, s_) )
            agent.replay()

            s = s_
            R += r

            if done:
                self.R_40epi += R
                break

        info = ("Total reward: " + str(R) + " " +
                "Episode: " + str(self.episodes) + " " +
                "Frames: " + str(self.frames) + " " +
                datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))
        if type(agent) is not RandomAgent and agent.brain.history is not None:
            info = (info + " loss: " + str(agent.brain.history.history["loss"]))
        print(info)

        if self.episodes % 40 == 0:
            print("average in last 40 episodes:", self.R_40epi/40)
            self.R_40epi = 0
        self.episodes += 1

        # save roughly once per hour (whenever an episode ends while the minute reads 00)
        if datetime.datetime.now().strftime("%M") == "00" and type(agent) is not RandomAgent:
            save_model(agent, self.problem, "ddqn-ram")
#-------------------- MAIN ----------------------------
import datetime
import sys

PROBLEM = 'Breakout-ram-v0'
env = Environment(PROBLEM)
# file = "saved_models\Breakout-ram-v0-2018-08-17-16-46-ddqn-ram.h5"

stateCnt = IMAGE_STACK*RAM_SIZE
actionCnt = env.env.action_space.n

agent = Agent(stateCnt, actionCnt)
randomAgent = RandomAgent(actionCnt)

try:
    print("Initialization with random agent...")
    while randomAgent.exp < MEMORY_CAPACITY:
        env.run(randomAgent)
        print(randomAgent.exp, "/", MEMORY_CAPACITY)

    agent.memory = randomAgent.memory
    randomAgent = None

    print("Starting learning")
    env.frames = 0
    env.episodes = 0
    # S = env.env.step(env.env.action_space.sample)[0]
    while True:
        env.run(agent)
finally:
    save_model(agent, PROBLEM, "ddqn-ram-single128")
The problem I encountered is that when I train the agent with this code, the average reward per episode increases at first, but once it reaches around 3 to 4 (at roughly 1 million time steps) it begins to decrease and then stabilizes at 1, never increasing again no matter how much longer I train (most algorithms reach a reward of 60 to 100). The differences between the original code and my modified version are that I use the RAM state of the game instead of screen images, I use only a single dense hidden layer with 128 nodes, and I am playing Breakout instead of Seaquest, which is what the original code plays. The code also has double DQN, reward clipping and prioritized experience replay implemented. What could be the cause of the problem? Does reading RAM instead of game frames cause it?
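For context on what the network actually sees, here is a minimal sketch of the raw RAM observation (this assumes the classic Gym API used in the code above, where reset() returns the observation directly):

import gym

env = gym.make('Breakout-ram-v0')
ram = env.reset()                      # snapshot of the Atari 2600 RAM
print(ram.shape, ram.dtype)            # (128,) uint8 -> raw byte values in [0, 255]
ram, reward, done, info = env.step(env.action_space.sample())

My code above feeds these bytes into the network as-is, stacking the last IMAGE_STACK snapshots into one state vector.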
For reference, this is the implementation of the "Sum Tree" data structure I used:

import numpy

class SumTree:
    def __init__(self, capacity):
        """
        Initialize a sum tree structure
        :param capacity: the number of values the tree can store
        """
        self.capacity = capacity
        self.tree = numpy.zeros( 2*capacity - 1 )          # the numpy array representing the actual tree
        self.data = numpy.zeros( capacity, dtype=object )  # the array representing the data (leaves) of the tree
        self.write = 0

    def _propagate(self, idx, change):
        parent = (idx - 1) // 2

        self.tree[parent] += change

        if parent != 0:
            self._propagate(parent, change)

    def _retrieve(self, idx, s):
        left = 2 * idx + 1
        right = left + 1

        if left >= len(self.tree):
            return idx

        if s <= self.tree[left]:
            return self._retrieve(left, s)
        else:
            return self._retrieve(right, s-self.tree[left])

    def total(self):
        return self.tree[0]

    def add(self, p, data):
        idx = self.write + self.capacity - 1

        self.data[self.write] = data
        self.update(idx, p)

        self.write += 1
        if self.write >= self.capacity:
            self.write = 0

    def update(self, idx, p):
        change = p - self.tree[idx]

        self.tree[idx] = p
        self._propagate(idx, change)

    def get(self, s):
        idx = self._retrieve(0, s)
        dataIdx = idx - self.capacity + 1

        return (idx, self.tree[idx], self.data[dataIdx])
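To show how the Memory class drives this tree, here is a small usage sketch (the priorities and samples are made-up values, just to illustrate add/get/update/total):

tree = SumTree(capacity=4)
tree.add(1.0, "sample A")             # leaf priorities: A = 1.0
tree.add(3.0, "sample B")             # leaf priorities: A = 1.0, B = 3.0
print(tree.total())                   # 4.0, the sum of all priorities
idx, p, data = tree.get(2.5)          # any value in (1.0, 4.0] lands on "sample B"
tree.update(idx, 0.5)                 # lower that leaf's priority after it has been replayed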
Upvotes: 5
Views: 7050
Reputation: 2985
The main reason, in my opinion, is two-fold:
Your algorithm uses prioritized replay. Prioritized replay gives transitions with higher temporal-difference errors a higher probability of being selected, because a high error means the agent was not able to predict the correct Q-values for those states, so by picking those states more often your model trains to do better on them. The problem is that these states are only a subset of your whole state space, so your model becomes biased towards this subset and performs poorly on the remainder of the state space. This becomes more of a problem the longer you train, because only a small set of states will still have very large errors. To avoid this, you can anneal out the prioritization. Please see the original paper here: https://arxiv.org/abs/1511.05952
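One simple way to do this with the Memory class from your question is to decay the priority exponent a towards 0, so that sampling gradually approaches uniform replay. The schedule below is only illustrative (the constants are made up, and the paper itself instead anneals the importance-sampling exponent beta towards 1):

# decay the priority exponent `a` of the Memory class linearly towards 0
PRIORITY_ALPHA_START = 0.6
PRIORITY_ALPHA_END = 0.0
PRIORITY_ANNEAL_STEPS = 2000000       # made-up horizon, tune to your run length

def annealed_alpha(step):
    frac = min(1.0, step / PRIORITY_ANNEAL_STEPS)
    return PRIORITY_ALPHA_START + frac * (PRIORITY_ALPHA_END - PRIORITY_ALPHA_START)

# e.g. once per step inside Agent.observe(), before new priorities are computed:
#     self.memory.a = annealed_alpha(self.steps)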
You may also want to anneal your learning rate towards zero, or increase the batch size as training goes on; the two are apparently equivalent according to a paper published earlier this year by Google: https://openreview.net/forum?id=B1Yy1BxCZ. This lets the effective learning rate slowly approach 0, essentially stopping training after a while. If you never lower the learning rate, an unlucky batch of bad data can ruin the weights of your neural network.
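For example, with the Keras model from your question you could lower the optimizer's learning rate manually; the schedule and the place it is called from are just an illustration:

from tensorflow.keras import backend as K

LR_START = 0.00025
LR_END = 0.000025
LR_ANNEAL_STEPS = 5000000             # made-up horizon, tune to your run length

def annealed_lr(step):
    frac = min(1.0, step / LR_ANNEAL_STEPS)
    return LR_START + frac * (LR_END - LR_START)

# e.g. every few thousand steps, on the online network:
#     K.set_value(agent.brain.model.optimizer.lr, annealed_lr(agent.steps))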
Upvotes: 15