Reputation: 1
I'm trying to solve the cartpole problem, a common introductory reinforcement learning task: a cart that can move left or right balances a pole, the episode ends when the pole falls over, and the goal is to keep the pole balanced for as long as possible.
People usually take the shortcut of using the OpenAI Gym environment, but I wanted to do it without that shortcut, since I eventually want to build my own, more complex environments. I got it running, but it doesn't seem to be learning anything: it collects a maximum of 14 points, every single time. What am I doing wrong? My full code is below.
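For reference, the shortcut I'm skipping looks roughly like this (just a minimal sketch assuming the classic `gym` API, where `env.step` returns four values; newer gym/gymnasium releases return more):

```python
import gym

# The usual shortcut: Gym supplies the cart/pole physics and the episode logic.
env = gym.make("CartPole-v1")

for episode in range(5):
    obs = env.reset()  # observation: cart position/velocity, pole angle/velocity
    done = False
    total_reward = 0
    while not done:
        action = env.action_space.sample()  # random policy, just to show the loop
        obs, reward, done, info = env.step(action)
        total_reward += reward
    print(episode, total_reward)
```

My version below replaces `env.step` and the termination check with my own cart/pole physics, and uses a Q-table indexed by the cart's x position.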
```python
import numpy as np
import math
import statistics
# Environment dimensions
SCREEN_WIDTH = 800
SCREEN_HEIGHT = 600
# Cart properties
CART_WIDTH = 100
CART_HEIGHT = 20
cart_x = SCREEN_WIDTH // 2 - CART_WIDTH
cart_y = SCREEN_HEIGHT - 50
cart_speed = 5
# Pole properties
POLE_LENGTH = 100
POLE_ANGLE = math.pi / 4
POLE_ANGULAR_VELOCITY = 0.0
POLE_ANGULAR_ACCELERATION = 0.0
GRAVITY = 0.01
# Game loop flag
running = True
EPISODES = 20000
LEFT = 0
RIGHT = 1
ACTIONS = [LEFT, RIGHT]
EPSILON = 0.9
EPSILON_DECAY = 0.01
MIN_EPSILON = 0.01
LEARNING_RATE = 0.5
DISCOUNT = 0.9
q_table = np.zeros((800, len(ACTIONS)))
def check_game_over(pole_angle):
    if abs(pole_angle) > math.pi / 2:
        return True
    return False

def update_pos(state, action, pole_angular_acceleration, pole_angle, pole_angular_velocity):
    if action == 0:
        state -= cart_speed
    if action == 1:
        state += cart_speed
    # Constrain cart within screen boundaries
    state = max(0, min(state, SCREEN_WIDTH - CART_WIDTH))
    # update pole physics
    pole_angular_acceleration = GRAVITY * math.sin(pole_angle)
    pole_angular_velocity += pole_angular_acceleration
    pole_angle += pole_angular_velocity
    # apply damping to stabilize the pole
    pole_angular_velocity *= 0.99
    return state, pole_angle, pole_angular_velocity, pole_angular_acceleration

def choose_action(state, epsilon):
    if np.random.uniform() < epsilon:
        action = np.argmax(q_table[state])
    else:
        action = np.random.choice(ACTIONS)
    return action

def train():
    for e in range(EPISODES):
        pole_angular_velocity = POLE_ANGULAR_VELOCITY
        pole_angle = POLE_ANGLE
        pole_angular_acceleration = POLE_ANGULAR_ACCELERATION
        reward = 0
        rewards = []
        avg_rewards = []
        epsilon = EPSILON
        state = SCREEN_WIDTH // 2 - CART_WIDTH
        while not check_game_over(pole_angle):
            # choose action
            action = choose_action(state, epsilon)
            # update positions
            old_pos = q_table[state][action]
            next_s, pole_angle, pole_angular_velocity, pole_angular_acceleration = update_pos(state, action, pole_angular_acceleration, pole_angle, pole_angular_velocity)
            next_max = max(q_table[int(old_pos)])
            new_value = (1 - LEARNING_RATE) * old_pos + LEARNING_RATE * (reward + DISCOUNT * next_max)
            q_table[int(old_pos)][action] = new_value
            state = next_s
            # reward stuff
            reward += 1
            print(reward)
        rewards.append(reward)
        epsilon = max(MIN_EPSILON, epsilon * EPSILON_DECAY)
        if e % 100 == 0:
            avg_rewards.append(statistics.mean(rewards))
            print(avg_rewards)

train()
```
I thought the problem was that I wasn't decreasing epsilon, but adding the decay didn't change the performance at all.
Upvotes: 0
Views: 10