Reputation: 1743
I am training a REINFORCE algorithm on the CartPole environment. Due to the simple nature of the environment, I expect it to learn quickly. However, that doesn't happen.
Here is the main portion of the algorithm -
for i in range(episodes):
    print("i = ", i)
    state = env.reset()
    done = False
    transitions = []
    tot_rewards = 0
    while not done:
        act_proba = model(torch.from_numpy(state))
        action = np.random.choice(np.array([0,1]), p = act_proba.data.numpy())
        next_state, reward, done, info = env.step(action)
        tot_rewards += 1
        transitions.append((state, action, tot_rewards))
        state = next_state
    if i%50==0:
        print("i = ", i, ", reward = ", tot_rewards)
    score.append(tot_rewards)
    reward_batch = torch.Tensor([r for (s,a,r) in transitions])
    disc_rewards = discount_rewards(reward_batch)
    nrml_disc_rewards = normalize_rewards(disc_rewards)
    state_batch = torch.Tensor([s for (s,a,r) in transitions])
    action_batch = torch.Tensor([a for (s,a,r) in transitions])
    pred_batch = model(state_batch)
    prob_batch = pred_batch.gather(dim=1, index=action_batch.long().view(-1, 1)).squeeze()
    loss = -(torch.sum(torch.log(prob_batch)*nrml_disc_rewards))
    opt.zero_grad()
    loss.backward()
    opt.step()
Here is the entire algorithm -
#I referred to this when writing the code - https://github.com/DeepReinforcementLearning/DeepReinforcementLearningInAction/blob/master/Chapter%204/Ch4_book.ipynb
import numpy as np
import gym
import torch
from torch import nn
env = gym.make('CartPole-v0')
learning_rate = 0.0001
episodes = 10000
def discount_rewards(reward, gamma = 0.99):
    return torch.pow(gamma, torch.arange(len(reward)))*reward

def normalize_rewards(disc_reward):
    return disc_reward/(disc_reward.max())

class NeuralNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(NeuralNetwork, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(state_size, 300),
            nn.ReLU(),
            nn.Linear(300, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, action_size),
            nn.Softmax()
        )

    def forward(self, x):
        x = self.linear_relu_stack(x)
        return x
model = NeuralNetwork(env.observation_space.shape[0], env.action_space.n)
opt = torch.optim.Adam(params = model.parameters(), lr = learning_rate)
score = []
for i in range(episodes):
    print("i = ", i)
    state = env.reset()
    done = False
    transitions = []
    tot_rewards = 0
    while not done:
        act_proba = model(torch.from_numpy(state))
        action = np.random.choice(np.array([0,1]), p = act_proba.data.numpy())
        next_state, reward, done, info = env.step(action)
        tot_rewards += 1
        transitions.append((state, action, tot_rewards))
        state = next_state
    if i%50==0:
        print("i = ", i, ", reward = ", tot_rewards)
    score.append(tot_rewards)
    reward_batch = torch.Tensor([r for (s,a,r) in transitions])
    disc_rewards = discount_rewards(reward_batch)
    nrml_disc_rewards = normalize_rewards(disc_rewards)
    state_batch = torch.Tensor([s for (s,a,r) in transitions])
    action_batch = torch.Tensor([a for (s,a,r) in transitions])
    pred_batch = model(state_batch)
    prob_batch = pred_batch.gather(dim=1, index=action_batch.long().view(-1, 1)).squeeze()
    loss = -(torch.sum(torch.log(prob_batch)*nrml_disc_rewards))
    opt.zero_grad()
    loss.backward()
    opt.step()
Upvotes: 1
Views: 1017
Reputation: 305
The mistake is in how you compute the discounted rewards.
In REINFORCE (and many other algorithms) you need to compute, for every step, the sum of discounted rewards from that step onward.
This means that the discounted returns for the first two steps should be:
G_1 = r_1 + gamma * r_2 + (gamma ** 2) * r_3 + ... + (gamma ** (T-1)) * r_T
G_2 = r_2 + gamma * r_3 + (gamma ** 2) * r_4 + ... + (gamma ** (T-2)) * r_T
And so on...
This gives you an array containing, for every step, the sum of future discounted rewards (i.e. [G_1, G_2, G_3, ..., G_T]).
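For example, with gamma = 0.99 and a three-step episode where every reward is 1 (as in CartPole), the returns are
G_3 = 1
G_2 = 1 + 0.99 * 1 = 1.99
G_1 = 1 + 0.99 * 1.99 = 2.9701
so earlier steps are credited with everything that happens after them.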
However, what you currently compute only applies a discount to each step's own reward:
G_1 = r_1
G_2 = gamma * r_2
G_3 = (gamma ** 2) * r_3
And so on...
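Applied to the same three all-one rewards, this gives [1, 0.99, 0.9801]: each value depends only on the step's position in the episode, not on what happens afterwards, so the policy gradient gets essentially no credit-assignment signal.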
Here is the Python code fixing your problem. We compute the returns from the back of the reward list to the front, which is more computationally efficient than summing from scratch at every step.
def discount_rewards(reward, gamma=0.99):
    R = 0
    returns = []
    reward = reward.tolist()
    for r in reward[::-1]:
        R = r + gamma * R
        returns.append(R)
    returns = torch.tensor(returns[::-1])
    return returns
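As a quick sanity check (a minimal sketch; the all-ones reward vector is just an illustrative stand-in for CartPole's reward of 1 per step), you can compare the two versions on a short episode:

import torch

# assumes the corrected discount_rewards above is defined in the same scope
rewards = torch.ones(5)            # five steps, reward of 1 at each step
print(discount_rewards(rewards))   # tensor([4.9010, 3.9404, 2.9701, 1.9900, 1.0000])
# the original torch.pow(gamma, torch.arange(len(reward))) * reward would instead
# give roughly tensor([1.0000, 0.9900, 0.9801, 0.9703, 0.9606])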
Here is a figure showing the progression of the algorithm's score over the first 5000 steps.
Upvotes: 3