I am currently trying to train a DQN (using gym and PyTorch) to solve small instances of the Travelling Salesman Problem (for now I just want to solve a size-10 problem, which I know it is capable of doing easily). I have already solved it with tabular Q-learning and have now moved on to Deep Q-Learning. For some reason, while training the DQN I will observe that it achieves, for example, 80% accuracy (i.e. it is within 20% of the optimal solution) for 50 episodes, and then suddenly drops to 50%, which obviously makes no sense, as its performance should improve over time. I have left it to train for up to 10,000 episodes and the performance is still pretty terrible.
I am using a custom gym environment where the observations are simply binary arrays of visited nodes (so for a size-10 graph the observation is a size-10 binary array). I know the issue isn't in my environment, as tabular Q-learning works perfectly on it. I am also masking the Q-values output by the network according to which actions are allowed (i.e. a node that has already been visited cannot be selected again). Is there something wrong in my code? I feel like I have made some silly mistake somewhere that is impacting my experience replay, but I'm not sure what it is! Thanks in advance :)
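For concreteness, this is roughly what an observation and the derived set of valid actions look like for a size-10 graph (an illustrative sketch only, not the environment code itself):

import numpy as np

# Illustrative observation after visiting nodes 0, 3 and 7 (1 = visited)
observation = np.array([1, 0, 0, 1, 0, 0, 0, 1, 0, 0], dtype=np.float32)

# The mask keeps only the Q-values of unvisited nodes when picking the next one
valid_actions = [a for a in range(len(observation)) if observation[a] == 0]
print(valid_actions)  # [1, 2, 4, 5, 6, 8, 9]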
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class FeedForwardNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, lr):
        super().__init__()
        self.linear1 = nn.Linear(*input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, output_size)

        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss = nn.SmoothL1Loss()
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, x):
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        actions = self.linear3(x)
        return actions


class TrainingAgentDQN():
    def __init__(self, gamma, epsilon, lr, input_size, batch_size, output_size,
                 max_mem_size=100000, eps_end=0.01, eps_dec=5e-4):
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.lr = lr
        self.mem_size = max_mem_size
        self.batch_size = batch_size
        self.mem_cntr = 0

        self.Q_eval = FeedForwardNet(input_size=input_size, output_size=output_size,
                                     lr=self.lr, hidden_size=256)

        # Replay buffer (circular, overwrites oldest entries when full)
        self.state_memory = np.zeros((self.mem_size, *input_size), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_size), dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)

    def store_transition(self, state, action, reward, state_, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = done
        self.mem_cntr += 1

    def choose_action(self, observation, route):
        visited = observation

        # If all nodes have been visited, return to the starting node
        if len(route) == len(visited):
            action = route[0]
            return action

        if np.random.random() > self.epsilon:
            state = torch.tensor(observation, dtype=torch.float32).to(self.Q_eval.device)
            actions = self.Q_eval.forward(state)

            # Mask invalid actions: only consider Q-values of unvisited nodes
            valid_actions = [a for a in range(len(visited)) if visited[a] == 0]
            masked_actions = torch.tensor(valid_actions, dtype=torch.long).to(self.Q_eval.device)
            masked_q_values = torch.index_select(actions, 0, masked_actions)
            action = torch.argmax(masked_q_values).item()
            action = masked_actions[action]
        else:
            # Explore: choose uniformly from the valid (unvisited) actions
            valid_actions = [a for a in range(len(visited)) if visited[a] == 0]
            action = np.random.choice(valid_actions)

        return action.item()

    def learn(self):
        if self.mem_cntr < self.batch_size:
            return

        self.Q_eval.optimizer.zero_grad()

        # Sample a batch of stored transitions without replacement
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, self.batch_size, replace=False)
        batch_index = np.arange(self.batch_size, dtype=np.int32)

        state_batch = torch.tensor(self.state_memory[batch]).to(self.Q_eval.device)
        new_state_batch = torch.tensor(self.new_state_memory[batch]).to(self.Q_eval.device)
        reward_batch = torch.tensor(self.reward_memory[batch]).to(self.Q_eval.device)
        terminal_batch = torch.tensor(self.terminal_memory[batch]).to(self.Q_eval.device)
        action_batch = self.action_memory[batch]

        # Q(s, a) for the actions actually taken
        q_eval = self.Q_eval.forward(state_batch)[batch_index, action_batch]
        # Bootstrapped target from the same network's estimate of the next state
        q_next = self.Q_eval.forward(new_state_batch)
        q_next[terminal_batch] = 0.0
        q_target = reward_batch + self.gamma * torch.max(q_next, dim=1)[0]

        loss = self.Q_eval.loss(q_target, q_eval).to(self.Q_eval.device)
        loss.backward()
        self.Q_eval.optimizer.step()

        # Linear epsilon decay down to eps_min
        self.epsilon = self.epsilon - self.eps_dec \
            if self.epsilon > self.eps_min else self.eps_min
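In case it's relevant, the agent is driven by a fairly standard episode loop along these lines (a simplified sketch: env stands for the custom gym environment with the usual reset/step API, the hyperparameter values are placeholders, and the route bookkeeping is illustrative):

# Simplified sketch of the episode loop driving the agent above
agent = TrainingAgentDQN(gamma=0.99, epsilon=1.0, lr=1e-3,
                         input_size=(10,), batch_size=64, output_size=10)

for episode in range(10000):
    observation = env.reset()
    route = []          # nodes visited so far, in order
    done = False
    while not done:
        action = agent.choose_action(observation, route)
        route.append(action)
        observation_, reward, done, info = env.step(action)
        agent.store_transition(observation, action, reward, observation_, done)
        agent.learn()
        observation = observation_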