I am currently trying to train a DQN (using gym and PyTorch) to solve small instances of the Travelling Salesman Problem (for now I just want to solve a size-10 problem, which I know it is capable of doing easily). I have already solved it with tabular Q-learning and have now moved on to Deep Q-Learning. For some reason, while training the DQN I will observe that it achieves, for example, 80% accuracy (i.e. it is within 20% of the optimal solution) for 50 episodes, and then suddenly drops to 50%, which obviously makes no sense, as its performance should improve over time. I have left it to train for up to 10,000 episodes and the performance is still pretty terrible.
I am using a custom gym environment where the observations are simply binary arrays of visited nodes (so for a size-10 graph the observation is a size-10 binary array). I know the issue isn't in my environment, as tabular Q-learning works perfectly on it. I am also masking the Q-values output by the network according to which actions are allowed (i.e. a node that has already been visited cannot be selected again). Is there something wrong in my code? I feel like I have made some silly mistake somewhere that is impacting my experience replay, but I'm not sure what it is! Thanks in advance :)
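For concreteness, this is roughly what an observation and the derived set of valid actions look like for a size-10 graph (an illustrative sketch only, not the environment code itself):

import numpy as np

# Illustrative observation after visiting nodes 0, 3 and 7 (1 = visited)
observation = np.array([1, 0, 0, 1, 0, 0, 0, 1, 0, 0], dtype=np.float32)

# The mask keeps only the Q-values of unvisited nodes when picking the next one
valid_actions = [a for a in range(len(observation)) if observation[a] == 0]
print(valid_actions)  # [1, 2, 4, 5, 6, 8, 9]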
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class FeedForwardNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, lr):
        super().__init__()
        self.linear1 = nn.Linear(*input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, output_size)

        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss = nn.SmoothL1Loss()
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, x):
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        actions = self.linear3(x)
        return actions


class TrainingAgentDQN():
    def __init__(self, gamma, epsilon, lr, input_size, batch_size, output_size,
                 max_mem_size=100000, eps_end=0.01, eps_dec=5e-4):
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.lr = lr
        self.mem_size = max_mem_size
        self.batch_size = batch_size
        self.mem_cntr = 0

        self.Q_eval = FeedForwardNet(input_size=input_size, output_size=output_size,
                                     lr=self.lr, hidden_size=256)

        # Replay buffer (circular, overwrites oldest entries when full)
        self.state_memory = np.zeros((self.mem_size, *input_size), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_size), dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)

    def store_transition(self, state, action, reward, state_, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = done
        self.mem_cntr += 1

    def choose_action(self, observation, route):
        visited = observation

        # If all nodes have been visited, return to the starting node
        if len(route) == len(visited):
            action = route[0]
            return action

        if np.random.random() > self.epsilon:
            state = torch.tensor(observation, dtype=torch.float32).to(self.Q_eval.device)
            actions = self.Q_eval.forward(state)

            # Mask invalid actions: only consider Q-values of unvisited nodes
            valid_actions = [a for a in range(len(visited)) if visited[a] == 0]
            masked_actions = torch.tensor(valid_actions, dtype=torch.long).to(self.Q_eval.device)
            masked_q_values = torch.index_select(actions, 0, masked_actions)
            action = torch.argmax(masked_q_values).item()
            action = masked_actions[action]
        else:
            # Explore: choose uniformly from the valid (unvisited) actions
            valid_actions = [a for a in range(len(visited)) if visited[a] == 0]
            action = np.random.choice(valid_actions)

        return action.item()

    def learn(self):
        if self.mem_cntr < self.batch_size:
            return

        self.Q_eval.optimizer.zero_grad()

        # Sample a batch of stored transitions without replacement
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, self.batch_size, replace=False)
        batch_index = np.arange(self.batch_size, dtype=np.int32)

        state_batch = torch.tensor(self.state_memory[batch]).to(self.Q_eval.device)
        new_state_batch = torch.tensor(self.new_state_memory[batch]).to(self.Q_eval.device)
        reward_batch = torch.tensor(self.reward_memory[batch]).to(self.Q_eval.device)
        terminal_batch = torch.tensor(self.terminal_memory[batch]).to(self.Q_eval.device)
        action_batch = self.action_memory[batch]

        # Q(s, a) for the actions actually taken
        q_eval = self.Q_eval.forward(state_batch)[batch_index, action_batch]
        # Bootstrapped target from the same network's estimate of the next state
        q_next = self.Q_eval.forward(new_state_batch)
        q_next[terminal_batch] = 0.0
        q_target = reward_batch + self.gamma * torch.max(q_next, dim=1)[0]

        loss = self.Q_eval.loss(q_target, q_eval).to(self.Q_eval.device)
        loss.backward()
        self.Q_eval.optimizer.step()

        # Linear epsilon decay down to eps_min
        self.epsilon = self.epsilon - self.eps_dec \
            if self.epsilon > self.eps_min else self.eps_min
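In case it's relevant, the agent is driven by a fairly standard episode loop along these lines (a simplified sketch: env stands for the custom gym environment with the usual reset/step API, the hyperparameter values are placeholders, and the route bookkeeping is illustrative):

# Simplified sketch of the episode loop driving the agent above
agent = TrainingAgentDQN(gamma=0.99, epsilon=1.0, lr=1e-3,
                         input_size=(10,), batch_size=64, output_size=10)

for episode in range(10000):
    observation = env.reset()
    route = []          # nodes visited so far, in order
    done = False
    while not done:
        action = agent.choose_action(observation, route)
        route.append(action)
        observation_, reward, done, info = env.step(action)
        agent.store_transition(observation, action, reward, observation_, done)
        agent.learn()
        observation = observation_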