alireza

Reputation: 1

Encountering Shape Mismatch Error in DDQN Implementation with T-Soft Update and PER

I'm a beginner in Reinforcement Learning and Machine Learning. I've been implementing a Double Deep Q-Network (DDQN) algorithm for learning purposes, but I encountered a shape mismatch error.

Specifically, the target is computed once per transition in the batch:

target_q_values = reward + (1-done) * self.gamma * next_q_values.squeeze(1)

while the current_q_values tensor holds the Q-values gathered for the chosen actions:

current_q_values = self.q_net(state)
current_q_values = torch.gather(current_q_values,dim=1,index=action.long())

The two tensors end up with different shapes (see the shape printouts below), which leads to an error when computing the Temporal Difference (TD) error:

td_error = torch.abs(target_q_values - current_q_values)
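For reference, torch.gather along dim=1 returns a tensor with the same shape as its index argument, so the shape of current_q_values after the gather follows the shape of action. A tiny standalone check (the tensors here are made up purely to illustrate the shape behaviour):

import torch

q = torch.randn(64, 2)                              # Q-values for 2 actions per state
idx_one = torch.zeros(64, 1, dtype=torch.long)      # one action index per state
idx_two = torch.zeros(64, 2, dtype=torch.long)      # two indices per state
print(torch.gather(q, dim=1, index=idx_one).shape)  # torch.Size([64, 1])
print(torch.gather(q, dim=1, index=idx_two).shape)  # torch.Size([64, 2])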

Here is the error:

Traceback (most recent call last):
  File "c:\Users\ali mola\jupyter_sttuf\test.py", line 14, in <module>
    model.train(1_000_000,buffer)#TODO-tep()' even though this environment has already returned terminated = True. You should always call 'reset()' once you receive 'terminated = True' -- any further steps are undefined behavior.
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ali mola\jupyter_sttuf\DDQN_agent.py", line 226, in train
    loss, td_error = self.update(batch, weights=weights)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ali mola\jupyter_sttuf\DDQN_agent.py", line 154, in update
    td_error = torch.abs(target_q_values - current_q_values)
                         ~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~
RuntimeError: The size of tensor a (64) must match the size of tensor b (2) at non-singleton dimension 1

Here is some shape information to help make sense of the code and the error:

state shape: torch.Size([64, 4])
next_state shape: torch.Size([64, 4])
action shape: torch.Size([64, 2])
reward shape: torch.Size([64])
done shape: torch.Size([64])
current_q_values shape: torch.Size([64, 2])
target_q_values shape: torch.Size([64])
next_q_values shape: torch.Size([64, 1])
next_q_values_online shape: torch.Size([64, 2])
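Given those shapes, the subtraction alone already reproduces the broadcasting error; here is a minimal standalone snippet with placeholder values (shapes copied from the printouts above):

import torch

target_q_values = torch.zeros(64)       # shape [64], one target per transition
current_q_values = torch.zeros(64, 2)   # shape [64, 2], as printed above
# broadcasting aligns shapes from the right, so (64,) vs (64, 2) mismatches at the last dimension
td_error = torch.abs(target_q_values - current_q_values)
# -> RuntimeError: The size of tensor a (64) must match the size of tensor b (2) at non-singleton dimension 1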

Here is my DDQN class:

class DDQN():
    def __init__(self,
                state_size,
                action_size,
                env,
                epsilon_factor_decay=0.99,
                beta_rate=0.1,
                gamma=0.99,
                tau=0.5,
                learning_rate=0.001,
                nu=1.0,
                target_update_interval=1000,
                hidden=64):
        torch.manual_seed(0)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(0)
        self.loss_fn = nn.HuberLoss()
        self.env = env 
        self.epsilon_factor_decay = epsilon_factor_decay
        self.beta_rate = beta_rate
        self.nu = nu
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.tau = tau
        self.action_size = action_size
        self.state_size = state_size
        self.hidden = hidden
        self.target_update_interval = target_update_interval
        self._n_calls = 0
        self.q_net = Net(self.state_size,self.action_size,self.hidden).to(device())
        self.q_net_target = Net(self.state_size,self.action_size,self.hidden).to(device())
        self.t_soft = TSoftUpdate(self.q_net,self.q_net_target,self.nu,self.tau)
        self.optimizer = optim.Adam(self.q_net.parameters(),lr=self.learning_rate)
    def act(self,state,epsilon):
        torch.manual_seed(0)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(0)
        self.q_net.eval()
        with torch.no_grad():
            state = torch.as_tensor(state).to(device())
            if epsilon > random.uniform(0.0,0.5):
                action = self.env.action_space.sample()
            else:
                action = torch.argmax(self.q_net(state)).cpu().numpy().item()
        return action

    def update(self,batch,weights=None):
        torch.manual_seed(0)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(0)
        self.q_net.train()
        self.q_net_target.train()
        state,next_state,action,reward,done = batch
        print(f"state shape: {state.shape}")
        print(f"next_state shape: {next_state.shape}")
        print(f"action shape: {action.shape}")
        print(f"reward shape: {reward.shape}")
        print(f"done shape: {done.shape}")
        with torch.no_grad():
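            # Double DQN target: select the greedy action with the online net, evaluate it with the target net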
            next_q_values = self.q_net_target(next_state)
            next_q_values_online = self.q_net(next_state)
            max_q_online_action = next_q_values_online.max(dim=1)
            next_q_values = torch.gather(next_q_values,dim=1,index=max_q_online_action.indices.unsqueeze(1))
            print(f'next_q_values:{next_q_values.shape}')
            target_q_values = reward + (1-done) * self.gamma * next_q_values.squeeze(1)
            print(f'target_q_values:{target_q_values.shape}')
        current_q_values = self.q_net(state)
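        # gather the Q-values of the taken actions; the result has the same shape as `action`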
        current_q_values = torch.gather(current_q_values,dim=1,index=action.long())
        print(f'current_q_values:{current_q_values},target_q_values:{target_q_values}')
        td_error = torch.abs(target_q_values - current_q_values)
        assert current_q_values.shape == target_q_values.shape  
        if weights is None:
            weights = torch.ones_like(current_q_values)
        loss = (weights.to(device()) * (self.loss_fn(current_q_values, target_q_values))).mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        if self._n_calls % self.target_update_interval == 0:
            self.t_soft.update()
        return loss , td_error
    def save(self):
        torch.save(self.q_net, "q_net.pth")
        torch.save(self.q_net_target, "q_net_target.pth")
    def evaluate_policy(self,episodes=10, seed=0):
        set_seed(self.env, seed=seed)

        returns = []
        for ep in range(episodes):
            done, total_reward = False, 0
            state, _ = self.env.reset(seed=seed)
            while not done:
                state, reward, terminated, truncated, _ = self.env.step(self.act(state,self.epsilon))
                done = terminated or truncated
                total_reward += reward
            returns.append(total_reward)
        return np.mean(returns), np.std(returns)
    def power_decay_schedule(self,episode_number:int,
                         decay_factor: float,
                         minimum_epsilon: float) -> float:
        """Power decay schedule found in other practical applications."""
        return max(decay_factor**episode_number, minimum_epsilon)
    def train(self,
              gradient_steps: int,
              buffer,
              eps_min:float=0.07,
              batch_size: int = 64,
              seed:int=0,
              test_every:int=10000,
              best_reward=150):
        torch.manual_seed(0)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(0)
        print(f"Device: {device()}, Seed: {seed}")
        rewards_total, stds_total = [], []
        loss_count, total_loss = 0, 0
        episodes = 0
        done = False
        state,_ = self.env.reset(seed=seed)

        for step in range(1,gradient_steps+1):
            if done:
                done = False
                state, _ = self.env.reset(seed=seed)
                episodes += 1
                self._n_calls +=1
            self.epsilon = self.power_decay_schedule(step,self.epsilon_factor_decay,eps_min)
            self.beta = 1 - np.exp(-self.beta_rate * step)
            action = self.act(state,epsilon=self.epsilon)
            if action > self.action_size:
                raise ValueError(f'action size{self.action_size}, action{action}')
            else:
                next_state, reward, terminated, truncated, _ = self.env.step(action)
            done = terminated or truncated
            buffer.add((state,action,reward,next_state,int(done)),beta=self.beta)
            state = next_state
            rewards_total.append(np.mean(reward))
            stds_total.append(np.std(reward))
            if step > batch_size:
                if buffer:
                    if isinstance(buffer,PrioritizedExperienceReplayBuffer):
                        batch , weights, tree_idx= buffer.sample(batch_size)
                        loss, td_error = self.update(batch, weights=weights)
                        buffer.update_priorities(tree_idx, td_error.cpu().detach().numpy())            
                else:
                    raise RuntimeError("buffer is not set")
                total_loss += loss
                loss_count += 1
                if step % test_every == 0:
                    mean, std = self.evaluate_policy(episodes=10, seed=seed)
                    print(f"Episode: {episodes}, Step: {step}, Reward mean: {mean:.2f}, Reward std: {std:.2f}, Loss: {total_loss / loss_count:.4f}, Eps: {self.epsilon},beta: {self.beta}")#TODO-track the reward and imporve it 

                    if mean > best_reward:
                        best_reward = mean
                        self.save()
                    rewards_total.append(mean)
                    stds_total.append(std)
        return np.array(rewards_total), np.array(stds_total)

I've searched online and attempted to use AI assistants to resolve the issue, but without success.

Sorry for making it long, and thank you for reading this.

Upvotes: 0

Views: 9

Answers (0)
