I'm a beginner in reinforcement learning and machine learning. I've been implementing a Double Deep Q-Network (DDQN) algorithm for learning purposes, but I've run into a shape mismatch error.
Specifically, the target_q_values tensor ends up holding a single target value per state in the batch (shape [64]):

target_q_values = reward + (1 - done) * self.gamma * next_q_values.squeeze(1)

while the current_q_values tensor, which should only hold the Q-value of the chosen action for each state, ends up with shape [64, 2]:

current_q_values = self.q_net(state)
current_q_values = torch.gather(current_q_values, dim=1, index=action.long())

This leads to an error during the calculation of the Temporal Difference (TD) error:

td_error = torch.abs(target_q_values - current_q_values)
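For reference, here is a minimal standalone snippet (not from my project, just the shapes reported by the printouts further down) that reproduces the same broadcasting error:

import torch

target_q_values = torch.zeros(64)      # shape [64], one target per state
current_q_values = torch.zeros(64, 2)  # shape [64, 2], two values per state
# Broadcasting prepends a 1 to the smaller tensor: [64] -> [1, 64] vs [64, 2],
# so dimension 1 compares 64 against 2 and PyTorch raises the same RuntimeError.
td_error = torch.abs(target_q_values - current_q_values)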
Here is the full traceback:
Traceback (most recent call last):
File "c:\Users\ali mola\jupyter_sttuf\test.py", line 14, in <module>
model.train(1_000_000,buffer)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "c:\Users\ali mola\jupyter_sttuf\DDQN_agent.py", line 226, in train
loss, td_error = self.update(batch, weights=weights)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "c:\Users\ali mola\jupyter_sttuf\DDQN_agent.py", line 154, in update
td_error = torch.abs(target_q_values - current_q_values)
~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~
RuntimeError: The size of tensor a (64) must match the size of tensor b (2) at non-singleton dimension 1
Here is some shape information to help understand the code and the error:
state shape: torch.Size([64, 4])
next_state shape: torch.Size([64, 4])
action shape: torch.Size([64, 2])
reward shape: torch.Size([64])
done shape: torch.Size([64])
current_q_values shape: torch.Size([64, 2])
target_q_values shape: torch.Size([64])
next_q_values:torch.Size([64, 1])
next_q_values_online shape:torch.Size([64, 2])
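If it helps, my understanding of torch.gather (which may be where things go wrong) is that its output has the same shape as the index tensor, so with batch size 64 and 2 actions:

import torch

q = torch.randn(64, 2)  # Q-values for 2 actions per state
idx_pair = torch.zeros(64, 2, dtype=torch.int64)    # index shaped like my action tensor above
idx_single = torch.zeros(64, 1, dtype=torch.int64)  # index with a single action per state
print(torch.gather(q, dim=1, index=idx_pair).shape)    # torch.Size([64, 2])
print(torch.gather(q, dim=1, index=idx_single).shape)  # torch.Size([64, 1])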
Here is my DDQN implementation:
class DDQN():
    def __init__(self,
                 state_size,
                 action_size,
                 env,
                 epsilon_factor_decay=0.99,
                 beta_rate=0.1,
                 gamma=0.99,
                 tau=0.5,
                 learning_rate=0.001,
                 nu=1.0,
                 target_update_interval=1000,
                 hidden=64):
        torch.manual_seed(0)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(0)
        self.loss_fn = nn.HuberLoss()
        self.env = env
        self.epsilon_factor_decay = epsilon_factor_decay
        self.beta_rate = beta_rate
        self.nu = nu
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.tau = tau
        self.action_size = action_size
        self.state_size = state_size
        self.hidden = hidden
        self.target_update_interval = target_update_interval
        self._n_calls = 0
        self.q_net = Net(self.state_size, self.action_size, self.hidden).to(device())
        self.q_net_target = Net(self.state_size, self.action_size, self.hidden).to(device())
        self.t_soft = TSoftUpdate(self.q_net, self.q_net_target, self.nu, self.tau)
        self.optimizer = optim.Adam(self.q_net.parameters(), lr=self.learning_rate)

    def act(self, state, epsilon):
        torch.manual_seed(0)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(0)
        self.q_net.eval()
        with torch.no_grad():
            state = torch.as_tensor(state).to(device())
            if epsilon > random.uniform(0.0, 0.5):
                action = self.env.action_space.sample()
            else:
                action = torch.argmax(self.q_net(state)).cpu().numpy().item()
        return action

    def update(self, batch, weights=None):
        torch.manual_seed(0)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(0)
        self.q_net.train()
        self.q_net_target.train()
        state, next_state, action, reward, done = batch
        print(f"state shape: {state.shape}")
        print(f"next_state shape: {next_state.shape}")
        print(f"action shape: {action.shape}")
        print(f"reward shape: {reward.shape}")
        print(f"done shape: {done.shape}")
        with torch.no_grad():
            # Double DQN target: pick the argmax action with the online net,
            # evaluate it with the target net.
            next_q_values = self.q_net_target(next_state)
            next_q_values_online = self.q_net(next_state)
            max_q_online_action = next_q_values_online.max(dim=1)
            next_q_values = torch.gather(next_q_values, dim=1, index=max_q_online_action.indices.unsqueeze(1))
            print(f'next_q_values:{next_q_values.shape}')
            target_q_values = reward + (1 - done) * self.gamma * next_q_values.squeeze(1)
            print(f'target_q_values:{target_q_values.shape}')
        current_q_values = self.q_net(state)
        current_q_values = torch.gather(current_q_values, dim=1, index=action.long())
        print(f'current_q_values:{current_q_values},target_q_values:{target_q_values}')
        # This is the line that raises the RuntimeError:
        # target_q_values is [64] while current_q_values is [64, 2].
        td_error = torch.abs(target_q_values - current_q_values)
        assert current_q_values.shape == target_q_values.shape
        if weights is None:
            weights = torch.ones_like(current_q_values)
        loss = (weights.to(device()) * (self.loss_fn(current_q_values, target_q_values))).mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        if self._n_calls % self.target_update_interval == 0:
            self.t_soft.update()
        return loss, td_error

    def save(self):
        torch.save(self.q_net, "q_net.pth")
        torch.save(self.q_net_target, "q_net_target.pth")

    def evaluate_policy(self, episodes=10, seed=0):
        set_seed(self.env, seed=seed)
        returns = []
        for ep in range(episodes):
            done, total_reward = False, 0
            state, _ = self.env.reset(seed=seed)
            while not done:
                state, reward, terminated, truncated, _ = self.env.step(self.act(state, self.epsilon))
                done = terminated or truncated
                total_reward += reward
            returns.append(total_reward)
        return np.mean(returns), np.std(returns)

    def power_decay_schedule(self, episode_number: int,
                             decay_factor: float,
                             minimum_epsilon: float) -> float:
        """Power decay schedule found in other practical applications."""
        return max(decay_factor**episode_number, minimum_epsilon)

    def train(self,
              gradient_steps: int,
              buffer,
              eps_min: float = 0.07,
              batch_size: int = 64,
              seed: int = 0,
              test_every: int = 10000,
              best_reward=150):
        torch.manual_seed(0)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(0)
        print(f"Device: {device()}, Seed: {seed}")
        rewards_total, stds_total = [], []
        loss_count, total_loss = 0, 0
        episodes = 0
        done = False
        state, _ = self.env.reset(seed=seed)
        for step in range(1, gradient_steps + 1):
            if done:
                done = False
                state, _ = self.env.reset(seed=seed)
                episodes += 1
            self._n_calls += 1
            self.epsilon = self.power_decay_schedule(step, self.epsilon_factor_decay, eps_min)
            self.beta = 1 - np.exp(-self.beta_rate * step)
            action = self.act(state, epsilon=self.epsilon)
            if action > self.action_size:
                raise ValueError(f'action size{self.action_size}, action{action}')
            else:
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                done = terminated or truncated
                buffer.add((state, action, reward, next_state, int(done)), beta=self.beta)
                state = next_state
                rewards_total.append(np.mean(reward))
                stds_total.append(np.std(reward))
            if step > batch_size:
                if buffer:
                    if isinstance(buffer, PrioritizedExperienceReplayBuffer):
                        batch, weights, tree_idx = buffer.sample(batch_size)
                        loss, td_error = self.update(batch, weights=weights)
                        buffer.update_priorities(tree_idx, td_error.cpu().detach().numpy())
                else:
                    raise RuntimeError("buffer is not set")
                total_loss += loss
                loss_count += 1
            if step % test_every == 0:
                mean, std = self.evaluate_policy(episodes=10, seed=seed)
                print(f"Episode: {episodes}, Step: {step}, Reward mean: {mean:.2f}, Reward std: {std:.2f}, Loss: {total_loss / loss_count:.4f}, Eps: {self.epsilon},beta: {self.beta}")  # TODO: track the reward and improve it
                if mean > best_reward:
                    best_reward = mean
                    self.save()
                rewards_total.append(mean)
                stds_total.append(std)
        return np.array(rewards_total), np.array(stds_total)
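For context, this is the shape flow I expected in update() for a batch of 64 transitions with 2 discrete actions, assuming each transition stores a single integer action index (which is what act() returns); the names below are just placeholders for this sketch, not my actual code:

import torch

batch_size, n_actions, gamma = 64, 2, 0.99
q_online_next = torch.randn(batch_size, n_actions)  # online net on next_state
q_target_next = torch.randn(batch_size, n_actions)  # target net on next_state
q_online_curr = torch.randn(batch_size, n_actions)  # online net on state
reward = torch.randn(batch_size)
done = torch.zeros(batch_size)
action = torch.randint(n_actions, (batch_size, 1))  # one integer index per transition (assumption)

best_next = q_online_next.argmax(dim=1, keepdim=True)    # [64, 1]
next_q = q_target_next.gather(1, best_next).squeeze(1)   # [64]
target_q = reward + (1 - done) * gamma * next_q          # [64]
current_q = q_online_curr.gather(1, action).squeeze(1)   # [64]
td_error = torch.abs(target_q - current_q)               # [64], no broadcasting issue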
I've searched online and attempted to use AI assistants to resolve the issue, but without success.
Sorry for making this long, and thank you for reading.