I'm a beginner in reinforcement learning and machine learning. I've been implementing a Double Deep Q-Network (DDQN) algorithm for learning purposes, but I've run into a shape mismatch error.
Specifically, the target_q_values tensor ends up holding a single target value per state in the batch (shape [64]):

target_q_values = reward + (1 - done) * self.gamma * next_q_values.squeeze(1)

while the current_q_values tensor, which should only hold the Q-value of the chosen action for each state, ends up with shape [64, 2]:

current_q_values = self.q_net(state)
current_q_values = torch.gather(current_q_values, dim=1, index=action.long())

This leads to an error during the calculation of the Temporal Difference (TD) error:

td_error = torch.abs(target_q_values - current_q_values)
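For reference, here is a minimal standalone snippet (not from my project, just the shapes reported by the printouts further down) that reproduces the same broadcasting error:

import torch

target_q_values = torch.zeros(64)      # shape [64], one target per state
current_q_values = torch.zeros(64, 2)  # shape [64, 2], two values per state
# Broadcasting prepends a 1 to the smaller tensor: [64] -> [1, 64] vs [64, 2],
# so dimension 1 compares 64 against 2 and PyTorch raises the same RuntimeError.
td_error = torch.abs(target_q_values - current_q_values)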
Here is the full traceback:
Traceback (most recent call last):
File "c:\Users\ali mola\jupyter_sttuf\test.py", line 14, in <module>
model.train(1_000_000,buffer)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "c:\Users\ali mola\jupyter_sttuf\DDQN_agent.py", line 226, in train
loss, td_error = self.update(batch, weights=weights)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "c:\Users\ali mola\jupyter_sttuf\DDQN_agent.py", line 154, in update
td_error = torch.abs(target_q_values - current_q_values)
~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~
RuntimeError: The size of tensor a (64) must match the size of tensor b (2) at non-singleton dimension 1
Here is some shape information to help understand the code and the error:
state shape: torch.Size([64, 4])
next_state shape: torch.Size([64, 4])
action shape: torch.Size([64, 2])
reward shape: torch.Size([64])
done shape: torch.Size([64])
current_q_values shape: torch.Size([64, 2])
target_q_values shape: torch.Size([64])
next_q_values:torch.Size([64, 1])
next_q_values_online shape:torch.Size([64, 2])
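If it helps, my understanding of torch.gather (which may be where things go wrong) is that its output has the same shape as the index tensor, so with batch size 64 and 2 actions:

import torch

q = torch.randn(64, 2)  # Q-values for 2 actions per state
idx_pair = torch.zeros(64, 2, dtype=torch.int64)    # index shaped like my action tensor above
idx_single = torch.zeros(64, 1, dtype=torch.int64)  # index with a single action per state
print(torch.gather(q, dim=1, index=idx_pair).shape)    # torch.Size([64, 2])
print(torch.gather(q, dim=1, index=idx_single).shape)  # torch.Size([64, 1])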
Here is my DDQN implementation:
class DDQN():
    def __init__(self,
                 state_size,
                 action_size,
                 env,
                 epsilon_factor_decay=0.99,
                 beta_rate=0.1,
                 gamma=0.99,
                 tau=0.5,
                 learning_rate=0.001,
                 nu=1.0,
                 target_update_interval=1000,
                 hidden=64):
        torch.manual_seed(0)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(0)
        self.loss_fn = nn.HuberLoss()
        self.env = env
        self.epsilon_factor_decay = epsilon_factor_decay
        self.beta_rate = beta_rate
        self.nu = nu
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.tau = tau
        self.action_size = action_size
        self.state_size = state_size
        self.hidden = hidden
        self.target_update_interval = target_update_interval
        self._n_calls = 0
        self.q_net = Net(self.state_size, self.action_size, self.hidden).to(device())
        self.q_net_target = Net(self.state_size, self.action_size, self.hidden).to(device())
        self.t_soft = TSoftUpdate(self.q_net, self.q_net_target, self.nu, self.tau)
        self.optimizer = optim.Adam(self.q_net.parameters(), lr=self.learning_rate)

    def act(self, state, epsilon):
        torch.manual_seed(0)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(0)
        self.q_net.eval()
        with torch.no_grad():
            state = torch.as_tensor(state).to(device())
            if epsilon > random.uniform(0.0, 0.5):
                action = self.env.action_space.sample()
            else:
                action = torch.argmax(self.q_net(state)).cpu().numpy().item()
        return action

    def update(self, batch, weights=None):
        torch.manual_seed(0)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(0)
        self.q_net.train()
        self.q_net_target.train()
        state, next_state, action, reward, done = batch
        print(f"state shape: {state.shape}")
        print(f"next_state shape: {next_state.shape}")
        print(f"action shape: {action.shape}")
        print(f"reward shape: {reward.shape}")
        print(f"done shape: {done.shape}")
        with torch.no_grad():
            # Double DQN target: pick the argmax action with the online net,
            # evaluate it with the target net.
            next_q_values = self.q_net_target(next_state)
            next_q_values_online = self.q_net(next_state)
            max_q_online_action = next_q_values_online.max(dim=1)
            next_q_values = torch.gather(next_q_values, dim=1, index=max_q_online_action.indices.unsqueeze(1))
            print(f'next_q_values:{next_q_values.shape}')
            target_q_values = reward + (1 - done) * self.gamma * next_q_values.squeeze(1)
            print(f'target_q_values:{target_q_values.shape}')
        current_q_values = self.q_net(state)
        current_q_values = torch.gather(current_q_values, dim=1, index=action.long())
        print(f'current_q_values:{current_q_values},target_q_values:{target_q_values}')
        # This is the line that raises the RuntimeError:
        # target_q_values is [64] while current_q_values is [64, 2].
        td_error = torch.abs(target_q_values - current_q_values)
        assert current_q_values.shape == target_q_values.shape
        if weights is None:
            weights = torch.ones_like(current_q_values)
        loss = (weights.to(device()) * (self.loss_fn(current_q_values, target_q_values))).mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        if self._n_calls % self.target_update_interval == 0:
            self.t_soft.update()
        return loss, td_error

    def save(self):
        torch.save(self.q_net, "q_net.pth")
        torch.save(self.q_net_target, "q_net_target.pth")

    def evaluate_policy(self, episodes=10, seed=0):
        set_seed(self.env, seed=seed)
        returns = []
        for ep in range(episodes):
            done, total_reward = False, 0
            state, _ = self.env.reset(seed=seed)
            while not done:
                state, reward, terminated, truncated, _ = self.env.step(self.act(state, self.epsilon))
                done = terminated or truncated
                total_reward += reward
            returns.append(total_reward)
        return np.mean(returns), np.std(returns)

    def power_decay_schedule(self, episode_number: int,
                             decay_factor: float,
                             minimum_epsilon: float) -> float:
        """Power decay schedule found in other practical applications."""
        return max(decay_factor**episode_number, minimum_epsilon)

    def train(self,
              gradient_steps: int,
              buffer,
              eps_min: float = 0.07,
              batch_size: int = 64,
              seed: int = 0,
              test_every: int = 10000,
              best_reward=150):
        torch.manual_seed(0)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(0)
        print(f"Device: {device()}, Seed: {seed}")
        rewards_total, stds_total = [], []
        loss_count, total_loss = 0, 0
        episodes = 0
        done = False
        state, _ = self.env.reset(seed=seed)
        for step in range(1, gradient_steps + 1):
            if done:
                done = False
                state, _ = self.env.reset(seed=seed)
                episodes += 1
            self._n_calls += 1
            self.epsilon = self.power_decay_schedule(step, self.epsilon_factor_decay, eps_min)
            self.beta = 1 - np.exp(-self.beta_rate * step)
            action = self.act(state, epsilon=self.epsilon)
            if action > self.action_size:
                raise ValueError(f'action size{self.action_size}, action{action}')
            else:
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                done = terminated or truncated
                buffer.add((state, action, reward, next_state, int(done)), beta=self.beta)
                state = next_state
                rewards_total.append(np.mean(reward))
                stds_total.append(np.std(reward))
            if step > batch_size:
                if buffer:
                    if isinstance(buffer, PrioritizedExperienceReplayBuffer):
                        batch, weights, tree_idx = buffer.sample(batch_size)
                        loss, td_error = self.update(batch, weights=weights)
                        buffer.update_priorities(tree_idx, td_error.cpu().detach().numpy())
                else:
                    raise RuntimeError("buffer is not set")
                total_loss += loss
                loss_count += 1
            if step % test_every == 0:
                mean, std = self.evaluate_policy(episodes=10, seed=seed)
                print(f"Episode: {episodes}, Step: {step}, Reward mean: {mean:.2f}, Reward std: {std:.2f}, Loss: {total_loss / loss_count:.4f}, Eps: {self.epsilon},beta: {self.beta}")  # TODO: track the reward and improve it
                if mean > best_reward:
                    best_reward = mean
                    self.save()
                rewards_total.append(mean)
                stds_total.append(std)
        return np.array(rewards_total), np.array(stds_total)
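For context, this is the shape flow I expected in update() for a batch of 64 transitions with 2 discrete actions, assuming each transition stores a single integer action index (which is what act() returns); the names below are just placeholders for this sketch, not my actual code:

import torch

batch_size, n_actions, gamma = 64, 2, 0.99
q_online_next = torch.randn(batch_size, n_actions)  # online net on next_state
q_target_next = torch.randn(batch_size, n_actions)  # target net on next_state
q_online_curr = torch.randn(batch_size, n_actions)  # online net on state
reward = torch.randn(batch_size)
done = torch.zeros(batch_size)
action = torch.randint(n_actions, (batch_size, 1))  # one integer index per transition (assumption)

best_next = q_online_next.argmax(dim=1, keepdim=True)    # [64, 1]
next_q = q_target_next.gather(1, best_next).squeeze(1)   # [64]
target_q = reward + (1 - done) * gamma * next_q          # [64]
current_q = q_online_curr.gather(1, action).squeeze(1)   # [64]
td_error = torch.abs(target_q - current_q)               # [64], no broadcasting issue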
I've searched online and attempted to use AI assistants to resolve the issue, but without success.
Sorry for making this long, and thank you for reading.