Reputation: 1
I'm trying to run a pretrained model on Google Cloud Platform. After running the download_from_gcs
function and loading the .pth file, my print statement tells me the model didn't load.
Nothing I try gets the model to load when I start training.
import gymnasium as gym
import numpy as np
import torch
import torch.optim as optim
import argparse
from model import DQN
from memory import ReplayMemory
import random
import math
from PIL import Image
from utils import preprocess_env
from google.cloud import storage
def download_from_gcs(bucket_name, source_blob_name, destination_file_name):
"""Downloads a blob from the bucket."""
client = storage.Client()
bucket = client.bucket(bucket_name)
blob = bucket.blob(source_blob_name)
blob.download_to_filename(destination_file_name)
print(f"Downloaded {source_blob_name} from GCS bucket {bucket_name} to local file {destination_file_name}.")
class DQNAgent(object):
def __init__(self, replace_target_cnt, env, state_space, action_space, num_eps, tau=0.0007,
model_name='enduro_model', gamma=0.922, eps_strt=0.9,
eps_end=0.05, eps_dec=1.4e-6, batch_size=128, lr=2e-05):
self.env = env
self.state_space = state_space
self.action_space = action_space
self.batch_size = batch_size
self.GAMMA = gamma
self.LR = lr
self.eps = eps_strt
self.eps_start = eps_strt
self.eps_dec = eps_dec
self.eps_end = eps_end
self.tau = tau
self.steps_done = 0
self.num_eps = num_eps
if torch.cuda.is_available():
print('GPU FOUND')
# Use the first GPU (index 0) if available
self.device = torch.device("cuda")
else:
print("GPU device not found. Falling back to CPU.")
self.device = torch.device("cpu")
#initialize ReplayMemory
self.memory = ReplayMemory(100000)
# After how many training iterations the target network should update
self.replace_target_cnt = replace_target_cnt
self.learn_counter = 0
self.policy_net = DQN(self.state_space, self.action_space, filename=model_name).to(self.device)
self.target_net = DQN(self.state_space, self.action_space, filename=model_name+'target').to(self.device)
self.target_net.eval()
# If pretrained model of the modelname already exists, load it
try:
# bucket_name = "atari-bucket"
# blob_name = "pt-model/enduro_modelX.pth"
# local_file_name = "enduro_modelX.pth"
# # Download the model file from GCS
# download_from_gcs(bucket_name, blob_name, local_file_name)
# models/enduro_modelNULL.pth
self.policy_net.load_model('atari-bucket/pt-model/enduro_modelX.pth', weights_only=True)
print('loaded pretrained model')
except Exception as e:
print(f"Didn't load model: {e}")
# Set target net to be the same as policy net
self.replace_target_net()
#Set optimizer & loss
self.optim = optim.AdamW(self.policy_net.parameters(), lr=self.LR, amsgrad=True)
self.loss = torch.nn.SmoothL1Loss()
def parse_arguments():
# Initialize ArgumentParser
parser = argparse.ArgumentParser(description="DQN Agent for training on a Gym environment.")
# Define the command-line arguments
parser.add_argument('--replace_target_cnt', type=int, default=1000, help='How often to replace the target network')
parser.add_argument('--gamma', type=float, default=0.922, help='Discount factor for future rewards')
parser.add_argument('--eps_strt', type=float, default=0.9, help='Starting epsilon (exploration rate)')
parser.add_argument('--eps_end', type=float, default=0.05, help='Ending epsilon')
parser.add_argument('--eps_dec', type=float, default=1.4e-6, help='Decay rate for epsilon')
parser.add_argument('--batch_size', type=int, default=128, help='Batch size for training')
parser.add_argument('--lr', type=float, default=2e-05, help='Learning rate')
parser.add_argument('--num_eps', type=int, default=5000, help='Number of episodes for training')
parser.add_argument('--render', action='store_true', help='Whether to render the environment during training')
args = parser.parse_args()
return args
def sample_batch(self):
batch = self.memory.sample(self.batch_size)
# print(f'Batch.state = {batch.state}')
# print(f'Batch.state[0].shape = {batch.state[0].shape}')
state_shape = batch.state[0].shape
#Batch.state[0].shape (4, 84, 84)
# Convert to tensors with correct dimensions
state = torch.tensor(batch.state).view(self.batch_size, -1, state_shape[1], state_shape[2]).float().to(self.device)
action = torch.tensor(batch.action).unsqueeze(1).to(self.device)
reward = torch.tensor(batch.reward).float().unsqueeze(1).to(self.device)
state_ = torch.tensor(batch.next_state).view(self.batch_size, -1, state_shape[1], state_shape[2]).float().to(self.device)
done = torch.tensor(batch.done).float().unsqueeze(1).to(self.device)
return state, action, reward, state_, done
#------------------------------------------------#
# # Unpack batch of transitions into separate lists for each attribute
# states, actions, rewards, next_states, dones = zip(*batch)
# print(f'States {states}, actions {actions}, rewards {rewards}, next_states{next_states}')
# Convert each list to a tensor with the correct dimensions
# state = torch.tensor(batch.state).float().to(self.device)
# action = torch.tensor(batch.action).unsqueeze(1).to(self.device) # Ensure actions are 2D (batch_size, 1)
# reward = torch.tensor(batch.reward).float().unsqueeze(1).to(self.device) # Ensure rewards are 2D (batch_size, 1)
# state_ = torch.tensor(batch.next_state).float().to(self.device)
# done = torch.tensor(batch.done).float().unsqueeze(1).to(self.device) # Ensure dones are 2D (batch_size, 1)
# return state, action, reward, state_, done
# Returns the greedy action according to the policy net
def greedy_action(self, obs):
# Ensure obs_ is just the raw observation array
if isinstance(obs, tuple):
obs = obs[0] # If step returns a tuple, get the observation
# print("Choosing Greedy Action")
obs = torch.tensor(obs).float().to(self.device)
obs = obs.unsqueeze(0)
action = self.policy_net(obs).argmax().item()
return action
# Returns an action based on epsilon greedy method
def choose_action(self, obs):
# print("Choosing Action")
if random.random() > self.eps:
action = self.greedy_action(obs)
else:
action = random.choice([x for x in range(self.action_space)])
return action
def replace_target_net(self):
if self.learn_counter % self.replace_target_cnt == 0:
target_net_state_dict = self.target_net.state_dict()
policy_net_state_dict = self.policy_net.state_dict()
for key in policy_net_state_dict:
target_net_state_dict[key] = policy_net_state_dict[key]*self.tau + target_net_state_dict[key]*(1-self.tau)
self.target_net.load_state_dict(target_net_state_dict)
print('Target network replaced')
# Decrement epsilon
def dec_eps(self):
self.eps = self.eps_end + (self.eps_start - self.eps_end) * math.exp(-1. *self.steps_done / self.eps_dec)
# EPS_END + (EPS_START - EPS_END) * \
# math.exp(-1. * steps_done / EPS_DECAY)
def play_games(self, render=True):
# Set network to eval mode
self.policy_net.eval()
self.env = gym.make("Enduro-v4", render_mode="human")
self.env = preprocess_env(self.env)
scores = []
for i in range(self.num_eps):
done = False
# Get preprocessed observation from environment
state, _ = self.env.reset()
score = 0
cnt = 0
while not done:
# Take the greedy action and observe next state
action = self.greedy_action(state)
next_state, reward, done, _, info = self.env.step(action)
if render:
self.env.render()
# Store transition
self.memory.push(state, action, next_state, reward, int(done))
# Calculate score, set next state and obs, and increment counter
score += reward
state = next_state
cnt += 1
# If the score is more than 300, save a gif of that game
if score > 300:
self.save_gif(cnt)
scores.append(score)
print(f'Episode {i}/{self.num_eps}: \n\tScore: {score}\n\tAvg score (past 100): {np.mean(scores[-100:])}\
\n\tEpsilon: {self.eps}\n\tSteps made: {cnt}')
self.env.close()
def learn(self, num_iters=1):
# print('Learning Func')
# Skip learning if there's not enough memory
if self.memory.pointer < self.batch_size:
return
losses = []
for i in range(num_iters):
# Sample batch
state, action, reward, state_, done = self.sample_batch()
# Calculate the Q-value of the action taken
q_eval = self.policy_net(state).gather(1, action)
# Calculate the best next action value from the target net and detach it from the computation graph
q_next = self.target_net(state_).detach().max(1)[0].unsqueeze(1)
# Calculate the target Q-value
# (1 - done) ensures q_target is 0 if transition is in a terminal state
q_target = reward + (1 - done) * (self.GAMMA * q_next)
# Compute the loss
loss = self.loss(q_eval, q_target).to(self.device)
losses.append(loss.cpu().item())
# Perform backward propagation and optimization step
self.optim.zero_grad()
loss.backward()
self.optim.step()
# Increment learn_counter (used for epsilon decay and target network updates)
self.learn_counter += 1
# Check if it's time to replace the target network
self.replace_target_net()
# Save the model and decrement epsilon
self.policy_net.save_model()
self.dec_eps()
return losses
def save_gif(self, num_transitions):
frames = []
for i in range(self.memory.pointer - num_transitions, self.memory.pointer):
frame = Image.fromarray(self.memory.memory[i].state, mode='RGB')
frames.append(frame)
frames[0].save('episode.gif', format='GIF', append_images=frames[1:], save_all=True, duration=10, loop=0)
# Plays num_eps amount of games, while optimizing the model after each episode
def train(self, render=False):
scores = []
avg_losses_per_episode = []
max_score = 0
for i in range(self.num_eps):
done = False
# max_steps = 8000
state, _ = self.env.reset()
score = 0
cnt = 0
while not done:
action = self.choose_action(state)
next_state, reward, done, _, info = self.env.step(action)
if render:
self.env.render()
self.memory.push(state, action, reward, next_state, int(done))
score += reward
state = next_state
cnt += 1
if score > max_score:
max_score = score
if score > 100 and score >= max_score:
self.save_gif(cnt)
scores.append(score)
eps_losses = self.learn(math.ceil(cnt/self.batch_size))
self.steps_done += 1
avg_loss = np.mean(eps_losses) if eps_losses else 0
avg_losses_per_episode.append(avg_loss)
# Periodically save the model to GCS
save_interval = 250 # Save every 250 episodes
if i % save_interval == 0:
model_filename = f"model_checkpoint_{i}" # Name of the model locally
destination_blob_name = f"models/checkpoints/PTmodel_checkpoint_{i}.pth" # GCS path
self.policy_net.save_model(
filename=model_filename,
bucket_name="riqmodel-bucket",
destination_blob_name=destination_blob_name
)
print(f"Model checkpoint {model_filename} saved to GCS as {destination_blob_name}.")
self.env.close()
return avg_losses_per_episode
I've tried:
I run the command: python trainertraining.py --environment "ALE/Enduro-v5" --num_eps 5000 --batch_size 128 --gamma 0.922 --eps_start 0.9 --eps_end 0.05 --eps_decay 1.4e-6 --tau 0.0007 --learning_rate 1e-4 --bucket_name "atari-bucket" --destination_blob_name "training_loss_plot.png"
I expected the pretrained model to be loaded at some point, but it never is. The model is in the GCS bucket and has been downloaded to the Vertex AI browser. Its size is around 6 MB.
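For reference, this is roughly the flow I expect the load step to follow; a minimal sketch, assuming the checkpoint has to be downloaded to a local file before torch.load can read it, and assuming DQN.load_model wraps torch.load + load_state_dict internally (the helper name load_pretrained and the local path are placeholders; the bucket and blob names are the ones from my code):

import torch
from google.cloud import storage

def load_pretrained(policy_net, device,
                    bucket_name="atari-bucket",
                    blob_name="pt-model/enduro_modelX.pth",
                    local_path="enduro_modelX.pth"):
    # Download the checkpoint from GCS to the local filesystem first;
    # torch.load cannot open a "bucket/blob" path directly.
    client = storage.Client()
    client.bucket(bucket_name).blob(blob_name).download_to_filename(local_path)
    # Load the state dict from the local file and copy it into the network.
    state_dict = torch.load(local_path, map_location=device, weights_only=True)
    policy_net.load_state_dict(state_dict)
    print(f"Loaded pretrained weights from {local_path}")

Is something like this what load_model should be doing, or is there a way to point it at the GCS object directly?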
Upvotes: 0
Views: 29