TGeorges

Reputation: 1

Is there a reason Vertex AI doesn't load a .pth file?

I'm trying to run a pretrained model on Google Cloud Platform. After running the download_from_gcs function and loading the .pth file, my print statement tells me that the model didn't load.

Nothing seems to get my model to load when I start training.
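For reference, this is the smallest standalone version of the download-and-load step I can think of; it bypasses my DQN.load_model wrapper and calls torch.load directly so the actual failure reason gets printed. The bucket and blob names are copied from my setup, the rest is only a sketch:

import torch
from google.cloud import storage

# Bucket/blob names copied from my layout; treat them as placeholders.
bucket_name = "atari-bucket"
blob_name = "pt-model/enduro_modelX.pth"
local_path = "enduro_modelX.pth"

# Download the checkpoint from GCS to the local filesystem
storage.Client().bucket(bucket_name).blob(blob_name).download_to_filename(local_path)

try:
    # weights_only=True expects a plain state_dict-style checkpoint
    checkpoint = torch.load(local_path, map_location="cpu", weights_only=True)
    print("torch.load succeeded, got:", type(checkpoint))
except Exception as e:
    print("torch.load failed:", repr(e))

The full training script I'm using is below.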

import gymnasium as gym
import numpy as np
import torch
import torch.optim as optim
import argparse
from model import DQN
from memory import ReplayMemory
import random
import math
from PIL import Image
from utils import preprocess_env
from google.cloud import storage


def download_from_gcs(bucket_name, source_blob_name, destination_file_name):
    """Downloads a blob from the bucket."""
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)
    print(f"Downloaded {source_blob_name} from GCS bucket {bucket_name} to local file {destination_file_name}.")

    

class DQNAgent(object):
    def __init__(self, replace_target_cnt, env, state_space, action_space, num_eps, tau=0.0007, 
                 model_name='enduro_model', gamma=0.922, eps_strt=0.9, 
                 eps_end=0.05, eps_dec=1.4e-6, batch_size=128, lr=2e-05):
        self.env = env
        self.state_space = state_space
        self.action_space = action_space
        self.batch_size = batch_size
        self.GAMMA = gamma
        self.LR = lr
        self.eps = eps_strt
        self.eps_start = eps_strt
        self.eps_dec = eps_dec
        self.eps_end = eps_end
        self.tau = tau
        self.steps_done = 0
        self.num_eps = num_eps
        

        
        if torch.cuda.is_available():
            print('GPU FOUND')
            # Use the first GPU (index 0) if available
            self.device = torch.device("cuda")
        else:
            print("GPU device not found. Falling back to CPU.")
            self.device = torch.device("cpu")

        #initialize ReplayMemory
        self.memory = ReplayMemory(100000)

        # After how many training iterations the target network should update
        self.replace_target_cnt = replace_target_cnt
        self.learn_counter = 0

        self.policy_net = DQN(self.state_space, self.action_space, filename=model_name).to(self.device)
        self.target_net = DQN(self.state_space, self.action_space, filename=model_name+'target').to(self.device)
        self.target_net.eval()

        # If pretrained model of the modelname already exists, load it
        try:
#             bucket_name = "atari-bucket"
#             blob_name = "pt-model/enduro_modelX.pth"
#             local_file_name = "enduro_modelX.pth"

#             # Download the model file from GCS
#             download_from_gcs(bucket_name, blob_name, local_file_name)
            # models/enduro_modelNULL.pth
    
            self.policy_net.load_model('atari-bucket/pt-model/enduro_modelX.pth', weights_only=True)
            print('loaded pretrained model')
        except Exception as e:
            # Print the actual exception so the failure reason is visible
            print(f"Didn't load model: {e!r}")

        # Set target net to be the same as policy net
        self.replace_target_net()

        #Set optimizer & loss
        self.optim = optim.AdamW(self.policy_net.parameters(), lr=self.LR, amsgrad=True)
        self.loss = torch.nn.SmoothL1Loss()
        
    @staticmethod
    def parse_arguments():
        # Initialize ArgumentParser
        parser = argparse.ArgumentParser(description="DQN Agent for training on a Gym environment.")
        
        # Define the command-line arguments
        parser.add_argument('--replace_target_cnt', type=int, default=1000, help='How often to replace the target network')
        parser.add_argument('--gamma', type=float, default=0.922, help='Discount factor for future rewards')
        parser.add_argument('--eps_strt', type=float, default=0.9, help='Starting epsilon (exploration rate)')
        parser.add_argument('--eps_end', type=float, default=0.05, help='Ending epsilon')
        parser.add_argument('--eps_dec', type=float, default=1.4e-6, help='Decay rate for epsilon')
        parser.add_argument('--batch_size', type=int, default=128, help='Batch size for training')
        parser.add_argument('--lr', type=float, default=2e-05, help='Learning rate')
        parser.add_argument('--num_eps', type=int, default=5000, help='Number of episodes for training')
        parser.add_argument('--render', action='store_true', help='Whether to render the environment during training')

        args = parser.parse_args()
        return args
    
    def sample_batch(self):
        batch = self.memory.sample(self.batch_size)
        # print(f'Batch.state = {batch.state}')
        # print(f'Batch.state[0].shape = {batch.state[0].shape}')
        state_shape = batch.state[0].shape
        #Batch.state[0].shape (4, 84, 84)

        # Convert to tensors with correct dimensions
        state = torch.tensor(batch.state).view(self.batch_size, -1, state_shape[1], state_shape[2]).float().to(self.device)
        action = torch.tensor(batch.action).unsqueeze(1).to(self.device)
        reward = torch.tensor(batch.reward).float().unsqueeze(1).to(self.device)
        state_ = torch.tensor(batch.next_state).view(self.batch_size, -1, state_shape[1], state_shape[2]).float().to(self.device)
        done = torch.tensor(batch.done).float().unsqueeze(1).to(self.device)

        return state, action, reward, state_, done


        #------------------------------------------------#
        # # Unpack batch of transitions into separate lists for each attribute
        # states, actions, rewards, next_states, dones = zip(*batch)

        # print(f'States {states}, actions {actions}, rewards {rewards}, next_states{next_states}')

        # Convert each list to a tensor with the correct dimensions
        # state = torch.tensor(batch.state).float().to(self.device)
        # action = torch.tensor(batch.action).unsqueeze(1).to(self.device)  # Ensure actions are 2D (batch_size, 1)
        # reward = torch.tensor(batch.reward).float().unsqueeze(1).to(self.device)  # Ensure rewards are 2D (batch_size, 1)
        # state_ = torch.tensor(batch.next_state).float().to(self.device)
        # done = torch.tensor(batch.done).float().unsqueeze(1).to(self.device)  # Ensure dones are 2D (batch_size, 1)


        # return state, action, reward, state_, done

    # Returns the greedy action according to the policy net
    
    def greedy_action(self, obs):
        # Ensure obs is just the raw observation array
        if isinstance(obs, tuple):
            obs = obs[0]  # If step returns a tuple, get the observation
        # print("Choosing Greedy Action")
        obs = torch.tensor(obs).float().to(self.device)
        obs = obs.unsqueeze(0)
        action = self.policy_net(obs).argmax().item()
        return action

    # Returns an action based on epsilon greedy method
    
    def choose_action(self, obs):
        # print("Choosing Action")
        if random.random() > self.eps:
            action = self.greedy_action(obs)
        else:
            action = random.randrange(self.action_space)
        return action
    
    def replace_target_net(self):
        if self.learn_counter % self.replace_target_cnt == 0:
            # NOTE: this soft-update (Polyak averaging) dict is computed but never
            # applied, because the hard copy below overwrites the target net entirely.
            target_net_state_dict = self.target_net.state_dict()
            policy_net_state_dict = self.policy_net.state_dict()
            for key in policy_net_state_dict:
                target_net_state_dict[key] = policy_net_state_dict[key]*self.tau + target_net_state_dict[key]*(1-self.tau)

            # Hard update: copy the policy net weights straight into the target net
            self.target_net.load_state_dict(self.policy_net.state_dict())
            print('Target network replaced')
    
    # Decrement epsilon 
    def dec_eps(self):
        self.eps = self.eps_end + (self.eps_start - self.eps_end) * math.exp(-1. *self.steps_done / self.eps_dec)

        # EPS_END + (EPS_START - EPS_END) * \
        # math.exp(-1. * steps_done / EPS_DECAY)
        
    def play_games(self, render=True):
        # Set network to eval mode
        self.policy_net.eval()
        self.env = gym.make("Enduro-v4", render_mode="human")
        self.env = preprocess_env(self.env)

        scores = []

        for i in range(self.num_eps):
            done = False

            # Get preprocessed observation from environment
            state, _ = self.env.reset()
            
            score = 0
            cnt = 0
            while not done:
                # Take the greedy action and observe next state
                action = self.greedy_action(state)
                next_state, reward, terminated, truncated, info = self.env.step(action)
                done = terminated or truncated
                if render:
                    self.env.render()


                # Store transition
                # NOTE: argument order here differs from the push() call in train();
                # it must match the field order expected by ReplayMemory.push
                self.memory.push(state, action, next_state, reward, int(done))

                # Calculate score, set next state and obs, and increment counter
                score += reward
                state = next_state
                cnt += 1

            # If the score is more than 300, save a gif of that game
            if score > 300:
                self.save_gif(cnt)

            scores.append(score)
            print(f'Episode {i}/{self.num_eps}: \n\tScore: {score}\n\tAvg score (past 100): {np.mean(scores[-100:])}\
                    \n\tEpsilon: {self.eps}\n\tSteps made: {cnt}')
        
        self.env.close()

    def learn(self, num_iters=1):
        # print('Learning Func')
        # Skip learning if there's not enough memory
        if self.memory.pointer < self.batch_size:
            return 
        
        losses = []

        for i in range(num_iters):
            # Sample batch
            state, action, reward, state_, done = self.sample_batch()

            # Calculate the Q-value of the action taken
            q_eval = self.policy_net(state).gather(1, action)

            # Calculate the best next action value from the target net and detach it from the computation graph
            q_next = self.target_net(state_).detach().max(1)[0].unsqueeze(1)

            # Calculate the target Q-value
            # (1 - done) ensures q_target is 0 if transition is in a terminal state
            q_target = reward + (1 - done) * (self.GAMMA * q_next)

            # Compute the loss
            loss = self.loss(q_eval, q_target).to(self.device)

            losses.append(loss.cpu().item())

            # Perform backward propagation and optimization step
            self.optim.zero_grad()
            loss.backward()
            self.optim.step()

            # Increment learn_counter (used for epsilon decay and target network updates)
            self.learn_counter += 1

            # Check if it's time to replace the target network
            self.replace_target_net()

        # Save the model and decrement epsilon
        self.policy_net.save_model()
        self.dec_eps()

        return losses

    def save_gif(self, num_transitions):
        frames = []
        for i in range(self.memory.pointer - num_transitions, self.memory.pointer):
            frame = Image.fromarray(self.memory.memory[i].state, mode='RGB')
            frames.append(frame)
        
        frames[0].save('episode.gif', format='GIF', append_images=frames[1:], save_all=True, duration=10, loop=0)
    
    # Plays num_eps amount of games, while optimizing the model after each episode
    def train(self, render=False):
        scores = []
        avg_losses_per_episode = []
        max_score = 0

        for i in range(self.num_eps):
            done = False
            # max_steps = 8000
            state, _ = self.env.reset()

            score = 0
            cnt = 0
            while not done:
                action = self.choose_action(state)
                next_state, reward, terminated, truncated, info = self.env.step(action)
                done = terminated or truncated
                if render:
                    self.env.render()

                self.memory.push(state, action, reward, next_state, int(done))
                score += reward
                state = next_state
                cnt += 1

            if score > max_score:
                max_score = score

            if score > 100 and score >= max_score:
                self.save_gif(cnt)

            scores.append(score)
            eps_losses = self.learn(math.ceil(cnt/self.batch_size))
            self.steps_done += 1

            avg_loss = np.mean(eps_losses) if eps_losses else 0
            avg_losses_per_episode.append(avg_loss)

            # Periodically save the model to GCS
            save_interval = 250  # Save every 250 episodes
            if i % save_interval == 0:
                model_filename = f"model_checkpoint_{i}"  # Name of the model locally
                destination_blob_name = f"models/checkpoints/PTmodel_checkpoint_{i}.pth"  # GCS path
                self.policy_net.save_model(
                    filename=model_filename, 
                    bucket_name="riqmodel-bucket", 
                    destination_blob_name=destination_blob_name
                )
                print(f"Model checkpoint {model_filename} saved to GCS as {destination_blob_name}.")

        self.env.close()
        return avg_losses_per_episode

I've tried:

I run the command: python trainertraining.py --environment "ALE/Enduro-v5" --num_eps 5000 --batch_size 128 --gamma 0.922 --eps_start 0.9 --eps_end 0.05 --eps_decay 1.4e-6 --tau 0.0007 --learning_rate 1e-4 --bucket_name "atari-bucket" --destination_blob_name "training_loss_plot.png"

I expected the pretrained model to get loaded at some point, but it never worked. The model is in the GCS bucket and has been downloaded to the Vertex AI file browser; its size is around 6 MB.
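For completeness, here is a quick sanity check I plan to run from the training script's working directory; the two paths are copied from the code above, and this is only a sketch to see whether the path passed to load_model actually points at the downloaded file:

import os
import torch

# The path passed to load_model vs. the local filename the commented-out
# download would create; both copied from the script above.
candidate_paths = ["atari-bucket/pt-model/enduro_modelX.pth", "enduro_modelX.pth"]

for path in candidate_paths:
    if os.path.exists(path):
        print(path, "exists, size:", os.path.getsize(path), "bytes")  # expecting ~6 MB
        checkpoint = torch.load(path, map_location="cpu", weights_only=True)
        print("  torch.load returned:", type(checkpoint))
    else:
        print(path, "does not exist relative to", os.getcwd())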

Upvotes: 0

Views: 29

Answers (0)
