Hossein

Reputation: 3

How do I save the model every 20 epochs and load it again to resume training?

I want to train this model on Colab for 1000 epochs, but it takes too long.

This is my Python code:

It trains the model for 1000 epochs,

and I want to save the model every 20 epochs (for example), then load it again and continue from the last epoch.

Example: train from epoch 1 to 20, then save the model; then load the model and continue from 20 to 40, and so on.

import argparse
import numpy as np
import pandas as pd
import sys, os
from random import shuffle
import torch
import torch.nn as nn
from models.gcn import GCNNet
from utils import *

# training function at each epoch
def train(model, device, train_loader, optimizer, epoch,hidden,cell):
    print('Training on {} samples...'.format(len(train_loader.dataset)))
    model.train()
    for batch_idx, data in enumerate(train_loader):
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data,hidden,cell)
        loss = loss_fn(output, data.y.view(-1, 1).float().to(device))
        loss.backward()
        optimizer.step()
        if batch_idx % LOG_INTERVAL == 0:
            print('Train epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch,
                                                                           batch_idx * len(data.x),
                                                                           len(train_loader.dataset),
                                                                           100. * batch_idx / len(train_loader),
                                                                           loss.item()))

def predicting(model, device, loader,hidden,cell):
    model.eval()
    total_preds = torch.Tensor()
    total_labels = torch.Tensor()
    print('Make prediction for {} samples...'.format(len(loader.dataset)))
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            output = model(data,hidden,cell)
            total_preds = torch.cat((total_preds, output.cpu()), 0)
            total_labels = torch.cat((total_labels, data.y.view(-1, 1).cpu()), 0)
    return total_labels.numpy().flatten(),total_preds.numpy().flatten()


loss_fn = nn.MSELoss()
LOG_INTERVAL = 20

def main(args):
  dataset = args.dataset
  modeling = [GCNNet]
  model_st = modeling[0].__name__

  cuda_name = "cuda:0"
  print('cuda_name:', cuda_name)

  TRAIN_BATCH_SIZE = args.batch_size
  TEST_BATCH_SIZE = args.batch_size
  LR = args.lr
  
  NUM_EPOCHS = args.epoch

  print('Learning rate: ', LR)
  print('Epochs: ', NUM_EPOCHS)

  # Main program: iterate over different datasets
  print('\nrunning on ', model_st + '_' + dataset )
  processed_data_file_train = 'data/processed/' + dataset + '_train.pt'
  processed_data_file_test = 'data/processed/' + dataset + '_test.pt'
  if ((not os.path.isfile(processed_data_file_train)) or (not os.path.isfile(processed_data_file_test))):
     print('please run create_data.py to prepare data in pytorch format!')
  else:
    train_data = TestbedDataset(root='data', dataset=dataset+'_train')
    test_data = TestbedDataset(root='data', dataset=dataset+'_test')
        
    # make data PyTorch mini-batch processing ready
    train_loader = DataLoader(train_data, batch_size=TRAIN_BATCH_SIZE, shuffle=True,drop_last=True)
    test_loader = DataLoader(test_data, batch_size=TEST_BATCH_SIZE, shuffle=False,drop_last=True)

    # training the model
    device = torch.device(cuda_name if torch.cuda.is_available() else "cpu")
    model = modeling[0](k1=1,k2=2,k3=3,embed_dim=128,num_layer=1,device=device).to(device)



    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    best_mse = 1000
    best_ci = 0
    best_epoch = -1
    #model_file_name = 'model' + model_st + '_' + dataset +  '.model'
    result_file_name = 'result' + model_st + '_' + dataset +  '.csv'


    ## Train for NUM_EPOCHS epochs
    for epoch in range(NUM_EPOCHS):
      hidden,cell = model.init_hidden(batch_size=TRAIN_BATCH_SIZE)
      train(model, device, train_loader, optimizer, epoch+1,hidden,cell)
      G,P = predicting(model, device, test_loader,hidden,cell)
      ret = [rmse(G,P),mse(G,P),pearson(G,P),spearman(G,P),ci(G,P),get_rm2(G.reshape(G.shape[0],-1),P.reshape(P.shape[0],-1))]
      if ret[1]<best_mse:
        if args.save_file:
          model_file_name = args.save_file + '.model'
          torch.save(model.state_dict(), model_file_name)
        
        
        with open(result_file_name,'w') as f:
          f.write('rmse,mse,pearson,spearman,ci,rm2\n')
          f.write(','.join(map(str,ret)))
        best_epoch = epoch+1
        best_mse = ret[1]
        best_ci = ret[-2]
        print('rmse improved at epoch ', best_epoch, '; best_mse,best_ci:', best_mse,best_ci,model_st,dataset)
      else:
        print(ret[1],'No improvement since epoch ', best_epoch, '; best_mse,best_ci:', best_mse,best_ci,model_st,dataset)

if __name__ == "__main__":
  parser = argparse.ArgumentParser(description="Run DeepGLSTM")

  parser.add_argument("--dataset",type=str,default='davis',
                      help="Dataset Name (davis,kiba,DTC,Metz,ToxCast,Stitch)")

  parser.add_argument("--epoch",
                      type = int,
                      default = 1000,
                      help="Number of training epochs. Default is 1000."
                      ) 
  
  parser.add_argument("--lr",
                      type=float,
                      default = 0.0005,
                      help="learning rate",
                      )
  
  parser.add_argument("--batch_size",type=int,
                      default = 128,
                      help = "Number of drug-tareget per batch. Default is 128 for davis.") # batch 128 for Davis
  
  parser.add_argument("--save_file",type=str,
                      default=None,
                      help="Where to save the trained model. For example davis.model")


  args = parser.parse_args()
  print(args)
  main(args)

What should I do in my code?

Upvotes: -1

Views: 61

Answers (2)

CNLiu

Reputation: 1

You don't need to stop training, save the model, and reload it each time; just save a checkpoint at the desired epochs while training continues. Please refer to the following code I modified:

import argparse
import numpy as np
import pandas as pd
import sys, os
from random import shuffle
import torch
import torch.nn as nn
from models.gcn import GCNNet
from utils import *

# training function at each epoch
def train(model, device, train_loader, optimizer, epoch,hidden,cell):
    print('Training on {} samples...'.format(len(train_loader.dataset)))
    model.train()
    for batch_idx, data in enumerate(train_loader):
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data,hidden,cell)
        loss = loss_fn(output, data.y.view(-1, 1).float().to(device))
        loss.backward()
        optimizer.step()
        if batch_idx % LOG_INTERVAL == 0:
            print('Train epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch,
                                                                           batch_idx * len(data.x),
                                                                           len(train_loader.dataset),
                                                                           100. * batch_idx / len(train_loader),
                                                                           loss.item()))

def predicting(model, device, loader,hidden,cell):
    model.eval()
    total_preds = torch.Tensor()
    total_labels = torch.Tensor()
    print('Make prediction for {} samples...'.format(len(loader.dataset)))
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            output = model(data,hidden,cell)
            total_preds = torch.cat((total_preds, output.cpu()), 0)
            total_labels = torch.cat((total_labels, data.y.view(-1, 1).cpu()), 0)
    return total_labels.numpy().flatten(),total_preds.numpy().flatten()


loss_fn = nn.MSELoss()
LOG_INTERVAL = 20

def main(args):
  dataset = args.dataset
  modeling = [GCNNet]
  model_st = modeling[0].__name__

  cuda_name = "cuda:0"
  print('cuda_name:', cuda_name)

  TRAIN_BATCH_SIZE = args.batch_size
  TEST_BATCH_SIZE = args.batch_size
  LR = args.lr
  
  NUM_EPOCHS = args.epoch

  print('Learning rate: ', LR)
  print('Epochs: ', NUM_EPOCHS)

  # Main program: iterate over different datasets
  print('\nrunning on ', model_st + '_' + dataset )
  processed_data_file_train = 'data/processed/' + dataset + '_train.pt'
  processed_data_file_test = 'data/processed/' + dataset + '_test.pt'
  if ((not os.path.isfile(processed_data_file_train)) or (not os.path.isfile(processed_data_file_test))):
     print('please run create_data.py to prepare data in pytorch format!')
  else:
    train_data = TestbedDataset(root='data', dataset=dataset+'_train')
    test_data = TestbedDataset(root='data', dataset=dataset+'_test')
        
    # make data PyTorch mini-batch processing ready
    train_loader = DataLoader(train_data, batch_size=TRAIN_BATCH_SIZE, shuffle=True,drop_last=True)
    test_loader = DataLoader(test_data, batch_size=TEST_BATCH_SIZE, shuffle=False,drop_last=True)

    # training the model
    device = torch.device(cuda_name if torch.cuda.is_available() else "cpu")
    model = modeling[0](k1=1,k2=2,k3=3,embed_dim=128,num_layer=1,device=device).to(device)



    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    best_mse = 1000
    best_ci = 0
    best_epoch = -1
    #model_file_name = 'model' + model_st + '_' + dataset +  '.model'
    result_file_name = 'result' + model_st + '_' + dataset +  '.csv'


    ## Train for NUM_EPOCHS epochs
    for epoch in range(NUM_EPOCHS):
      hidden,cell = model.init_hidden(batch_size=TRAIN_BATCH_SIZE)
      train(model, device, train_loader, optimizer, epoch+1,hidden,cell)
      G,P = predicting(model, device, test_loader,hidden,cell)
      ret = [rmse(G,P),mse(G,P),pearson(G,P),spearman(G,P),ci(G,P),get_rm2(G.reshape(G.shape[0],-1),P.reshape(P.shape[0],-1))]

      # Periodically save a checkpoint; guard against --save_file being unset
      if args.save_file and (epoch+1) % args.save_model_epoch == 0:
         model_file_name = args.save_file + 'model_epoch_' + str(epoch+1) + '.model'
         torch.save(model.state_dict(), model_file_name)
         
      if ret[1]<best_mse:
        if args.save_file:
          model_file_name = args.save_file + '.model'
          torch.save(model.state_dict(), model_file_name)
        
        
        with open(result_file_name,'w') as f:
          f.write('rmse,mse,pearson,spearman,ci,rm2\n')
          f.write(','.join(map(str,ret)))
        best_epoch = epoch+1
        best_mse = ret[1]
        best_ci = ret[-2]
        print('rmse improved at epoch ', best_epoch, '; best_mse,best_ci:', best_mse,best_ci,model_st,dataset)
      else:
        print(ret[1],'No improvement since epoch ', best_epoch, '; best_mse,best_ci:', best_mse,best_ci,model_st,dataset)

if __name__ == "__main__":
  parser = argparse.ArgumentParser(description="Run DeepGLSTM")

  parser.add_argument("--dataset",type=str,default='davis',
                      help="Dataset Name (davis,kiba,DTC,Metz,ToxCast,Stitch)")

  parser.add_argument("--epoch",
                      type = int,
                      default = 1000,
                      help="Number of training epochs. Default is 1000."
                      ) 
  
  parser.add_argument("--lr",
                      type=float,
                      default = 0.0005,
                      help="learning rate",
                      )
  
  parser.add_argument("--batch_size",type=int,
                      default = 128,
                      help = "Number of drug-tareget per batch. Default is 128 for davis.") # batch 128 for Davis
  
  parser.add_argument("--save_file",type=str,
                      default=None,
                      help="Where to save the trained model. For example davis.model")
  
  parser.add_argument("--save_model_epoch",
                      type = int,
                      default = 20,
                      help="Epochs to save the model during training.  Default is 20")


  args = parser.parse_args()
  print(args)
  main(args)
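
Note that this saves only the model weights at each interval; to actually resume after a Colab disconnect, you still have to load the most recent checkpoint before the training loop. A minimal sketch of that resume step, assuming the model_epoch_N.model naming scheme used above (the loading code below is mine, not part of the modified script):

import glob
import re

# Resume: find the newest per-epoch checkpoint, if any were saved
start_epoch = 0
if args.save_file:
    checkpoints = glob.glob(args.save_file + 'model_epoch_*.model')
    if checkpoints:
        latest = max(checkpoints,
                     key=lambda p: int(re.search(r'model_epoch_(\d+)', p).group(1)))
        model.load_state_dict(torch.load(latest, map_location=device))
        start_epoch = int(re.search(r'model_epoch_(\d+)', latest).group(1))
        print('Resuming from', latest)

# train only the remaining epochs
for epoch in range(start_epoch, NUM_EPOCHS):
    ...

Since only state_dict() is saved here, the Adam optimizer restarts with fresh moment estimates after a resume; saving optimizer.state_dict() as well (as in the next answer) avoids that.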

Upvotes: 0

meysam

Reputation: 83

Here is my current approach for training the model:

def train_model(model, train_loader, criterion, optimizer, start_epoch, end_epoch, save_path):
    model.train()
    for epoch in range(start_epoch, end_epoch):
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        print(f'Epoch [{epoch + 1}/{end_epoch}], Loss: {running_loss / len(train_loader)}')

        # Save the model every 20 epochs
        if (epoch + 1) % 20 == 0:
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': running_loss,
            }, f'{save_path}_epoch_{epoch + 1}.pth')

and you can load your model and continue training:

start_epoch = 0  # default when training from scratch

# If a checkpoint exists, load it and restore the optimizer state too
if checkpoint_path:
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch']
    print(f'Resuming training from epoch {start_epoch}')

# Train the model in chunks of 20 epochs
for i in range(start_epoch, num_epochs, 20):
    train_model(model, train_loader, criterion, optimizer, i, min(i + 20, num_epochs), save_path)
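
The snippet above assumes checkpoint_path is already known. One way to pick up the newest checkpoint automatically, assuming the {save_path}_epoch_N.pth names produced by train_model (this discovery helper is illustrative, not part of the answer's code):

import glob
import re

checkpoint_path = None
candidates = glob.glob(f'{save_path}_epoch_*.pth')
if candidates:
    # pick the file with the highest epoch number
    checkpoint_path = max(candidates,
                          key=lambda p: int(re.search(r'_epoch_(\d+)\.pth$', p).group(1)))

Saving optimizer_state_dict alongside the weights is the important detail here: Adam keeps per-parameter moment estimates, and resuming with a freshly constructed optimizer would reset them, which can cause a loss spike right after a restart.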

Upvotes: 0
