Reputation: 3
I want to train this model on Colab, and it runs for 1000 epochs, but that takes too long.
This is my Python code. It trains a model for 1000 epochs, and I want to save the model every 20 epochs (for example), then load it again and continue from the last epoch.
Example: train from epoch 1 to 20, then save the model... then load the model and continue from 20 to 40, and so on.
import argparse
import numpy as np
import pandas as pd
import sys, os
from random import shuffle
import torch
import torch.nn as nn
from models.gcn import GCNNet
from utils import *

# training function at each epoch
def train(model, device, train_loader, optimizer, epoch, hidden, cell):
    print('Training on {} samples...'.format(len(train_loader.dataset)))
    model.train()
    for batch_idx, data in enumerate(train_loader):
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data, hidden, cell)
        loss = loss_fn(output, data.y.view(-1, 1).float().to(device))
        loss.backward()
        optimizer.step()
        if batch_idx % LOG_INTERVAL == 0:
            print('Train epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch,
                  batch_idx * len(data.x),
                  len(train_loader.dataset),
                  100. * batch_idx / len(train_loader),
                  loss.item()))

def predicting(model, device, loader, hidden, cell):
    model.eval()
    total_preds = torch.Tensor()
    total_labels = torch.Tensor()
    print('Make prediction for {} samples...'.format(len(loader.dataset)))
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            output = model(data, hidden, cell)
            total_preds = torch.cat((total_preds, output.cpu()), 0)
            total_labels = torch.cat((total_labels, data.y.view(-1, 1).cpu()), 0)
    return total_labels.numpy().flatten(), total_preds.numpy().flatten()

loss_fn = nn.MSELoss()
LOG_INTERVAL = 20

def main(args):
    dataset = args.dataset
    modeling = [GCNNet]
    model_st = modeling[0].__name__
    cuda_name = "cuda:0"
    print('cuda_name:', cuda_name)
    TRAIN_BATCH_SIZE = args.batch_size
    TEST_BATCH_SIZE = args.batch_size
    LR = args.lr
    NUM_EPOCHS = args.epoch
    print('Learning rate: ', LR)
    print('Epochs: ', NUM_EPOCHS)

    # Main program: iterate over different datasets
    print('\nrunning on ', model_st + '_' + dataset)
    processed_data_file_train = 'data/processed/' + dataset + '_train.pt'
    processed_data_file_test = 'data/processed/' + dataset + '_test.pt'
    if ((not os.path.isfile(processed_data_file_train)) or (not os.path.isfile(processed_data_file_test))):
        print('please run create_data.py to prepare data in pytorch format!')
    else:
        train_data = TestbedDataset(root='data', dataset=dataset + '_train')
        test_data = TestbedDataset(root='data', dataset=dataset + '_test')

        # make data PyTorch mini-batch processing ready
        train_loader = DataLoader(train_data, batch_size=TRAIN_BATCH_SIZE, shuffle=True, drop_last=True)
        test_loader = DataLoader(test_data, batch_size=TEST_BATCH_SIZE, shuffle=False, drop_last=True)

        # training the model
        device = torch.device(cuda_name if torch.cuda.is_available() else "cpu")
        model = modeling[0](k1=1, k2=2, k3=3, embed_dim=128, num_layer=1, device=device).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=LR)
        best_mse = 1000
        best_ci = 0
        best_epoch = -1
        # model_file_name = 'model' + model_st + '_' + dataset + '.model'
        result_file_name = 'result' + model_st + '_' + dataset + '.csv'

        ## TRAIN for NUM_EPOCHS epochs
        for epoch in range(NUM_EPOCHS):
            hidden, cell = model.init_hidden(batch_size=TRAIN_BATCH_SIZE)
            train(model, device, train_loader, optimizer, epoch + 1, hidden, cell)
            G, P = predicting(model, device, test_loader, hidden, cell)
            ret = [rmse(G, P), mse(G, P), pearson(G, P), spearman(G, P), ci(G, P),
                   get_rm2(G.reshape(G.shape[0], -1), P.reshape(P.shape[0], -1))]
            if ret[1] < best_mse:
                if args.save_file:
                    model_file_name = args.save_file + '.model'
                    torch.save(model.state_dict(), model_file_name)
                with open(result_file_name, 'w') as f:
                    f.write('rmse,mse,pearson,spearman,ci,rm2\n')
                    f.write(','.join(map(str, ret)))
                best_epoch = epoch + 1
                best_mse = ret[1]
                best_ci = ret[-2]
                print('rmse improved at epoch ', best_epoch, '; best_mse,best_ci:', best_mse, best_ci, model_st, dataset)
            else:
                print(ret[1], 'No improvement since epoch ', best_epoch, '; best_mse,best_ci:', best_mse, best_ci, model_st, dataset)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run DeepGLSTM")
    parser.add_argument("--dataset", type=str, default='davis',
                        help="Dataset Name (davis,kiba,DTC,Metz,ToxCast,Stitch)")
    parser.add_argument("--epoch",
                        type=int,
                        default=1000,
                        help="Number of training epochs. Default is 1000.")
    parser.add_argument("--lr",
                        type=float,
                        default=0.0005,
                        help="learning rate")
    parser.add_argument("--batch_size", type=int,
                        default=128,
                        help="Number of drug-target pairs per batch. Default is 128 for davis.")  # batch 128 for Davis
    parser.add_argument("--save_file", type=str,
                        default=None,
                        help="Where to save the trained model. For example davis.model")

    args = parser.parse_args()
    print(args)
    main(args)
What should I do with my code?
Upvotes: -1
Views: 61
Reputation: 1
You don't need to save the model and then reload it to continue training; just save the model at that epoch while the loop keeps running. Please refer to the following code I modified, thanks.
import argparse
import numpy as np
import pandas as pd
import sys, os
from random import shuffle
import torch
import torch.nn as nn
from models.gcn import GCNNet
from utils import *

# training function at each epoch
def train(model, device, train_loader, optimizer, epoch, hidden, cell):
    print('Training on {} samples...'.format(len(train_loader.dataset)))
    model.train()
    for batch_idx, data in enumerate(train_loader):
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data, hidden, cell)
        loss = loss_fn(output, data.y.view(-1, 1).float().to(device))
        loss.backward()
        optimizer.step()
        if batch_idx % LOG_INTERVAL == 0:
            print('Train epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch,
                  batch_idx * len(data.x),
                  len(train_loader.dataset),
                  100. * batch_idx / len(train_loader),
                  loss.item()))

def predicting(model, device, loader, hidden, cell):
    model.eval()
    total_preds = torch.Tensor()
    total_labels = torch.Tensor()
    print('Make prediction for {} samples...'.format(len(loader.dataset)))
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            output = model(data, hidden, cell)
            total_preds = torch.cat((total_preds, output.cpu()), 0)
            total_labels = torch.cat((total_labels, data.y.view(-1, 1).cpu()), 0)
    return total_labels.numpy().flatten(), total_preds.numpy().flatten()

loss_fn = nn.MSELoss()
LOG_INTERVAL = 20

def main(args):
    dataset = args.dataset
    modeling = [GCNNet]
    model_st = modeling[0].__name__
    cuda_name = "cuda:0"
    print('cuda_name:', cuda_name)
    TRAIN_BATCH_SIZE = args.batch_size
    TEST_BATCH_SIZE = args.batch_size
    LR = args.lr
    NUM_EPOCHS = args.epoch
    print('Learning rate: ', LR)
    print('Epochs: ', NUM_EPOCHS)

    # Main program: iterate over different datasets
    print('\nrunning on ', model_st + '_' + dataset)
    processed_data_file_train = 'data/processed/' + dataset + '_train.pt'
    processed_data_file_test = 'data/processed/' + dataset + '_test.pt'
    if ((not os.path.isfile(processed_data_file_train)) or (not os.path.isfile(processed_data_file_test))):
        print('please run create_data.py to prepare data in pytorch format!')
    else:
        train_data = TestbedDataset(root='data', dataset=dataset + '_train')
        test_data = TestbedDataset(root='data', dataset=dataset + '_test')

        # make data PyTorch mini-batch processing ready
        train_loader = DataLoader(train_data, batch_size=TRAIN_BATCH_SIZE, shuffle=True, drop_last=True)
        test_loader = DataLoader(test_data, batch_size=TEST_BATCH_SIZE, shuffle=False, drop_last=True)

        # training the model
        device = torch.device(cuda_name if torch.cuda.is_available() else "cpu")
        model = modeling[0](k1=1, k2=2, k3=3, embed_dim=128, num_layer=1, device=device).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=LR)
        best_mse = 1000
        best_ci = 0
        best_epoch = -1
        # model_file_name = 'model' + model_st + '_' + dataset + '.model'
        result_file_name = 'result' + model_st + '_' + dataset + '.csv'

        ## TRAIN for NUM_EPOCHS epochs
        for epoch in range(NUM_EPOCHS):
            hidden, cell = model.init_hidden(batch_size=TRAIN_BATCH_SIZE)
            train(model, device, train_loader, optimizer, epoch + 1, hidden, cell)
            G, P = predicting(model, device, test_loader, hidden, cell)
            ret = [rmse(G, P), mse(G, P), pearson(G, P), spearman(G, P), ci(G, P),
                   get_rm2(G.reshape(G.shape[0], -1), P.reshape(P.shape[0], -1))]

            # periodically save the current weights so you keep a checkpoint every
            # --save_model_epoch epochs (guarded so it is skipped when no --save_file is given)
            if args.save_file and (epoch + 1) % args.save_model_epoch == 0:
                model_file_name = args.save_file + 'model_epoch_' + str(epoch + 1) + '.model'
                torch.save(model.state_dict(), model_file_name)

            if ret[1] < best_mse:
                if args.save_file:
                    model_file_name = args.save_file + '.model'
                    torch.save(model.state_dict(), model_file_name)
                with open(result_file_name, 'w') as f:
                    f.write('rmse,mse,pearson,spearman,ci,rm2\n')
                    f.write(','.join(map(str, ret)))
                best_epoch = epoch + 1
                best_mse = ret[1]
                best_ci = ret[-2]
                print('rmse improved at epoch ', best_epoch, '; best_mse,best_ci:', best_mse, best_ci, model_st, dataset)
            else:
                print(ret[1], 'No improvement since epoch ', best_epoch, '; best_mse,best_ci:', best_mse, best_ci, model_st, dataset)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run DeepGLSTM")
    parser.add_argument("--dataset", type=str, default='davis',
                        help="Dataset Name (davis,kiba,DTC,Metz,ToxCast,Stitch)")
    parser.add_argument("--epoch",
                        type=int,
                        default=1000,
                        help="Number of training epochs. Default is 1000.")
    parser.add_argument("--lr",
                        type=float,
                        default=0.0005,
                        help="learning rate")
    parser.add_argument("--batch_size", type=int,
                        default=128,
                        help="Number of drug-target pairs per batch. Default is 128 for davis.")  # batch 128 for Davis
    parser.add_argument("--save_file", type=str,
                        default=None,
                        help="Where to save the trained model. For example davis.model")
    parser.add_argument("--save_model_epoch",
                        type=int,
                        default=20,
                        help="Save the model every this many epochs during training. Default is 20.")

    args = parser.parse_args()
    print(args)
    main(args)
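If you later want to resume from one of those per-epoch files (which is what the question asks for), a minimal loading sketch could look like the code below. The checkpoint file name is hypothetical (it depends on what you pass as --save_file), and note that this script only saves model.state_dict(), so the Adam optimizer state and the epoch counter are not restored.

import torch
from models.gcn import GCNNet

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = GCNNet(k1=1, k2=2, k3=3, embed_dim=128, num_layer=1, device=device).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)

# Hypothetical file name produced with --save_file davis after epoch 20
checkpoint_file = 'davismodel_epoch_20.model'
model.load_state_dict(torch.load(checkpoint_file, map_location=device))
# Only the weights are restored here; the optimizer state and epoch counter
# start fresh, so pass a smaller --epoch value for the remaining epochs.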
Upvotes: 0
Reputation: 83
Here is my current approach for training the model:
def train_model(model, train_loader, criterion, optimizer, start_epoch, end_epoch, save_path):
    model.train()
    for epoch in range(start_epoch, end_epoch):
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f'Epoch [{epoch + 1}/{end_epoch}], Loss: {running_loss / len(train_loader)}')

        # Save the model every 20 epochs
        if (epoch + 1) % 20 == 0:
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': running_loss,
            }, f'{save_path}_epoch_{epoch + 1}.pth')
and you can load your model and continue training:
# If a checkpoint exists, load it
if checkpoint_path:
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch']
    print(f'Resuming training from epoch {start_epoch}')

# Train the model in chunks of 20 epochs
for i in range(start_epoch, num_epochs, 20):
    train_model(model, train_loader, criterion, optimizer, i, min(i + 20, num_epochs), save_path)
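The snippet above assumes checkpoint_path has already been set. One way to find the most recent checkpoint automatically is sketched below; save_path is assumed to be the same prefix passed to train_model, and the file pattern matches the names written by the save call above.

import glob
import re

save_path = 'checkpoints/model'  # hypothetical prefix, same as in train_model
candidates = glob.glob(f'{save_path}_epoch_*.pth')

checkpoint_path = None
start_epoch = 0
if candidates:
    # Pick the file with the largest epoch number embedded in its name
    checkpoint_path = max(candidates,
                          key=lambda p: int(re.search(r'_epoch_(\d+)\.pth$', p).group(1)))

On Colab it also helps to point save_path at a mounted Google Drive folder so the checkpoints survive a runtime disconnect.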
Upvotes: 0