Gheorghe Balamatiuc

Reputation: 1

Why is ray.train.get_checkpoint() from Ray Tune returning None even after saving the checkpoint?

I am trying to tune my PyTorch model with Ray Tune. I would like to be able to save the tuning progress, stop the run, and resume it from where I left off. Unfortunately, I am not able to resume from the last checkpoint, even though the checkpoint was created successfully.

I followed the tutorial from the official documentation (ray tune example) and tried to implement the same thing for my model. Here is my code:

import argparse
import os
import random
import tempfile
import time
import numpy as np
import ray
import torch

from dataset import get_dataset
from models.Backbone import Backbone
from utils import load_config
from ray.train import Checkpoint
from ray import tune
from ray import train
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search import ConcurrencyLimiter
from ray.tune.search.hyperopt import HyperOptSearch
from utils import updata_lr, Meter, cal_score
from tqdm import tqdm


def training(params, model, optimizer, epoch, train_loader):

    model.train()
    device = params['device']
    loss_meter = Meter()

    word_right, struct_right, exp_right, length, cal_num = 0, 0, 0, 0, 0

    for batch_idx, (images, image_masks, labels, label_masks) in enumerate(train_loader):

        images, image_masks, labels, label_masks = images.to(device), image_masks.to(device), labels.to(
            device), label_masks.to(device)

        batch, time = labels.shape[:2]
        if 'lr_decay' not in params or params['lr_decay'] == 'cosine':
            updata_lr(optimizer, epoch, batch_idx, len(train_loader), params['epoches'], params['lr'])
        optimizer.zero_grad()

        probs, loss = model(images, image_masks, labels, label_masks)

        word_loss, struct_loss, parent_loss, kl_loss = loss
        loss = (word_loss + struct_loss + parent_loss + kl_loss)

        loss.backward()
        if params['gradient_clip']:
            torch.nn.utils.clip_grad_norm_(model.parameters(), params['gradient'])

        optimizer.step()

        loss_meter.add(loss.item())

        wordRate, structRate, ExpRate = cal_score(probs, labels, label_masks)

        word_right = word_right + wordRate * time
        struct_right = struct_right + structRate * time
        exp_right = exp_right + ExpRate * batch
        length = length + time
        cal_num = cal_num + batch

        if batch_idx % 10 == 9:
            print(f'Epoch: {epoch+1} batch index: {batch_idx+1}/{len(train_loader)} train loss: {loss.item():.4f} word loss: {word_loss:.4f} '
                                 f'struct loss: {struct_loss:.4f} parent loss: {parent_loss:.4f} '
                                 f'kl loss: {kl_loss:.4f} WordRate: {word_right / length:.4f} '
                                 f'structRate: {struct_right / length:.4f} ExpRate: {exp_right / cal_num:.4f}')

    return loss_meter.mean, word_right / length, struct_right / length, exp_right / cal_num


def eval(params, model, epoch, eval_loader):

    model.eval()
    device = params['device']
    loss_meter = Meter()

    word_right, struct_right, exp_right, length, cal_num = 0, 0, 0, 0, 0

    for batch_idx, (images, image_masks, labels, label_masks) in enumerate(eval_loader):

        images, image_masks, labels, label_masks = images.to(device), image_masks.to(device), labels.to(
            device), label_masks.to(device)

        batch, time = labels.shape[:2]

        probs, loss = model(images, image_masks, labels, label_masks, is_train=False)

        word_loss, struct_loss = loss
        loss = word_loss + struct_loss
        loss_meter.add(loss.item())

        wordRate, structRate, ExpRate = cal_score(probs, labels, label_masks)

        word_right = word_right + wordRate * time
        struct_right = struct_right + structRate * time
        exp_right = exp_right + ExpRate
        length = length + time
        cal_num = cal_num + batch

    return loss_meter.mean, word_right / length, struct_right / length, exp_right / cal_num


parser = argparse.ArgumentParser()
parser.add_argument('--config', default='config.yaml', type=str, help='path to config file')
parser.add_argument('--check', action='store_true', help='only for code check')
args = parser.parse_args()

if not args.config:
    print('please provide config yaml')
    exit(-1)

"""config"""
params = load_config(args.config)

def train_tune(config, base_dir):

    params.update(config)

    """random seed"""
    random.seed(params['seed'])
    np.random.seed(params['seed'])
    torch.manual_seed(params['seed'])
    torch.cuda.manual_seed(params['seed'])

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    params['device'] = device

    # Construct absolute paths
    params['word_path'] = os.path.join(base_dir, params['word_path'])
    params['train_image_path'] = os.path.join(base_dir, params['train_image_path'])
    params['train_label_path'] = os.path.join(base_dir, params['train_label_path'])
    params['eval_image_path'] = os.path.join(base_dir, params['eval_image_path'])
    params['eval_label_path'] = os.path.join(base_dir, params['eval_label_path'])
    params['checkpoint_dir'] = os.path.join(base_dir, params['checkpoint_dir'])

    train_loader, eval_loader = get_dataset(params)
    model = Backbone(params)
    now = time.strftime("%Y-%m-%d-%H-%M", time.localtime())
    model.name = f'{params["experiment"]}_{now}_Encoder-{params["encoder"]["net"]}_Decoder-{params["decoder"]["net"]}_' \
                f'max_size-{params["image_height"]}-{params["image_width"]}'
    print(model.name)
    model = model.to(device)

    optimizer = getattr(torch.optim, params['optimizer'])(model.parameters(), lr=float(params['lr']),
                                                      eps=float(params['eps']), weight_decay=float(params['weight_decay']))
    
    # Load existing checkpoint through `get_checkpoint()` API.
    start = 0
    checkpoint = train.get_checkpoint()
    print(checkpoint)
    if checkpoint:
        loaded_checkpoint = train.get_checkpoint()
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            model_state, optimizer_state, prev_epoch = torch.load(
                os.path.join(loaded_checkpoint_dir, "checkpoint.pt")
            )
            start = prev_epoch + 1
            model.load_state_dict(model_state)
            optimizer.load_state_dict(optimizer_state)
    
    min_score = 0
    min_step = 0
    for epoch in range(start, params['epoches']):
        
        train_loss, train_word_score, train_node_score, train_expRate = training(params, model, optimizer, epoch, train_loader)

        eval_loss, eval_word_score, eval_node_score, eval_expRate = eval(params, model, epoch, eval_loader)

        print(f'Epoch: {epoch+1}  loss: {eval_loss:.4f}  word score: {eval_word_score:.4f}  struct score: {eval_node_score:.4f} '
              f'ExpRate: {eval_expRate:.4f}')
        
        if eval_expRate > min_score and not args.check:
            min_score = eval_expRate
            min_step = 0
        
        elif min_score != 0 and 'lr_decay' in params and params['lr_decay'] == 'step':

            min_step += 1

            if min_step > params['step_ratio']:
                new_lr = optimizer.param_groups[0]['lr'] / params['step_decay']

                if new_lr < params['lr'] / 1000:
                    print('lr is too small')
                    exit(-1)

                for param_group in optimizer.param_groups:
                    param_group['lr'] = new_lr

                min_step = 0
        
        with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
            path = os.path.join(temp_checkpoint_dir, "checkpoint.pt")
            torch.save(
                (model.state_dict(), optimizer.state_dict(), epoch), path
            )
            checkpoint = Checkpoint.from_directory(temp_checkpoint_dir)
            train.report(
                {"loss": eval_loss, "accuracy": eval_expRate},
                checkpoint=checkpoint,
            )
    
    print("Finished Training")


def main(num_samples=20, max_num_epochs=10, gpus_per_trial=1):
    config = {
        "batch_size": tune.choice([2, 4, 8]),
        "optimizer": tune.choice(['Adadelta', 'Adagrad', 'Adam', 'RMSprop', 'SGD']),
        "lr": tune.loguniform(1e-4, 1),
        "lr_decay": tune.choice(['step', 'cosine']),
        "step_ratio": tune.choice([5, 10, 20, 30, 40]),
        "weight_decay": tune.loguniform(1e-4, 1),
        "dropout": tune.choice([True, False]),
        "dropout_ratio": tune.uniform(0.2, 0.5),
        "relu": tune.choice([True, False]),
        "gradient": tune.choice([0.1, 1, 5, 10, 100]),
        "gradient_clip": tune.choice([True, False]),
        "use_label_mask": tune.choice([True, False]),
    }
    scheduler = ASHAScheduler(
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2
    )

    initial_params = [
        {
            "batch_size": 8,
            "optimizer": 'Adadelta',
            "lr": 1,
            "lr_decay": 'cosine',
            "step_ratio": 10,
            "weight_decay": 1e-4,
            "dropout": True,
            "dropout_ratio": 0.5,
            "relu": True,
            "gradient": 100,
            "gradient_clip": True,
            "use_label_mask": False,
        },
    ]
    algo = HyperOptSearch(points_to_evaluate=initial_params)
    algo = ConcurrencyLimiter(algo, max_concurrent=4)

    tuner = tune.Tuner(
        tune.with_resources(
            tune.with_parameters(train_tune, base_dir="/workspaces/Anaconda/SAN"),
            resources={"cpu": 8, "gpu": gpus_per_trial}
        ),
        tune_config=tune.TuneConfig(
            metric="loss",
            mode="min",
            scheduler=scheduler,
            search_alg=algo,
            num_samples=num_samples,
        ),
        param_space=config,
    )
    results = tuner.fit()

    best_result = results.get_best_result("loss", "min")

    print("Best trial config: {}".format(best_result.config))
    print("Best trial final validation loss: {}".format(best_result.metrics["loss"]))
    print("Best trial final validation accuracy: {}".format(best_result.metrics["accuracy"]))


if __name__ == "__main__":
    main()

I only made some changes to the configuration, but otherwise followed the steps from the documentation example. With this code I save the state of the tuning after each epoch:

with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
    path = os.path.join(temp_checkpoint_dir, "checkpoint.pt")
    torch.save(
        (model.state_dict(), optimizer.state_dict(), epoch), path
    )
    checkpoint = Checkpoint.from_directory(temp_checkpoint_dir)
    train.report(
        {"loss": eval_loss, "accuracy": eval_expRate},
        checkpoint=checkpoint,
    )
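As far as I understand, once a trial reports a checkpoint this way it should also be visible through the results API after the run; a minimal way to inspect it would look roughly like this (a sketch, assuming `results` is the ResultGrid returned by tuner.fit() in my main()):

# Sketch only: inspect the checkpoint Tune stored for the best trial.
import os

best_result = results.get_best_result("loss", "min")
print(best_result.checkpoint)  # ray.train.Checkpoint, or None if nothing was reported

if best_result.checkpoint is not None:
    with best_result.checkpoint.as_directory() as ckpt_dir:
        print(os.listdir(ckpt_dir))  # I expect to see ["checkpoint.pt"] here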

And with this code I load the last saved checkpoint:

start = 0
checkpoint = train.get_checkpoint()
print(checkpoint)
if checkpoint:
    loaded_checkpoint = train.get_checkpoint()
    with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
        model_state, optimizer_state, prev_epoch = torch.load(
            os.path.join(loaded_checkpoint_dir, "checkpoint.pt")
        )
        start = prev_epoch + 1
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)
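For what it's worth, I don't think the (model_state, optimizer_state, epoch) tuple format itself is the problem, since plain torch.save/torch.load should round-trip it fine. A minimal check with a dummy model would look like this (a sketch, not part of my actual script):

# Sketch: check that the (state_dict, state_dict, epoch) tuple round-trips with plain torch.
import os
import tempfile

import torch

dummy_model = torch.nn.Linear(4, 2)
dummy_optimizer = torch.optim.SGD(dummy_model.parameters(), lr=0.1)

with tempfile.TemporaryDirectory() as tmp_dir:
    path = os.path.join(tmp_dir, "checkpoint.pt")
    torch.save((dummy_model.state_dict(), dummy_optimizer.state_dict(), 3), path)

    model_state, optimizer_state, prev_epoch = torch.load(path)
    dummy_model.load_state_dict(model_state)
    dummy_optimizer.load_state_dict(optimizer_state)
    assert prev_epoch == 3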

I ran the code until the first epoch was finished and checkpointed; I got a confirmation log that the checkpoint was saved, and I could also see the saved epoch in the ~/ray_results directory using TensorBoard. I then stopped the execution and started it again. Unfortunately, ray.train.get_checkpoint() returned None and the execution started over from epoch 1, not epoch 2.
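When I rerun, I simply execute the script again, which calls main() and builds a fresh Tuner. I am not sure whether I am instead supposed to restore the previous experiment explicitly, something like this (a sketch; the experiment path is a placeholder, and `trainable` would be the same tune.with_resources/tune.with_parameters-wrapped function as in main()):

# Sketch: explicitly restoring a previous Tune experiment (path is a placeholder).
from ray import tune

experiment_path = "~/ray_results/train_tune_2024-01-01_00-00-00"  # placeholder

if tune.Tuner.can_restore(experiment_path):
    tuner = tune.Tuner.restore(experiment_path, trainable=trainable)
    results = tuner.fit()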

I am not sure what I did wrong. I searched for this problem in the Ray Tune GitHub issues, the documentation, and on the web, but couldn't find an answer. Could someone please tell me what I did wrong?
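In case it is easier to spot the mistake without my model and data code, here is a stripped-down toy version of the same checkpoint pattern I am using (a sketch; the metric and hyperparameters are placeholders):

# Stripped-down toy version of my checkpoint save/load pattern (placeholders only).
import os
import tempfile

import torch
from ray import train, tune
from ray.train import Checkpoint


def toy_trainable(config):
    model = torch.nn.Linear(4, 2)
    optimizer = torch.optim.SGD(model.parameters(), lr=config["lr"])

    # Try to resume from a previous checkpoint (returns None on a fresh run).
    start = 0
    checkpoint = train.get_checkpoint()
    if checkpoint:
        with checkpoint.as_directory() as ckpt_dir:
            model_state, optimizer_state, prev_epoch = torch.load(
                os.path.join(ckpt_dir, "checkpoint.pt")
            )
            model.load_state_dict(model_state)
            optimizer.load_state_dict(optimizer_state)
            start = prev_epoch + 1

    for epoch in range(start, 10):
        loss = 1.0 / (epoch + 1)  # placeholder metric

        # Save and report a checkpoint after every "epoch".
        with tempfile.TemporaryDirectory() as ckpt_dir:
            torch.save(
                (model.state_dict(), optimizer.state_dict(), epoch),
                os.path.join(ckpt_dir, "checkpoint.pt"),
            )
            train.report(
                {"loss": loss},
                checkpoint=Checkpoint.from_directory(ckpt_dir),
            )


if __name__ == "__main__":
    tuner = tune.Tuner(
        toy_trainable,
        param_space={"lr": tune.loguniform(1e-4, 1e-1)},
        tune_config=tune.TuneConfig(metric="loss", mode="min", num_samples=2),
    )
    tuner.fit()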

Upvotes: 0

Views: 19

Answers (0)
