Mehmet Güven
Mehmet Güven

Reputation: 1

When logging training to MLflow, the callback function I wrote does not work in multi-GPU mode

I am logging to MLflow while training YOLOv8, starting from a pre-trained model. I wrote my own callback function; it works on a single GPU but not on multiple GPUs.

import mlflow
import mlflow.pytorch
import torch
from ultralytics import YOLO
import os

# Put Weights & Biases into offline mode for this process.
os.environ["WANDB_MODE"] = "offline"

# Report whether CUDA is usable and, if so, which device index is current.
print("CUDA Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA Device:", torch.cuda.current_device())

# Point MLflow at the local tracking server and select the experiment that
# subsequent runs will be recorded under.
print("Setting up MLflow...")
mlflow.set_tracking_uri("http://127.0.0.1:5000")
experiment_name = "human_detection_model"
mlflow.set_experiment(experiment_name)
print(f"Experiment '{experiment_name}' set up.")

# Build the YOLO model from the 'yolov8n.pt' weights file.
print("Loading YOLO model...")
model = YOLO("yolov8n.pt")
print("YOLO model loaded successfully.")

def log_epoch_model(trainer):
    """Log the current model weights to MLflow when a training epoch ends.

    Meant to be registered as an ``on_train_epoch_end`` callback. A call
    counter stored on the function object numbers the artifacts, so each
    invocation is logged under ``weights/epoch_<n>``.

    Args:
        trainer: The trainer object passed to the callback. If its ``ema``
            attribute is set, the averaged model ``trainer.ema.ema`` is
            logged; otherwise ``trainer.model`` is logged.
    """
    # The function attribute acts as a counter that persists between calls.
    log_epoch_model.epoch = getattr(log_epoch_model, "epoch", 0) + 1
    epoch = log_epoch_model.epoch

    # Distributed launchers export RANK for each worker process; when it is
    # unset we treat the run as single-process (-1). Only the main process
    # uploads, so multi-GPU runs do not log the same artifact once per GPU.
    # NOTE(review): assumes the launcher sets RANK the way torchrun does.
    if int(os.environ.get("RANK", "-1")) not in (-1, 0):
        return

    # ``ema`` can exist on the trainer but be None, so test the value rather
    # than the attribute's presence before dereferencing ``ema.ema``.
    ema = getattr(trainer, "ema", None)
    model_to_log = ema.ema if ema is not None else trainer.model

    # torch.randn defaults to float32. A float64 example would not match a
    # single-precision model when MLflow uses the example for inference.
    # NOTE(review): assumes the logged model holds float32 weights - confirm.
    dummy_input = torch.randn(1, 3, 640, 640).numpy()
    mlflow.pytorch.log_model(
        model_to_log,
        artifact_path=f"weights/epoch_{epoch}",
        input_example=dummy_input,
    )

    print(f"Epoch {epoch} model logged to MLflow.")

# Hook the per-epoch logger into the model's training callbacks.
model.add_callback("on_train_epoch_end", log_epoch_model)
print("Callback added.")

# Train inside one MLflow run. Any exception raised along the way is
# reported on stdout instead of being propagated.
try:
    with mlflow.start_run() as run:
        print("Training started.")
        results = model.train(
            data="coco128.yaml",
            epochs=10,
            imgsz=640,
            verbose=True,
            project=experiment_name,
            name=experiment_name,
            save_period=0,
            device="0,1,2,3",  # four device indices: the multi-GPU case
        )
    print("Training completed.")
    print(f"Run ID: {run.info.run_id}")
except Exception as e:
    print(f"An error occurred: {e}")

Here is the code I am using. I would appreciate your recommendations.

Upvotes: 0

Views: 10

Answers (0)

Related Questions