Reputation: 1
I am logging to MLflow while training YOLOv8 from pre-trained weights. I wrote my own callback function; it works on a single GPU but not on multiple GPUs.
import mlflow
import mlflow.pytorch
import torch
from ultralytics import YOLO
import os
# Run Weights & Biases in offline mode for this process.
os.environ['WANDB_MODE'] = 'offline'

# Report whether CUDA is usable and, if so, which device is current.
cuda_ready = torch.cuda.is_available()
print("CUDA Available:", cuda_ready)
if cuda_ready:
    print("CUDA Device:", torch.cuda.current_device())

# Point MLflow at the tracking server and select the experiment to log into.
print("Setting up MLflow...")
experiment_name = "human_detection_model"
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(experiment_name)
print(f"Experiment '{experiment_name}' set up.")

# Load the YOLOv8-nano checkpoint that training starts from.
print("Loading YOLO model...")
model = YOLO('yolov8n.pt')
print("YOLO model loaded successfully.")
def log_epoch_model(trainer):
    """Log the current model to MLflow; meant for ``on_train_epoch_end``.

    A running count of invocations is kept on the function object itself
    (``log_epoch_model.epoch``) and is used to name the artifact directory
    ``weights/epoch_<n>``.

    Args:
        trainer: The trainer object passed to the callback. Its ``ema.ema``
            module is logged when ``trainer.ema`` is set; otherwise
            ``trainer.model`` is logged.
    """
    # Distributed launchers export RANK for every worker. Only the main
    # process (RANK 0, or -1/unset when not distributed) should upload,
    # otherwise each GPU worker would log the same artifact.
    if int(os.getenv("RANK", "-1")) not in (-1, 0):
        return

    log_epoch_model.epoch = getattr(log_epoch_model, "epoch", 0) + 1

    # `hasattr` alone is not enough: the attribute can exist and still be
    # None, in which case `.ema` would raise AttributeError.
    ema = getattr(trainer, "ema", None)
    model = ema.ema if ema is not None else trainer.model

    # float32 matches the default parameter dtype of PyTorch modules; a
    # float64 example cannot be fed through a float32 model.
    # NOTE(review): confirm the logged model is not cast to half precision.
    dummy_input = torch.randn(1, 3, 640, 640, dtype=torch.float32).numpy()
    mlflow.pytorch.log_model(
        model,
        artifact_path=f"weights/epoch_{log_epoch_model.epoch}",
        input_example=dummy_input,
    )
    print(f"Epoch {log_epoch_model.epoch} model logged to MLflow.")
# Register the per-epoch logging hook on the model.
# NOTE(review): with several devices Ultralytics runs training in separate
# worker processes; callbacks registered here may not be carried over to
# them. Confirm against the installed Ultralytics version.
model.add_callback("on_train_epoch_end", log_epoch_model)
print("Callback added.")
try:
    with mlflow.start_run() as run:
        # The tracking URI, experiment and active run configured above live
        # only in this process. Exporting them through MLflow's environment
        # variables lets any worker process started during training resolve
        # the same server, experiment and run instead of MLflow defaults.
        os.environ["MLFLOW_TRACKING_URI"] = mlflow.get_tracking_uri()
        os.environ["MLFLOW_EXPERIMENT_NAME"] = experiment_name
        os.environ["MLFLOW_RUN_ID"] = run.info.run_id

        print("Training started.")
        results = model.train(
            data='coco128.yaml',
            epochs=10,
            imgsz=640,
            verbose=True,
            project=experiment_name,
            name=experiment_name,
            save_period=0,
            device='0,1,2,3'
        )
        print("Training completed.")
        print(f"Run ID: {run.info.run_id}")
except Exception as e:
    print(f"An error occurred: {e}")
    # Re-raise so the traceback is visible and the script exits non-zero
    # instead of reporting success after a failed training run.
    raise
finally:
    # Do not leave a finished run id behind for later start_run() calls.
    os.environ.pop("MLFLOW_RUN_ID", None)
This is the code I am using. I would appreciate any recommendations.
Upvotes: 0
Views: 10