Aiman
Aiman

Reputation: 21

Training script unable to load preprocessing model

I am new to Sagemaker, I am trying to create inference pipeline and for that I am creating two models one for preprocessing and another one for training. I am using SKLearn to create the both of those jobs. For the preprocessor I have a preprocess script that includes basic data transformation and functions required for inference. At the end of this job the preprocess.joblib is saved to s3. Until this part the code is running without issues and the model.joblib is saved to /opt/ml/model/. The next step, is to train a model and for that I have a training script, which load the model and inside the main function, when the model is loaded, it shows me an error that the model does not exist.

here is both the preprocessing and training scripts:

---- preprocessing script


    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler
    import pandas as pd
    import joblib
    import os
    
    # Function to create a preprocessing pipeline
    def preprocess_data():
        numeric_features = ['age', 'inactivity', 'recency', 'frequency']
        numeric_transformer = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
    
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features)
            ]
        )
        return preprocessor
    
    if __name__ == "__main__":
        print("[INFO] Loading and preprocessing training data...")
        
        # Paths for SageMaker environment variables
        input_dir = os.environ.get("SM_CHANNEL_TRAIN")
        output_dir = os.environ.get("SM_MODEL_DIR")
    
        # Combine input files
        data_files = [os.path.join(input_dir, file) for file in os.listdir(input_dir)]
        train_data = pd.concat([pd.read_csv(file) for file in data_files])
    
        # Drop the target column and preprocess only features
        target_column = 'Risk'
        predefined_features = ['age', 'inactivity', 'recency', 'frequency']
        features_data = train_data[predefined_features]
    
        # Fit the preprocessing pipeline
        preprocessor = preprocess_data()
        preprocessor.fit(features_data)
        
        # Save the preprocessor model
        joblib.dump(preprocessor, os.path.join(output_dir, "preprocessor.joblib"))
        print("[INFO] Preprocessing model saved successfully as preprocessor.joblib.")

--- Training script


    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score, classification_report
    import pandas as pd
    import joblib
    import os
    import argparse
    
    # Function to load a pre-trained model
    def load_preprocessor(model_dir):
        preprocessor_path = os.path.join(model_dir, "preprocessor.joblib")
        if os.path.exists(preprocessor_path):
            print(f"[INFO] Loading preprocessor from {preprocessor_path}")
            return joblib.load(preprocessor_path)
        else:
            raise FileNotFoundError(f"[ERROR] Preprocessor artifact not found at {preprocessor_path}")
    
    if __name__ == "__main__":
        print("[INFO] Parsing arguments...")
        parser = argparse.ArgumentParser()
        parser.add_argument("--n_estimators", type=int, default=100)
        parser.add_argument("--random_state", type=int, default=42)
        parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
        parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
        parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
        parser.add_argument("--train-file", type=str, default="train.csv")
        parser.add_argument("--test-file", type=str, default="test.csv")
        args = parser.parse_args()
    
        print("[INFO] Loading preprocessor...")
        preprocessor = load_preprocessor(args.model_dir)
    
        print("[INFO] Reading training and testing data...")
        train_df = pd.read_csv(os.path.join(args.train, args.train_file))
        test_df = pd.read_csv(os.path.join(args.test, args.test_file))
    
        # Define features and labels
        features = ['age', 'inactivity', 'recency', 'frequency']
        label = 'Risk'
    
        # Preprocess training and testing data
        X_train = preprocessor.transform(train_df[features])
        y_train = train_df[label]
        X_test = preprocessor.transform(test_df[features])
        y_test = test_df[label]
    
        print("[INFO] Training Random Forest model...")
        model = RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state)
        model.fit(X_train, y_train)
    
        # Save the trained model
        model_path = os.path.join(args.model_dir, "random_forest_model.joblib")
        joblib.dump(model, model_path)
        print(f"[INFO] Trained model saved as random_forest_model.joblib at {model_path}")
    
        print("[INFO] Evaluating model on test data...")
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
    
        print("Accuracy:", acc)
        print("Classification Report:\n", report)

What could be the issue?

Upvotes: 1

Views: 26

Answers (0)

Related Questions