Reputation: 21
I am new to SageMaker. I am trying to create an inference pipeline, and for that I am creating two models: one for preprocessing and one for training. I am using the SKLearn estimator for both of those jobs.
For the preprocessor I have a preprocessing script that contains the basic data transformations and the functions required for inference. At the end of this job, preprocessor.joblib is written to /opt/ml/model/ and the job's output artifact is saved to S3. Up to this point the code runs without issues.
The next step is to train a model, and for that I have a training script that loads the preprocessor inside its main function. When it tries to load it, I get an error saying that the file does not exist.
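For context, this is roughly how I launch the two jobs with the SKLearn estimator (the role ARN, bucket paths, instance type and framework version below are placeholders, not my exact configuration):

from sagemaker.sklearn.estimator import SKLearn

role = "arn:aws:iam::123456789012:role/MySageMakerRole"  # placeholder role ARN

# Preprocessing job: runs the preprocessing script and saves preprocessor.joblib
preprocess_estimator = SKLearn(
    entry_point="preprocessing.py",
    framework_version="1.2-1",
    instance_count=1,
    instance_type="ml.m5.large",
    role=role,
)
preprocess_estimator.fit({"train": "s3://my-bucket/data/train"})

# Training job: runs the training script that tries to load the preprocessor
train_estimator = SKLearn(
    entry_point="training.py",
    framework_version="1.2-1",
    instance_count=1,
    instance_type="ml.m5.large",
    role=role,
    hyperparameters={"n_estimators": 100, "random_state": 42},
)
train_estimator.fit({
    "train": "s3://my-bucket/data/train",
    "test": "s3://my-bucket/data/test",
})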
Here are both the preprocessing and training scripts:
---- Preprocessing script
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import pandas as pd
import joblib
import os
# Function to create a preprocessing pipeline
def preprocess_data():
    numeric_features = ['age', 'inactivity', 'recency', 'frequency']
    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features)
        ]
    )
    return preprocessor
if __name__ == "__main__":
print("[INFO] Loading and preprocessing training data...")
# Paths for SageMaker environment variables
input_dir = os.environ.get("SM_CHANNEL_TRAIN")
output_dir = os.environ.get("SM_MODEL_DIR")
# Combine input files
data_files = [os.path.join(input_dir, file) for file in os.listdir(input_dir)]
train_data = pd.concat([pd.read_csv(file) for file in data_files])
# Drop the target column and preprocess only features
target_column = 'Risk'
predefined_features = ['age', 'inactivity', 'recency', 'frequency']
features_data = train_data[predefined_features]
# Fit the preprocessing pipeline
preprocessor = preprocess_data()
preprocessor.fit(features_data)
# Save the preprocessor model
joblib.dump(preprocessor, os.path.join(output_dir, "preprocessor.joblib"))
print("[INFO] Preprocessing model saved successfully as preprocessor.joblib.")
---- Training script
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import joblib
import os
import argparse
# Function to load a pre-trained model
def load_preprocessor(model_dir):
    preprocessor_path = os.path.join(model_dir, "preprocessor.joblib")
    if os.path.exists(preprocessor_path):
        print(f"[INFO] Loading preprocessor from {preprocessor_path}")
        return joblib.load(preprocessor_path)
    else:
        raise FileNotFoundError(f"[ERROR] Preprocessor artifact not found at {preprocessor_path}")
if __name__ == "__main__":
print("[INFO] Parsing arguments...")
parser = argparse.ArgumentParser()
parser.add_argument("--n_estimators", type=int, default=100)
parser.add_argument("--random_state", type=int, default=42)
parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
parser.add_argument("--train-file", type=str, default="train.csv")
parser.add_argument("--test-file", type=str, default="test.csv")
args = parser.parse_args()
print("[INFO] Loading preprocessor...")
preprocessor = load_preprocessor(args.model_dir)
print("[INFO] Reading training and testing data...")
train_df = pd.read_csv(os.path.join(args.train, args.train_file))
test_df = pd.read_csv(os.path.join(args.test, args.test_file))
# Define features and labels
features = ['age', 'inactivity', 'recency', 'frequency']
label = 'Risk'
# Preprocess training and testing data
X_train = preprocessor.transform(train_df[features])
y_train = train_df[label]
X_test = preprocessor.transform(test_df[features])
y_test = test_df[label]
print("[INFO] Training Random Forest model...")
model = RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state)
model.fit(X_train, y_train)
# Save the trained model
model_path = os.path.join(args.model_dir, "random_forest_model.joblib")
joblib.dump(model, model_path)
print(f"[INFO] Trained model saved as random_forest_model.joblib at {model_path}")
print("[INFO] Evaluating model on test data...")
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", acc)
print("Classification Report:\n", report)
What could be the issue?
Upvotes: 1
Views: 26