Reputation: 3
I've defined a neural network to predict the sale price of a house from features of the house. It gets an RMSE of about 0.16 on both the training and test data. What approaches can I take to make the model more accurate?
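For reference, the 0.16 I'm quoting is the competition-style score, i.e. RMSE computed on log-prices (the same calculation as the evaluation step in my code below). A minimal illustration with made-up numbers:
import numpy as np
from sklearn.metrics import mean_squared_error
# Illustrative values only, not real predictions
y_true = np.array([200000.0, 150000.0, 300000.0])
y_pred = np.array([210000.0, 140000.0, 310000.0])
log_rmse = np.sqrt(mean_squared_error(np.log(y_true), np.log(y_pred)))
print(log_rmse)  # RMSE on the log scale, the same quantity as the 0.16 above
Here is the full code: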
from scikeras.wrappers import KerasRegressor
from keras.models import Model
from keras.layers import Dense, Dropout, Input
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# Load data
train_data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
# Define features and target
target = 'SalePrice'
features = train_data.drop(columns=[target]).columns
X = train_data[features]
y = train_data[target]
# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
# Identify numerical and categorical columns
numerical_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ['int64', 'float64']]
categorical_cols = [cname for cname in X_train.columns if X_train[cname].dtype == 'object']
# Preprocessing pipeline
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)
# Fit and transform data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)
# Define the model
def build_model():
    input_shape = X_train_preprocessed.shape[1]
    inputs = Input(shape=(input_shape,))
    x = Dense(128, activation='relu')(inputs)
    x = Dropout(0.2)(x)
    x = Dense(64, activation='relu')(x)
    outputs = Dense(1)(x)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model
nn_model = KerasRegressor(model=build_model, epochs=100, batch_size=10, verbose=0)
nn_model.fit(X_train_preprocessed, y_train)
# Predictions and evaluation
preds = nn_model.predict(X_val_preprocessed)
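# Competition-style metric: RMSE computed on the log of the sale prices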
rmse = np.sqrt(mean_squared_error(np.log(y_val), np.log(preds)))
print('RMSE:', rmse)
# Preprocessing and submitting
X_test = test[features]
X_test_preprocessed = preprocessor.transform(X_test)
test_preds = nn_model.predict(X_test_preprocessed)
submission = pd.DataFrame({'Id': test['Id'], 'SalePrice': test_preds})
submission.to_csv('/kaggle/working/submission.csv', index=False)
I've tried adding layers and neurons to the model, as well as playing around with the dropout rate, and this yielded an RMSE of 0.14 on the training data (the deeper model is below, followed by a sketch of how I could check it against the held-out validation set):
def build_model():
    input_shape = X_train_preprocessed.shape[1]
    inputs = Input(shape=(input_shape,))
    x = Dense(1024, activation='relu')(inputs)
    x = Dropout(0.4)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.2)(x)
    x = Dense(128, activation='relu')(x)
    outputs = Dense(1)(x)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model
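Since the 0.14 above is measured on the training data only, here is a minimal sketch of how I could check the deeper model on the held-out validation set, reusing the preprocessed arrays, build_model and log-RMSE calculation from above (the EarlyStopping patience and validation_split values are just illustrative, not tuned):
from keras.callbacks import EarlyStopping

# Refit the deeper network with early stopping on a slice of the training
# data, then report the same log-scale RMSE on the validation set as above.
deep_model = KerasRegressor(
    model=build_model,
    epochs=200,
    batch_size=32,
    validation_split=0.1,  # illustrative split for early stopping
    callbacks=[EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)],
    verbose=0
)
deep_model.fit(X_train_preprocessed, y_train)

val_preds = deep_model.predict(X_val_preprocessed)
val_rmse = np.sqrt(mean_squared_error(np.log(y_val), np.log(val_preds)))
print('Validation RMSE (log scale):', val_rmse)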
Upvotes: 0
Views: 39