Reputation: 11
So I'm trying to do outlier removal and supervised feature selection in the pipeline before classifier training. For this I had to create custom transformers to feed into the pipeline. All the examples I found had y=None
as an argument for the transform()
method, however, since I need to change y
(i.e. remove outliers from y
), I need to be able to access it. Here's my custom transformer for outlier removal.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
class OutlierExtractor1(BaseEstimator, TransformerMixin):
def __init__(self):
self.threshold = 2
self.isInlier = None
def transform(self, X, y):
ind = [False if i == -1 else True for i in self.isInlier]
return (X.loc[ind,:], y.loc[ind])
def fit(self, X, y):
X2 = np.asarray(X)
y2 = np.asarray(y)
scaler = StandardScaler()
norm = scaler.fit_transform(X2)
normalized_X = pd.DataFrame(norm, columns=X.columns)
lcf = LocalOutlierFactor(metric = 'euclidean')
self.isInlier = list(lcf.fit_predict(normalized_X))
return self
And here is the pipeline where I use said transformer:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer
space = {'rf__max_depth': [9, 11, 12, 14],
'rf__n_estimators': [80, 90, 100]}
pipe = Pipeline([('outliers', OutlierExtractor1()),
('rf', RandomForestClassifier(criterion = 'entropy',
min_samples_split = 4,
min_samples_leaf = 2,
min_impurity_decrease = 0.01,
random_state=0))])
ftwo_scorer = make_scorer(fbeta_score, beta=2)
ftwo_scorer = make_scorer(fbeta_score, beta=2)
search = GridSearchCV(pipe, param_grid = space, scoring = ftwo_scorer, cv = 4, return_train_score = True, verbose = 1)
search.fit(X = downsampled, y = target)
pd.DataFrame(search.cv_results_)
I get this error.
TypeError Traceback (most recent call last)
<ipython-input-34-d10a6e74d8e8> in <module>
20 ftwo_scorer = make_scorer(fbeta_score, beta=2)
21 search = GridSearchCV(pipe, param_grid = space, scoring = ftwo_scorer, cv = 4, return_train_score = True, verbose = 1)
---> 22 search.fit(X = downsampled, y = target)
23 pd.DataFrame(search.cv_results_)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
763 refit_start_time = time.time()
764 if y is not None:
--> 765 self.best_estimator_.fit(X, y, **fit_params)
766 else:
767 self.best_estimator_.fit(X, **fit_params)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
328 """
329 fit_params_steps = self._check_fit_params(**fit_params)
--> 330 Xt = self._fit(X, y, **fit_params_steps)
331 with _print_elapsed_time('Pipeline',
332 self._log_message(len(self.steps) - 1)):
~\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params_steps)
294 message_clsname='Pipeline',
295 message=self._log_message(step_idx),
--> 296 **fit_params_steps[name])
297 # Replace the transformer of the step with the fitted
298 # transformer. This is necessary when loading the transformer
~\AppData\Roaming\Python\Python37\site-packages\joblib\memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
~\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
738 with _print_elapsed_time(message_clsname, message):
739 if hasattr(transformer, 'fit_transform'):
--> 740 res = transformer.fit_transform(X, y, **fit_params)
741 else:
742 res = transformer.fit(X, y, **fit_params).transform(X)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
691 else:
692 # fit method of arity 2 (supervised transformation)
--> 693 return self.fit(X, y, **fit_params).transform(X)
694
695
TypeError: transform() missing 1 required positional argument: 'y'
The error goes a way if I set y=None
, however, y
is not changed! It looks like the pipeline function only feeds X
to the pre processing steps. Can someone help please?
EDIT
The pipeline() function source code feeds X and y to the fit() method of each step, however, it only feeds X to the transform() method, so y cannot be changed.
My solution was to do the outlier extraction outside of the pipeline and consequentely outside of cross validation, which is a bummer.
Upvotes: 1
Views: 1247
Reputation: 46898
One thing about detecting outlier inside the train / test, bear in mind you are working with a smaller subset, so it might be less accurate. If the purpose is to simply exclusion, you can do that before passing it to a pipeline.
If you do need to do this, then it makes more sense to do the outlier detection within the fit. Below is modification of the code following a comment by jnothman in github:
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import RandomForestClassifier
class WithoutOutliersClassifier(BaseEstimator, ClassifierMixin):
def __init__(self, outlier_detector, classifier):
self.outlier_detector = outlier_detector
self.classifier = classifier
def fit(self, X, y):
self.outlier_detector_ = clone(self.outlier_detector)
mask = self.outlier_detector_.fit_predict(X, y) == 1
self.classifier_ = clone(self.classifier).fit(X[mask], y[mask])
return self
def predict(self, X):
return self.classifier_.predict(X)
We can test this
import numpy as np
np.random.seed(111)
x = np.random.normal(0,1,(200,3))
y = np.random.binomial(1,0.5,200)
We expect 4 outliers:
(LocalOutlierFactor(metric='euclidean').fit_predict(x) == 1).sum()
4
I set oob_score = True
to show that the classifier is trained on the subset we expect:
rf = WithoutOutliersClassifier(LocalOutlierFactor(metric='euclidean'),
RandomForestClassifier(oob_score=True))
rf.fit(x,y)
rf.classifier_.oob_decision_function_.shape
(196, 2)
Now put this into a pipeline, note the change in names of your param:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
space = {'rf__classifier__max_depth': [3,4],
'rf__classifier__n_estimators' : [50,100]}
pipe = Pipeline([('scale', StandardScaler()),
('rf', rf)])
search = GridSearchCV(pipe, param_grid = space)
search.fit(X = x, y = y)
Upvotes: 1