Does sklearn pipeline() feed both X and y to the following steps?

So I'm trying to do outlier removal and supervised feature selection in the pipeline before classifier training. For this I had to create custom transformers to feed into the pipeline. All the examples I found had y=None as an argument for the transform() method, however, since I need to change y (i.e. remove outliers from y), I need to be able to access it. Here's my custom transformer for outlier removal.

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler

class OutlierExtractor1(BaseEstimator, TransformerMixin):
    """Custom transformer that labels rows with LocalOutlierFactor in ``fit``
    and drops the rows labelled -1 from both ``X`` and ``y`` in ``transform``.

    ``X`` is accessed through ``.columns`` and ``.loc`` and ``y`` through
    ``.loc``, so both are expected to be pandas objects.
    """

    def __init__(self):
        self.threshold = 2    # stored, but never read inside this class
        self.isInlier = None  # set by fit(): labels from LocalOutlierFactor.fit_predict

    def fit(self, X, y):
        """Standardise ``X``, run LocalOutlierFactor on it and keep the labels.

        Returns ``self``.
        """
        feature_array = np.asarray(X)
        target_array = np.asarray(y)  # converted, but not used afterwards
        scaled = StandardScaler().fit_transform(feature_array)
        scaled_frame = pd.DataFrame(scaled, columns=X.columns)
        detector = LocalOutlierFactor(metric='euclidean')
        labels = detector.fit_predict(scaled_frame)
        self.isInlier = list(labels)
        return self

    def transform(self, X, y):
        """Return the tuple ``(X, y)`` restricted to rows whose stored label is not -1.

        NOTE: ``y`` is a required positional argument of this method.
        """
        keep = [not (label == -1) for label in self.isInlier]
        return (X.loc[keep, :], y.loc[keep])

And here is the pipeline where I use said transformer:

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer

# Hyper-parameter grid; each key starts with 'rf__' followed by a
# RandomForestClassifier parameter name ('rf' is the step name used below).
space = {'rf__max_depth': [9, 11, 12, 14],
         'rf__n_estimators': [80, 90, 100]}

# Two-step pipeline: the custom outlier step, then the random forest.
pipe = Pipeline([('outliers', OutlierExtractor1()),
                 ('rf', RandomForestClassifier(criterion = 'entropy',
                                               min_samples_split = 4,
                                               min_samples_leaf = 2,
                                               min_impurity_decrease = 0.01,
                                               random_state=0))])

# Scorer wrapping fbeta_score with beta=2; it only needs to be built once.
ftwo_scorer = make_scorer(fbeta_score, beta=2)
# 4-fold grid search over `space`, scored with the F2 scorer.
search = GridSearchCV(pipe, param_grid = space, scoring = ftwo_scorer, cv = 4, return_train_score = True, verbose = 1)
search.fit(X = downsampled, y = target)
# Tabulate the cross-validation results.
pd.DataFrame(search.cv_results_)

I get this error.

TypeError                                 Traceback (most recent call last)
<ipython-input-34-d10a6e74d8e8> in <module>
     20 ftwo_scorer = make_scorer(fbeta_score, beta=2)
     21 search = GridSearchCV(pipe, param_grid = space, scoring = ftwo_scorer, cv = 4, return_train_score = True, verbose = 1)
---> 22 search.fit(X = downsampled, y = target)
     23 pd.DataFrame(search.cv_results_)

~\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     70                           FutureWarning)
     71         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72         return f(**kwargs)
     73     return inner_f
     74 

~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
    763             refit_start_time = time.time()
    764             if y is not None:
--> 765                 self.best_estimator_.fit(X, y, **fit_params)
    766             else:
    767                 self.best_estimator_.fit(X, **fit_params)

~\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
    328         """
    329         fit_params_steps = self._check_fit_params(**fit_params)
--> 330         Xt = self._fit(X, y, **fit_params_steps)
    331         with _print_elapsed_time('Pipeline',
    332                                  self._log_message(len(self.steps) - 1)):

~\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params_steps)
    294                 message_clsname='Pipeline',
    295                 message=self._log_message(step_idx),
--> 296                 **fit_params_steps[name])
    297             # Replace the transformer of the step with the fitted
    298             # transformer. This is necessary when loading the transformer

~\AppData\Roaming\Python\Python37\site-packages\joblib\memory.py in __call__(self, *args, **kwargs)
    350 
    351     def __call__(self, *args, **kwargs):
--> 352         return self.func(*args, **kwargs)
    353 
    354     def call_and_shelve(self, *args, **kwargs):

~\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    738     with _print_elapsed_time(message_clsname, message):
    739         if hasattr(transformer, 'fit_transform'):
--> 740             res = transformer.fit_transform(X, y, **fit_params)
    741         else:
    742             res = transformer.fit(X, y, **fit_params).transform(X)

~\AppData\Roaming\Python\Python37\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
    691         else:
    692             # fit method of arity 2 (supervised transformation)
--> 693             return self.fit(X, y, **fit_params).transform(X)
    694 
    695 

TypeError: transform() missing 1 required positional argument: 'y'

The error goes away if I set y=None; however, y is not changed! It looks like the pipeline function only feeds X to the preprocessing steps. Can someone help, please?

EDIT

The pipeline() function source code feeds X and y to the fit() method of each step, however, it only feeds X to the transform() method, so y cannot be changed.

My solution was to do the outlier extraction outside of the pipeline and consequently outside of cross-validation, which is a bummer.

Upvotes: 1

Views: 1247

Answers (1)

StupidWolf
StupidWolf

Reputation: 46898

One thing to bear in mind about detecting outliers inside each train/test split: you are working with a smaller subset, so the detection might be less accurate. If the purpose is simply to exclude outliers, you can do that before passing the data to a pipeline.

If you do need to do this, then it makes more sense to do the outlier detection within the fit. Below is a modification of the code, following a comment by jnothman on GitHub:

from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import RandomForestClassifier

class WithoutOutliersClassifier(BaseEstimator, ClassifierMixin):
    """Classifier wrapper that fits ``classifier`` only on the samples for
    which ``outlier_detector.fit_predict`` returns ``1``.

    Both constructor arguments are kept as unfitted templates; ``fit`` works
    on clones and stores them as ``outlier_detector_`` and ``classifier_``.
    """

    def __init__(self, outlier_detector, classifier):
        self.outlier_detector = outlier_detector
        self.classifier = classifier

    def fit(self, X, y):
        """Detect outliers in ``X``, then fit the classifier on the remaining rows.

        Returns ``self``.
        """
        detector = clone(self.outlier_detector)
        self.outlier_detector_ = detector
        # Boolean mask: True where the detector's label equals 1.
        inlier_mask = detector.fit_predict(X, y) == 1
        model = clone(self.classifier)
        self.classifier_ = model.fit(X[inlier_mask], y[inlier_mask])
        return self

    def predict(self, X):
        """Delegate prediction to the fitted classifier; no rows are removed here."""
        fitted = self.classifier_
        return fitted.predict(X)

We can test this

import numpy as np

# Fix the seed so the synthetic data set is reproducible.
np.random.seed(111)
# 200 samples with 3 normally distributed features (mean 0, std 1).
x = np.random.normal(loc=0, scale=1, size=(200, 3))
# One binary label per sample, drawn with probability 0.5.
y = np.random.binomial(n=1, p=0.5, size=200)

We expect 4 outliers:

# fit_predict labels outliers as -1 and inliers as 1, so count the -1 labels.
(LocalOutlierFactor(metric='euclidean').fit_predict(x) == -1).sum()
4

I set oob_score = True to show that the classifier is trained on the subset we expect:

# Wrap a random forest so that it is trained only on the rows the detector keeps.
rf = WithoutOutliersClassifier(
    outlier_detector=LocalOutlierFactor(metric='euclidean'),
    classifier=RandomForestClassifier(oob_score=True),
)
rf.fit(x, y)
# The out-of-bag decision function has one row per training sample actually used.
rf.classifier_.oob_decision_function_.shape
 (196, 2)

Now put this into a pipeline, note the change in names of your param:

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Grid keys are nested: pipeline step 'rf' -> its 'classifier' attribute -> forest parameter.
space = {
    'rf__classifier__max_depth': [3, 4],
    'rf__classifier__n_estimators': [50, 100],
}

# Scale the features first, then hand them to the outlier-aware classifier.
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('rf', rf),
])

search = GridSearchCV(pipe, param_grid=space)
search.fit(X=x, y=y)

Upvotes: 1

Related Questions