Reputation: 31
I can run a manual fit/transform on the numeric pipeline without any errors:
fulldf = full_pipeline.fit_transform(train)
The error only occurs when running:
scores = cross_val_score(full_pipeline_lr, X_train, y_train,cv = 2)
Full reprex (reproducible example):
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler,PowerTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LinearRegression
from sklearn.base import BaseEstimator, TransformerMixin
# Load the data, skipping the first 9 lines of the file.
source = 'http://lib.stat.cmu.edu/datasets/boston_corrected.txt'
BostonHousing2 = pd.read_table(source, skiprows=9)
boston = BostonHousing2.drop(['OBS.', 'MEDV'], axis=1)
# Lower-case the column names; build a real list rather than assigning a
# lazy map object to the column index.
boston.columns = [name.lower() for name in boston.columns]
# The target is modelled on the log scale.
boston["cmedv"] = np.log(boston["cmedv"])

# Create a stratified sample based on quantiles of price: the quartile
# edges of cmedv become the bin edges of a 4-level helper category.
q = boston.cmedv.quantile([0, 0.25, 0.5, 0.75, 1])
boston["cmedv_cat"] = pd.cut(boston["cmedv"], bins=q, right=True,
                             labels=[1, 2, 3, 4], include_lowest=True)

from sklearn.model_selection import StratifiedShuffleSplit
traintest = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so this loop body runs exactly once.
for train_idx, test_idx in traintest.split(boston, boston["cmedv_cat"]):
    # split() yields positional row indices, so select by position (.iloc).
    # Label-based .loc only gives the same rows while the frame still has
    # its default 0..n-1 index.
    train = boston.iloc[train_idx]
    test = boston.iloc[test_idx]

# The helper category was only needed for stratification.
train, test = train.drop(['cmedv_cat'], axis=1), test.drop(['cmedv_cat'], axis=1)
X_train = train.drop("cmedv", axis=1)
y_train = train['cmedv']
# selects columns for pipeline
class ColumnSelect(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the named columns of its input.

    Parameters
    ----------
    feature_names : column label or list of column labels
        Passed unchanged to ``X[...]`` in ``transform``.
    """

    def __init__(self, feature_names):
        # The attribute must have exactly the same name as the constructor
        # argument. BaseEstimator.get_params() looks parameters up by that
        # name, and clone() (used by cross_val_score and friends) rebuilds the
        # estimator from get_params(). Storing the value as
        # ``_feature_names`` made every clone receive feature_names=None,
        # which surfaced as ``KeyError: None`` in transform().
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.feature_names]``; ``y`` is ignored."""
        return X[self.feature_names]
# Columns routed through the numeric preprocessing branch.
numerical_features = ['lon', 'lat', 'crim', 'zn', 'lstat', 'age']

# Numeric branch: select the columns, standardise them, then power-transform.
numerical_pipeline = Pipeline([
    ('num_select', ColumnSelect(numerical_features)),
    ('num_scale', StandardScaler()),
    ('yeo_johnson', PowerTransformer()),
])

# Only the numeric branch is active; the ordinal and categorical branches
# are still disabled.
full_pipeline = FeatureUnion([
    ('numerical_pipeline', numerical_pipeline),
    # ('ordinal_pipeline', ordinal_pipeline),
    # ('categorical_pipeline', categorical_pipeline)
])
fulldf = full_pipeline.fit_transform(train)

# Chain the preprocessing with a linear regression model.
full_pipeline_lr = Pipeline([
    ('full_pipeline', full_pipeline),
    ('model', LinearRegression()),
])

# Fitting without cross-validation works.
full_pipeline_lr.fit(train, train['cmedv'])
y_pred = full_pipeline_lr.predict(test)
full_pipeline_lr.score(train, train['cmedv'])

# Cross-validation: this is the call reported as raising the error.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(full_pipeline_lr, X_train, y_train, cv=2)
Error
Traceback (most recent call last):
File "/Users/bxp151/opt/anaconda3/envs/boston/lib/python3.6/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/Users/bxp151/opt/anaconda3/envs/boston/lib/python3.6/site-packages/sklearn/pipeline.py", line 330, in fit
Xt = self._fit(X, y, **fit_params_steps)
File "/Users/bxp151/opt/anaconda3/envs/boston/lib/python3.6/site-packages/sklearn/pipeline.py", line 296, in _fit
**fit_params_steps[name])
File "/Users/bxp151/opt/anaconda3/envs/boston/lib/python3.6/site-packages/joblib/memory.py", line 352, in __call__
return self.func(*args, **kwargs)
File "/Users/bxp151/opt/anaconda3/envs/boston/lib/python3.6/site-packages/sklearn/pipeline.py", line 740, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "/Users/bxp151/opt/anaconda3/envs/boston/lib/python3.6/site-packages/sklearn/pipeline.py", line 953, in fit_transform
results = self._parallel_func(X, y, fit_params, _fit_transform_one)
File "/Users/bxp151/opt/anaconda3/envs/boston/lib/python3.6/site-packages/sklearn/pipeline.py", line 983, in _parallel_func
weight) in enumerate(transformers, 1))
File "/Users/bxp151/opt/anaconda3/envs/boston/lib/python3.6/site-packages/joblib/parallel.py", line 1029, in __call__
if self.dispatch_one_batch(iterator):
File "/Users/bxp151/opt/anaconda3/envs/boston/lib/python3.6/site-packages/joblib/parallel.py", line 847, in dispatch_one_batch
self._dispatch(tasks)
File "/Users/bxp151/opt/anaconda3/envs/boston/lib/python3.6/site-packages/joblib/parallel.py", line 765, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/Users/bxp151/opt/anaconda3/envs/boston/lib/python3.6/site-packages/joblib/_parallel_backends.py", line 208, in apply_async
result = ImmediateResult(func)
File "/Users/bxp151/opt/anaconda3/envs/boston/lib/python3.6/site-packages/joblib/_parallel_backends.py", line 572, in __init__
self.results = batch()
File "/Users/bxp151/opt/anaconda3/envs/boston/lib/python3.6/site-packages/joblib/parallel.py", line 253, in __call__
for func, args, kwargs in self.items]
File "/Users/bxp151/opt/anaconda3/envs/boston/lib/python3.6/site-packages/joblib/parallel.py", line 253, in <listcomp>
for func, args, kwargs in self.items]
File "/Users/bxp151/opt/anaconda3/envs/boston/lib/python3.6/site-packages/sklearn/pipeline.py", line 740, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "/Users/bxp151/opt/anaconda3/envs/boston/lib/python3.6/site-packages/sklearn/pipeline.py", line 367, in fit_transform
Xt = self._fit(X, y, **fit_params_steps)
File "/Users/bxp151/opt/anaconda3/envs/boston/lib/python3.6/site-packages/sklearn/pipeline.py", line 296, in _fit
**fit_params_steps[name])
File "/Users/bxp151/opt/anaconda3/envs/boston/lib/python3.6/site-packages/joblib/memory.py", line 352, in __call__
return self.func(*args, **kwargs)
File "/Users/bxp151/opt/anaconda3/envs/boston/lib/python3.6/site-packages/sklearn/pipeline.py", line 740, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "/Users/bxp151/opt/anaconda3/envs/boston/lib/python3.6/site-packages/sklearn/base.py", line 693, in fit_transform
return self.fit(X, y, **fit_params).transform(X)
File "<ipython-input-25-06c5d9af1bc4>", line 47, in transform
return X[self._feature_names]
File "/Users/bxp151/opt/anaconda3/envs/boston/lib/python3.6/site-packages/pandas/core/frame.py", line 2902, in __getitem__
indexer = self.columns.get_loc(key)
File "/Users/bxp151/opt/anaconda3/envs/boston/lib/python3.6/site-packages/pandas/core/indexes/base.py", line 2891, in get_loc
raise KeyError(key) from err
KeyError: None
Upvotes: 1
Views: 552
Reputation: 31
I modified:
self._feature_names
to
self.feature_names
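and it works. I'd love to know why that's the case if anyone knows. (Most likely explanation: cross_val_score clones the estimator for each fold, and clone() rebuilds it from get_params(), which looks up every __init__ argument as an attribute with exactly the same name. With the value stored as _feature_names, that lookup for feature_names came back as None, so the cloned ColumnSelect ran X[None] and raised KeyError: None. A plain fit() works because it uses the original, un-cloned object.)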
Upvotes: 1