user2598911
user2598911

Reputation: 379

IndexError while fitting pipeline with FeatureUnion

I keep getting an

IndexError: only integers, slices (:), ellipsis (...), numpy.newaxis (None) and integer or boolean arrays are valid indices

while trying to fit my dataframe to the following pipeline. Train and Test are two dataframes with same columns. There are different columns but I only want to focus on three of them through the ItemSelector.

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import preprocessing
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import  OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline

    class ItemSelector(BaseEstimator, TransformerMixin):

        def __init__(self, column):
            self.column = column

        def fit(self, X, y=None):
            return self

        def transform(self, X):
            return X[self.column]


    def predictCases(train, test):
        target_names = sorted(list(set(train['TARGET'].values)))
        y_train  = np.array([target_names.index(x) for x in train['TARGET'].values])
        y_test   = np.array([target_names.index(x) for x in test['TARGET'].values])

        # train and predict
        classifier = Pipeline([
                    ('union', FeatureUnion([

                            ('text', Pipeline([
                                ('selector', ItemSelector(column='TEXT')),
                                ('tfidf_vec', TfidfVectorizer())
                            ])),

                            ('feature1', Pipeline([
                                ('selector', ItemSelector(column='CATEG_FEAT1')),
                                ('lbe', LabelEncoder())
                            ])),

                            ('feature2', Pipeline([
                                ('selector', ItemSelector(column='CATEG_FEAT2')),
                                ('lbe', LabelEncoder())
                            ]))
                    ])),
                    ('clf', OneVsRestClassifier(LinearSVC()))])
        classifier.fit(train.values, y_train)
        predicted = classifier.predict(test.values)
        return(metrics.precision_recall_fscore_support(y_test, predicted))

Full Error:

IndexError                                Traceback (most recent call last)
<ipython-input-19-95d9d0c337f4> in <module>()
----> 1 tt = predictCases(train_resampled, validate)

<ipython-input-17-efc951f4192e> in predictCases(train, test)
     24                 ])),
     25                 ('clf', OneVsRestClassifier(LinearSVC()))])
---> 26     classifier.fit(train.values, y_train)
     27     predicted = classifier.predict(test.values)
     28     return(metrics.precision_recall_fscore_support(y_test, predicted))

C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
    266             This estimator
    267         """
--> 268         Xt, fit_params = self._fit(X, y, **fit_params)
    269         if self._final_estimator is not None:
    270             self._final_estimator.fit(Xt, y, **fit_params)

C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
    232                 pass
    233             elif hasattr(transform, "fit_transform"):
--> 234                 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
    235             else:
    236                 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \

C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
    732             delayed(_fit_transform_one)(trans, name, weight, X, y,
    733                                         **fit_params)
--> 734             for name, trans, weight in self._iter())
    735 
    736         if not result:

C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
    756             # was dispatched. In particular this covers the edge
    757             # case of Parallel used with an exhausted iterator.
--> 758             while self.dispatch_one_batch(iterator):
    759                 self._iterating = True
    760             else:

C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
    606                 return False
    607             else:
--> 608                 self._dispatch(tasks)
    609                 return True
    610 

C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
    569         dispatch_timestamp = time.time()
    570         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571         job = self._backend.apply_async(batch, callback=cb)
    572         self._jobs.append(job)
    573 

C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
    107     def apply_async(self, func, callback=None):
    108         """Schedule a func to be run"""
--> 109         result = ImmediateResult(func)
    110         if callback:
    111             callback(result)

    C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
        324         # Don't delay the application, to avoid keeping the input
        325         # arguments in memory
    --> 326         self.results = batch()
        327 
        328     def get(self):

    C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
        129 
        130     def __call__(self):
    --> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        132 
        133     def __len__(self):

    C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
        129 
        130     def __call__(self):
    --> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        132 
        133     def __len__(self):

    C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, name, weight, X, y, **fit_params)
        575                        **fit_params):
        576     if hasattr(transformer, 'fit_transform'):
    --> 577         res = transformer.fit_transform(X, y, **fit_params)
        578     else:
        579         res = transformer.fit(X, y, **fit_params).transform(X)

    C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
        299         """
        300         last_step = self._final_estimator
    --> 301         Xt, fit_params = self._fit(X, y, **fit_params)
        302         if hasattr(last_step, 'fit_transform'):
        303             return last_step.fit_transform(Xt, y, **fit_params)

    C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
        232                 pass
        233             elif hasattr(transform, "fit_transform"):
    --> 234                 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
        235             else:
        236                 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \

    C:\\Anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
        495         else:
        496             # fit method of arity 2 (supervised transformation)
    --> 497             return self.fit(X, y, **fit_params).transform(X)
        498 
        499 

    <ipython-input-2-fdc42fd9d831> in transform(self, X)
         10 
         11     def transform(self, X):
    ---> 12         return X[self.column]

    IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

Edit:

If I use train instead of train.values in fit I get the following error:

TypeError: fit_transform() takes 2 positional arguments but 3 were given

Upvotes: 1

Views: 374

Answers (1)

Mikhail Korobov
Mikhail Korobov

Reputation: 22248

You're passing test.values (i.e. a numpy array with raw DataFrame values) to classifier.predict and classifier.fit, while your transformer expects a DataFrame object.

Upvotes: 1

Related Questions