Jarad
Jarad

Reputation: 18983

Sklearn - FeatureUnion - Transformer: TypeError: fit_transform() takes 2 positional arguments but 3 were given

My apologies in advance for this large code block. It is the most concise way I can provide a repeatable working example.

In the code, I'm trying to use FeatureUnion to transform two columns from a dataframe: one column is text data, so I want to use TfidfVectorizer, and the other is a column of lists of tags, so I want to use MultiLabelBinarizer.

The ItemSelector transformer selects the right column from the dataframe.

Why am I getting TypeError: fit_transform() takes 2 positional arguments but 3 were given ?

What do I need to change in the code to get this example to run properly?

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier

import pandas as pd
import numpy as np

# Toy dataset: every column is a five-row pattern repeated five times,
# giving 25 rows in total.
d = {
    # Target class of each row.
    'label': ['Help', 'Help', 'Other', 'Sale/Coupon', 'Other'] * 5,
    # Tag lists, stored as the string repr of a Python list (one per row).
    'multilabels': [
        "['Samples']",
        "['Deck']",
        "['Deck', 'Deck Over', 'Stain']",
        "['Coupons']",
        "['Bathroom']",
    ] * 5,
    # Free-text column.
    'response': [
        'this is some text',
        'this is some more text',
        'and here is some more',
        'and some more',
        'and here we go some more yay done',
    ] * 5,
}

class ItemSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks one entry out of its input by key.

    Used as the first step of a pipeline so that the following steps only
    see the selected column.
    """

    def __init__(self, key):
        # Key used to index whatever object is handed to ``transform``.
        self.key = key

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, df):
        """Return ``df[self.key]``."""
        selected = df[self.key]
        return selected

# Branch 1: select the free-text column and vectorize it with TF-IDF.
text_steps = [
    ('selector', ItemSelector(key='response')),
    ('tfidf', TfidfVectorizer()),
]

# Branch 2: select the tag column and binarize it.
# NOTE(review): this appears to be the branch the reported TypeError comes
# from -- the traceback ends in Pipeline.fit_transform calling
# last_step.fit_transform(Xt, y, ...), and the last step here is
# MultiLabelBinarizer.
tag_steps = [
    ('selector', ItemSelector(key='multilabels')),
    ('multilabel', MultiLabelBinarizer()),
]

feature_union = FeatureUnion(
    transformer_list=[
        ('step1', Pipeline(text_steps)),
        ('step2', Pipeline(tag_steps)),
    ]
)

# Feature extraction followed by a linear classifier, wrapped one-vs-rest.
classifier_steps = [('union', feature_union), ('sgd', SGDClassifier())]
pipeline = OneVsRestClassifier(Pipeline(classifier_steps))

# Empty parameter grid: the search only cross-validates the single default
# configuration.
grid = GridSearchCV(pipeline, {}, verbose=5)

column_order = ['response', 'multilabels', 'label']
df = pd.DataFrame(d, columns=column_order)
X = df[['response', 'multilabels']]
y = df['label']
grid.fit(X, y)

This is the full error:

Traceback (most recent call last):
  File "C:/Users/owner/Documents/my files/Account Tracking/Client/Foresee Analysis/SOQuestion.py", line 72, in <module>
    grid.fit(X, y)
  File "C:\Python34\lib\site-packages\sklearn\model_selection\_search.py", line 945, in fit
    return self._fit(X, y, groups, ParameterGrid(self.param_grid))
  File "C:\Python34\lib\site-packages\sklearn\model_selection\_search.py", line 564, in _fit
    for parameters in parameter_iterable
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 758, in __call__
    while self.dispatch_one_batch(iterator):
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 608, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 571, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 109, in apply_async
    result = ImmediateResult(func)
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 326, in __init__
    self.results = batch()
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__
    return [func(*args, **kwargs) for func, args, kwargs in self.items]
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in <listcomp>
    return [func(*args, **kwargs) for func, args, kwargs in self.items]
  File "C:\Python34\lib\site-packages\sklearn\model_selection\_validation.py", line 238, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Python34\lib\site-packages\sklearn\multiclass.py", line 216, in fit
    for i, column in enumerate(columns))
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 758, in __call__
    while self.dispatch_one_batch(iterator):
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 608, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 571, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 109, in apply_async
    result = ImmediateResult(func)
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 326, in __init__
    self.results = batch()
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__
    return [func(*args, **kwargs) for func, args, kwargs in self.items]
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in <listcomp>
    return [func(*args, **kwargs) for func, args, kwargs in self.items]
  File "C:\Python34\lib\site-packages\sklearn\multiclass.py", line 80, in _fit_binary
    estimator.fit(X, y)
  File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 268, in fit
    Xt, fit_params = self._fit(X, y, **fit_params)
  File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 234, in _fit
    Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
  File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 734, in fit_transform
    for name, trans, weight in self._iter())
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 758, in __call__
    while self.dispatch_one_batch(iterator):
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 608, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 571, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 109, in apply_async
    result = ImmediateResult(func)
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 326, in __init__
    self.results = batch()
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__
    return [func(*args, **kwargs) for func, args, kwargs in self.items]
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in <listcomp>
    return [func(*args, **kwargs) for func, args, kwargs in self.items]
  File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 577, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 303, in fit_transform
    return last_step.fit_transform(Xt, y, **fit_params)
TypeError: fit_transform() takes 2 positional arguments but 3 were given

Note: I have looked at the question "_transform() takes 2 positional arguments but 3 were given", but it still doesn't make sense to me.

Upvotes: 0

Views: 1882

Answers (1)

Jarad
Jarad

Reputation: 18983

Got it. I made another transformer to handle the multi-label binarization. This is more of a workaround than a solution, since the binarization happens inside a custom transformer instead of as a MultiLabelBinarizer step in the pipeline.

import ast

import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils.validation import check_is_fitted

# Toy dataset: each column is a block of five values repeated five times
# (25 rows in total).
d = {
    # Target class of each row.
    'label': 5 * ['Help', 'Help', 'Other', 'Sale/Coupon', 'Other'],
    # Tag lists, stored as the string repr of a Python list (one per row).
    'multilabels': 5 * [
        "['Samples']",
        "['Deck']",
        "['Deck', 'Deck Over', 'Stain']",
        "['Coupons']",
        "['Bathroom']",
    ],
    # Free-text column.
    'response': 5 * [
        'this is some text',
        'this is some more text',
        'and here is some more',
        'and some more',
        'and here we go some more yay done',
    ],
}

class ItemSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts a single column from its input.

    Placed first in a pipeline so that the following steps only receive
    the selected column.
    """

    def __init__(self, column):
        # Key used to index the object passed to ``transform``.
        self.column = column

    def fit(self, X, y=None, **fit_params):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X[self.column]``; ``y`` and ``fit_params`` are ignored."""
        selected = X[self.column]
        return selected

class MultiLabelTransformer(BaseEstimator, TransformerMixin):
    """Select one column of tag collections and binarize it.

    Wraps ``MultiLabelBinarizer`` so it can be used as a pipeline step that
    receives ``(X, y)``. The binarizer is fitted once in ``fit`` and reused
    in ``transform``, so the output always has the same columns, in the
    same order, as were learned from the training data.

    Parameters
    ----------
    column : key used to index ``X`` (``X[column]``) to obtain the tags.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Learn the set of tags from ``X[self.column]``; ``y`` is ignored.

        Returns ``self``.
        """
        # Fit here rather than in ``transform``: a binarizer re-fitted on
        # each call would derive its columns from whatever data it is given,
        # so training and test folds could end up with different feature
        # layouts.
        self.mlb_ = MultiLabelBinarizer()
        self.mlb_.fit(X[self.column])
        return self

    def transform(self, X):
        """Return the indicator matrix for ``X[self.column]``.

        Raises scikit-learn's not-fitted error when called before ``fit``.
        """
        check_is_fitted(self, 'mlb_')
        # NOTE(review): how tags that were not seen during ``fit`` are
        # handled depends on the installed scikit-learn version -- confirm.
        return self.mlb_.transform(X[self.column])

# Text branch: pick the 'response' column, then TF-IDF vectorize it.
text_branch = Pipeline([
    ('selector', ItemSelector(column='response')),
    ('tfidf', TfidfVectorizer()),
])

# Tag branch: the custom transformer both selects and binarizes the
# 'multilabels' column.
tag_branch = Pipeline([
    ('selector', MultiLabelTransformer(column='multilabels')),
])

# Combine the outputs of both branches.
union = FeatureUnion(
    transformer_list=[
        ('step1', text_branch),
        ('step2', tag_branch),
    ]
)

# Feature extraction followed by a linear classifier, wrapped one-vs-rest.
pipeline = OneVsRestClassifier(
    Pipeline([
        ('union', union),
        ('sgd', SGDClassifier()),
    ])
)

# Empty parameter grid: the search only cross-validates the single default
# configuration.
grid = GridSearchCV(pipeline, {}, verbose=5)

df = pd.DataFrame(d, columns=['response', 'multilabels', 'label'])
# The tags are stored as the string repr of a list; turn each one back into
# a real list. ast.literal_eval only accepts Python literals, so unlike
# eval() it cannot execute arbitrary code if the strings come from a file
# or another external source.
df['multilabels'] = df['multilabels'].apply(ast.literal_eval)
X = df[['response', 'multilabels']]
y = df['label']
grid.fit(X, y)

Upvotes: 1

Related Questions