Edward
Edward

Reputation: 4623

Error in the class to create pipeline

I am trying to create a new variable 'age' from two variables, 'birth' and 'date_survey'.

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import linear_model, pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline

My dataframe:

# Toy survey data: target 'a', survey date as a DD.MM.YYYY string,
# and birth year stored as a string.
df = pd.DataFrame(dict(
    a=[1, 2, 3],
    date_survey=['10.01.2013', '20.02.2014', '30.03.2015'],
    birth=['1985', '1984', '1986'],
))

The code for the pipeline:

# Predictors are the two raw columns; the target is column 'a'.
X, y = df[['date_survey', 'birth']], df['a']
class MultiColumn:
    """Pipeline step that keeps only the configured columns of its input."""

    def __init__(self, columns=None):
        # Column names to pull out of X when transform() is called.
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; return this instance."""
        return self

    def transform(self, X):
        """Index X with the stored column names and return the result."""
        wanted = self.columns
        return X[wanted]
class Age(TransformerMixin):
    """Transformer that computes a value from the 'date_survey' and 'birth' columns of X."""

    def transform(self, X, y=None, **fit_params): 
        """Return the numeric 'birth' column minus the survey year.

        ``y`` and ``fit_params`` are accepted but not used.
        """
        # Parse the survey dates and assign them back into X, replacing the
        # original 'date_survey' column on the object that was passed in.
        X['date_survey'] = pd.to_datetime(X['date_survey'])
        # Year of each survey date, wrapped in a DataFrame.
        year = pd.DataFrame(X['date_survey'].apply(lambda x: x.year))
        # Left operand is a Series (numeric birth year), right operand is the
        # DataFrame built above; the survey year is subtracted from the birth year.
        # NOTE(review): Series-minus-DataFrame arithmetic aligns on labels rather
        # than row by row, which presumably yields the NaN values reported by
        # the pipeline - confirm.
        age = X['birth'].convert_objects(convert_numeric=True) - year
        return age

    def fit(self, X, y=None, **fit_params):
        """No fitting is performed; return this instance."""
        return self
# Final estimator of the pipeline.
regressor = linear_model.SGDRegressor()

# One-branch feature union: pick the two raw columns, then run the Age
# transformer on them; the union output is fed to the regressor.
pipeline = Pipeline(steps=[
    ('union', FeatureUnion([
        # age
        ('age', Pipeline(steps=[
            ('selector', MultiColumn(columns=['date_survey', 'birth'])),
            ('date', Age()),
        ])),
    ])),
    # Use a regression
    ('model_fitting', regressor),
])
pipeline.fit(X, y)

and I get an error:

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

I guess the error is in the class Age, but I can't work out how to fix it.

Upvotes: 0

Views: 71

Answers (1)

Jarad
Jarad

Reputation: 18983

  date_survey birth date_survey_in_transform  year
0  10.01.2013  1985               2013-10-01  2013
1  20.02.2014  1984               2014-02-20  2014
2  30.03.2015  1986               2015-03-30  2015

birth - year is negative; the operands are reversed, so the age should be computed as year - birth.

# Line quoted from the question's Age.transform: the numeric birth year is
# the left operand and the survey year is subtracted from it.
age = X['birth'].convert_objects(convert_numeric=True) - year

I modified some of your code to get it to run without errors.

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import linear_model, pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDRegressor

# Three-row sample frame: target 'a', survey date as a DD.MM.YYYY string,
# and birth year stored as a string.
df = pd.DataFrame.from_dict({
    'a': [1, 2, 3],
    'date_survey': ['10.01.2013', '20.02.2014', '30.03.2015'],
    'birth': ['1985', '1984', '1986'],
})

# Predictors are the two raw columns; the target is column 'a'.
X, y = df[['date_survey', 'birth']], df['a']
class MultiColumn:
    """Column selector usable as a pipeline step.

    Stores a set of column names at construction time and returns just
    those columns from whatever is passed to ``transform``.
    """

    def __init__(self, columns=None):
        # Names of the columns that transform() will extract.
        self.columns = columns

    def fit(self, X, y=None):
        """Stateless step: return this instance unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the stored column names."""
        selected = X[self.columns]
        return selected

class Age(TransformerMixin):
    """Derive the respondent's age (survey year minus birth year) as one feature.

    Expects ``X`` to provide a ``date_survey`` column of day-first date
    strings (e.g. ``'20.02.2014'``) and a ``birth`` column holding the
    birth year as something castable to ``int64``.
    """

    def transform(self, X, y=None, **fit_params):
        """Return the ages as an ``(n_samples, 1)`` NumPy array.

        ``X`` is not modified: intermediate values are kept in local
        variables instead of being written back as new columns, which
        also avoids pandas' SettingWithCopyWarning when ``X`` is a slice.
        ``y`` and ``fit_params`` are accepted for API compatibility and
        ignored.
        """
        # The dates are DD.MM.YYYY. Without dayfirst=True pandas reads
        # '10.01.2013' month-first and then mis-parses or rejects
        # '20.02.2014'.
        survey_year = pd.to_datetime(X['date_survey'], dayfirst=True).dt.year
        birth_year = X['birth'].astype('int64')
        age = survey_year - birth_year
        # Series.reshape is not available in current pandas; reshape the
        # underlying ndarray so downstream estimators get a 2-D column.
        return age.values.reshape(-1, 1)

    def fit(self, X, y=None, **fit_params):
        """No fitting is needed; return this instance."""
        return self

# One-branch feature union (select the two raw columns, then derive the
# age feature) followed by an SGD regressor fitted on the result.
pipeline = Pipeline(steps=[
    ('union', FeatureUnion([
        # age
        ('age', Pipeline(steps=[
            ('selector', MultiColumn(columns=['date_survey', 'birth'])),
            ('date', Age()),
        ])),
    ])),
    # Use a regression
    ('model_fitting', SGDRegressor()),
])

pipeline.fit(X, y)

Upvotes: 1

Related Questions