aceminer
aceminer

Reputation: 4295

Full sklearn pipeline example

I am trying to use an sklearn pipeline, but none of the online tutorials I tried helped me.

import pandas as pd 
import numpy as np
import json
import seaborn as sb 
from sklearn.metrics import log_loss
from sklearn import linear_model 
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from scipy.stats import zscore
from Transformers import TextTransformer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
%matplotlib inline
# Load the training data, reading the `description` column as str.
df = pd.read_json('data/train.json', encoding = 'utf-8', dtype = {'description': str})
len(df)
# Keep only the text column and the target column.
df = df[['description', 'interest_level']]
from sklearn.pipeline import Pipeline, FeatureUnion
a = TextTransformer('description', max_features=50)
# `b` reads a 'features' column, which the selection above has dropped; it is
# only referenced by the commented-out step below.
b = TextTransformer('features', max_features=10)
pipeline = Pipeline([
    ('description',a ), # can pass in either a pipeline
        #('features',b ) # or a transformer
    ('clf', SVC())  # classifier
])
# NOTE: this is the call that raises the TypeError shown in the traceback below.
# `df[:, 'interest_level']` hands DataFrame.__getitem__ a tuple containing a
# slice, which is unhashable. Pipeline.fit takes the feature frame and the
# target separately, e.g. pipeline.fit(df, df['interest_level']).
pipeline.fit(df[:,'interest_level'])

My Text transformer

from sklearn.base import BaseEstimator, TransformerMixin
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk


class TextTransformer(BaseEstimator, TransformerMixin):
    """Vectorize one text column of a DataFrame into term-frequency features.

    The selected column is stripped of HTML markup, tokenized with NLTK
    (numbers are collapsed into a single ``_NUM_`` token, non-alphabetic
    tokens are dropped) and encoded with a ``TfidfVectorizer`` configured
    with ``use_idf=False``.

    Parameters
    ----------
    column : str
        Name of the DataFrame column that holds the raw text.
    max_features : int, default=5000
        Upper bound on the vocabulary size passed to ``TfidfVectorizer``.
    """

    def __init__(self, column, max_features=5000):
        # scikit-learn requires constructor arguments to be stored unchanged
        # under their own names so that get_params()/set_params()/clone()
        # (and therefore GridSearchCV) work.
        self.column = column
        self.max_features = max_features
        # Fitted vectorizer; stays None until fit() has been called.
        self._vectorizer = None

    def _custom_tokenizer(self, string):
        """Split *string* into word tokens, mapping digit-only tokens to ``_NUM_``."""
        tokens = nltk.word_tokenize(string)
        cleaned = ('_NUM_' if tok.isdigit() else tok for tok in tokens)
        # Tokens are returned as text. Wrapping the encoded bytes in str()
        # yields "b'word'" on Python 3, which defeats stop-word matching.
        return [tok for tok in cleaned if tok.isalpha() or tok == '_NUM_']

    def _clean_html_tags(self, content):
        """Return the visible text of *content* with HTML tags removed."""
        return BeautifulSoup(content, 'lxml').text

    def _clean_column(self, df):
        """Return the configured column of *df* with HTML stripped from every row."""
        return df[self.column].apply(self._clean_html_tags)

    def fit(self, df, y=None):
        """Learn the vocabulary from ``df[column]``.

        ``y`` is accepted and ignored so the transformer can be used inside a
        ``Pipeline``, which always calls ``fit(X, y)``.

        Returns
        -------
        self
        """
        # The vectorizer is built here, not in __init__, so that a later
        # set_params(max_features=...) takes effect on the next fit.
        vectorizer = TfidfVectorizer(use_idf=False, stop_words='english',
                                     tokenizer=self._custom_tokenizer, analyzer='word',
                                     max_features=self.max_features)
        self._vectorizer = vectorizer.fit(self._clean_column(df))
        return self

    def transform(self, df):
        """Return the dense feature matrix for ``df[column]``.

        The same HTML cleaning used in ``fit`` is applied, so training and
        inference see identically pre-processed text.

        Raises
        ------
        AttributeError
            If called before ``fit``.
        """
        if self._vectorizer is None:
            raise AttributeError('TextTransformer must be fitted before calling transform()')
        # toarray() gives a plain ndarray; todense() gives np.matrix, which
        # current scikit-learn estimators refuse as input.
        return self._vectorizer.transform(self._clean_column(df)).toarray()

However, I cannot seem to get it right. It keeps throwing this exception in an IPython notebook:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-11-b3788282dc5c> in <module>()
      8     ('clf', SVC())  # classifier
      9 ])
---> 10 pipeline.fit(df[:,'interest_level'])

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   2057             return self._getitem_multilevel(key)
   2058         else:
-> 2059             return self._getitem_column(key)
   2060 
   2061     def _getitem_column(self, key):

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   2064         # get column
   2065         if self.columns.is_unique:
-> 2066             return self._get_item_cache(key)
   2067 
   2068         # duplicate columns & possible reduce dimensionality

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
   1382         """Return the cached item, item represents a label indexer."""
   1383         cache = self._item_cache
-> 1384         res = cache.get(item)
   1385         if res is None:
   1386             values = self._data.get(item)

TypeError: unhashable type

Description of data

    description interest_level
10  A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...   medium
10000       low
100004  Top Top West Village location, beautiful Pre-w...   high
100007  Building Amenities - Garage - Garden - fitness...   low
100013  Beautifully renovated 3 bedroom flex 4 bedroom...   low

Interest level would be my target variable

Upvotes: 1

Views: 2382

Answers (3)

Shahzaib Ali
Shahzaib Ali

Reputation: 412

A simple example of an sklearn pipeline that handles different kinds of features (categorical, nominal, ordinal, etc.):

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline




# Encoders/scalers shared by the per-type pipelines below.
# NOTE(review): recent scikit-learn releases rename `sparse` to `sparse_output`;
# confirm against the installed version.
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
std = StandardScaler()



# Fill missing values with each column's most frequent value.
# `X` is not defined in this snippet; it must already exist and expose
# `.columns` (i.e. presumably a pandas DataFrame). `pd` must be imported too.
imp_mean = SimpleImputer(strategy='most_frequent')

X_dense_data = imp_mean.fit_transform(X)
# Re-wrap the imputed values in a DataFrame carrying the original column names.
X = pd.DataFrame(X_dense_data, columns=X.columns.values.tolist())



# Which columns go to which transformer
ohe_column_catagorical_feature = ['race', 'sex', 'age group']
std_column_numeric_feature = ['height', 'weight', 'temperature', 'blood glucose']



# Numeric feature transformer: standard scaling only
feature_numeric_transformer = Pipeline(steps=[          
          ('scaler_data', std)
])


# Categorical feature transformer: one-hot encoding only
catagorical_numeric_transformer = Pipeline(steps=[          
          ('onehot', ohe)
])


# Column transformer that applies each pipeline to its own column list;
# remainder='drop' discards every column not named in either list.
preprocessor_feature = ColumnTransformer(
    transformers=[
        ('num', feature_numeric_transformer, std_column_numeric_feature),
        ('cat', catagorical_numeric_transformer, ohe_column_catagorical_feature)
        ], remainder='drop'
        )

Make sure your data has no missing values. If it does, here is an example of filling them using sklearn's SimpleImputer.

The `strategy` parameter selects the imputation strategy:

If “mean”, then replace missing values using the mean along each column. Can only be used with numeric data.

If “median”, then replace missing values using the median along each column. Can only be used with numeric data.

If “most_frequent”, then replace missing using the most frequent value along each column. Can be used with strings or numeric data.

If “constant”, then replace missing values with fill_value. Can be used with strings or numeric data.

# Fill missing values with each column's most frequent value
# (this strategy works for both string and numeric columns).
imp_mean = SimpleImputer(strategy='most_frequent')

# Learn the fill values from X and return the imputed data.
# `X` is not defined in this snippet and must already exist.
X_dense_data = imp_mean.fit_transform(X)

Upvotes: 1

Uri Goren
Uri Goren

Reputation: 13700

Writing pipelines is much easier with decorators; see this example.

Your code would look something like this:

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
@SKTransform
def clean_num(txt):
    """Return *txt* with every run of digits replaced by the token ``_NUM_``."""
    digit_run = re.compile('\\d+')
    return digit_run.sub('_NUM_', txt)

@SKTransform
def clean_tags(content):
    """Parse *content* as HTML with the lxml parser and return its plain text."""
    soup = BeautifulSoup(content, 'lxml')
    return soup.text

# Chain HTML stripping, number masking and term-frequency vectorization.
# NOTE(review): sklearn's Pipeline expects (name, estimator) tuples; presumably
# SKTransform wraps the functions suitably, but the bare TfidfVectorizer
# entry should be checked. `max_features`, `nltk`, `re` and `BeautifulSoup`
# are not defined/imported in this snippet and must come from the caller.
ppl = Pipeline([clean_tags,
                clean_num,
                TfidfVectorizer(use_idf=False, stop_words='english',tokenizer=nltk.word_tokenize,analyzer='word',max_features=max_features),
      ])

Upvotes: 2

fedeisas
fedeisas

Reputation: 2023

You're fitting only one column (df[:, 'interest_level']), but then your first step (transformer a: TextTransformer) is trying to access the column description.

Upvotes: 2

Related Questions