Reputation: 4295
I am trying to use an sklearn Pipeline, but the various tutorials I tried online didn't help me.
import pandas as pd
import numpy as np
import json
import seaborn as sb
from sklearn.metrics import log_loss
from sklearn import linear_model
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from scipy.stats import zscore
from Transformers import TextTransformer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
%matplotlib inline
df = pd.read_json('data/train.json', encoding = 'utf-8', dtype = {'description': str})
len(df)
df = df[['description', 'interest_level']]
from sklearn.pipeline import Pipeline, FeatureUnion
a = TextTransformer('description', max_features=50)
b = TextTransformer('features', max_features=10)
pipeline = Pipeline([
    ('description', a),  # can pass in either a pipeline
    # ('features', b)    # or a transformer
    ('clf', SVC())       # classifier
])
pipeline.fit(df[:, 'interest_level'])
My TextTransformer:
from sklearn.base import BaseEstimator, TransformerMixin
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
class TextTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, column, max_features=5000):
        self.tfidfVectorizer = TfidfVectorizer(use_idf=False, stop_words='english',
                                               tokenizer=self._custom_tokenizer, analyzer='word',
                                               max_features=max_features)
        self._vectorizer = None
        self._column = column

    def _custom_tokenizer(self, string):
        # string = re.sub('^[\w]', '', string)
        tokens = nltk.word_tokenize(string)
        cleaned = [x if not x.isdigit() else '_NUM_' for x in tokens]
        return [str(x.encode('utf-8')) for x in cleaned if (x.isalpha() or x == '_NUM_')]

    def _clean_html_tags(self, content):
        return BeautifulSoup(content, 'lxml').text

    def fit(self, df):
        self._vectorizer = self.tfidfVectorizer.fit(df[self._column].apply(self._clean_html_tags))
        return self

    def transform(self, df):
        return self._vectorizer.transform(df[self._column]).todense()
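For reference, the transformer can be exercised on its own outside a pipeline; a minimal sketch, assuming df is the DataFrame loaded above:
t = TextTransformer('description', max_features=50)
dense = t.fit(df).transform(df)   # learn the vocabulary, then return a dense document-term matrix
print(dense.shape)                # (len(df), up to 50 columns)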
However, I cannot seem to get it right. It keeps throwing this exception in the IPython notebook:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-11-b3788282dc5c> in <module>()
8 ('clf', SVC()) # classifier
9 ])
---> 10 pipeline.fit(df[:,'interest_level'])
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
2057 return self._getitem_multilevel(key)
2058 else:
-> 2059 return self._getitem_column(key)
2060
2061 def _getitem_column(self, key):
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
2064 # get column
2065 if self.columns.is_unique:
-> 2066 return self._get_item_cache(key)
2067
2068 # duplicate columns & possible reduce dimensionality
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
1382 """Return the cached item, item represents a label indexer."""
1383 cache = self._item_cache
-> 1384 res = cache.get(item)
1385 if res is None:
1386 values = self._data.get(item)
TypeError: unhashable type
Description of data
description interest_level
10 A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ... medium
10000 low
100004 Top Top West Village location, beautiful Pre-w... high
100007 Building Amenities - Garage - Garden - fitness... low
100013 Beautifully renovated 3 bedroom flex 4 bedroom... low
interest_level would be my target variable.
Upvotes: 1
Views: 2382
Reputation: 412
A simple example of sklearn pipelines for all the feature types: categorical, nominal, ordinal, etc.
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# All data encoders
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
std = StandardScaler()

# To fill the missing values
imp_mean = SimpleImputer(strategy='most_frequent')
X_dense_data = imp_mean.fit_transform(X)
X = pd.DataFrame(X_dense_data, columns=X.columns.values.tolist())

# Column groups for each feature type
ohe_column_categorical_feature = ['race', 'sex', 'age group']
std_column_numeric_feature = ['height', 'weight', 'temperature', 'blood glucose']

# Numeric feature transformer
feature_numeric_transformer = Pipeline(steps=[
    ('scaler_data', std)
])

# Categorical feature transformer
categorical_feature_transformer = Pipeline(steps=[
    ('onehot', ohe)
])

# Column transformer to transform the values of each feature group
preprocessor_feature = ColumnTransformer(
    transformers=[
        ('num', feature_numeric_transformer, std_column_numeric_feature),
        ('cat', categorical_feature_transformer, ohe_column_categorical_feature)
    ], remainder='drop'
)
Make sure your data has no missing values. If it does, here is an example of filling them with sklearn's SimpleImputer.
The imputation strategy.
If “mean”, then replace missing values using the mean along each column. Can only be used with numeric data.
If “median”, then replace missing values using the median along each column. Can only be used with numeric data.
If “most_frequent”, then replace missing using the most frequent value along each column. Can be used with strings or numeric data.
If “constant”, then replace missing values with fill_value. Can be used with strings or numeric data.
# To fill the missing values
imp_mean = SimpleImputer(strategy='most_frequent')
X_dense_data = imp_mean.fit_transform(X)
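Note that the imputers can also live inside each sub-pipeline instead of being applied up front; that way they are refit only on the training split during cross-validation. A sketch of that variant (my assumption, not part of the answer above):
# Impute inside each sub-pipeline so the whole chain fits in one step
feature_numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),         # numeric: fill with the median
    ('scaler_data', std)
])
categorical_feature_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # categorical: fill with the mode
    ('onehot', ohe)
])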
Upvotes: 1
Reputation: 13700
Writing pipelines is much easier with decorators; see this example.
Your code would look something like this:
import re
import nltk
from bs4 import BeautifulSoup
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

@SKTransform
def clean_num(txt):
    return re.compile('\\d+').sub('_NUM_', txt)

@SKTransform
def clean_tags(content):
    return BeautifulSoup(content, 'lxml').text

ppl = Pipeline([clean_tags,
                clean_num,
                TfidfVectorizer(use_idf=False, stop_words='english', tokenizer=nltk.word_tokenize,
                                analyzer='word', max_features=max_features),
               ])
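SKTransform comes from the linked example. Roughly, such a decorator can be built from sklearn's own base classes; a minimal sketch (my approximation, not the linked implementation):
from sklearn.base import BaseEstimator, TransformerMixin

def SKTransform(func):
    # Wrap a plain text-cleaning function as a stateless sklearn transformer instance
    class FuncTransformer(BaseEstimator, TransformerMixin):
        def fit(self, X, y=None):
            return self                      # stateless: nothing to learn
        def transform(self, X):
            return [func(x) for x in X]      # apply the wrapped function element-wise
    return FuncTransformer()
Note that a vanilla sklearn Pipeline expects (name, transformer) tuples; sklearn.pipeline.make_pipeline accepts the bare instances shown above.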
Upvotes: 2
Reputation: 2023
You're fitting only one column (df[:, 'interest_level']), but then your first step (transformer a: TextTransformer) is trying to access the column 'description'.
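A sketch of the fix under that diagnosis: pass the full DataFrame as X and the target separately as y (TextTransformer.fit also needs a target parameter so the pipeline can call fit(X, y)):
# In TextTransformer, change the signature to: def fit(self, df, y=None)
pipeline.fit(df, df['interest_level'])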
Upvotes: 2