Reputation: 60
I'm a beginner trying to learn sklearn pipeline. I get a value error of ValueError: could not convert string to float
when I run my code below. I'm not sure what's the reason for it since OneHotEncoder shouldn't have any problem converting string to float for categorical variables
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
df = pd.read_csv('https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv', skipinitialspace=True)
x_cols = [c for c in df.columns if c!='income']
X = df[x_cols]
y = df['income']
y = LabelEncoder().fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)
preprocessor = ColumnTransformer(
transformers=[
('imputer', SimpleImputer(strategy='most_frequent'),['workclass','education','native-country']),
('onehot', OneHotEncoder(), ['workclass', 'education', 'marital-status',
'occupation', 'relationship', 'race', 'sex','native-country'])
]
)
clf = Pipeline([('preprocessor', preprocessor),
('classifier', RandomForestClassifier())])
clf.fit(X_train, y_train)
Upvotes: 3
Views: 1319
Reputation: 1482
Unfortunately, there is an issue with scikit-learn's SimpleImputer
when it tries to impute string variables. Here is a open issue about it on their github page.
To get around this, I'd recommend splitting up your pipeline into two steps. One for just the replacement of null values and 2) the rest, something like this:
cols_with_null = ['workclass','education','native-country']
preprocessor = ColumnTransformer(
transformers=[
(
'imputer',
SimpleImputer(missing_values=np.nan, strategy='most_frequent'),
cols_with_null),
])
preprocessor.fit(X_train)
X_train_new = preprocessor.transform(X_train)
for icol, col in enumerate(cols_with_null):
X_train.loc[:, col] = X_train_new[:, icol]
# confirm no null values in these columns:
for col in cols_with_null:
print('{}, null values: {}'.format(col, pd.isnull(X_train[col]).sum()))
Now that you have X_train
with no null values, the rest should work without SimpleImputer:
preprocessor = ColumnTransformer(
transformers=[
('onehot', OneHotEncoder(), ['workclass', 'education', 'marital-status',
'occupation', 'relationship', 'race', 'sex','native-country'])])
clf = Pipeline([('preprocessor', preprocessor),
('classifier', RandomForestClassifier())])
clf.fit(X_train, y_train)
Upvotes: 5