Reputation: 3811
I have a dataframe, X_train. I am trying to create the below:
Imputer
It fills NAN values with median for numeric variables.
It fills NAN values with the most frequent value for non-numeric variables.
X_train before imputer(code to generate)
import pandas as pd
import numpy as np
# Sample frame; the string 'NAN' marks the missing entries.
X_train = pd.DataFrame({'Default': [1,0,0,0,0,0,1],'Income': [250000,400000,'NAN',440000,500000,700000,800000],'Age': [20,30, 40,35,25,40,'NAN'],'Name':['Allen','Sara','Lily','Rock','David','Rose','Mat'],'Gender':['M','F','F','M','M','F','M'],'Type of job': ['Skilled','Unskilled','Super skilled','Super skilled','NAN','Skilled','Skilled'],'Amt of credit':['NAN',30000,50000,80000,40000,100000,300000],'Years employed':[1,10,12,6,4,13,12]})
# Turn the 'NAN' placeholders into real missing values.
X_train=X_train.replace('NAN',np.NaN)
import pandas as pd
# Column labels selected by dtype; these are Index objects, not data.
X_train_numeric=X_train.select_dtypes(include=['int', 'float']).columns
# NOTE: .drop('Name') only removes the label from this Index, so 'Name' is left out of
# the 'cat' imputer below; the column itself is still present in X_train.
X_train_non_numeric=X_train.select_dtypes(exclude=['int', 'float']).columns.drop('Name')
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
# Median for the numeric columns, most frequent value for the non-numeric ones.
t = [('num', SimpleImputer(strategy='median'), X_train_numeric),
('cat', SimpleImputer(strategy='most_frequent'), X_train_non_numeric)]
# NOTE: remainder='passthrough' keeps every column that is not listed in `t` -- here
# that is 'Name' -- so it still shows up in the transformed output.
transformer = ColumnTransformer(transformers=t, remainder='passthrough')
X_train = transformer.fit_transform(X_train) #numpy array
#code used to change numpy array to pandas
# Row and column labels become 1-based integers; the original column names are not restored.
X_train = pd.DataFrame(X_train, index=range(1, X_train.shape[0] + 1),
columns=range(1, X_train.shape[1] + 1))
X_train after imputer
Expected output
The Name column should be removed from X_train,
but it is not dropped in the final outcome despite calling drop('Name')
at the end of X_train_non_numeric
.Upvotes: 2
Views: 410
Reputation: 4215
Try:
# Fill numeric gaps with the column median, then fill whatever is still missing
# (the non-numeric columns) with each column's most frequent value, and finally
# discard the 'Name' column.
# numeric_only=True is needed on pandas >= 2.0, where median() raises on
# object columns instead of silently skipping them.
X_train = (
    X_train.fillna(X_train.median(numeric_only=True))
    .apply(lambda x: x.fillna(x.value_counts().index[0]))
    .drop('Name', axis=1)
)
Default Income Age Gender Type of job Amt of credit Years employed
0 1 250000.0 20.0 M Skilled 65000.0 1
1 0 400000.0 30.0 F Unskilled 30000.0 10
2 0 470000.0 40.0 F Super skilled 50000.0 12
3 0 440000.0 35.0 M Super skilled 80000.0 6
4 0 500000.0 25.0 M Skilled 40000.0 4
5 0 700000.0 40.0 F Skilled 100000.0 13
6 1 800000.0 32.5 M Skilled 300000.0 12
Or your existing code edited:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Split the column labels by dtype: numeric columns get the median,
# everything else (including 'Name') gets the most frequent value.
X_train_numeric = X_train.select_dtypes(include=['number']).columns
X_train_non_numeric = X_train.select_dtypes(exclude=['number']).columns
# The transformer emits its columns in transformer order (numeric first),
# so the labels for the rebuilt frame follow that same order.
my_cols = [*X_train_numeric, *X_train_non_numeric]

t = [
    ('num', SimpleImputer(strategy='median'), X_train_numeric),
    ('cat', SimpleImputer(strategy='most_frequent'), X_train_non_numeric),
]
transformer = ColumnTransformer(transformers=t, remainder='passthrough')
X_train = transformer.fit_transform(X_train)  # numpy array

# Back to a DataFrame with a 1-based row index, then discard 'Name'.
X_train = pd.DataFrame(
    X_train,
    index=range(1, X_train.shape[0] + 1),
    columns=my_cols,
)
X_train = X_train.drop('Name', axis=1)
Default Income Age Amt of credit Years employed Gender Type of job
1 1 250000 20 65000 1 M Skilled
2 0 400000 30 30000 10 F Unskilled
3 0 470000 40 50000 12 F Super skilled
4 0 440000 35 80000 6 M Super skilled
5 0 500000 25 40000 4 M Skilled
6 0 700000 40 100000 13 F Skilled
7 1 800000 32.5 300000 12 M Skilled
Upvotes: 1
Reputation: 863256
Remove the column with DataFrame.drop
, then first replace missing values in the numeric columns with DataFrame.median
(non-numeric columns are omitted) and then replace the remaining ones with the first row of DataFrame.mode
:
# First fill the numeric columns with their medians, then fill the remaining
# gaps with the first row of mode(), computed without the 'Name' column.
# numeric_only=True is needed on pandas >= 2.0, where median() raises on
# object columns instead of silently skipping them.
X_train = X_train.fillna(X_train.median(numeric_only=True)).fillna(
    X_train.drop('Name', axis=1).mode().iloc[0]
)
print(X_train)
Default Income Age Name Gender Type of job Amt of credit \
0 1 250000.0 20.0 Allen M Skilled 65000.0
1 0 400000.0 30.0 Sara F Unskilled 30000.0
2 0 470000.0 40.0 Lily F Super skilled 50000.0
3 0 440000.0 35.0 Rock M Super skilled 80000.0
4 0 500000.0 25.0 David M Skilled 40000.0
5 0 700000.0 40.0 Rose F Skilled 100000.0
6 1 800000.0 32.5 Mat M Skilled 300000.0
Years employed
0 1
1 10
2 12
3 6
4 4
5 13
6 12
Detail:
# numeric_only=True restricts the median to numeric columns; pandas >= 2.0
# raises on the object columns otherwise.
print(X_train.median(numeric_only=True))
Default 0.0
Income 470000.0
Age 32.5
Amt of credit 65000.0
Years employed 10.0
dtype: float64
Another idea is to create a Series
of fill values for all columns except the Name
column (the mode for the non-numeric columns, the median for the numeric ones) and pass it to DataFrame.fillna
:
# Build one Series of fill values indexed by column name: the most frequent
# value for each object column (without 'Name') followed by the median of each
# numeric column.
# pd.concat replaces Series.append, which was removed in pandas 2.0;
# numeric_only=True keeps median() from raising on the object columns.
s = pd.concat([
    X_train.drop('Name', axis=1).select_dtypes(object).mode().iloc[0],
    X_train.median(numeric_only=True),
])
print(s)
Gender M
Type of job Skilled
Default 0
Income 470000
Age 32.5
Amt of credit 65000
Years employed 10
dtype: object
# Fill each column's missing entries with the value stored under that
# column's name in `s`; columns without an entry in `s` are left untouched.
X_train = X_train.fillna(value=s)
print(X_train)
Default Income Age Name Gender Type of job Amt of credit \
0 1 250000.0 20.0 Allen M Skilled 65000.0
1 0 400000.0 30.0 Sara F Unskilled 30000.0
2 0 470000.0 40.0 Lily F Super skilled 50000.0
3 0 440000.0 35.0 Rock M Super skilled 80000.0
4 0 500000.0 25.0 David M Skilled 40000.0
5 0 700000.0 40.0 Rose F Skilled 100000.0
6 1 800000.0 32.5 Mat M Skilled 300000.0
Years employed
0 1
1 10
2 12
3 6
4 4
5 13
6 12
Your solution should be changed:
# Turn the 'NAN' placeholders into real missing values. np.nan is the spelling
# that still exists in NumPy 2.0 (np.NaN was removed). infer_objects() restores
# numeric dtypes on pandas versions where replace() no longer downcasts
# object columns by itself.
X_train = X_train.replace('NAN', np.nan).infer_objects()
# Remove the 'Name' column up front so the transformer never sees it.
X_train = X_train.drop('Name', axis=1)
# Original order of columns
cols = X_train.columns
# 'number' selects every numeric dtype regardless of platform integer width.
X_train_numeric = X_train.select_dtypes(include=['number']).columns
X_train_non_numeric = X_train.select_dtypes(exclude=['number']).columns
# Column order produced by the transformer: numeric first, then non-numeric.
new = X_train_numeric.tolist() + X_train_non_numeric.tolist()
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
# Median for the numeric columns, most frequent value for the others.
t = [('num', SimpleImputer(strategy='median'), X_train_numeric),
     ('cat', SimpleImputer(strategy='most_frequent'), X_train_non_numeric)]
transformer = ColumnTransformer(transformers=t, remainder='passthrough')
X_train = transformer.fit_transform(X_train)  # numpy array
# Rebuild the DataFrame with the transformer's column order, then reindex
# back to the original column order.
X_train = pd.DataFrame(X_train, columns=new).reindex(cols, axis=1)
print(X_train)
Default Income Age Gender Type of job Amt of credit Years employed
0 1 250000 20 M Skilled 65000 1
1 0 400000 30 F Unskilled 30000 10
2 0 470000 40 F Super skilled 50000 12
3 0 440000 35 M Super skilled 80000 6
4 0 500000 25 M Skilled 40000 4
5 0 700000 40 F Skilled 100000 13
6 1 800000 32.5 M Skilled 300000 12
Upvotes: 2