Reputation: 7281
I have a dict data structure where the key is a machine learning classifier's name and the value is a pandas dataframe of that classifier's feature importances. For example:
for k, v in clf_importances.items():
    print("Classifier: {} | Top 3 Features: {}".format(k, v.head(n=3)))
Yields:
Classifier: XGBClassifier | Top 3 Features: importance
feature
LIMIT_BAL 0.024073
PAY_AMT3 0.025030
BILL_AMT1 0.025860
Classifier: LGBMClassifier | Top 3 Features: importance
feature
PAY_AMT5 155
BILL_AMT3 162
PAY_AMT6 179
Their types are:
print("Key Type: {} | Value Type: {}".format(type(k), type(v)))
Key Type: <class 'str'> | Value Type: <class 'pandas.core.frame.DataFrame'>
What I am looking to do is construct a final_df with columns classifier, feature_1, feature_2, ..., feature_n, where each value is that classifier's importance for the feature (sometimes it is 0).
Ideally, I would get a dataframe that looks like:
| Classifier | Feature_1 | Feature_2 | Feature_3 | Feature_4 | Feature_5 | …n |
|:----------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---:|
| A | 0.062 | 0.298 | 0.000 | 0.215 | 0.000 | foo |
| B | 0.001 | 0.000 | 0.005 | 0.121 | 0.314 | foo |
| C | 0.005 | 0.054 | 0.015 | 0.000 | 0.587 | foo |
| D | 0.315 | 0.547 | 0.870 | 0.003 | 0.000 | foo |
| …n | foo | foo | foo | foo | foo | foo |
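For reference, a minimal, self-contained stand-in for that dict (built from the example output above) would be:
import pandas as pd
# Hypothetical stand-in for clf_importances: {classifier_name: importances_df}
clf_importances = {
    'XGBClassifier': pd.DataFrame(
        {'importance': [0.024073, 0.025030]},
        index=pd.Index(['LIMIT_BAL', 'PAY_AMT3'], name='feature')),
    'LGBMClassifier': pd.DataFrame(
        {'importance': [155, 162]},
        index=pd.Index(['PAY_AMT5', 'BILL_AMT3'], name='feature')),
}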
The script I used to generate that dict is below:
# Libraries Used
import pandas as pd
import numpy as np
# Data Manipulation
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
# Metrics
from sklearn.metrics import accuracy_score
# Classifiers Used
# https://www.kaggle.com/grfiv4/plotting-feature-importances
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
# Graphing Libraries
import matplotlib.pyplot as plt
# Other Configuration Settings
import warnings
warnings.filterwarnings('ignore')
# Read in the dataset
df = pd.read_csv('credit.csv')
# Remove nan values first, so features and labels stay row-aligned
df.dropna(inplace=True)
# Take labels
labels = df['class']
# Drop that column from the dataset
df.drop('class', axis=1, inplace=True)
# Print new size
print(df.size)
# Scale the dataset between 0 and 1
scaler = MinMaxScaler()
data = pd.DataFrame(scaler.fit_transform(df.values), columns=df.columns, index=df.index)
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.33, random_state=np.random.randint(1,100))
# Instantiate a list of classifiers
clfs = [XGBClassifier(), LGBMClassifier(),
        ExtraTreesClassifier(), ExtraTreeClassifier(),
        AdaBoostClassifier(), DecisionTreeClassifier(),
        GradientBoostingClassifier(), RandomForestClassifier()]
clf_accuracy = {}
clf_importances = {}
for clf in clfs:
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    clf_accuracy[clf.__class__.__name__] = accuracy
    title = "Top 10 Feature Importances For {}".format(clf.__class__.__name__)
    temp_df = pd.DataFrame({'importance': clf.feature_importances_})
    temp_df['feature'] = X_train.columns
    # Sort descending to pick the top features, then ascending for the barh plot
    temp_df.sort_values(by='importance', ascending=False, inplace=True)
    #temp_df = temp_df.head(n=10)
    temp_df.sort_values(by='importance', inplace=True)
    temp_df = temp_df.set_index('feature', drop=True)
    clf_importances[clf.__class__.__name__] = temp_df
    print("{} had an accuracy of: {:.2%}".format(clf.__class__.__name__, accuracy))
    temp_df.plot.barh(title=title, figsize=(8, 11))
for k, v in clf_importances.items():
    print("Classifier: {} | Top 3 Features: {}".format(k, v.head(n=3)))
    print("Key Type: {} | Value Type: {}".format(type(k), type(v)))
How can I transpose this dict of dataframes into one dataframe?
Upvotes: 1
Views: 197
Reputation: 16966
pd.concat would solve your problem. Try this:
pd.concat(list(clf_importances.values()), axis=1).T
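What this does: pd.concat with axis=1 aligns every importance frame on the shared feature index, giving one column per classifier, and .T then flips the result so each classifier becomes a row and each feature a column. A minimal sketch with two made-up frames:
import pandas as pd
# Two toy importance frames sharing a feature index (values are made up)
a = pd.DataFrame({'imp_A': [0.1, 0.9]}, index=['f1', 'f2'])
b = pd.DataFrame({'imp_B': [0.4, 0.6]}, index=['f1', 'f2'])
print(pd.concat([a, b], axis=1).T)
#         f1   f2
# imp_A  0.1  0.9
# imp_B  0.4  0.6
Features that are missing from one frame come through as NaN after the alignment, so you can chain .fillna(0) to get the zeros from your desired output.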
Reproducible example:
# Libraries Used
import pandas as pd
import numpy as np
# Data Manipulation
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
# Classifiers Used
# https://www.kaggle.com/grfiv4/plotting-feature-importances
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
# Graphing Libraries
import matplotlib.pyplot as plt
# Other Configuration Settings
import warnings
warnings.filterwarnings('ignore')
# Generate a synthetic dataset
data, labels = make_classification(random_state=42)
data = pd.DataFrame(
    data, columns=[f'feature_{i+1}' for i in range(data.shape[1])])
X_train, X_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.33, random_state=np.random.randint(1, 100))
# Instantiate a list of classifiers
clfs = [ExtraTreeClassifier(), AdaBoostClassifier(),
        DecisionTreeClassifier(), GradientBoostingClassifier(),
        RandomForestClassifier()]
f, ax = plt.subplots(1, len(clfs), figsize=(20,10), sharey=True)
clf_accuracy = {}
clf_importances = {}
for ind, clf in enumerate(clfs):
    clf_name = clf.__class__.__name__
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    clf_accuracy[clf_name] = clf.score(X_test, y_test)
    title = f'{clf_name}'
    temp_df = pd.DataFrame({f'imp_{clf_name}': clf.feature_importances_})
    temp_df['feature'] = X_train.columns
    temp_df = temp_df.set_index('feature', drop=True)
    clf_importances[clf_name] = temp_df
    print("{} had an accuracy of: {:.2%}".format(
        clf_name, clf_accuracy[clf_name]))
    temp_df.plot.barh(title=title, ax=ax[ind])
for k, v in clf_importances.items():
    print("Classifier: {} | Top 3 Features: {}".format(k, v.head(n=3)))
    print("Key Type: {} | Value Type: {}".format(type(k), type(v)))
plt.show()
pd.concat(list(clf_importances.values()), axis=1).T
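One caveat if you apply this to your original dict, where every frame's single column is named importance: the transposed rows would all be labelled importance and the classifier names would be lost. Passing the dict itself keeps the keys as labels; a sketch (untested against your exact data):
# Keys become column labels, so after .T each row is a classifier
pd.concat({k: v['importance'] for k, v in clf_importances.items()}, axis=1).T.fillna(0)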
Upvotes: 1