Elaine
Elaine

Reputation: 69

Feature importance using SVM's coef_ function

I am working on a text classification project and trying to use SVC(kernel= 'linear') to get the feature importance. Here is my code:
(I changed the code from this post)

X = df1[features]
y = df1['label']


# Create selector class for text and numbers
class TextSelector(BaseEstimator, TransformerMixin):
    """Transformer to select a single column from the data frame to perform additional transformations on"""
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]

class NumberSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select subset of data at a provided key."""
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

scaler = StandardScaler()    
text = Pipeline([
                ('selector', TextSelector(key='title_mainText')),
                ('vect', TfidfVectorizer(ngram_range=(1, 2))),                
            ])

upper_title =  Pipeline([
                ('selector', NumberSelector(key='upper_title')),
                ('standard', scaler),
            ])

upper_mainText =  Pipeline([
                ('selector', NumberSelector(key='upper_mainText')),
                ('standard', scaler),
            ])

punct_title =  Pipeline([
                ('selector', NumberSelector(key='punct_title')),
                ('standard', scaler),
            ])

punct_mainText =  Pipeline([
                ('selector', NumberSelector(key='punct_mainText')),
                ('standard', scaler),
            ])


exclamations_title =  Pipeline([
                ('selector', NumberSelector(key='exclamations_title')),
                ('standard', scaler),
            ])


exclamations_text =  Pipeline([
                ('selector', NumberSelector(key='exclamations_text')),
                ('standard', scaler),
            ])


feats = FeatureUnion([('title_mainText', text), 
                      ('upper_title', upper_title),
                      ('upper_mainText', upper_mainText),
                      ('punct_title', punct_title),
                      ('punct_mainText', punct_mainText),                    
                      ('exclamations_text', exclamations_text),
                      ('exclamations_title', exclamations_title),                        

feature_processing = Pipeline([('feats', feats)])

pipeline = Pipeline([
        ('features', feats),
        ('classifier', SVC(C=1, kernel= 'linear', max_iter= 1000, tol=0.0001, probability=True))
    ])


    def f_importances(coef, names):
        imp = coef
        imp,names = zip(*sorted(zip(imp,names)))
        plt.barh(range(len(names)), imp, align='center')
        plt.yticks(range(len(names)), names)
        plt.show()

    features_names = ['title_mainText', 'upper_title', 'upper_mainText', 'punct_title', 'punct_mainText',
                      'exclamations_title', 'exclamations_text']
    pipeline.fit(X, y)
    clf = pipeline.named_steps['classifier']
    f_importances(clf.coef_, features_names)

However, it shows an error message, and I don't know where I have done wrong. Did someone have the experience with this before?

ValueError Traceback (most recent call last) in () 13 pipeline.fit(X, y) 14 clf = pipeline.named_steps['classifier'] ---> 15 f_importances((clf.coef_[0]), features_names) 16

in f_importances(coef, names) 5 imp = coef 6 imp,names = zip(*sorted(zip(imp,names))) ----> 7 plt.barh(range(len(names)), imp, align='center') 8 plt.yticks(range(len(names)), names) 9 plt.show()

/anaconda3/lib/python3.6/site-packages/matplotlib/pyplot.py in barh(*args, **kwargs) 2667 mplDeprecation)
2668 try: -> 2669 ret = ax.barh(*args, **kwargs) 2670 finally: 2671 ax._hold = washold

/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py in barh(self, *args, **kwargs) 2281
kwargs.setdefault('orientation', 'horizontal') 2282 patches = self.bar(x=left, height=height, width=width, -> 2283 bottom=y, **kwargs) 2284 return patches 2285

/anaconda3/lib/python3.6/site-packages/matplotlib/init.py in inner(ax, *args, **kwargs) 1715
warnings.warn(msg % (label_namer, func.name), 1716
RuntimeWarning, stacklevel=2) -> 1717 return func(ax, *args, **kwargs) 1718 pre_doc = inner.doc 1719 if pre_doc is None:

/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py in bar(self, *args, **kwargs) 2091 elif orientation == 'horizontal': 2092 r.sticky_edges.x.append(l) -> 2093 self.add_patch(r) 2094 patches.append(r) 2095

/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_base.py in add_patch(self, p) 1852 if p.get_clip_path() is None:
1853 p.set_clip_path(self.patch) -> 1854 self._update_patch_limits(p) 1855 self.patches.append(p) 1856 p._remove_method = lambda h: self.patches.remove(h)

/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_base.py in _update_patch_limits(self, patch) 1868 # or height. 1869 if (isinstance(patch, mpatches.Rectangle) and -> 1870 ((not patch.get_width()) and (not patch.get_height()))): 1871 return 1872
vertices = patch.get_path().vertices

/anaconda3/lib/python3.6/site-packages/scipy/sparse/base.py in bool(self) 286 return self.nnz != 0 287 else: --> 288 raise ValueError("The truth value of an array with more than one " 289 "element is ambiguous. Use a.any() or a.all().") 290 nonzero = bool

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all().

Thank you!

Upvotes: 1

Views: 2216

Answers (1)

KRKirov
KRKirov

Reputation: 4004

Scikit-Learn's documentation states that the coef_ attribute is an array of shape shape = [n_class * (n_class-1) / 2, n_features]. Assuming 4 classes and 9 features, _coef is of shape 6 x 9 (six rows and nine columns). barh on the other hand expects one value for each feature instead of six, hence you are getting an error. You can eliminate it if you sum the coefficients along each column as in the example below.

import numpy as np
import matplotlib.pyplot as plt

def f_importances(coef, names):
    imp = coef
    imp,names = zip(*sorted(zip(imp,names)))
    plt.barh(range(len(names)), imp, align='center')
    plt.yticks(range(len(names)), names)
    plt.show()

features_names = ['title_mainText', 'upper_title', 'upper_mainText', 'punct_title', 'punct_mainText',
                  'exclamations_title', 'exclamations_text', 'title_words_not_stopword', 'text_words_not_stopword']

n_classes = 4
n_features = len(features_names)

clf_coef_ = np.random.randint(1, 30, size=(int(0.5*n_classes*(n_classes-1)), n_features))

f_importances(clf_coef_.sum(axis=0), features_names)

enter image description here

Upvotes: 5

Related Questions