Chris T.

Reputation: 1801

Incorporating additional numeric features into text classification model

I modified some Python code from GitHub to run logistic regression on a subset of consumer complaints data; the text vectorization and classification parts work smoothly. I am wondering whether it's also possible to include non-text, binary numerical indicators, such as timely_response and consumer_disputed., as features alongside the text vectors. However, when I tried this, Python returned an error saying that I have input variables with inconsistent numbers of samples.

%% load packages and data
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
import nltk
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))
from bs4 import BeautifulSoup
from IPython.core.interactiveshell import InteractiveShell

df = pd.read_csv('https://www.dropbox.com/s/obbs000w7knjmys/example_complaints.csv?dl=1')
df = df[pd.notnull(df['consumer_complaint_narrative'])]

df['product'].value_counts()

%% cleaning text

REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwords from text
    return text
    
df['consumer_complaint_narrative'] = df['consumer_complaint_narrative'].apply(clean_text)

%% include only text as features

X = df['consumer_complaint_narrative']
y = df['product']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 42)

%% fit and test with logistic regression

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
my_tags = ['Debt collection','Mortgage','Credit reporting','Credit card','Bank account or service','Consumer Loan','Student loan','Payday loan','Money transfers','Other financial service','Prepaid card']

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred, labels=my_tags, target_names=my_tags))


%% including binary numerical indicators as additional features

new_X = df[['consumer_complaint_narrative', 'timely_response', 'consumer_disputed.']]
y = df['product']
X_train, X_test, y_train, y_test = train_test_split(new_X, y, test_size=0.25, random_state = 42)

%% fit and test again

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred, labels=my_tags, target_names=my_tags))

which returns the following error message:

    ---------------------------------------------------------------------------
    ValueError                                Traceback (most recent call last)
    <ipython-input-28-455c8fd83ba4> in <module>
          8                 ('clf', LogisticRegression(n_jobs=1, C=1e5)),
          9                ])
    ---> 10 logreg.fit(X_train, y_train)
         11 
         12 y_pred = logreg.predict(X_test)

    ~\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
        265         Xt, fit_params = self._fit(X, y, **fit_params)
        266         if self._final_estimator is not None:
    --> 267             self._final_estimator.fit(Xt, y, **fit_params)
        268         return self
        269 

    ~\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py in fit(self, X, y, sample_weight)
       1286 
       1287         X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype, order="C",
    -> 1288                          accept_large_sparse=solver != 'liblinear')
       1289         check_classification_targets(y)
       1290         self.classes_ = np.unique(y)

    ~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
        764         y = y.astype(np.float64)
        765 
    --> 766     check_consistent_length(X, y)
        767 
        768     return X, y

    ~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
        233     if len(uniques) > 1:
        234         raise ValueError("Found input variables with inconsistent numbers of"
    --> 235                          " samples: %r" % [int(l) for l in lengths])
        236 
        237 

    ValueError: Found input variables with inconsistent numbers of samples: [3, 529]

I would be really grateful if someone could shed some light on this.

Upvotes: 0

Views: 361

Answers (1)

jawsem

Reputation: 771

This has to do with fitting a pipeline with multiple steps on a multi-column input.

CountVectorizer() and TfidfTransformer() are meant for text data, but the additional fields you are adding are not text. When you pass the three-column DataFrame to the pipeline, CountVectorizer() iterates over it and sees the 3 column names rather than the 529 rows of text, which is exactly why the error reports inconsistent sample counts of [3, 529]. (This is the part I am talking about below.)


logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])

There are some other threads about adding other pandas columns to text data, though that approach seems somewhat cumbersome to me (see Adding pandas columns to a sparse matrix). Another way to keep everything in one estimator is sketched below.
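If your scikit-learn version has ColumnTransformer (0.20+), you can combine both kinds of features in a single pipeline: vectorize the text column and pass the binary columns through untouched. This is only a minimal sketch, assuming timely_response and consumer_disputed. already hold 0/1 values (map them to 0/1 first if they are Yes/No strings):

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

## vectorize the narrative column; forward the two binary columns unchanged
features = ColumnTransformer([
    ('text', TfidfVectorizer(), 'consumer_complaint_narrative'),  # a single column name gives the vectorizer the 1-D input it expects
    ('flags', 'passthrough', ['timely_response', 'consumer_disputed.']),
])

combined = Pipeline([
    ('features', features),
    ('clf', LogisticRegression(n_jobs=1, C=1e5)),
])

## X_train/X_test here are splits of the three-column new_X from your question
combined.fit(X_train, y_train)
y_pred = combined.predict(X_test)

That keeps one fit/predict object, so train_test_split can work on the raw DataFrame directly.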

Another option for integrating the additional data is to fit the logistic regression on the text data alone and then use that model's output as input to a second model alongside your additional features.

Here is a way you could do that.

X = df['consumer_complaint_narrative']
y = df['product']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 42)

%% fit and test with logistic regression

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
## use the first model's predicted class probabilities as features for the second model
add_features = pd.DataFrame(logreg.predict_proba(X))

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred, labels=my_tags, target_names=my_tags))


%% including binary numerical indicators as additional features

new_X = pd.concat([add_features,df[['timely_response', 'consumer_disputed.']]],axis=1)
y = df['product']
X_train, X_test, y_train, y_test = train_test_split(new_X, y, test_size=0.25, random_state = 42)

%% fit and test again
## no pipeline needed here; we can fit a single logistic regression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred, labels=my_tags, target_names=my_tags))

I tried to think of whether there could be some sort of over-fitting issue here, but I do not think so: since you are using the same random_state, your split is exactly the same as before, so we should be good.
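If you did want to rule that out, one option (not part of my code above, just a sketch) is to build add_features from out-of-fold probabilities with cross_val_predict, so the second model never sees scores the first model produced on its own training rows:

from sklearn.model_selection import cross_val_predict

## each row is scored by a fold model that never saw that row during fitting
oof_probs = cross_val_predict(logreg, X, y, cv=5, method='predict_proba')
add_features = pd.DataFrame(oof_probs)

Here logreg and X are the text pipeline and narrative column from the code above.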

After reviewing your comment, I changed my code to turn add_features into a DataFrame so it can be concatenated properly.

I think the other issue you are experiencing is caused by inconsistent indexes in your DataFrame: dropping the null narratives leaves gaps in df's index, while pd.DataFrame(logreg.predict_proba(X)) gets a fresh 0-based index, so pd.concat misaligns the rows.

An easy way to fix this is to simply reset your index at the very beginning of your script.

Look at the first two lines where you load your DataFrame.

df = pd.read_csv('https://www.dropbox.com/s/obbs000w7knjmys/example_complaints.csv?dl=1')
## add reset_index here; drop=True discards the old index instead of keeping it as a new column
df = df[pd.notnull(df['consumer_complaint_narrative'])].reset_index(drop=True)

Upvotes: 1
