Chris T.

Reputation: 1801

Incorporating additional numeric features into text classification model

I modified some Python code from GitHub to run logistic regression on a subset of consumer complaints data; the text vectorization and classification parts work smoothly. I am wondering whether it's also possible to include non-text, binary numerical indicators, such as timely_response and consumer_disputed., as features alongside the text vectors. However, when I tried this, Python returned an error saying that I have input variables with inconsistent numbers of samples.

%% load packages and data
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
import nltk
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))
from bs4 import BeautifulSoup
from IPython.core.interactiveshell import InteractiveShell

df = pd.read_csv('https://www.dropbox.com/s/obbs000w7knjmys/example_complaints.csv?dl=1')
df = df[pd.notnull(df['consumer_complaint_narrative'])]

df['product'].value_counts()

%% cleaning text

REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwords from text
    return text
    
df['consumer_complaint_narrative'] = df['consumer_complaint_narrative'].apply(clean_text)

%% include only text as features

X = df['consumer_complaint_narrative']
y = df['product']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 42)

%% fit and test with logistic regression

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
my_tags = ['Debt collection','Mortgage','Credit reporting','Credit card','Bank account or service','Consumer Loan','Student loan','Payday loan','Money transfers','Other financial service','Prepaid card']

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred, labels=my_tags, target_names=my_tags))


%% including binary numerical indicators as additional features

new_X = df[['consumer_complaint_narrative', 'timely_response', 'consumer_disputed.']]
y = df['product']
X_train, X_test, y_train, y_test = train_test_split(new_X, y, test_size=0.25, random_state = 42)

%% fit and test again

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred, labels=my_tags, target_names=my_tags))

which returns the following error message:

    ---------------------------------------------------------------------------
    ValueError                                Traceback (most recent call last)
    <ipython-input-28-455c8fd83ba4> in <module>
          8                 ('clf', LogisticRegression(n_jobs=1, C=1e5)),
          9                ])
    ---> 10 logreg.fit(X_train, y_train)
         11 
         12 y_pred = logreg.predict(X_test)

    ~\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
        265         Xt, fit_params = self._fit(X, y, **fit_params)
        266         if self._final_estimator is not None:
    --> 267             self._final_estimator.fit(Xt, y, **fit_params)
        268         return self
        269 

    ~\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py in fit(self, X, y, sample_weight)
       1286 
       1287         X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype, order="C",
    -> 1288                          accept_large_sparse=solver != 'liblinear')
       1289         check_classification_targets(y)
       1290         self.classes_ = np.unique(y)

    ~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
        764         y = y.astype(np.float64)
        765 
    --> 766     check_consistent_length(X, y)
        767 
        768     return X, y

    ~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
        233     if len(uniques) > 1:
        234         raise ValueError("Found input variables with inconsistent numbers of"
    --> 235                          " samples: %r" % [int(l) for l in lengths])
        236 
        237 

    ValueError: Found input variables with inconsistent numbers of samples: [3, 529]

I would be really grateful if someone could shed some light on this.

Upvotes: 0

Views: 361

Answers (1)

jawsem

Reputation: 771

This has to do with fitting a pipeline with multiple steps on a multi-column input.

CountVectorizer() and TfidfTransformer() are meant for text data, but the additional fields you are adding are not text. When you pass the three-column DataFrame to the pipeline, CountVectorizer() iterates over it and sees the 3 column names rather than the 529 rows of text, which is exactly why the error reports inconsistent sample counts of [3, 529]. (This is the part I am talking about below.)


logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])

There are some other threads about adding other pandas columns to text data, though that approach seems somewhat cumbersome to me (see Adding pandas columns to a sparse matrix). Another way to keep everything in one estimator is sketched below.
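If your scikit-learn version has ColumnTransformer (0.20+), you can combine both kinds of features in a single pipeline: vectorize the text column and pass the binary columns through untouched. This is only a minimal sketch, assuming timely_response and consumer_disputed. already hold 0/1 values (map them to 0/1 first if they are Yes/No strings):

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

## vectorize the narrative column; forward the two binary columns unchanged
features = ColumnTransformer([
    ('text', TfidfVectorizer(), 'consumer_complaint_narrative'),  # a single column name gives the vectorizer the 1-D input it expects
    ('flags', 'passthrough', ['timely_response', 'consumer_disputed.']),
])

combined = Pipeline([
    ('features', features),
    ('clf', LogisticRegression(n_jobs=1, C=1e5)),
])

## X_train/X_test here are splits of the three-column new_X from your question
combined.fit(X_train, y_train)
y_pred = combined.predict(X_test)

That keeps one fit/predict object, so train_test_split can work on the raw DataFrame directly.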

Another option for integrating the additional data is to fit the logistic regression on the text data alone and then use that model's output as input to a second model alongside your additional features.

Here is a way you could do that.

X = df['consumer_complaint_narrative']
y = df['product']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 42)

%% fit and test with logistic regression

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
## use the first model's predicted class probabilities as features for the second model
add_features = pd.DataFrame(logreg.predict_proba(X))

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred, labels=my_tags, target_names=my_tags))


%% including binary numerical indicators as additional features

new_X = pd.concat([add_features,df[['timely_response', 'consumer_disputed.']]],axis=1)
y = df['product']
X_train, X_test, y_train, y_test = train_test_split(new_X, y, test_size=0.25, random_state = 42)

%% fit and test again
## no pipeline needed here; we can fit a single logistic regression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred, labels=my_tags, target_names=my_tags))

I tried to think of whether there could be some sort of over-fitting issue here, but I do not think so: since you are using the same random_state, your split is exactly the same as before, so we should be good.
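If you did want to rule that out, one option (not part of my code above, just a sketch) is to build add_features from out-of-fold probabilities with cross_val_predict, so the second model never sees scores the first model produced on its own training rows:

from sklearn.model_selection import cross_val_predict

## each row is scored by a fold model that never saw that row during fitting
oof_probs = cross_val_predict(logreg, X, y, cv=5, method='predict_proba')
add_features = pd.DataFrame(oof_probs)

Here logreg and X are the text pipeline and narrative column from the code above.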

After reviewing your comment, I changed my code to turn add_features into a DataFrame so it can be concatenated properly.

I think the other issue you are experiencing is caused by inconsistent indexes in your DataFrame: dropping the null narratives leaves gaps in df's index, while pd.DataFrame(logreg.predict_proba(X)) gets a fresh 0-based index, so pd.concat misaligns the rows.

An easy way to fix this is to simply reset your index at the very beginning of your script.

Look at the first two lines where you load your DataFrame.

df = pd.read_csv('https://www.dropbox.com/s/obbs000w7knjmys/example_complaints.csv?dl=1')
## add reset_index here; drop=True discards the old index instead of keeping it as a new column
df = df[pd.notnull(df['consumer_complaint_narrative'])].reset_index(drop=True)

Upvotes: 1
