Reputation: 1801
I modified some Python code from GitHub to run logistic regression on a subset of consumer complaints data. With the following code, the text vectorization and classification parts work smoothly. However, I am wondering whether it is possible to also include non-text, binary numerical indicators, such as timely_response and consumer_disputed., as features (alongside the text vectors). When I tried this, Python returned an error saying that I have input variables with inconsistent numbers of samples.
%% load packages and data
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))
from bs4 import BeautifulSoup
from IPython.core.interactiveshell import InteractiveShell
# Load the example complaints and keep only the rows that have a narrative.
df = pd.read_csv(
    'https://www.dropbox.com/s/obbs000w7knjmys/example_complaints.csv?dl=1'
).dropna(subset=['consumer_complaint_narrative'])
# Quick look at the class balance: number of complaints per product.
df['product'].value_counts()
# %% cleaning text
# Raw string literals: the patterns contain \[, \] and \|, which are regex
# escapes but not valid *string* escapes, so a plain literal triggers an
# invalid-escape-sequence warning on current Python versions.
# Separator punctuation that clean_text() replaces with a space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Anything outside digits, lowercase letters, space, '#', '+', '_' is deleted.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))


def clean_text(text):
    """Normalise one complaint narrative before it is vectorised.

    text: a string, possibly containing HTML markup
    return: the lowercased text with markup removed, separator punctuation
        replaced by spaces, all remaining disallowed characters deleted and
        English stopwords dropped
    """
    # Strip markup / decode HTML, then lowercase in one go.
    lowered = BeautifulSoup(text, "lxml").text.lower()
    # Separators become spaces so the words on either side stay apart.
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    # Whatever is still outside the allowed character set is removed outright.
    stripped = BAD_SYMBOLS_RE.sub('', spaced)
    # Drop stopwords and re-join on single spaces.
    kept = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept)
df['consumer_complaint_narrative'] = df['consumer_complaint_narrative'].apply(clean_text)

# %% include only text as features
X = df['consumer_complaint_narrative']
y = df['product']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# %% fit and test with logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report

# Word counts -> TF-IDF weights -> logistic regression (C=1e5, i.e. very weak
# regularisation). Every step receives the single text column.
logreg = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(n_jobs=1, C=1e5)),
])
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# Still defined because later cells refer to it.
my_tags = ['Debt collection','Mortgage','Credit reporting','Credit card','Bank account or service','Consumer Loan','Student loan','Payday loan','Money transfers','Other financial service','Prepaid card']

# scikit-learn metrics take (y_true, y_pred) in that order.
print('accuracy %s' % accuracy_score(y_test, y_pred))
# target_names is deliberately not passed: the report rows follow the *sorted*
# labels found in the data, and my_tags is not in that order, so using it as
# display names would attach the wrong product name to each row. The labels
# are already readable strings.
print(classification_report(y_test, y_pred))
%% including binary numerical indicators as additional features
# new_X is a three-column DataFrame (the text column plus two indicator
# columns), whereas the text-only run passed a single text Series.
new_X = df[['consumer_complaint_narrative', 'timely_response', 'consumer_disputed.']]
y = df['product']
X_train, X_test, y_train, y_test = train_test_split(new_X, y, test_size=0.25, random_state = 42)
%% fit and test again
# NOTE: this pipeline is the same as the text-only one, so the whole DataFrame
# is handed to CountVectorizer. Iterating over a DataFrame yields its column
# labels rather than its rows, which is presumably why the vectorizer output
# has 3 samples while y_train has 529 (the "[3, 529]" in the reported error).
# To mix text and numeric columns, the text steps have to be applied to the
# narrative column only (e.g. via sklearn.compose.ColumnTransformer) and the
# indicator columns passed through alongside the resulting text features.
logreg = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', LogisticRegression(n_jobs=1, C=1e5)),
])
# This is the call that raises the ValueError reported for this cell.
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))
This returns the following error message:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-28-455c8fd83ba4> in <module>
8 ('clf', LogisticRegression(n_jobs=1, C=1e5)),
9 ])
---> 10 logreg.fit(X_train, y_train)
11
12 y_pred = logreg.predict(X_test)
~\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
265 Xt, fit_params = self._fit(X, y, **fit_params)
266 if self._final_estimator is not None:
--> 267 self._final_estimator.fit(Xt, y, **fit_params)
268 return self
269
~\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py in fit(self, X, y, sample_weight)
1286
1287 X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype, order="C",
-> 1288 accept_large_sparse=solver != 'liblinear')
1289 check_classification_targets(y)
1290 self.classes_ = np.unique(y)
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
764 y = y.astype(np.float64)
765
--> 766 check_consistent_length(X, y)
767
768 return X, y
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
233 if len(uniques) > 1:
234 raise ValueError("Found input variables with inconsistent numbers of"
--> 235 " samples: %r" % [int(l) for l in lengths])
236
237
ValueError: Found input variables with inconsistent numbers of samples: [3, 529]
I would be really grateful if someone could shed some light on this.
Upvotes: 0
Views: 361
Reputation: 771
This has to do with fitting the pipeline with multiple steps.
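CountVectorizer() and TfidfTransformer() are meant for text data, but the additional fields you are adding are not text. When the pipeline receives the three-column DataFrame, the vectorizer iterates over it and sees only the three column names, which is why the error reports 3 samples against 529 labels. (This is the part of your code I am talking about:)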
# The text-only steps from the question: both transformers expect one text
# document per sample, so they cannot consume the extra numeric columns.
logreg = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(n_jobs=1, C=1e5)),
])
There are some other threads about adding other pandas columns to text features; however, that approach seems somewhat cumbersome to me (see, for reference, "Adding pandas columns to a sparse matrix").
Another option to integrate the additional data is simply to model your logistic regression using the text data and use the output of that regression in another model with your additional features.
Here is a way you could do that.
# %% stage 1: text-only model
X = df['consumer_complaint_narrative']
y = df['product']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# %% fit and test with logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report

logreg = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(n_jobs=1, C=1e5)),
])
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

## get the output of your model for the new model
# Build the probability frame on X's own index so that the column-wise concat
# below lines up row-for-row with df even when df's index has gaps (rows with
# a missing narrative were dropped). String column names keep every column
# label of the combined frame the same type.
# NOTE(review): the probabilities for the training rows come from a model that
# was fitted on those same rows; out-of-fold predictions (cross_val_predict)
# would be the stricter way to build stage-2 features.
add_features = pd.DataFrame(
    logreg.predict_proba(X),
    index=X.index,
    columns=['proba_%s' % label for label in logreg.named_steps['clf'].classes_],
)

# scikit-learn metrics take (y_true, y_pred) in that order.
print('accuracy %s' % accuracy_score(y_test, y_pred))
# target_names is not passed: report rows follow the sorted labels in the
# data, and my_tags is not in that order, so it would mislabel the rows.
print(classification_report(y_test, y_pred))

# %% stage 2: including binary numerical indicators as additional features
new_X = pd.concat([add_features, df[['timely_response', 'consumer_disputed.']]], axis=1)
y = df['product']
# Same row count, row order and random_state as stage 1, so the same rows end
# up in the test set.
X_train, X_test, y_train, y_test = train_test_split(new_X, y, test_size=0.25, random_state=42)

# %% fit and test again
## Do not need pipeline can fit a single logistic regression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
I tried to think about whether this could introduce some sort of over-fitting issue, but I do not think so: since you are using the same random_state, your split should be exactly the same as before, so the rows used to test the second model were also held out from the first one.
After reviewing your comment, I changed my code to turn add_features into a DataFrame so that it can be concatenated properly.
I think the other issue you are experiencing arises because the indexes in your DataFrames are inconsistent.
An easy way to fix this is to reset the index at the very beginning of your script.
Look at the first two lines, where you load your DataFrame:
df = pd.read_csv('https://www.dropbox.com/s/obbs000w7knjmys/example_complaints.csv?dl=1')
## add reset_index here
# drop=True discards the old, gappy index instead of keeping it as an extra
# 'index' column; the rows are renumbered 0..n-1 after the filter.
df = df[pd.notnull(df['consumer_complaint_narrative'])].reset_index(drop=True)
Upvotes: 1