Reputation: 15
I used Logistic regression to create a model, later saved the model using joblib. Later i tried loading that model and predicting label in my test.csv . When ever i try this i get an error saying X has 1433445 features per sample; expecting 3797015
.
This is my initial code:-
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
#reading data
train=pd.read_csv('train_yesindia.csv')
test=pd.read_csv('test_yesindia.csv')
train=train.iloc[:,1:]
test=test.iloc[:,1:]
test.info()
train.info()
test['label']='t'
test=test.fillna(' ')
train=train.fillna(' ')
test['total']=test['title']+' '+test['author']+test['text']
train['total']=train['title']+' '+train['author']+train['text']
transformer = TfidfTransformer(smooth_idf=False)
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
counts = count_vectorizer.fit_transform(train['total'].values)
tfidf = transformer.fit_transform(counts)
targets = train['label'].values
test_counts = count_vectorizer.transform(test['total'].values)
test_tfidf = transformer.fit_transform(test_counts)
#split in samples
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf, targets, random_state=0)
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
print('Accuracy of Lasso classifier on training set: {:.2f}'
.format(logreg.score(X_train, y_train)))
print('Accuracy of Lasso classifier on test set: {:.2f}'
.format(logreg.score(X_test, y_test)))
targets = train['label'].values
logreg = LogisticRegression()
logreg.fit(counts, targets)
example_counts = count_vectorizer.transform(test['total'].values)
predictions = logreg.predict(example_counts)
pred=pd.DataFrame(predictions,columns=['label'])
pred['id']=test['id']
pred.groupby('label').count()
#dumping models
from joblib import dump, load
dump(logreg,'mypredmodel1.joblib')
Later i loaded model in a different code that is :-
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from joblib import dump, load
test=pd.read_csv('test_yesindia.csv')
test=test.iloc[:,1:]
test['label']='t'
test=test.fillna(' ')
test['total']=test['title']+' '+test['author']+test['text']
#check
transformer = TfidfTransformer(smooth_idf=False)
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
test_counts = count_vectorizer.fit_transform(test['total'].values)
test_tfidf = transformer.fit_transform(test_counts)
#check
#load_model
logreg = load('mypredmodel1.joblib')
example_counts = count_vectorizer.fit_transform(test['total'].values)
predictions = logreg.predict(example_counts)
When i run it, i get the error:
predictions = logreg.predict(example_counts)
Traceback (most recent call last):
File "<ipython-input-58-f28afd294d38>", line 1, in <module>
predictions = logreg.predict(example_counts)
File "C:\Users\adars\Anaconda3\lib\site-packages\sklearn\linear_model\base.py", line 289, in predict
scores = self.decision_function(X)
File "C:\Users\adars\Anaconda3\lib\site-packages\sklearn\linear_model\base.py", line 270, in decision_function
% (X.shape[1], n_features))
ValueError: X has 1433445 features per sample; expecting 3797015
Upvotes: 0
Views: 127
Reputation: 60321
Most probably, this is because you are re-fitting your transformers in the test set. This must not be done - you should also save them fitted in your training set, and use the test (or any other future) set only for transforming data.
This is easier done with pipelines.
So, remove the following code from your first block:
transformer = TfidfTransformer(smooth_idf=False)
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
counts = count_vectorizer.fit_transform(train['total'].values)
tfidf = transformer.fit_transform(counts)
targets = train['label'].values
test_counts = count_vectorizer.transform(test['total'].values)
test_tfidf = transformer.fit_transform(test_counts)
and replace it with:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
('counts', CountVectorizer(ngram_range=(1, 2)),
('tf-idf', TfidfTransformer(smooth_idf=False))
])
pipeline.fit(train['total'].values)
tfidf = pipeline.transform(train['total'].values)
targets = train['label'].values
test_tfidf = pipeline.transform(test['total'].values)
dump(pipeline, 'transform_predict.joblib')
Now, in your second code block, remove this part:
#check
transformer = TfidfTransformer(smooth_idf=False)
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
test_counts = count_vectorizer.fit_transform(test['total'].values)
test_tfidf = transformer.fit_transform(test_counts)
#check
and replace it with:
pipeline = load('transform_predict.joblib')
test_tfidf = pipeline.transform(test['total'].values)
And you should be fine, provided that you predict
the test_tfidf
variable, and not the example_counts
which are not transfomed by TF-IDF:
predictions = logreg.predict(test_tfidf)
Upvotes: 1