Reputation: 33
I am new to programming, but have looked at my code over and over and can't see any mistakes. I don't know how to proceed any more because this error pops up no matter what I try. I'll post the full code here.
Any help would be much appreciated, thank you!
import nltk
import random
from nltk.corpus import movie_reviews
import pickle
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
class VoteClassifier(ClassifierI):
    """Majority-vote ensemble over several already-trained classifiers.

    Inherits ClassifierI so it can be handed to nltk.classify.accuracy();
    note that ClassifierI dispatches classify()/classify_many() to each
    other, so classify() must stay overridden here.
    """

    def __init__(self, *classifiers):
        # Sub-classifiers are stored as given; each must expose classify().
        self._classifiers = classifiers

    def _votes(self, features):
        # One label vote per underlying classifier for a single featureset.
        # (Shared by classify() and confidence() to avoid duplicated loops.)
        return [c.classify(features) for c in self._classifiers]

    def classify(self, features):
        """Return the most common label voted by the sub-classifiers."""
        return mode(self._votes(features))

    def confidence(self, features):
        """Return the fraction of sub-classifiers agreeing with the winner."""
        votes = self._votes(features)
        return votes.count(mode(votes)) / len(votes)
# Pair each review's token list with its sentiment label, then shuffle so
# the train/test split below is not ordered all-neg-then-all-pos.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

# Frequency distribution of every (lower-cased) word in the corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# BUG FIX: FreqDist.keys() is NOT ordered by frequency on Python 3, so
# slicing keys() grabbed 3000 arbitrary words. most_common() yields the
# actual top-3000 most frequent words, which is what this feature set
# is meant to contain.
word_features = [word for word, _count in all_words.most_common(3000)]
def find_features(document, vocabulary=None):
    """Build a bag-of-words presence featureset for one document.

    Args:
        document: iterable of word tokens for a single review.
        vocabulary: iterable of feature words to test for; defaults to the
            module-level ``word_features`` list (top corpus words). Passing
            it explicitly makes the function reusable and testable.

    Returns:
        dict mapping each vocabulary word to True/False presence in
        *document*.
    """
    if vocabulary is None:
        vocabulary = word_features
    words = set(document)  # set gives O(1) membership tests
    return {w: (w in words) for w in vocabulary}
# One (featureset, label) pair per shuffled document.
featuresets = [(find_features(words), label) for words, label in documents]

# Hold out everything after the first 1900 examples for evaluation.
TRAIN_SIZE = 1900
training_set = featuresets[:TRAIN_SIZE]
testing_set = featuresets[TRAIN_SIZE:]
# classifier = nltk.NaiveBayesClassifier.train(training_set)

# Load the previously trained NaiveBayes model. A context manager closes the
# file handle even if pickle.load() raises (the original leaked it on error).
# SECURITY NOTE: pickle.load() can execute arbitrary code; only load pickle
# files you created yourself.
with open("naivebayes.pickle", "rb") as classifier_f:
    classifier = pickle.load(classifier_f)

print("Original NaiveBayes accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(10)
def _train_and_report(name, estimator):
    # Wrap a scikit-learn estimator for NLTK, train it on the shared
    # training_set, and print its held-out accuracy before returning it.
    wrapper = SklearnClassifier(estimator)
    wrapper.train(training_set)
    print(name + " accuracy percent:", (nltk.classify.accuracy(wrapper, testing_set))*100)
    return wrapper

MNB_classifier = _train_and_report("MNB_classifier", MultinomialNB())
BernoulliNB_classifier = _train_and_report("BernoulliNB_classifier", BernoulliNB())
LogisticRegression_classifier = _train_and_report("LogisticRegression_classifier", LogisticRegression())
SGDClassifier_classifier = _train_and_report("SGDClassifier_classifier", SGDClassifier())
##SVC_classifier = _train_and_report("SVC_classifier", SVC())
LinearSVC_classifier = _train_and_report("LinearSVC_classifier", LinearSVC())
NuSVC_classifier = _train_and_report("NuSVC_classifier", NuSVC())

# Majority-vote ensemble over the pre-trained NaiveBayes model plus every
# scikit-learn classifier trained above.
voted_classifier = VoteClassifier(classifier,
                                  NuSVC_classifier,
                                  LinearSVC_classifier,
                                  SGDClassifier_classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier)
print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)
This is the error:
Traceback (most recent call last):
File "code/test.py", line 109, in <module>
print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/classify/util.py", line 87, in accuracy
results = classifier.classify_many([fs for (fs, l) in gold])
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/classify/api.py", line 77, in classify_many
return [self.classify(fs) for fs in featuresets]
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/classify/api.py", line 56, in classify
raise NotImplementedError()
NotImplementedError
Upvotes: 1
Views: 500
Reputation: 122142
As noted in the comments, there's some spaghetti-like code in the ClassifierI API, where classify() and classify_many() dispatch to each other depending on which one is overridden. That might not be a bad thing, considering that ClassifierI is strongly tied to the NaiveBayesClassifier object.
But for the particular use in the OP, that spaghetti code isn't welcome.
Take a look at https://www.kaggle.com/alvations/sklearn-nltk-voteclassifier
From the traceback, the error starts from nltk.classify.util.accuracy() calling ClassifierI.classify().
ClassifierI.classify() is generally used to classify ONE document, and the input is a dictionary of features with binary values. ClassifierI.classify_many() is supposed to classify MULTIPLE documents, and the input is a list of feature dictionaries with binary values.
So the quick hack is to overwrite the accuracy() function so that the VotedClassifier won't be dependent on the ClassifierI definition of classify() vs classify_many(). That would also mean we don't inherit from ClassifierI. IMHO, if you don't need functions other than classify(), there's no need to inherit the baggage that ClassifierI might come with:
def my_accuracy(classifier, gold):
    """Fraction of (featureset, label) pairs in *gold* predicted correctly.

    Replacement for nltk.classify.util.accuracy() that calls the
    classifier's classify_documents() instead of ClassifierI.classify_many().

    Args:
        classifier: object exposing classify_documents(documents).
        gold: list of (featureset, expected_label) pairs.

    Returns:
        float accuracy in [0, 1]; 0 for an empty gold list.
    """
    # BUG FIX: guard BEFORE zip(*gold) -- unpacking zip(*[]) raises
    # ValueError, so the original's `if correct` fallback for empty input
    # was unreachable.
    if not gold:
        return 0
    documents, labels = zip(*gold)
    predictions = classifier.classify_documents(documents)
    correct = [y == y_hat for y, y_hat in zip(labels, predictions)]
    return sum(correct) / len(correct)
class VotraClassifier:
    """Majority-vote ensemble that deliberately avoids nltk's ClassifierI."""

    def __init__(self, *classifiers):
        # Each sub-classifier only needs to expose classify(featureset).
        self._classifiers = classifiers

    def classify_documents(self, documents):
        """Predict one label per featureset dict in *documents*."""
        return [self.classify_many(doc) for doc in documents]

    def classify_many(self, features):
        """Return the majority label the sub-classifiers assign to *features*."""
        ballot = [clf.classify(features) for clf in self._classifiers]
        return mode(ballot)

    def confidence(self, features):
        """Return the fraction of sub-classifiers that voted for the winner."""
        ballot = [clf.classify(features) for clf in self._classifiers]
        winner = mode(ballot)
        return ballot.count(winner) / len(ballot)
Now if we call the new my_accuracy()
with the new VotedClassifier
object:
# Build the ensemble and score it with my_accuracy() defined above.
# NOTE(review): nltk_nb, the *_classifier objects, and testing_set are
# assumed to be trained/defined earlier in the full script -- they are not
# defined in this snippet.
voted_classifier = VotraClassifier(nltk_nb,
NuSVC_classifier,
LinearSVC_classifier,
SGDClassifier_classifier,
MNB_classifier,
BernoulliNB_classifier,
LogisticRegression_classifier)
my_accuracy(voted_classifier, testing_set)
[out]:
0.86
Note: There's certain randomness when it comes to shuffling the document and then holding out a set to test for the classifier accuracy.
My suggestion is to do the following instead of a simple random.shuffle(documents):
Upvotes: 1