Reputation: 642
Below is the code for training a Naive Bayes classifier on the movie_reviews dataset with a unigram model. I want to train it and analyze its performance with bigram and trigram models as well. How can we do that?
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def create_word_features(words):
    """Build a bag-of-words feature dict for one document.

    Args:
        words: Iterable of word tokens from a single document.

    Returns:
        A dict mapping every token that is not in NLTK's English stopword
        list to ``True``.
    """
    # Fetch the stopword list once per call and keep it as a set. The
    # original re-evaluated stopwords.words("english") for every token and
    # searched the resulting list linearly.
    stoplist = set(stopwords.words("english"))
    return {word: True for word in words if word not in stoplist}
# Turn every review into a (features, label) pair, one list per polarity.
pos_data = [
    (create_word_features(movie_reviews.words(fileid)), "positive")
    for fileid in movie_reviews.fileids('pos')
]
neg_data = [
    (create_word_features(movie_reviews.words(fileid)), "negative")
    for fileid in movie_reviews.fileids('neg')
]

# The first 800 reviews of each class train the model; the rest are held out.
train_set = pos_data[:800] + neg_data[:800]
test_set = pos_data[800:] + neg_data[800:]

classifier = NaiveBayesClassifier.train(train_set)
accuracy = nltk.classify.util.accuracy(classifier, test_set)
Upvotes: 4
Views: 6292
Reputation: 1382
There is a shorter way to do this. You can use the CountVectorizer class
from the sklearn library with its n-gram range parameter (ngram_range):
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd  # needed for the DataFrame below; the original never imported it

corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# ngram_range=(2, 2) keeps bigrams only, using the built-in English stop-word list.
model = CountVectorizer(ngram_range=(2, 2), stop_words='english')
matrix = model.fit_transform(corpus).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when present and fall back to the old name on older releases.
if hasattr(model, "get_feature_names_out"):
    feature_names = model.get_feature_names_out()
else:
    feature_names = model.get_feature_names()

# One column per bigram, one row per document; transposed for display.
df_output = pd.DataFrame(data=matrix, columns=feature_names)
df_output.T.tail(5)
Result will be:
0 1 2 3
document second 0 1 0 0
second document 0 1 0 0
For more.
Upvotes: 0
Reputation: 122148
from nltk import ngrams
def create_ngram_features(words, n=2):
    """Return a feature dict with one ``True`` entry per distinct n-gram.

    Args:
        words: Tokens of one document, passed straight to ``ngrams``.
        n: Length of each n-gram; defaults to 2 (bigrams).

    Returns:
        A dict whose keys are the n-grams yielded by ``ngrams(words, n)``
        and whose values are all ``True``.
    """
    return {gram: True for gram in ngrams(words, n)}
BTW, your code will be a lot faster if you change your featurizer to use a set for your stopword list and initialize it only once.
# Built a single time at module level and shared by every call below.
stoplist = set(stopwords.words("english"))


def create_word_features(words):
    """Map each token of ``words`` that is not in ``stoplist`` to ``True``.

    Args:
        words: Iterable of word tokens from one document.

    Returns:
        A dict of ``{token: True}`` for the tokens that survive the
        stopword filter.
    """
    return {token: True for token in words if token not in stoplist}
Someone should really tell the NLTK people to convert the stopwords list into a set type since it's "technically" a unique list (i.e. a set).
>>> from nltk.corpus import stopwords
>>> type(stopwords.words('english'))
<class 'list'>
>>> type(set(stopwords.words('english')))
<class 'set'>
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ngrams
def create_ngram_features(words, n=2):
    """Build presence features from the n-grams of a token sequence.

    Args:
        words: Tokens of one document, handed to ``ngrams`` unchanged.
        n: Order of the n-grams to extract (2 by default).

    Returns:
        A dict with each n-gram from ``ngrams(words, n)`` as a key and
        ``True`` as its value.
    """
    # dict.fromkeys collapses repeated n-grams just like building the dict
    # from (key, True) pairs does.
    return dict.fromkeys(ngrams(words, n), True)
# Train and score one Naive Bayes model per n-gram order.
for n in (1, 2, 3, 4, 5):
    # Featurize both polarities with n-grams of the current order.
    pos_data = [
        (create_ngram_features(movie_reviews.words(fileid), n), "positive")
        for fileid in movie_reviews.fileids('pos')
    ]
    neg_data = [
        (create_ngram_features(movie_reviews.words(fileid), n), "negative")
        for fileid in movie_reviews.fileids('neg')
    ]

    # 800 reviews per class for training, the remainder for evaluation.
    train_set = pos_data[:800] + neg_data[:800]
    test_set = pos_data[800:] + neg_data[800:]

    classifier = NaiveBayesClassifier.train(train_set)
    accuracy = nltk.classify.util.accuracy(classifier, test_set)
    print(f'{n}-gram accuracy:', accuracy)
[out]:
1-gram accuracy: 0.735
2-gram accuracy: 0.7625
3-gram accuracy: 0.8275
4-gram accuracy: 0.8125
5-gram accuracy: 0.74
Your original code returns an accuracy of 0.725.
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import everygrams
def create_ngram_features(words, n=2):
    """Build presence features from every n-gram of length 1 through ``n``.

    Args:
        words: Tokens of one document, handed to ``everygrams`` unchanged.
        n: Maximum n-gram length passed to ``everygrams`` (2 by default).

    Returns:
        A dict with each item yielded by ``everygrams(words, 1, n)`` as a
        key and ``True`` as its value.
    """
    return dict.fromkeys(everygrams(words, 1, n), True)
# Train and score one model per maximum n-gram length (1 through 5).
for n in range(1, 6):
    # Featurize each corpus category in turn, positive reviews first.
    labelled = {}
    for category, label in (('pos', "positive"), ('neg', "negative")):
        labelled[category] = [
            (create_ngram_features(movie_reviews.words(fileid), n), label)
            for fileid in movie_reviews.fileids(category)
        ]
    pos_data = labelled['pos']
    neg_data = labelled['neg']

    # 800 reviews per class for training, the remainder for evaluation.
    train_set = pos_data[:800] + neg_data[:800]
    test_set = pos_data[800:] + neg_data[800:]

    classifier = NaiveBayesClassifier.train(train_set)
    accuracy = nltk.classify.util.accuracy(classifier, test_set)
    print('1-gram to', f'{n}-gram accuracy:', accuracy)
[out]:
1-gram to 1-gram accuracy: 0.735
1-gram to 2-gram accuracy: 0.7625
1-gram to 3-gram accuracy: 0.7875
1-gram to 4-gram accuracy: 0.8
1-gram to 5-gram accuracy: 0.82
Upvotes: 6