Reputation: 1
I am trying to implement multinomial naive Bayes from scratch, without using sklearn's MultinomialNB, but I could not get it to work. When I fit my MultinomialNB classifier to the training set, I run into a problem. Here is what I have so far.
First I build the bag-of-words model and split the data into training and test sets.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split #it was "sklearn.cross_validation" but now it changed
# Raw documents and their labels; `corpus` and `dataset` are defined outside this snippet.
X = corpus
y = dataset.id_sentimen

# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Learn the unigram + bigram TF-IDF vocabulary on the training documents only,
# then reuse that same vocabulary to encode the test documents.
vect = TfidfVectorizer(analyzer='word', ngram_range=(1, 2))
X_train = vect.fit_transform(X_train).toarray()
# Densify the test matrix too: the hand-written classifier walks the rows with
# NumPy, which does not work on a SciPy sparse matrix. The original left X_test
# sparse while X_train was dense.
# NOTE(review): dense (1, 2)-gram matrices can be large — watch memory usage.
X_test = vect.transform(X_test).toarray()
Here is my MultinomialNB code, written without using the sklearn library:
class MultinomialNB():
    """Naive Bayes classifier over discrete feature values, with additive smoothing.

    Every column of the training matrix is treated as a categorical variable
    whose categories are the distinct values seen in that column during
    ``fit``. For each (class, feature) pair the smoothed relative frequency
    of every category is stored, and prediction picks the class with the
    largest joint log-probability.

    Attributes set by ``fit``:
        ls: smoothing constant added to every category count.
        y_classes: sorted array of distinct class labels.
        x_classes: list with one sorted array of distinct values per feature.
        phi_y: array of class priors, aligned with ``y_classes``.
        phi_x: ``phi_x[k][j]`` is the array of P(x_j = v | class k) for the
            values ``v`` in ``x_classes[j]``.
        c_x: ``c_x[k][j]`` is the number of training rows of class ``k``.
    """

    @staticmethod
    def _as_dense(X):
        """Return ``X`` as a NumPy array, densifying objects that offer ``toarray()``."""
        if hasattr(X, "toarray"):
            X = X.toarray()
        return np.asarray(X)

    def fit(self, X_train, y_train, ls=0.01):
        """Estimate class priors and per-feature category probabilities.

        Args:
            X_train: 2-D array-like (rows are samples, columns are features);
                objects exposing ``toarray()`` are densified first.
            y_train: 1-D array-like of class labels, one per row of ``X_train``.
            ls: additive smoothing constant.

        Returns:
            self, so calls can be chained.
        """
        X_train = self._as_dense(X_train)
        y_train = np.asarray(y_train)
        self.ls = ls
        self.y_classes, y_counts = np.unique(y_train, return_counts=True)
        # The original iterated over the global `X` here instead of the argument.
        self.x_classes = [np.unique(column) for column in X_train.T]
        self.phi_y = 1.0 * y_counts / y_counts.sum()
        self.phi_x = self.mean_X(X_train, y_train)
        self.c_x = self.count_x(X_train, y_train)
        return self

    def mean_X(self, X_train, y_train):
        """Return the nested list ``[class][feature] -> category probabilities``."""
        X_train = self._as_dense(X_train)
        y_train = np.asarray(y_train)
        return [
            [self.ls_mean_x(X_train, y_train, k, j) for j in range(len(self.x_classes))]
            for k in self.y_classes
        ]

    def ls_mean_x(self, X_train, y_train, k, j):
        """Return the smoothed distribution of feature ``j`` within class ``k``.

        The result is aligned with ``self.x_classes[j]`` and sums to 1.
        """
        y_train = np.asarray(y_train)
        categories = self.x_classes[j]
        # One-hot comparison: rows are samples of class k, columns are categories.
        x_data = (X_train[:, j][y_train == k].reshape(-1, 1) == categories)
        # Smooth over the number of categories of *this* feature; the original
        # used the total number of features, so the result did not sum to 1.
        return (x_data.sum(axis=0) + self.ls) / (len(x_data) + len(categories) * self.ls)

    def get_mean_x(self, y_train, j):
        """Return the smoothed probability of an unseen value of feature ``j``.

        Args:
            y_train: index of the class in ``self.y_classes`` (the parameter
                name is kept for backward compatibility).
            j: feature index.
        """
        # The original evaluated `1 + ls / (...)`, i.e. a "probability" above 1.
        return self.ls / (self.c_x[y_train][j] + len(self.x_classes[j]) * self.ls)

    def count_x(self, X_train, y_train):
        """Return ``[class][feature] -> number of training rows of that class``."""
        y_train = np.asarray(y_train)
        n_features = len(self.x_classes)
        # The count does not depend on the feature, so compute it once per class.
        return [
            [int(np.count_nonzero(y_train == k))] * n_features
            for k in self.y_classes
        ]

    def predict(self, X_train):
        """Return an array with the predicted class label of every row."""
        rows = self._as_dense(X_train)
        labels = [self.compute_probs(row) for row in rows]
        return np.asarray(labels, dtype=self.y_classes.dtype)

    def compute_probs(self, x):
        """Return the most probable class label for the single sample ``x``."""
        # Compare in log space: a product over many features underflows to 0.
        log_scores = [self._log_joint(x, y) for y in range(len(self.y_classes))]
        return self.y_classes[np.argmax(log_scores)]

    def _log_joint(self, x, y):
        """Return log P(x, class y) for sample ``x`` and class index ``y``."""
        with np.errstate(divide="ignore"):  # log(0) -> -inf is acceptable here
            total = np.log(self.phi_y[y])
            for j, value in enumerate(x):
                categories = self.x_classes[j]
                # np.unique returns sorted values, so a binary search finds the slot.
                i = np.searchsorted(categories, value)
                if i < len(categories) and categories[i] == value:
                    p_x_j_y = self.phi_x[y][j][i]  # p(xj|y)
                else:
                    # Value never seen for this feature during fit.
                    p_x_j_y = self.get_mean_x(y, j)
                total += np.log(p_x_j_y)
        return total

    def compute_prob(self, x, y):
        """Return the joint probability P(x, class y) for class index ``y``."""
        return np.exp(self._log_joint(x, y))

    def evaluate(self, X_train, y_train):
        """Return the fraction of rows whose predicted label equals ``y_train``."""
        return (self.predict(X_train) == np.asarray(y_train)).mean()
I want to fit my MultinomialNB to the training set:
# Train the hand-written MultinomialNB on the training split (fit returns the classifier).
classifier = MultinomialNB().fit(X_train, y_train)

# Predict labels for the held-out split and put the true labels in a NumPy array.
y_pred = classifier.predict(X_test)
ytest = np.array(y_test)

# Optional single-number summary: f1_score(ytest, y_pred, average='weighted')
# Print the per-class precision/recall/F1 report followed by the confusion matrix.
for report in (classification_report(ytest, y_pred), confusion_matrix(ytest, y_pred)):
    print(report)
Could anyone help me fix the code and resolve the error?
Upvotes: 0
Views: 4406
Reputation: 1
I built my own implementation from scratch; perhaps it can help you build yours.
import numpy as np
class multinomialNB:
    """Multinomial naive Bayes classifier with additive smoothing.

    ``fit`` sums the feature values of every class, adds ``alpha`` to each
    sum and normalises the result into one probability vector per class.
    ``predict`` scores a sample as ``log(prior) + sum(x * log(likelihood))``
    and returns the label with the highest score.

    Args:
        alpha: additive smoothing constant applied to every per-class
            feature total.
    """

    def __init__(self, alpha=1):
        self.alpha = alpha

    @staticmethod
    def _as_dense(X):
        """Return ``X`` as a NumPy array, densifying objects that offer ``toarray()``."""
        if hasattr(X, "toarray"):
            X = X.toarray()
        return np.asarray(X)

    def fit(self, X_train, y_train):
        """Estimate class priors and per-class feature likelihoods.

        Args:
            X_train: 2-D array-like of shape (samples, features).
            y_train: 1-D array-like of class labels, one per row.

        Returns:
            self, so calls can be chained.
        """
        X_train = self._as_dense(X_train)
        # A plain list would make `y_train == c` a single bool instead of a mask.
        y_train = np.asarray(y_train)
        m, n = X_train.shape
        self._classes = np.unique(y_train)
        n_classes = len(self._classes)

        # init: Prior & Likelihood
        self._priors = np.zeros(n_classes)
        self._likelihoods = np.zeros((n_classes, n))

        # Get Prior and Likelihood
        for idx, c in enumerate(self._classes):
            X_train_c = X_train[y_train == c]
            self._priors[idx] = X_train_c.shape[0] / m
            # Smoothed feature totals, normalised so each row sums to 1.
            smoothed = X_train_c.sum(axis=0) + self.alpha
            self._likelihoods[idx, :] = smoothed / smoothed.sum()
        return self

    def predict(self, X_test):
        """Return a list with the predicted class label of every row of ``X_test``."""
        return [self._predict(x_test) for x_test in self._as_dense(X_test)]

    def _predict(self, x_test):
        """Return the label with the highest log-posterior for one sample."""
        # Calculate posterior for each class
        posteriors = []
        for idx, _ in enumerate(self._classes):
            prior_c = np.log(self._priors[idx])
            likelihoods_c = self.calc_likelihood(self._likelihoods[idx, :], x_test)
            posteriors.append(np.sum(likelihoods_c) + prior_c)
        return self._classes[np.argmax(posteriors)]

    def calc_likelihood(self, cls_likeli, x_test):
        """Return the per-feature log-likelihood terms ``x * log(p)``.

        Features whose value in ``x_test`` is 0 contribute exactly 0, even
        when the likelihood is 0 (possible with ``alpha=0``); otherwise
        ``log(0) * 0`` would produce NaN and corrupt the class scores.
        """
        x_test = np.asarray(x_test)
        with np.errstate(divide="ignore", invalid="ignore"):
            log_terms = np.log(cls_likeli) * x_test
        return np.where(x_test == 0, 0.0, log_terms)

    def score(self, X_test, y_test):
        """Return the accuracy of ``predict(X_test)`` against ``y_test``."""
        # Compare as arrays: list == list would yield one bool, not a mask.
        y_pred = np.asarray(self.predict(X_test))
        y_test = np.asarray(y_test)
        return np.sum(y_pred == y_test) / len(y_test)
Upvotes: 0