Reputation: 1220
I'm a beginner in machine learning and I'm trying to implement my first Naive Bayes classifier by myself for better understanding. I have a dataset from http://archive.ics.uci.edu/ml/datasets/Adult (American census data; the classes are '<=50k' and '>50k').
Here is my python code:
#!/usr/bin/python
import sys
import csv
# Shared training state, filled in by train() and read by classify().
#   words_stats:   {'word': {'class1': cnt, 'class2': cnt}} occurrences of each word per class
#   targets_stats: {'class1': 3234, 'class2': 884} how many words were seen in each class
#   class_stats:   {'class1': 7896, 'class2': 3034} how many lines belong to each class
words_stats, targets_stats, class_stats = {}, {}, {}
# words_cnt: total number of words seen; items_cnt: total number of lines seen
words_cnt = items_cnt = 0
def train(dataset, targets):
    """Accumulate Naive Bayes counts from labelled samples.

    Updates the module-level statistics in place:

    * ``class_stats[label]``       -- number of samples carrying ``label``
    * ``words_stats[word][label]`` -- occurrences of ``word`` under ``label``
    * ``targets_stats[label]``     -- total words seen under ``label``
    * ``words_cnt``                -- total words seen
    * ``items_cnt``                -- total samples seen

    Every counter is cumulative, so calling ``train`` several times is
    equivalent to one call on the concatenated data.

    Args:
        dataset: sequence of samples, each a sequence of hashable "words".
        targets: sequence of class labels; ``targets[i]`` labels
            ``dataset[i]``.

    Raises:
        IndexError: if ``targets`` is shorter than ``dataset``.
    """
    # Only the two integer counters are rebound; the dicts are mutated in place.
    global words_cnt, items_cnt

    for row, words in enumerate(dataset):
        tgt = targets[row]
        class_stats[tgt] = class_stats.get(tgt, 0) + 1
        for word in words:
            if word not in words_stats:
                words_stats[word] = {}
            per_class = words_stats[word]
            per_class[tgt] = per_class.get(tgt, 0) + 1
            targets_stats[tgt] = targets_stats.get(tgt, 0) + 1
            words_cnt += 1

    # Accumulate rather than overwrite, so that items_cnt stays consistent
    # with class_stats when train() is called more than once.
    items_cnt += len(dataset)
def classify(doc, tgt_set):
global words_stats, words_cnt, targets_stats, items_cnt
probs = {} #the probability itself P(c|W) = P(W|c) * P(c) / P(W)
pc = {} #probability of the class in document set P(c)
pwc = {} #probability of the word set in particular class. P(W|c)
pw = 1 #probability of the word set in documet set
for word in doc:
if word not in words_stats:
continue #dirty, very dirty
pw = pw * float(sum(words_stats[word].values())) / words_cnt
for tgt in tgt_set:
pc[tgt] = class_stats[tgt] / float(items_cnt)
for word in doc:
if word not in words_stats:
continue #dirty, very dirty
tgt_wrd_cnt = words_stats[word].get(tgt, 0)
pwc[tgt] = pwc.get(tgt, 1) * float(tgt_wrd_cnt) / targets_stats[tgt]
probs[tgt] = (pwc[tgt] * pc[tgt]) / pw
l = sorted(probs.items(), key = lambda i: i[1], reverse=True)
print probs
return l[0][0]
def check_results(dataset, targets):
    """Classify every sample and print the share of right and wrong answers.

    Each sample is passed to ``classify`` together with the set of distinct
    labels found in ``targets``; the prediction is compared with
    ``targets[i]``.

    Args:
        dataset: sequence of samples (each a sequence of words).
        targets: sequence of expected labels; ``targets[i]`` belongs to
            ``dataset[i]``.

    Raises:
        IndexError: if ``targets`` is shorter than ``dataset``.
    """
    num = len(dataset)
    if not num:
        # Nothing to score; the ratios below would divide by zero.
        print('no samples to check')
        return

    tgt_set = set(targets)
    correct = 0
    for row, doc in enumerate(dataset):
        if classify(doc, tgt_set) == targets[row]:
            correct += 1
    incorrect = num - correct

    # A single formatted string prints the same text on Python 2 and 3.
    print('correct: %s  incorrect: %s'
          % (float(correct) / num, float(incorrect) / num))
def load_data(fil):
    """Read labelled samples from an open CSV file.

    Every field is stripped of surrounding whitespace. The last field of a
    row is its class label, the preceding fields are its features.

    Rows are skipped when they

    * have fewer than two fields (blank lines, stray header lines) -- such a
      row would otherwise become a sample with no features whose label is
      whatever text the line held, or
    * contain a ``'?'`` field (missing value).

    Args:
        fil: an iterable of text lines, e.g. a file opened for reading.

    Returns:
        ``(data, tgts)``: the list of feature lists and the parallel list of
        labels.
    """
    data = []
    tgts = []
    for line in csv.reader(fil):
        row = [field.strip() for field in line]
        if len(row) < 2 or '?' in row:
            continue
        data.append(row[:-1])
        # NOTE(review): the test split of the UCI Adult data appears to end
        # every label with '.', unlike the training split -- confirm against
        # your files. Dropping the dot keeps both spellings the same class.
        tgts.append(row[-1].rstrip('.'))
    return data, tgts
if __name__ == '__main__':
    # Usage: ./program train_data.txt test_data.txt
    if len(sys.argv) < 3:
        print('./program train_data.txt test_data.txt')
        sys.exit(1)

    # Fit the model on the first file.
    with open(sys.argv[1], 'r') as train_file:
        data, tgt = load_data(train_file)
    train(data, tgt)

    # Evaluate on the second file. The predictions have to be compared with
    # the test file's own labels; comparing them with the training labels
    # pairs each prediction with an unrelated row.
    with open(sys.argv[2], 'r') as test_file:
        test_data, test_tgt = load_data(test_file)
    check_results(test_data, test_tgt)
It gives ~61% correct results. When I print the probabilities I get the following:
{'<=50K': 0.07371606889800396, '>50K': 15.325378327213354}
But for a correct classifier I expect the two probabilities to sum to 1. At first I thought the problem was float underflow and tried doing all the calculations in logarithms, but the results were similar. I understand that omitting some words is going to affect accuracy, but the probabilities are so wrong.
What am I doing wrong, or what don't I understand?
For your convenience I've uploaded the dataset and the Python script here: https://dl.dropboxusercontent.com/u/36180992/adult.tar.gz
Thank you for your help.
Upvotes: 0
Views: 2196
Reputation: 3770
Naive Bayes doesn't compute a probability directly; rather, it computes a "raw score" for each label, and the scores are compared with one another in order to classify an instance. These scores can easily be converted to "probabilities" in the range [0, 1]:
# Rescale the raw per-label scores in `probs` so that they add up to 1.
# float() keeps the division from flooring when the scores are integers.
total = float(sum(probs.values()))
if total:  # all-zero scores cannot be normalised; leave them untouched
    for label in probs:
        probs[label] = probs[label] / total
However, keep in mind this still doesn't represent a true probability, as mentioned in this answer:
naive Bayes tends to predict probabilities that are almost always either very close to zero or very close to one.
Upvotes: 1