Reputation: 49
I am reading email data from a training set and creating train_matrix, train_labels and test_labels. How do I display the decision boundary using matplotlib in Python? I am using the SVM from sklearn. There are online examples for pre-packaged data sets such as iris, but the plot fails on my custom data. Here is my code.
Error :
Traceback (most recent call last):
File "classifier-plot.py", line 115, in <module>
Z = Z.reshape(xx.shape)
ValueError: cannot reshape array of size 260 into shape (150,1750)
Code:
import os
import numpy as np
from collections import Counter
from sklearn import svm
import matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
def make_Dictionary(root_dir, vocab_size=3000):
    """Build the vocabulary of the most frequent words in a mail directory.

    Every entry of ``root_dir`` is opened as a text file, read line by line
    and split on whitespace.  Only purely alphabetic tokens longer than one
    character are counted.

    Args:
        root_dir: Directory whose entries are the e-mail files to scan.
        vocab_size: Maximum number of vocabulary entries to keep
            (defaults to 3000).

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent, as produced by ``Counter.most_common``.
    """
    emails = [os.path.join(root_dir, f) for f in os.listdir(root_dir)]
    dictionary = Counter()
    for mail in emails:
        with open(mail) as m:
            for line in m:
                # Filter while counting rather than deleting keys afterwards:
                # removing entries while iterating over dictionary.keys()
                # raises RuntimeError on Python 3.
                dictionary.update(
                    word for word in line.split()
                    if word.isalpha() and len(word) > 1
                )
    return dictionary.most_common(vocab_size)
def extract_features(mail_dir, vocabulary=None):
    """Turn every mail in ``mail_dir`` into a word-count vector and a label.

    Only the third line of each file (line index 2) is tokenised.  For each
    token found in the vocabulary, the matching column of that file's row is
    set to the number of times the token occurs on the line; tokens that are
    not in the vocabulary are ignored.  A file is labelled 1 when its file
    name starts with ``"spmsg"`` and 0 otherwise.

    Args:
        mail_dir: Directory whose entries are the e-mail files.
        vocabulary: Optional list of ``(word, count)`` tuples such as the one
            returned by ``make_Dictionary``; a word's position in the list is
            its feature column.  Defaults to the module-level ``dictionary``.

    Returns:
        ``(features_matrix, train_labels)``: a ``(n_files, 3000)`` array of
        word counts and a ``(n_files,)`` array of 0/1 labels.  Row order
        follows ``os.listdir(mail_dir)``.
    """
    if vocabulary is None:
        vocabulary = dictionary
    # word -> feature column; one dict lookup per word replaces a linear scan
    # of the whole vocabulary for every token.
    word_index = {entry[0]: col for col, entry in enumerate(vocabulary)}

    files = [os.path.join(mail_dir, fi) for fi in os.listdir(mail_dir)]
    features_matrix = np.zeros((len(files), 3000))
    train_labels = np.zeros(len(files))
    for docID, fil in enumerate(files):
        with open(fil) as fi:
            for line_no, line in enumerate(fi):
                if line_no == 2:
                    for word, occurrences in Counter(line.split()).items():
                        col = word_index.get(word)
                        if col is not None:
                            features_matrix[docID, col] = occurrences
                    # Nothing after the third line is used.
                    break
        # basename() honours the platform's path separator, unlike split('/').
        if os.path.basename(fil).startswith("spmsg"):
            train_labels[docID] = 1
    return features_matrix, train_labels
# Locations of the training and test corpora, relative to the working directory.
TRAIN_DIR = "../train-mails"
TEST_DIR = "../test-mails"

# The vocabulary is built from the training mails only; extract_features reads
# it through the module-level name ``dictionary``.
dictionary = make_Dictionary(TRAIN_DIR)

print("reading and processing emails from file.")
features_matrix, labels = extract_features(TRAIN_DIR)
test_feature_matrix, test_labels = extract_features(TEST_DIR)

model = svm.SVC(kernel="rbf", C=10000)

print("Training model.")
# Train on the first tenth of the data only.  Floor division keeps the slice
# bound an integer on Python 3 as well as on Python 2.
features_matrix = features_matrix[:len(features_matrix) // 10]
labels = labels[:len(labels) // 10]

# train model
model.fit(features_matrix, labels)

predicted_labels = model.predict(test_feature_matrix)

# Single-argument print(...) behaves identically on Python 2 and Python 3.
print("FINISHED classifying. accuracy score : ")
print(accuracy_score(test_labels, predicted_labels))
##----------------
# Decision-boundary plot; the axis labels and commented-out lines suggest it
# was adapted from an iris SVM plotting example.
# NOTE(review): as posted, this section raises the ValueError shown in the
# traceback at the `Z.reshape(xx.shape)` call below.
h = .02 # step size in the mesh
# we create an instance of SVM and fit our data. We do not scale our
# data since we want to plot the support vectors
# NOTE(review): C is only referenced by the commented-out line below.
C = 1.0 # SVM regularization parameter
X = features_matrix
y = labels
# Fits `model` again on X and y (the same arrays it was already trained on).
# Presumably `svc` is the same estimator as `model` (scikit-learn's fit
# returns self) -- confirm against the SVC docs.
svc = model.fit(X, y)
#svm.SVC(kernel='linear', C=C).fit(X, y)
# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
# NOTE(review): the vertical range is taken from the labels `y`, whereas the
# scatter call below puts X[:, 1] on that axis.
y_min, y_max = y[:].min() - 1, y[:].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
# title for the plots
# NOTE(review): the title says "linear kernel"; the linear SVC line above is
# commented out and `model` is what gets fitted.
titles = ['SVC with linear kernel']
# Z is set to the predictions for the test mails (one value per test file);
# the call that would predict one value per mesh point is commented out.
Z = predicted_labels#svc.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
# This reshape is the line reported in the traceback: Z has 260 elements while
# xx has shape (150, 1750), so the sizes cannot match.
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
# Plot also the training points
# Only the first two feature columns of X are drawn, coloured by label.
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.coolwarm)
# NOTE(review): the 'Sepal ...' axis labels do not describe the e-mail features.
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
# Empty tuples remove the tick marks from both axes.
plt.xticks(())
plt.yticks(())
plt.title(titles[0])
plt.show()
Upvotes: 2
Views: 7069
Reputation: 17159
In the tutorial that you were following, Z is computed by applying the classifier to a set of feature vectors generated to form a regular N×M grid. This makes the plot smooth.
When you replaced
Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])
with
Z = predicted_labels
you replaced this regular grid with the predictions taken on your dataset. The next line failed with an error since it could not reshape an array of size len(files) to an N×M matrix. There is no reason for len(files) to equal N×M.
There is a reason why you could not follow the tutorial directly. Your data dimension is 3000, so your decision boundary would be a 2999-dimensional hypersurface (a hyperplane only for a linear kernel) in a 3000-dimensional space. This is not easy to visualize.
In the tutorial the dimension is 4 and it is reduced to 2 for visualization. The best way to reduce the dimension of your data depends on the data. In the tutorial we just pick the first two components of the 4-dimensional vector.
Another option that works well in many cases is to use Principal Component Analysis to reduce the dimension of data.
from sklearn.decomposition import PCA

# Project the feature vectors onto their first two principal components so
# that the classifier can be drawn in a plane.
pca = PCA(n_components=2)
# fit_transform both fits the PCA and returns the projected data, so a
# separate pca.fit(...) call beforehand is redundant.  PCA is unsupervised,
# so the labels are not passed to it.
reduced_matrix = pca.fit_transform(features_matrix)
# Retrain the classifier on the 2-D projection.
model.fit(reduced_matrix, labels)
Such a model can be used for 2D visualization. You can then follow the tutorial directly and define
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
We do not have access to your email data, so for illustration we could just use random data.
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm
from sklearn.decomposition import PCA

# initialize algorithms and data with random
model = svm.SVC(gamma=0.001, C=100.0)
pca = PCA(n_components=2)
rng = np.random.RandomState(0)           # fixed seed for reproducibility
U = rng.rand(200, 2000)                  # 200 samples, 2000 features
v = (rng.rand(200) * 2).astype('int')    # random 0/1 labels
# fit_transform fits the PCA and projects U in one step; a separate
# pca.fit(...) call beforehand would only repeat the same work.
U2 = pca.fit_transform(U)
model.fit(U2, v)

# generate grid for plotting
h = 0.2
x_min, x_max = U2[:, 0].min() - 1, U2[:, 0].max() + 1
y_min, y_max = U2[:, 1].min() - 1, U2[:, 1].max() + 1
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, h),
    np.arange(y_min, y_max, h))

# create decision boundary plot
# Classify every grid point with the fitted classifier (`model`), then fold
# the flat prediction vector back into the grid's shape for contourf.
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
plt.scatter(U2[:, 0], U2[:, 1], c=v)
plt.show()
This would produce a decision boundary that does not look very impressive.
Indeed, the first two principal components together capture less than 2% of the variance in the data:
>>> print(pca.explained_variance_ratio_)
[ 0.00841935 0.00831764]
If now you introduce just a little bit of carefully disguised asymmetry you would already see an effect.
Modify the data by shifting, for each sample of class 1, a single randomly selected coordinate:
# One candidate shift column per index; only the first len(v) entries are
# used, one for each sample.
random_shifts = (rng.rand(2000)*200).astype('int')
# Iterate over the samples (the original loop bound `MM` was never defined).
for i in range(len(v)):
    if v[i] == 1:
        # Push a single coordinate of every class-1 sample away from class 0.
        U[i, random_shifts[i]] += 5.0
Applying PCA again, you would get a somewhat more informative picture.
Note that here the first two principal components already explain about 5% of the variance and the red part of the picture contains many more red points than blue ones.
Upvotes: 2