I am trying to run a simulation in which the true population consists of two classes drawn from the same normal distribution (mean 0, standard deviation 4000). I am using a perceptron to study how sample size relates to the degree of overfitting. However, the perceptron always converges after 6 iterations with a threshold of 0, even with a sample size of only 10 per class, where you can clearly see the threshold should not be exactly 0. Why is the threshold always 0? Also, is there a better way to output the threshold than my code below? I chose the perceptron because I want the simplest possible classifier - is there a simpler, easier one to use? Note that logistic regression, used in exactly the same way, does seem to produce thresholds other than 0.
import numpy as np
import matplotlib.pyplot as plt

mu, sigma = 0, 4000  # mean and standard deviation
pop_size = int(1e4)
p1 = np.random.normal(mu, sigma, pop_size)
p2 = np.random.normal(mu, sigma, pop_size)

# take n samples of each group and plot them above their population histograms
def sample_pop(n):
    s1 = np.random.choice(p1, size=n, replace=False)
    s2 = np.random.choice(p2, size=n, replace=False)
    plt.subplot(211)
    count, bins, ignored = plt.hist(p1, 50, density=False, color='green', range=[-15000, 15000], histtype='bar', ec='black')
    plt.ylabel("n with Rebel Alliance")
    ymax = plt.gca().get_ylim()[1]
    plt.plot(s1, [ymax]*n, 'o', color='green')
    plt.subplot(212)
    count, bins, ignored = plt.hist(p2, 50, density=False, color='red', range=[-15000, 15000], histtype='bar', ec='black')
    plt.xlabel("Midichlorian Rate (The Force)")
    plt.ylabel("n with Dark Side")
    ymax = plt.gca().get_ylim()[1]
    plt.plot(s2, [ymax]*n, 'x', color='red')
    plt.show()
    return s1, s2

n = 10
s1, s2 = sample_pop(n)

from sklearn.linear_model import Perceptron
clf = Perceptron()
s_all = np.hstack((s1, s2)).reshape(-1, 1)
y = np.hstack(([0]*len(s1), [1]*len(s2)))
clf.fit(s_all, y)
def plot1D(X, y, model, show=True):
    # adapted from https://github.com/tirthajyoti/Machine-Learning-with-Python/blob/master/Utilities/ML-Python-utils.py
    h = 0.1  # step size of the mesh; decrease to increase plot quality
    # Build a thin 2-D mesh so the 1-D decision regions can be drawn with contourf
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = -.1, .1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    # Predict the class along the x axis
    grid = np.arange(x_min, x_max, h).reshape(-1, 1)
    Z = model.predict(grid)
    dZ = np.diff(Z)
    print(grid[np.where(abs(dZ) > 0)[0]])  # x value(s) where the predicted class flips, i.e. the threshold
    # Plotting
    if show:
        plt.figure(figsize=(6, 6))
        plt.contourf(xx, yy, np.vstack((Z, Z)), alpha=0.4)
        plt.scatter(X[:, 0], np.array([-.05]*len(X)), c=y, alpha=0.8, edgecolor="k")
        plt.ylim(-.1, 0)
        plt.gca().get_yaxis().set_ticks([])
        plt.xlabel('Midichlorian Rate (The Force)')
        plt.show()
plot1D(s_all, y, clf)

from sklearn.metrics import accuracy_score
acc = accuracy_score(y, clf.predict(s_all))
print(acc)
print(clf.n_iter_)
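To show what I mean about logistic regression, here is a minimal sketch of that comparison, fit on the same s_all and y as the perceptron; for a one-feature linear model the decision boundary sits where w*x + b = 0, so the threshold can be read off the coefficients:

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression().fit(s_all, y)
# the 0.5-probability boundary of a 1-D logistic model solves w*x + b = 0
print(-lr.intercept_[0] / lr.coef_[0, 0])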
I believe the issue you are facing is that, since it is impossible for the perceptron to properly separate two samples drawn from identical distributions, the training loss will not improve no matter how many iterations you run - the model is guessing at random.
Under scikit-learn's defaults, Perceptron is trained by stochastic gradient descent and stops early once the loss fails to improve by more than the tol threshold for n_iter_no_change (default 5) consecutive epochs.
In this case there is essentially no chance of the loss improving, because the classification task is impossible, so training ends after the 6th iteration.
When the model is reduced to random guessing, I wouldn't expect the threshold to change, because there really isn't any threshold that would reliably improve the classification.
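As a quick sanity check of that last point (my own sketch, separate from the code below): sweep a few candidate thresholds over two large draws from the same N(0, 4000) distribution, and every one of them lands at roughly chance-level accuracy:

import numpy as np

rng = np.random.default_rng(0)
a = rng.normal(0, 4000, 100000)  # draws labelled class 0
b = rng.normal(0, 4000, 100000)  # draws labelled class 1

# accuracy of the rule "predict class 1 when x >= t", averaged over both classes
for t in [-4000, -1000, 0, 1000, 4000]:
    acc = 0.5 * np.mean(a < t) + 0.5 * np.mean(b >= t)
    print(t, round(acc, 3))  # every threshold hovers around 0.5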
This behavior can be demonstrated by shifting the second distribution far above the upper bound of the first, disabling the tol parameter (tol=None), and raising max_iter. This gives the model a fighting chance.
import numpy as np
import matplotlib.pyplot as plt

mu, sigma = 0, 4000  # mean and standard deviation
pop_size = int(1e4)
p1 = np.random.normal(mu, sigma, pop_size)
p2 = np.random.normal(80000, 2000, pop_size)  # shifted well above p1

# take n samples of each group and plot them above their population histograms
def sample_pop(n):
    s1 = np.random.choice(p1, size=n, replace=False)
    s2 = np.random.choice(p2, size=n, replace=False)
    plt.subplot(211)
    count, bins, ignored = plt.hist(p1, 50, density=False, color='green', range=[-15000, 15000], histtype='bar', ec='black')
    plt.ylabel("n with Rebel Alliance")
    ymax = plt.gca().get_ylim()[1]
    plt.plot(s1, [ymax]*n, 'o', color='green')
    plt.subplot(212)
    # histogram range moved to cover the shifted distribution
    count, bins, ignored = plt.hist(p2, 50, density=False, color='red', range=[70000, 90000], histtype='bar', ec='black')
    plt.xlabel("Midichlorian Rate (The Force)")
    plt.ylabel("n with Dark Side")
    ymax = plt.gca().get_ylim()[1]
    plt.plot(s2, [ymax]*n, 'x', color='red')
    plt.show()
    return s1, s2

n = 10
s1, s2 = sample_pop(n)

from sklearn.linear_model import Perceptron
clf = Perceptron(tol=None, max_iter=20000)
s_all = np.hstack((s1, s2)).reshape(-1, 1)
y = np.hstack(([0]*len(s1), [1]*len(s2)))
clf.fit(s_all, y)
def plot1D(X, y, model, show=True):
    # adapted from https://github.com/tirthajyoti/Machine-Learning-with-Python/blob/master/Utilities/ML-Python-utils.py
    h = 0.1  # step size of the mesh; decrease to increase plot quality
    # Build a thin 2-D mesh so the 1-D decision regions can be drawn with contourf
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = -.1, .1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    # Predict the class along the x axis
    grid = np.arange(x_min, x_max, h).reshape(-1, 1)
    Z = model.predict(grid)
    dZ = np.diff(Z)
    print(grid[np.where(abs(dZ) > 0)[0]])  # x value(s) where the predicted class flips, i.e. the threshold
    # Plotting
    if show:
        plt.figure(figsize=(6, 6))
        plt.contourf(xx, yy, np.vstack((Z, Z)), alpha=0.4)
        plt.scatter(X[:, 0], np.array([-.05]*len(X)), c=y, alpha=0.8, edgecolor="k")
        plt.ylim(-.1, 0)
        plt.gca().get_yaxis().set_ticks([])
        plt.xlabel('Midichlorian Rate (The Force)')
        plt.show()
plot1D(s_all, y, clf)

from sklearn.metrics import accuracy_score
acc = accuracy_score(y, clf.predict(s_all))
print(acc)
print(clf.n_iter_)
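Incidentally, rather than scanning a prediction grid for the flip point, you can read the threshold straight off the fitted model: the perceptron here is a one-feature linear classifier, so its boundary solves w*x + b = 0. Something like this sketch (using the clf fit above) should work:

# the decision boundary of a 1-D linear model is x = -b/w
threshold = -clf.intercept_[0] / clf.coef_[0, 0]
print(threshold)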