Reputation: 13
Using this from-scratch k-means clustering implementation, how can I plot the initial random cluster centers for k=3 on the scatter plot in the photo? (The photo shows the Iris dataset.)
import numpy as np
from scipy.spatial.distance import cdist
def kmeans(x, k, no_of_iterations):
    """Cluster the rows of ``x`` into ``k`` groups with Lloyd's k-means.

    Parameters
    ----------
    x : array-like, shape (n_samples, n_features)
        Data points, one per row.
    k : int
        Number of clusters.
    no_of_iterations : int
        Maximum number of centroid-update / reassignment rounds.

    Returns
    -------
    points : numpy.ndarray, shape (n_samples,)
        Index (``0 .. k-1``) of the cluster each row was assigned to.

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when ``k`` exceeds the number of rows.
    """
    x = np.asarray(x)

    # Randomly choosing Centroids: k distinct rows of x serve as the start.
    start_rows = np.random.choice(len(x), k, replace=False)
    centroids = x[start_rows, :].astype(float)

    # Assign every point to its nearest centroid (Euclidean distance).
    points = cdist(x, centroids, 'euclidean').argmin(axis=1)

    for _ in range(no_of_iterations):
        for cluster in range(k):
            members = x[points == cluster]
            # A cluster can end up empty (e.g. two start rows are duplicate
            # points). Taking the mean of nothing yields NaN, which would
            # poison every later distance, so the previous centroid is kept.
            if len(members):
                centroids[cluster] = members.mean(axis=0)

        new_points = cdist(x, centroids, 'euclidean').argmin(axis=1)
        # Once the assignment is stable, further rounds cannot change it.
        if np.array_equal(new_points, points):
            break
        points = new_points

    return points
Upvotes: 1
Views: 3075
Reputation: 80329
You can draw the points and the centers via matplotlib's `scatter` function. Colors can be assigned depending on the group calculated via `kmeans`.
Here is an example (the `kmeans` function now also returns the centroids).
import matplotlib.pyplot as plt
import seaborn as sns # for the iris dataset
import numpy as np
from scipy.spatial.distance import cdist
def kmeans(x, k, no_of_iterations=100):
    """Cluster the rows of ``x`` into ``k`` groups with Lloyd's k-means.

    Parameters
    ----------
    x : array-like, shape (n_samples, n_features)
        Data points, one per row.
    k : int
        Number of clusters.
    no_of_iterations : int, optional
        Maximum number of centroid-update / reassignment rounds
        (default 100).

    Returns
    -------
    points : numpy.ndarray, shape (n_samples,)
        Index (``0 .. k-1``) of the cluster each row was assigned to.
    centroids : numpy.ndarray, shape (k, n_features)
        Final cluster centers; row ``i`` belongs to cluster ``i``.

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when ``k`` exceeds the number of rows.
    """
    x = np.asarray(x)

    # Randomly choosing Centroids: k distinct rows of x serve as the start.
    start_rows = np.random.choice(len(x), k, replace=False)
    centroids = x[start_rows, :].astype(float)

    # Assign every point to its nearest centroid (Euclidean distance).
    points = cdist(x, centroids, 'euclidean').argmin(axis=1)

    for _ in range(no_of_iterations):
        for cluster in range(k):
            members = x[points == cluster]
            # A cluster can end up empty (e.g. two start rows are duplicate
            # points). Taking the mean of nothing yields NaN, which would
            # poison every later distance, so the previous centroid is kept.
            if len(members):
                centroids[cluster] = members.mean(axis=0)

        new_points = cdist(x, centroids, 'euclidean').argmin(axis=1)
        # Once the assignment is stable, further rounds cannot change it.
        if np.array_equal(new_points, points):
            break
        points = new_points

    return points, centroids
# Cluster the two sepal measurements of the seaborn iris table with kmeans
# and show each cluster together with its centroid.
iris = sns.load_dataset('iris')
x = iris[['sepal_length', 'sepal_width']].to_numpy()
k = 3
points, centroids = kmeans(x, k)
colors = plt.cm.Set2.colors

# Centroids: one hollow ring per cluster, outlined in that cluster's color.
for cluster, shade in enumerate(colors[:k]):
    center_x, center_y = centroids[cluster]
    plt.scatter(center_x, center_y, facecolor='none', edgecolor=shade, lw=3,
                s=100, label=f'centroid {cluster}')

# Data points: filled markers in the same color as their centroid ring.
for cluster, shade in enumerate(colors[:k]):
    members = x[points == cluster]
    plt.scatter(members[:, 0], members[:, 1], color=shade, label=f'set {cluster}')

plt.legend(ncol=2)
plt.show()
Here is an attempt to show the given target names together with the kmeans approximation. Note that the order of the kmeans values is random. The larger background circles show the target names, the smaller circles, nicely grouped towards their centroid, come from kmeans.
from sklearn.datasets import load_iris
# Overlay the given iris target classes (large translucent circles) with the
# kmeans result (smaller circles plus hollow centroid rings) on the first
# two features of the sklearn iris data.
iris_data = load_iris()
x = iris_data.data[:, :2]
color_givens = ['magenta', 'gold', 'cyan']

# Background layer: the given target classes.
for target_id, name in enumerate(iris_data.target_names[:len(color_givens)]):
    given = x[iris_data.target == target_id]
    plt.scatter(given[:, 0], given[:, 1],
                color=color_givens[target_id], s=150, alpha=0.6, label=f'given {name}')

k = 3
points, centroids = kmeans(x, k)
colors_kmeans = plt.cm.Set1.colors

# Centroids: one hollow ring per kmeans cluster.
for cluster, shade in enumerate(colors_kmeans[:k]):
    center_x, center_y = centroids[cluster]
    plt.scatter(center_x, center_y, facecolor='none', edgecolor=shade, lw=3,
                s=150, label=f'centroid {cluster}')

# Foreground layer: the points of each kmeans cluster.
for cluster, shade in enumerate(colors_kmeans[:k]):
    members = x[points == cluster]
    plt.scatter(members[:, 0], members[:, 1], color=shade, label=f'set {cluster}')

plt.xlabel(iris_data.feature_names[0])
plt.ylabel(iris_data.feature_names[1])
plt.legend(ncol=3)
plt.show()
Upvotes: 1