How to plot the cluster centers?

Question

Using this k-means clustering implementation written from scratch, how can I plot the initial random cluster centers for k=3 on the scatter plot shown in the photo? (The photo shows the Iris dataset.)

import numpy as np
from scipy.spatial.distance import cdist 
 
def kmeans(x, k, no_of_iterations):
    """Cluster the rows of ``x`` into ``k`` groups with plain k-means.

    The initial centroids are ``k`` distinct rows of ``x`` drawn with
    ``np.random.choice``, so results depend on NumPy's global random state.

    Args:
        x: 2-D array-like; each row is one data point.
        k: Number of clusters. Must not exceed ``len(x)``, otherwise
            ``np.random.choice`` raises ``ValueError``.
        no_of_iterations: Maximum number of centroid-update rounds.

    Returns:
        1-D integer array with one cluster index (``0 .. k-1``) per row of
        ``x``.
    """
    x = np.asarray(x)

    # Randomly choose k distinct data points as the starting centroids.
    initial_rows = np.random.choice(len(x), k, replace=False)
    centroids = x[initial_rows, :]

    # Assign every point to its nearest centroid (Euclidean distance).
    points = cdist(x, centroids, 'euclidean').argmin(axis=1)

    for _ in range(no_of_iterations):
        updated = []
        for cluster in range(k):
            members = x[points == cluster]
            if len(members):
                # Move the centroid to the mean of the points assigned to it.
                updated.append(members.mean(axis=0))
            else:
                # An empty cluster has no mean (it would be NaN and poison
                # every later distance), so keep its previous centroid.
                updated.append(centroids[cluster])
        centroids = np.vstack(updated)

        new_points = cdist(x, centroids, 'euclidean').argmin(axis=1)
        if np.array_equal(new_points, points):
            # Assignments are stable, so further rounds cannot change them.
            break
        points = new_points

    return points

JohanC · Accepted Answer

You can draw the points and the centers via matplotlib's scatter function. Colors can be assigned depending on the group calculated via kmeans.

Here is an example (the kmeans function now also returns the final centroids).

import matplotlib.pyplot as plt
import seaborn as sns  # for the iris dataset
import numpy as np
from scipy.spatial.distance import cdist

def kmeans(x, k, no_of_iterations=100):
    """Cluster the rows of ``x`` into ``k`` groups with plain k-means.

    The initial centroids are ``k`` distinct rows of ``x`` drawn with
    ``np.random.choice``, so results depend on NumPy's global random state.

    Args:
        x: 2-D array-like; each row is one data point.
        k: Number of clusters. Must not exceed ``len(x)``, otherwise
            ``np.random.choice`` raises ``ValueError``.
        no_of_iterations: Maximum number of centroid-update rounds.

    Returns:
        A tuple ``(points, centroids)``: ``points`` is a 1-D integer array
        with one cluster index (``0 .. k-1``) per row of ``x``, and
        ``centroids`` holds one row per cluster with the centroid positions
        after the last update (the initial random picks when
        ``no_of_iterations`` is 0).
    """
    x = np.asarray(x)

    # Randomly choose k distinct data points as the starting centroids.
    initial_rows = np.random.choice(len(x), k, replace=False)
    centroids = x[initial_rows, :]

    # Assign every point to its nearest centroid (Euclidean distance).
    points = cdist(x, centroids, 'euclidean').argmin(axis=1)

    for _ in range(no_of_iterations):
        updated = []
        for cluster in range(k):
            members = x[points == cluster]
            if len(members):
                # Move the centroid to the mean of the points assigned to it.
                updated.append(members.mean(axis=0))
            else:
                # An empty cluster has no mean (it would be NaN and poison
                # every later distance), so keep its previous centroid.
                updated.append(centroids[cluster])
        centroids = np.vstack(updated)

        new_points = cdist(x, centroids, 'euclidean').argmin(axis=1)
        if np.array_equal(new_points, points):
            # Assignments are stable, so further rounds cannot change them.
            break
        points = new_points

    return points, centroids

# Load the iris table and keep the two sepal columns as the plotted coordinates.
iris = sns.load_dataset('iris')
feature_columns = ['sepal_length', 'sepal_width']
x = iris[feature_columns].to_numpy()

k = 3
points, centroids = kmeans(x, k)

colors = plt.cm.Set2.colors

# First pass: one hollow ring per centroid, so the centroid entries come
# first in the legend.
for val, color in enumerate(colors[:k]):
    center_x, center_y = centroids[val]
    plt.scatter(center_x, center_y, facecolor='none', edgecolor=color, lw=3,
                s=100, label=f'centroid {val}')

# Second pass: the data points, coloured like the centroid they belong to.
for val, color in enumerate(colors[:k]):
    in_cluster = points == val
    plt.scatter(x[in_cluster, 0], x[in_cluster, 1], color=color, label=f'set {val}')

plt.legend(ncol=2)
plt.show()

Here is an attempt to show the given target names together with the kmeans approximation. Note that the order of the kmeans labels is random. The larger background circles show the given target names; the smaller circles, nicely grouped around their centroids, come from kmeans.

from sklearn.datasets import load_iris

# Use the first two feature columns of scikit-learn's iris data as coordinates.
iris_data = load_iris()
x = iris_data.data[:, :2]

# Large translucent circles: the labels that ship with the dataset.
color_givens = ['magenta', 'gold', 'cyan']
for val, name, color in zip(range(len(color_givens)), iris_data.target_names, color_givens):
    is_target = iris_data.target == val
    plt.scatter(x[is_target, 0], x[is_target, 1],
                color=color, s=150, alpha=0.6, label=f'given {name}')

k = 3
points, centroids = kmeans(x, k)
colors_kmeans = plt.cm.Set1.colors

# Hollow rings: the centroids returned by kmeans.
for val, color in enumerate(colors_kmeans[:k]):
    center_x, center_y = centroids[val]
    plt.scatter(center_x, center_y, facecolor='none', edgecolor=color, lw=3,
                s=150, label=f'centroid {val}')

# Small filled circles: the cluster kmeans assigned to each point.
for val, color in enumerate(colors_kmeans[:k]):
    in_cluster = points == val
    plt.scatter(x[in_cluster, 0], x[in_cluster, 1], color=color, label=f'set {val}')

x_axis_name, y_axis_name = iris_data.feature_names[:2]
plt.xlabel(x_axis_name)
plt.ylabel(y_axis_name)
plt.legend(ncol=3)
plt.show()

How to plot the cluster centers?

Answers (1)

Related Questions