Reputation: 89
I have a problem labelling data with the KMeans algorithm. My test sentence is assigned to the correct cluster, but I don't get the correct label. I am already using NumPy to match the predicted cluster with `true_test_labels`, but KMeans can give the clusters different numbers, so the position of a label in the list doesn't match the cluster number. I need help with this problem. Here's my code:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re
import numpy as np
from collections import Counter
# Indonesian stop-word list from NLTK, kept as a set for fast membership tests.
stop = set(stopwords.words('indonesian'))
# Punctuation characters that are stripped character-by-character from the text.
exclude = set(string.punctuation)
# NOTE(review): WordNet is an English lexicon, so this lemmatizer presumably
# leaves most Indonesian words unchanged — confirm this is intended.
lemma = WordNetLemmatizer()
# Cleaning the text sentences so that punctuation marks, stop words & digits are removed
def clean(doc):
    """Normalise one sentence and return it as a list of tokens.

    The sentence is lower-cased and split on whitespace, words found in the
    module-level ``stop`` set are dropped, every character found in
    ``exclude`` is removed, each remaining word is passed through
    ``lemma.lemmatize`` and finally all runs of digits are deleted.

    Args:
        doc: The raw sentence as a string.

    Returns:
        A list of the cleaned, whitespace-separated tokens.
    """
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    # Digits are removed last, so a token made only of digits disappears in
    # the split() below.
    processed = re.sub(r"\d+","",normalized)
    y = processed.split()
    #print (y)
    return y
path = "coba.txt"
train_clean_sentences = []
# Read one training sentence per line; the context manager closes the file
# even if cleaning a line raises.
with open(path,'r') as fp:
    for line in fp:
        line = line.strip()
        cleaned = clean(line)
        cleaned = ' '.join(cleaned)
        train_clean_sentences.append(cleaned)
#print(train_clean_sentences)

# Turn the cleaned sentences into a TF-IDF matrix.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train_clean_sentences)

# Clustering the training 30 sentences with K-means technique
modelkmeans = KMeans(n_clusters=3, init='k-means++', max_iter=200, n_init=100)
modelkmeans.fit(X)

# Clean and vectorise the single test sentence with the already fitted
# vectorizer so it shares the training vocabulary.
teks_satu = "Aplikasi Machine Learning untuk mengenali daun mangga dengan metode CNN"
test_clean_sentence = []
cleaned_test = clean(teks_satu)
cleaned = ' '.join(cleaned_test)
cleaned = re.sub(r"\d+","",cleaned)
test_clean_sentence.append(cleaned)
Test = vectorizer.transform(test_clean_sentence)

true_test_labels = ['AI','VR','Sistem Informasi']
predicted_labels_kmeans = modelkmeans.predict(Test)
print(predicted_labels_kmeans)

print ("\n-------------------------------PREDICTIONS BY K-Means--------------------------------------")
# For each slice of training rows, report the cluster id that most of those
# rows were assigned to.
print ("\nIndex of Virtual Reality : ",Counter(modelkmeans.labels_[5:10]).most_common(1)[0][0])
print ("Index of Machine Learning : ",Counter(modelkmeans.labels_[0:5]).most_common(1)[0][0])
print ("Index of Sistem Informasi : ",Counter(modelkmeans.labels_[10:15]).most_common(1)[0][0])
# NOTE: KMeans cluster ids are arbitrary, so indexing true_test_labels with the
# raw cluster id only yields the right name when the ids happen to line up with
# the list order — this is the mismatch described in the question.
# np.int was removed in NumPy 1.24; take the single prediction explicitly.
print ("\n",teks_satu,":",true_test_labels[int(predicted_labels_kmeans[0])],":",predicted_labels_kmeans)
Upvotes: 7
Views: 6012
Reputation: 11
The solution from Albert G Lieu is good and helped me a lot, but it can produce a duplicated index value when two columns of the confusion matrix have their maximum in the same row.
This portion :
# For every column of cm, the row index holding that column's largest count.
cm_argmax = np.argmax(cm, axis=0)
cm_argmax
# Look up the matched row index for every predicted cluster id in one step.
y_pred_ = cm_argmax[np.asarray(y_pred)]
Should be replaced by :
# For every column of cm, the row index holding that column's largest count.
cm_argmax = cm.argmax(axis=0)

# Find the duplicate value
duplicate_value = None
for value in cm_argmax:
    if np.count_nonzero(cm_argmax == value) > 1:
        duplicate_value = value
        break

# Find the missing value
missing_value = None
for i in range(len(cm_argmax)):
    if i not in cm_argmax:
        missing_value = i
        break

# Replace one of the duplicate values with the missing value at the correct index
corrected_cm_argmax = np.copy(cm_argmax)
# Only patch the mapping when there really is a collision to resolve.
if duplicate_value is not None and missing_value is not None:
    for i, value in enumerate(cm_argmax):
        if value == duplicate_value:
            corrected_cm_argmax[i] = missing_value
            break
corrected_cm_argmax

# NOTE: this resolves a single duplicated index only; with several collisions
# a one-to-one assignment solver would be needed.
# Remap through the corrected table (the uncorrected cm_argmax would discard
# the fix above).
y_pred_ = np.array([corrected_cm_argmax[i] for i in y_pred])
Upvotes: 1
Reputation: 402
You can assign to each cluster the label that the majority of its members have among the true labels.
Upvotes: 0
Reputation: 911
Here is a concrete example showing how to match KMeans
cluster ids with training data labels. The underlying idea is that the confusion matrix
should have large values on its diagonal, assuming the classification is done correctly. Here is the confusion matrix before associating cluster ids with training labels:
cm =
array([[ 0, 395, 0, 5, 0],
[ 0, 2, 5, 391, 2],
[ 2, 0, 0, 0, 398],
[ 0, 0, 400, 0, 0],
[398, 0, 0, 0, 2]])
Now we just need to reorder the confusion matrix so that its large values land on the diagonal. This can be achieved easily with
# For every column of cm, the row index holding that column's largest count.
cm_argmax = np.argmax(cm, axis=0)
cm_argmax
# Look up the matched row index for every predicted cluster id in one step.
y_pred_ = cm_argmax[np.asarray(y_pred)]
Here we get the new confusion matrix, which looks much familiar now, right?
cm_ =
array([[395, 5, 0, 0, 0],
[ 2, 391, 2, 5, 0],
[ 0, 0, 398, 0, 2],
[ 0, 0, 0, 400, 0],
[ 0, 0, 2, 0, 398]])
You can further verify the result with accuracy_score
# Translate cluster ids into matched label ids, then score against the truth.
y_pred_ = cm_argmax[np.asarray(y_pred)]
accuracy_score(y, y_pred_)
# 0.991
The entire standalone code is here:
import matplotlib.pyplot as plt
import numpy as np  # required below: the script uses np.array
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import confusion_matrix,accuracy_score

# Centres of the five synthetic blobs, one (x1, x2) pair per row.
blob_centers = np.array(
    [[ 0.2, 2.3],
     [-1.5 , 2.3],
     [-2.8, 1.8],
     [-2.8, 2.8],
     [-2.8, 1.3]])
# Standard deviation of each blob, in the same order as blob_centers.
blob_std = np.array([0.4, 0.3, 0.1, 0.1, 0.1])
# X holds the sample coordinates, y the index of the blob each sample came from.
X, y = make_blobs(n_samples=2000, centers=blob_centers,
                  cluster_std=blob_std, random_state=7)
def plot_clusters(X, y=None):
    """Scatter-plot the first two columns of ``X`` on the current figure.

    Args:
        X: Array indexed as ``X[:, 0]`` and ``X[:, 1]`` for the two axes.
        y: Optional per-point values passed to ``plt.scatter`` as ``c``
            (the colours); ``None`` uses the default colour.
    """
    plt.scatter(X[:, 0], X[:, 1], c=y, s=1)
    plt.xlabel("$x_1$", fontsize=14)
    # rotation=0 keeps the y-axis label upright instead of vertical.
    plt.ylabel("$x_2$", fontsize=14, rotation=0)
# Show the raw, unlabelled points.
plt.figure(figsize=(8, 4))
plot_clusters(X)
plt.show()

# Cluster into as many groups as there are blobs.
k = 5
kmeans = KMeans(n_clusters=k, random_state=42)
y_pred = kmeans.fit_predict(X)

# Confusion matrix between the generating blob index and the raw cluster id.
cm = confusion_matrix(y, y_pred)
cm

# For every column of cm, the row index holding that column's largest count;
# used as a lookup table from cluster id to matched label.
cm_argmax = cm.argmax(axis=0)
cm_argmax
y_pred_ = np.array([cm_argmax[i] for i in y_pred])

# Recompute the matrix from the REMAPPED predictions; using y_pred here would
# just reproduce cm unchanged.
cm_ = confusion_matrix(y, y_pred_)
cm_
accuracy_score(y,y_pred_)
Upvotes: 1