Reputation: 702
I'm trying to find the optimal number of clusters for my data with the elbow method and the silhouette score, using KMeans. However, I'm applying dimensionality reduction before testing these methods.
If I run PCA several times, I get the same elbow and silhouette graphs every time. But if I use an encoder with a neural network structure for the same purpose, I get different graphs on every run. Consequently, I don't have confidence in this encoder technique, since it results in a different optimal number of clusters each time.
Why does this happen? Even if I normalize my data, the results keep varying.
What can I do to use this encoder technique properly? I know I could simply choose PCA for this, but I would like to understand what's going on and see if I'm doing something wrong.
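For reference, the PCA version I compare against is essentially this (a minimal sketch with scikit-learn; on a dataset as small as iris the default solver is the exact SVD, so there is no randomness to seed):
import pandas as pd
from sklearn import datasets
from sklearn.decomposition import PCA

iris = datasets.load_iris()
# PCA is deterministic here (exact SVD), so repeated runs give identical projections
X_pca = pd.DataFrame(PCA(n_components=2).fit_transform(iris.data))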
Here is my code; you can run it several times to see what I'm talking about. I used the iris dataset as an example:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import datasets
from sklearn.metrics import silhouette_score, silhouette_samples
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras import backend as K
iris = datasets.load_iris()
X = pd.DataFrame(iris.data)
def autoencoding(data):
    n_input_layer = data.shape[1]
    n_encoding_layer = 2
    n_output_layer = n_input_layer

    # AUTOENCODER
    autoencoder = tf.keras.models.Sequential([
        # ENCODER
        Dense(n_input_layer, input_shape = (n_input_layer,), activation = 'relu'), # Input layer
        # CENTRAL LAYER
        Dense(n_encoding_layer, activation = 'relu', name = 'central_layer'),
        # DECODER
        Dense(n_output_layer, activation = 'relu') # Output layer
    ])

    n_epochs = 2000
    loss = tf.keras.losses.MeanSquaredError()
    optimizer = tf.optimizers.Adam(learning_rate = 0.001, decay = 0.0001, clipvalue = 0.5)
    loss_history = [] # save loss improvement
    data = np.array(data, dtype=float) # np.float is deprecated; use the builtin float

    for epoch in range(n_epochs):
        with tf.GradientTape() as tape:
            current_loss = loss(autoencoder(data), data)
        gradients = tape.gradient(current_loss, autoencoder.trainable_variables) # get the gradient of the loss function
        optimizer.apply_gradients(zip(gradients, autoencoder.trainable_variables)) # update the weights
        loss_history.append(current_loss.numpy()) # save current loss in its history

        # show loss improvement every 200 epochs
        if (epoch + 1) % 200 == 0:
            print(str(epoch + 1) + '.\tLoss: ' + str(current_loss.numpy()))

    print('\nEncoding complete')
    return autoencoder
X_autoencoded = autoencoding(X)
# ENCODER EXTRACTION
def encoded(autoencoder, data):
    # create a Keras backend function mapping the input to the central layer's output
    extract_encoded_data = K.function(inputs = autoencoder.layers[0].input,
                                      outputs = autoencoder.layers[1].output)
    # extract the encoded dataframe
    encoded_dataframe = extract_encoded_data(data.values)
    encoded_data = pd.DataFrame(encoded_dataframe)
    return encoded_data
X_encoded = encoded(X_autoencoded, X)
# ELBOW METHOD AND SILHOUETTE SCORE
inertia = []
sil = []
for k in range(2, 14):
    kmeans_rand = KMeans(n_clusters=k, init='k-means++', random_state=42)
    kmeans_rand.fit(X_encoded)
    y_pred = kmeans_rand.predict(X_encoded)
    inertia.append(kmeans_rand.inertia_)
    sil.append((k, silhouette_score(X_encoded, y_pred)))
    sil_samples = silhouette_samples(X_encoded, y_pred)
fig, ax = plt.subplots(1, 2, figsize=(12,4))
ax[0].plot(range(2,14), inertia)
ax[0].set_title('Elbow Method')
ax[0].set_xlabel('Number of clusters')
ax[0].set_ylabel('Inertia')
x_sil = [x[0] for x in sil]
y_sil = [x[1] for x in sil]
ax[1].plot(x_sil, y_sil)
ax[1].set_xlabel('Number of Clusters')
ax[1].set_ylabel('Silhouette Score')
ax[1].set_title('Silhouette Score Curve')
plt.show()
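As a side note, the same central-layer output can also be extracted with a small Keras Model instead of a backend function (a minimal sketch, equivalent in intent to the encoded() helper above):
from tensorflow.keras import Model

# build a sub-model from the autoencoder's input to its named central layer
encoder = Model(inputs = X_autoencoded.layers[0].input,
                outputs = X_autoencoded.get_layer('central_layer').output)
X_encoded_alt = pd.DataFrame(encoder.predict(X.values))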
Upvotes: 1
Views: 170
Reputation: 22031
The autoencoder's weights are randomly initialized on every run (unlike PCA, which is deterministic), so you need to fix the random seeds. Try setting them with these lines at the top of your code:
import os
import random
import numpy as np
import tensorflow as tf

tf.random.set_seed(33)
os.environ['PYTHONHASHSEED'] = str(33)
np.random.seed(33)
random.seed(33)

# force single-threaded execution so the op order (and the results) is reproducible
session_conf = tf.compat.v1.ConfigProto(
    intra_op_parallelism_threads=1,
    inter_op_parallelism_threads=1
)
sess = tf.compat.v1.Session(
    graph=tf.compat.v1.get_default_graph(),
    config=session_conf
)
tf.compat.v1.keras.backend.set_session(sess)
I'm using tf.keras (TF 2.2) with no GPU, and I get the same results on every run:
https://colab.research.google.com/drive/1S9iB7AsLLkdTAY827eOBN_VRRi2EVWRA?usp=sharing
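To sanity-check that the seeding works, you can re-seed, retrain twice, and compare the encodings (a sketch; set_all_seeds is just an illustrative helper wrapping the lines above, and autoencoding/encoded are the functions from your question; expect this to hold on CPU with single-threaded execution):
def set_all_seeds(seed=33):
    # reset every global random generator before each training run
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

set_all_seeds(33)
first = encoded(autoencoding(X), X)
set_all_seeds(33)
second = encoded(autoencoding(X), X)
print(np.allclose(first, second))  # True when the runs are reproducible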
Upvotes: 1