This article describes the Within-Cluster Sum of Squares (WCSS), a metric that measures how spread out the points of each cluster are around its centroid.
K-Means: WCSS and Compactness (by Ashwani Kumar)
In [ ]:
Part 1: What is WCSS (AKA SSE)?
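- WCSS stands for Within-Cluster Sum of Squares (AKA Sum of Squared Errors, SSE)
- It is the sum of the squared distances between every point and the centroid of the cluster it is assigned to: $\mathrm{WCSS} = \sum_{k=1}^{K} \sum_{x_i \in C_k} \lVert x_i - \mu_k \rVert^2$
- It measures the compactness of the clusters: the smaller the WCSS, the more tightly the points sit around their centroids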
In [1]:
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
# Apply matplotlib's 'dark_background' style sheet so the figures below are drawn on a black canvas.
plt.style.use('dark_background') # Black background
In [7]:
# 1) Build a tiny 2-D toy dataset: five samples, one row per point
X = np.array([[1, 1], [1, 2], [4, 3], [5, 4], [6, 3]])

# Show the raw, still-unclustered points in feature space
fig, ax = plt.subplots(figsize=(6, 5))
ax.scatter(*X.T, s=100)  # X.T unpacks into the x column and the y column
ax.set_xlabel("Feature 1")
ax.set_ylabel("Feature 2")
ax.set_title("2D Feature Space")
ax.grid(True)
plt.show()
In [8]:
# 2) Find k=2 clusters
# random_state pins the centroid initialisation so the labels, centroids and
# WCSS come out the same on every "Restart & Run All". n_init is set explicitly
# because its default differs between scikit-learn releases. These are the same
# settings the k=3 models in Part 2 use.
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
kmeans.fit(X)

# Cluster ids are arbitrary: a different seed may swap 0 and 1, but the grouping
# of the points (and therefore the WCSS) is what matters.
print("Cluster labels:", kmeans.labels_)
print("Cluster centroids:\n", kmeans.cluster_centers_)
# inertia_ is scikit-learn's name for WCSS: the sum of squared distances from
# each sample to its closest cluster centre.
print("WCSS for K=2:", kmeans.inertia_)
Cluster labels: [0 0 1 1 1] Cluster centroids: [[1. 1.5 ] [5. 3.33333333]] WCSS for K=2: 3.166666666666667
In [9]:
# 3) Visualise the fitted k=2 model: samples coloured by cluster, centroids on top
labels = kmeans.labels_
centroids = kmeans.cluster_centers_

# One colour per cluster, listed in cluster-id order
colors = ['cyan', 'yellow']

fig, ax = plt.subplots(figsize=(6, 6))

# Each cluster is drawn as its own series so that it gets its own legend entry
for cluster_id, cluster_color in enumerate(colors):
    members = X[labels == cluster_id]
    ax.scatter(
        members[:, 0],
        members[:, 1],
        s=120,
        color=cluster_color,
        label=f"Cluster {cluster_id+1}",
    )

# Centroids: large red crosses with a white outline so they stand out from the samples
centroid_style = dict(
    s=200,
    color='red',
    marker='X',
    edgecolors='white',
    linewidth=1.5,
    label='Centroids',
)
ax.scatter(centroids[:, 0], centroids[:, 1], **centroid_style)

ax.set_title("K-Means Clustering (k=2)")
ax.set_xlabel("X")
ax.set_ylabel("Y")
ax.legend()
ax.grid(color='gray', linestyle='--', linewidth=0.3)
plt.show()
In [ ]:
In [ ]:
Part 2: WCSS vs Compactness
In [23]:
# 1) Draw two synthetic datasets. Both calls share n_samples, centers and
#    random_state; only cluster_std (the per-cluster standard deviation) differs.
blob_settings = dict(n_samples=20, centers=3, random_state=42)
X_compact, _ = make_blobs(cluster_std=0.7, **blob_settings)  # tight blobs
X_spread, _ = make_blobs(cluster_std=2.0, **blob_settings)   # wide blobs

# Side-by-side view of the raw points: compact on the left, spread-out on the right
fig, (ax_compact, ax_spread) = plt.subplots(1, 2, figsize=(12, 5))

ax_compact.scatter(*X_compact.T, edgecolors='k')
ax_compact.set_title('Compact Clusters')

ax_spread.scatter(*X_spread.T, edgecolors='k')
ax_spread.set_title('Spread-Out Clusters')

plt.show()
In [27]:
# 2) Cluster both datasets with k=3 (identical settings) and compare their WCSS
kmeans_compact = KMeans(n_clusters=3, random_state=42, n_init=10).fit(X_compact)
kmeans_spread = KMeans(n_clusters=3, random_state=42, n_init=10).fit(X_spread)

# inertia_ holds the WCSS of each fitted model
wcss_compact = kmeans_compact.inertia_
wcss_spread = kmeans_spread.inertia_

# One entry per panel: the samples, the model fitted on them, and the panel title
panels = [
    (X_compact, kmeans_compact, f'Compact Clusters (WCSS: {wcss_compact:.2f})'),
    (X_spread, kmeans_spread, f'Spread-Out Clusters (WCSS: {wcss_spread:.2f})'),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (points, model, panel_title) in zip(axes, panels):
    # Samples, coloured by the cluster they were assigned to
    ax.scatter(
        points[:, 0], points[:, 1],
        c=model.labels_,
        cmap='viridis', edgecolors='white',
    )
    # Fitted centroids drawn on top
    centers = model.cluster_centers_
    ax.scatter(
        centers[:, 0], centers[:, 1],
        s=100, c='yellow', marker='X',
        label='Centroids',
    )
    ax.set_title(panel_title)
    ax.legend()
plt.show()

# Report the two WCSS values as text as well
print(f'WCSS for compact clusters: {wcss_compact:.2f}')
print(f'WCSS for spread-out clusters: {wcss_spread:.2f}')
WCSS for compact clusters: 15.89 WCSS for spread-out clusters: 129.73
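Observation:
- Clusters whose points are less spread out, i.e. more compact, have a small WCSS
- Clusters whose points are more spread out, i.e. less compact, have a large WCSS
- The two values are directly comparable here because both datasets have the same number of points and were clustered with the same k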
In [ ]:
