Getting a reliable K in K-means clustering

Here I am sharing the code for deriving the number of clusters in the K-means clustering algorithm, as shown by Bhavesh Bhatt in his YouTube video.


import math
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import seaborn as sns 
from sklearn.cluster import KMeans
import warnings
# Session-wide presentation settings: seaborn's "poster" plotting context and
# its remapped single-letter colour codes, then silence all Python warnings.
sns.set_context(context="poster")
sns.set_color_codes()
warnings.filterwarnings(action="ignore")
# Build a reproducible toy data set: three 2-D Gaussian blobs of 100 points
# each, sharing one covariance matrix and centred at (10, 0), (0, 20), (20, 30).
np.random.seed(8)  # fixed seed so every run draws the same points
blob_cov = [[3, 1], [1, 4]]
a = np.random.multivariate_normal([10, 0], blob_cov, size=100)
b = np.random.multivariate_normal([0, 20], blob_cov, size=100)
c = np.random.multivariate_normal([20, 30], blob_cov, size=100)
X = np.concatenate((a, b, c))  # stack the blobs row-wise into one array
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print X.shape` statement is a SyntaxError on Python 3.
print(X.shape)
## (300, 2)
# Scatter plot of all points in X on a fixed viewing window.
fig = plt.figure(figsize=(15, 10))
axis_window = (-5, 35)  # same limits on both axes
plt.xlim(*axis_window)
plt.ylim(*axis_window)
x_coords, y_coords = X[:, 0], X[:, 1]
plt.scatter(x_coords, y_coords, c='b', s=5)

# Elbow-method sweep: fit K-means once per candidate cluster count and record
# each fitted model's inertia_ (plotted below as "Sum of squared distance").
dist_points_from_cluster_center = []
K = range(1, 10)
for no_of_clusters in K:
    k_model = KMeans(n_clusters=no_of_clusters)
    k_model.fit(X)
    dist_points_from_cluster_center.append(k_model.inertia_)
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(dist_points_from_cluster_center)
## [70403.82188589728, 27407.17034387677, 2311.4033586287333, 1976.5540367939961, 1643.8735323124279, 1337.0526214543424, 1166.12005389885, 1036.87136535438, 942.9775249201616]
# Elbow curve: recorded inertia against the number of clusters.
fig = plt.figure(figsize=(15, 10))
plt.plot(K, dist_points_from_cluster_center)
plt.grid()
plt.ylabel("Sum of squared distance")
plt.xlabel("No. of clusters K")

# Same elbow curve, plus the straight line (chord) joining its first and last
# points, drawn in red with circular markers.
fig = plt.figure(figsize=(15, 10))
plt.grid()
plt.plot(K, dist_points_from_cluster_center)
# Index the end point with -1 rather than a hard-coded 8 so the chord still
# ends on the last point if the range of K is changed.
plt.plot([K[0], K[-1]],
         [dist_points_from_cluster_center[0],
          dist_points_from_cluster_center[-1]], 'ro-')
plt.xlabel("No. of clusters K")
plt.ylabel("Sum of squared distance")

# Function to find distance
# between a point and a line in 2-d

def calc_distance(x1, y1, a, b, c):
    """Return the perpendicular distance from a point to a line in 2-D.

    The line is given in general form ``a*x + b*y + c = 0`` and the point
    is ``(x1, y1)``; the result is ``|a*x1 + b*y1 + c| / sqrt(a**2 + b**2)``.

    Args:
        x1: x-coordinate of the point.
        y1: y-coordinate of the point.
        a: coefficient of x in the line equation.
        b: coefficient of y in the line equation.
        c: constant term of the line equation.

    Returns:
        The non-negative distance between the point and the line.

    Raises:
        ZeroDivisionError: for plain Python numbers when ``a`` and ``b`` are
            both zero, since the coefficients then do not describe a line.
    """
    # math.hypot computes sqrt(a*a + b*b) without the intermediate squares
    # overflowing, so very large coefficients no longer collapse the
    # denominator to infinity (and the distance to 0.0).
    return abs(a * x1 + b * y1 + c) / math.hypot(a, b)

Solving the linear equation ax + by + c = 0 for the line through the first and last points of the elbow curve:

https://bobobobo.wordpress.com/2008/01/07/solving-linear-equations-ax-by-c-0/

# Coefficients of the line a*x + b*y + c = 0 passing through the first point
# (K[0], inertia[0]) and the last point (K[-1], inertia[-1]) of the elbow curve.
# The end point is indexed with -1 instead of a hard-coded 8 so the block
# keeps working if the range of K is changed.
# NOTE: a, b and c rebind the names used earlier for the three data blobs;
# those arrays were already merged into X, so nothing later relies on them.
a = dist_points_from_cluster_center[0] - dist_points_from_cluster_center[-1]
b = K[-1] - K[0]
c1 = K[0] * dist_points_from_cluster_center[-1]
c2 = K[-1] * dist_points_from_cluster_center[0]
c = c1 - c2
# Distance of every (K, inertia) point from that line, in the same order as K.
distance_of_points_from_line = [
    calc_distance(k, inertia, a, b, c)
    for k, inertia in zip(K, dist_points_from_cluster_center)
]
# Plot each candidate K against its point's distance from the chord.
fig = plt.figure(figsize=(15, 10))
plt.grid()
plt.plot(K, distance_of_points_from_line)

# The reported optimum is the K whose point lies farthest from the chord.
# Look the value up in K instead of adding 1 to the index, so the answer stays
# correct if K does not start at 1. print() with a single argument behaves the
# same on Python 2 and Python 3, unlike the bare print statement.
best_index = distance_of_points_from_line.index(max(distance_of_points_from_line))
print("Optimum value of k = " + str(K[best_index]))
## Optimum value of k = 3
Avatar
Puneet Sharma
Research Scholar

My research interests include cloud & aerosol modeling and statistics.

Related

Previous
comments powered by Disqus