Using Fisher's 1936 set of measurements of three species of the iris flower to look at clustering methods.
The data set consists of 50 samples from each of three species of Iris
(Iris setosa, Iris virginica and Iris versicolor). Four features were
measured from each sample: the length and the width of the sepals and petals,
in centimeters. Based on the combination of these four features, Fisher
developed a linear discriminant model to distinguish the species from each other.
... (from Wikipedia)
Jim Mahoney | April 2020
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
from matplotlib import pyplot as plt
iris = pd.read_csv('./data/iris.csv')
print('\n -- first five rows - raw data -- ')
print(iris[:5])
print('\n -- statistics summary - raw data -- ')
print(iris.describe())
features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
In other words, subtract each column's mean and divide by its standard deviation, so that every column has mean 0 and standard deviation 1 and therefore the same weight in the later analysis.
I'm going to just modify the data in place, discarding the original numbers.
Notice how pandas lets you do numeric operations on the whole column at once.
Also, remove the 'Iris-' prefix from all the species names: that's just annoying.
for feature in features:
iris[feature] = (iris[feature] - iris[feature].mean()) / iris[feature].std()
iris['species'] = iris['species'].apply(lambda name: name.replace('Iris-', ''));
print('\n -- first five rows - normalized data -- ')
print(iris[:5])
print('\n -- statistics summary - normalized data -- ')
print(iris.describe())
def cluster_init(iris, k):
""" Modify iris in place by creating a 'cluster' column if needed,
and initializing each cluster value to a random number from 0 to k-1
where k is the number of clusters.
Also store k within iris object as iris.k
"""
iris.k = k # Store value of k.
iris['cluster'] = None # Create the column if it doesn't exist yet.
    # Pick a data point at random to be the initial center of each cluster,
    # then assign every point to the closest of those centers.
    # (At first I gave each point a random cluster index instead,
    #  but that didn't work out so well ... for large k,
    #  the points all ended up in the same cluster.)
centers = []
for i in range(k):
i_random = random.randint(0, len(iris)-1)
centers.append(np.array(iris.iloc[i_random][:4])) # 1st 4 values are data values
iris['cluster'] = cluster_closest(iris, centers)
def cluster_center(iris, _k):
""" Return numpy array of values for one cluster with index _k,
calculating the center by taking the mean of the rows assigned to that cluster. """
return np.array([ np.mean(iris[feature][iris['cluster'] == _k])
for feature in features ])
def cluster_centers(iris):
""" return cluster centers (mean of assigned points) for the k clusters """
return np.array([ cluster_center(iris, _k) for _k in range(iris.k) ])
def squared_distance(vector1, vector2):
""" return squared distance between two numpy vectors """
return sum( (vector1 - vector2)**2 )
assert squared_distance(np.array([0,0]), # squared distance from this
                        np.array([3,4])) == 5**2  # to this is 5**2 = 25
def closest_index(vector1, vectors):
""" return i such that vectors[i] has shortest distance to vector1 """
return np.argmin(np.array( [squared_distance(vector1, _vec) for _vec in vectors] ))
assert closest_index( np.array([0,0,0]), # This one,
np.array([ [1,1,1], [2,2,2], [3,3,3] ]) # compared to these,
) == 0 # is closest to 0'th .
def cluster_closest(iris, centers):
""" Return list of closest cluster indexes """
closest = []
for row in iris.itertuples():
# see pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.itertuples.html
# row columns in this itertuples() are
# 0 index
# 1,2,3,4 data values
# 5 species
# 6 cluster assignment
values = np.array( row[1:5] ) # columns are (index, sepal_length, ... )
closest.append(closest_index(values, centers))
return np.array(closest)
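As an aside, the same nearest-center lookup can be written without the row loop by letting numpy broadcast the differences. A minimal sketch (not used below), assuming centers is array-like with shape (k, 4) in the same feature order as the features list:
def cluster_closest_vectorized(iris, centers):
    """ Sketch of a loop-free version of cluster_closest, via numpy broadcasting. """
    points = iris[features].to_numpy()                 # shape (n, 4)
    diffs = points[:, None, :] - np.asarray(centers)   # shape (n, k, 4)
    sq_dists = (diffs ** 2).sum(axis=2)                # shape (n, k)
    return sq_dists.argmin(axis=1)                     # index of the nearest center per row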
def cluster_convergence_step(iris):
""" Run one step of the cluster convergence algorithm,
modifing the iris['cluster'] values in place. """
# Find the cluster centers from their assignments
centers = cluster_centers(iris)
# Find the index of the closest cluster_position for each row
closest = cluster_closest(iris, centers)
# modify the iris data frame in place
iris['cluster'] = closest # modify iris dataframe
def clone_clusters(iris):
""" return a copy of the cluster assignments """
return list(iris['cluster'])[:]
def cluster_algorithm(iris, k, max_steps=100):
    """ Modify iris by applying the whole algorithm. Return (steps, status). """
    cluster_init(iris, k)
    old_clusters = clone_clusters(iris)
    for step in tqdm(range(max_steps)):
        cluster_convergence_step(iris)
        new_clusters = clone_clusters(iris)
        if old_clusters == new_clusters:
            return (step, 'CONVERGED')
        old_clusters = new_clusters   # remember this step's assignments for the next comparison
    return (max_steps, 'MAX_STEPS_TAKEN')
def squared_cluster_error(iris):
""" return total squared errors between points and cluster centers """
result = 0.0
centers = cluster_centers(iris)
for row in iris.itertuples():
values = np.array( row[1:5] ) # see comment in cluster_closest
i_cluster = row[6]
result += squared_distance(values, centers[i_cluster])
return result
def cluster_plot(iris, x_name='sepal_length', y_name='petal_length'):
    """ Draw a scatter plot of the clusters, using two of the four features as the axes. """
plt.figure(dpi=220, figsize=(2.5, 2.5)) # dots_per_inch and (width, height) in inches
k = iris.k
plt.title(f'iris clusters k={k}')
plt.xlabel(x_name.replace('_', ' '), fontsize=8)
plt.ylabel(y_name.replace('_', ' '), fontsize=8)
plt.xticks(fontsize=8)
plt.yticks(fontsize=8)
colors = ['red', 'blue', 'green', 'magenta', 'cyan', 'grey', 'darkred', 'darkblue' ]
centers = cluster_centers(iris)
for _k in range(k):
x = iris[x_name][iris.cluster == _k]
y = iris[y_name][iris.cluster == _k]
plt.plot(x, y, linestyle='none', marker='.', markersize=2, color=colors[_k])
i_x = features.index(x_name) # index for this feature in vector of data values
i_y = features.index(y_name)
        plt.annotate(str(_k), xy=(centers[_k][i_x], centers[_k][i_y]), color=colors[_k], fontsize=8)
k = 3
cluster_init(iris, k)
print(iris[:10])
print()
print(f'cluster {1} position for \n{features} is \n{cluster_center(iris, 1)}')
# initial random assignment
cluster_plot(iris)
# printing out assignments and centers for each step
for i in range(5):
print('----- step ', i)
print('first 10 cluster assignments: ', list(iris['cluster'][:10]))
centers = cluster_centers(iris)
print('cluster centers are')
print(centers)
closest_centers = cluster_closest(iris, centers)
    print('index of the closest center for each row:')
print(closest_centers)
print('reassigning clusters')
iris['cluster'] = closest_centers
# and here's what we get
cluster_plot(iris)
... which looks like it's working.
# re-initialize to a random assignment
k = 3
cluster_init(iris, k)
cluster_plot(iris)
# take 5 steps
for i in range(5):
cluster_convergence_step(iris)
cluster_plot(iris)
(steps, result) = cluster_algorithm(iris, 3, max_steps=500)
print(f"result='{result}' in steps={steps}")
cluster_plot(iris)
I've done several iterations ... and found very different results depending on the starting state.
Sometimes there are two clusters in the bottom clump, other times two in the top clump. Though when I tried to repeat that result, most of what I got looked like the one above.
Perhaps run it multiple times and average somehow? Or vote on runs that look similar?
Dunno.
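One common remedy, sketched here just as an illustration (the 10 restarts and max_steps=500 are arbitrary choices): run the algorithm from several random starts and keep whichever run ends with the smallest total squared error.
best_error, best_clusters = None, None
for restart in range(10):
    cluster_algorithm(iris, 3, max_steps=500)          # fresh random start each time
    error = squared_cluster_error(iris)
    if best_error is None or error < best_error:       # keep the lowest-error assignment
        best_error, best_clusters = error, clone_clusters(iris)
iris['cluster'] = best_clusters
print(f'best of 10 restarts: total squared error = {best_error:.1f}')
cluster_plot(iris)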
errors = []
for k in range(1, 10):
print()
attempt = 0
while attempt < 10:
        (steps, result) = cluster_algorithm(iris, k)  # initialize data & run the algorithm
attempt += 1
error = squared_cluster_error(iris)
print(f' k={k} result={result} steps={steps} error={error}')
if k==1 or error<500:
break
else:
print(' ... all in one cluster ... trying again')
errors += [error]
plt.plot(range(1, k+1), errors)
plt.xlabel('k')
plt.ylabel('error')
... which is something like the "knee" plots discussed in the text.
As you see in the code, sometimes the algorithm would have all of the points fall into one cluster and get stuck, giving a high error - the same as k=1. I modified the code to try again for those cases, up to 10 times.
The true number is k=3, but visually (at least for what I plotted) k=2 also doesn't look bad. The plot maybe suggests this ... YMMV.
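To put a rough number on that knee (again just a sketch, using the errors list built above, where errors[0] corresponds to k=1): print how much the error drops each time k increases by one, and look for where the drops level off.
for k, (e_prev, e_next) in enumerate(zip(errors, errors[1:]), start=1):
    drop = e_prev - e_next
    print(f'k={k} -> k={k+1} : error drops by {drop:.1f} ({100 * drop / e_prev:.0f}%)')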
;)