Taking a look at one of the well-known standard datasets: Fisher's 1936 set of measurements of three species of the iris flower.
The data set consists of 50 samples from each of three species of Iris
(Iris setosa, Iris virginica and Iris versicolor). Four features were
measured from each sample: the length and the width of the sepals and petals,
in centimeters. Based on the combination of these four features, Fisher
developed a linear discriminant model to distinguish the species from each other.
... wikipedia
Jim Mahoney | Feb 3 2020
# python modules
from matplotlib import pyplot as plt
from typing import Union, List
import pprint
pp = pprint.PrettyPrinter(indent=4) # see https://docs.python.org/3/library/pprint.html
First, here's a version that uses matplotlib but not pandas or numpy.
# Read in the raw data.
csv_filename = './data/iris.csv'
# Column indices (see the first header line of the csv file).
(i_sepal_length, i_sepal_width, i_petal_length, i_petal_width, i_species) = [0, 1, 2, 3, 4]
# Use a context manager so the file handle is closed as soon as the lines
# have been read, rather than whenever the garbage collector gets to it.
with open(csv_filename) as csv_file:
    raw_lines = csv_file.readlines()[1:]  # skip the first header line.
print(f"number of lines in '{csv_filename}' is {len(raw_lines)}")
raw_lines[:3]  # peek at the first three raw lines (shown only in a notebook/REPL)
# Define a data conversion utility routine.
def cleanup(entry: str) -> Union[float, str]:
    """ Convert entries like '5.0' to 5.0, and those like 'Iris-setosa' to 'setosa'.

    Surrounding whitespace (including the trailing newline of the last
    field on a line) is stripped first.  If what is left parses as a
    number it is returned as a float; otherwise it is returned as a
    string with every 'Iris-' occurrence removed.
    """
    entry = entry.strip()                   # remove trailing newline
    try:
        return float(entry)                 # convert number string to float
    except ValueError:
        # Not a number, so treat it as a species name.  Only ValueError is
        # caught so that unrelated errors are not silently hidden.
        return entry.replace('Iris-', '')   # remove 'Iris-' prefix from species


assert cleanup('3.2') == 3.2
assert cleanup('Iris-setosa\n') == 'setosa'
# Turn each raw csv line into a list of cleaned-up fields
# (floats for the measurements, a short string for the species).
iris_data = []
for line in raw_lines:
    fields = line.split(',')
    iris_data.append([cleanup(field) for field in fields])
# Show the first and last five rows, with a '...' marker in between.
iris_data[:5] + ['...'] + iris_data[-5:]
Just for fun, I'm going to look at (length - width)/length for the sepals and petals, perhaps as a way to find something like how far away it is from being square.
I'll call this "skew".
I should end up with 6 sets of 50 numbers, mostly in the 0 to 1 range, one set for each of the three species and two flower parts (which I'll call "leafs" even though they aren't).
def is_species(row, specie: str) -> bool:
    """ Report whether the species column of this row equals `specie`.

    The species is read from position `i_species` of the row; this is the
    filter used to pull out the data for just one of the three species.
    """
    row_species = row[i_species]
    return row_species == specie
assert is_species(iris_data[0], 'setosa')
def calc_skew(length: float, width: float) -> float:
    """ Relative excess of length over width : (length - width)/length.

    The result is 0 when length equals width and negative when the
    width is the larger of the two.
    """
    excess = length - width
    return excess / length
species = ('setosa', 'virginica', 'versicolor')
leaves = ('sepal', 'petal')
# For each flower part, the (length, width) column positions within a data row.
indeces_lengthwidth = {'sepal': (0, 1), 'petal': (2, 3)}
# Now put the numbers into a data structure that will let us get the six lists of numbers,
# which I've decided will be a dict of dicts of lists :
#
#        species   leaf        (length-width)/length
#   skew['setosa']['sepal'] = [0.2, 0.31, 0.14, ...]    # 50 values
skew = {}
for specie in species:
    skew[specie] = {}
    for leaf in leaves:
        (i_length, i_width) = indeces_lengthwidth[leaf]
        # Keep only the rows of the species currently being processed.
        # (The filter must use the loop variable `specie`; the previous
        # code referred to an undefined name here and raised NameError.)
        skew[specie][leaf] = [calc_skew(row[i_length], row[i_width])
                              for row in iris_data
                              if is_species(row, specie)]
# Let's see if what I just made looks like what I think it is.
# (Each line below is a bare expression: its value is displayed by a
# notebook or interactive session, but not when run as a plain script.)
skew.keys()                     # outer keys : the species names
skew['setosa'].keys()           # inner keys : the flower parts ('sepal', 'petal')
skew['setosa']['petal'][0:5]    # first five skew values for the setosa petals
len(skew['setosa']['petal'])    # how many setosa rows were found in the data
# Begin with a single one of the six skew lists : the setosa petals.
specie, leaf = 'setosa', 'petal'
# "xs" is the plural of "x" ... a Haskell naming convention. :)
xs = skew[specie][leaf]
# Print every value rounded to four decimal places.
print([f'{value:.4f}' for value in xs])
# Count how many of these numbers fall into each bin.
# The range [min(xs), max(xs)] is split into n_bins equal-width bins.
n_bins = 15
(bin_low, bin_hi) = (min(xs), max(xs))
bin_size = (bin_hi - bin_low)/n_bins
bin_centers = [bin_low + (n + 0.5) * bin_size for n in range(n_bins)]
counts = [0] * n_bins
for x in xs:
    # Put each value into exactly one bin.  Bins are half-open [low, high),
    # except that the last one also includes its upper edge so that max(xs)
    # is counted.  (Testing low <= x <= high for every bin would count a
    # value sitting on an interior edge twice, and floating point rounding
    # of the top edge could drop the maximum entirely.)
    if bin_size > 0:
        i = min(int((x - bin_low) / bin_size), n_bins - 1)
    else:
        i = 0    # all the values are identical : everything goes in the first bin
    counts[i] += 1
# Does this look OK ?  The sum of the counts should equal len(xs).
counts, sum(counts)
# Draw the bin counts as a bar chart.
# (The sizes below were settled on after a bit of trial and error.)
plt.figure(dpi=220, figsize=(3, 2))     # new plot : 3" x 2" at 220dpi
# One tick label per bin : its center, to two decimal places.
bin_labels = [f'{center:.2f}' for center in bin_centers]
plt.bar(bin_centers, counts, 0.8 * bin_size)    # bars 80% of a bin wide, leaving gaps
plt.title(f"skew of ({specie}, {leaf})")
plt.ylabel("count")
plt.xticks(bin_centers, bin_labels, fontsize=4)
plt.xlim(bin_low, bin_hi)
plt.show()
Either
# How about a scatter plot ?
# Here are the numbers for the other flower part, for the same species.
leaf2 = 'sepal'
# Both xs and ys were built by walking iris_data in the same order with the
# same species filter, so xs[i] and ys[i] come from the same row (flower).
ys = skew[specie][leaf2]
print([f'{y:.4f}' for y in ys]) # ... four decimal places
plt.figure(dpi=220, figsize=(2, 2)) # start a new, smaller plot (2" x 2", 220dpi)
plt.xticks(fontsize=7) # Size set by trial and error.
# NOTE(review): the original note here said "the changes from the last graph
# are still in effect" - confirm, since plt.figure() above starts a new figure.
plt.yticks(fontsize=7)
plt.scatter(xs, ys, s=2) # s is marker size ; scalar or array (size of each one)
plt.title(f"skew for {specie}")
plt.xlabel(leaf)
plt.ylabel(leaf2)
plt.axis("equal") # set distance scale on the two axes the same
## The two types of flower parts have different skewness; this puts them on the same scale.
## (Left disabled; uncomment to force both axes to the same fixed range.)
# plt.xlim(0.4, 0.8)
# plt.ylim(0.4, 0.8)
plt.show()
Put all three species on the same plot, with different colors.
Try the same thing with pandas and numpy?
Pandas :
Numpy :
# Two new libraries :
import pandas as pd
import numpy as np
# Load the same csv file into a pandas DataFrame.
# (Bare expressions such as `iris` below only display something in a
# notebook or interactive session.)
iris = pd.read_csv(csv_filename)
iris
# NOTE(review): `a` is not used anywhere in the visible code - leftover ?
a = 2
# A boolean mask : True for the rows whose species is virginica.
# (No cleanup() was applied here, so the names keep their 'Iris-' prefix.)
iris['species'] == 'Iris-virginica'
# We can extract one of the species like this
x = iris['sepal_length'][iris['species'] == 'Iris-virginica']
type(x)
# And we can do simple arithmetic on the whole collection at once.
x * 2
# To work with one species at a time,
# we could make a new data frame for each ...
# or write a function that would apply a filter like that.
# Here's an example of a manipulation on the whole thing, all species.
# These are the same (length - width)/length "skew" values as in the
# from-scratch version, stored as two new columns.
iris['sepal_skew'] = (iris['sepal_length'] - iris['sepal_width'])/iris['sepal_length']
iris['petal_skew'] = (iris['petal_length'] - iris['petal_width'])/iris['petal_length']
iris
iris.hist(column='sepal_skew', by='species') # `by` groups the rows, giving a separate histogram for each species (see the pandas DataFrame.hist docs).
iris.plot.scatter('sepal_skew', 'petal_skew') # again, this is all three species
# suggestive ... next step would be to use different colors for different species.
Pandas (and numpy, though I didn't use it explicitly here) are very powerful, and let us do this sort of work more quickly.
However, they are also more complicated, and sometimes that complexity can get in the way of what you're trying to do. It's another system and API to learn, on top of all the rest.
Sometimes, even with all of pandas' bells and whistles, you want to do something that isn't already built-in ... and then you start falling back on the "from scratch" methods. Which is in part why we're looking at them.