Looking at some abalone data as described by Jason Brownlee :
"It is a multi-class classification problem, but can also be
framed as a regression. The number of observations for each
class is not balanced. There are 4,177 observations with
8 input variables and 1 output variable.
The variable names are as follows:
column  what
------  ----
0       sex (M, F, I) means Male, Female, or Infant
1       length
2       diameter
3       height
4       whole weight
5       shucked weight
6       viscera weight
7       shell weight
8       rings (target)
The baseline performance of predicting the
most prevalent class is a classification accuracy
of approximately 16%. The baseline performance of predicting the
mean value is an RMSE of approximately 3.2 rings."
Examples of work with this data set include Brownlee's tutorial, which the description above is quoted from.
Jim Mahoney | cs.marlboro.college | March 2 2020
from utilities import * # my read_abalone(), normalize_column()
from scratch.linear_algebra import * # scratch's get_column(), distance
from copy import deepcopy
from random import shuffle
from matplotlib import pyplot as plt
raw_abalone = read_abalone() # original data
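read_abalone() comes from my utilities module, which isn't shown here. For reference, a minimal sketch of what such a reader could look like, assuming a local copy of the UCI abalone.data file (comma separated, no header row, sex first and rings last); the name read_abalone_sketch is just for illustration:
import csv

def read_abalone_sketch(filename='abalone.data'):
    """ Read the abalone csv into a list of rows,
        each [sex, length, diameter, ..., shell weight, rings]. """
    rows = []
    with open(filename) as f:
        for fields in csv.reader(f):
            # first field is the sex letter; the rest are numbers
            rows.append([fields[0]] + [float(x) for x in fields[1:]])
    return rows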
def peek(data, nrows=5):
    """ Look at the first few rows """
    for row in data[:nrows]:
        print(row[0], ' ',
              ' '.join([f'{value:7.3f}' for value in row[1:-1]]),
              f' {row[-1]:6.0f} ')
peek(raw_abalone)
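The quote above gives two baseline numbers: about 16% accuracy for always guessing the most common ring count, and an RMSE of about 3.2 rings for always guessing the mean. Here's a quick sanity check of those against the loaded data (Counter and sqrt are from the standard library):
from collections import Counter
from math import sqrt

rings = [row[-1] for row in raw_abalone]
most_common_count = Counter(rings).most_common(1)[0][1]
print('baseline accuracy =', most_common_count / len(rings))          # expect roughly 0.16
mean_rings = sum(rings) / len(rings)
print('baseline rmse     =',
      sqrt(sum((r - mean_rings)**2 for r in rings) / len(rings)))     # expect roughly 3.2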
To apply nearest neighbors to this data, I need to decide which columns to use as features and what to do with the categorical sex column.
To keep things simple, I'll use all the numeric columns as features, but only compare an abalone with others of the same sex, since that should be comparing apples to apples rather than apples to oranges.
Are there enough of each sex to make this practical? Yes :
for sex in ('M', 'F', 'I'):
    print(f'count {sex} = ', len([row for row in raw_abalone if row[0]==sex]))
Define names for the column indices.
sex_col = 0
feature_cols = (1, 2, 3, 4, 5, 6, 7)
target_col = 8
Normalize each numeric feature, i.e. shift and scale it to mean=0, sigma=1.
abalone = deepcopy(raw_abalone)
for col in feature_cols:
    normalize_column(abalone, col)
peek(abalone)
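normalize_column() is also in utilities and not shown; here's a sketch of an in-place version consistent with how it's used here, assuming the mean() and standard_deviation() helpers provided by the star imports above:
def normalize_column_sketch(data, col):
    """ Shift and scale column col of data, in place, to mean=0, sigma=1. """
    values = [row[col] for row in data]
    mu, sigma = mean(values), standard_deviation(values)
    for row in data:
        row[col] = (row[col] - mu) / sigma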
Create training and test sets.
training_fraction = 0.7 # 70% training, 30% testing
n_rows = len(abalone)
i_cutoff = int(training_fraction * n_rows)
i_rows = list(range(n_rows)) # [0, 1, 2, 3, ...]
shuffle(i_rows) # [100, 23, 17, 204, ...] in place shuffle
training = [abalone[i] for i in i_rows[:i_cutoff]]
testing = [abalone[i] for i in i_rows[i_cutoff:]]
for (name, data) in (('training', training), ('testing', testing)):
    print(f'--- {len(data)} {name} rows --')
    peek(data)
    print()
# Here's the nearest neighbors routine itself:
def abby_distance(row_a, row_b):
    """ pythagorean distance between two rows of data,
        using all columns except the first and last """
    return distance(row_a[1:-1], row_b[1:-1])
assert abby_distance(training[0], training[0]) == 0.0
assert abby_distance(training[0], testing[0]) > 0.0
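scratch's distance() is the usual Euclidean distance; if that library isn't handy, an equivalent version is easy to write:
from math import sqrt

def euclidean_distance(v, w):
    """ pythagorean distance between two equal-length vectors """
    return sqrt(sum((v_i - w_i)**2 for v_i, w_i in zip(v, w)))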
def knn(training_data, test_row, k=5, verbose=False):
    """ return k-nearest neighbors prediction of target """
    # Find the individuals in training_data closest to test_row.
    # Return the mean of the k closest target values.
    # Only use rows in the training_data with the same sex as test_row.
    #
    # training set members with the same sex :
    candidates = [row for row in training_data if row[sex_col]==test_row[sex_col]]
    # sorted by distance to the test row :
    neighbors = sorted(candidates, key=lambda c: abby_distance(c, test_row))
    # mean target value of the k closest neighbors
    prediction = mean([neighbor[target_col] for neighbor in neighbors[:k]])
    return prediction
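A quick spot check on a single test row (the exact numbers will vary with the shuffle):
example = testing[0]
print('predicted rings :', knn(training, example, k=5))
print('actual rings    :', example[target_col])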
def find_knn_error(training_data, test_data, k=5):
    """ Return differences between predicted and actual target values """
    return [knn(training_data, t, k) - t[target_col] for t in test_data]
k=5
errors = find_knn_error(training, testing, k=k)
for (name, f) in (('min', min), ('max', max),
                  ('mean', mean), ('sigma', standard_deviation)):
    print(f' {name}(errors) = {f(errors)}')
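To compare directly with the 3.2 ring baseline RMSE quoted at the top, here's one way to get an RMSE from these errors:
from math import sqrt
rmse = sqrt(mean([e**2 for e in errors]))
print(f'knn rmse = {rmse:.3f} rings')   # mean-value baseline was about 3.2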
targets = get_column(raw_abalone, target_col)
for (name, f) in (('min', min), ('max', max),
                  ('mean', mean), ('sigma', standard_deviation)):
    print(f' {name}(targets) = {f(targets)}')
predictions = [errors[i] + testing[i][target_col]
               for i in range(len(testing))]
fractional_errors = [errors[i]/testing[i][target_col]
                     for i in range(len(testing))]
plt.figure(dpi=220, figsize=(3, 2)) # dots_per_inch and (width, height) in inches
plt.plot(fractional_errors, linestyle='none', marker='.', markersize=1)
plt.title(f'prediction errors k={k}')
plt.ylabel('fractional error')
plt.show()
Is this a good result? Hmmm.
TO DO : repeat with different features and/or different k.
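Here's a sketch of one way to start on that TO DO, sweeping a few values of k and printing the RMSE for each (slow, since every call repeats the full neighbor search):
from math import sqrt
for trial_k in (1, 3, 5, 10, 20):
    trial_errors = find_knn_error(training, testing, k=trial_k)
    print(f'k = {trial_k:2d} : rmse = {sqrt(mean([e**2 for e in trial_errors])):.3f}')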
# testing get_column
column_one = normalize(get_column(abalone, 1))
standard_deviation(column_one), mean(column_one)
# testing in-place normalize_column
t = [[1, 2, 3],
     [4, 5, 6],
     [10, 11, 12]]
normalize_column(t, 1) # in place
t
# see if scratch's distance() function works and looks OK
row_a = raw_abalone[13][1:-1] # random row; skip sex and target
row_b = raw_abalone[17][1:-1] # random row; skip sex and target
print('row_a : ', row_a)
print('row_b : ', row_b)
print('distance between is ', distance(row_a, row_b))
# looking at the data ...
# Is a weight of 0.0 a valid value? Maybe for tiny infants?? Hmmm.
for col in feature_cols:
    column = get_column(raw_abalone, col)
    print(f'column {col} has min={min(column):6.2f}, max={max(column):6.2f}')
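And a quick count of how many rows have a zero in any numeric feature:
zero_rows = [row for row in raw_abalone
             if any(row[col] == 0.0 for col in feature_cols)]
print('rows with a zero feature :', len(zero_rows))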
# make sure we know how to use the shuffle function
from random import shuffle
a = [1,2,3,4]
shuffle(a)
a