Skip to content

Commit 815a29e

Browse files
committed
2 parents 414953d + 2905bdf commit 815a29e

35 files changed

+699
-147
lines changed

SimpleImageDataset/building02.jpg

1.49 MB
Loading

SimpleImageDataset/scene02.jpg

1.4 MB
Loading

SimpleImageDataset/scene08.jpg

1.82 MB
Loading

ch02/README.rst

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
=========
2+
Chapter 2
3+
=========
4+
5+
Support code for *Chapter 2: Learning How to Classify with Real-world
6+
Examples*. The directory data contains the seeds dataset, originally downloaded
7+
from https://archive.ics.uci.edu/ml/datasets/seeds
8+
9+
figure1.py
10+
Figure 1 in the book: all 2-by-2 scatter plots
11+
12+
figure2.py
13+
Figure 2 in the book: threshold & decision area
14+
15+
figure4_5_sklearn.py
16+
Figures 4 and 5 in the book: KNN decision boundaries before and after feature
17+
normalization. This also produces a version of the figure using 11
18+
neighbors (not in the book), which shows that the result is smoother, not
19+
as sensitive to exact positions of each datapoint.
20+
21+
figure4_5_no_sklearn.py
22+
Alternative code for Figures 4 and 5 without using scikit-learn
23+
24+
load.py
25+
Code to load the seeds data
26+
27+
simple_threshold.py
28+
Code from the book: finds the first partition, between Setosa and the other classes.
29+
30+
stump.py
31+
Code from the book: finds the second partition, between Virginica and Versicolor.
32+
33+
threshold.py
34+
Functional implementation of a threshold classifier
35+
36+
heldout.py
37+
Evaluate the threshold model on heldout data
38+
39+
seeds_knn_sklearn.py
40+
Demonstrate cross-validation and feature normalization using scikit-learn
41+
42+
seeds_threshold.py
43+
Test thresholding model on the seeds dataset (result mentioned in book, but no code)
44+
45+
seeds_knn_increasing_k.py
46+
Test effect of increasing num_neighbors on accuracy.
47+
48+
knn.py
49+
Implementation of K-Nearest neighbor without using scikit-learn.
50+
51+
seeds_knn.py
52+
Demonstrate cross-validation (without scikit-learn)

ch02/extra/create_tsv.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
#
66
# It is made available under the MIT License
77

8-
import milksets.iris
98
import milksets.seeds
109

1110

@@ -16,5 +15,4 @@ def save_as_tsv(fname, module):
1615
for f, n in zip(features, nlabels):
1716
print >>ofile, "\t".join(map(str, f) + [n])
1817

19-
save_as_tsv('iris.tsv', milksets.iris)
2018
save_as_tsv('seeds.tsv', milksets.seeds)

ch02/figure1.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
#
66
# It is made available under the MIT License
77

8-
import numpy as np
98
from sklearn.datasets import load_iris
109
from matplotlib import pyplot as plt
1110

@@ -19,11 +18,14 @@
1918
pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
2019
for i, (p0, p1) in enumerate(pairs):
2120
ax = axes.flat[i]
21+
22+
# Use a different marker/color for each class `t`
2223
for t, marker, c in zip(range(3), ">ox", "rgb"):
2324
ax.scatter(features[target == t, p0], features[
2425
target == t, p1], marker=marker, c=c)
2526
ax.set_xlabel(feature_names[p0])
2627
ax.set_ylabel(feature_names[p1])
2728
ax.set_xticks([])
2829
ax.set_yticks([])
30+
fig.tight_layout()
2931
fig.savefig('figure1.png')

ch02/figure2.py

Lines changed: 32 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,16 @@
1414
feature_names = data['feature_names']
1515
species = data['target_names'][data['target']]
1616

17-
setosa = (species == 'setosa')
18-
features = features[~setosa]
19-
species = species[~setosa]
20-
virginica = species == 'virginica'
17+
is_setosa = (species == 'setosa')
18+
features = features[~is_setosa]
19+
species = species[~is_setosa]
20+
is_virginica = (species == 'virginica')
2121

22+
# Hand fixed threshold:
2223
t = 1.75
23-
p0, p1 = 3, 2
24+
25+
# Features to use: 3 & 2
26+
f0, f1 = 3, 2
2427

2528
if COLOUR_FIGURE:
2629
area1c = (1., .8, .8)
@@ -29,19 +32,27 @@
2932
area1c = (1., 1, 1)
3033
area2c = (.7, .7, .7)
3134

32-
x0, x1 = [features[:, p0].min() * .9, features[:, p0].max() * 1.1]
33-
y0, y1 = [features[:, p1].min() * .9, features[:, p1].max() * 1.1]
34-
35-
plt.fill_between([t, x1], [y0, y0], [y1, y1], color=area2c)
36-
plt.fill_between([x0, t], [y0, y0], [y1, y1], color=area1c)
37-
plt.plot([t, t], [y0, y1], 'k--', lw=2)
38-
plt.plot([t - .1, t - .1], [y0, y1], 'k:', lw=2)
39-
plt.scatter(features[virginica, p0],
40-
features[virginica, p1], c='b', marker='o')
41-
plt.scatter(features[~virginica, p0],
42-
features[~virginica, p1], c='r', marker='x')
43-
plt.ylim(y0, y1)
44-
plt.xlim(x0, x1)
45-
plt.xlabel(feature_names[p0])
46-
plt.ylabel(feature_names[p1])
47-
plt.savefig('figure2.png')
35+
# Plot from 90% of smallest value to 110% of largest value
36+
# (all feature values are positive, otherwise this would not work very well)
37+
38+
x0 = features[:, f0].min() * .9
39+
x1 = features[:, f0].max() * 1.1
40+
41+
y0 = features[:, f1].min() * .9
42+
y1 = features[:, f1].max() * 1.1
43+
44+
fig,ax = plt.subplots()
45+
ax.fill_between([t, x1], [y0, y0], [y1, y1], color=area2c)
46+
ax.fill_between([x0, t], [y0, y0], [y1, y1], color=area1c)
47+
ax.plot([t, t], [y0, y1], 'k--', lw=2)
48+
ax.plot([t - .1, t - .1], [y0, y1], 'k:', lw=2)
49+
ax.scatter(features[is_virginica, f0],
50+
features[is_virginica, f1], c='b', marker='o')
51+
ax.scatter(features[~is_virginica, f0],
52+
features[~is_virginica, f1], c='r', marker='x')
53+
ax.set_ylim(y0, y1)
54+
ax.set_xlim(x0, x1)
55+
ax.set_xlabel(feature_names[f0])
56+
ax.set_ylabel(feature_names[f1])
57+
fig.tight_layout()
58+
fig.savefig('figure2.png')
Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from matplotlib.colors import ListedColormap
1212
from load import load_dataset
1313
import numpy as np
14-
from knn import fit_model, predict, accuracy
14+
from knn import fit_model, predict
1515

1616
feature_names = [
1717
'area',
@@ -24,7 +24,19 @@
2424
]
2525

2626

27-
def train_plot(features, labels):
27+
def plot_decision(features, labels):
28+
'''Plots decision boundary for KNN
29+
30+
Parameters
31+
----------
32+
features : ndarray
33+
labels : sequence
34+
35+
Returns
36+
-------
37+
fig : Matplotlib Figure
38+
ax : Matplotlib Axes
39+
'''
2840
y0, y1 = features[:, 2].min() * .9, features[:, 2].max() * 1.1
2941
x0, x1 = features[:, 0].min() * .9, features[:, 0].max() * 1.1
3042
X = np.linspace(x0, x1, 100)
@@ -38,28 +50,30 @@ def train_plot(features, labels):
3850
cmap = ListedColormap([(1., .6, .6), (.6, 1., .6), (.6, .6, 1.)])
3951
else:
4052
cmap = ListedColormap([(1., 1., 1.), (.2, .2, .2), (.6, .6, .6)])
41-
plt.xlim(x0, x1)
42-
plt.ylim(y0, y1)
43-
plt.xlabel(feature_names[0])
44-
plt.ylabel(feature_names[2])
45-
plt.pcolormesh(X, Y, C, cmap=cmap)
53+
fig,ax = plt.subplots()
54+
ax.set_xlim(x0, x1)
55+
ax.set_ylim(y0, y1)
56+
ax.set_xlabel(feature_names[0])
57+
ax.set_ylabel(feature_names[2])
58+
ax.pcolormesh(X, Y, C, cmap=cmap)
4659
if COLOUR_FIGURE:
4760
cmap = ListedColormap([(1., .0, .0), (.0, 1., .0), (.0, .0, 1.)])
48-
plt.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap)
61+
ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap)
4962
else:
5063
for lab, ma in zip(range(3), "Do^"):
51-
plt.plot(features[labels == lab, 0], features[
64+
ax.plot(features[labels == lab, 0], features[
5265
labels == lab, 2], ma, c=(1., 1., 1.))
66+
return fig,ax
5367

5468

5569
features, labels = load_dataset('seeds')
5670
names = sorted(set(labels))
5771
labels = np.array([names.index(ell) for ell in labels])
5872

59-
train_plot(features, labels)
60-
plt.savefig('figure4.png')
73+
fig,ax = plot_decision(features, labels)
74+
fig.savefig('figure4.png')
6175

6276
features -= features.mean(0)
6377
features /= features.std(0)
64-
train_plot(features, labels)
65-
plt.savefig('figure5.png')
78+
fig,ax = plot_decision(features, labels)
79+
fig.savefig('figure5.png')

ch02/figure4_5_sklearn.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# This code is supporting material for the book
2+
# Building Machine Learning Systems with Python
3+
# by Willi Richert and Luis Pedro Coelho
4+
# published by PACKT Publishing
5+
#
6+
# It is made available under the MIT License
7+
8+
COLOUR_FIGURE = False
9+
10+
from matplotlib import pyplot as plt
11+
from matplotlib.colors import ListedColormap
12+
from load import load_dataset
13+
import numpy as np
14+
from sklearn.neighbors import KNeighborsClassifier
15+
16+
feature_names = [
17+
'area',
18+
'perimeter',
19+
'compactness',
20+
'length of kernel',
21+
'width of kernel',
22+
'asymmetry coefficien',
23+
'length of kernel groove',
24+
]
25+
26+
27+
def plot_decision(features, labels, num_neighbors=1):
28+
'''Plots decision boundary for KNN
29+
30+
Parameters
31+
----------
32+
features : ndarray
33+
labels : sequence
34+
35+
Returns
36+
-------
37+
fig : Matplotlib Figure
38+
ax : Matplotlib Axes
39+
'''
40+
y0, y1 = features[:, 2].min() * .9, features[:, 2].max() * 1.1
41+
x0, x1 = features[:, 0].min() * .9, features[:, 0].max() * 1.1
42+
X = np.linspace(x0, x1, 1000)
43+
Y = np.linspace(y0, y1, 1000)
44+
X, Y = np.meshgrid(X, Y)
45+
46+
model = KNeighborsClassifier(num_neighbors)
47+
model.fit(features[:, (0,2)], labels)
48+
C = model.predict(np.vstack([X.ravel(), Y.ravel()]).T).reshape(X.shape)
49+
if COLOUR_FIGURE:
50+
cmap = ListedColormap([(1., .6, .6), (.6, 1., .6), (.6, .6, 1.)])
51+
else:
52+
cmap = ListedColormap([(1., 1., 1.), (.2, .2, .2), (.6, .6, .6)])
53+
fig,ax = plt.subplots()
54+
ax.set_xlim(x0, x1)
55+
ax.set_ylim(y0, y1)
56+
ax.set_xlabel(feature_names[0])
57+
ax.set_ylabel(feature_names[2])
58+
ax.pcolormesh(X, Y, C, cmap=cmap)
59+
if COLOUR_FIGURE:
60+
cmap = ListedColormap([(1., .0, .0), (.0, 1., .0), (.0, .0, 1.)])
61+
ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap)
62+
else:
63+
for lab, ma in zip(range(3), "Do^"):
64+
ax.plot(features[labels == lab, 0], features[
65+
labels == lab, 2], ma, c=(1., 1., 1.))
66+
return fig,ax
67+
68+
69+
features, labels = load_dataset('seeds')
70+
names = sorted(set(labels))
71+
labels = np.array([names.index(ell) for ell in labels])
72+
73+
fig,ax = plot_decision(features, labels)
74+
fig.tight_layout()
75+
fig.savefig('figure4sklearn.png')
76+
77+
features -= features.mean(0)
78+
features /= features.std(0)
79+
fig,ax = plot_decision(features, labels)
80+
fig.tight_layout()
81+
fig.savefig('figure5sklearn.png')
82+
83+
fig,ax = plot_decision(features, labels, 11)
84+
fig.tight_layout()
85+
fig.savefig('figure5sklearn_with_11_neighbors.png')

ch02/heldout.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
# This script demonstrates the difference between the training accuracy and
99
# testing (held-out) accuracy.
1010

11-
from matplotlib import pyplot as plt
1211
import numpy as np
1312
from sklearn.datasets import load_iris
1413
from threshold import fit_model, accuracy
@@ -18,20 +17,23 @@
1817
labels = data['target_names'][data['target']]
1918

2019
# We are going to remove the setosa examples as they are too easy:
21-
setosa = (labels == 'setosa')
22-
features = features[~setosa]
23-
labels = labels[~setosa]
20+
is_setosa = (labels == 'setosa')
21+
features = features[~is_setosa]
22+
labels = labels[~is_setosa]
2423

2524
# Now we classify virginica vs non-virginica
26-
virginica = (labels == 'virginica')
25+
is_virginica = (labels == 'virginica')
2726

2827
# Split the data in two: testing and training
2928
testing = np.tile([True, False], 50) # testing = [True,False,True,False,True,False...]
29+
30+
# Training is the negation of testing: i.e., datapoints not used for testing,
31+
# will be used for training
3032
training = ~testing
3133

32-
model = fit_model(features[training], virginica[training])
33-
train_accuracy = accuracy(features[training], virginica[training], model)
34-
test_accuracy = accuracy(features[testing], virginica[testing], model)
34+
model = fit_model(features[training], is_virginica[training])
35+
train_accuracy = accuracy(features[training], is_virginica[training], model)
36+
test_accuracy = accuracy(features[testing], is_virginica[testing], model)
3537

3638
print('''\
3739
Training accuracy was {0:.1%}.

0 commit comments

Comments
 (0)