Skip to content

Commit 815a29e

Browse files
committed
2 parents 414953d + 2905bdf commit 815a29e

35 files changed

+699
-147
lines changed

SimpleImageDataset/building02.jpg

1.49 MB
Loading

SimpleImageDataset/scene02.jpg

1.4 MB
Loading

SimpleImageDataset/scene08.jpg

1.82 MB
Loading

ch02/README.rst

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
=========
2+
Chapter 2
3+
=========
4+
5+
Support code for *Chapter 2: Learning How to Classify with Real-world
6+
Examples*. The directory data contains the seeds dataset, originally downloaded
7+
from https://archive.ics.uci.edu/ml/datasets/seeds
8+
9+
figure1.py
10+
Figure 1 in the book: all 2-by-2 scatter plots
11+
12+
figure2.py
13+
Figure 2 in the book: threshold & decision area
14+
15+
figure4_5_sklearn.py
16+
Figures 4 and 5 in the book: KNN decision boundaries before and after feature
17+
normalization. This also produces a version of the figure using 11
18+
neighbors (not in the book), which shows that the result is smoother, not
19+
as sensitive to exact positions of each datapoint.
20+
21+
figure4_5_no_sklearn.py
22+
Alternative code for Figures 4 and 5 without using scikit-learn
23+
24+
load.py
25+
Code to load the seeds data
26+
27+
simple_threshold.py
28+
Code from the book: finds the first partition, between Setosa and the other classes.
29+
30+
stump.py
31+
Code from the book: finds the second partition, between Virginica and Versicolor.
32+
33+
threshold.py
34+
Functional implementation of a threshold classifier
35+
36+
heldout.py
37+
Evaluate the threshold model on heldout data
38+
39+
seeds_knn_sklearn.py
40+
Demonstrate cross-validation and feature normalization using scikit-learn
41+
42+
seeds_threshold.py
43+
Test thresholding model on the seeds dataset (result mentioned in book, but no code)
44+
45+
seeds_knn_increasing_k.py
46+
Test effect of increasing num_neighbors on accuracy.
47+
48+
knn.py
49+
Implementation of K-Nearest neighbor without using scikit-learn.
50+
51+
seeds_knn.py
52+
Demonstrate cross-validation (without scikit-learn)

ch02/extra/create_tsv.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
#
66
# It is made available under the MIT License
77

8-
import milksets.iris
98
import milksets.seeds
109

1110

@@ -16,5 +15,4 @@ def save_as_tsv(fname, module):
1615
for f, n in zip(features, nlabels):
1716
print >>ofile, "\t".join(map(str, f) + [n])
1817

19-
save_as_tsv('iris.tsv', milksets.iris)
2018
save_as_tsv('seeds.tsv', milksets.seeds)

ch02/figure1.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
#
66
# It is made available under the MIT License
77

8-
import numpy as np
98
from sklearn.datasets import load_iris
109
from matplotlib import pyplot as plt
1110

@@ -19,11 +18,14 @@
1918
pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
2019
for i, (p0, p1) in enumerate(pairs):
2120
ax = axes.flat[i]
21+
22+
# Use a different marker/color for each class `t`
2223
for t, marker, c in zip(range(3), ">ox", "rgb"):
2324
ax.scatter(features[target == t, p0], features[
2425
target == t, p1], marker=marker, c=c)
2526
ax.set_xlabel(feature_names[p0])
2627
ax.set_ylabel(feature_names[p1])
2728
ax.set_xticks([])
2829
ax.set_yticks([])
30+
fig.tight_layout()
2931
fig.savefig('figure1.png')

ch02/figure2.py

Lines changed: 32 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,16 @@
1414
feature_names = data['feature_names']
1515
species = data['target_names'][data['target']]
1616

17-
setosa = (species == 'setosa')
18-
features = features[~setosa]
19-
species = species[~setosa]
20-
virginica = species == 'virginica'
17+
is_setosa = (species == 'setosa')
18+
features = features[~is_setosa]
19+
species = species[~is_setosa]
20+
is_virginica = (species == 'virginica')
2121

22+
# Hand fixed threshold:
2223
t = 1.75
23-
p0, p1 = 3, 2
24+
25+
# Features to use: 3 & 2
26+
f0, f1 = 3, 2
2427

2528
if COLOUR_FIGURE:
2629
area1c = (1., .8, .8)
@@ -29,19 +32,27 @@
2932
area1c = (1., 1, 1)
3033
area2c = (.7, .7, .7)
3134

32-
x0, x1 = [features[:, p0].min() * .9, features[:, p0].max() * 1.1]
33-
y0, y1 = [features[:, p1].min() * .9, features[:, p1].max() * 1.1]
34-
35-
plt.fill_between([t, x1], [y0, y0], [y1, y1], color=area2c)
36-
plt.fill_between([x0, t], [y0, y0], [y1, y1], color=area1c)
37-
plt.plot([t, t], [y0, y1], 'k--', lw=2)
38-
plt.plot([t - .1, t - .1], [y0, y1], 'k:', lw=2)
39-
plt.scatter(features[virginica, p0],
40-
features[virginica, p1], c='b', marker='o')
41-
plt.scatter(features[~virginica, p0],
42-
features[~virginica, p1], c='r', marker='x')
43-
plt.ylim(y0, y1)
44-
plt.xlim(x0, x1)
45-
plt.xlabel(feature_names[p0])
46-
plt.ylabel(feature_names[p1])
47-
plt.savefig('figure2.png')
35+
# Plot from 90% of smallest value to 110% of largest value
36+
# (all feature values are positive, otherwise this would not work very well)
37+
38+
x0 = features[:, f0].min() * .9
39+
x1 = features[:, f0].max() * 1.1
40+
41+
y0 = features[:, f1].min() * .9
42+
y1 = features[:, f1].max() * 1.1
43+
44+
fig,ax = plt.subplots()
45+
ax.fill_between([t, x1], [y0, y0], [y1, y1], color=area2c)
46+
ax.fill_between([x0, t], [y0, y0], [y1, y1], color=area1c)
47+
ax.plot([t, t], [y0, y1], 'k--', lw=2)
48+
ax.plot([t - .1, t - .1], [y0, y1], 'k:', lw=2)
49+
ax.scatter(features[is_virginica, f0],
50+
features[is_virginica, f1], c='b', marker='o')
51+
ax.scatter(features[~is_virginica, f0],
52+
features[~is_virginica, f1], c='r', marker='x')
53+
ax.set_ylim(y0, y1)
54+
ax.set_xlim(x0, x1)
55+
ax.set_xlabel(feature_names[f0])
56+
ax.set_ylabel(feature_names[f1])
57+
fig.tight_layout()
58+
fig.savefig('figure2.png')
Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from matplotlib.colors import ListedColormap
1212
from load import load_dataset
1313
import numpy as np
14-
from knn import fit_model, predict, accuracy
14+
from knn import fit_model, predict
1515

1616
feature_names = [
1717
'area',
@@ -24,7 +24,19 @@
2424
]
2525

2626

27-
def train_plot(features, labels):
27+
def plot_decision(features, labels):
28+
'''Plots decision boundary for KNN
29+
30+
Parameters
31+
----------
32+
features : ndarray
33+
labels : sequence
34+
35+
Returns
36+
-------
37+
fig : Matplotlib Figure
38+
ax : Matplotlib Axes
39+
'''
2840
y0, y1 = features[:, 2].min() * .9, features[:, 2].max() * 1.1
2941
x0, x1 = features[:, 0].min() * .9, features[:, 0].max() * 1.1
3042
X = np.linspace(x0, x1, 100)
@@ -38,28 +50,30 @@ def train_plot(features, labels):
3850
cmap = ListedColormap([(1., .6, .6), (.6, 1., .6), (.6, .6, 1.)])
3951
else:
4052
cmap = ListedColormap([(1., 1., 1.), (.2, .2, .2), (.6, .6, .6)])
41-
plt.xlim(x0, x1)
42-
plt.ylim(y0, y1)
43-
plt.xlabel(feature_names[0])
44-
plt.ylabel(feature_names[2])
45-
plt.pcolormesh(X, Y, C, cmap=cmap)
53+
fig,ax = plt.subplots()
54+
ax.set_xlim(x0, x1)
55+
ax.set_ylim(y0, y1)
56+
ax.set_xlabel(feature_names[0])
57+
ax.set_ylabel(feature_names[2])
58+
ax.pcolormesh(X, Y, C, cmap=cmap)
4659
if COLOUR_FIGURE:
4760
cmap = ListedColormap([(1., .0, .0), (.0, 1., .0), (.0, .0, 1.)])
48-
plt.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap)
61+
ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap)
4962
else:
5063
for lab, ma in zip(range(3), "Do^"):
51-
plt.plot(features[labels == lab, 0], features[
64+
ax.plot(features[labels == lab, 0], features[
5265
labels == lab, 2], ma, c=(1., 1., 1.))
66+
return fig,ax
5367

5468

5569
features, labels = load_dataset('seeds')
5670
names = sorted(set(labels))
5771
labels = np.array([names.index(ell) for ell in labels])
5872

59-
train_plot(features, labels)
60-
plt.savefig('figure4.png')
73+
fig,ax = plot_decision(features, labels)
74+
fig.savefig('figure4.png')
6175

6276
features -= features.mean(0)
6377
features /= features.std(0)
64-
train_plot(features, labels)
65-
plt.savefig('figure5.png')
78+
fig,ax = plot_decision(features, labels)
79+
fig.savefig('figure5.png')

ch02/figure4_5_sklearn.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# This code is supporting material for the book
2+
# Building Machine Learning Systems with Python
3+
# by Willi Richert and Luis Pedro Coelho
4+
# published by PACKT Publishing
5+
#
6+
# It is made available under the MIT License
7+
8+
COLOUR_FIGURE = False
9+
10+
from matplotlib import pyplot as plt
11+
from matplotlib.colors import ListedColormap
12+
from load import load_dataset
13+
import numpy as np
14+
from sklearn.neighbors import KNeighborsClassifier
15+
16+
feature_names = [
17+
'area',
18+
'perimeter',
19+
'compactness',
20+
'length of kernel',
21+
'width of kernel',
22+
'asymmetry coefficien',
23+
'length of kernel groove',
24+
]
25+
26+
27+
def plot_decision(features, labels, num_neighbors=1):
28+
'''Plots decision boundary for KNN
29+
30+
Parameters
31+
----------
32+
features : ndarray
33+
labels : sequence
34+
35+
Returns
36+
-------
37+
fig : Matplotlib Figure
38+
ax : Matplotlib Axes
39+
'''
40+
y0, y1 = features[:, 2].min() * .9, features[:, 2].max() * 1.1
41+
x0, x1 = features[:, 0].min() * .9, features[:, 0].max() * 1.1
42+
X = np.linspace(x0, x1, 1000)
43+
Y = np.linspace(y0, y1, 1000)
44+
X, Y = np.meshgrid(X, Y)
45+
46+
model = KNeighborsClassifier(num_neighbors)
47+
model.fit(features[:, (0,2)], labels)
48+
C = model.predict(np.vstack([X.ravel(), Y.ravel()]).T).reshape(X.shape)
49+
if COLOUR_FIGURE:
50+
cmap = ListedColormap([(1., .6, .6), (.6, 1., .6), (.6, .6, 1.)])
51+
else:
52+
cmap = ListedColormap([(1., 1., 1.), (.2, .2, .2), (.6, .6, .6)])
53+
fig,ax = plt.subplots()
54+
ax.set_xlim(x0, x1)
55+
ax.set_ylim(y0, y1)
56+
ax.set_xlabel(feature_names[0])
57+
ax.set_ylabel(feature_names[2])
58+
ax.pcolormesh(X, Y, C, cmap=cmap)
59+
if COLOUR_FIGURE:
60+
cmap = ListedColormap([(1., .0, .0), (.0, 1., .0), (.0, .0, 1.)])
61+
ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap)
62+
else:
63+
for lab, ma in zip(range(3), "Do^"):
64+
ax.plot(features[labels == lab, 0], features[
65+
labels == lab, 2], ma, c=(1., 1., 1.))
66+
return fig,ax
67+
68+
69+
features, labels = load_dataset('seeds')
70+
names = sorted(set(labels))
71+
labels = np.array([names.index(ell) for ell in labels])
72+
73+
fig,ax = plot_decision(features, labels)
74+
fig.tight_layout()
75+
fig.savefig('figure4sklearn.png')
76+
77+
features -= features.mean(0)
78+
features /= features.std(0)
79+
fig,ax = plot_decision(features, labels)
80+
fig.tight_layout()
81+
fig.savefig('figure5sklearn.png')
82+
83+
fig,ax = plot_decision(features, labels, 11)
84+
fig.tight_layout()
85+
fig.savefig('figure5sklearn_with_11_neighbors.png')

ch02/heldout.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
# This script demonstrates the difference between the training accuracy and
99
# testing (held-out) accuracy.
1010

11-
from matplotlib import pyplot as plt
1211
import numpy as np
1312
from sklearn.datasets import load_iris
1413
from threshold import fit_model, accuracy
@@ -18,20 +17,23 @@
1817
labels = data['target_names'][data['target']]
1918

2019
# We are going to remove the setosa examples as they are too easy:
21-
setosa = (labels == 'setosa')
22-
features = features[~setosa]
23-
labels = labels[~setosa]
20+
is_setosa = (labels == 'setosa')
21+
features = features[~is_setosa]
22+
labels = labels[~is_setosa]
2423

2524
# Now we classify virginica vs non-virginica
26-
virginica = (labels == 'virginica')
25+
is_virginica = (labels == 'virginica')
2726

2827
# Split the data in two: testing and training
2928
testing = np.tile([True, False], 50) # testing = [True,False,True,False,True,False...]
29+
30+
# Training is the negation of testing: i.e., datapoints not used for testing,
31+
# will be used for training
3032
training = ~testing
3133

32-
model = fit_model(features[training], virginica[training])
33-
train_accuracy = accuracy(features[training], virginica[training], model)
34-
test_accuracy = accuracy(features[testing], virginica[testing], model)
34+
model = fit_model(features[training], is_virginica[training])
35+
train_accuracy = accuracy(features[training], is_virginica[training], model)
36+
test_accuracy = accuracy(features[testing], is_virginica[testing], model)
3537

3638
print('''\
3739
Training accuracy was {0:.1%}.

0 commit comments

Comments
 (0)