#
# It is made available under the MIT License
# The 1st-edition MLCOMP download gate (os/sys + DATA_DIR check) is gone:
# the data now comes straight from scikit-learn's fetch_20newsgroups() below.
import sklearn.datasets
import scipy as sp
# A made-up query post: we later assign it to a cluster and look for the
# most similar training posts within that cluster.
new_post = """Disk drive problems. Hi, I have a problem with my hard disk.
After 1 year it is working only sporadically now.
I tried to format it, but now it doesn't boot any more.
Any ideas? Thanks.
"""
print("""\
Dear reader of the 1st edition of 'Building Machine Learning Systems with Python'!
For the 2nd edition we introduced a couple of changes that will result into
results that differ from the results in the 1st edition.
E.g. we now fully rely on scikit's fetch_20newsgroups() instead of requiring
you to download the data manually from MLCOMP.
If you have any questions, please ask at http://www.twotoreal.com
""")

# Downloads/caches the full 20-newsgroups corpus on first use.
all_data = sklearn.datasets.fetch_20newsgroups(subset="all")
print("Number of total posts: %i" % len(all_data.filenames))
# Number of total posts: 18846
# Restrict the corpus to technical newsgroups. Names must match the official
# 20-newsgroups category names exactly (note: 'comp.sys.mac.hardware' — the
# source had a stray space inside this name, which fetch_20newsgroups rejects).
groups = [
    'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
train_data = sklearn.datasets.fetch_20newsgroups(subset="train",
                                                 categories=groups)
print("Number of training posts in tech groups:", len(train_data.filenames))
# Number of training posts in tech groups: 3529

labels = train_data.target
# Deliberately over-clustered: 50 clusters instead of the true number of
# groups (sp.unique(labels).shape[0] would give that).
num_clusters = 50  # sp.unique(labels).shape[0]
import nltk.stem
@@ -49,31 +52,41 @@ def build_analyzer(self):
4952 return lambda doc : (english_stemmer .stem (w ) for w in analyzer (doc ))
5053
# decode_error replaces the old charset_error keyword, which was removed
# from scikit-learn's vectorizers.
vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5,
                                    stop_words='english',
                                    decode_error='ignore')

vectorized = vectorizer.fit_transform(train_data.data)
num_samples, num_features = vectorized.shape
print("#samples: %d, #features: %d" % (num_samples, num_features))
# samples: 3529, #features: 4712
from sklearn.cluster import KMeans

# Single init, fixed random_state: reproducible clustering across runs.
km = KMeans(n_clusters=num_clusters, n_init=1, verbose=1, random_state=3)
clustered = km.fit(vectorized)

print("km.labels_=%s" % km.labels_)
# km.labels_=[ 6 34 22 ..., 2 21 26]

print("km.labels_.shape=%s" % km.labels_.shape)
# km.labels_.shape=3529
# Compare the clustering against the true newsgroup labels with the
# standard external cluster-quality measures.
from sklearn import metrics
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
# Homogeneity: 0.400
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
# Completeness: 0.206
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
# V-measure: 0.272
print("Adjusted Rand Index: %0.3f" %
      metrics.adjusted_rand_score(labels, km.labels_))
# Adjusted Rand Index: 0.064
print("Adjusted Mutual Information: %0.3f" %
      metrics.adjusted_mutual_info_score(labels, km.labels_))
# Adjusted Mutual Information: 0.197
print("Silhouette Coefficient: %0.3f" %
      metrics.silhouette_score(vectorized, labels, sample_size=1000))
# Silhouette Coefficient: 0.006
# Vectorize the query post with the fitted vectorizer and assign it to a cluster.
new_post_vec = vectorizer.transform([new_post])
new_post_label = km.predict(new_post_vec)[0]
@@ -83,13 +96,14 @@ def build_analyzer(self):
# Rank all posts in the query's cluster by Euclidean distance in TF-IDF space.
similar = []
for i in similar_indices:
    dist = sp.linalg.norm((new_post_vec - vectorized[i]).toarray())
    similar.append((dist, train_data.data[i]))

similar = sorted(similar)
print("Count similar: %i" % len(similar))

# Sample a close, a fairly close, and a middling match. int(...) is required
# under Python 3, where / yields a float that cannot index a list.
show_at_1 = similar[0]
show_at_2 = similar[int(len(similar) / 10)]
show_at_3 = similar[int(len(similar) / 2)]
print("=== #1 ===")
print(show_at_1)