Skip to content

Commit 5e667ef

Browse files
committed
Reworked for 2nd edition
1 parent 847e33f commit 5e667ef

File tree

4 files changed

+47
-30
lines changed

4 files changed

+47
-30
lines changed

ch03/plot_kmeans_example.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from matplotlib import pylab
1616
from sklearn.cluster import KMeans
1717

18-
from utils import DATA_DIR, CHART_DIR
18+
from utils import CHART_DIR
1919

2020
seed = 2
2121
sp.random.seed(seed) # to reproduce the data later on
@@ -33,7 +33,6 @@ def plot_clustering(x, y, title, mx=None, ymax=None, xmin=None, km=None):
3333
pylab.title(title)
3434
pylab.xlabel("Occurrence word 1")
3535
pylab.ylabel("Occurrence word 2")
36-
# pylab.xticks([w*7*24 for w in range(10)], ['week %i'%w for w in range(10)])
3736

3837
pylab.autoscale(tight=True)
3938
pylab.ylim(ymin=0, ymax=1)

ch03/rel_post_01.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,7 @@ def build_analyzer(self):
4343
return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
4444

4545
vectorizer = StemmedTfidfVectorizer(
46-
min_df=1, stop_words='english', charset_error='ignore')
47-
print(vectorizer)
46+
min_df=1, stop_words='english', decode_error='ignore')
4847

4948
X_train = vectorizer.fit_transform(posts)
5049

ch03/rel_post_mlcomp_01.py

Lines changed: 40 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -5,35 +5,38 @@
55
#
66
# It is made available under the MIT License
77

8-
import os
9-
import sys
108
import sklearn.datasets
119
import scipy as sp
1210

13-
from utils import DATA_DIR
14-
15-
if not os.path.exists(DATA_DIR):
16-
print("""\
17-
It seems that you have not yet downloaded the MLCOMP data set.
18-
Please do so and place it into %s."""%DATA_DIR)
19-
sys.exit(1)
20-
2111
new_post = \
2212
"""Disk drive problems. Hi, I have a problem with my hard disk.
2313
After 1 year it is working only sporadically now.
2414
I tried to format it, but now it doesn't boot any more.
2515
Any ideas? Thanks.
2616
"""
2717

18+
print("""\
19+
Dear reader of the 1st edition of 'Building Machine Learning Systems with Python'!
20+
For the 2nd edition we introduced a couple of changes that will result into
21+
results that differ from the results in the 1st edition.
22+
E.g. we now fully rely on scikit's fetch_20newsgroups() instead of requiring
23+
you to download the data manually from MLCOMP.
24+
If you have any questions, please ask at http://www.twotoreal.com
25+
""")
26+
27+
all_data = sklearn.datasets.fetch_20newsgroups(subset="all")
28+
print("Number of total posts: %i" % len(all_data.filenames))
29+
# Number of total posts: 18846
30+
2831
groups = [
2932
'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
30-
'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
31-
dataset = sklearn.datasets.load_mlcomp("20news-18828", "train",
32-
mlcomp_root=DATA_DIR,
33-
categories=groups)
34-
print("Number of posts:", len(dataset.filenames))
33+
'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
34+
train_data = sklearn.datasets.fetch_20newsgroups(subset="train",
35+
categories=groups)
36+
print("Number of training posts in tech groups:", len(train_data.filenames))
37+
# Number of training posts in tech groups: 3529
3538

36-
labels = dataset.target
39+
labels = train_data.target
3740
num_clusters = 50 # sp.unique(labels).shape[0]
3841

3942
import nltk.stem
@@ -49,31 +52,41 @@ def build_analyzer(self):
4952
return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
5053

5154
vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5,
52-
# max_features=1000,
53-
stop_words='english', charset_error='ignore'
55+
stop_words='english', decode_error='ignore'
5456
)
55-
vectorized = vectorizer.fit_transform(dataset.data)
57+
58+
vectorized = vectorizer.fit_transform(train_data.data)
5659
num_samples, num_features = vectorized.shape
5760
print("#samples: %d, #features: %d" % (num_samples, num_features))
58-
61+
# samples: 3529, #features: 4712
5962

6063
from sklearn.cluster import KMeans
6164

62-
km = KMeans(n_clusters=num_clusters, init='k-means++', n_init=1,
63-
verbose=1)
64-
65+
km = KMeans(n_clusters=num_clusters, n_init=1, verbose=1, random_state=3)
6566
clustered = km.fit(vectorized)
6667

68+
print("km.labels_=%s" % km.labels_)
69+
# km.labels_=[ 6 34 22 ..., 2 21 26]
70+
71+
print("km.labels_.shape=%s" % km.labels_.shape)
72+
# km.labels_.shape=3529
73+
6774
from sklearn import metrics
6875
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
76+
# Homogeneity: 0.400
6977
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
78+
# Completeness: 0.206
7079
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
80+
# V-measure: 0.272
7181
print("Adjusted Rand Index: %0.3f" %
7282
metrics.adjusted_rand_score(labels, km.labels_))
83+
# Adjusted Rand Index: 0.064
7384
print("Adjusted Mutual Information: %0.3f" %
7485
metrics.adjusted_mutual_info_score(labels, km.labels_))
86+
# Adjusted Mutual Information: 0.197
7587
print(("Silhouette Coefficient: %0.3f" %
7688
metrics.silhouette_score(vectorized, labels, sample_size=1000)))
89+
# Silhouette Coefficient: 0.006
7790

7891
new_post_vec = vectorizer.transform([new_post])
7992
new_post_label = km.predict(new_post_vec)[0]
@@ -83,13 +96,14 @@ def build_analyzer(self):
8396
similar = []
8497
for i in similar_indices:
8598
dist = sp.linalg.norm((new_post_vec - vectorized[i]).toarray())
86-
similar.append((dist, dataset.data[i]))
99+
similar.append((dist, train_data.data[i]))
87100

88101
similar = sorted(similar)
102+
print("Count similar: %i" % len(similar))
89103

90104
show_at_1 = similar[0]
91-
show_at_2 = similar[len(similar) / 2]
92-
show_at_3 = similar[-1]
105+
show_at_2 = similar[int(len(similar) / 10)]
106+
show_at_3 = similar[int(len(similar) / 2)]
93107

94108
print("=== #1 ===")
95109
print(show_at_1)

ch03/utils.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,8 @@
1515
print("Uh, we were expecting a data directory, which contains the toy data")
1616
sys.exit(1)
1717

18+
CHART_DIR = os.path.join(
19+
os.path.dirname(os.path.realpath(__file__)), "charts")
20+
if not os.path.exists(CHART_DIR):
21+
os.mkdir(CHART_DIR)
22+

0 commit comments

Comments
 (0)