#
# It is made available under the MIT License
# The 1st-edition MLCOMP download gate (os/sys + DATA_DIR check) is gone:
# the data now comes straight from scikit-learn's fetch_20newsgroups() below.
import sklearn.datasets
import scipy as sp
# A made-up query post: we later assign it to a cluster and look for the
# most similar training posts within that cluster.
new_post = """Disk drive problems. Hi, I have a problem with my hard disk.
After 1 year it is working only sporadically now.
I tried to format it, but now it doesn't boot any more.
Any ideas? Thanks.
"""
print("""\
Dear reader of the 1st edition of 'Building Machine Learning Systems with Python'!
For the 2nd edition we introduced a couple of changes that will result into
results that differ from the results in the 1st edition.
E.g. we now fully rely on scikit's fetch_20newsgroups() instead of requiring
you to download the data manually from MLCOMP.
If you have any questions, please ask at http://www.twotoreal.com
""")

# Downloads/caches the full 20-newsgroups corpus on first use.
all_data = sklearn.datasets.fetch_20newsgroups(subset="all")
print("Number of total posts: %i" % len(all_data.filenames))
# Number of total posts: 18846
# Restrict the corpus to technical newsgroups. Names must match the official
# 20-newsgroups category names exactly (note: 'comp.sys.mac.hardware' — the
# source had a stray space inside this name, which fetch_20newsgroups rejects).
groups = [
    'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
train_data = sklearn.datasets.fetch_20newsgroups(subset="train",
                                                 categories=groups)
print("Number of training posts in tech groups:", len(train_data.filenames))
# Number of training posts in tech groups: 3529

labels = train_data.target
# Deliberately over-clustered: 50 clusters instead of the true number of
# groups (sp.unique(labels).shape[0] would give that).
num_clusters = 50  # sp.unique(labels).shape[0]
import nltk.stem
@@ -49,31 +52,41 @@ def build_analyzer(self):
4952 return lambda doc : (english_stemmer .stem (w ) for w in analyzer (doc ))
5053
# decode_error replaces the old charset_error keyword, which was removed
# from scikit-learn's vectorizers.
vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5,
                                    stop_words='english',
                                    decode_error='ignore')

vectorized = vectorizer.fit_transform(train_data.data)
num_samples, num_features = vectorized.shape
print("#samples: %d, #features: %d" % (num_samples, num_features))
# samples: 3529, #features: 4712
from sklearn.cluster import KMeans

# Single init, fixed random_state: reproducible clustering across runs.
km = KMeans(n_clusters=num_clusters, n_init=1, verbose=1, random_state=3)
clustered = km.fit(vectorized)

print("km.labels_=%s" % km.labels_)
# km.labels_=[ 6 34 22 ..., 2 21 26]

print("km.labels_.shape=%s" % km.labels_.shape)
# km.labels_.shape=3529
# Compare the clustering against the true newsgroup labels with the
# standard external cluster-quality measures.
from sklearn import metrics
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
# Homogeneity: 0.400
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
# Completeness: 0.206
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
# V-measure: 0.272
print("Adjusted Rand Index: %0.3f" %
      metrics.adjusted_rand_score(labels, km.labels_))
# Adjusted Rand Index: 0.064
print("Adjusted Mutual Information: %0.3f" %
      metrics.adjusted_mutual_info_score(labels, km.labels_))
# Adjusted Mutual Information: 0.197
print("Silhouette Coefficient: %0.3f" %
      metrics.silhouette_score(vectorized, labels, sample_size=1000))
# Silhouette Coefficient: 0.006
# Vectorize the query post with the fitted vectorizer and assign it to a cluster.
new_post_vec = vectorizer.transform([new_post])
new_post_label = km.predict(new_post_vec)[0]
@@ -83,13 +96,14 @@ def build_analyzer(self):
# Rank all posts in the query's cluster by Euclidean distance in TF-IDF space.
similar = []
for i in similar_indices:
    dist = sp.linalg.norm((new_post_vec - vectorized[i]).toarray())
    similar.append((dist, train_data.data[i]))

similar = sorted(similar)
print("Count similar: %i" % len(similar))

# Sample a close, a fairly close, and a middling match. int(...) is required
# under Python 3, where / yields a float that cannot index a list.
show_at_1 = similar[0]
show_at_2 = similar[int(len(similar) / 10)]
show_at_3 = similar[int(len(similar) / 2)]
print("=== #1 ===")
print(show_at_1)