3030
3131import nltk
3232
33- # splitting questions into train (70%) and test(30%) and then take their
34- # answers
35- all_posts = list (meta .keys ())
36- all_questions = [q for q , v in meta .items () if v ['ParentId' ] == - 1 ]
37- all_answers = [q for q , v in meta .items () if v ['ParentId' ] != - 1 ] # [:500]
33+ # The sorting below is only to ensure reproducible numbers. Further down
34+ # we will occasionally skip a fold when it contains instances of only
35+ # one label. The two lines below ensure that the behavior is exactly the
36+ # same across different runs.
37+ all_questions = sorted ([q for q , v in meta .items () if v ['ParentId' ] == - 1 ])
38+ all_answers = sorted ([q for q , v in meta .items () if v ['ParentId' ] != - 1 ])
3839
3940feature_names = np .array ((
4041 'NumTextTokens' ,
4748 'NumImages'
4849))
4950
50- # activate the following for reduced feature space
51- """
52- feature_names = np.array((
53- 'NumTextTokens',
54- 'LinkCount',
55- ))
56- """
57-
5851
5952def prepare_sent_features ():
6053 for pid , text in fetch_posts (chosen , with_index = True ):
@@ -80,17 +73,26 @@ def get_features(aid):
8073 return tuple (meta [aid ][fn ] for fn in feature_names )
8174
8275qa_X = np .asarray ([get_features (aid ) for aid in all_answers ])
83- # Score > 0 tests => positive class is good answer
84- # Score <= 0 tests => positive class is poor answer
85- qa_Y = np .asarray ([meta [aid ]['Score' ] > 0 for aid in all_answers ])
76+
8677classifying_answer = "good"
78+ #classifying_answer = "poor"
79+
80+ if classifying_answer == "good" :
81+ # Score > 0 tests => positive class is good answer
82+ qa_Y = np .asarray ([meta [aid ]['Score' ] > 0 for aid in all_answers ])
83+ elif classifying_answer == "poor" :
84+ # Score <= 0 tests => positive class is poor answer
85+ qa_Y = np .asarray ([meta [aid ]['Score' ] <= 0 for aid in all_answers ])
86+ else :
87+ raise Exception ("classifying_answer='%s' is not supported" %
88+ classifying_answer )
8789
8890for idx , feat in enumerate (feature_names ):
8991 plot_feat_hist ([(qa_X [:, idx ], feat )])
90- """
91- plot_feat_hist([(qa_X[:, idx], feature_names[idx]) for idx in [1,0]], 'feat_hist_two.png')
92- plot_feat_hist([(qa_X[:, idx], feature_names[idx]) for idx in [3,4,5,6]], 'feat_hist_four.png')
93- """
92+
93+ # plot_feat_hist([(qa_X[:, idx], feature_names[idx]) for idx in [1,0]], 'feat_hist_two.png')
94+ # plot_feat_hist([(qa_X[:, idx], feature_names[idx]) for idx in [3,4,5,6]], 'feat_hist_four.png')
95+
9496avg_scores_summary = []
9597
9698
@@ -115,10 +117,16 @@ def measure(clf_class, parameters, name, data_size=None, plot=False):
115117 pr_scores = []
116118 precisions , recalls , thresholds = [], [], []
117119
118- for train , test in cv :
120+ for fold_idx , ( train , test ) in enumerate ( cv ) :
119121 X_train , y_train = X [train ], Y [train ]
120122 X_test , y_test = X [test ], Y [test ]
121123
124+ only_one_class_in_train = len (set (y_train )) == 1
125+ only_one_class_in_test = len (set (y_test )) == 1
126+ if only_one_class_in_train or only_one_class_in_test :
127+ # a fold containing only one label would break classifier
128+ # training and the precision/recall metrics computed below
129+
122130 clf = clf_class (** parameters )
123131
124132 clf .fit (X_train , y_train )
@@ -145,12 +153,20 @@ def measure(clf_class, parameters, name, data_size=None, plot=False):
145153 precisions .append (precision )
146154 recalls .append (recall )
147155 thresholds .append (pr_thresholds )
156+
157+ # This threshold is determined at the end of chapter 5,
158+ # where we find conditions such that precision is around
159+ # 80%. With it we trade off recall for precision.
160+ threshold_for_detecting_good_answers = 0.59
161+
162+ print ("Clone #%i" % fold_idx )
148163 print (classification_report (y_test , proba [:, label_idx ] >
149- 0.63 , target_names = ['not accepted' , 'accepted' ]))
164+ threshold_for_detecting_good_answers , target_names = ['not accepted' , 'accepted' ]))
150165
151166 # get medium clone
152167 scores_to_sort = pr_scores # roc_scores
153168 medium = np .argsort (scores_to_sort )[len (scores_to_sort ) / 2 ]
169+ print ("Medium clone is #%i" % medium )
154170
155171 if plot :
156172 #plot_roc(roc_scores[medium], name, fprs[medium], tprs[medium])
@@ -178,6 +194,7 @@ def measure(clf_class, parameters, name, data_size=None, plot=False):
178194
179195
180196def bias_variance_analysis (clf_class , parameters , name ):
197+ #import ipdb;ipdb.set_trace()
181198 data_sizes = np .arange (60 , 2000 , 4 )
182199
183200 train_errors = []
@@ -208,13 +225,16 @@ def k_complexity_analysis(clf_class, parameters):
208225
209226 plot_k_complexity (ks , train_errors , test_errors )
210227
211- for k in [5 ]: # [5, 10, 40, 90]:
228+ for k in [5 ]:
229+ # for k in [5, 10, 40]:
230+ #measure(neighbors.KNeighborsClassifier, {'n_neighbors': k}, "%iNN" % k)
212231 bias_variance_analysis (neighbors .KNeighborsClassifier , {
213232 'n_neighbors' : k }, "%iNN" % k )
214233 k_complexity_analysis (neighbors .KNeighborsClassifier , {'n_neighbors' : k })
215234
216235from sklearn .linear_model import LogisticRegression
217- for C in [0.1 ]: # [0.01, 0.1, 1.0, 10.0]:
236+ for C in [0.1 ]:
237+ # for C in [0.01, 0.1, 1.0, 10.0]:
218238 name = "LogReg C=%.2f" % C
219239 bias_variance_analysis (LogisticRegression , {'penalty' : 'l2' , 'C' : C }, name )
220240 measure (LogisticRegression , {'penalty' : 'l2' , 'C' : C }, name , plot = True )
0 commit comments