Skip to content

Commit e885d6a

Browse files
committed
Fixing train/test bug in charts; updating for 2nd edition of the book.
1 parent 9e9da69 commit e885d6a

File tree

5 files changed

+68
-40
lines changed

5 files changed

+68
-40
lines changed

ch05/README.md

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,10 @@
11
Chapter 5 - Classification - Detecting Poor Answers
22
===================================================
33

4-
The book chapter is based on StackExchange's data blob from August 2012:
5-
[http://www.clearbits.net/get/2076-aug-2012.torrent](http://www.clearbits.net/get/2076-aug-2012.torrent)
4+
The book chapter is based on StackExchange's data blob from August 2012 for the first edition.
65

7-
After publishing the book, StackExchange stayed as awesome as it always has been and released an updated version:
8-
[https://archive.org/download/stackexchange/stackexchange_archive.torrent](https://archive.org/download/stackexchange/stackexchange_archive.torrent)
6+
After publishing the book, StackExchange released the May 2014 version at
7+
[https://archive.org/download/stackexchange/stackexchange_archive.torrent](https://archive.org/download/stackexchange/stackexchange_archive.torrent).
98

109
Note that using the latest version, you will get slightly different results.
1110

ch05/classify.py

Lines changed: 44 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,12 @@
3030

3131
import nltk
3232

33-
# splitting questions into train (70%) and test(30%) and then take their
34-
# answers
35-
all_posts = list(meta.keys())
36-
all_questions = [q for q, v in meta.items() if v['ParentId'] == -1]
37-
all_answers = [q for q, v in meta.items() if v['ParentId'] != -1] # [:500]
33+
# The sorting below is only to ensure reproducible numbers. Further down
34+
# we will occasionally skip a fold when it contains instances of only
35+
# one label. The two lines below ensure that the behavior is exactly the
36+
# same for different runs.
37+
all_questions = sorted([q for q, v in meta.items() if v['ParentId'] == -1])
38+
all_answers = sorted([q for q, v in meta.items() if v['ParentId'] != -1])
3839

3940
feature_names = np.array((
4041
'NumTextTokens',
@@ -47,14 +48,6 @@
4748
'NumImages'
4849
))
4950

50-
# activate the following for reduced feature space
51-
"""
52-
feature_names = np.array((
53-
'NumTextTokens',
54-
'LinkCount',
55-
))
56-
"""
57-
5851

5952
def prepare_sent_features():
6053
for pid, text in fetch_posts(chosen, with_index=True):
@@ -80,17 +73,26 @@ def get_features(aid):
8073
return tuple(meta[aid][fn] for fn in feature_names)
8174

8275
qa_X = np.asarray([get_features(aid) for aid in all_answers])
83-
# Score > 0 tests => positive class is good answer
84-
# Score <= 0 tests => positive class is poor answer
85-
qa_Y = np.asarray([meta[aid]['Score'] > 0 for aid in all_answers])
76+
8677
classifying_answer = "good"
78+
#classifying_answer = "poor"
79+
80+
if classifying_answer == "good":
81+
# Score > 0 tests => positive class is good answer
82+
qa_Y = np.asarray([meta[aid]['Score'] > 0 for aid in all_answers])
83+
elif classifying_answer == "poor":
84+
# Score <= 0 tests => positive class is poor answer
85+
qa_Y = np.asarray([meta[aid]['Score'] <= 0 for aid in all_answers])
86+
else:
87+
raise Exception("classifying_answer='%s' is not supported" %
88+
classifying_answer)
8789

8890
for idx, feat in enumerate(feature_names):
8991
plot_feat_hist([(qa_X[:, idx], feat)])
90-
"""
91-
plot_feat_hist([(qa_X[:, idx], feature_names[idx]) for idx in [1,0]], 'feat_hist_two.png')
92-
plot_feat_hist([(qa_X[:, idx], feature_names[idx]) for idx in [3,4,5,6]], 'feat_hist_four.png')
93-
"""
92+
93+
#plot_feat_hist([(qa_X[:, idx], feature_names[idx]) for idx in [1,0]], 'feat_hist_two.png')
94+
#plot_feat_hist([(qa_X[:, idx], feature_names[idx]) for idx in [3,4,5,6]], 'feat_hist_four.png')
95+
9496
avg_scores_summary = []
9597

9698

@@ -115,10 +117,16 @@ def measure(clf_class, parameters, name, data_size=None, plot=False):
115117
pr_scores = []
116118
precisions, recalls, thresholds = [], [], []
117119

118-
for train, test in cv:
120+
for fold_idx, (train, test) in enumerate(cv):
119121
X_train, y_train = X[train], Y[train]
120122
X_test, y_test = X[test], Y[test]
121123

124+
only_one_class_in_train = len(set(y_train)) == 1
125+
only_one_class_in_test = len(set(y_test)) == 1
126+
if only_one_class_in_train or only_one_class_in_test:
127+
# this would pose problems later on
128+
continue
129+
122130
clf = clf_class(**parameters)
123131

124132
clf.fit(X_train, y_train)
@@ -145,12 +153,20 @@ def measure(clf_class, parameters, name, data_size=None, plot=False):
145153
precisions.append(precision)
146154
recalls.append(recall)
147155
thresholds.append(pr_thresholds)
156+
157+
# This threshold is determined at the end of the chapter 5,
158+
# where we find conditions such that precision is in the area of
159+
# about 80%. With it we trade off recall for precision.
160+
threshold_for_detecting_good_answers = 0.59
161+
162+
print("Clone #%i" % fold_idx)
148163
print(classification_report(y_test, proba[:, label_idx] >
149-
0.63, target_names=['not accepted', 'accepted']))
164+
threshold_for_detecting_good_answers, target_names=['not accepted', 'accepted']))
150165

151166
# get medium clone
152167
scores_to_sort = pr_scores # roc_scores
153168
medium = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]
169+
print("Medium clone is #%i" % medium)
154170

155171
if plot:
156172
#plot_roc(roc_scores[medium], name, fprs[medium], tprs[medium])
@@ -178,6 +194,7 @@ def measure(clf_class, parameters, name, data_size=None, plot=False):
178194

179195

180196
def bias_variance_analysis(clf_class, parameters, name):
197+
#import ipdb;ipdb.set_trace()
181198
data_sizes = np.arange(60, 2000, 4)
182199

183200
train_errors = []
@@ -208,13 +225,16 @@ def k_complexity_analysis(clf_class, parameters):
208225

209226
plot_k_complexity(ks, train_errors, test_errors)
210227

211-
for k in [5]: # [5, 10, 40, 90]:
228+
for k in [5]:
229+
# for k in [5, 10, 40]:
230+
#measure(neighbors.KNeighborsClassifier, {'n_neighbors': k}, "%iNN" % k)
212231
bias_variance_analysis(neighbors.KNeighborsClassifier, {
213232
'n_neighbors': k}, "%iNN" % k)
214233
k_complexity_analysis(neighbors.KNeighborsClassifier, {'n_neighbors': k})
215234

216235
from sklearn.linear_model import LogisticRegression
217-
for C in [0.1]: # [0.01, 0.1, 1.0, 10.0]:
236+
for C in [0.1]:
237+
# for C in [0.01, 0.1, 1.0, 10.0]:
218238
name = "LogReg C=%.2f" % C
219239
bias_variance_analysis(LogisticRegression, {'penalty': 'l2', 'C': C}, name)
220240
measure(LogisticRegression, {'penalty': 'l2', 'C': C}, name, plot=True)

ch05/data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
import os
99

10-
DATA_DIR = "data" # put your posts-2011-12.xml into this directory
10+
DATA_DIR = "data" # put your posts-2012.xml into this directory
1111
CHART_DIR = "charts"
1212

1313
filtered = os.path.join(DATA_DIR, "filtered.tsv")

ch05/so_xml_to_tsv.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
# to a question that has been asked in 2011 or 2012.
1111
#
1212

13+
import sys
1314
import os
1415
import re
1516
try:
@@ -24,9 +25,13 @@
2425

2526
from data import DATA_DIR
2627

27-
filename = os.path.join(DATA_DIR, "posts-2011-12.xml")
28+
#filename = os.path.join(DATA_DIR, "posts-2011-12.xml")
29+
filename = os.path.join(DATA_DIR, "posts-2012.xml")
30+
print("Reading from xml %s" % filename)
2831
filename_filtered = os.path.join(DATA_DIR, "filtered.tsv")
32+
print("Filtered: %s" % filename_filtered)
2933
filename_filtered_meta = os.path.join(DATA_DIR, "filtered-meta.json")
34+
print("Meta: %s" % filename_filtered_meta)
3035

3136
q_creation = {} # creation datetimes of questions
3237
q_accepted = {} # id of accepted answer
@@ -77,22 +82,26 @@ def filter_html(s):
7782
num_questions = 0
7883
num_answers = 0
7984

80-
from itertools import imap
85+
if sys.version_info.major < 3:
86+
# Python 2, map() returns a list, which will lead to out of memory errors.
87+
# The following import ensures that the script behaves like being executed
88+
# with Python 3.
89+
from itertools import imap as map
8190

8291

8392
def parsexml(filename):
8493
global num_questions, num_answers
8594

8695
counter = 0
8796

88-
it = imap(itemgetter(1),
89-
iter(etree.iterparse(filename, events=('start',))))
97+
it = map(itemgetter(1),
98+
iter(etree.iterparse(filename, events=('start',))))
9099

91100
root = next(it) # get posts element
92101

93102
for elem in it:
94103
if counter % 100000 == 0:
95-
print(counter)
104+
print("Processed %i <row/> elements" % counter)
96105

97106
counter += 1
98107

ch05/utils.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -171,8 +171,8 @@ def plot_feat_hist(data_name_list, filename=None):
171171
assert filename is not None
172172

173173
pylab.figure(num=None, figsize=(8, 6))
174-
num_rows = 1 + (len(data_name_list) - 1) / 2
175-
num_cols = 1 if len(data_name_list) == 1 else 2
174+
num_rows = int(1 + (len(data_name_list) - 1) / 2)
175+
num_cols = int(1 if len(data_name_list) == 1 else 2)
176176
pylab.figure(figsize=(5 * num_cols, 4 * num_rows))
177177

178178
for i in range(num_rows):
@@ -191,7 +191,7 @@ def plot_feat_hist(data_name_list, filename=None):
191191
else:
192192
bins = max_val
193193
n, bins, patches = pylab.hist(
194-
x, bins=bins, normed=1, facecolor='blue', alpha=0.75)
194+
x, bins=bins, normed=1, alpha=0.75)
195195

196196
pylab.grid(True)
197197

@@ -209,7 +209,7 @@ def plot_bias_variance(data_sizes, train_errors, test_errors, name, title):
209209
pylab.title("Bias-Variance for '%s'" % name)
210210
pylab.plot(
211211
data_sizes, test_errors, "--", data_sizes, train_errors, "b-", lw=1)
212-
pylab.legend(["train error", "test error"], loc="upper right")
212+
pylab.legend(["test error", "train error"], loc="upper right")
213213
pylab.grid(True, linestyle='-', color='0.75')
214214
pylab.savefig(
215215
os.path.join(CHART_DIR, "bv_" + name.replace(" ", "_") + ".png"), bbox_inches="tight")
@@ -220,10 +220,10 @@ def plot_k_complexity(ks, train_errors, test_errors):
220220
pylab.ylim([0.0, 1.0])
221221
pylab.xlabel('k')
222222
pylab.ylabel('Error')
223-
pylab.title('Errors for for different values of k')
223+
pylab.title('Errors for for different values of $k$')
224224
pylab.plot(
225225
ks, test_errors, "--", ks, train_errors, "-", lw=1)
226-
pylab.legend(["train error", "test error"], loc="upper right")
226+
pylab.legend(["test error", "train error"], loc="upper right")
227227
pylab.grid(True, linestyle='-', color='0.75')
228228
pylab.savefig(
229229
os.path.join(CHART_DIR, "kcomplexity.png"), bbox_inches="tight")

0 commit comments

Comments (0)