Skip to content

Commit 702bd01

Browse files
committed
RFCT Use predict on 2d arrays instead of loop
Simplify the code in general.
1 parent 4e34782 commit 702bd01

File tree

2 files changed

+17
-13
lines changed

2 files changed

+17
-13
lines changed

ch07/boston_cv10_penalized.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,12 @@
88
# This script fits several forms of penalized regression
99

1010
from __future__ import print_function
11+
import numpy as np
1112
from sklearn.cross_validation import KFold
1213
from sklearn.linear_model import ElasticNet, Lasso, Ridge
13-
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
14-
import numpy as np
1514
from sklearn.datasets import load_boston
1615
boston = load_boston()
17-
x = np.array([np.concatenate((v, [1])) for v in boston.data])
16+
x = boston.data
1817
y = boston.target
1918

2019
for name, met in [
@@ -26,7 +25,7 @@
2625
met.fit(x, y)
2726

2827
# Predict on the whole data:
29-
p = np.array([met.predict(xi) for xi in x])
28+
p = met.predict(x)
3029

3130
e = p - y
3231
# np.dot(e, e) == sum(ei**2 for ei in e) but faster
@@ -38,7 +37,7 @@
3837
err = 0
3938
for train, test in kf:
4039
met.fit(x[train], y[train])
41-
p = np.array([met.predict(xi) for xi in x[test]])
40+
p = met.predict(x[test])
4241
e = p - y[test]
4342
err += np.dot(e, e)
4443

ch07/cv10_lr.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,32 +5,37 @@
55
#
66
# It is made available under the MIT License
77

8+
import numpy as np
89
from sklearn.cross_validation import KFold
910
from sklearn.linear_model import LinearRegression, ElasticNet
10-
import numpy as np
1111
from sklearn.datasets import load_boston
1212
boston = load_boston()
13-
x = np.array([np.concatenate((v, [1])) for v in boston.data])
13+
x = boston.data
1414
y = boston.target
15+
16+
17+
# Switch this variable to use an Elastic Net instead of OLS
1518
FIT_EN = False
1619

1720
if FIT_EN:
1821
model = ElasticNet(fit_intercept=True, alpha=0.5)
1922
else:
2023
model = LinearRegression(fit_intercept=True)
24+
2125
model.fit(x, y)
22-
p = np.array([model.predict(xi) for xi in x])
23-
e = p - y
24-
total_error = np.dot(e, e)
25-
rmse_train = np.sqrt(total_error / len(p))
26+
rmse_train = np.sqrt(model.residues_/len(x))
27+
28+
# Alternatively, we could have computed rmse_train using this expression:
29+
# rmse_train = np.sqrt(np.mean( (model.predict(x) - y) ** 2))
30+
# The results are equivalent
2631

2732
kf = KFold(len(x), n_folds=10)
2833
err = 0
2934
for train, test in kf:
3035
model.fit(x[train], y[train])
31-
p = np.array([model.predict(xi) for xi in x[test]])
36+
p = model.predict(x[test])
3237
e = p - y[test]
33-
err += np.dot(e, e)
38+
err += np.dot(e, e) # This is the same as np.sum(e * e)
3439

3540
rmse_10cv = np.sqrt(err / len(x))
3641
print('RMSE on training: {}'.format(rmse_train))

0 commit comments

Comments (0)