Repository: madlib Updated Branches: refs/heads/master 4aa073294 -> edc93f529
Regularized Regression: Change cross validation stats JIRA:MADLIB-1169 Cross Validation seems to be supported by Elastic Net, SVM, and Decision Trees. If a module is run with cross validation optimization params, the output table corresponding to it displays `mean` and `std` of the negative loss error for each permutation of the CV params. - This commit changes column names: `mean`->`mean_neg_loss` and `std`->`std_neg_loss`. - CV now uses negative Root Mean Squared Error, instead of the negative Mean Squared Error. - Update Elastic Net user docs to reflect these changes. Additional Author: Nandish Jayaram <[email protected]> Closes #210 Project: http://git-wip-us.apache.org/repos/asf/madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/edc93f52 Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/edc93f52 Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/edc93f52 Branch: refs/heads/master Commit: edc93f5295256a18943aa3c0f88e9435081ff50f Parents: 4aa0732 Author: Swati Soni <[email protected]> Authored: Wed Dec 6 11:58:46 2017 -0800 Committer: Nandish Jayaram <[email protected]> Committed: Fri Dec 8 10:15:58 2017 -0800 ---------------------------------------------------------------------- .../modules/elastic_net/elastic_net.sql_in | 46 ++++++++++---------- .../validation/internal/cross_validation.py_in | 16 +++---- 2 files changed, 31 insertions(+), 31 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/madlib/blob/edc93f52/src/ports/postgres/modules/elastic_net/elastic_net.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/elastic_net/elastic_net.sql_in b/src/ports/postgres/modules/elastic_net/elastic_net.sql_in index f3a8980..f367774 100644 --- a/src/ports/postgres/modules/elastic_net/elastic_net.sql_in +++ b/src/ports/postgres/modules/elastic_net/elastic_net.sql_in @@ -231,12 +231,12 @@ cross 
validation is used. Also, cross validation is not supported if grouping i Hyperparameter optimization can be carried out using the built-in cross validation mechanism, which is activated by assigning a value greater than 1 to -the parameter \e n_folds. Misclassification error is used -for classification and mean squared error is used for regression. +the parameter \e n_folds. Negative misclassification error is used +for classification and negative root mean squared error is used for regression. The values of a parameter to cross validate should be provided in a list. For example, to regularize with the L1 norm and use a lambda value -from the set {0.3, 0.4, 0.5}, include 'lambda_value={0.3, 0.4, 0.5}'. +from the set {0.3, 0.4, 0.5}, include 'lambda_value={0.3, 0.4, 0.5}'. Note that the use of '{}' and '[]' are both valid here. <DL class="arglist"> @@ -733,9 +733,9 @@ The two queries above will result in same residuals: <h4>Example with Cross Validation</h4> -# Reuse the houses table above. -Here we use 3-fold cross validation with 3 automatically generated -lambda values and 3 specified alpha values. (This can take some time to -run since elastic net is effectively being called 27 times for +Here we use 3-fold cross validation with 3 automatically generated +lambda values and 3 specified alpha values. (This can take some time to +run since elastic net is effectively being called 27 times for these combinations, then a 28th time for the whole dataset.) 
<pre class="example"> DROP TABLE IF EXISTS houses_en3, houses_en3_summary, houses_en3_cv; @@ -751,9 +751,9 @@ SELECT madlib.elastic_net_train( 'houses', -- Source table 'fista', -- Optimizer $$ n_folds = 3, -- Cross validation parameters validation_result=houses_en3_cv, - n_lambdas = 3, + n_lambdas = 3, alpha = {0, 0.1, 1} - $$, + $$, NULL, -- Excluded columns 10000, -- Maximum iterations 1e-6 -- Tolerance value @@ -765,12 +765,12 @@ SELECT * FROM houses_en3; family | gaussian features | {tax,bath,size} features_selected | {tax,bath,size} -coef_nonzero | {22.4584783679,11657.0825871,52.1622899664} -coef_all | {22.4584783679,11657.0825871,52.1622899664} -intercept | -5067.27288499 +coef_nonzero | {22.4584188479,11657.0739045,52.1624090811} +coef_all | {22.4584188479,11657.0739045,52.1624090811} +intercept | -5067.33396522 log_likelihood | -543193170.15 standardize | t -iteration_run | 392 +iteration_run | 10000 </pre> -# Details of the cross validation: @@ -778,17 +778,17 @@ iteration_run | 392 SELECT * FROM houses_en3_cv ORDER BY lambda_value DESC, alpha ASC; </pre> <pre class="result"> -alpha | lambda_value | mean | std -------+--------------+---------------------+-------------------- - 0 | 100000 | -1.41777698585e+110 | 1.80536123195e+110 - 0.1 | 100000 | -1.19953054719e+107 | 1.72846143163e+107 - 1 | 100000 | -4175743937.91 | 2485189261.38 - 0 | 100 | -4054694238.18 | 2424765457.66 - 0.1 | 100 | -4041768667.28 | 2418294966.72 - 1 | 100 | -1458791218.11 | 483327430.802 - 0 | 0.1 | -1442293698.38 | 426795110.876 - 0.1 | 0.1 | -1442705511.6 | 429680202.16 -| 1 | 0.1 | -1459206061.39 | 485107796.02 + alpha | lambda_value | mean_neg_loss | std_neg_loss +-------+--------------+--------------------+------------------- + 0.0 | 100000.0 | -1.61736526117e+55 | 1.26711815498e+55 + 0.0 | 100.0 | -63555.0502789 | 3973.78527042 + 0.0 | 0.1 | -37136.5397256 | 9022.78236248 + 0.1 | 100000.0 | -3.26047972034e+53 | 9.10745448826e+53 + 0.1 | 100.0 | -63445.8310011 | 3965.83900962 + 
0.1 | 0.1 | -37192.0390897 | 9058.79757772 + 1.0 | 100000.0 | -64569.8882099 | 4051.1856361 + 1.0 | 100.0 | -38121.9154268 | 9332.65800111 + 1.0 | 0.1 | -38117.5477067 | 9384.36765881 (9 rows) </pre> http://git-wip-us.apache.org/repos/asf/madlib/blob/edc93f52/src/ports/postgres/modules/validation/internal/cross_validation.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/validation/internal/cross_validation.py_in b/src/ports/postgres/modules/validation/internal/cross_validation.py_in index 11cde2f..84e52e9 100644 --- a/src/ports/postgres/modules/validation/internal/cross_validation.py_in +++ b/src/ports/postgres/modules/validation/internal/cross_validation.py_in @@ -67,8 +67,8 @@ class ValidationResult(object): List of dictionaries. Each dictionary contains the following three keys: - - mean: float, average of scores using sub_args - - std: float, standard deviation of scores using sub_args + - mean_neg_loss: float, average of scores using sub_args + - std_neg_loss: float, standard deviation of scores using sub_args - sub_args: dict, the values of arguments being validated """ def __init__(self, cv_history=None): @@ -98,12 +98,12 @@ class ValidationResult(object): def add_one(self, mean, std, sub_args): """Add one record to the history""" - record = dict(mean=mean, std=std, sub_args=sub_args) + record = dict(mean_neg_loss=mean, std_neg_loss=std, sub_args=sub_args) self._cv_history.append(record) def sorted(self): """Sort the history w.r.t. 
mean value and return a new ValidationResult object""" - ch = sorted(self._cv_history, reverse=True, key=itemgetter('mean')) + ch = sorted(self._cv_history, reverse=True, key=itemgetter('mean_neg_loss')) return ValidationResult(ch) def first(self, attr=None): @@ -112,7 +112,7 @@ class ValidationResult(object): Parameters ========== attr : string, optional - Any string in {'mean', 'std', 'sub_args'} or None + Any string in {'mean_neg_loss', 'std_neg_loss', 'sub_args'} or None Returns ======= @@ -133,13 +133,13 @@ class ValidationResult(object): def output_tbl(self, tbl_name): """Create a table tbl_name that contains the history - The columns of tbl_name are mean, std and the leaf keys in sub_args. + The columns of tbl_name are mean_neg_loss, std_neg_loss and the leaf keys in sub_args. All column types are assumed to be double precision. """ if not tbl_name or not str(tbl_name).strip(): return - header = self._cv_history[0]['sub_args'].keys() + ['mean', 'std'] + header = self._cv_history[0]['sub_args'].keys() + ['mean_neg_loss', 'std_neg_loss'] header_str = ','.join(map(str, header)) data = [] @@ -352,7 +352,7 @@ class CrossValidator(object): return plpy.execute( """ SELECT - -avg(({target}-prediction)^2) AS accuracy + -sqrt(avg(({target}-prediction)^2)) AS accuracy FROM {pred} JOIN {orig} ON {pred}.{id} = {orig}.{id} """.format(pred=pred,
