Repository: incubator-madlib
Updated Branches:
  refs/heads/master 6025c4b0d -> ceefae4f4


Elastic Net: Fix normalization issue

MADLIB-1094 and MADLIB-1146

avg in psql is numerically unstable.
Data scaling was not occurring when
grouping is true.

Closes #164


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/ceefae4f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/ceefae4f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/ceefae4f

Branch: refs/heads/master
Commit: ceefae4f4180b88a1aa5712d0e43f0b00573c378
Parents: 6025c4b
Author: Cooper Sloan <cooper.sl...@gmail.com>
Authored: Thu Aug 10 12:04:04 2017 -0700
Committer: Orhan Kislal <okis...@pivotal.io>
Committed: Fri Aug 11 11:31:12 2017 -0700

----------------------------------------------------------------------
 .../elastic_net_generate_result.py_in           |  6 +--
 .../elastic_net/elastic_net_optimizer_igd.py_in |  4 +-
 .../modules/elastic_net/elastic_net_utils.py_in | 42 ++++++++++++++------
 3 files changed, 35 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ceefae4f/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in
----------------------------------------------------------------------
diff --git 
a/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in 
b/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in
index df5489f..7a87ef6 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in
@@ -38,13 +38,13 @@ def _elastic_net_generate_result(optimizer, iteration_run, 
**args):
         select_mean_and_std = ''
         inner_join_x = ''
         inner_join_y = ''
+        grouping_cols_list = split_quoted_delimited_str(grouping_column)
+        select_grp = ','.join(['n_tuples_including_nulls_subq.'+str(grp)
+                        for grp in grouping_cols_list]) + ','
         if data_scaled:
-            grouping_cols_list = split_quoted_delimited_str(grouping_column)
             select_grouping_info = ','.join([
                 grp_col.strip()+"\t"+cols_types[grp_col.strip()]
                 for grp_col in grouping_column.split(',')]) + ","
-            select_grp = ','.join(['n_tuples_including_nulls_subq.'+str(grp)
-                            for grp in grouping_cols_list]) + ','
             x_grp_cols = ' AND '.join([
                     'n_tuples_including_nulls_subq.{0}={1}.{2}'.format(grp,
                     args["x_mean_table"], grp) for grp in grouping_cols_list])

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ceefae4f/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in
----------------------------------------------------------------------
diff --git 
a/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in 
b/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in
index d73a754..c5d21c2 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in
@@ -4,7 +4,7 @@ from utilities.utilities import unique_string
 from utilities.in_mem_group_control import GroupIterationController
 from elastic_net_utils import _compute_means
 from elastic_net_utils import _normalize_data
-from elastic_net_utils import _compute_data_scales
+from elastic_net_utils import _compute_scales
 from elastic_net_utils import _tbl_dimension_rownum
 from elastic_net_utils import _elastic_net_validate_args
 from utilities.utilities import _array_to_string
@@ -216,7 +216,7 @@ def _elastic_net_igd_train_compute(schema_madlib, 
func_step_aggregate,
             args["col_ind_var_new"] = args["col_ind_var_norm_new"]
             args["col_dep_var_new"] = args["col_dep_var_norm_new"]
         else:
-            _compute_data_scales(args)
+            _compute_scales(args)
             tbl_used = tbl_source
             args["col_ind_var_new"] = col_ind_var
             args["col_dep_var_new"] = col_dep_var

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ceefae4f/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in 
b/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in
index b2f2505..154ac31 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in
@@ -129,18 +129,27 @@ def _compute_log_likelihood(coef, intercept, **args):
     Compute the log-likelihood at the end of calculation
     """
     if args["family"] == "gaussian":  # linear models
+        loss_query = """
+        select
+            {method}(({col_dep_var_new} - 
{schema_madlib}.elastic_net_gaussian_predict(
+               '{coefficients}'::double precision[],
+               {intercept}::double precision,
+               {col_ind_var_new}))^2)/({denominator})
+            as loss
+        from
+            {tbl_used}
+        """
+        # See jira 1094, avg experiences numerical instability
+        denominator = "2."
+        method = "avg"
+        if not args["normalization"]:
+            method = "sum"
+            denominator = "count(*) * 2."
         loss = plpy.execute(
-            """
-            select
-                avg(({col_dep_var_new} - 
{schema_madlib}.elastic_net_gaussian_predict(
-                   '{coefficients}'::double precision[],
-                   {intercept}::double precision,
-                   {col_ind_var_new}))^2) / 2.
-                as loss
-            from
-                {tbl_used}
-            """.format(coefficients=_array_to_string(coef),
+            loss_query.format(coefficients=_array_to_string(coef),
                        intercept=intercept,
+                       method=method,
+                       denominator=denominator,
                        **args))[0]["loss"]
     elif args["family"] == "binomial":  # logistic models
         loss = plpy.execute(
@@ -192,8 +201,18 @@ def _elastic_net_validate_args(tbl_source, col_ind_var, 
col_dep_var,
 
     return None
 # ------------------------------------------------------------------------
+def _compute_scales(args):
+    if args["grouping_col"]:
+        _compute_data_scales_grouping(args)
+    else:
+        _compute_data_scales(args)
 
 def _compute_data_scales_grouping(args):
+    # When grouping_col is defined, we must find an array containing
+    # the mean of every dimension in the independent variable (x), the
+    # mean of dependent variable (y) and the standard deviation for them
+    # specific to groups. Store these results in temp tables x_mean_table
+    # and y_mean_table.
     __utils_ind_var_scales_grouping(args["tbl_source"], args["col_ind_var"],
         args["dimension"], args["schema_madlib"], args["grouping_col"],
         args["x_mean_table"])
@@ -227,13 +246,13 @@ def _normalize_data(args):
     The output is stored in tbl_data_scaled
     """
     y_decenter = True if args["family"] == "gaussian" else False
+    _compute_scales(args)
     if args["grouping_col"]:
         # When grouping_col is defined, we must find an array containing
         # the mean of every dimension in the independent variable (x), the
         # mean of dependent variable (y) and the standard deviation for them
         # specific to groups. Store these results in temp tables x_mean_table
         # and y_mean_table.
-        _compute_data_scales_grouping(args)
         # __utils_normalize_data_grouping reads the various means and stds
         # from the tables.
         __utils_normalize_data_grouping(y_decenter=y_decenter,
@@ -251,7 +270,6 @@ def _normalize_data(args):
         # When no grouping_col is defined, the mean and std for both 'x' and
         # 'y' can be defined using strings, stored in x_mean_str, x_std_str
         # etc. We don't need a table like how we needed for grouping.
-        _compute_data_scales(args)
         __utils_normalize_data(y_decenter=y_decenter,
                                tbl_data=args["tbl_source"],
                                col_ind_var=args["col_ind_var"],

Reply via email to