Repository: madlib
Updated Branches:
  refs/heads/master 441f16bd5 -> a3b59356f


Elastic Net: Allow grouping by non-numeric column

JIRA: MADLIB-1262

- Grouping column values should be quoted (as literals) if the column is
  of type TEXT.
- Grouping column names that require double quoting need special
  handling (see the quoting sketch below).
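
For context, the change hinges on the distinction between quoting a grouping
column name (identifier quoting, quote_ident) and quoting a grouping value
(literal quoting, quote_literal). The snippet below is a minimal, hypothetical
sketch of that distinction in plain Python; the actual helpers are the ones
imported in the diff from utilities.validate_args and internal.db_utils.

# Hypothetical stand-ins for illustration only.
def quote_ident(name):
    # Double-quote an identifier, doubling any embedded double quotes,
    # so a column named  zip'code  is emitted as  "zip'code".
    return '"' + name.replace('"', '""') + '"'

def quote_literal(value):
    # Single-quote a value, doubling any embedded single quotes,
    # so a TEXT grouping value  tes't  is emitted as  'tes''t'.
    return "'" + str(value).replace("'", "''") + "'"

print(quote_ident("zip'code"))   # "zip'code"  -- usable as a column name
print(quote_literal("tes't"))    # 'tes''t'    -- usable as a SQL literal

In addition, the summary table wraps the grouping column text in dollar
quoting ($madlib_super_quote$ ... $madlib_super_quote$), as shown in the diff,
so that names containing single quotes do not break the INSERT.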

Closes #309

Co-authored-by: Domino Valdano <dvald...@pivotal.io>
Co-authored-by: Rahul Iyer <ri...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/ec328dba
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/ec328dba
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/ec328dba

Branch: refs/heads/master
Commit: ec328dba6853d31df5b1bd6bbdcd35933596fe78
Parents: 441f16b
Author: Arvind Sridhar <arvindsrid...@berkeley.edu>
Authored: Thu Aug 16 20:02:48 2018 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Thu Aug 16 20:03:22 2018 -0700

----------------------------------------------------------------------
 .../elastic_net_generate_result.py_in           |  63 ++++++----
 .../modules/elastic_net/test/elastic_net.sql_in | 122 +++++++++++++++++++
 2 files changed, 161 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/ec328dba/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in
index 1dbd664..15881b4 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in
@@ -2,7 +2,10 @@ import plpy
 from elastic_net_utils import _process_results
 from elastic_net_utils import _compute_log_likelihood
 from utilities.validate_args import get_cols_and_types
+from utilities.validate_args import quote_ident
 from utilities.utilities import split_quoted_delimited_str
+from internal.db_utils import quote_literal
+
 
 def _elastic_net_generate_result(optimizer, iteration_run, **args):
     """
@@ -33,26 +36,30 @@ def _elastic_net_generate_result(optimizer, iteration_run, **args):
         col_grp_key = args['col_grp_key']
         grouping_str = args['grouping_str']
         cols_types = dict(get_cols_and_types(args["tbl_source"]))
-        grouping_str1 = grouping_column + ","
+        grouping_cols_list = split_quoted_delimited_str(grouping_column)
+        grouping_str1 = ','.join(['{0} AS {1}'.format(g, quote_ident(g))
+                                 for g in grouping_cols_list])
 
         select_mean_and_std = ''
         inner_join_x = ''
         inner_join_y = ''
-        grouping_cols_list = split_quoted_delimited_str(grouping_column)
-        select_grp = ','.join(['n_tuples_including_nulls_subq.'+str(grp)
-                        for grp in grouping_cols_list]) + ','
-        select_grouping_info = ','.join([grp_col+"\t"+cols_types[grp_col]
+        select_grp = ','.join(['n_tuples_including_nulls_subq.' + str(quote_ident(grp))
+                              for grp in grouping_cols_list]) + ','
+        select_grouping_info = ','.join([grp_col + "\t" + cols_types[grp_col]
                                         for grp_col in grouping_cols_list]) + ","
         if data_scaled:
             x_grp_cols = ' AND '.join([
-                    'n_tuples_including_nulls_subq.{0}={1}.{2}'.format(grp,
-                    args["x_mean_table"], grp) for grp in grouping_cols_list])
+                '{0} = {1}.{2}'.format('n_tuples_including_nulls_subq.' + str(quote_ident(grp)),
+                                       args["x_mean_table"], grp)
+                for grp in grouping_cols_list])
             y_grp_cols = ' AND '.join([
-                    'n_tuples_including_nulls_subq.{0}={1}.{2}'.format(grp,
-                    args["y_mean_table"], grp) for grp in grouping_cols_list])
-            select_mean_and_std = ' {0}.mean AS x_mean, '.format(args["x_mean_table"]) +\
-                ' {0}.mean AS y_mean, '.format(args["y_mean_table"]) +\
-                ' {0}.std AS x_std, '.format(args["x_mean_table"])
+                '{0}={1}.{2}'.format('n_tuples_including_nulls_subq.' + str(quote_ident(grp)),
+                                     args["y_mean_table"], grp)
+                for grp in grouping_cols_list])
+            select_mean_and_std = (
+                ' {0}.mean AS x_mean, '.format(args["x_mean_table"]) +
+                ' {0}.mean AS y_mean, '.format(args["y_mean_table"]) +
+                ' {0}.std AS x_std, '.format(args["x_mean_table"]))
             inner_join_x = ' INNER JOIN {0} ON {1} '.format(
                 args["x_mean_table"], x_grp_cols)
             inner_join_y = ' INNER JOIN {0} ON {1} '.format(
@@ -66,7 +73,7 @@ def _elastic_net_generate_result(optimizer, iteration_run, **args):
             FROM
                 (
                     SELECT
-                        {grouping_str1}
+                        {grouping_str1},
                         array_to_string(ARRAY[{grouping_str}], ',') AS {col_grp_key}
                     FROM {source_table}
                     GROUP BY {grouping_column}, {col_grp_key}
@@ -83,7 +90,7 @@ def _elastic_net_generate_result(optimizer, iteration_run, **args):
                        col_grp_iteration=args["col_grp_iteration"], **locals())
     else:
         # It's a much simpler query when there is no grouping.
-        grouping_str1 = ""
+        grouping_cols_list = []
         select_grouping_info = ""
         out_table_qstr = """
             SELECT
@@ -117,13 +124,15 @@ def _elastic_net_generate_result(optimizer, iteration_run, **args):
 
     result = plpy.execute(out_table_qstr)
     for res in result:
-        build_output_table(res, grouping_column, grouping_str1,
+        build_output_table(res, grouping_column, grouping_cols_list,
                            standardize_flag, iteration_run, **args)
 
     # Create summary table, listing the grouping columns used.
     grouping_text = "NULL" if not grouping_column else grouping_column
     failed_groups = plpy.execute("""
-        SELECT count(*) AS num_failed_groups FROM {0} WHERE coef_all IS NULL
+        SELECT count(*) AS num_failed_groups
+        FROM {0}
+        WHERE coef_all IS NULL
         """.format(args['tbl_result']))[0]
     all_groups = plpy.execute("SELECT count(*) AS num_all_groups FROM {0} ".
                               format(args['tbl_result']))[0]
@@ -142,7 +151,8 @@ def _elastic_net_generate_result(optimizer, iteration_run, **args):
             '{family}'::varchar                 AS family,
             {alpha}::float                      AS alpha,
             {lambda_value}::float               AS lambda_value,
-            '{grouping_text}'::varchar          AS grouping_col,
+            $madlib_super_quote${grouping_text}$madlib_super_quote$::varchar
+                                                AS grouping_col,
             {num_all_groups}::integer           AS num_all_groups,
             {num_failed_groups}::integer        AS num_failed_groups
         """.format(grouping_text=grouping_text,
@@ -150,7 +160,7 @@ def _elastic_net_generate_result(optimizer, iteration_run, **args):
     return None
 
 
-def build_output_table(res, grouping_column, grouping_str1,
+def build_output_table(res, grouping_column, grouping_col_list,
                        standardize_flag, iteration_run, **args):
     """
     Insert model captured in "res" into the output table
@@ -160,10 +170,11 @@ def build_output_table(res, grouping_column, grouping_str1,
         if args["normalization"]:
             if grouping_column:
                 (coef, intercept) = _restore_scale(r_coef, res["intercept"],
-                    args, res["x_mean"], res["x_std"], res["y_mean"])
+                                                   args, res["x_mean"],
+                                                   res["x_std"], res["y_mean"])
             else:
                 (coef, intercept) = _restore_scale(r_coef,
-                    res["intercept"], args)
+                                                   res["intercept"], args)
         else:
             coef = r_coef
             intercept = res["intercept"]
@@ -173,11 +184,13 @@ def build_output_table(res, grouping_column, grouping_str1,
 
         log_likelihood = _compute_log_likelihood(r_coef, res["intercept"], **args)
         if grouping_column:
-            grouping_info = ','.join([str(res[grp_col.strip()])
-                                      for grp_col in grouping_str1.split(',')
-                                      if grp_col.strip() in res.keys()]) + ","
+            grouping_info = ",".join([quote_literal(str(res[grp_col.strip()]))
+                                      for grp_col in grouping_col_list
+                                      if grp_col.strip() in res.keys()])
         else:
             grouping_info = ""
+        if grouping_info:
+            grouping_info += ', '
         fquery = """
             INSERT INTO {tbl_result} VALUES
                 ({grouping_info} '{family}', '{features}'::text[], '{features_selected}'::text[],
@@ -191,8 +204,10 @@ def build_output_table(res, grouping_column, grouping_str1,
                        **args)
         plpy.execute(fquery)
 # ------------------------------------------------------------------------
+
+
 def _restore_scale(coef, intercept, args,
-    x_mean=None, x_std=None, y_mean=None):
+                   x_mean=None, x_std=None, y_mean=None):
     """
     Restore the original scales
     """

http://git-wip-us.apache.org/repos/asf/madlib/blob/ec328dba/src/ports/postgres/modules/elastic_net/test/elastic_net.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/elastic_net/test/elastic_net.sql_in b/src/ports/postgres/modules/elastic_net/test/elastic_net.sql_in
index e3765ee..94ef902 100644
--- a/src/ports/postgres/modules/elastic_net/test/elastic_net.sql_in
+++ b/src/ports/postgres/modules/elastic_net/test/elastic_net.sql_in
@@ -839,3 +839,125 @@ SELECT elastic_net_train(
 SELECT * FROM house_en;
 SELECT * FROM house_en_summary;
 SELECT * FROM house_en_cv;
+
+-- Test grouping on non-numeric column
+
+DROP TABLE IF EXISTS grouping_test_non_numeric;
+CREATE TABLE grouping_test_non_numeric ( id INT,
+       tax INT,
+       bedroom INT,
+       bath FLOAT,
+       price INT,
+       size INT,
+       lot INT,
+       zipcode TEXT,
+       testbool boolean);
+INSERT INTO grouping_test_non_numeric (id, tax, bedroom, bath, price, size, lot, zipcode, testbool) VALUES
+       (1  ,  590 ,       2 ,    1 ,  50000 ,  770 , 22100  , 'test', 'true'),
+       (2  , 1050 ,       3 ,    2 ,  85000 , 1410 , 12000  , 'test2', 'true'),
+       (3  ,   20 ,       3 ,    1 ,  22500 , 1060 ,  3500  , 'test', 'false'),
+       (4  ,  870 ,       2 ,    2 ,  90000 , 1300 , 17500  , 'test2', 'false');
+
+DROP TABLE IF EXISTS grouping_test_non_numeric_en1, grouping_test_non_numeric_en1_summary;
+SELECT elastic_net_train( 'grouping_test_non_numeric',                  -- Source table
+    'grouping_test_non_numeric_en1',               -- Result table
+    'price',                   -- Dependent variable
+    'array[tax, bath, size]',  -- Independent variable
+    'gaussian',                -- Regression family
+    0.5,                       -- Alpha value
+    0.1,                       -- Lambda value
+    TRUE,                      -- Standardize
+    'zipcode',                 -- Grouping column(s)
+    'fista',                   -- Optimizer
+    '',                        -- Optimizer parameters
+    NULL,                      -- Excluded columns
+    10000,                     -- Maximum iterations
+    1e-6                       -- Tolerance value
+);
+
+DROP TABLE IF EXISTS grouping_test_non_numeric_en1, grouping_test_non_numeric_en1_summary;
+SELECT elastic_net_train( 'grouping_test_non_numeric',                  -- Source table
+    'grouping_test_non_numeric_en1',               -- Result table
+    'price',                   -- Dependent variable
+    'array[tax, bath, size]',  -- Independent variable
+    'gaussian',                -- Regression family
+    0.5,                       -- Alpha value
+    0.1,                       -- Lambda value
+    TRUE,                      -- Standardize
+    'testbool',                -- Grouping column(s)
+    'fista',                   -- Optimizer
+    '',                        -- Optimizer parameters
+    NULL,                      -- Excluded columns
+    10000,                     -- Maximum iterations
+    1e-6                       -- Tolerance value
+);
+
+-- Special character tests
+
+DROP TABLE IF EXISTS special_character_tests;
+CREATE TABLE special_character_tests ( id INT,
+       tax INT,
+       bedroom INT,
+       bath FLOAT,
+       price INT,
+       size INT,
+       lot INT,
+       zipcode TEXT,
+       testbool boolean);
+INSERT INTO special_character_tests (id, tax, bedroom, bath, price, size, lot, zipcode, testbool) VALUES
+       (1  ,  590 ,       2 ,    1 ,  50000 ,  770 , 22100  , 'tes''t', 'true'),
+       (2  , 1050 ,       3 ,    2 ,  85000 , 1410 , 12000  , 'test$$%*Ж!#"()2', 'true'),
+       (3  ,   20 ,       3 ,    1 ,  22500 , 1060 ,  3500  , 'tes''t', 'false'),
+       (4  ,  870 ,       2 ,    2 ,  90000 , 1300 , 17500  , 'test$$%*Ж!#"()2', 'false');
+
+DROP TABLE IF EXISTS special_character_tests_en1, special_character_tests_en1_summary;
+SELECT elastic_net_train( 'special_character_tests',                  -- Source table
+    'special_character_tests_en1',               -- Result table
+    'price',                   -- Dependent variable
+    'array[tax, bath, size]',  -- Independent variable
+    'gaussian',                -- Regression family
+    0.5,                       -- Alpha value
+    0.1,                       -- Lambda value
+    TRUE,                      -- Standardize
+    'zipcode',                 -- Grouping column(s)
+    'fista',                   -- Optimizer
+    '',                        -- Optimizer parameters
+    NULL,                      -- Excluded columns
+    10000,                     -- Maximum iterations
+    1e-6                       -- Tolerance value
+);
+
+-- Test case is commented out because support for special characters in column names has not yet been added to this module
+
+DROP TABLE IF EXISTS special_character_tests2;
+CREATE TABLE special_character_tests2 ( id INT,
+       tax INT,
+       bedroom INT,
+       bath FLOAT,
+       price INT,
+       size INT,
+       lot INT,
+       "zip'code" TEXT,
+       testbool boolean);
+INSERT INTO special_character_tests2 (id, tax, bedroom, bath, price, size, lot, "zip'code", testbool) VALUES
+       (1  ,  590 ,       2 ,    1 ,  50000 ,  770 , 22100  , 'test', 'true'),
+       (2  , 1050 ,       3 ,    2 ,  85000 , 1410 , 12000  , 'test2', 'true'),
+       (3  ,   20 ,       3 ,    1 ,  22500 , 1060 ,  3500  , 'test', 'false'),
+       (4  ,  870 ,       2 ,    2 ,  90000 , 1300 , 17500  , 'test2', 'false');
+
+DROP TABLE IF EXISTS special_character_tests2_en1, special_character_tests2_en1_summary;
+SELECT elastic_net_train( 'special_character_tests2',                  -- Source table
+    'special_character_tests2_en1',               -- Result table
+    'price',                   -- Dependent variable
+    'array[tax, bath, size]',  -- Independent variable
+    'gaussian',                -- Regression family
+    0.5,                       -- Alpha value
+    0.1,                       -- Lambda value
+    TRUE,                      -- Standardize
+    '"zip''code"',                 -- Grouping column(s)
+    'fista',                   -- Optimizer
+    '',                        -- Optimizer parameters
+    NULL,                      -- Excluded columns
+    10000,                     -- Maximum iterations
+    1e-6                       -- Tolerance value
+);
