[madlib] 01/02: Revert "Linear Regression: Support for JSON and special characters"

okislal Tue, 22 Jan 2019 02:07:06 -0800

This is an automated email from the ASF dual-hosted git repository.

okislal pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


commit 56f29f77a137294ca2cde66a860006d808b556ca
Author: Orhan Kislal <[email protected]>
AuthorDate: Tue Jan 22 13:04:34 2019 +0300

    Revert "Linear Regression: Support for JSON and special characters"
    
    This reverts commit afd408d0f73cb849bbbdc651642d8aa39e51ffcf to fix the
    author info.
---
 src/ports/postgres/modules/regress/linear.py_in    |  78 +++--------
 .../postgres/modules/regress/test/linear.sql_in    | 150 ---------------------
 2 files changed, 17 insertions(+), 211 deletions(-)

diff --git a/src/ports/postgres/modules/regress/linear.py_in 
b/src/ports/postgres/modules/regress/linear.py_in
index 2be78e9..70fe78a 100644
--- a/src/ports/postgres/modules/regress/linear.py_in
+++ b/src/ports/postgres/modules/regress/linear.py_in
@@ -11,53 +11,19 @@ from utilities.validate_args import table_is_empty
 from utilities.utilities import add_postfix
 from utilities.utilities import _assert
 from utilities.control import MinWarning
-from utilities.utilities import get_table_qualified_col_str
-from utilities.utilities import strip_end_quotes
-from utilities.utilities import _string_to_array
-from utilities.validate_args import get_expr_type
-from utilities.utilities import _string_to_sql_array
-from utilities.validate_args import quote_ident
-
-
 # ----------------------------------------------------------------------
 
 
 def linregr_train(schema_madlib, source_table, out_table,
                   dependent_varname, independent_varname, grouping_cols,
                   heteroskedasticity_option, **kwargs):
-    """
-        Args:
-        @param source_table,                -- name of input table
-        @param out_table,                   -- name of output table
-        @param dependent_varname,           -- name of dependent variable
-        @param independent_varname,         -- name of independent variables
-        @param grouping_cols,               -- names of columns to group-by
-        @param heteroskedasticity_option    -- perform heteroskedasticity test?
-
-    """
-
     with MinWarning('warning'):
         _validate_args(schema_madlib, source_table, out_table,
                        dependent_varname, independent_varname,
                        grouping_cols, heteroskedasticity_option)
 
-        group_col_list = ''if grouping_cols is None else 
_string_to_array_with_quotes(
-            grouping_cols)
-
-        group_str = '' if grouping_cols is None else 'GROUP BY %s' % 
get_table_qualified_col_str(source_table,
-                                                                               
                  group_col_list)
-        # For json expressions like data->>'ID', this will quote them to
-        # "data->>'ID'"
-        cols_wo_quotes = ''if grouping_cols is None else ' '.join(
-            strip_end_quotes(col).format(**locals()) for col in group_col_list)
-        group_cols_w_quotes = ''if grouping_cols is None else ' ,'.join(
-            " \"" + strip_end_quotes(col) + "\" ".format(**locals()) for col 
in group_col_list)
-
-        # For json expressions like data->>'ID' it will alias the columns to
-        # "data->>'ID'"
-        group_str_sel = ''if grouping_cols is None else ' , 
'.join("{source_table}.{col} as \"".format(source_table=source_table, col=col) 
+ strip_end_quotes(col) + "\" "
-                                                                   for col in 
group_col_list) + ","
-
+        group_str = '' if grouping_cols is None else 'GROUP BY %s' % 
grouping_cols
+        group_str_sel = '' if grouping_cols is None else grouping_cols + ','
         join_str = ',' if grouping_cols is None else 'JOIN'
         using_str = '' if grouping_cols is None else 'USING (%s)' % 
grouping_cols
 
@@ -84,13 +50,6 @@ def linregr_train(schema_madlib, source_table, out_table,
                        independent_varname=independent_varname,
                        source_table=source_table))
 
-        # USING Clause does not support expressions. So modifying to the 
regular
-        # join clause instead.
-        join_clause = '' if grouping_cols is None else " ON " + ' AND 
'.join("{source_table}.{col} = {temp_lin_rst}.\""
-                                                                             
.format(source_table=source_table, col=col,
-                                                                               
      temp_lin_rst=temp_lin_rst) + strip_end_quotes(col) + "\""
-                                                                             
for col in group_col_list)
-
         # Run heteroskedasticity test
         if heteroskedasticity_option:
             temp_hsk_rst = unique_string()
@@ -105,15 +64,15 @@ def linregr_train(schema_madlib, source_table, out_table,
                         {independent_varname},
                         {temp_lin_rst}.coef) AS hsk_rst
                 FROM
-                    {source_table} {join_str} {temp_lin_rst} {join_clause}
-               {group_str}
+                    {source_table} {join_str} {temp_lin_rst} {using_str}
+                {group_str}
                 """.format(schema_madlib=schema_madlib,
                            temp_hsk_rst=temp_hsk_rst,
                            dependent_varname=dependent_varname,
                            independent_varname=independent_varname,
                            group_str_sel=group_str_sel, group_str=group_str,
-                           join_str=join_str, source_table=source_table,
-                           temp_lin_rst=temp_lin_rst, join_clause=join_clause))
+                           join_str=join_str, using_str=using_str,
+                           source_table=source_table, 
temp_lin_rst=temp_lin_rst))
 
         # Output the results
         join_str = ''
@@ -121,19 +80,17 @@ def linregr_train(schema_madlib, source_table, out_table,
         if heteroskedasticity_option:
             if grouping_cols is not None:
                 join_str = 'JOIN %s AS hsk' % temp_hsk_rst
-                using_str = 'USING (%s)' % group_cols_w_quotes
+                using_str = 'USING (%s)' % (grouping_cols)
             else:
                 join_str = ', %s AS hsk' % temp_hsk_rst
         bp_stats = '(hsk.hsk_rst).bp_stats,' if heteroskedasticity_option else 
''
         bp_p_value = '(hsk.hsk_rst).bp_p_value,' if heteroskedasticity_option 
else ''
 
-        group_cols_w_quotes = '' if grouping_cols is None else 
group_cols_w_quotes + ","
-
         plpy.execute(
             """
             CREATE TABLE {out_table} AS
             SELECT
-               {group_cols_w_quotes}
+                {group_str_sel}
                 coef,
                 r2,
                 std_err,
@@ -152,11 +109,10 @@ def linregr_train(schema_madlib, source_table, out_table,
                 END AS num_missing_rows_skipped,
                 vcov as variance_covariance
             FROM
-                {temp_lin_rst} AS lin {join_str}  {using_str}
-           """.format(out_table=out_table,
+                {temp_lin_rst} AS lin {join_str} {using_str}
+           """.format(out_table=out_table, group_str_sel=group_str_sel,
                       bp_stats=bp_stats, bp_p_value=bp_p_value,
-                      temp_lin_rst=temp_lin_rst, join_str=join_str, 
using_str=using_str,
-                      group_cols_w_quotes=group_cols_w_quotes))
+                      temp_lin_rst=temp_lin_rst, join_str=join_str, 
using_str=using_str))
 
         num_rows = plpy.execute(
             """
@@ -178,8 +134,8 @@ def linregr_train(schema_madlib, source_table, out_table,
                       'linregr'::varchar                  as method
                     , '{source_table}'::varchar           as source_table
                     , '{out_table}'::varchar              as out_table
-                    , $__madlib__${dependent_varname}$__madlib__$::varchar     
 as dependent_varname
-                    , $__madlib__${independent_varname}$__madlib__$::varchar   
 as independent_varname
+                    , '{dependent_varname}'::varchar      as dependent_varname
+                    , '{independent_varname}'::varchar    as 
independent_varname
                     , {num_rows_processed}::integer       as num_rows_processed
                     , {num_rows_skipped}::integer         as 
num_missing_rows_skipped
                     , {grouping_col}::text                as grouping_col
@@ -187,7 +143,7 @@ def linregr_train(schema_madlib, source_table, out_table,
                        out_table_summary=out_table_summary,
                        dependent_varname=dependent_varname,
                        independent_varname=independent_varname,
-                       
grouping_col="$__madlib__$"+grouping_cols+"$__madlib__$" if grouping_cols else 
"NULL",
+                       grouping_col="'" + grouping_cols + "'" if grouping_cols 
else "NULL",
                        **num_rows))
     return None
 # ----------------------------------------------------------------------
@@ -229,9 +185,10 @@ def _validate_args(schema_madlib, source_table, out_table, 
dependent_varname,
     if grouping_cols is not None:
         _assert(grouping_cols != '',
                 "Linregr error: Invalid grouping columns name!")
-        # grouping columns can be a valid expression as well, for eg.
-        # a json expression (data->>'id'), so commenting this part.
         grouping_list = _string_to_array_with_quotes(grouping_cols)
+        _assert(columns_exist_in_table(
+            source_table, grouping_list, schema_madlib),
+            "Linregr error: Grouping column does not exist!")
 
         predefined = set(('coef', 'r2', 'std_err', 't_stats',
                           'p_values', 'condition_no',
@@ -321,7 +278,6 @@ def linregr_help_message(schema_madlib, message, **kwargs):
 
     return help_string.format(schema_madlib=schema_madlib)
 
-
 def linregr_predict_help_message(schema_madlib, message, **kwargs):
     """ Help message for Prediction in Linear Regression
 
diff --git a/src/ports/postgres/modules/regress/test/linear.sql_in 
b/src/ports/postgres/modules/regress/test/linear.sql_in
index 612914a..9013108 100644
--- a/src/ports/postgres/modules/regress/test/linear.sql_in
+++ b/src/ports/postgres/modules/regress/test/linear.sql_in
@@ -213,153 +213,3 @@ select linregr_train('example');
 select linregr_predict();
 select linregr_predict('usage');
 select linregr_predict('example');
-
-
-
-
-
-------------------------------------------------------------------------
-
-
--- Test case for JSON Data Type and Special characters. 
--- This function checks special characters for GPDB version = 4.x and JSON for 
5.x and above
-
-create or replace function linereg_expr_test() RETURNS VOID AS
-$$ 
-DECLARE col_type TEXT;
-
-begin
-select typname into col_type from pg_type where typname = 'json' ;
-if col_type = 'json' THEN
-
-DROP TABLE IF EXISTS houses_json;
-CREATE TABLE houses_json (
-    id SERIAL NOT NULL,
-    data json
-);
-
-
-INSERT INTO houses_json VALUES
-( 2  ,'{ "ta,x":  590, "bedroom":2, "ba$th":1,   "pr''ice":  50000, "size": 
770, "lot":22100 }'),
-( 4  ,'{ "ta,x": 1050, "bedroom":3, "ba$th":2,   "pr''ice":  85000, 
"size":1410, "lot":12000 }'),
-( 1  ,'{ "ta,x":   20, "bedroom":3, "ba$th":1,   "pr''ice":  22500, 
"size":1060, "lot":3500  }'),
-( 6  ,'{ "ta,x":  870, "bedroom":2, "ba$th":2,   "pr''ice":  90000, 
"size":1300, "lot":17500 }'),
-( 3  ,'{ "ta,x": 1320, "bedroom":3, "ba$th":2,   "pr''ice": 133000, 
"size":1500, "lot":30000 }'),
-( 5  ,'{ "ta,x": 1350, "bedroom":2, "ba$th":1,   "pr''ice":  90500, "size": 
820, "lot":25700 }'),
-( 7  ,'{ "ta,x": 2790, "bedroom":3, "ba$th":2.5, "pr''ice": 260000, 
"size":2130, "lot":25000 }'),
-( 10 ,'{ "ta,x":  680, "bedroom":2, "ba$th":1,   "pr''ice": 142500, 
"size":1170, "lot":22000 }'),
-( 9  ,'{ "ta,x": 1840, "bedroom":3, "ba$th":2,   "pr''ice": 160000, 
"size":1500, "lot":19000 }'),
-( 8  ,'{ "ta,x": 3680, "bedroom":4, "ba$th":2,   "pr''ice": 240000, 
"size":2790, "lot":20000 }'),
-( 11 ,'{ "ta,x": 1660, "bedroom":3, "ba$th":1,   "pr''ice":  87000, 
"size":1030, "lot":17500 }'),
-( 12 ,'{ "ta,x": 1620, "bedroom":3, "ba$th":2,   "pr''ice": 118600, 
"size":1250, "lot":20000 }'),
-( 13 ,'{ "ta,x": 3100, "bedroom":3, "ba$th":2,   "pr''ice": 140000, 
"size":1760, "lot":38000 }'),
-( 14 ,'{ "ta,x": 2070, "bedroom":2, "ba$th":3,   "pr''ice": 148000, 
"size":1550, "lot":14000 }'),
-( 15 ,'{ "ta,x":  650, "bedroom":3, "ba$th":1.5, "pr''ice":  65000, 
"size":1450, "lot":12000 }');
-
-drop table if exists result_lin_houses_json;
-drop table if exists result_lin_houses_json_summary;
-PERFORM linregr_train('houses_json', 'result_lin_houses_json', 
'(data->>''pr''''ice'')::integer',
-                    'array[1, (data->>''ta,x'')::integer, 
(data->>''ba$th'')::double precision, (data->>''size'')::integer]',
-                    'data->>''bedroom''', True);
-PERFORM assert(
-    relative_error(coef, array[-84242.0345, 55.4430, -78966.9754, 225.6119]) < 
1e-2 and
-    relative_error(r2, 0.9688) < 1e-2 and
-    relative_error(std_err, array[35019.00, 19.57, 23036.81, 49.04]) < 1e-2 and
-    relative_error(t_stats, array[-2.406, 2.833, -3.428, 4.600]) < 1e-2 and
-    relative_error(p_values, array[0.251, 0.216, 0.181, 0.136]) < 1e-2 and
-    relative_error(condition_no, 10086.1) < 1e-2 and
-    relative_error(bp_stats, 2.5451) < 1e-2 and
-    relative_error(bp_p_value, 0.467192) < 1e-2,
-    'Linear regression with heteroskedasticity & grouping (houses): Wrong 
results'
-)
-from result_lin_houses_json
-where "data->>'bedroom'"::integer = 2;
-
-else 
-
-
-
-DROP TABLE IF EXISTS houses_spcl;
-CREATE TABLE houses_spcl (
-    id SERIAL NOT NULL,
-    "ta,x" INTEGER,
-    "bed,room" REAL,
-    "ba$th" REAL,
-    "pr'ice" INTEGER,
-    size INTEGER,
-    lot INTEGER
-);
-
-INSERT INTO houses_spcl("ta,x", "bed,room", "ba$th", "pr'ice", size, lot) 
VALUES
-( 590, 2, 1,    50000,  770, 22100),
-(1050, 3, 2,    85000, 1410, 12000),
-(  20, 3, 1,    22500, 1060, 3500 ),
-( 870, 2, 2,    90000, 1300, 17500),
-(1320, 3, 2,   133000, 1500, 30000),
-(1350, 2, 1,    90500,  820, 25700),
-(2790, 3, 2.5, 260000, 2130, 25000),
-( 680, 2, 1,   142500, 1170, 22000),
-(1840, 3, 2,   160000, 1500, 19000),
-(3680, 4, 2,   240000, 2790, 20000),
-(1660, 3, 1,    87000, 1030, 17500),
-(1620, 3, 2,   118600, 1250, 20000),
-(3100, 3, 2,   140000, 1760, 38000),
-(2070, 2, 3,   148000, 1550, 14000),
-( 650, 3, 1.5,  65000, 1450, 12000);
-
-drop table if exists result_lin_houses_spcl;
-drop table if exists result_lin_houses_spcl_summary;
-PERFORM linregr_train('houses_spcl', 'result_lin_houses_spcl', '"pr''ice"',
-                    'array[1, "ta,x", "ba$th", size]',
-                    '"bed,room"', True);
-PERFORM assert(
-    relative_error(coef, array[-84242.0345, 55.4430, -78966.9754, 225.6119]) < 
1e-2 and
-    relative_error(r2, 0.9688) < 1e-2 and
-    relative_error(std_err, array[35019.00, 19.57, 23036.81, 49.04]) < 1e-2 and
-    relative_error(t_stats, array[-2.406, 2.833, -3.428, 4.600]) < 1e-2 and
-    relative_error(p_values, array[0.251, 0.216, 0.181, 0.136]) < 1e-2 and
-    relative_error(condition_no, 10086.1) < 1e-2 and
-    relative_error(bp_stats, 2.5451) < 1e-2 and
-    relative_error(bp_p_value, 0.467192) < 1e-2,
-    'Linear regression with heteroskedasticity & grouping (houses): Wrong 
results'
-)
-from result_lin_houses_spcl
-where result_lin_houses_spcl."bed,room" = 2;
-
-
-end IF;
-
-end;
-$$ LANGUAGE plpgsql;
-
-
-select linereg_expr_test();
-
-
-
-------------------------------------------------------------------------
-
--- Test Case for : when grouping_cols is NULL and the 
heteroskedasticity_option is True
-
-
-
-
-drop table if exists result_lin_houses;
-drop table if exists result_lin_houses_summary;
-select linregr_train('houses', 'result_lin_houses', 'price',
-                    'array[1, tax, bath, size]',
-                    NULL, True);
-
-select assert(
-    relative_error(coef, 
array[-12849.4168959872,28.9613922651765,10181.6290712648,50.516894915354]) < 
1e-2 and
-    relative_error(r2, 0.768577580597443) < 1e-2 and
-    relative_error(std_err, 
array[33453.0344331391,15.8992104963997,19437.7710925923,32.928023174087]) < 
1e-2 and
-    relative_error(t_stats, 
array[-0.38410317968819,1.82156166004184,0.523806408809133,1.53416118083605]) < 
1e-2 and
-    relative_error(p_values, 
array[0.708223134615422,0.0958005827189772,0.610804093526536,0.153235085548186])
 < 1e-2 and
-    relative_error(condition_no, 9002.50) < 1e-2 and
-    relative_error(bp_stats, 1.22605243985138) < 1e-2 and
-    relative_error(bp_p_value, 0.746762880478034) < 1e-2,
-    'Linear regression with heteroskedasticity with no grouping: Wrong results'
-)
-from result_lin_houses;
-

[madlib] 01/02: Revert "Linear Regression: Support for JSON and special characters"

Reply via email to