This is an automated email from the ASF dual-hosted git repository. okislal pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/madlib.git
commit 56f29f77a137294ca2cde66a860006d808b556ca Author: Orhan Kislal <[email protected]> AuthorDate: Tue Jan 22 13:04:34 2019 +0300 Revert "Linear Regression: Support for JSON and special characters" This reverts commit afd408d0f73cb849bbbdc651642d8aa39e51ffcf to fix the author info. --- src/ports/postgres/modules/regress/linear.py_in | 78 +++-------- .../postgres/modules/regress/test/linear.sql_in | 150 --------------------- 2 files changed, 17 insertions(+), 211 deletions(-) diff --git a/src/ports/postgres/modules/regress/linear.py_in b/src/ports/postgres/modules/regress/linear.py_in index 2be78e9..70fe78a 100644 --- a/src/ports/postgres/modules/regress/linear.py_in +++ b/src/ports/postgres/modules/regress/linear.py_in @@ -11,53 +11,19 @@ from utilities.validate_args import table_is_empty from utilities.utilities import add_postfix from utilities.utilities import _assert from utilities.control import MinWarning -from utilities.utilities import get_table_qualified_col_str -from utilities.utilities import strip_end_quotes -from utilities.utilities import _string_to_array -from utilities.validate_args import get_expr_type -from utilities.utilities import _string_to_sql_array -from utilities.validate_args import quote_ident - - # ---------------------------------------------------------------------- def linregr_train(schema_madlib, source_table, out_table, dependent_varname, independent_varname, grouping_cols, heteroskedasticity_option, **kwargs): - """ - Args: - @param source_table, -- name of input table - @param out_table, -- name of output table - @param dependent_varname, -- name of dependent variable - @param independent_varname, -- name of independent variables - @param grouping_cols, -- names of columns to group-by - @param heteroskedasticity_option -- perform heteroskedasticity test? - - """ - with MinWarning('warning'): _validate_args(schema_madlib, source_table, out_table, dependent_varname, independent_varname, grouping_cols, heteroskedasticity_option) - group_col_list = ''if grouping_cols is None else _string_to_array_with_quotes( - grouping_cols) - - group_str = '' if grouping_cols is None else 'GROUP BY %s' % get_table_qualified_col_str(source_table, - group_col_list) - # For json expressions like data->>'ID', this will quote them to - # "data->>'ID'" - cols_wo_quotes = ''if grouping_cols is None else ' '.join( - strip_end_quotes(col).format(**locals()) for col in group_col_list) - group_cols_w_quotes = ''if grouping_cols is None else ' ,'.join( - " \"" + strip_end_quotes(col) + "\" ".format(**locals()) for col in group_col_list) - - # For json expressions like data->>'ID' it will alias the columns to - # "data->>'ID'" - group_str_sel = ''if grouping_cols is None else ' , '.join("{source_table}.{col} as \"".format(source_table=source_table, col=col) + strip_end_quotes(col) + "\" " - for col in group_col_list) + "," - + group_str = '' if grouping_cols is None else 'GROUP BY %s' % grouping_cols + group_str_sel = '' if grouping_cols is None else grouping_cols + ',' join_str = ',' if grouping_cols is None else 'JOIN' using_str = '' if grouping_cols is None else 'USING (%s)' % grouping_cols @@ -84,13 +50,6 @@ def linregr_train(schema_madlib, source_table, out_table, independent_varname=independent_varname, source_table=source_table)) - # USING Clause does not support expressions. So modifying to the regular - # join clause instead. - join_clause = '' if grouping_cols is None else " ON " + ' AND '.join("{source_table}.{col} = {temp_lin_rst}.\"" - .format(source_table=source_table, col=col, - temp_lin_rst=temp_lin_rst) + strip_end_quotes(col) + "\"" - for col in group_col_list) - # Run heteroskedasticity test if heteroskedasticity_option: temp_hsk_rst = unique_string() @@ -105,15 +64,15 @@ def linregr_train(schema_madlib, source_table, out_table, {independent_varname}, {temp_lin_rst}.coef) AS hsk_rst FROM - {source_table} {join_str} {temp_lin_rst} {join_clause} - {group_str} + {source_table} {join_str} {temp_lin_rst} {using_str} + {group_str} """.format(schema_madlib=schema_madlib, temp_hsk_rst=temp_hsk_rst, dependent_varname=dependent_varname, independent_varname=independent_varname, group_str_sel=group_str_sel, group_str=group_str, - join_str=join_str, source_table=source_table, - temp_lin_rst=temp_lin_rst, join_clause=join_clause)) + join_str=join_str, using_str=using_str, + source_table=source_table, temp_lin_rst=temp_lin_rst)) # Output the results join_str = '' @@ -121,19 +80,17 @@ def linregr_train(schema_madlib, source_table, out_table, if heteroskedasticity_option: if grouping_cols is not None: join_str = 'JOIN %s AS hsk' % temp_hsk_rst - using_str = 'USING (%s)' % group_cols_w_quotes + using_str = 'USING (%s)' % (grouping_cols) else: join_str = ', %s AS hsk' % temp_hsk_rst bp_stats = '(hsk.hsk_rst).bp_stats,' if heteroskedasticity_option else '' bp_p_value = '(hsk.hsk_rst).bp_p_value,' if heteroskedasticity_option else '' - group_cols_w_quotes = '' if grouping_cols is None else group_cols_w_quotes + "," - plpy.execute( """ CREATE TABLE {out_table} AS SELECT - {group_cols_w_quotes} + {group_str_sel} coef, r2, std_err, @@ -152,11 +109,10 @@ def linregr_train(schema_madlib, source_table, out_table, END AS num_missing_rows_skipped, vcov as variance_covariance FROM - {temp_lin_rst} AS lin {join_str} {using_str} - """.format(out_table=out_table, + {temp_lin_rst} AS lin {join_str} {using_str} + """.format(out_table=out_table, group_str_sel=group_str_sel, bp_stats=bp_stats, bp_p_value=bp_p_value, - temp_lin_rst=temp_lin_rst, join_str=join_str, using_str=using_str, - group_cols_w_quotes=group_cols_w_quotes)) + temp_lin_rst=temp_lin_rst, join_str=join_str, using_str=using_str)) num_rows = plpy.execute( """ @@ -178,8 +134,8 @@ def linregr_train(schema_madlib, source_table, out_table, 'linregr'::varchar as method , '{source_table}'::varchar as source_table , '{out_table}'::varchar as out_table - , $__madlib__${dependent_varname}$__madlib__$::varchar as dependent_varname - , $__madlib__${independent_varname}$__madlib__$::varchar as independent_varname + , '{dependent_varname}'::varchar as dependent_varname + , '{independent_varname}'::varchar as independent_varname , {num_rows_processed}::integer as num_rows_processed , {num_rows_skipped}::integer as num_missing_rows_skipped , {grouping_col}::text as grouping_col @@ -187,7 +143,7 @@ def linregr_train(schema_madlib, source_table, out_table, out_table_summary=out_table_summary, dependent_varname=dependent_varname, independent_varname=independent_varname, - grouping_col="$__madlib__$"+grouping_cols+"$__madlib__$" if grouping_cols else "NULL", + grouping_col="'" + grouping_cols + "'" if grouping_cols else "NULL", **num_rows)) return None # ---------------------------------------------------------------------- @@ -229,9 +185,10 @@ def _validate_args(schema_madlib, source_table, out_table, dependent_varname, if grouping_cols is not None: _assert(grouping_cols != '', "Linregr error: Invalid grouping columns name!") - # grouping columns can be a valid expression as well, for eg. - # a json expression (data->>'id'), so commenting this part. grouping_list = _string_to_array_with_quotes(grouping_cols) + _assert(columns_exist_in_table( + source_table, grouping_list, schema_madlib), + "Linregr error: Grouping column does not exist!") predefined = set(('coef', 'r2', 'std_err', 't_stats', 'p_values', 'condition_no', @@ -321,7 +278,6 @@ def linregr_help_message(schema_madlib, message, **kwargs): return help_string.format(schema_madlib=schema_madlib) - def linregr_predict_help_message(schema_madlib, message, **kwargs): """ Help message for Prediction in Linear Regression diff --git a/src/ports/postgres/modules/regress/test/linear.sql_in b/src/ports/postgres/modules/regress/test/linear.sql_in index 612914a..9013108 100644 --- a/src/ports/postgres/modules/regress/test/linear.sql_in +++ b/src/ports/postgres/modules/regress/test/linear.sql_in @@ -213,153 +213,3 @@ select linregr_train('example'); select linregr_predict(); select linregr_predict('usage'); select linregr_predict('example'); - - - - - ------------------------------------------------------------------------- - - --- Test case for JSON Data Type and Special characters. --- This function checks special characters for GPDB version = 4.x and JSON for 5.x and above - -create or replace function linereg_expr_test() RETURNS VOID AS -$$ -DECLARE col_type TEXT; - -begin -select typname into col_type from pg_type where typname = 'json' ; -if col_type = 'json' THEN - -DROP TABLE IF EXISTS houses_json; -CREATE TABLE houses_json ( - id SERIAL NOT NULL, - data json -); - - -INSERT INTO houses_json VALUES -( 2 ,'{ "ta,x": 590, "bedroom":2, "ba$th":1, "pr''ice": 50000, "size": 770, "lot":22100 }'), -( 4 ,'{ "ta,x": 1050, "bedroom":3, "ba$th":2, "pr''ice": 85000, "size":1410, "lot":12000 }'), -( 1 ,'{ "ta,x": 20, "bedroom":3, "ba$th":1, "pr''ice": 22500, "size":1060, "lot":3500 }'), -( 6 ,'{ "ta,x": 870, "bedroom":2, "ba$th":2, "pr''ice": 90000, "size":1300, "lot":17500 }'), -( 3 ,'{ "ta,x": 1320, "bedroom":3, "ba$th":2, "pr''ice": 133000, "size":1500, "lot":30000 }'), -( 5 ,'{ "ta,x": 1350, "bedroom":2, "ba$th":1, "pr''ice": 90500, "size": 820, "lot":25700 }'), -( 7 ,'{ "ta,x": 2790, "bedroom":3, "ba$th":2.5, "pr''ice": 260000, "size":2130, "lot":25000 }'), -( 10 ,'{ "ta,x": 680, "bedroom":2, "ba$th":1, "pr''ice": 142500, "size":1170, "lot":22000 }'), -( 9 ,'{ "ta,x": 1840, "bedroom":3, "ba$th":2, "pr''ice": 160000, "size":1500, "lot":19000 }'), -( 8 ,'{ "ta,x": 3680, "bedroom":4, "ba$th":2, "pr''ice": 240000, "size":2790, "lot":20000 }'), -( 11 ,'{ "ta,x": 1660, "bedroom":3, "ba$th":1, "pr''ice": 87000, "size":1030, "lot":17500 }'), -( 12 ,'{ "ta,x": 1620, "bedroom":3, "ba$th":2, "pr''ice": 118600, "size":1250, "lot":20000 }'), -( 13 ,'{ "ta,x": 3100, "bedroom":3, "ba$th":2, "pr''ice": 140000, "size":1760, "lot":38000 }'), -( 14 ,'{ "ta,x": 2070, "bedroom":2, "ba$th":3, "pr''ice": 148000, "size":1550, "lot":14000 }'), -( 15 ,'{ "ta,x": 650, "bedroom":3, "ba$th":1.5, "pr''ice": 65000, "size":1450, "lot":12000 }'); - -drop table if exists result_lin_houses_json; -drop table if exists result_lin_houses_json_summary; -PERFORM linregr_train('houses_json', 'result_lin_houses_json', '(data->>''pr''''ice'')::integer', - 'array[1, (data->>''ta,x'')::integer, (data->>''ba$th'')::double precision, (data->>''size'')::integer]', - 'data->>''bedroom''', True); -PERFORM assert( - relative_error(coef, array[-84242.0345, 55.4430, -78966.9754, 225.6119]) < 1e-2 and - relative_error(r2, 0.9688) < 1e-2 and - relative_error(std_err, array[35019.00, 19.57, 23036.81, 49.04]) < 1e-2 and - relative_error(t_stats, array[-2.406, 2.833, -3.428, 4.600]) < 1e-2 and - relative_error(p_values, array[0.251, 0.216, 0.181, 0.136]) < 1e-2 and - relative_error(condition_no, 10086.1) < 1e-2 and - relative_error(bp_stats, 2.5451) < 1e-2 and - relative_error(bp_p_value, 0.467192) < 1e-2, - 'Linear regression with heteroskedasticity & grouping (houses): Wrong results' -) -from result_lin_houses_json -where "data->>'bedroom'"::integer = 2; - -else - - - -DROP TABLE IF EXISTS houses_spcl; -CREATE TABLE houses_spcl ( - id SERIAL NOT NULL, - "ta,x" INTEGER, - "bed,room" REAL, - "ba$th" REAL, - "pr'ice" INTEGER, - size INTEGER, - lot INTEGER -); - -INSERT INTO houses_spcl("ta,x", "bed,room", "ba$th", "pr'ice", size, lot) VALUES -( 590, 2, 1, 50000, 770, 22100), -(1050, 3, 2, 85000, 1410, 12000), -( 20, 3, 1, 22500, 1060, 3500 ), -( 870, 2, 2, 90000, 1300, 17500), -(1320, 3, 2, 133000, 1500, 30000), -(1350, 2, 1, 90500, 820, 25700), -(2790, 3, 2.5, 260000, 2130, 25000), -( 680, 2, 1, 142500, 1170, 22000), -(1840, 3, 2, 160000, 1500, 19000), -(3680, 4, 2, 240000, 2790, 20000), -(1660, 3, 1, 87000, 1030, 17500), -(1620, 3, 2, 118600, 1250, 20000), -(3100, 3, 2, 140000, 1760, 38000), -(2070, 2, 3, 148000, 1550, 14000), -( 650, 3, 1.5, 65000, 1450, 12000); - -drop table if exists result_lin_houses_spcl; -drop table if exists result_lin_houses_spcl_summary; -PERFORM linregr_train('houses_spcl', 'result_lin_houses_spcl', '"pr''ice"', - 'array[1, "ta,x", "ba$th", size]', - '"bed,room"', True); -PERFORM assert( - relative_error(coef, array[-84242.0345, 55.4430, -78966.9754, 225.6119]) < 1e-2 and - relative_error(r2, 0.9688) < 1e-2 and - relative_error(std_err, array[35019.00, 19.57, 23036.81, 49.04]) < 1e-2 and - relative_error(t_stats, array[-2.406, 2.833, -3.428, 4.600]) < 1e-2 and - relative_error(p_values, array[0.251, 0.216, 0.181, 0.136]) < 1e-2 and - relative_error(condition_no, 10086.1) < 1e-2 and - relative_error(bp_stats, 2.5451) < 1e-2 and - relative_error(bp_p_value, 0.467192) < 1e-2, - 'Linear regression with heteroskedasticity & grouping (houses): Wrong results' -) -from result_lin_houses_spcl -where result_lin_houses_spcl."bed,room" = 2; - - -end IF; - -end; -$$ LANGUAGE plpgsql; - - -select linereg_expr_test(); - - - ------------------------------------------------------------------------- - --- Test Case for : when grouping_cols is NULL and the heteroskedasticity_option is True - - - - -drop table if exists result_lin_houses; -drop table if exists result_lin_houses_summary; -select linregr_train('houses', 'result_lin_houses', 'price', - 'array[1, tax, bath, size]', - NULL, True); - -select assert( - relative_error(coef, array[-12849.4168959872,28.9613922651765,10181.6290712648,50.516894915354]) < 1e-2 and - relative_error(r2, 0.768577580597443) < 1e-2 and - relative_error(std_err, array[33453.0344331391,15.8992104963997,19437.7710925923,32.928023174087]) < 1e-2 and - relative_error(t_stats, array[-0.38410317968819,1.82156166004184,0.523806408809133,1.53416118083605]) < 1e-2 and - relative_error(p_values, array[0.708223134615422,0.0958005827189772,0.610804093526536,0.153235085548186]) < 1e-2 and - relative_error(condition_no, 9002.50) < 1e-2 and - relative_error(bp_stats, 1.22605243985138) < 1e-2 and - relative_error(bp_p_value, 0.746762880478034) < 1e-2, - 'Linear regression with heteroskedasticity with no grouping: Wrong results' -) -from result_lin_houses; -
