This is an automated email from the ASF dual-hosted git repository.
okislal pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git
The following commit(s) were added to refs/heads/master by this push:
new afd408d Linear Regression: Support for JSON and special characters
afd408d is described below
commit afd408d0f73cb849bbbdc651642d8aa39e51ffcf
Author: Orhan Kislal <[email protected]>
AuthorDate: Tue Jan 22 12:50:54 2019 +0300
Linear Regression: Support for JSON and special characters
JIRA: MADLIB-1284
- Modified code in linear.py_in to support special characters and JSON data
type
- Written test case for JSON and Speacial characters
- Test case checks for JSON and special characters in GPDB 5.x and Special
characters in GPDB 4.x
Co-authored-by: Orhan Kislal <[email protected]>
Closes #343
---
src/ports/postgres/modules/regress/linear.py_in | 78 ++++++++---
.../postgres/modules/regress/test/linear.sql_in | 150 +++++++++++++++++++++
2 files changed, 211 insertions(+), 17 deletions(-)
diff --git a/src/ports/postgres/modules/regress/linear.py_in
b/src/ports/postgres/modules/regress/linear.py_in
index 70fe78a..2be78e9 100644
--- a/src/ports/postgres/modules/regress/linear.py_in
+++ b/src/ports/postgres/modules/regress/linear.py_in
@@ -11,19 +11,53 @@ from utilities.validate_args import table_is_empty
from utilities.utilities import add_postfix
from utilities.utilities import _assert
from utilities.control import MinWarning
+from utilities.utilities import get_table_qualified_col_str
+from utilities.utilities import strip_end_quotes
+from utilities.utilities import _string_to_array
+from utilities.validate_args import get_expr_type
+from utilities.utilities import _string_to_sql_array
+from utilities.validate_args import quote_ident
+
+
# ----------------------------------------------------------------------
def linregr_train(schema_madlib, source_table, out_table,
dependent_varname, independent_varname, grouping_cols,
heteroskedasticity_option, **kwargs):
+ """
+ Args:
+ @param source_table, -- name of input table
+ @param out_table, -- name of output table
+ @param dependent_varname, -- name of dependent variable
+ @param independent_varname, -- name of independent variables
+ @param grouping_cols, -- names of columns to group-by
+ @param heteroskedasticity_option -- perform heteroskedasticity test?
+
+ """
+
with MinWarning('warning'):
_validate_args(schema_madlib, source_table, out_table,
dependent_varname, independent_varname,
grouping_cols, heteroskedasticity_option)
- group_str = '' if grouping_cols is None else 'GROUP BY %s' %
grouping_cols
- group_str_sel = '' if grouping_cols is None else grouping_cols + ','
+ group_col_list = ''if grouping_cols is None else
_string_to_array_with_quotes(
+ grouping_cols)
+
+ group_str = '' if grouping_cols is None else 'GROUP BY %s' %
get_table_qualified_col_str(source_table,
+
group_col_list)
+ # For json expressions like data->>'ID', this will quote them to
+ # "data->>'ID'"
+ cols_wo_quotes = ''if grouping_cols is None else ' '.join(
+ strip_end_quotes(col).format(**locals()) for col in group_col_list)
+ group_cols_w_quotes = ''if grouping_cols is None else ' ,'.join(
+ " \"" + strip_end_quotes(col) + "\" ".format(**locals()) for col
in group_col_list)
+
+ # For json expressions like data->>'ID' it will alias the columns to
+ # "data->>'ID'"
+ group_str_sel = ''if grouping_cols is None else ' ,
'.join("{source_table}.{col} as \"".format(source_table=source_table, col=col)
+ strip_end_quotes(col) + "\" "
+ for col in
group_col_list) + ","
+
join_str = ',' if grouping_cols is None else 'JOIN'
using_str = '' if grouping_cols is None else 'USING (%s)' %
grouping_cols
@@ -50,6 +84,13 @@ def linregr_train(schema_madlib, source_table, out_table,
independent_varname=independent_varname,
source_table=source_table))
+ # USING Clause does not support expressions. So modifying to the
regular
+ # join clause instead.
+ join_clause = '' if grouping_cols is None else " ON " + ' AND
'.join("{source_table}.{col} = {temp_lin_rst}.\""
+
.format(source_table=source_table, col=col,
+
temp_lin_rst=temp_lin_rst) + strip_end_quotes(col) + "\""
+
for col in group_col_list)
+
# Run heteroskedasticity test
if heteroskedasticity_option:
temp_hsk_rst = unique_string()
@@ -64,15 +105,15 @@ def linregr_train(schema_madlib, source_table, out_table,
{independent_varname},
{temp_lin_rst}.coef) AS hsk_rst
FROM
- {source_table} {join_str} {temp_lin_rst} {using_str}
- {group_str}
+ {source_table} {join_str} {temp_lin_rst} {join_clause}
+ {group_str}
""".format(schema_madlib=schema_madlib,
temp_hsk_rst=temp_hsk_rst,
dependent_varname=dependent_varname,
independent_varname=independent_varname,
group_str_sel=group_str_sel, group_str=group_str,
- join_str=join_str, using_str=using_str,
- source_table=source_table,
temp_lin_rst=temp_lin_rst))
+ join_str=join_str, source_table=source_table,
+ temp_lin_rst=temp_lin_rst, join_clause=join_clause))
# Output the results
join_str = ''
@@ -80,17 +121,19 @@ def linregr_train(schema_madlib, source_table, out_table,
if heteroskedasticity_option:
if grouping_cols is not None:
join_str = 'JOIN %s AS hsk' % temp_hsk_rst
- using_str = 'USING (%s)' % (grouping_cols)
+ using_str = 'USING (%s)' % group_cols_w_quotes
else:
join_str = ', %s AS hsk' % temp_hsk_rst
bp_stats = '(hsk.hsk_rst).bp_stats,' if heteroskedasticity_option else
''
bp_p_value = '(hsk.hsk_rst).bp_p_value,' if heteroskedasticity_option
else ''
+ group_cols_w_quotes = '' if grouping_cols is None else
group_cols_w_quotes + ","
+
plpy.execute(
"""
CREATE TABLE {out_table} AS
SELECT
- {group_str_sel}
+ {group_cols_w_quotes}
coef,
r2,
std_err,
@@ -109,10 +152,11 @@ def linregr_train(schema_madlib, source_table, out_table,
END AS num_missing_rows_skipped,
vcov as variance_covariance
FROM
- {temp_lin_rst} AS lin {join_str} {using_str}
- """.format(out_table=out_table, group_str_sel=group_str_sel,
+ {temp_lin_rst} AS lin {join_str} {using_str}
+ """.format(out_table=out_table,
bp_stats=bp_stats, bp_p_value=bp_p_value,
- temp_lin_rst=temp_lin_rst, join_str=join_str,
using_str=using_str))
+ temp_lin_rst=temp_lin_rst, join_str=join_str,
using_str=using_str,
+ group_cols_w_quotes=group_cols_w_quotes))
num_rows = plpy.execute(
"""
@@ -134,8 +178,8 @@ def linregr_train(schema_madlib, source_table, out_table,
'linregr'::varchar as method
, '{source_table}'::varchar as source_table
, '{out_table}'::varchar as out_table
- , '{dependent_varname}'::varchar as dependent_varname
- , '{independent_varname}'::varchar as
independent_varname
+ , $__madlib__${dependent_varname}$__madlib__$::varchar
as dependent_varname
+ , $__madlib__${independent_varname}$__madlib__$::varchar
as independent_varname
, {num_rows_processed}::integer as num_rows_processed
, {num_rows_skipped}::integer as
num_missing_rows_skipped
, {grouping_col}::text as grouping_col
@@ -143,7 +187,7 @@ def linregr_train(schema_madlib, source_table, out_table,
out_table_summary=out_table_summary,
dependent_varname=dependent_varname,
independent_varname=independent_varname,
- grouping_col="'" + grouping_cols + "'" if grouping_cols
else "NULL",
+
grouping_col="$__madlib__$"+grouping_cols+"$__madlib__$" if grouping_cols else
"NULL",
**num_rows))
return None
# ----------------------------------------------------------------------
@@ -185,10 +229,9 @@ def _validate_args(schema_madlib, source_table, out_table,
dependent_varname,
if grouping_cols is not None:
_assert(grouping_cols != '',
"Linregr error: Invalid grouping columns name!")
+ # grouping columns can be a valid expression as well, for eg.
+ # a json expression (data->>'id'), so commenting this part.
grouping_list = _string_to_array_with_quotes(grouping_cols)
- _assert(columns_exist_in_table(
- source_table, grouping_list, schema_madlib),
- "Linregr error: Grouping column does not exist!")
predefined = set(('coef', 'r2', 'std_err', 't_stats',
'p_values', 'condition_no',
@@ -278,6 +321,7 @@ def linregr_help_message(schema_madlib, message, **kwargs):
return help_string.format(schema_madlib=schema_madlib)
+
def linregr_predict_help_message(schema_madlib, message, **kwargs):
""" Help message for Prediction in Linear Regression
diff --git a/src/ports/postgres/modules/regress/test/linear.sql_in
b/src/ports/postgres/modules/regress/test/linear.sql_in
index 9013108..612914a 100644
--- a/src/ports/postgres/modules/regress/test/linear.sql_in
+++ b/src/ports/postgres/modules/regress/test/linear.sql_in
@@ -213,3 +213,153 @@ select linregr_train('example');
select linregr_predict();
select linregr_predict('usage');
select linregr_predict('example');
+
+
+
+
+
+------------------------------------------------------------------------
+
+
+-- Test case for JSON Data Type and Special characters.
+-- This function checks special characters for GPDB version = 4.x and JSON for
5.x and above
+
+create or replace function linereg_expr_test() RETURNS VOID AS
+$$
+DECLARE col_type TEXT;
+
+begin
+select typname into col_type from pg_type where typname = 'json' ;
+if col_type = 'json' THEN
+
+DROP TABLE IF EXISTS houses_json;
+CREATE TABLE houses_json (
+ id SERIAL NOT NULL,
+ data json
+);
+
+
+INSERT INTO houses_json VALUES
+( 2 ,'{ "ta,x": 590, "bedroom":2, "ba$th":1, "pr''ice": 50000, "size":
770, "lot":22100 }'),
+( 4 ,'{ "ta,x": 1050, "bedroom":3, "ba$th":2, "pr''ice": 85000,
"size":1410, "lot":12000 }'),
+( 1 ,'{ "ta,x": 20, "bedroom":3, "ba$th":1, "pr''ice": 22500,
"size":1060, "lot":3500 }'),
+( 6 ,'{ "ta,x": 870, "bedroom":2, "ba$th":2, "pr''ice": 90000,
"size":1300, "lot":17500 }'),
+( 3 ,'{ "ta,x": 1320, "bedroom":3, "ba$th":2, "pr''ice": 133000,
"size":1500, "lot":30000 }'),
+( 5 ,'{ "ta,x": 1350, "bedroom":2, "ba$th":1, "pr''ice": 90500, "size":
820, "lot":25700 }'),
+( 7 ,'{ "ta,x": 2790, "bedroom":3, "ba$th":2.5, "pr''ice": 260000,
"size":2130, "lot":25000 }'),
+( 10 ,'{ "ta,x": 680, "bedroom":2, "ba$th":1, "pr''ice": 142500,
"size":1170, "lot":22000 }'),
+( 9 ,'{ "ta,x": 1840, "bedroom":3, "ba$th":2, "pr''ice": 160000,
"size":1500, "lot":19000 }'),
+( 8 ,'{ "ta,x": 3680, "bedroom":4, "ba$th":2, "pr''ice": 240000,
"size":2790, "lot":20000 }'),
+( 11 ,'{ "ta,x": 1660, "bedroom":3, "ba$th":1, "pr''ice": 87000,
"size":1030, "lot":17500 }'),
+( 12 ,'{ "ta,x": 1620, "bedroom":3, "ba$th":2, "pr''ice": 118600,
"size":1250, "lot":20000 }'),
+( 13 ,'{ "ta,x": 3100, "bedroom":3, "ba$th":2, "pr''ice": 140000,
"size":1760, "lot":38000 }'),
+( 14 ,'{ "ta,x": 2070, "bedroom":2, "ba$th":3, "pr''ice": 148000,
"size":1550, "lot":14000 }'),
+( 15 ,'{ "ta,x": 650, "bedroom":3, "ba$th":1.5, "pr''ice": 65000,
"size":1450, "lot":12000 }');
+
+drop table if exists result_lin_houses_json;
+drop table if exists result_lin_houses_json_summary;
+PERFORM linregr_train('houses_json', 'result_lin_houses_json',
'(data->>''pr''''ice'')::integer',
+ 'array[1, (data->>''ta,x'')::integer,
(data->>''ba$th'')::double precision, (data->>''size'')::integer]',
+ 'data->>''bedroom''', True);
+PERFORM assert(
+ relative_error(coef, array[-84242.0345, 55.4430, -78966.9754, 225.6119]) <
1e-2 and
+ relative_error(r2, 0.9688) < 1e-2 and
+ relative_error(std_err, array[35019.00, 19.57, 23036.81, 49.04]) < 1e-2 and
+ relative_error(t_stats, array[-2.406, 2.833, -3.428, 4.600]) < 1e-2 and
+ relative_error(p_values, array[0.251, 0.216, 0.181, 0.136]) < 1e-2 and
+ relative_error(condition_no, 10086.1) < 1e-2 and
+ relative_error(bp_stats, 2.5451) < 1e-2 and
+ relative_error(bp_p_value, 0.467192) < 1e-2,
+ 'Linear regression with heteroskedasticity & grouping (houses): Wrong
results'
+)
+from result_lin_houses_json
+where "data->>'bedroom'"::integer = 2;
+
+else
+
+
+
+DROP TABLE IF EXISTS houses_spcl;
+CREATE TABLE houses_spcl (
+ id SERIAL NOT NULL,
+ "ta,x" INTEGER,
+ "bed,room" REAL,
+ "ba$th" REAL,
+ "pr'ice" INTEGER,
+ size INTEGER,
+ lot INTEGER
+);
+
+INSERT INTO houses_spcl("ta,x", "bed,room", "ba$th", "pr'ice", size, lot)
VALUES
+( 590, 2, 1, 50000, 770, 22100),
+(1050, 3, 2, 85000, 1410, 12000),
+( 20, 3, 1, 22500, 1060, 3500 ),
+( 870, 2, 2, 90000, 1300, 17500),
+(1320, 3, 2, 133000, 1500, 30000),
+(1350, 2, 1, 90500, 820, 25700),
+(2790, 3, 2.5, 260000, 2130, 25000),
+( 680, 2, 1, 142500, 1170, 22000),
+(1840, 3, 2, 160000, 1500, 19000),
+(3680, 4, 2, 240000, 2790, 20000),
+(1660, 3, 1, 87000, 1030, 17500),
+(1620, 3, 2, 118600, 1250, 20000),
+(3100, 3, 2, 140000, 1760, 38000),
+(2070, 2, 3, 148000, 1550, 14000),
+( 650, 3, 1.5, 65000, 1450, 12000);
+
+drop table if exists result_lin_houses_spcl;
+drop table if exists result_lin_houses_spcl_summary;
+PERFORM linregr_train('houses_spcl', 'result_lin_houses_spcl', '"pr''ice"',
+ 'array[1, "ta,x", "ba$th", size]',
+ '"bed,room"', True);
+PERFORM assert(
+ relative_error(coef, array[-84242.0345, 55.4430, -78966.9754, 225.6119]) <
1e-2 and
+ relative_error(r2, 0.9688) < 1e-2 and
+ relative_error(std_err, array[35019.00, 19.57, 23036.81, 49.04]) < 1e-2 and
+ relative_error(t_stats, array[-2.406, 2.833, -3.428, 4.600]) < 1e-2 and
+ relative_error(p_values, array[0.251, 0.216, 0.181, 0.136]) < 1e-2 and
+ relative_error(condition_no, 10086.1) < 1e-2 and
+ relative_error(bp_stats, 2.5451) < 1e-2 and
+ relative_error(bp_p_value, 0.467192) < 1e-2,
+ 'Linear regression with heteroskedasticity & grouping (houses): Wrong
results'
+)
+from result_lin_houses_spcl
+where result_lin_houses_spcl."bed,room" = 2;
+
+
+end IF;
+
+end;
+$$ LANGUAGE plpgsql;
+
+
+select linereg_expr_test();
+
+
+
+------------------------------------------------------------------------
+
+-- Test Case for : when grouping_cols is NULL and the
heteroskedasticity_option is True
+
+
+
+
+drop table if exists result_lin_houses;
+drop table if exists result_lin_houses_summary;
+select linregr_train('houses', 'result_lin_houses', 'price',
+ 'array[1, tax, bath, size]',
+ NULL, True);
+
+select assert(
+ relative_error(coef,
array[-12849.4168959872,28.9613922651765,10181.6290712648,50.516894915354]) <
1e-2 and
+ relative_error(r2, 0.768577580597443) < 1e-2 and
+ relative_error(std_err,
array[33453.0344331391,15.8992104963997,19437.7710925923,32.928023174087]) <
1e-2 and
+ relative_error(t_stats,
array[-0.38410317968819,1.82156166004184,0.523806408809133,1.53416118083605]) <
1e-2 and
+ relative_error(p_values,
array[0.708223134615422,0.0958005827189772,0.610804093526536,0.153235085548186])
< 1e-2 and
+ relative_error(condition_no, 9002.50) < 1e-2 and
+ relative_error(bp_stats, 1.22605243985138) < 1e-2 and
+ relative_error(bp_p_value, 0.746762880478034) < 1e-2,
+ 'Linear regression with heteroskedasticity with no grouping: Wrong results'
+)
+from result_lin_houses;
+