This is an automated email from the ASF dual-hosted git repository.
njayaram pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git
The following commit(s) were added to refs/heads/master by this push:
new f6a7ddb DL: Fix predict with 'NULL' string class values
f6a7ddb is described below
commit f6a7ddbbeb35f045475e286615845fc92b15f097
Author: Nandish Jayaram <[email protected]>
AuthorDate: Fri Jun 7 14:29:59 2019 -0700
DL: Fix predict with 'NULL' string class values
JIRA: MADLIB-1357
Fix handling of 'NULL' string for class values in predict. 'NULL' string
as a class value was getting reported the same way as a Postgres NULL
class value (i.e., empty). This commit double-quotes a 'NULL' string
class value in the relevant query.
NOTE: In predict, if we use 'prob' as the pred_type, we create a column
for each distinct class level. The column name for Postgres NULL class
level is 'prob_NULL', and the column name for a 'NULL' string class
value will be 'prob_"NULL'.
Closes #408
---
.../postgres/modules/utilities/utilities.py_in | 32 ++++++++++++++++++++--
1 file changed, 30 insertions(+), 2 deletions(-)
diff --git a/src/ports/postgres/modules/utilities/utilities.py_in
b/src/ports/postgres/modules/utilities/utilities.py_in
index e6e31c5..4e142aa 100644
--- a/src/ports/postgres/modules/utilities/utilities.py_in
+++ b/src/ports/postgres/modules/utilities/utilities.py_in
@@ -443,6 +443,19 @@ def create_cols_from_array_sql_string(py_list,
sql_array_col, colname,
Output:
(ARRAY['cat','dog'])[sqlcol[1]+1]::TEXT AS estimated_pred
+ @NOTE:
+ If py_list is [None, 'cat', 'dog', NULL']:
+ then the SQL query string returned would create the following
+ column names:
+ prob_NULL, prob_cat, 'prob_dog', and 'prob_"NULL'.
+ 1. Notice that for None, which represents Postgres' NULL value, the
+ column name will be 'prob_NULL',
+ 2. and to differentiate the column name for a string 'NULL', the
+ resulting column name will be 'prob_"NULL'.
+
+ The weird quoting in this column name is due to calling strip after
+ quote_ident in the code below.
+
@returns:
@param, str, that can be used in a SQL query.
@@ -458,7 +471,21 @@ def create_cols_from_array_sql_string(py_list,
sql_array_col, colname,
_assert(py_list.count(None) <= 1,
"{0}: Input list should contain at most 1 None element.".
format(module_name))
- py_list = ['NULL' if ele is None else ele for ele in py_list]
+ def py_list_str(ele):
+ """
+ A python None is converted to a SQL NULL.
+ String 'NULL' is converted to SQL 'NULL' string by quoting
+ it to '"NULL"'. This quoting is necessary for Postgres to
+ differentiate between NULL and 'NULL' in the SQL query
+ string returned by create_cols_from_array_sql_string.
+ """
+ if ele is None:
+ return 'NULL'
+ elif isinstance(ele, str) and ele.lower()=='null':
+ return '"{0}"'.format(ele)
+ return ele
+
+ py_list = list(map(py_list_str, py_list))
if has_one_ele:
# Query to choose the value in the first element of
# sql_array_col which is the index to access in py_list.
@@ -475,7 +502,8 @@ def create_cols_from_array_sql_string(py_list,
sql_array_col, colname,
# we cannot call sql quote_ident on the py_list entries because
# aliasing does not support quote_ident. Hence calling our
- # python implementation of quote_ident
+ # python implementation of quote_ident. We must call strip()
+ # after quote_ident since the resulting SQL query fails otherwise.
select_clause = ', '.join(
['CAST({sql_array_col}[{j}] AS {coltype}) AS
"{final_colname}"'.
format(j=i + 1,