Repository: madlib
Updated Branches:
  refs/heads/master 8bbad2df6 -> 3af2d703e


Encode cat: Create lower case name when possible

JIRA: MADLIB-1202

The encode_categorical_variables() function generates column name with
some capital characters, including:
1. when you specify top_values, column name contains suffix __MISC__
2. when you set encode_nulls as True, a column name contains suffix __NULL
3. when the original column is boolean type, column names with suffixes
_True and _False

In the above cases, users have to use double quoting to query, which is
not conveninet.  This commit adresses this with all of the three
scenarios generating column names with lower cases.

Besides, Postgres requires expressions to be placed under parenthesis
before casting them. Further, expressions with quoted names could result
in invalid alias strings. Both issues have been fixed here.

Closes #234

Co-authored-by: Rahul Iyer <ri...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/3af2d703
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/3af2d703
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/3af2d703

Branch: refs/heads/master
Commit: 3af2d703ef0fb00fabbda6a47a82f7b858961963
Parents: 8bbad2d
Author: Jingyi Mei <j...@pivotal.io>
Authored: Fri Feb 23 18:23:11 2018 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Fri Feb 23 18:31:47 2018 -0800

----------------------------------------------------------------------
 .../modules/utilities/encode_categorical.py_in  |  35 +++++--
 .../modules/utilities/encode_categorical.sql_in | 100 ++++++++++++++++++-
 .../utilities/test/encode_categorical.sql_in    |   2 +-
 3 files changed, 120 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/3af2d703/src/ports/postgres/modules/utilities/encode_categorical.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/encode_categorical.py_in 
b/src/ports/postgres/modules/utilities/encode_categorical.py_in
index 81bc8ed..0d7eb91 100644
--- a/src/ports/postgres/modules/utilities/encode_categorical.py_in
+++ b/src/ports/postgres/modules/utilities/encode_categorical.py_in
@@ -79,7 +79,7 @@ class CategoricalEncoder(object):
         self.output_dictionary = output_dictionary
         self.distributed_by = distributed_by if not is_platform_pg() else None
 
-        self._name_others_col = "_MISC__"
+        self._name_others_col = "_misc__"
         self._array_out_name = "__encoded_variables__"
 
         # create new parameters after validating and parsing inputs
@@ -307,30 +307,34 @@ class CategoricalEncoder(object):
                 else:
                     return ''
                 v_type = list
+                cast_str = ''
             elif v is None:
                 value_str = "IS NULL"
                 v_type = None
+                cast_str = ''
             else:
                 # assume v is a string if not list/tuple and not None
                 value_str = "= '{v}'".format(v=str(v))
                 v_type = str
+                cast_str = '::TEXT'
 
             if self.output_type not in ('array', 'svec'):
                 if not self._output_dictionary:
-                    value_names = {None: 'NULL',
+                    value_names = {None: 'null',
                                    list: self._name_others_col,
                                    str: strip_end_quotes(v)}
-                    alias = 'AS "{0}_{1}"'.format(col_no_quotes, 
value_names[v_type])
+                    alias_val = value_names[v_type]
                 else:
-                    alias = 'AS "{0}_{1}"'.format(col_no_quotes, seq)
+                    alias_val = str(seq)
+                alias = 'AS ' + quote_ident('{0}_{1}'.format(col_no_quotes, 
alias_val))
             else:
                 # if output_type is array-like then each case does not
                 # require an alias
                 alias = ""
-            return ("(CASE WHEN ({col} {value_str}) "
+            return ("(CASE WHEN ({schema_madlib}.__to_char({col}){cast_str} 
{value_str}) "
                     "THEN 1 ELSE 0 END)::INTEGER {alias}".
-                    format(col=col, value_str=value_str, alias=alias))
-
+                    format(schema_madlib=self.schema_madlib, col=col,
+                    cast_str=cast_str, value_str=value_str, alias=alias))
         self._output_dictionary = (self._output_dictionary or
                                    self._is_col_name_long(col_to_values))
         col_switch_list = []
@@ -451,7 +455,7 @@ class CategoricalEncoder(object):
         top_val_sql_list = []
         for col in self._output_cols:
             if not self.encode_null:
-                filter_str = 'WHERE {col} IS NOT NULL'.format(col=col)
+                filter_str = 'WHERE ({col}) IS NOT NULL'.format(col=col)
             else:
                 filter_str = ''
             # get value distribution for each column independently
@@ -461,12 +465,15 @@ class CategoricalEncoder(object):
                     array_agg(f order by c desc) as value,
                     array_agg(c order by c desc) as freq
                 FROM (
-                    SELECT {col}::text as f, count(*)::integer as c
+                    SELECT {schema_madlib}.__to_char({col})::text as f,
+                           count(*)::integer as c
                     FROM {tbl}
                     {filter_str}
                     GROUP BY {col}
                ) q
-            """.format(col=col, tbl=self.source_table, filter_str=filter_str))
+            """.format(schema_madlib=self.schema_madlib,
+                       col=col, tbl=self.source_table,
+                       filter_str=filter_str))
         top_values = plpy.execute('\n UNION ALL \n'.join(top_val_sql_list))
 
         # top_values is now a list of dictionary, each element
@@ -514,7 +521,13 @@ class CategoricalEncoder(object):
     def _get_distinct_values(self):
         """ Find distinct values of each categorical column
         """
-        array_agg_str = ',\n'.join('array_agg(DISTINCT {c}) AS {c_quoted}'.
+        # Boolean variables when passed to Python will refer to the values as
+        # 'True', 'False' with the first letter as capital, which will cause 
the
+        # generated column name as <boolean column name>_True/False that needs
+        # double quoting. To ensure the boolean values remain lower case, cast
+        # the column to text format before copying to Python so that boolean 
The
+        # same logic is applied generated column name with _null and _misc
+        array_agg_str = ',\n'.join('array_agg(DISTINCT ({c})::TEXT) AS 
{c_quoted}'.
                                    format(c=c, c_quoted=quote_ident(c))
                                    for c in self._output_cols)
 

http://git-wip-us.apache.org/repos/asf/madlib/blob/3af2d703/src/ports/postgres/modules/utilities/encode_categorical.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/encode_categorical.sql_in 
b/src/ports/postgres/modules/utilities/encode_categorical.sql_in
index 2f22855..69b7e34 100644
--- a/src/ports/postgres/modules/utilities/encode_categorical.sql_in
+++ b/src/ports/postgres/modules/utilities/encode_categorical.sql_in
@@ -281,7 +281,7 @@ SELECT * FROM abalone_out ORDER BY id;
 (20 rows)
 </pre>
 
--# Now include NULL values in encoding (note the additional column 'sex_NULL'):
+-# Now include NULL values in encoding (note the additional column 'sex_null'):
 <pre class="example">
 DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
 SELECT madlib.encode_categorical_variables (
@@ -297,7 +297,7 @@ SELECT madlib.encode_categorical_variables (
 SELECT * FROM abalone_out ORDER BY id;
 </pre>
 <pre class="result">
- id | length | diameter | height | rings | sex_F | sex_I | sex_M | sex_NULL
+ id | length | diameter | height | rings | sex_F | sex_I | sex_M | sex_null
 ----+--------+----------+--------+-------+-------+-------+-------+----------
   1 |  0.455 |    0.365 |  0.095 |    15 |     0 |     0 |     1 |        0
   2 |   0.35 |    0.265 |   0.09 |     7 |     0 |     0 |     1 |        0
@@ -379,7 +379,7 @@ SELECT madlib.encode_categorical_variables (
 SELECT * FROM abalone_out ORDER BY id;
 </pre>
 <pre class="result">
- id | sex_M | sex_F | sex__MISC__ | rings_10 | rings_7 | rings_9 | 
rings__MISC__
+ id | sex_M | sex_F | sex__misc__ | rings_10 | rings_7 | rings_9 | 
rings__misc__
 
----+-------+-------+-------------+----------+---------+---------+---------------
   1 |     1 |     0 |           0 |        0 |       0 |       0 |             
1
   2 |     1 |     0 |           0 |        0 |       1 |       0 |             0
@@ -617,7 +617,7 @@ SELECT * FROM abalone_out_dictionary ORDER BY 
encoded_column_name;
 (14 rows)
 </pre>
 
--# We can chose from various distribution policies, for examply RANDOMLY:
+-# We can chose from various distribution policies of the output table, for 
examply RANDOMLY:
 <pre class="example">
 DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
 SELECT madlib.encode_categorical_variables (
@@ -633,9 +633,35 @@ SELECT madlib.encode_categorical_variables (
         NULL,                        -- Dictionary output
         'RANDOMLY'                   -- Distribution policy
         );
+SELECT * FROM abalone_out ORDER BY id;
+</pre>
+<pre class="result">
+ id | sex_F | sex_I | sex_M | rings_10 | rings_11 | rings_12 | rings_14 | 
rings_15 | rings_16 | rings_19 | rings_20 | rings_7 | rings_8 | rings_9
+----+-------+-------+-------+----------+----------+----------+----------+----------+----------+----------+----------+---------+---------+---------
+  1 |     0 |     0 |     1 |        0 |        0 |        0 |        0 |      
  1 |        0 |        0 |        0 |       0 |       0 |       0
+  2 |     0 |     0 |     1 |        0 |        0 |        0 |        0 |      
  0 |        0 |        0 |        0 |       1 |       0 |       0
+  3 |     1 |     0 |     0 |        0 |        0 |        0 |        0 |      
  0 |        0 |        0 |        0 |       0 |       0 |       1
+  4 |     0 |     0 |     1 |        1 |        0 |        0 |        0 |      
  0 |        0 |        0 |        0 |       0 |       0 |       0
+  5 |     0 |     1 |     0 |        0 |        0 |        0 |        0 |      
  0 |        0 |        0 |        0 |       1 |       0 |       0
+  6 |     0 |     1 |     0 |        0 |        0 |        0 |        0 |      
  0 |        0 |        0 |        0 |       0 |       1 |       0
+  7 |     1 |     0 |     0 |        0 |        0 |        0 |        0 |      
  0 |        0 |        0 |        1 |       0 |       0 |       0
+  8 |     1 |     0 |     0 |        0 |        0 |        0 |        0 |      
  0 |        1 |        0 |        0 |       0 |       0 |       0
+  9 |     0 |     0 |     1 |        0 |        0 |        0 |        0 |      
  0 |        0 |        0 |        0 |       0 |       0 |       1
+ 10 |     0 |     0 |     0 |        0 |        0 |        0 |        0 |      
  0 |        0 |        1 |        0 |       0 |       0 |       0
+ 11 |     1 |     0 |     0 |        0 |        0 |        0 |        1 |      
  0 |        0 |        0 |        0 |       0 |       0 |       0
+ 12 |     0 |     0 |     1 |        1 |        0 |        0 |        0 |      
  0 |        0 |        0 |        0 |       0 |       0 |       0
+ 13 |     0 |     0 |     1 |        0 |        1 |        0 |        0 |      
  0 |        0 |        0 |        0 |       0 |       0 |       0
+ 14 |     1 |     0 |     0 |        1 |        0 |        0 |        0 |      
  0 |        0 |        0 |        0 |       0 |       0 |       0
+ 15 |     1 |     0 |     0 |        1 |        0 |        0 |        0 |      
  0 |        0 |        0 |        0 |       0 |       0 |       0
+ 16 |     0 |     0 |     1 |        0 |        0 |        1 |        0 |      
  0 |        0 |        0 |        0 |       0 |       0 |       0
+ 17 |     0 |     1 |     0 |        0 |        0 |        0 |        0 |      
  0 |        0 |        0 |        0 |       1 |       0 |       0
+ 18 |     1 |     0 |     0 |        1 |        0 |        0 |        0 |      
  0 |        0 |        0 |        0 |       0 |       0 |       0
+ 19 |     0 |     0 |     1 |        0 |        0 |        0 |        0 |      
  0 |        0 |        0 |        0 |       1 |       0 |       0
+ 20 |     0 |     0 |     0 |        0 |        0 |        0 |        0 |      
  0 |        0 |        0 |        0 |       0 |       0 |       1
+(20 rows)
 </pre>
 
--# If you have a reason to encode FLOAT variables, you can cast them
+-# If you have a reason to encode FLOAT variables, you can cast them as TEXT
 in the following way within the function call:
 <pre class="example">
 DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
@@ -644,6 +670,70 @@ SELECT madlib.encode_categorical_variables (
         'abalone_out',               -- Output table
         'height::TEXT'               -- Categorical columns
         );
+SELECT * FROM abalone_out ORDER BY id;
+</pre>
+<pre class="result">
+id | sex | length | diameter | height | rings | height::TEXT_0.08 | 
height::TEXT_0.085 | height::TEXT_0.09 | height::TEXT_0.095 | height::TEXT_0.1 
| height::TEXT_0.11 | height::TEXT_0.125 | height::TEXT_0.13 | 
height::TEXT_0.135 | height::TEXT_0.14 | height::TEXT_0.145 | height::TEXT_0.15
+----+-----+--------+----------+--------+-------+-------------------+--------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+--------------------+-------------------+--------------------+-------------------
+  1 | M   |  0.455 |    0.365 |  0.095 |    15 |                 0 |           
       0 |                 0 |                  1 |                0 |          
       0 |                  0 |                 0 |                  0 |        
         0 |                  0 |                 0
+  2 | M   |   0.35 |    0.265 |   0.09 |     7 |                 0 |           
       0 |                 1 |                  0 |                0 |          
       0 |                  0 |                 0 |                  0 |        
         0 |                  0 |                 0
+  3 | F   |   0.53 |     0.42 |  0.135 |     9 |                 0 |           
       0 |                 0 |                  0 |                0 |          
       0 |                  0 |                 0 |                  1 |        
         0 |                  0 |                 0
+  4 | M   |   0.44 |    0.365 |  0.125 |    10 |                 0 |           
       0 |                 0 |                  0 |                0 |          
       0 |                  1 |                 0 |                  0 |        
         0 |                  0 |                 0
+  5 | I   |   0.33 |    0.255 |   0.08 |     7 |                 1 |           
       0 |                 0 |                  0 |                0 |          
       0 |                  0 |                 0 |                  0 |        
         0 |                  0 |                 0
+  6 | I   |  0.425 |      0.3 |  0.095 |     8 |                 0 |           
       0 |                 0 |                  1 |                0 |          
       0 |                  0 |                 0 |                  0 |        
         0 |                  0 |                 0
+  7 | F   |   0.53 |    0.415 |   0.15 |    20 |                 0 |           
       0 |                 0 |                  0 |                0 |          
       0 |                  0 |                 0 |                  0 |        
         0 |                  0 |                 1
+  8 | F   |  0.545 |    0.425 |  0.125 |    16 |                 0 |           
       0 |                 0 |                  0 |                0 |          
       0 |                  1 |                 0 |                  0 |        
         0 |                  0 |                 0
+  9 | M   |  0.475 |     0.37 |  0.125 |     9 |                 0 |           
       0 |                 0 |                  0 |                0 |          
       0 |                  1 |                 0 |                  0 |        
         0 |                  0 |                 0
+ 10 |     |   0.55 |     0.44 |   0.15 |    19 |                 0 |           
       0 |                 0 |                  0 |                0 |          
       0 |                  0 |                 0 |                  0 |        
         0 |                  0 |                 1
+ 11 | F   |  0.525 |     0.38 |   0.14 |    14 |                 0 |           
       0 |                 0 |                  0 |                0 |          
       0 |                  0 |                 0 |                  0 |        
         1 |                  0 |                 0
+ 12 | M   |   0.43 |     0.35 |   0.11 |    10 |                 0 |           
       0 |                 0 |                  0 |                0 |          
       1 |                  0 |                 0 |                  0 |        
         0 |                  0 |                 0
+ 13 | M   |   0.49 |     0.38 |  0.135 |    11 |                 0 |           
       0 |                 0 |                  0 |                0 |          
       0 |                  0 |                 0 |                  1 |        
         0 |                  0 |                 0
+ 14 | F   |  0.535 |    0.405 |  0.145 |    10 |                 0 |           
       0 |                 0 |                  0 |                0 |          
       0 |                  0 |                 0 |                  0 |        
         0 |                  1 |                 0
+ 15 | F   |   0.47 |    0.355 |    0.1 |    10 |                 0 |           
       0 |                 0 |                  0 |                1 |          
       0 |                  0 |                 0 |                  0 |        
         0 |                  0 |                 0
+ 16 | M   |    0.5 |      0.4 |   0.13 |    12 |                 0 |           
       0 |                 0 |                  0 |                0 |          
       0 |                  0 |                 1 |                  0 |        
         0 |                  0 |                 0
+ 17 | I   |  0.355 |     0.28 |  0.085 |     7 |                 0 |           
       1 |                 0 |                  0 |                0 |          
       0 |                  0 |                 0 |                  0 |        
         0 |                  0 |                 0
+ 18 | F   |   0.44 |     0.34 |    0.1 |    10 |                 0 |           
       0 |                 0 |                  0 |                1 |          
       0 |                  0 |                 0 |                  0 |        
         0 |                  0 |                 0
+ 19 | M   |  0.365 |    0.295 |   0.08 |     7 |                 1 |           
       0 |                 0 |                  0 |                0 |          
       0 |                  0 |                 0 |                  0 |        
         0 |                  0 |                 0
+ 20 |     |   0.45 |     0.32 |    0.1 |     9 |                 0 |           
       0 |                 0 |                  0 |                1 |          
       0 |                  0 |                 0 |                  0 |        
         0 |                  0 |                 0
+(20 rows)
+</pre>
+
+-# You can also use a logical expression in the categorical columns, which will
+be passed as boolean, and in the output table there will be two columns with
+name logical_expression_true and logical_expression_false:
+<pre class="example">
+DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
+SELECT madlib.encode_categorical_variables (
+        'abalone',                   -- Source table
+        'abalone_out',               -- Output table
+        'height>.10'                 -- Categorical columns
+        );
+SELECT * FROM abalone_out ORDER BY id;
+</pre>
+<pre class="result">
+ id | sex | length | diameter | height | rings | height>.10_false | 
height>.10_true
+----+-----+--------+----------+--------+-------+------------------+-----------------
+  1 | M   |  0.455 |    0.365 |  0.095 |    15 |                1 |            
   0
+  2 | M   |   0.35 |    0.265 |   0.09 |     7 |                1 |            
   0
+  3 | F   |   0.53 |     0.42 |  0.135 |     9 |                0 |            
   1
+  4 | M   |   0.44 |    0.365 |  0.125 |    10 |                0 |            
   1
+  5 | I   |   0.33 |    0.255 |   0.08 |     7 |                1 |            
   0
+  6 | I   |  0.425 |      0.3 |  0.095 |     8 |                1 |            
   0
+  7 | F   |   0.53 |    0.415 |   0.15 |    20 |                0 |            
   1
+  8 | F   |  0.545 |    0.425 |  0.125 |    16 |                0 |            
   1
+  9 | M   |  0.475 |     0.37 |  0.125 |     9 |                0 |            
   1
+ 10 |     |   0.55 |     0.44 |   0.15 |    19 |                0 |            
   1
+ 11 | F   |  0.525 |     0.38 |   0.14 |    14 |                0 |            
   1
+ 12 | M   |   0.43 |     0.35 |   0.11 |    10 |                0 |            
   1
+ 13 | M   |   0.49 |     0.38 |  0.135 |    11 |                0 |            
   1
+ 14 | F   |  0.535 |    0.405 |  0.145 |    10 |                0 |            
   1
+ 15 | F   |   0.47 |    0.355 |    0.1 |    10 |                1 |            
   0
+ 16 | M   |    0.5 |      0.4 |   0.13 |    12 |                0 |            
   1
+ 17 | I   |  0.355 |     0.28 |  0.085 |     7 |                1 |            
   0
+ 18 | F   |   0.44 |     0.34 |    0.1 |    10 |                1 |            
   0
+ 19 | M   |  0.365 |    0.295 |   0.08 |     7 |                1 |            
   0
+ 20 |     |   0.45 |     0.32 |    0.1 |     9 |                1 |            
   0
+(20 rows)
 </pre>
 
 @anchor literature

http://git-wip-us.apache.org/repos/asf/madlib/blob/3af2d703/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in
----------------------------------------------------------------------
diff --git 
a/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in 
b/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in
index 5a3dd75..31c27aa 100644
--- a/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in
+++ b/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in
@@ -85,7 +85,7 @@ select * from abalone_out4;
 -- array output working with dictionary output,
 -- top with percent input, global value_to_drop
 select encode_categorical_variables('abalone', 'abalone_out5',
-                                           'sex, "Class"', '',
+                                           'sex, "Class" > 1', '',
                                            'id', '0.5', 'M',
                                            true, 'array', false
                                            );

Reply via email to