Repository: incubator-madlib
Updated Branches:
  refs/heads/master f1aa9af62 -> 4264fa9d3


DT: Correctly encode unseen categorical features

Changes applied in commit a2f4740 added an option to treat NULL values
as a new category. This was implemented by changing the encoding process
of categorical features to add a new value at the end of the list of
values. The intention with the commit was to treat new unseen, non-null
values equivalent to NULL. The process, however, still encoded the
unseen categorical value as -1, which is interpreted as NULL in
underlying functions. This commit updates this process to correctly use
the last index as the encoding for the unseen/NULL value.

Closes #171


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/4264fa9d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/4264fa9d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/4264fa9d

Branch: refs/heads/master
Commit: 4264fa9d3bbfdb616364339c127373d7f0568e30
Parents: f1aa9af
Author: Rahul Iyer <ri...@apache.org>
Authored: Fri Aug 18 16:59:44 2017 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Fri Aug 18 16:59:44 2017 -0700

----------------------------------------------------------------------
 .../recursive_partitioning/feature_encoding.cpp | 22 ++++--
 .../recursive_partitioning/decision_tree.py_in  |  7 +-
 .../recursive_partitioning/decision_tree.sql_in | 73 ++++++++++----------
 .../test/decision_tree.sql_in                   |  2 +-
 4 files changed, 59 insertions(+), 45 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/4264fa9d/src/modules/recursive_partitioning/feature_encoding.cpp
----------------------------------------------------------------------
diff --git a/src/modules/recursive_partitioning/feature_encoding.cpp 
b/src/modules/recursive_partitioning/feature_encoding.cpp
index 3b0a452..a5f131b 100644
--- a/src/modules/recursive_partitioning/feature_encoding.cpp
+++ b/src/modules/recursive_partitioning/feature_encoding.cpp
@@ -184,21 +184,31 @@ map_catlevel_to_int::run(AnyType &args){
     ArrayHandle<text*> cat_values = args[0].getAs<ArrayHandle<text*> >();
     ArrayHandle<text*> cat_levels = args[1].getAs<ArrayHandle<text*> >();
     ArrayHandle<int> n_levels = args[2].getAs<ArrayHandle<int> >();
+    bool null_as_category = args[3].getAs<bool>();
 
     MutableArrayHandle<int> cat_int = allocateArray<int>(n_levels.size());
     int pos = 0;
     for (size_t i = 0; i < n_levels.size(); i++) {
         // linear search to find a match
-        int match = -1;  // if cat_values contains any not present in 
cat_levels,
-                         // then the mapped integer is -1. If cat_values 
contains
-                         // a known cat_level, then the mapped integer is
-                         // the index of that value in cat_levels
-        for (int j = 0; j < n_levels[i]; j++)
+
+        // if cat_values contains any not present in cat_levels, then the 
mapped
+        // integer is -1. If cat_values contains a known cat_level, then the
+        // mapped integer is the index of that value in cat_levels.
+        int match = -1;
+        for (int j = 0; j < n_levels[i]; j++){
             if (cmp_text(cat_values[i], cat_levels[pos + j])) {
                 match = j;
                 break;
             }
-        cat_int[i] = match;
+        }
+
+        // If null_as_category is True, then match is set to the last index
+        // instead of -1 since the last index is expected to represent NULL.
+        if (match == -1 and null_as_category){
+            cat_int[i] = n_levels[i] - 1;
+        } else {
+            cat_int[i] = match;
+        }
         pos += static_cast<int>(n_levels[i]);
     }
     return cat_int;

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/4264fa9d/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
----------------------------------------------------------------------
diff --git 
a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in 
b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index 35fc3f9..ce56ab0 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -1179,8 +1179,11 @@ def get_feature_str(schema_madlib, boolean_cats,
                     "(coalesce({0}::text, '{1}'))::text".format(col, null_val))
 
         cat_features_str = ("{0}._map_catlevel_to_int(array[" +
-                            ", ".join(cat_features_cast) + "], {1}, {2})"
-                            ).format(schema_madlib, levels_str, n_levels_str)
+                            ", ".join(cat_features_cast) + "], {1}, {2}, {3})"
+                            ).format(schema_madlib,
+                                     levels_str,
+                                     n_levels_str,
+                                     null_proxy is not None)
     else:
         cat_features_str = "NULL"
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/4264fa9d/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
----------------------------------------------------------------------
diff --git 
a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in 
b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
index 8290dcf..3632f2e 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
@@ -912,20 +912,20 @@ Result:
 (32 rows)
 </pre>
 
-<h4>NULL handling example</h4>
+<h4>NULL Handling Example</h4>
 
-- Create toy example to illustrate null-as-category handling
+-# Create toy example to illustrate null-as-category handling
 <pre class='example'>
-drop table null_handling_example;
-create table null_handling_example (
-id integer,
-country text,
-city text,
-weather text,
-response text
+DROP TABLE IF EXISTS null_handling_example;
+CREATE TABLE null_handling_example (
+    id integer,
+    country text,
+    city text,
+    weather text,
+    response text
 );
 &nbsp;
-insert into null_handling_example values
+INSERT INTO null_handling_example VALUES
 (1,null,null,null,'a'),
 (2,'US',null,null,'b'),
 (3,'US','NY',null,'c'),
@@ -933,22 +933,22 @@ insert into null_handling_example values
 &nbsp;
 DROP TABLE IF EXISTS train_output, train_output_summary;
 SELECT madlib.tree_train('null_handling_example',         -- source table
-                         'train_output',    -- output model table
-                         'id',              -- id column
-                         'response',             -- dependent variable
-                         'country, city, weather',               -- features
-                         NULL,
-                         'gini',             -- split criterion
-                         NULL::text,        -- no grouping
-                         NULL::text,        -- no weights
-                         4,                -- max depth
-                         1,                 -- min split
-                         1,                 -- number of bins per continuous 
variable
-                         10,                -- number of splits
-                         NULL,              -- pruning parameters
-                         'null_as_category=true' -- null handling
+                         'train_output',                  -- output model table
+                         'id',                            -- id column
+                         'response',                      -- dependent variable
+                         'country, weather, city',        -- features
+                         NULL,                            -- features to 
exclude
+                         'gini',                          -- split criterion
+                         NULL::text,                      -- no grouping
+                         NULL::text,                      -- no weights
+                         4,                               -- max depth
+                         1,                               -- min split
+                         1,                               -- number of bins 
per continuous variable
+                         10,                              -- number of splits
+                         NULL,                            -- pruning parameters
+                         'null_as_category=true'          -- null handling
                          );
-select cat_levels_in_text, cat_n_levels from train_output;
+SELECT cat_levels_in_text, cat_n_levels FROM train_output;
 </pre>
 <pre class='result'>
             cat_levels_in_text            | cat_n_levels
@@ -956,17 +956,17 @@ select cat_levels_in_text, cat_n_levels from train_output;
  {US,__NULL__,rainy,__NULL__,NY,__NULL__} | {2,2,2}
 </pre>
 
-- Predict for previously not seen data by assuming NULL value as the default
+-# Predict for previously not seen data by assuming NULL value as the default
 <pre class='example'>
-drop table if exists table_test;
-create table table_test (
-id integer,
-country text,
-city text,
-weather text,
-expected_response text
+DROP TABLE IF EXISTS table_test;
+CREATE TABLE table_test (
+    id integer,
+    country text,
+    city text,
+    weather text,
+    expected_response text
 );
-insert into table_test values
+INSERT INTO table_test VALUES
 (1,'IN','MUM','cloudy','a'),
 (2,'US','HOU','humid','b'),
 (3,'US','NY','sunny','c'),
@@ -979,7 +979,7 @@ SELECT madlib.tree_predict('train_output',
                            'response');
 SELECT s.id, expected_response, estimated_response
 FROM prediction_results p, table_test s
-where s.id = p.id ORDER BY id;
+WHERE s.id = p.id ORDER BY id;
 </pre>
 <pre class='result'>
  id | expected_response | estimated_response
@@ -1229,7 +1229,8 @@ CREATE AGGREGATE MADLIB_SCHEMA._dst_compute_entropy(
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._map_catlevel_to_int(
     cat_values_in_text          TEXT[],     -- categorical variable value from 
each row
     cat_levels_in_text          TEXT[],     -- all levels in text
-    cat_n_levels                INTEGER[]   -- number of levels for each 
categorical variable
+    cat_n_levels                INTEGER[],   -- number of levels for each 
categorical variable
+    null_as_category            BOOLEAN     -- flag to check if NULL is 
treated as a separate category
 ) RETURNS INTEGER[] AS
     'MODULE_PATHNAME', 'map_catlevel_to_int'
 LANGUAGE c IMMUTABLE

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/4264fa9d/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
----------------------------------------------------------------------
diff --git 
a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in 
b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
index 93b9a52..e585379 100644
--- 
a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
+++ 
b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
@@ -43,7 +43,7 @@ SELECT * FROM dummy_splits;
 -- cat encoding
 SELECT
     assert(
-        relative_error(_map_catlevel_to_int('{B}', '{A,B}', ARRAY[2]), 
ARRAY[1]) < 1e-6,
+        relative_error(_map_catlevel_to_int('{B}', '{A,B}', ARRAY[2], TRUE), 
ARRAY[1]) < 1e-6,
         'wrong results in _map_catlevel_to_int()')
 ;
 

Reply via email to