Rahul Iyer created MADLIB-1258:
----------------------------------
Summary: Individual group dropping a categorical variable can lead
to incorrect results
Key: MADLIB-1258
URL: https://issues.apache.org/jira/browse/MADLIB-1258
Project: Apache MADlib
Issue Type: Bug
Components: Module: Decision Tree, Module: Random Forest
Reporter: Rahul Iyer
In DT/RF, a categorical variable is dropped if it has only a single level. In
grouped models this can lead to a situation where a particular group drops a
categorical variable that is retained by other groups (see example below).
This is fine on its own, but it leads to issues with prediction, since the
predict functions assume a consistent list of categorical features across
groups.
There are two possible ways to fix the problem:
1. Update `*predict` (and other downstream functions) to handle categorical
features that vary across groups.
2. Don't drop a single-level categorical feature (this would require ensuring
that our internal code does not assume that a categorical feature has at
least 2 levels).
{code:sql}
-- Reproduction script for MADLIB-1258.
-- Builds a small golf data set, trains a random forest grouped by "class",
-- and prints the per-group summary so the categorical feature lists of the
-- groups can be compared.

-- Start from a clean slate so the script can be re-run.
DROP TABLE IF EXISTS dt_golf CASCADE;

CREATE TABLE dt_golf (
    id              integer NOT NULL,
    "OUTLOOK"       text,
    temperature     double precision,
    humidity        double precision,
    "Cont_features" double precision[],
    cat_features    text[],
    windy           boolean,
    class           text
);

-- Within class = 'Don''t Play' (ids 1, 2, 6, 8, 14) the second element of
-- cat_features is always 'b', i.e. that feature has a single level in that
-- group. Within class = 'Play' it takes both 'b' and 'd'.
INSERT INTO dt_golf
    (id, "OUTLOOK", temperature, humidity, "Cont_features", cat_features, windy, class)
VALUES
    (1,  'sunny',    85,   85,   ARRAY[85, 85], ARRAY['a', 'b'], false, 'Don''t Play'),
    (2,  'sunny',    80,   90,   ARRAY[80, 90], ARRAY['a', 'b'], true,  'Don''t Play'),
    (3,  'overcast', 83,   78,   ARRAY[83, 78], ARRAY['a', 'b'], false, 'Play'),
    (4,  'rain',     70,   NULL, ARRAY[70, 96], ARRAY['a', 'b'], false, 'Play'),
    (5,  'rain',     68,   80,   ARRAY[68, 80], ARRAY['a', 'b'], false, 'Play'),
    (6,  'rain',     NULL, 70,   ARRAY[65, 70], ARRAY['a', 'b'], true,  'Don''t Play'),
    (7,  'overcast', 64,   65,   ARRAY[64, 65], ARRAY['c', 'b'], NULL,  'Play'),
    (8,  'sunny',    72,   95,   ARRAY[72, 95], ARRAY['a', 'b'], false, 'Don''t Play'),
    (9,  'sunny',    69,   70,   ARRAY[69, 70], ARRAY['a', 'b'], false, 'Play'),
    (10, 'rain',     75,   80,   ARRAY[75, 80], ARRAY['a', 'b'], false, 'Play'),
    (11, 'sunny',    75,   70,   ARRAY[75, 70], ARRAY['a', 'd'], true,  'Play'),
    (12, 'overcast', 72,   90,   ARRAY[72, 90], ARRAY['c', 'b'], NULL,  'Play'),
    (13, 'overcast', 81,   75,   ARRAY[81, 75], ARRAY['a', 'b'], false, 'Play'),
    (15, NULL,       81,   75,   ARRAY[81, 75], ARRAY['a', 'b'], false, 'Play'),
    (16, 'overcast', NULL, 75,   ARRAY[81, 75], ARRAY['a', 'd'], false, 'Play'),
    (14, 'rain',     71,   80,   ARRAY[71, 80], ARRAY['c', 'b'], true,  'Don''t Play');

-- Remove the output tables of any previous run before training again.
DROP TABLE IF EXISTS
    train_output,
    train_output_summary,
    train_output_group,
    train_output_poisson_count;

-- Train one forest per value of "class"; the response is the continuous
-- temperature column and the features are cat_features plus windy.
SELECT madlib.forest_train(
    'dt_golf',                        -- source table
    'train_output',                   -- output model table
    'id',                             -- id column
    'temperature::double precision',  -- response
    'cat_features, windy',            -- features
    NULL,                             -- exclude columns
    'class',                          -- grouping
    5,                                -- num of trees
    NULL,                             -- num of random features
    TRUE,                             -- importance
    20,                               -- num_permutations
    10,                               -- max depth
    1,                                -- min split
    1,                                -- min bucket
    3,                                -- number of bins per continuous variable
    'max_surrogates = 2 ',            -- presumably null-handling/surrogate params; confirm against installed MADlib version
    FALSE                             -- presumably the verbose flag; confirm against installed MADlib version
);

-- Expanded display: one "column | value" line per field.
\x on

-- Per-group summary. ORDER BY gid keeps the record order stable so the output
-- can be compared with the result quoted in this report.
SELECT *
FROM train_output_group
ORDER BY gid;
{code}
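Result (in group 1, 'Don't Play', the second element of cat_features only has
the single level 'b', so that feature is dropped and the group reports two
categorical features, while group 2, 'Play', reports three):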
{code}
-[ RECORD 1 ]-----------+-------------------------------------------------
gid | 1
class | Don't Play
success | t
cat_n_levels | {2,2}
cat_levels_in_text | {c,a,True,False}
oob_error | 78.2893518518518
oob_var_importance | {2.368475785867e-15,2.368475785867e-15}
impurity_var_importance | {2.296944444444,0}
-[ RECORD 2 ]-----------+-------------------------------------------------
gid | 2
class | Play
success | t
cat_n_levels | {2,2,2}
cat_levels_in_text | {c,a,b,d,False,True}
oob_error | 38.1958872778793
oob_var_importance | {10.9137514172336,0,0}
impurity_var_importance | {8.1044222372,0.25723053952258,0.25723053952258}
{code}
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)