[ 
https://issues.apache.org/jira/browse/MADLIB-1218?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Frank McQuillan updated MADLIB-1218:
------------------------------------
    Description: 
{code}
DROP TABLE IF EXISTS mt_cars;

CREATE TABLE mt_cars (
    id integer NOT NULL,
    mpg double precision,
    cyl integer,
    disp double precision,
    hp integer,
    drat double precision,
    wt double precision,
    qsec double precision,
    vs integer,
    am integer,
    gear integer,
    carb integer
);

INSERT INTO mt_cars VALUES
(1,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2),
(2,21,6,160,110,3.9,2.62,16.46,0,1,4,4),
(3,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2),
(4,21,6,160,110,3.9,2.875,17.02,0,1,4,4),
(5,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4),
(6,16.4,8,275.8,180,3.078,4.07,17.4,0,0,3,3),
(7,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1),
(8,17.3,8,275.8,180,3.078,3.73,17.6,0,0,3,3),
(9,21.4,null,258,110,3.08,3.215,19.44,1,0,3,1),
(10,15.2,8,275.8,180,3.078,3.78,18,0,0,3,3),
(11,18.1,6,225,105,2.768,3.46,20.22,1,0,3,1),
(12,32.4,4,78.7,66,4.08,2.20,19.47,1,1,4,1),
(13,14.3,8,360,245,3.21,3.578,15.84,0,0,3,4),
(14,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2),
(15,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2),
(16,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4),
(17,33.9,4,71.14,65,4.22,1.835,19.9,1,1,4,1),
(18,15.2,null,304,150,3.15,3.435,17.3,0,0,3,2),
(19,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4),
(20,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1),
(21,10.4,8,460,215,3,5.424,17.82,0,0,3,4),
(22,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2),
(23,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4),
(24,30.4,4,95.14,113,3.77,1.513,16.9,1,1,5,2),
(25,21.5,4,120.1,97,3.70,2.465,20.01,1,0,3,1),
(26,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4),
(27,15.5,8,318,150,2.768,3.52,16.87,0,0,3,2),
(28,15,8,301,335,3.54,3.578,14.6,0,1,5,8),
(29,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4),
(30,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2),
(31,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6),
(32,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2);
{code}

RF:

{code}
DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group, 
mt_cars_output_summary;

SELECT madlib.forest_train('mt_cars',
                           'mt_cars_output',
                           'id',
                           'mpg',
                           '*',
                           'id, hp, drat, am, gear, carb',  -- exclude columns
                           'am',  -- grouping
                           10::integer,
                           2::integer,
                           TRUE::boolean,
                           1,
                           10,
                           8,
                           3,
                           10
                           );
SELECT * FROM mt_cars_output_summary;
{code}
produces
{code}
-[ RECORD 1 
]---------+-----------------------------------------------------------------------
method                | forest_train
is_classification     | f
source_table          | mt_cars
model_table           | mt_cars_output
id_col_name           | id
dependent_varname     | mpg
independent_varnames  | vs,cyl,disp,qsec,wt
cat_features          | vs,cyl
con_features          | disp,qsec,wt
grouping_cols         | am
num_trees             | 10
num_random_features   | 2
max_tree_depth        | 10
min_split             | 8
min_bucket            | 3
num_splits            | 10
verbose               | f
importance            | t
num_permutations      | 1
num_all_groups        | 2
num_failed_groups     | 0
total_rows_processed  | 32
total_rows_skipped    | 0
dependent_var_levels  | 
dependent_var_type    | double precision
independent_var_types | integer, integer, double precision, double precision, 
double precision
null_proxy            | None
{code}

{code}
SELECT * FROM mt_cars_output_group;
{code}
produces
{code}
-[ RECORD 1 ]------+--------------------------------------
gid                | 1
am                 | 0
success            | t
cat_n_levels       | {3,3}
cat_levels_in_text | {4,6,8,4,6,8}
oob_error          | 10.4384609425616
cat_var_importance | {1.77200744569228,3.23893136930339}
con_var_importance | {6.52806304913008,0,2.71255870662522}
-[ RECORD 2 ]------+--------------------------------------
gid                | 2
am                 | 1
success            | t
cat_n_levels       | {2,2}
cat_levels_in_text | {0,1,0,1}
oob_error          | 22.3349978333876
cat_var_importance | {4.59597959183674,0}
con_var_importance | {5.715875,0,18.4067344897959}
{code}

In the above, cat_n_levels and cat_levels_in_text are wrong.  I would expect 
{0,1,4,6,8} to be reported for both groups. 

Compare the above with the case of no groups:

{code}
DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group, 
mt_cars_output_summary;

SELECT madlib.forest_train('mt_cars',
                           'mt_cars_output',
                           'id',
                           'mpg',
                           '*',
                           'id, hp, drat, am, gear, carb',  -- exclude columns
                           NULL, -- no groups
                           10::integer,
                           2::integer,
                           TRUE::boolean,
                           1,
                           10,
                           8,
                           3,
                           10
                           );
SELECT * FROM mt_cars_output_summary;
{code}

produces

{code}
[ RECORD 1 
]---------+-----------------------------------------------------------------------
method                | forest_train
is_classification     | f
source_table          | mt_cars
model_table           | mt_cars_output
id_col_name           | id
dependent_varname     | mpg
independent_varnames  | vs,cyl,disp,qsec,wt
cat_features          | vs,cyl
con_features          | disp,qsec,wt
grouping_cols         | 
num_trees             | 10
num_random_features   | 2
max_tree_depth        | 10
min_split             | 8
min_bucket            | 3
num_splits            | 10
verbose               | f
importance            | t
num_permutations      | 1
num_all_groups        | 1
num_failed_groups     | 0
total_rows_processed  | 32
total_rows_skipped    | 0
dependent_var_levels  | 
dependent_var_type    | double precision
independent_var_types | integer, integer, double precision, double precision, 
double precision
null_proxy            | None
{code}

SELECT * FROM mt_cars_output_group;

produces:

gid                | 1
success            | t
cat_n_levels       | {2,3}
cat_levels_in_text | {0,1,4,6,8}
oob_error          | 9.90200731761036
cat_var_importance | {0.390353827160495,12.8836927689546}
con_var_importance | {0.118665925451092,0.947886524555693,19.910665401879}

which seems OK.

DT:

I am seeing the same thing with decision tree.  Here is a sample output 
grouping on 'am'  from decision tree:

{code}
madlib=# SELECT am, pruning_cp, cat_levels_in_text, cat_n_levels, tree_depth 
FROM train_output;
-[ RECORD 1 ]------+--------------
am                 | 1
pruning_cp         | 0
cat_levels_in_text | {0,1,0,1}
cat_n_levels       | {2,2}
tree_depth         | 1
-[ RECORD 2 ]------+--------------
am                 | 0
pruning_cp         | 0
cat_levels_in_text | {4,6,8,4,6,8}
cat_n_levels       | {3,3}
tree_depth         | 3
{code}


  was:
{code}
DROP TABLE IF EXISTS mt_cars;

CREATE TABLE mt_cars (
    id integer NOT NULL,
    mpg double precision,
    cyl integer,
    disp double precision,
    hp integer,
    drat double precision,
    wt double precision,
    qsec double precision,
    vs integer,
    am integer,
    gear integer,
    carb integer
);

INSERT INTO mt_cars VALUES
(1,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2),
(2,21,6,160,110,3.9,2.62,16.46,0,1,4,4),
(3,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2),
(4,21,6,160,110,3.9,2.875,17.02,0,1,4,4),
(5,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4),
(6,16.4,8,275.8,180,3.078,4.07,17.4,0,0,3,3),
(7,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1),
(8,17.3,8,275.8,180,3.078,3.73,17.6,0,0,3,3),
(9,21.4,null,258,110,3.08,3.215,19.44,1,0,3,1),
(10,15.2,8,275.8,180,3.078,3.78,18,0,0,3,3),
(11,18.1,6,225,105,2.768,3.46,20.22,1,0,3,1),
(12,32.4,4,78.7,66,4.08,2.20,19.47,1,1,4,1),
(13,14.3,8,360,245,3.21,3.578,15.84,0,0,3,4),
(14,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2),
(15,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2),
(16,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4),
(17,33.9,4,71.14,65,4.22,1.835,19.9,1,1,4,1),
(18,15.2,null,304,150,3.15,3.435,17.3,0,0,3,2),
(19,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4),
(20,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1),
(21,10.4,8,460,215,3,5.424,17.82,0,0,3,4),
(22,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2),
(23,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4),
(24,30.4,4,95.14,113,3.77,1.513,16.9,1,1,5,2),
(25,21.5,4,120.1,97,3.70,2.465,20.01,1,0,3,1),
(26,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4),
(27,15.5,8,318,150,2.768,3.52,16.87,0,0,3,2),
(28,15,8,301,335,3.54,3.578,14.6,0,1,5,8),
(29,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4),
(30,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2),
(31,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6),
(32,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2);
{code}

RF:

{code}
DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group, 
mt_cars_output_summary;

SELECT madlib.forest_train('mt_cars',
                           'mt_cars_output',
                           'id',
                           'mpg',
                           '*',
                           'id, hp, drat, am, gear, carb',  -- exclude columns
                           'am',  -- grouping
                           10::integer,
                           2::integer,
                           TRUE::boolean,
                           1,
                           10,
                           8,
                           3,
                           10
                           );
SELECT * FROM mt_cars_output_summary;
{code}
produces
{code}
-[ RECORD 1 
]---------+-----------------------------------------------------------------------
method                | forest_train
is_classification     | f
source_table          | mt_cars
model_table           | mt_cars_output
id_col_name           | id
dependent_varname     | mpg
independent_varnames  | vs,cyl,disp,qsec,wt
cat_features          | vs,cyl
con_features          | disp,qsec,wt
grouping_cols         | am
num_trees             | 10
num_random_features   | 2
max_tree_depth        | 10
min_split             | 8
min_bucket            | 3
num_splits            | 10
verbose               | f
importance            | t
num_permutations      | 1
num_all_groups        | 2
num_failed_groups     | 0
total_rows_processed  | 32
total_rows_skipped    | 0
dependent_var_levels  | 
dependent_var_type    | double precision
independent_var_types | integer, integer, double precision, double precision, 
double precision
null_proxy            | None
{code}

{code}
SELECT * FROM mt_cars_output_group;
{code}
produces
{code}
-[ RECORD 1 ]------+--------------------------------------
gid                | 1
am                 | 0
success            | t
cat_n_levels       | {3,3}
cat_levels_in_text | {4,6,8,4,6,8}
oob_error          | 10.4384609425616
cat_var_importance | {1.77200744569228,3.23893136930339}
con_var_importance | {6.52806304913008,0,2.71255870662522}
-[ RECORD 2 ]------+--------------------------------------
gid                | 2
am                 | 1
success            | t
cat_n_levels       | {2,2}
cat_levels_in_text | {0,1,0,1}
oob_error          | 22.3349978333876
cat_var_importance | {4.59597959183674,0}
con_var_importance | {5.715875,0,18.4067344897959}
{code}

In the above, cat_n_levels and cat_levels_in_text are wrong.  I would expect 
{0,1,4,6,8} to be reported for both groups. 

Compare the above with the case of no groups:

{code}
DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group, 
mt_cars_output_summary;

SELECT madlib.forest_train('mt_cars',
                           'mt_cars_output',
                           'id',
                           'mpg',
                           '*',
                           'id, hp, drat, am, gear, carb',  -- exclude columns
                           NULL, -- no groups
                           10::integer,
                           2::integer,
                           TRUE::boolean,
                           1,
                           10,
                           8,
                           3,
                           10
                           );
SELECT * FROM mt_cars_output_summary;
{code}

produces

{code}
[ RECORD 1 
]---------+-----------------------------------------------------------------------
method                | forest_train
is_classification     | f
source_table          | mt_cars
model_table           | mt_cars_output
id_col_name           | id
dependent_varname     | mpg
independent_varnames  | vs,cyl,disp,qsec,wt
cat_features          | vs,cyl
con_features          | disp,qsec,wt
grouping_cols         | 
num_trees             | 10
num_random_features   | 2
max_tree_depth        | 10
min_split             | 8
min_bucket            | 3
num_splits            | 10
verbose               | f
importance            | t
num_permutations      | 1
num_all_groups        | 1
num_failed_groups     | 0
total_rows_processed  | 32
total_rows_skipped    | 0
dependent_var_levels  | 
dependent_var_type    | double precision
independent_var_types | integer, integer, double precision, double precision, 
double precision
null_proxy            | None
{code}

SELECT * FROM mt_cars_output_group;

produces:

gid                | 1
success            | t
cat_n_levels       | {2,3}
cat_levels_in_text | {0,1,4,6,8}
oob_error          | 9.90200731761036
cat_var_importance | {0.390353827160495,12.8836927689546}
con_var_importance | {0.118665925451092,0.947886524555693,19.910665401879}

which seems OK.



> DT and RF: categorical levels reported incorrectly when grouping is used 
> -------------------------------------------------------------------------
>
>                 Key: MADLIB-1218
>                 URL: https://issues.apache.org/jira/browse/MADLIB-1218
>             Project: Apache MADlib
>          Issue Type: Bug
>          Components: Module: Decision Tree, Module: Random Forest
>            Reporter: Frank McQuillan
>            Assignee: Rahul Iyer
>            Priority: Major
>             Fix For: v1.14
>
>
> {code}
> DROP TABLE IF EXISTS mt_cars;
> CREATE TABLE mt_cars (
>     id integer NOT NULL,
>     mpg double precision,
>     cyl integer,
>     disp double precision,
>     hp integer,
>     drat double precision,
>     wt double precision,
>     qsec double precision,
>     vs integer,
>     am integer,
>     gear integer,
>     carb integer
> );
> INSERT INTO mt_cars VALUES
> (1,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2),
> (2,21,6,160,110,3.9,2.62,16.46,0,1,4,4),
> (3,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2),
> (4,21,6,160,110,3.9,2.875,17.02,0,1,4,4),
> (5,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4),
> (6,16.4,8,275.8,180,3.078,4.07,17.4,0,0,3,3),
> (7,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1),
> (8,17.3,8,275.8,180,3.078,3.73,17.6,0,0,3,3),
> (9,21.4,null,258,110,3.08,3.215,19.44,1,0,3,1),
> (10,15.2,8,275.8,180,3.078,3.78,18,0,0,3,3),
> (11,18.1,6,225,105,2.768,3.46,20.22,1,0,3,1),
> (12,32.4,4,78.7,66,4.08,2.20,19.47,1,1,4,1),
> (13,14.3,8,360,245,3.21,3.578,15.84,0,0,3,4),
> (14,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2),
> (15,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2),
> (16,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4),
> (17,33.9,4,71.14,65,4.22,1.835,19.9,1,1,4,1),
> (18,15.2,null,304,150,3.15,3.435,17.3,0,0,3,2),
> (19,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4),
> (20,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1),
> (21,10.4,8,460,215,3,5.424,17.82,0,0,3,4),
> (22,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2),
> (23,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4),
> (24,30.4,4,95.14,113,3.77,1.513,16.9,1,1,5,2),
> (25,21.5,4,120.1,97,3.70,2.465,20.01,1,0,3,1),
> (26,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4),
> (27,15.5,8,318,150,2.768,3.52,16.87,0,0,3,2),
> (28,15,8,301,335,3.54,3.578,14.6,0,1,5,8),
> (29,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4),
> (30,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2),
> (31,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6),
> (32,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2);
> {code}
> RF:
> {code}
> DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group, 
> mt_cars_output_summary;
> SELECT madlib.forest_train('mt_cars',
>                            'mt_cars_output',
>                            'id',
>                            'mpg',
>                            '*',
>                            'id, hp, drat, am, gear, carb',  -- exclude columns
>                            'am',  -- grouping
>                            10::integer,
>                            2::integer,
>                            TRUE::boolean,
>                            1,
>                            10,
>                            8,
>                            3,
>                            10
>                            );
> SELECT * FROM mt_cars_output_summary;
> {code}
> produces
> {code}
> -[ RECORD 1 
> ]---------+-----------------------------------------------------------------------
> method                | forest_train
> is_classification     | f
> source_table          | mt_cars
> model_table           | mt_cars_output
> id_col_name           | id
> dependent_varname     | mpg
> independent_varnames  | vs,cyl,disp,qsec,wt
> cat_features          | vs,cyl
> con_features          | disp,qsec,wt
> grouping_cols         | am
> num_trees             | 10
> num_random_features   | 2
> max_tree_depth        | 10
> min_split             | 8
> min_bucket            | 3
> num_splits            | 10
> verbose               | f
> importance            | t
> num_permutations      | 1
> num_all_groups        | 2
> num_failed_groups     | 0
> total_rows_processed  | 32
> total_rows_skipped    | 0
> dependent_var_levels  | 
> dependent_var_type    | double precision
> independent_var_types | integer, integer, double precision, double precision, 
> double precision
> null_proxy            | None
> {code}
> {code}
> SELECT * FROM mt_cars_output_group;
> {code}
> produces
> {code}
> -[ RECORD 1 ]------+--------------------------------------
> gid                | 1
> am                 | 0
> success            | t
> cat_n_levels       | {3,3}
> cat_levels_in_text | {4,6,8,4,6,8}
> oob_error          | 10.4384609425616
> cat_var_importance | {1.77200744569228,3.23893136930339}
> con_var_importance | {6.52806304913008,0,2.71255870662522}
> -[ RECORD 2 ]------+--------------------------------------
> gid                | 2
> am                 | 1
> success            | t
> cat_n_levels       | {2,2}
> cat_levels_in_text | {0,1,0,1}
> oob_error          | 22.3349978333876
> cat_var_importance | {4.59597959183674,0}
> con_var_importance | {5.715875,0,18.4067344897959}
> {code}
> In the above, cat_n_levels and cat_levels_in_text are wrong.  I would expect 
> {0,1,4,6,8} to be reported for both groups. 
> Compare the above with the case of no groups:
> {code}
> DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group, 
> mt_cars_output_summary;
> SELECT madlib.forest_train('mt_cars',
>                            'mt_cars_output',
>                            'id',
>                            'mpg',
>                            '*',
>                            'id, hp, drat, am, gear, carb',  -- exclude columns
>                            NULL, -- no groups
>                            10::integer,
>                            2::integer,
>                            TRUE::boolean,
>                            1,
>                            10,
>                            8,
>                            3,
>                            10
>                            );
> SELECT * FROM mt_cars_output_summary;
> {code}
> produces
> {code}
> [ RECORD 1 
> ]---------+-----------------------------------------------------------------------
> method                | forest_train
> is_classification     | f
> source_table          | mt_cars
> model_table           | mt_cars_output
> id_col_name           | id
> dependent_varname     | mpg
> independent_varnames  | vs,cyl,disp,qsec,wt
> cat_features          | vs,cyl
> con_features          | disp,qsec,wt
> grouping_cols         | 
> num_trees             | 10
> num_random_features   | 2
> max_tree_depth        | 10
> min_split             | 8
> min_bucket            | 3
> num_splits            | 10
> verbose               | f
> importance            | t
> num_permutations      | 1
> num_all_groups        | 1
> num_failed_groups     | 0
> total_rows_processed  | 32
> total_rows_skipped    | 0
> dependent_var_levels  | 
> dependent_var_type    | double precision
> independent_var_types | integer, integer, double precision, double precision, 
> double precision
> null_proxy            | None
> {code}
> SELECT * FROM mt_cars_output_group;
> produces:
> gid                | 1
> success            | t
> cat_n_levels       | {2,3}
> cat_levels_in_text | {0,1,4,6,8}
> oob_error          | 9.90200731761036
> cat_var_importance | {0.390353827160495,12.8836927689546}
> con_var_importance | {0.118665925451092,0.947886524555693,19.910665401879}
> which seems OK.
> DT:
> I am seeing the same thing with decision tree.  Here is a sample output 
> grouping on 'am'  from decision tree:
> {code}
> madlib=# SELECT am, pruning_cp, cat_levels_in_text, cat_n_levels, tree_depth 
> FROM train_output;
> -[ RECORD 1 ]------+--------------
> am                 | 1
> pruning_cp         | 0
> cat_levels_in_text | {0,1,0,1}
> cat_n_levels       | {2,2}
> tree_depth         | 1
> -[ RECORD 2 ]------+--------------
> am                 | 0
> pruning_cp         | 0
> cat_levels_in_text | {4,6,8,4,6,8}
> cat_n_levels       | {3,3}
> tree_depth         | 3
> {code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to