[ https://issues.apache.org/jira/browse/MADLIB-1218?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Frank McQuillan closed MADLIB-1218. ----------------------------------- > DT and RF: categorical levels reported incorrectly when grouping is used > ------------------------------------------------------------------------- > > Key: MADLIB-1218 > URL: https://issues.apache.org/jira/browse/MADLIB-1218 > Project: Apache MADlib > Issue Type: Bug > Components: Module: Decision Tree, Module: Random Forest > Reporter: Frank McQuillan > Assignee: Rahul Iyer > Priority: Major > Fix For: v1.14 > > > {code} > DROP TABLE IF EXISTS mt_cars; > CREATE TABLE mt_cars ( > id integer NOT NULL, > mpg double precision, > cyl integer, > disp double precision, > hp integer, > drat double precision, > wt double precision, > qsec double precision, > vs integer, > am integer, > gear integer, > carb integer > ); > INSERT INTO mt_cars VALUES > (1,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2), > (2,21,6,160,110,3.9,2.62,16.46,0,1,4,4), > (3,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2), > (4,21,6,160,110,3.9,2.875,17.02,0,1,4,4), > (5,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4), > (6,16.4,8,275.8,180,3.078,4.07,17.4,0,0,3,3), > (7,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1), > (8,17.3,8,275.8,180,3.078,3.73,17.6,0,0,3,3), > (9,21.4,null,258,110,3.08,3.215,19.44,1,0,3,1), > (10,15.2,8,275.8,180,3.078,3.78,18,0,0,3,3), > (11,18.1,6,225,105,2.768,3.46,20.22,1,0,3,1), > (12,32.4,4,78.7,66,4.08,2.20,19.47,1,1,4,1), > (13,14.3,8,360,245,3.21,3.578,15.84,0,0,3,4), > (14,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2), > (15,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2), > (16,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4), > (17,33.9,4,71.14,65,4.22,1.835,19.9,1,1,4,1), > (18,15.2,null,304,150,3.15,3.435,17.3,0,0,3,2), > (19,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4), > (20,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1), > (21,10.4,8,460,215,3,5.424,17.82,0,0,3,4), > (22,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2), > (23,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4), > (24,30.4,4,95.14,113,3.77,1.513,16.9,1,1,5,2), > (25,21.5,4,120.1,97,3.70,2.465,20.01,1,0,3,1), > (26,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4), > (27,15.5,8,318,150,2.768,3.52,16.87,0,0,3,2), > (28,15,8,301,335,3.54,3.578,14.6,0,1,5,8), > (29,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4), > (30,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2), > (31,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6), > (32,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2); > {code} > RF: > {code} > DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group, > mt_cars_output_summary; > SELECT madlib.forest_train('mt_cars', > 'mt_cars_output', > 'id', > 'mpg', > '*', > 'id, hp, drat, am, gear, carb', -- exclude columns > 'am', -- grouping > 10::integer, > 2::integer, > TRUE::boolean, > 1, > 10, > 8, > 3, > 10 > ); > SELECT * FROM mt_cars_output_summary; > {code} > produces > {code} > -[ RECORD 1 > ]---------+----------------------------------------------------------------------- > method | forest_train > is_classification | f > source_table | mt_cars > model_table | mt_cars_output > id_col_name | id > dependent_varname | mpg > independent_varnames | vs,cyl,disp,qsec,wt > cat_features | vs,cyl > con_features | disp,qsec,wt > grouping_cols | am > num_trees | 10 > num_random_features | 2 > max_tree_depth | 10 > min_split | 8 > min_bucket | 3 > num_splits | 10 > verbose | f > importance | t > num_permutations | 1 > num_all_groups | 2 > num_failed_groups | 0 > total_rows_processed | 32 > total_rows_skipped | 0 > dependent_var_levels | > dependent_var_type | double precision > independent_var_types | integer, integer, double precision, double precision, > double precision > null_proxy | None > {code} > {code} > SELECT * FROM mt_cars_output_group; > {code} > produces > {code} > -[ RECORD 1 ]------+-------------------------------------- > gid | 1 > am | 0 > success | t > cat_n_levels | {3,3} > cat_levels_in_text | {4,6,8,4,6,8} > oob_error | 10.4384609425616 > cat_var_importance | {1.77200744569228,3.23893136930339} > con_var_importance | {6.52806304913008,0,2.71255870662522} > -[ RECORD 2 ]------+-------------------------------------- > gid | 2 > am | 1 > success | t > cat_n_levels | {2,2} > cat_levels_in_text | {0,1,0,1} > oob_error | 22.3349978333876 > cat_var_importance | {4.59597959183674,0} > con_var_importance | {5.715875,0,18.4067344897959} > {code} > In the above, cat_n_levels and cat_levels_in_text are wrong. I would expect > {0,1,4,6,8} to be reported for both groups. > Compare the above with the case of no groups: > {code} > DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group, > mt_cars_output_summary; > SELECT madlib.forest_train('mt_cars', > 'mt_cars_output', > 'id', > 'mpg', > '*', > 'id, hp, drat, am, gear, carb', -- exclude columns > NULL, -- no groups > 10::integer, > 2::integer, > TRUE::boolean, > 1, > 10, > 8, > 3, > 10 > ); > SELECT * FROM mt_cars_output_summary; > {code} > produces > {code} > [ RECORD 1 > ]---------+----------------------------------------------------------------------- > method | forest_train > is_classification | f > source_table | mt_cars > model_table | mt_cars_output > id_col_name | id > dependent_varname | mpg > independent_varnames | vs,cyl,disp,qsec,wt > cat_features | vs,cyl > con_features | disp,qsec,wt > grouping_cols | > num_trees | 10 > num_random_features | 2 > max_tree_depth | 10 > min_split | 8 > min_bucket | 3 > num_splits | 10 > verbose | f > importance | t > num_permutations | 1 > num_all_groups | 1 > num_failed_groups | 0 > total_rows_processed | 32 > total_rows_skipped | 0 > dependent_var_levels | > dependent_var_type | double precision > independent_var_types | integer, integer, double precision, double precision, > double precision > null_proxy | None > {code} > SELECT * FROM mt_cars_output_group; > produces: > gid | 1 > success | t > cat_n_levels | {2,3} > cat_levels_in_text | {0,1,4,6,8} > oob_error | 9.90200731761036 > cat_var_importance | {0.390353827160495,12.8836927689546} > con_var_importance | {0.118665925451092,0.947886524555693,19.910665401879} > which seems OK. > DT: > I am seeing the same thing with decision tree. Here is a sample output > grouping on 'am' from decision tree: > {code} > madlib=# SELECT am, pruning_cp, cat_levels_in_text, cat_n_levels, tree_depth > FROM train_output; > -[ RECORD 1 ]------+-------------- > am | 1 > pruning_cp | 0 > cat_levels_in_text | {0,1,0,1} > cat_n_levels | {2,2} > tree_depth | 1 > -[ RECORD 2 ]------+-------------- > am | 0 > pruning_cp | 0 > cat_levels_in_text | {4,6,8,4,6,8} > cat_n_levels | {3,3} > tree_depth | 3 > {code} -- This message was sent by Atlassian JIRA (v7.6.3#76005)