[ https://issues.apache.org/jira/browse/MADLIB-1217?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Frank McQuillan closed MADLIB-1217. ----------------------------------- > DT and RF: null_as_category null handling does not work with grouping > ---------------------------------------------------------------------- > > Key: MADLIB-1217 > URL: https://issues.apache.org/jira/browse/MADLIB-1217 > Project: Apache MADlib > Issue Type: Bug > Components: Module: Decision Tree, Module: Random Forest > Reporter: Frank McQuillan > Assignee: Rahul Iyer > Priority: Major > Fix For: v1.14 > > > Load data: > {code} > DROP TABLE IF EXISTS mt_cars; > CREATE TABLE mt_cars ( > id integer NOT NULL, > mpg double precision, > cyl integer, > disp double precision, > hp integer, > drat double precision, > wt double precision, > qsec double precision, > vs integer, > am integer, > gear integer, > carb integer > ); > INSERT INTO mt_cars VALUES > (1,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2), > (2,21,6,160,110,3.9,2.62,16.46,0,1,4,4), > (3,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2), > (4,21,6,160,110,3.9,2.875,17.02,0,1,4,4), > (5,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4), > (6,16.4,8,275.8,180,3.078,4.07,17.4,0,0,3,3), > (7,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1), > (8,17.3,8,275.8,180,3.078,3.73,17.6,0,0,3,3), > (9,21.4,null,258,110,3.08,3.215,19.44,1,0,3,1), > (10,15.2,8,275.8,180,3.078,3.78,18,0,0,3,3), > (11,18.1,6,225,105,2.768,3.46,20.22,1,0,3,1), > (12,32.4,4,78.7,66,4.08,2.20,19.47,1,1,4,1), > (13,14.3,8,360,245,3.21,3.578,15.84,0,0,3,4), > (14,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2), > (15,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2), > (16,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4), > (17,33.9,4,71.14,65,4.22,1.835,19.9,1,1,4,1), > (18,15.2,null,304,150,3.15,3.435,17.3,0,0,3,2), > (19,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4), > (20,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1), > (21,10.4,8,460,215,3,5.424,17.82,0,0,3,4), > (22,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2), > (23,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4), > (24,30.4,4,95.14,113,3.77,1.513,16.9,1,1,5,2), > (25,21.5,4,120.1,97,3.70,2.465,20.01,1,0,3,1), > (26,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4), > (27,15.5,8,318,150,2.768,3.52,16.87,0,0,3,2), > (28,15,8,301,335,3.54,3.578,14.6,0,1,5,8), > (29,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4), > (30,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2), > (31,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6), > (32,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2); > {code} > DT: > {code} > DROP TABLE IF EXISTS train_output, train_output_summary, train_output_cv; > SELECT madlib.tree_train('mt_cars', -- source table > 'train_output', -- output model table > 'id', -- id column > 'mpg', -- dependent variable > '*', -- features > 'id, hp, drat, am, gear, carb', -- exclude columns > 'mse', -- split criterion > 'am', -- grouping > NULL::text, -- no weights, all observations > treated equally > 10, -- max depth > 8, -- min split > 3, -- number of bins per continuous > variable > 10, -- number of splits > NULL, -- pruning parameters > 'null_as_category=TRUE' > ); > {code} > results in error > {code} > ERROR: plpy.SPIError: column "__null__" does not exist > LINE 14: (COALESCE(vs, __NULL__))::text a... > ^ > QUERY: > SELECT > colname::text, > levels::text[], > grp_key::text > from ( > SELECT > grp_key, > 'vs' as colname, > array_agg(levels order by dep_avg) as levels > from ( > SELECT > array_to_string(array[(am)::text]::text[], ',') > as grp_key, > (COALESCE(vs, __NULL__))::text as levels, > COALESCE(vs, __NULL__) as dep_avg > FROM mt_cars > WHERE (am) is not NULL and (mpg) is not NULL AND > COALESCE(vs, __NULL__) IS NOT NULL > GROUP BY COALESCE(vs, __NULL__), am > ) s > GROUP BY grp_key > ) s1 > where array_upper(levels, 1) > 1 > UNION ALL > SELECT > colname::text, > levels::text[], > grp_key::text > from ( > SELECT > grp_key, > 'cyl' as colname, > array_agg(levels order by dep_avg) as levels > from ( > SELECT > array_to_string(array[(am)::text]::text[], ',') > as grp_key, > (COALESCE(cyl, __NULL__))::text as levels, > COALESCE(cyl, __NULL__) as dep_avg > FROM mt_cars > WHERE (am) is not NULL and (mpg) is not NULL AND > COALESCE(cyl, __NULL__) IS NOT NULL > GROUP BY COALESCE(cyl, __NULL__), am > ) s > GROUP BY grp_key > ) s1 > where array_upper(levels, 1) > 1 > > CONTEXT: Traceback (most recent call last): > PL/Python function "tree_train", line 28, in <module> > null_handling_params, verbose_mode) > PL/Python function "tree_train", line 489, in tree_train > PL/Python function "tree_train", line 293, in _get_tree_states > PL/Python function "tree_train", line 963, in _get_bins_grps > PL/Python function "tree_train" > {code} > RF: > {code} > DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group, > mt_cars_output_summary; > SELECT madlib.forest_train('mt_cars', > 'mt_cars_output', > 'id', > 'mpg', > '*', > 'id, hp, drat, am, gear, carb', -- exclude columns > 'am', -- grouping > 10::integer, > 2::integer, > TRUE::boolean, > 1, > 10, > 8, > 3, > 10, > 'null_as_category=TRUE' > ); > {code} > results in error: > {code} > ERROR: plpy.SPIError: column "__null__" does not exist > LINE 14: (COALESCE(vs, __NULL__))::text a... > ^ > QUERY: > SELECT > colname::text, > levels::text[], > grp_key::text > from ( > SELECT > grp_key, > 'vs' as colname, > array_agg(levels order by dep_avg) as levels > from ( > SELECT > array_to_string(array[(am)::text], ',') as > grp_key, > (COALESCE(vs, __NULL__))::text as levels, > COALESCE(vs, __NULL__) as dep_avg > FROM mt_cars > WHERE (am) is not NULL and (mpg) is not NULL AND > COALESCE(vs, __NULL__) IS NOT NULL > GROUP BY COALESCE(vs, __NULL__), am > ) s > GROUP BY grp_key > ) s1 > where array_upper(levels, 1) > 1 > UNION ALL > SELECT > colname::text, > levels::text[], > grp_key::text > from ( > SELECT > grp_key, > 'cyl' as colname, > array_agg(levels order by dep_avg) as levels > from ( > SELECT > array_to_string(array[(am)::text], ',') as > grp_key, > (COALESCE(cyl, __NULL__))::text as levels, > COALESCE(cyl, __NULL__) as dep_avg > FROM mt_cars > WHERE (am) is not NULL and (mpg) is not NULL AND > COALESCE(cyl, __NULL__) IS NOT NULL > GROUP BY COALESCE(cyl, __NULL__), am > ) s > GROUP BY grp_key > ) s1 > where array_upper(levels, 1) > 1 > > CONTEXT: Traceback (most recent call last): > PL/Python function "forest_train", line 42, in <module> > sample_ratio > PL/Python function "forest_train", line 427, in forest_train > PL/Python function "forest_train", line 963, in _get_bins_grps > PL/Python function "forest_train" > {code} -- This message was sent by Atlassian JIRA (v7.6.3#76005)