Rahul Iyer created MADLIB-1236:
----------------------------------
Summary: DT: tree_predict fails if a categorical variable has been
discarded
Key: MADLIB-1236
URL: https://issues.apache.org/jira/browse/MADLIB-1236
Project: Apache MADlib
Issue Type: Task
Components: Module: Decision Tree
Reporter: Rahul Iyer
Fix For: v1.15
{{tree_predict}} fails if {{tree_train}} removed a categorical variable
(possibly due to the presence of only a single level). This is because the summary
table incorrectly does not exclude the discarded categorical variable, leading
to {{tree_predict}} mapping the levels of that variable with a pre-built map.
This "mapping" fails because {{tree_train}} does not include the variable
in this pre-built map.
Repro steps with output given below.
{code}
-- Repro fixture: a four-row golf dataset used as training input.
-- Note that the first element of cat_features is 'a' in every row, and
-- temperature is NULL for id 6.
DROP TABLE IF EXISTS dt_golf CASCADE;

CREATE TABLE dt_golf (
    id              INTEGER NOT NULL,
    "OUTLOOK"       TEXT,
    temperature     DOUBLE PRECISION,
    humidity        DOUBLE PRECISION,
    "Cont_features" DOUBLE PRECISION[],
    cat_features    TEXT[],
    windy           BOOLEAN,
    class           TEXT
);

INSERT INTO dt_golf (
    id,
    "OUTLOOK",
    temperature,
    humidity,
    "Cont_features",
    cat_features,
    windy,
    class
)
VALUES
    (6,  'rain',     NULL, 70, ARRAY[65, 70], ARRAY['a', 'b'], TRUE,  'Don''t Play'),
    (16, 'overcast', 80,   75, ARRAY[81, 75], ARRAY['a', 'd'], FALSE, 'Play'),
    (17, 'overcast', 60,   75, ARRAY[81, 75], ARRAY['a', 'd'], FALSE, 'Play'),
    (18, 'overcast', 70,   75, ARRAY[81, 75], ARRAY['a', 'd'], FALSE, 'Play');
-- Train a decision tree on dt_golf and write the model to train_output.
-- Per the issue description, tree_train discards a categorical variable
-- during this step (possibly because it has a single level), which is what
-- later makes tree_predict fail.
--
-- Each trailing comment is kept on the same line as its argument: a "--"
-- comment ends at the newline, so a wrapped comment tail would become a bare
-- token and make the statement a syntax error.
SELECT tree_train(
    'dt_golf'::text,                                   -- source table
    'train_output'::text,                              -- output model table
    'id'::text,                                        -- id column
    'temperature::double precision'::text,             -- response
    '"OUTLOOK", humidity, windy, cat_features'::text,  -- features
    NULL::text,                                        -- exclude columns
    'gini'::text,                                      -- split criterion
    'class'::text,                                     -- grouping
    NULL::text,                                        -- no weights
    10::integer,                                       -- max depth
    6::integer,                                        -- min split
    2::integer,                                        -- min bucket
    3::integer,                                        -- number of bins per continuous variable
    'cp=0.01'                                          -- cost-complexity pruning parameter
);
-- Prediction input: every row of dt_golf plus one extra row (id 15) whose
-- "OUTLOOK" value 'humid' does not appear in dt_golf.
CREATE TABLE dt_golf2 AS
    SELECT *
    FROM dt_golf
    UNION
    SELECT
        15              AS id,
        'humid'         AS "OUTLOOK",
        71              AS temperature,
        80              AS humidity,
        ARRAY[90, 90]   AS "Cont_features",
        ARRAY['b', 'c'] AS cat_features,
        TRUE            AS windy,
        'Don''t Play'   AS class;

-- Score dt_golf2 with the trained model; this is the call that raises the
-- error reported in this issue.
SELECT tree_predict(
    'train_output',
    'dt_golf2',
    'predict_output'
);
{code}
Error message:
{code}
psql:/tmp/madlib.88brFX/recursive_partitioning/test/decision_tree.sql_in.tmp:327:
ERROR: plpy.SPIError: Function
"_map_catlevel_to_int(text[],text[],integer[],boolean)": Invalid type
conversion. Null where not expected. (seg0 slice2 127.0.0.1:25432 pid=88213)
CONTEXT: Traceback (most recent call last):
PL/Python function "tree_predict", line 23, in <module>
return decision_tree.tree_predict(**globals())
PL/Python function "tree_predict", line 1690, in tree_predict
PL/Python function "tree_predict"
{code}
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)