[ 
https://issues.apache.org/jira/browse/MADLIB-1097?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16016103#comment-16016103
 ] 

Frank McQuillan commented on MADLIB-1097:
-----------------------------------------

Here is a NULL feature example that works OK now:

{code}
DROP TABLE IF EXISTS dt_golf_nulls;
CREATE TABLE dt_golf_nulls (
    id integer NOT NULL,
    outlook text,
    temperature double precision,
    humidity double precision,
    windy text,
    class text
) ;
INSERT INTO dt_golf_nulls (id,outlook,temperature,humidity,windy,class) VALUES
(1, NULL, 85, 85, 'false', 'Don''t Play'),
(2, NULL, 80, 90, 'true', 'Don''t Play'),
(3, 'overcast', 83, NULL, 'false', 'Play'),
(4, 'rain', 70, NULL, 'false', 'Play'),
(5, NULL, 68, 80, 'false', 'Play'),
(6, 'rain', NULL, 70, 'true', 'Don''t Play'),
(7, 'overcast', 64, NULL, 'true', 'Play'),
(8, 'sunny', 72, NULL, 'false', 'Don''t Play'),
(9, NULL, 69, 70, 'false', 'Play'),
(10, NULL, 75, 80, 'false', 'Play'),
(11, 'sunny', 75, 70, NULL, 'Play'),
(12, 'overcast', NULL, 90, 'true', 'Play'),
(13, 'overcast', NULL, 75, 'false', 'Play'),
(14, 'rain', 71, NULL, 'true', 'Don''t Play');
SELECT * FROM dt_golf_nulls ORDER BY id;
{code}
produces
{code}
 id | outlook  | temperature | humidity | windy |   class    
----+----------+-------------+----------+-------+------------
  1 |          |          85 |       85 | false | Don't Play
  2 |          |          80 |       90 | true  | Don't Play
  3 | overcast |          83 |          | false | Play
  4 | rain     |          70 |          | false | Play
  5 |          |          68 |       80 | false | Play
  6 | rain     |             |       70 | true  | Don't Play
  7 | overcast |          64 |          | true  | Play
  8 | sunny    |          72 |          | false | Don't Play
  9 |          |          69 |       70 | false | Play
 10 |          |          75 |       80 | false | Play
 11 | sunny    |          75 |       70 |       | Play
 12 | overcast |             |       90 | true  | Play
 13 | overcast |             |       75 | false | Play
 14 | rain     |          71 |          | true  | Don't Play
(14 rows)
{code}
and running the tree trainer (madlib.tree_train) on it
{code}
DROP TABLE IF EXISTS train_output, train_output_summary;
SELECT madlib.tree_train('dt_golf_nulls',         -- source table
                         'train_output',    -- output model table
                         'id',              -- id column
                         'class',           -- response
                         'outlook, temperature, humidity, windy',   -- features
                         NULL::text,        -- exclude columns
                         'gini',            -- split criterion
                         NULL::text,        -- no grouping
                         NULL::text,        -- no weights
                         5,                 -- max depth
                         3,                 -- min split
                         1,                 -- min bucket
                         6            -- number of bins per continuous variable
                         );
SELECT * FROM train_output_summary;
{code}
produces
{code}
-[ RECORD 1 ]---------+-----------------------------------------------
method                | tree_train
is_classification     | t
source_table          | dt_golf_nulls
model_table           | train_output
id_col_name           | id
dependent_varname     | class
independent_varnames  | outlook, windy, temperature, humidity
cat_features          | outlook,windy
con_features          | temperature,humidity
grouping_cols         | 
num_all_groups        | 1
num_failed_groups     | 0
total_rows_processed  | 14
total_rows_skipped    | 0
dependent_var_levels  | "Don't Play","Play"
dependent_var_type    | text
input_cp              | 0
independent_var_types | text, text, double precision, double precision
{code}

> Random Forest does not allow NULL values in features
> ----------------------------------------------------
>
>                 Key: MADLIB-1097
>                 URL: https://issues.apache.org/jira/browse/MADLIB-1097
>             Project: Apache MADlib
>          Issue Type: Improvement
>          Components: Module: Random Forest
>            Reporter: Nandish Jayaram
>            Assignee: Rahul Iyer
>            Priority: Minor
>             Fix For: v1.12
>
>
> Running forest_train() with features that have NULL values results in the 
> following error:
> {code}
> psql:/tmp/madlib.LkFR_5/recursive_partitioning/test/random_forest.sql_in.tmp:79:
>  ERROR:  spiexceptions.InvalidParameterValue: Function 
> "_rf_cat_imp_score(bytea8,integer[],double 
> precision[],integer[],integer,double precision,boolean,double precision[])": 
> Invalid type conversion. Null where not expected.
> CONTEXT:  Traceback (most recent call last):
>   PL/Python function "forest_train", line 42, in <module>
>     sample_ratio
>   PL/Python function "forest_train", line 605, in forest_train
>   PL/Python function "forest_train", line 1052, in _calculate_oob_prediction
> PL/Python function "forest_train"
> {code}
> The following are the input table and parameters used:
> {code:sql}
> CREATE TABLE dt_golf (
>     id integer NOT NULL,
>     "OUTLOOK" text,
>     temperature double precision,
>     humidity double precision,
>     windy boolean,
>     class text
> ) ;
> INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) VALUES
> (1, 'sunny', 85, 85, false, 'Don''t Play'),
> (2, 'sunny', 80, 90, true, 'Don''t Play'),
> (3, 'overcast', 83, 78, false, 'Play'),
> (4, 'rain', NULL, 96, false, 'Play'),
> (5, 'rain', 68, 80, NULL, 'Play'),
> (6, 'rain', 65, 70, true, 'Don''t Play'),
> (7, 'overcast', 64, 65, true, 'Play'),
> (8, 'sunny', 72, 95, false, 'Don''t Play'),
> (9, 'sunny', 69, 70, false, 'Play'),
> (10, 'rain', 75, 80, false, 'Play'),
> (11, 'sunny', 75, 70, true, 'Play'),
> (12, 'overcast', 72, 90, true, 'Play'),
> (13, 'overcast', 81, 75, false, 'Play'),
> (14, 'rain', 71, 80, true, 'Don''t Play');
> SELECT forest_train(
>                   'dt_golf'::TEXT,         -- source table
>                   'train_output'::TEXT,    -- output model table
>                   'id'::TEXT,              -- id column
>                   'class'::TEXT,           -- response
>                   'windy, temperature'::TEXT,   -- features
>                   NULL::TEXT,        -- exclude columns
>                   NULL::TEXT,        -- no grouping
>                   5,                -- num of trees
>                   1,                 -- num of random features
>                   TRUE::BOOLEAN,    -- importance
>                   1::INTEGER,       -- num_permutations
>                   10::INTEGER,       -- max depth
>                   1::INTEGER,        -- min split
>                   1::INTEGER,        -- min bucket
>                   8::INTEGER,        -- number of bins per continuous variable
>                   'max_surrogates=0',
>                   FALSE
>                   );
> {code}



--
This message was sent by Atlassian JIRA
(v6.3.15#6346)

Reply via email to