[1/2] incubator-hivemall git commit: Close #70: [HIVEMALL-75-2] Add tree_export UDF and update RandomForest tutorial

myui Fri, 30 Jun 2017 05:16:40 -0700

Repository: incubator-hivemall
Updated Branches:
  refs/heads/master 9f01ebf20 -> 9876d0631



http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/9876d063/docs/gitbook/multiclass/iris_scw.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/multiclass/iris_scw.md b/docs/gitbook/multiclass/iris_scw.md
index 79cdaf4..2d1b8bb 100644
--- a/docs/gitbook/multiclass/iris_scw.md
+++ b/docs/gitbook/multiclass/iris_scw.md
@@ -16,311 +16,65 @@
   specific language governing permissions and limitations
   under the License.
 -->
-        
-*NOTE: RandomForest is being supported from Hivemall v0.4 or later.*
 
-# Dataset
-
-* https://archive.ics.uci.edu/ml/datasets/Iris
-
-```
-Attribute Information:
-   1. sepal length in cm
-   2. sepal width in cm
-   3. petal length in cm
-   4. petal width in cm
-   5. class: 
-      -- Iris Setosa
-      -- Iris Versicolour
-      -- Iris Virginica
-```
-
-# Table preparation
-
-```sql
-create database iris;
-use iris;
-
-create external table raw (
-  sepal_length int,
-  sepal_width int,
-  petal_length int,
-  petak_width int,
-  class string
-)
-ROW FORMAT DELIMITED
-  FIELDS TERMINATED BY ','
-  LINES TERMINATED BY '\n'
-STORED AS TEXTFILE LOCATION '/dataset/iris/raw';
-
-$ sed '/^$/d' iris.data | hadoop fs -put - /dataset/iris/raw/iris.data
-```
-
-```sql
-create table label_mapping 
-as
-select
-  class,
-  rank - 1 as label
-from (
-select
-  distinct class,
-  dense_rank() over (order by class) as rank
-from 
-  raw
-) t
-;
-```
+# Training (multiclass classification)
 
 ```sql
-create table training
-as
-select
-  rowid() as rowid,
-  array(t1.sepal_length, t1.sepal_width, t1.petal_length, t1.petak_width) as features,
-  t2.label
-from
-  raw t1
-  JOIN label_mapping t2 ON (t1.class = t2.class)
-;
-```
-
-# Training
-
-`train_randomforest_classifier` takes a dense `features` in double[] and a `label` starting from 0.
-
-```sql
-CREATE TABLE model 
-STORED AS SEQUENCEFILE 
-AS
+create table model_scw1 as
 select 
-  train_randomforest_classifier(features, label) 
-  -- hivemall v0.4.1-alpha.2 and before
-  -- train_randomforest_classifier(features, label) as (pred_model, var_importance, oob_errors, oob_tests)
-  -- hivemall v0.4.1 and later
-  -- train_randomforest_classifier(features, label) as (model_id, model_type, pred_model, var_importance, oob_errors, oob_tests)
-from
-  training;
-```
-*Note: The default TEXTFILE should not be used for model table when using Javascript output through "-output javascript" option.*
-
-```
-hive> desc model;
-model_id                int                                         
-model_type              int                                         
-pred_model              string                                      
-var_importance          array<double>                               
-oob_errors              int                                         
-oob_tests               int  
-```
-
-## Training options
-
-"-help" option shows usage of the function.
-
-```
-select train_randomforest_classifier(features, label, "-help") from training;
-
-> FAILED: UDFArgumentException 
-usage: train_randomforest_classifier(double[] features, int label [,
-       string options]) - Returns a relation consists of <int model_id,
-       int model_type, string pred_model, array<double> var_importance,
-       int oob_errors, int oob_tests> [-attrs <arg>] [-depth <arg>]
-       [-disable_compression] [-help] [-leafs <arg>] [-output <arg>]
-       [-rule <arg>] [-seed <arg>] [-splits <arg>] [-trees <arg>] [-vars
-       <arg>]
- -attrs,--attribute_types <arg>   Comma separated attribute types (Q for
-                                  quantitative variable and C for
-                                  categorical variable. e.g., [Q,C,Q,C])
- -depth,--max_depth <arg>         The maximum number of the tree depth
-                                  [default: Integer.MAX_VALUE]
- -disable_compression             Whether to disable compression of the
-                                  output script [default: false]
- -help                            Show function help
- -leafs,--max_leaf_nodes <arg>    The maximum number of leaf nodes
-                                  [default: Integer.MAX_VALUE]
- -output,--output_type <arg>      The output type (serialization/ser or
-                                  opscode/vm or javascript/js) [default:
-                                  serialization]
- -rule,--split_rule <arg>         Split algorithm [default: GINI, ENTROPY]
- -seed <arg>                      seed value in long [default: -1
-                                  (random)]
- -splits,--min_split <arg>        A node that has greater than or equals
-                                  to `min_split` examples will split
-                                  [default: 2]
- -trees,--num_trees <arg>         The number of trees for each task
-                                  [default: 50]
- -vars,--num_variables <arg>      The number of random selected features
-                                  [default: ceil(sqrt(x[0].length))].
-                                  int(num_variables * x[0].length) is
-                                  considered if num_variable is (0,1]
+ label, 
+ feature,
+ argmin_kld(weight, covar) as weight
+from 
+ (select 
+     train_multiclass_scw(features, label) as (label, feature, weight, covar)
+  from 
+     training_x10
+ ) t 
+group by label, feature;
 ```
-*Caution: "-num_trees" controls the number of trees for each task, not the total number of trees.*
 
-### Parallelize Training
-
-To parallelize RandomForest training, you can use UNION ALL as follows:
+# Predict
 
 ```sql
-CREATE TABLE model 
-STORED AS SEQUENCEFILE 
-AS
-select 
-  train_randomforest_classifier(features, label, '-trees 25') 
-from
-  training
-UNION ALL
+create or replace view predict_scw1
+as
 select 
-  train_randomforest_classifier(features, label, '-trees 25')
-from
-  training
-;
-```
-
-### Learning stats
-
-[`Variable importance`](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#varimp) and [`Out Of Bag (OOB) error rate`](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#ooberr) of RandomForest can be shown as follows:
-
-```sql
+  rowid, 
+  m.col0 as score, 
+  m.col1 as label
+from (
 select
-  array_sum(var_importance) as var_importance,
-  sum(oob_errors) / sum(oob_tests) as oob_err_rate
-from
-  model;
-```
-> [2.81010338879605,0.4970357753626371,23.790369091407698,14.315316390235273]  
   0.05333333333333334
-
-### Output prediction model by Javascipt
-
-```sql
-CREATE TABLE model_javascript
-STORED AS SEQUENCEFILE 
-AS
-select train_randomforest_classifier(features, label, "-output_type js -disable_compression")
-from training;
-
-select model from model_javascript limit 1;
-```
-
-```js
-if(x[3] <= 0.5) {
-  0;
-} else  {
-  if(x[2] <= 4.5) {
-    if(x[3] <= 1.5) {
-      if(x[0] <= 4.5) {
-        1;
-      } else  {
-        if(x[0] <= 5.5) {
-          1;
-        } else  {
-          if(x[1] <= 2.5) {
-            1;
-          } else  {
-            1;
-          }
-        }
-      }
-    } else  {
-      2;
-    }
-  } else  {
-    if(x[3] <= 1.5) {
-      2;
-    } else  {
-      2;
-    }
-  }
-}
-```
-
-# Prediction
-
-```sql
-set hivevar:classification=true;
-set hive.auto.convert.join=true;
-set hive.mapjoin.optimized.hashtable=false;
-
-create table predicted_vm
-as
-SELECT
-  rowid,
-  rf_ensemble(predicted) as predicted
-FROM (
-  SELECT
-    rowid, 
-    -- hivemall v0.4.1-alpha.2 and before
-    -- tree_predict(p.model, t.features, ${classification}) as predicted
-    -- hivemall v0.4.1 and later
-    tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted
-  FROM
-    model p
-    LEFT OUTER JOIN -- CROSS JOIN
-    training t
-) t1
-group by
-  rowid
-;
-```
-_Note: Javascript outputs can be evaluated by `js_tree_predict`._
-
-### Parallelize Prediction
-
-The following query runs predictions in N-parallel. It would reduce elapsed time for prediction almost by N.
-
-```sql
-SET hivevar:classification=true;
-set hive.auto.convert.join=true;
-SET hive.mapjoin.optimized.hashtable=false;
-SET mapred.reduce.tasks=8;
-
-create table predicted_vm
-as
-SELECT
-  rowid,
-  rf_ensemble(predicted) as predicted
-FROM (
-  SELECT
-    t.rowid, 
-    -- hivemall v0.4.1-alpha.2 and before
-    -- tree_predict(p.pred_model, t.features, ${classification}) as predicted
-    -- hivemall v0.4.1 and later
-    tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted
-  FROM (
-    SELECT model_id, model_type, pred_model
-    FROM model
-    DISTRIBUTE BY rand(1)
-  ) p 
-  LEFT OUTER JOIN training t
+   rowid, 
+   maxrow(score, label) as m
+from (
+  select
+    t.rowid,
+    m.label,
+    sum(m.weight * t.value) as score
+  from 
+    test20p_exploded t LEFT OUTER JOIN
+    model_scw1 m ON (t.feature = m.feature)
+  group by
+    t.rowid, m.label
 ) t1
-group by
-  rowid
-;
+group by rowid
+) t2;
 ```
 
 # Evaluation
 
 ```sql
-select count(1) from training;
-> 150
-
-set hivevar:total_cnt=150;
+create or replace view eval_scw1 as
+select 
+  t.label as actual, 
+  p.label as predicted
+from 
+  test20p t JOIN predict_scw1 p 
+    on (t.rowid = p.rowid);
 
-WITH t1 as (
-SELECT
-  t.rowid,
-  t.label as actual,
-  p.predicted.label as predicted
-FROM
-  predicted_vm p
-  LEFT OUTER JOIN training t ON (t.rowid = p.rowid)
-)
-SELECT
-  count(1) / ${total_cnt}
-FROM
-  t1
-WHERE
-  actual = predicted
-;
+select count(1)/30 from eval_scw1 
+where actual = predicted;
 ```
-> 0.9533333333333334
+
+> 0.9666666666666667

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/9876d063/docs/gitbook/resources/images/iris.png
----------------------------------------------------------------------
diff --git a/docs/gitbook/resources/images/iris.png b/docs/gitbook/resources/images/iris.png
new file mode 100644
index 0000000..1d8213d
Binary files /dev/null and b/docs/gitbook/resources/images/iris.png differ

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/9876d063/resources/ddl/define-all-as-permanent.deprecated.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all-as-permanent.deprecated.hive b/resources/ddl/define-all-as-permanent.deprecated.hive
index 5558c4e..1cd604a 100644
--- a/resources/ddl/define-all-as-permanent.deprecated.hive
+++ b/resources/ddl/define-all-as-permanent.deprecated.hive
@@ -55,12 +55,6 @@ CREATE FUNCTION adadelta as 'hivemall.regression.AdaDeltaUDTF' USING JAR '${hive
 DROP FUNCTION IF EXISTS collect_all;
 CREATE FUNCTION collect_all as 'hivemall.tools.array.CollectAllUDAF' USING JAR '${hivemall_jar}';
 
-DROP FUNCTION IF EXISTS vm_tree_predict;
-CREATE FUNCTION vm_tree_predict as 'hivemall.smile.tools.TreePredictByStackMachineUDF' USING JAR '${hivemall_jar}';
-
-DROP FUNCTION IF EXISTS js_tree_predict;
-CREATE FUNCTION js_tree_predict as 'hivemall.smile.tools.TreePredictByJavascriptUDF' USING JAR '${hivemall_jar}';
-
 DROP FUNCTION IF EXISTS train_gbt_classifier;
 CREATE FUNCTION train_gbt_classifier as 'hivemall.smile.classification.GradientTreeBoostingClassifierUDTF' USING JAR '${hivemall_jar}';
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/9876d063/resources/ddl/define-all-as-permanent.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive
index 075e733..a3b6725 100644
--- a/resources/ddl/define-all-as-permanent.hive
+++ b/resources/ddl/define-all-as-permanent.hive
@@ -689,6 +689,9 @@ CREATE FUNCTION train_randomforest_regr as 'hivemall.smile.regression.RandomFore
 DROP FUNCTION IF EXISTS tree_predict;
 CREATE FUNCTION tree_predict as 'hivemall.smile.tools.TreePredictUDF' USING JAR '${hivemall_jar}';
 
+DROP FUNCTION IF EXISTS tree_export;
+CREATE FUNCTION tree_export as 'hivemall.smile.tools.TreeExportUDF' USING JAR '${hivemall_jar}';
+
 DROP FUNCTION IF EXISTS rf_ensemble;
 CREATE FUNCTION rf_ensemble as 'hivemall.smile.tools.RandomForestEnsembleUDAF' USING JAR '${hivemall_jar}';
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/9876d063/resources/ddl/define-all.deprecated.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all.deprecated.hive b/resources/ddl/define-all.deprecated.hive
index 001666b..ded195f 100644
--- a/resources/ddl/define-all.deprecated.hive
+++ b/resources/ddl/define-all.deprecated.hive
@@ -55,12 +55,6 @@ create temporary function adadelta as 'hivemall.regression.AdaDeltaUDTF';
 drop temporary function if exists collect_all;
 create temporary function collect_all as 'hivemall.tools.array.CollectAllUDAF';
 
-drop temporary function if exists vm_tree_predict;
-create temporary function vm_tree_predict as 'hivemall.smile.tools.TreePredictByStackMachineUDF';
-
-drop temporary function if exists js_tree_predict;
-create temporary function js_tree_predict as 'hivemall.smile.tools.TreePredictByJavascriptUDF';
-
 drop temporary function if exists train_gbt_classifier;
 create temporary function train_gbt_classifier as 'hivemall.smile.classification.GradientTreeBoostingClassifierUDTF';
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/9876d063/resources/ddl/define-all.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive
index 7f5c727..77b6a98 100644
--- a/resources/ddl/define-all.hive
+++ b/resources/ddl/define-all.hive
@@ -681,6 +681,9 @@ create temporary function train_randomforest_regr as 'hivemall.smile.regression.
 drop temporary function if exists tree_predict;
 create temporary function tree_predict as 'hivemall.smile.tools.TreePredictUDF';
 
+drop temporary function if exists tree_export;
+create temporary function tree_export as 'hivemall.smile.tools.TreeExportUDF';
+
 drop temporary function if exists rf_ensemble;
 create temporary function rf_ensemble as 'hivemall.smile.tools.RandomForestEnsembleUDAF';
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/9876d063/resources/ddl/define-all.spark
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark
index fc4a60e..2193cd8 100644
--- a/resources/ddl/define-all.spark
+++ b/resources/ddl/define-all.spark
@@ -665,6 +665,9 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION train_randomforest_regr AS 'hivemall.s
 sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS tree_predict")
 sqlContext.sql("CREATE TEMPORARY FUNCTION tree_predict AS 'hivemall.smile.tools.TreePredictUDF'")
 
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS tree_export")
+sqlContext.sql("CREATE TEMPORARY FUNCTION tree_export AS 'hivemall.smile.tools.TreeExportUDF'")
+
 sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS rf_ensemble")
 sqlContext.sql("CREATE TEMPORARY FUNCTION rf_ensemble AS 'hivemall.smile.tools.RandomForestEnsembleUDAF'")
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/9876d063/resources/ddl/define-udfs.td.hql
----------------------------------------------------------------------
diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql
index 1d11d1a..b5e8ab3 100644
--- a/resources/ddl/define-udfs.td.hql
+++ b/resources/ddl/define-udfs.td.hql
@@ -173,6 +173,7 @@ create temporary function l2_norm as 'hivemall.tools.math.L2NormUDAF';
 create temporary function dimsum_mapper as 'hivemall.knn.similarity.DIMSUMMapperUDTF';
 create temporary function train_classifier as 'hivemall.classifier.GeneralClassifierUDTF';
 create temporary function train_regression as 'hivemall.regression.GeneralRegressionUDTF';
+create temporary function tree_export as 'hivemall.smile.tools.TreeExportUDF';
 
 -- NLP features
 create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';

[1/2] incubator-hivemall git commit: Close #70: [HIVEMALL-75-2] Add tree_export UDF and update RandomForest tutorial

Reply via email to