Repository: incubator-hivemall
Updated Branches:
  refs/heads/master ee25b5893 -> e90f4abcb


[HIVEMALL-174][DOC] Update RandomForest document to reflect changes in usages

## What changes were proposed in this pull request?

Update RandomForest document to reflect changes in usages

## What type of PR is it?

Documentation

## What is the Jira issue?

https://issues.apache.org/jira/browse/HIVEMALL-174

Author: Makoto Yui <m...@apache.org>

Closes #136 from myui/rf_docs.


Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/e90f4abc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/e90f4abc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/e90f4abc

Branch: refs/heads/master
Commit: e90f4abcb54396cd8f9362727a7f7f1e9b28617d
Parents: ee25b58
Author: Makoto Yui <m...@apache.org>
Authored: Tue Mar 13 19:45:24 2018 +0900
Committer: Makoto Yui <m...@apache.org>
Committed: Tue Mar 13 19:45:24 2018 +0900

----------------------------------------------------------------------
 .../RandomForestClassifierUDTF.java             |  8 +--
 docs/gitbook/binaryclass/news20_rf.md           |  6 ++-
 docs/gitbook/binaryclass/titanic_rf.md          | 37 +++++++-------
 docs/gitbook/multiclass/iris_randomforest.md    | 54 +++++++++++---------
 4 files changed, 56 insertions(+), 49 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e90f4abc/core/src/main/java/hivemall/smile/classification/RandomForestClassifierUDTF.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/hivemall/smile/classification/RandomForestClassifierUDTF.java
 
b/core/src/main/java/hivemall/smile/classification/RandomForestClassifierUDTF.java
index 6e8a650..d0db3a1 100644
--- 
a/core/src/main/java/hivemall/smile/classification/RandomForestClassifierUDTF.java
+++ 
b/core/src/main/java/hivemall/smile/classification/RandomForestClassifierUDTF.java
@@ -82,9 +82,9 @@ import org.apache.hadoop.mapred.Reporter;
 
 @Description(
         name = "train_randomforest_classifier",
-        value = "_FUNC_(array<double|string> features, int label [, const 
array<double> classWeights, const string options]) - "
-                + "Returns a relation consists of "
-                + "<int model_id, int model_type, string pred_model, 
array<double> var_importance, int oob_errors, int oob_tests, double weight>")
+        value = "_FUNC_(array<double|string> features, int label [, const 
string options, const array<double> classWeights])"
+                + "- Returns a relation consists of "
+                + "<string model_id, double model_weight, string model, 
array<double> var_importance, int oob_errors, int oob_tests>")
 public final class RandomForestClassifierUDTF extends UDTFWithOptions {
     private static final Log logger = 
LogFactory.getLog(RandomForestClassifierUDTF.class);
 
@@ -150,7 +150,7 @@ public final class RandomForestClassifierUDTF extends 
UDTFWithOptions {
         opts.addOption("rule", "split_rule", true, "Split algorithm [default: 
GINI, ENTROPY]");
         opts.addOption("stratified", "stratified_sampling", false,
             "Enable Stratified sampling for unbalanced data");
-        opts.addOption("subsample", true, "Sampling rate in range (0.0,1.0]");
+        opts.addOption("subsample", true, "Sampling rate in range (0.0,1.0]. 
[default: 1.0]");
         return opts;
     }
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e90f4abc/docs/gitbook/binaryclass/news20_rf.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/binaryclass/news20_rf.md 
b/docs/gitbook/binaryclass/news20_rf.md
index 327939b..065c736 100644
--- a/docs/gitbook/binaryclass/news20_rf.md
+++ b/docs/gitbook/binaryclass/news20_rf.md
@@ -20,7 +20,7 @@
 Hivemall Random Forest supports libsvm-like sparse inputs. 
 
 > #### Note
-> This feature, i.e., Sparse input support in Random Forest, is supported 
since Hivemall v0.5-rc.1 or later._
+> This feature, i.e., Sparse input support in Random Forest, is supported 
in Hivemall v0.5.0 or later.
 > [`feature_hashing`](http://hivemall.incubator.apache.org/userguide/ft_engineering/hashing.html#featurehashing-function)
 >  function is useful to prepare feature vectors for Random Forest.
 
 <!-- toc -->
@@ -60,8 +60,10 @@ FROM (
   SELECT
     rowid, 
     m.model_weight,
+       -- v0.5.0 and later
     tree_predict(m.model_id, m.model, t.features, "-classification") as 
predicted
-    -- tree_predict(m.model_id, m.model, t.features, ${classification}) as 
predicted
+    -- before v0.5.0
+       -- tree_predict(m.model_id, m.model, t.features, ${classification}) as 
predicted
   FROM
     rf_model m
     LEFT OUTER JOIN -- CROSS JOIN

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e90f4abc/docs/gitbook/binaryclass/titanic_rf.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/binaryclass/titanic_rf.md 
b/docs/gitbook/binaryclass/titanic_rf.md
index 2b54074..3d51fa8 100644
--- a/docs/gitbook/binaryclass/titanic_rf.md
+++ b/docs/gitbook/binaryclass/titanic_rf.md
@@ -148,8 +148,9 @@ from
 
 `Q` and `C` represent quantitative variable and categorical variables, 
respectively.
 
-*Caution:* Note that the output of `guess_attribute_types` is not perfect. 
Revise it by your self.
-For example, `pclass` is a categorical variable.
+> #### Caution
+> Note that the output of `guess_attribute_types` is not perfect. Revise it 
yourself.
+> For example, `pclass` is a categorical variable.
 
 ```sql
 set hivevar:attrs=C,C,C,Q,Q,Q,C,Q,C,C;
@@ -159,7 +160,6 @@ create table model_rf
 AS
 select
   train_randomforest_classifier(features, survived, "-trees 500 -attrs 
${attrs}") 
-    -- as (model_id, model_type, pred_model, var_importance, oob_errors, 
oob_tests)
 from
   train_rf
 ;
@@ -192,24 +192,23 @@ FROM (
   SELECT
     passengerid,
     -- rf_ensemble(predicted) as predicted
-    -- hivemall v0.5-rc.1 or later
+    -- v0.5.0 or later
     rf_ensemble(predicted.value, predicted.posteriori, model_weight) as 
predicted
     -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- 
avoid OOB accuracy (i.e., model_weight)
   FROM (
     SELECT
       t.passengerid, 
-      -- hivemall v0.4.1-alpha.3 or later
+      -- from v0.4.1-alpha.3 to v0.4.2-rc4
       -- tree_predict(p.model_id, p.model_type, p.pred_model, t.features, 
${classification}) as predicted
-      -- hivemall v0.5-rc.1 or later
+      -- v0.5.0 or later
       p.model_weight,
-         tree_predict(p.model_id, p.model, t.features, "-classification") as 
predicted
-         -- tree_predict(p.model_id, p.model, t.features, ${classification}) 
as predicted
-      -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, 
${classification}) as predicted -- to use the old model in v0.5-rc.1 or later
+      tree_predict(p.model_id, p.model, t.features, "-classification") as 
predicted
+      -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, 
${classification}) as predicted -- to use the old model in v0.5.0 or later
     FROM (
       SELECT 
-        -- hivemall v0.4.1-alpha.3 or later
+        -- from v0.4.1-alpha.3 to v0.4.2-rc4
         -- model_id, model_type, pred_model
-        -- hivemall v0.5-rc.1 or later
+        -- v0.5.0 or later
         model_id, model_weight, model
       FROM 
         model_rf 
@@ -224,7 +223,7 @@ FROM (
 ```
 
 > #### Caution
-> `tree_predict_v1` is for the backward compatibility for using prediction 
models built before `v0.5-rc.1` on `v0.5-rc.1` or later.
+> `tree_predict_v1` is provided for backward compatibility, so that prediction 
models built before `v0.5.0` can be used on `v0.5.0` or later.
 
 # Kaggle submission
 
@@ -251,7 +250,7 @@ Accuracy would gives `0.76555` for a Kaggle submission.
 # Graphviz export
 
 > #### Note
-> `tree_export` feature is supported from Hivemall v0.5-rc.1 or later.
+> `tree_export` feature is supported from Hivemall v0.5.0 or later.
 > Better to limit tree depth on training by `-depth` option to plot a Decision 
 > Tree.
 
 Hivemall provides `tree_export` to export a decision tree into 
[Graphviz](http://www.graphviz.org/) or human-readable Javascript format. You 
can find the usage by issuing the following query:
@@ -336,24 +335,24 @@ FROM (
   SELECT
     passengerid,
     -- rf_ensemble(predicted) as predicted
-    -- hivemall v0.5-rc.1 or later
+    -- v0.5.0 or later
     rf_ensemble(predicted.value, predicted.posteriori, model_weight) as 
predicted
     -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- 
avoid OOB accuracy (i.e., model_weight)
   FROM (
     SELECT
       t.passengerid, 
-      -- hivemall v0.4.1-alpha.3 or later
+      -- from v0.4.1-alpha.3 to v0.4.2-rc4
       -- tree_predict(p.model_id, p.model_type, p.pred_model, t.features, 
${classification}) as predicted
-      -- hivemall v0.5-rc.1 or later
+      -- v0.5.0 or later
       p.model_weight,
       tree_predict(p.model_id, p.model, t.features, "-classification") as 
predicted
       -- tree_predict(p.model_id, p.model, t.features, ${classification}) as 
predicted
-      -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, 
${classification}) as predicted -- to use the old model in v0.5-rc.1 or later
+      -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, 
${classification}) as predicted -- to use the old model in v0.5.0 or later
     FROM (
       SELECT 
-        -- hivemall v0.4.1-alpha.3 or later
+        -- from v0.4.1-alpha.3 to v0.4.2-rc4
         -- model_id, model_type, pred_model
-        -- hivemall v0.5-rc.1 or later
+        -- v0.5.0 or later
         model_id, model_weight, model
       FROM 
         model_rf_07

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e90f4abc/docs/gitbook/multiclass/iris_randomforest.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/multiclass/iris_randomforest.md 
b/docs/gitbook/multiclass/iris_randomforest.md
index bfc197f..73ea4a3 100644
--- a/docs/gitbook/multiclass/iris_randomforest.md
+++ b/docs/gitbook/multiclass/iris_randomforest.md
@@ -94,17 +94,19 @@ CREATE TABLE model
   STORED AS SEQUENCEFILE 
 AS
 select 
-  train_randomforest_classifier(features, label) 
-  -- hivemall v0.4.1-alpha.2 and before
+  train_randomforest_classifier(features, label)
+  -- v0.5.0 and later
+  -- train_randomforest_classifier(features, label) as (model_id, 
model_weight, model, var_importance, oob_errors, oob_tests)
+  -- v0.4.1-alpha.2 and before
   -- train_randomforest_classifier(features, label) as (pred_model, 
var_importance, oob_errors, oob_tests)
-  -- hivemall v0.4.1 and later
+  -- from v0.4.1 to v0.4.2-rc4
   -- train_randomforest_classifier(features, label) as (model_id, model_type, 
pred_model, var_importance, oob_errors, oob_tests)
 from
   training;
 ```
 
 > #### Caution
-> The default `TEXTFILE` should not be used for model table when using 
Javascript output through `-output javascript` option.
+> Note that the model storage format differs between versions, as seen 
above.
 
 ```sql
 hive> desc extended model;
@@ -163,7 +165,7 @@ usage: train_randomforest_classifier(array<double|string> 
features, int
                                      features [default:
                                      ceil(sqrt(x[0].length))].
                                      int(num_variables * x[0].length) is
-                                     considered if num_variable is (0,1
+                                     considered if num_variable is (0,1]
 ```
 
 > #### Caution
@@ -215,19 +217,19 @@ as
 SELECT
   rowid,
   -- rf_ensemble(predicted) as predicted
-  -- hivemall v0.5-rc.1 or later
+  -- v0.5.0 or later
   rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted
   -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid 
OOB accuracy (i.e., model_weight)
 FROM (
   SELECT
     rowid, 
-    -- hivemall v0.4.1 and later
+    -- from v0.4.1 to v0.4.2-rc4
     -- tree_predict(p.model_id, p.model_type, p.pred_model, t.features, 
${classification}) as predicted
-    -- hivemall v0.5-rc.1 or later
+    -- v0.5.0 or later
     p.model_weight,
     tree_predict(p.model_id, p.model, t.features, "-classification") as 
predicted
     -- tree_predict(p.model_id, p.model, t.features, ${classification}) as 
predicted
-    -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, 
${classification}) as predicted -- to use the old model in v0.5-rc.1 or later
+    -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, 
${classification}) as predicted -- to use the old model in v0.5.0 or later
   FROM
     model p
     LEFT OUTER JOIN -- CROSS JOIN
@@ -238,8 +240,11 @@ group by
 ;
 ```
 
+> #### Note
+> Left outer join without a join condition (i.e., `model p LEFT OUTER JOIN 
training t`) is a trick to fix the left table for cross join.
+
 > #### Caution
-> `tree_predict_v1` is for the backward compatibility for using prediction 
models built before `v0.5-rc.1` on `v0.5-rc.1` or later.
+> `tree_predict_v1` is provided for backward compatibility, so that prediction 
models built before `v0.5.0` can be used on `v0.5.0` or later.
 
 ### Parallelize Prediction
 
@@ -251,29 +256,30 @@ set hive.auto.convert.join=true;
 SET hive.mapjoin.optimized.hashtable=false;
 SET mapred.reduce.tasks=8;
 
+drop table predicted;
 create table predicted
 as
 SELECT
   rowid,
   -- rf_ensemble(predicted) as predicted
-  -- hivemall v0.5-rc.1 or later
+  -- v0.5.0 or later
   rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted
   -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid 
OOB accuracy (i.e., model_weight)
 FROM (
   SELECT
     t.rowid, 
-    -- hivemall v0.4.1 and later
+    -- from v0.4.1 to v0.4.2-rc4
     -- tree_predict(p.model_id, p.model_type, p.pred_model, t.features, 
${classification}) as predicted
-    -- hivemall v0.5-rc.1 or later
+    -- v0.5.0 or later
     p.model_weight,
     tree_predict(p.model_id, p.model, t.features, "-classification") as 
predicted
     -- tree_predict(p.model_id, p.model, t.features, ${classification}) as 
predicted
-    -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, 
${classification}) as predicted as predicted -- to use the old model in 
v0.5-rc.1 or later
+    -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, 
${classification}) as predicted -- to use the old model in v0.5.0 
or later
   FROM (
     SELECT 
-      -- hivemall v0.4.1 and later
+      -- from v0.4.1 to v0.4.2-rc4
       -- model_id, model_type, pred_model
-      -- hivemall v0.5-rc.1 or later
+      -- v0.5.0 or later
       model_id, model_weight, model
     FROM model
     DISTRIBUTE BY rand(1)
@@ -295,13 +301,13 @@ select count(1) from training;
 set hivevar:total_cnt=150;
 
 WITH t1 as (
-SELECT
-  t.rowid,
-  t.label as actual,
-  p.predicted.label as predicted
-FROM
-  predicted p
-  LEFT OUTER JOIN training t ON (t.rowid = p.rowid)
+  SELECT
+    t.rowid,
+    t.label as actual,
+    p.predicted.label as predicted
+  FROM
+    predicted p
+    LEFT OUTER JOIN training t ON (t.rowid = p.rowid)
 )
 SELECT
   count(1) / ${total_cnt}
@@ -316,7 +322,7 @@ WHERE
 # Graphviz export
 
 > #### Note
-> `tree_export` feature is supported from Hivemall v0.5-rc.1 or later.
+> `tree_export` feature is supported from Hivemall v0.5.0 or later.
 > Better to limit tree depth on training by `-depth` option to plot a Decision 
 > Tree.
 
 Hivemall provides `tree_export` to export a decision tree into 
[Graphviz](http://www.graphviz.org/) or human-readable Javascript format. You 
can find the usage by issuing the following query:

Reply via email to