Repository: incubator-hivemall Updated Branches: refs/heads/master ee25b5893 -> e90f4abcb
[HIVEMALL-174][DOC] Update RandomForest document to reflect changes in usages ## What changes were proposed in this pull request? Update RandomForest document to reflect changes in usages ## What type of PR is it? Documentation ## What is the Jira issue? https://issues.apache.org/jira/browse/HIVEMALL-174 Author: Makoto Yui <m...@apache.org> Closes #136 from myui/rf_docs. Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/e90f4abc Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/e90f4abc Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/e90f4abc Branch: refs/heads/master Commit: e90f4abcb54396cd8f9362727a7f7f1e9b28617d Parents: ee25b58 Author: Makoto Yui <m...@apache.org> Authored: Tue Mar 13 19:45:24 2018 +0900 Committer: Makoto Yui <m...@apache.org> Committed: Tue Mar 13 19:45:24 2018 +0900 ---------------------------------------------------------------------- .../RandomForestClassifierUDTF.java | 8 +-- docs/gitbook/binaryclass/news20_rf.md | 6 ++- docs/gitbook/binaryclass/titanic_rf.md | 37 +++++++------- docs/gitbook/multiclass/iris_randomforest.md | 54 +++++++++++--------- 4 files changed, 56 insertions(+), 49 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e90f4abc/core/src/main/java/hivemall/smile/classification/RandomForestClassifierUDTF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/smile/classification/RandomForestClassifierUDTF.java b/core/src/main/java/hivemall/smile/classification/RandomForestClassifierUDTF.java index 6e8a650..d0db3a1 100644 --- a/core/src/main/java/hivemall/smile/classification/RandomForestClassifierUDTF.java +++ b/core/src/main/java/hivemall/smile/classification/RandomForestClassifierUDTF.java @@ -82,9 +82,9 @@ import 
org.apache.hadoop.mapred.Reporter; @Description( name = "train_randomforest_classifier", - value = "_FUNC_(array<double|string> features, int label [, const array<double> classWeights, const string options]) - " - + "Returns a relation consists of " - + "<int model_id, int model_type, string pred_model, array<double> var_importance, int oob_errors, int oob_tests, double weight>") + value = "_FUNC_(array<double|string> features, int label [, const string options, const array<double> classWeights])" + + "- Returns a relation consists of " + + "<string model_id, double model_weight, string model, array<double> var_importance, int oob_errors, int oob_tests>") public final class RandomForestClassifierUDTF extends UDTFWithOptions { private static final Log logger = LogFactory.getLog(RandomForestClassifierUDTF.class); @@ -150,7 +150,7 @@ public final class RandomForestClassifierUDTF extends UDTFWithOptions { opts.addOption("rule", "split_rule", true, "Split algorithm [default: GINI, ENTROPY]"); opts.addOption("stratified", "stratified_sampling", false, "Enable Stratified sampling for unbalanced data"); - opts.addOption("subsample", true, "Sampling rate in range (0.0,1.0]"); + opts.addOption("subsample", true, "Sampling rate in range (0.0,1.0]. [default: 1.0]"); return opts; } http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e90f4abc/docs/gitbook/binaryclass/news20_rf.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/news20_rf.md b/docs/gitbook/binaryclass/news20_rf.md index 327939b..065c736 100644 --- a/docs/gitbook/binaryclass/news20_rf.md +++ b/docs/gitbook/binaryclass/news20_rf.md @@ -20,7 +20,7 @@ Hivemall Random Forest supports libsvm-like sparse inputs. 
> #### Note -> This feature, i.e., Sparse input support in Random Forest, is supported since Hivemall v0.5-rc.1 or later._ +> This feature, i.e., Sparse input support in Random Forest, is supported since Hivemall v0.5.0 or later._ > [`feature_hashing`](http://hivemall.incubator.apache.org/userguide/ft_engineering/hashing.html#featurehashing-function) > function is useful to prepare feature vectors for Random Forest. <!-- toc --> @@ -60,8 +60,10 @@ FROM ( SELECT rowid, m.model_weight, + -- v0.5.0 and later tree_predict(m.model_id, m.model, t.features, "-classification") as predicted - -- tree_predict(m.model_id, m.model, t.features, ${classification}) as predicted + -- before v0.5.0 + -- tree_predict(m.model_id, m.model, t.features, ${classification}) as predicted FROM rf_model m LEFT OUTER JOIN -- CROSS JOIN http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e90f4abc/docs/gitbook/binaryclass/titanic_rf.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/titanic_rf.md b/docs/gitbook/binaryclass/titanic_rf.md index 2b54074..3d51fa8 100644 --- a/docs/gitbook/binaryclass/titanic_rf.md +++ b/docs/gitbook/binaryclass/titanic_rf.md @@ -148,8 +148,9 @@ from `Q` and `C` represent quantitative variable and categorical variables, respectively. -*Caution:* Note that the output of `guess_attribute_types` is not perfect. Revise it by your self. -For example, `pclass` is a categorical variable. +> #### Caution +> Note that the output of `guess_attribute_types` is not perfect. Revise it by your self. +> For example, `pclass` is a categorical variable. 
```sql set hivevar:attrs=C,C,C,Q,Q,Q,C,Q,C,C; @@ -159,7 +160,6 @@ create table model_rf AS select train_randomforest_classifier(features, survived, "-trees 500 -attrs ${attrs}") - -- as (model_id, model_type, pred_model, var_importance, oob_errors, oob_tests) from train_rf ; @@ -192,24 +192,23 @@ FROM ( SELECT passengerid, -- rf_ensemble(predicted) as predicted - -- hivemall v0.5-rc.1 or later + -- v0.5.0 or later rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight) FROM ( SELECT t.passengerid, - -- hivemall v0.4.1-alpha.3 or later + -- from v0.4.1-alpha.3 to v0.4.2-rc4 -- tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted - -- hivemall v0.5-rc.1 or later + -- v0.5.0 or later p.model_weight, - tree_predict(p.model_id, p.model, t.features, "-classification") as predicted - -- tree_predict(p.model_id, p.model, t.features, ${classification}) as predicted - -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5-rc.1 or later + tree_predict(p.model_id, p.model, t.features, "-classification") as predicted + -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5.0 or later FROM ( SELECT - -- hivemall v0.4.1-alpha.3 or later + -- from v0.4.1-alpha.3 or v0.4.2-rc4 -- model_id, model_type, pred_model - -- hivemall v0.5-rc.1 or later + -- v0.5.0 or later model_id, model_weight, model FROM model_rf @@ -224,7 +223,7 @@ FROM ( ``` > #### Caution -> `tree_predict_v1` is for the backward compatibility for using prediction models built before `v0.5-rc.1` on `v0.5-rc.1` or later. +> `tree_predict_v1` is for the backward compatibility for using prediction models built before `v0.5.0` on `v0.5.0` or later. 
# Kaggle submission @@ -251,7 +250,7 @@ Accuracy would gives `0.76555` for a Kaggle submission. # Graphvis export > #### Note -> `tree_export` feature is supported from Hivemall v0.5-rc.1 or later. +> `tree_export` feature is supported from Hivemall v0.5.0 or later. > Better to limit tree depth on training by `-depth` option to plot a Decision > Tree. Hivemall provide `tree_export` to export a decision tree into [Graphviz](http://www.graphviz.org/) or human-readable Javascript format. You can find the usage by issuing the following query: @@ -336,24 +335,24 @@ FROM ( SELECT passengerid, -- rf_ensemble(predicted) as predicted - -- hivemall v0.5-rc.1 or later + -- v0.5.0 or later rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight) FROM ( SELECT t.passengerid, - -- hivemall v0.4.1-alpha.3 or later + -- from v0.4.1-alpha.3 or v0.4.2-rc4 -- tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted - -- hivemall v0.5-rc.1 or later + -- v0.5.0 or later p.model_weight, tree_predict(p.model_id, p.model, t.features, "-classification") as predicted -- tree_predict(p.model_id, p.model, t.features, ${classification}) as predicted - -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5-rc.1 or later + -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5.0 or later FROM ( SELECT - -- hivemall v0.4.1-alpha.3 or later + -- from v0.4.1-alpha.3 to v0.4.2-rc4 -- model_id, model_type, pred_model - -- hivemall v0.5-rc.1 or later + -- v0.5.0 or later model_id, model_weight, model FROM model_rf_07 http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e90f4abc/docs/gitbook/multiclass/iris_randomforest.md 
---------------------------------------------------------------------- diff --git a/docs/gitbook/multiclass/iris_randomforest.md b/docs/gitbook/multiclass/iris_randomforest.md index bfc197f..73ea4a3 100644 --- a/docs/gitbook/multiclass/iris_randomforest.md +++ b/docs/gitbook/multiclass/iris_randomforest.md @@ -94,17 +94,19 @@ CREATE TABLE model STORED AS SEQUENCEFILE AS select - train_randomforest_classifier(features, label) - -- hivemall v0.4.1-alpha.2 and before + train_randomforest_classifier(features, label) + -- v0.5.0 and later + -- train_randomforest_classifier(features, label) as (model_id, model_weight, model, var_importance, oob_errors, oob_tests) + -- v0.4.1-alpha.2 and before -- train_randomforest_classifier(features, label) as (pred_model, var_importance, oob_errors, oob_tests) - -- hivemall v0.4.1 and later + -- from v0.4.1 to v0.4.2-rc4 -- train_randomforest_classifier(features, label) as (model_id, model_type, pred_model, var_importance, oob_errors, oob_tests) from training; ``` > #### Caution -> The default `TEXTFILE` should not be used for model table when using Javascript output through `-output javascript` option. +> Note that the model storage format differs between versions, as shown above. ```sql hive> desc extended model; @@ -163,7 +165,7 @@ usage: train_randomforest_classifier(array<double|string> features, int features [default: ceil(sqrt(x[0].length))]. 
int(num_variables * x[0].length) is - considered if num_variable is (0,1 + considered if num_variable is (0,1] ``` > #### Caution @@ -215,19 +217,19 @@ as SELECT rowid, -- rf_ensemble(predicted) as predicted - -- hivemall v0.5-rc.1 or later + -- v0.5.0 or later rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight) FROM ( SELECT rowid, - -- hivemall v0.4.1 and later + -- from v0.4.1 to v0.4.2-rc4 -- tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted - -- hivemall v0.5-rc.1 or later + -- v0.5.0 or later p.model_weight, tree_predict(p.model_id, p.model, t.features, "-classification") as predicted -- tree_predict(p.model_id, p.model, t.features, ${classification}) as predicted - -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5-rc.1 or later + -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted -- to use the old model in v0.5.0 or later FROM model p LEFT OUTER JOIN -- CROSS JOIN @@ -238,8 +240,11 @@ group by ; ``` +> #### Note +> Left outer join without a join condition (i.e., `model p LEFT OUTER JOIN training t`) is a trick to fix the left table for cross join. + > #### Caution -> `tree_predict_v1` is for the backward compatibility for using prediction models built before `v0.5-rc.1` on `v0.5-rc.1` or later. +> `tree_predict_v1` is provided for backward compatibility, so that prediction models built before `v0.5.0` can be used on `v0.5.0` or later. 
### Parallelize Prediction @@ -251,29 +256,30 @@ set hive.auto.convert.join=true; SET hive.mapjoin.optimized.hashtable=false; SET mapred.reduce.tasks=8; +drop table predicted; create table predicted as SELECT rowid, -- rf_ensemble(predicted) as predicted - -- hivemall v0.5-rc.1 or later + -- v0.5.0 or later rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted -- rf_ensemble(predicted.value, predicted.posteriori) as predicted -- avoid OOB accuracy (i.e., model_weight) FROM ( SELECT t.rowid, - -- hivemall v0.4.1 and later + -- from v0.4.1 to v0.4.2-rc4 -- tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted - -- hivemall v0.5-rc.1 or later + -- v0.5.0 or later p.model_weight, tree_predict(p.model_id, p.model, t.features, "-classification") as predicted -- tree_predict(p.model_id, p.model, t.features, ${classification}) as predicted - -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted as predicted -- to use the old model in v0.5-rc.1 or later + -- tree_predict_v1(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted as predicted -- to use the old model in v0.5.0 or later FROM ( SELECT - -- hivemall v0.4.1 and later + -- from v0.4.1 to v0.4.2-rc4 -- model_id, model_type, pred_model - -- hivemall v0.5-rc.1 or later + -- v0.5.0 or later model_id, model_weight, model FROM model DISTRIBUTE BY rand(1) @@ -295,13 +301,13 @@ select count(1) from training; set hivevar:total_cnt=150; WITH t1 as ( -SELECT - t.rowid, - t.label as actual, - p.predicted.label as predicted -FROM - predicted p - LEFT OUTER JOIN training t ON (t.rowid = p.rowid) + SELECT + t.rowid, + t.label as actual, + p.predicted.label as predicted + FROM + predicted p + LEFT OUTER JOIN training t ON (t.rowid = p.rowid) ) SELECT count(1) / ${total_cnt} @@ -316,7 +322,7 @@ WHERE # Graphvis export > #### Note -> `tree_export` feature is supported from Hivemall 
v0.5-rc.1 or later. +> `tree_export` feature is supported in Hivemall v0.5.0 or later. > Better to limit tree depth on training by `-depth` option to plot a Decision > Tree. Hivemall provide `tree_export` to export a decision tree into [Graphviz](http://www.graphviz.org/) or human-readable Javascript format. You can find the usage by issuing the following query: