Repository: incubator-hivemall
Updated Branches:
  refs/heads/master 9f01ebf20 -> 9876d0631
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/9876d063/docs/gitbook/multiclass/iris_scw.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/multiclass/iris_scw.md b/docs/gitbook/multiclass/iris_scw.md index 79cdaf4..2d1b8bb 100644 --- a/docs/gitbook/multiclass/iris_scw.md +++ b/docs/gitbook/multiclass/iris_scw.md @@ -16,311 +16,65 @@ specific language governing permissions and limitations under the License. --> - -*NOTE: RandomForest is being supported from Hivemall v0.4 or later.* -# Dataset - -* https://archive.ics.uci.edu/ml/datasets/Iris - -``` -Attribute Information: - 1. sepal length in cm - 2. sepal width in cm - 3. petal length in cm - 4. petal width in cm - 5. class: - -- Iris Setosa - -- Iris Versicolour - -- Iris Virginica -``` - -# Table preparation - -```sql -create database iris; -use iris; - -create external table raw ( - sepal_length int, - sepal_width int, - petal_length int, - petak_width int, - class string -) -ROW FORMAT DELIMITED - FIELDS TERMINATED BY ',' - LINES TERMINATED BY '\n' -STORED AS TEXTFILE LOCATION '/dataset/iris/raw'; - -$ sed '/^$/d' iris.data | hadoop fs -put - /dataset/iris/raw/iris.data -``` - -```sql -create table label_mapping -as -select - class, - rank - 1 as label -from ( -select - distinct class, - dense_rank() over (order by class) as rank -from - raw -) t -; -``` +# Training (multiclass classification) ```sql -create table training -as -select - rowid() as rowid, - array(t1.sepal_length, t1.sepal_width, t1.petal_length, t1.petak_width) as features, - t2.label -from - raw t1 - JOIN label_mapping t2 ON (t1.class = t2.class) -; -``` - -# Training - -`train_randomforest_classifier` takes a dense `features` in double[] and a `label` starting from 0. 
- -```sql -CREATE TABLE model -STORED AS SEQUENCEFILE -AS +create table model_scw1 as select - train_randomforest_classifier(features, label) - -- hivemall v0.4.1-alpha.2 and before - -- train_randomforest_classifier(features, label) as (pred_model, var_importance, oob_errors, oob_tests) - -- hivemall v0.4.1 and later - -- train_randomforest_classifier(features, label) as (model_id, model_type, pred_model, var_importance, oob_errors, oob_tests) -from - training; -``` -*Note: The default TEXTFILE should not be used for model table when using Javascript output through "-output javascript" option.* - -``` -hive> desc model; -model_id int -model_type int -pred_model string -var_importance array<double> -oob_errors int -oob_tests int -``` - -## Training options - -"-help" option shows usage of the function. - -``` -select train_randomforest_classifier(features, label, "-help") from training; - -> FAILED: UDFArgumentException -usage: train_randomforest_classifier(double[] features, int label [, - string options]) - Returns a relation consists of <int model_id, - int model_type, string pred_model, array<double> var_importance, - int oob_errors, int oob_tests> [-attrs <arg>] [-depth <arg>] - [-disable_compression] [-help] [-leafs <arg>] [-output <arg>] - [-rule <arg>] [-seed <arg>] [-splits <arg>] [-trees <arg>] [-vars - <arg>] - -attrs,--attribute_types <arg> Comma separated attribute types (Q for - quantitative variable and C for - categorical variable. 
e.g., [Q,C,Q,C]) - -depth,--max_depth <arg> The maximum number of the tree depth - [default: Integer.MAX_VALUE] - -disable_compression Whether to disable compression of the - output script [default: false] - -help Show function help - -leafs,--max_leaf_nodes <arg> The maximum number of leaf nodes - [default: Integer.MAX_VALUE] - -output,--output_type <arg> The output type (serialization/ser or - opscode/vm or javascript/js) [default: - serialization] - -rule,--split_rule <arg> Split algorithm [default: GINI, ENTROPY] - -seed <arg> seed value in long [default: -1 - (random)] - -splits,--min_split <arg> A node that has greater than or equals - to `min_split` examples will split - [default: 2] - -trees,--num_trees <arg> The number of trees for each task - [default: 50] - -vars,--num_variables <arg> The number of random selected features - [default: ceil(sqrt(x[0].length))]. - int(num_variables * x[0].length) is - considered if num_variable is (0,1] + label, + feature, + argmin_kld(weight, covar) as weight +from + (select + train_multiclass_scw(features, label) as (label, feature, weight, covar) + from + training_x10 + ) t +group by label, feature; ``` -*Caution: "-num_trees" controls the number of trees for each task, not the total number of trees.* -### Parallelize Training - -To parallelize RandomForest training, you can use UNION ALL as follows: +# Predict ```sql -CREATE TABLE model -STORED AS SEQUENCEFILE -AS -select - train_randomforest_classifier(features, label, '-trees 25') -from - training -UNION ALL +create or replace view predict_scw1 +as select - train_randomforest_classifier(features, label, '-trees 25') -from - training -; -``` - -### Learning stats - -[`Variable importance`](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#varimp) and [`Out Of Bag (OOB) error rate`](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#ooberr) of RandomForest can be shown as follows: - -```sql + rowid, + m.col0 as score, + m.col1 as label 
+from ( select - array_sum(var_importance) as var_importance, - sum(oob_errors) / sum(oob_tests) as oob_err_rate -from - model; -``` -> [2.81010338879605,0.4970357753626371,23.790369091407698,14.315316390235273] 0.05333333333333334 - -### Output prediction model by Javascipt - -```sql -CREATE TABLE model_javascript -STORED AS SEQUENCEFILE -AS -select train_randomforest_classifier(features, label, "-output_type js -disable_compression") -from training; - -select model from model_javascript limit 1; -``` - -```js -if(x[3] <= 0.5) { - 0; -} else { - if(x[2] <= 4.5) { - if(x[3] <= 1.5) { - if(x[0] <= 4.5) { - 1; - } else { - if(x[0] <= 5.5) { - 1; - } else { - if(x[1] <= 2.5) { - 1; - } else { - 1; - } - } - } - } else { - 2; - } - } else { - if(x[3] <= 1.5) { - 2; - } else { - 2; - } - } -} -``` - -# Prediction - -```sql -set hivevar:classification=true; -set hive.auto.convert.join=true; -set hive.mapjoin.optimized.hashtable=false; - -create table predicted_vm -as -SELECT - rowid, - rf_ensemble(predicted) as predicted -FROM ( - SELECT - rowid, - -- hivemall v0.4.1-alpha.2 and before - -- tree_predict(p.model, t.features, ${classification}) as predicted - -- hivemall v0.4.1 and later - tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted - FROM - model p - LEFT OUTER JOIN -- CROSS JOIN - training t -) t1 -group by - rowid -; -``` -_Note: Javascript outputs can be evaluated by `js_tree_predict`._ - -### Parallelize Prediction - -The following query runs predictions in N-parallel. It would reduce elapsed time for prediction almost by N. 
- -```sql -SET hivevar:classification=true; -set hive.auto.convert.join=true; -SET hive.mapjoin.optimized.hashtable=false; -SET mapred.reduce.tasks=8; - -create table predicted_vm -as -SELECT - rowid, - rf_ensemble(predicted) as predicted -FROM ( - SELECT - t.rowid, - -- hivemall v0.4.1-alpha.2 and before - -- tree_predict(p.pred_model, t.features, ${classification}) as predicted - -- hivemall v0.4.1 and later - tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted - FROM ( - SELECT model_id, model_type, pred_model - FROM model - DISTRIBUTE BY rand(1) - ) p - LEFT OUTER JOIN training t + rowid, + maxrow(score, label) as m +from ( + select + t.rowid, + m.label, + sum(m.weight * t.value) as score + from + test20p_exploded t LEFT OUTER JOIN + model_scw1 m ON (t.feature = m.feature) + group by + t.rowid, m.label ) t1 -group by - rowid -; +group by rowid +) t2; ``` # Evaluation ```sql -select count(1) from training; -> 150 - -set hivevar:total_cnt=150; +create or replace view eval_scw1 as +select + t.label as actual, + p.label as predicted +from + test20p t JOIN predict_scw1 p + on (t.rowid = p.rowid); -WITH t1 as ( -SELECT - t.rowid, - t.label as actual, - p.predicted.label as predicted -FROM - predicted_vm p - LEFT OUTER JOIN training t ON (t.rowid = p.rowid) -) -SELECT - count(1) / ${total_cnt} -FROM - t1 -WHERE - actual = predicted -; +select count(1)/30 from eval_scw1 +where actual = predicted; ``` -> 0.9533333333333334 + +> 0.9666666666666667 http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/9876d063/docs/gitbook/resources/images/iris.png ---------------------------------------------------------------------- diff --git a/docs/gitbook/resources/images/iris.png b/docs/gitbook/resources/images/iris.png new file mode 100644 index 0000000..1d8213d Binary files /dev/null and b/docs/gitbook/resources/images/iris.png differ 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/9876d063/resources/ddl/define-all-as-permanent.deprecated.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all-as-permanent.deprecated.hive b/resources/ddl/define-all-as-permanent.deprecated.hive
index 5558c4e..1cd604a 100644
--- a/resources/ddl/define-all-as-permanent.deprecated.hive
+++ b/resources/ddl/define-all-as-permanent.deprecated.hive
@@ -55,12 +55,6 @@ CREATE FUNCTION adadelta as 'hivemall.regression.AdaDeltaUDTF' USING JAR '${hive
 DROP FUNCTION IF EXISTS collect_all;
 CREATE FUNCTION collect_all as 'hivemall.tools.array.CollectAllUDAF' USING JAR '${hivemall_jar}';
 
-DROP FUNCTION IF EXISTS vm_tree_predict;
-CREATE FUNCTION vm_tree_predict as 'hivemall.smile.tools.TreePredictByStackMachineUDF' USING JAR '${hivemall_jar}';
-
-DROP FUNCTION IF EXISTS js_tree_predict;
-CREATE FUNCTION js_tree_predict as 'hivemall.smile.tools.TreePredictByJavascriptUDF' USING JAR '${hivemall_jar}';
-
 DROP FUNCTION IF EXISTS train_gbt_classifier;
 CREATE FUNCTION train_gbt_classifier as 'hivemall.smile.classification.GradientTreeBoostingClassifierUDTF' USING JAR '${hivemall_jar}';
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/9876d063/resources/ddl/define-all-as-permanent.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive
index 075e733..a3b6725 100644
--- a/resources/ddl/define-all-as-permanent.hive
+++ b/resources/ddl/define-all-as-permanent.hive
@@ -689,6 +689,9 @@ CREATE FUNCTION train_randomforest_regr as 'hivemall.smile.regression.RandomFore
 DROP FUNCTION IF EXISTS tree_predict;
 CREATE FUNCTION tree_predict as 'hivemall.smile.tools.TreePredictUDF' USING JAR '${hivemall_jar}';
 
+DROP FUNCTION IF EXISTS tree_export;
+CREATE FUNCTION tree_export as 'hivemall.smile.tools.TreeExportUDF' USING JAR '${hivemall_jar}';
+
 DROP FUNCTION IF EXISTS rf_ensemble;
 CREATE FUNCTION rf_ensemble as 'hivemall.smile.tools.RandomForestEnsembleUDAF' USING JAR '${hivemall_jar}';
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/9876d063/resources/ddl/define-all.deprecated.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all.deprecated.hive b/resources/ddl/define-all.deprecated.hive
index 001666b..ded195f 100644
--- a/resources/ddl/define-all.deprecated.hive
+++ b/resources/ddl/define-all.deprecated.hive
@@ -55,12 +55,6 @@ create temporary function adadelta as 'hivemall.regression.AdaDeltaUDTF';
 drop temporary function if exists collect_all;
 create temporary function collect_all as 'hivemall.tools.array.CollectAllUDAF';
 
-drop temporary function if exists vm_tree_predict;
-create temporary function vm_tree_predict as 'hivemall.smile.tools.TreePredictByStackMachineUDF';
-
-drop temporary function if exists js_tree_predict;
-create temporary function js_tree_predict as 'hivemall.smile.tools.TreePredictByJavascriptUDF';
-
 drop temporary function if exists train_gbt_classifier;
 create temporary function train_gbt_classifier as 'hivemall.smile.classification.GradientTreeBoostingClassifierUDTF';
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/9876d063/resources/ddl/define-all.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive
index 7f5c727..77b6a98 100644
--- a/resources/ddl/define-all.hive
+++ b/resources/ddl/define-all.hive
@@ -681,6 +681,9 @@ create temporary function train_randomforest_regr as 'hivemall.smile.regression.
 drop temporary function if exists tree_predict;
 create temporary function tree_predict as 'hivemall.smile.tools.TreePredictUDF';
 
+drop temporary function if exists tree_export;
+create temporary function tree_export as 'hivemall.smile.tools.TreeExportUDF';
+
 drop temporary function if exists rf_ensemble;
 create temporary function rf_ensemble as 'hivemall.smile.tools.RandomForestEnsembleUDAF';
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/9876d063/resources/ddl/define-all.spark
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark
index fc4a60e..2193cd8 100644
--- a/resources/ddl/define-all.spark
+++ b/resources/ddl/define-all.spark
@@ -665,6 +665,9 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION train_randomforest_regr AS 'hivemall.s
 sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS tree_predict")
 sqlContext.sql("CREATE TEMPORARY FUNCTION tree_predict AS 'hivemall.smile.tools.TreePredictUDF'")
 
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS tree_export")
+sqlContext.sql("CREATE TEMPORARY FUNCTION tree_export AS 'hivemall.smile.tools.TreeExportUDF'")
+
 sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS rf_ensemble")
 sqlContext.sql("CREATE TEMPORARY FUNCTION rf_ensemble AS 'hivemall.smile.tools.RandomForestEnsembleUDAF'")
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/9876d063/resources/ddl/define-udfs.td.hql
----------------------------------------------------------------------
diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql
index 1d11d1a..b5e8ab3 100644
--- a/resources/ddl/define-udfs.td.hql
+++ b/resources/ddl/define-udfs.td.hql
@@ -173,6 +173,7 @@ create temporary function l2_norm as 'hivemall.tools.math.L2NormUDAF';
 create temporary function dimsum_mapper as 'hivemall.knn.similarity.DIMSUMMapperUDTF';
 create temporary function train_classifier as 'hivemall.classifier.GeneralClassifierUDTF';
 create temporary function train_regression as 'hivemall.regression.GeneralRegressionUDTF';
+create temporary function tree_export as 'hivemall.smile.tools.TreeExportUDF';
 
 -- NLP features
 create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';
