Added a gitbook userguide Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/370e2aa3 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/370e2aa3 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/370e2aa3
Branch: refs/heads/master Commit: 370e2aa339bc5ee5cdb95e40f4e4e31cd51283da Parents: d3d7b55 Author: myui <[email protected]> Authored: Sun Oct 23 03:44:52 2016 +0900 Committer: myui <[email protected]> Committed: Sun Oct 23 04:11:23 2016 +0900 ---------------------------------------------------------------------- docs/gitbook/.gitignore | 3 + docs/gitbook/README.md | 22 + docs/gitbook/SUMMARY.md | 129 +++++ docs/gitbook/anomaly/lof.md | 185 ++++++ docs/gitbook/binaryclass/a9a.md | 0 docs/gitbook/binaryclass/a9a_dataset.md | 43 ++ docs/gitbook/binaryclass/a9a_lr.md | 79 +++ docs/gitbook/binaryclass/a9a_minibatch.md | 37 ++ docs/gitbook/binaryclass/kdd2010a.md | 0 docs/gitbook/binaryclass/kdd2010a_dataset.md | 72 +++ docs/gitbook/binaryclass/kdd2010a_scw.md | 185 ++++++ docs/gitbook/binaryclass/kdd2010b.md | 0 docs/gitbook/binaryclass/kdd2010b_arow.md | 51 ++ docs/gitbook/binaryclass/kdd2010b_dataset.md | 59 ++ docs/gitbook/binaryclass/news20.md | 0 docs/gitbook/binaryclass/news20_adagrad.md | 170 ++++++ docs/gitbook/binaryclass/news20_dataset.md | 84 +++ docs/gitbook/binaryclass/news20_pa.md | 243 ++++++++ docs/gitbook/binaryclass/news20_scw.md | 272 +++++++++ docs/gitbook/binaryclass/webspam.md | 0 docs/gitbook/binaryclass/webspam_dataset.md | 76 +++ docs/gitbook/binaryclass/webspam_scw.md | 136 +++++ docs/gitbook/book.json | 64 +++ docs/gitbook/eval/datagen.md | 0 docs/gitbook/eval/lr_datagen.md | 92 +++ docs/gitbook/eval/stat_eval.md | 56 ++ docs/gitbook/ft_engineering/ft_trans.md | 0 docs/gitbook/ft_engineering/hashing.md | 103 ++++ docs/gitbook/ft_engineering/quantify.md | 164 ++++++ docs/gitbook/ft_engineering/scaling.md | 173 ++++++ docs/gitbook/ft_engineering/tfidf.md | 149 +++++ docs/gitbook/ft_engineering/vectorizer.md | 42 ++ docs/gitbook/getting_started/README.md | 1 + docs/gitbook/getting_started/input-format.md | 214 +++++++ docs/gitbook/getting_started/installation.md | 25 + .../getting_started/permanent-functions.md | 42 ++ 
docs/gitbook/misc/generic_funcs.md | 210 +++++++ docs/gitbook/misc/tokenizer.md | 30 + docs/gitbook/misc/topk.md | 288 ++++++++++ docs/gitbook/multiclass/iris.md | 0 docs/gitbook/multiclass/iris_dataset.md | 203 +++++++ docs/gitbook/multiclass/iris_randomforest.md | 307 ++++++++++ docs/gitbook/multiclass/iris_scw.md | 307 ++++++++++ docs/gitbook/multiclass/news20.md | 0 docs/gitbook/multiclass/news20_dataset.md | 77 +++ docs/gitbook/multiclass/news20_ensemble.md | 180 ++++++ .../multiclass/news20_one-vs-the-rest.md | 330 +++++++++++ .../news20_one-vs-the-rest_dataset.md | 52 ++ docs/gitbook/multiclass/news20_pa.md | 90 +++ docs/gitbook/multiclass/news20_scw.md | 319 +++++++++++ docs/gitbook/pig/.gitkeep | 0 docs/gitbook/recommend/cf.md | 0 docs/gitbook/recommend/item_based_cf.md | 565 +++++++++++++++++++ docs/gitbook/recommend/movielens.md | 0 docs/gitbook/recommend/movielens_cv.md | 63 +++ docs/gitbook/recommend/movielens_dataset.md | 160 ++++++ docs/gitbook/recommend/movielens_fm.md | 249 ++++++++ docs/gitbook/recommend/movielens_mf.md | 137 +++++ docs/gitbook/recommend/news20.md | 0 docs/gitbook/recommend/news20_bbit_minhash.md | 50 ++ docs/gitbook/recommend/news20_jaccard.md | 123 ++++ docs/gitbook/recommend/news20_knn.md | 103 ++++ docs/gitbook/regression/e2006.md | 0 docs/gitbook/regression/e2006_arow.md | 259 +++++++++ docs/gitbook/regression/e2006_dataset.md | 72 +++ docs/gitbook/regression/kddcup12tr2.md | 0 docs/gitbook/regression/kddcup12tr2_adagrad.md | 109 ++++ docs/gitbook/regression/kddcup12tr2_dataset.md | 227 ++++++++ docs/gitbook/regression/kddcup12tr2_lr.md | 141 +++++ .../regression/kddcup12tr2_lr_amplify.md | 103 ++++ .../images/hivemall-logo-color-small.png | Bin 0 -> 33630 bytes docs/gitbook/resources/images/techstack.png | Bin 0 -> 198979 bytes docs/gitbook/spark/.gitkeep | 0 docs/gitbook/tips/README.md | 0 docs/gitbook/tips/addbias.md | 43 ++ docs/gitbook/tips/emr.md | 182 ++++++ docs/gitbook/tips/ensemble_learning.md | 180 ++++++ 
docs/gitbook/tips/general_tips.md | 0 docs/gitbook/tips/hadoop_tuning.md | 79 +++ docs/gitbook/tips/mixserver.md | 68 +++ docs/gitbook/tips/rand_amplify.md | 103 ++++ docs/gitbook/tips/rowid.md | 31 + docs/gitbook/tips/rt_prediction.md | 234 ++++++++ docs/gitbook/troubleshooting/README.md | 0 docs/gitbook/troubleshooting/asterisk.md | 3 + .../troubleshooting/mapjoin_classcastex.md | 8 + .../troubleshooting/mapjoin_task_error.md | 8 + docs/gitbook/troubleshooting/num_mappers.md | 20 + docs/gitbook/troubleshooting/oom.md | 20 + src/site/site.xml | 6 +- src/site/xdoc/index.xml.vm | 7 +- 91 files changed, 8711 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/.gitignore ---------------------------------------------------------------------- diff --git a/docs/gitbook/.gitignore b/docs/gitbook/.gitignore new file mode 100644 index 0000000..b12433d --- /dev/null +++ b/docs/gitbook/.gitignore @@ -0,0 +1,3 @@ +_book/ +node_modules/ +*.log http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/README.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/README.md b/docs/gitbook/README.md new file mode 100644 index 0000000..82602f8 --- /dev/null +++ b/docs/gitbook/README.md @@ -0,0 +1,22 @@ +# Introduction + +<div class="alert alert-info"> +Apache Hivemall is a collection of machine learning algorithms and versatile data analytics functions. It provides a number of ease of use machine learning functionalities through the <a href="https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF">Apache Hive UDF/UDAF/UDTF interface</a>. 
+</div> + +<div style="text-align:center"><img src="resources/images/hivemall-logo-color-small.png"/></div> + +Apache Hivemall offers a variety of functionalities: <strong>regression, classification, recommendation, anomaly detection, k-nearest neighbor, and feature engineering</strong>. It also supports state-of-the-art machine learning algorithms such as Soft Confidence Weighted, Adaptive Regularization of Weight Vectors, Factorization Machines, and AdaDelta. + +## Architecture + +Apache Hivemall is mainly designed to run on [Apache Hive](https://hive.apache.org/) but it also supports [Apache Pig](https://pig.apache.org/) and [Apache Spark](http://spark.apache.org/) for the runtime. +Thus, it can be considered as a cross platform library for machine learning; prediction models built by a batch query of Apache Hive can be used on Apache Spark/Pig, and conversely, prediction models build by Apache Spark can be used from Apache Hive/Pig. + +<div style="text-align:center"><img src="resources/images/techstack.png" width="80%" height="80%"/></div> + +--- + +<font color="gray"> +<sub>Apache Hivemall is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the <a href="http://incubator.apache.org/">Apache Incubator</a>.</sub> +</font> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/SUMMARY.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/SUMMARY.md b/docs/gitbook/SUMMARY.md new file mode 100644 index 0000000..d85f952 --- /dev/null +++ b/docs/gitbook/SUMMARY.md @@ -0,0 +1,129 @@ +# Summary + +## TABLE OF CONTENTS + +* [Getting Started](getting_started/README.md) + * [Installation](getting_started/installation.md) + * [Install as permanent functions](getting_started/permanent-functions.md) + * [Input Format](getting_started/input-format.md) + +* [Tips for Effective Hivemall](tips/README.md) + * [Explicit addBias() for better 
prediction](tips/addbias.md) + * [Use rand_amplify() to better prediction results](tips/rand_amplify.md) + * [Real-time Prediction on RDBMS](tips/rt_prediction.md) + * [Ensemble learning for stable prediction](tips/ensemble_learning.md) + * [Mixing models for a better prediction convergence (MIX server)](tips/mixserver.md) + * [Run Hivemall on Amazon Elastic MapReduce](tips/emr.md) + +* [General Hive/Hadoop tips](tips/general_tips.md) + * [Adding rowid for each row](tips/rowid.md) + * [Hadoop tuning for Hivemall](tips/hadoop_tuning.md) + +* [Troubleshooting](troubleshooting/README.md) + * [OutOfMemoryError in training](troubleshooting/oom.md) + * [SemanticException Generate Map Join Task Error: Cannot serialize object](troubleshooting/mapjoin_task_error.md) + * [Asterisk argument for UDTF does not work](troubleshooting/asterisk.md) + * [The number of mappers is less than input splits in Hadoop 2.x](troubleshooting/num_mappers.md) + * [Map-side Join causes ClassCastException on Tez](troubleshooting/mapjoin_classcastex.md) + +## Part II - Generic Features + +* [List of generic Hivemall functions](misc/generic_funcs.md) +* [Efficient Top-K query processing](misc/topk.md) +* [English/Japanese Text Tokenizer](misc/tokenizer.md) + +## Part III - Feature Engineering + +* [Feature Scaling](ft_engineering/scaling.md) +* [Feature Hashing](ft_engineering/hashing.md) +* [TF-IDF calculation](ft_engineering/tfidf.md) + +* [FEATURE TRANSFORMATION](ft_engineering/ft_trans.md) + * [Vectorize Features](ft_engineering/vectorizer.md) + * [Quantify non-number features](ft_engineering/quantify.md) + +## Part IV - Evaluation + +* [Statistical evaluation of a prediction model](eval/stat_eval.md) + +* [Data Generation](eval/datagen.md) + * [Logistic Regression data generation](eval/lr_datagen.md) + +## Part V - Binary classification + +* [a9a Tutorial](binaryclass/a9a.md) + * [Data preparation](binaryclass/a9a_dataset.md) + * [Logistic Regression](binaryclass/a9a_lr.md) + * [Mini-batch 
Gradient Descent](binaryclass/a9a_minibatch.md) + +* [News20 Tutorial](binaryclass/news20.md) + * [Data preparation](binaryclass/news20_dataset.md) + * [Perceptron, Passive Aggressive](binaryclass/news20_pa.md) + * [CW, AROW, SCW](binaryclass/news20_scw.md) + * [AdaGradRDA, AdaGrad, AdaDelta](binaryclass/news20_adagrad.md) + +* [KDD2010a Tutorial](binaryclass/kdd2010a.md) + * [Data preparation](binaryclass/kdd2010a_dataset.md) + * [PA, CW, AROW, SCW](binaryclass/kdd2010a_scw.md) + +* [KDD2010b Tutorial](binaryclass/kdd2010b.md) + * [Data preparation](binaryclass/kdd2010b_dataset.md) + * [AROW](binaryclass/kdd2010b_arow.md) + +* [Webspam Tutorial](binaryclass/webspam.md) + * [Data pareparation](binaryclass/webspam_dataset.md) + * [PA1, AROW, SCW](binaryclass/webspam_scw.md) + +## Part VI - Multiclass classification + +* [News20 Multiclass Tutorial](multiclass/news20.md) + * [Data preparation](multiclass/news20_dataset.md) + * [Data preparation for one-vs-the-rest classifiers](multiclass/news20_one-vs-the-rest_dataset.md) + * [PA](multiclass/news20_pa.md) + * [CW, AROW, SCW](multiclass/news20_scw.md) + * [Ensemble learning](multiclass/news20_ensemble.md) + * [one-vs-the-rest classifier](multiclass/news20_one-vs-the-rest.md) + +* [Iris Tutorial](multiclass/iris.md) + * [Data preparation](multiclass/iris_dataset.md) + * [SCW](multiclass/iris_scw.md) + * [RandomForest](multiclass/iris_randomforest.md) + +## Part VII - Regression + +* [E2006-tfidf regression Tutorial](regression/e2006.md) + * [Data preparation](regression/e2006_dataset.md) + * [Passive Aggressive, AROW](regression/e2006_arow.md) + +* [KDDCup 2012 track 2 CTR prediction Tutorial](regression/kddcup12tr2.md) + * [Data preparation](regression/kddcup12tr2_dataset.md) + * [Logistic Regression, Passive Aggressive](regression/kddcup12tr2_lr.md) + * [Logistic Regression with Amplifier](regression/kddcup12tr2_lr_amplify.md) + * [AdaGrad, AdaDelta](regression/kddcup12tr2_adagrad.md) + +## Part VIII - Recommendation 
+ +* [Collaborative Filtering](recommend/cf.md) + * [Item-based Collaborative Filtering](recommend/item_based_cf.md) + +* [News20 related article recommendation Tutorial](recommend/news20.md) + * [Data preparation](multiclass/news20_dataset.md) + * [LSH/Minhash and Jaccard Similarity](recommend/news20_jaccard.md) + * [LSH/Minhash and Brute-Force Search](recommend/news20_knn.md) + * [kNN search using b-Bits Minhash](recommend/news20_bbit_minhash.md) + +* [MovieLens movie recommendation Tutorial](recommend/movielens.md) + * [Data preparation](recommend/movielens_dataset.md) + * [Matrix Factorization](recommend/movielens_mf.md) + * [Factorization Machine](recommend/movielens_fm.md) + * [10-fold Cross Validation (Matrix Factorization)](recommend/movielens_cv.md) + +## Part IX - Anomaly Detection + +* [Outlier Detection using Local Outlier Factor (LOF)](anomaly/lof.md) + +## Part X - External References + +* [Hivemall on Apache Spark](https://github.com/maropu/hivemall-spark) +* [Hivemall on Apache Pig](https://github.com/daijyc/hivemall/wiki/PigHome) + http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/anomaly/lof.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/anomaly/lof.md b/docs/gitbook/anomaly/lof.md new file mode 100644 index 0000000..f8f0b61 --- /dev/null +++ b/docs/gitbook/anomaly/lof.md @@ -0,0 +1,185 @@ +This article introduce how to find outliers using [Local Outlier Detection (LOF)](http://en.wikipedia.org/wiki/Local_outlier_factor) on Hivemall. 
+ +# Data Preparation + +```sql +create database lof; +use lof; + +create external table hundred_balls ( + rowid int, + weight double, + specific_heat double, + reflectance double +) +ROW FORMAT DELIMITED + FIELDS TERMINATED BY ' ' +STORED AS TEXTFILE LOCATION '/dataset/lof/hundred_balls'; +``` + +Download [hundred_balls.txt](https://github.com/myui/hivemall/blob/master/resources/examples/lof/hundred_balls.txt) that is originally provides in [this article](http://next.rikunabi.com/tech/docs/ct_s03600.jsp?p=002259). + +You can find outliers in [this picture](http://next.rikunabi.com/tech/contents/ts_report/img/201303/002259/part1_img1.jpg). As you can see, Rowid `87` is apparently an outlier. + +```sh +awk '{FS=" "; OFS=" "; print NR,$0}' hundred_balls.txt | \ +hadoop fs -put - /dataset/lof/hundred_balls/hundred_balls.txt +``` + +```sql +create table train +as +select rowid, array(concat("weight:", weight), concat("specific_heat:", specific_heat), concat("reflectance:", reflectance)) as features +from hundred_balls; +``` + +## Apply Data Normalization + +```sql +create table train_normalized +as +WITH fv as ( +select + rowid, + extract_feature(feature) as feature, + extract_weight(feature) as value +from + train + LATERAL VIEW explode(features) exploded AS feature +), +stats as ( +select + feature, + -- avg(value) as mean, stddev_pop(value) as stddev + min(value) as min, max(value) as max +from + fv +group by + feature +), +norm as ( +select + rowid, + t1.feature, + -- zscore(t1.value, t2.mean, t2.stddev) as zscore + rescale(t1.value, t2.min, t2.max) as minmax +from + fv t1 JOIN + stats t2 ON (t1.feature = t2.feature) +), +norm_fv as ( +select + rowid, + -- concat(feature, ":", zscore) as feature + concat(feature, ":", minmax) as feature +from + norm +) +select + rowid, + collect_list(feature) as features +from + norm_fv +group by + rowid +; +``` + +``` +hive> select * from train_normalized limit 3; + +1 
["reflectance:0.5252967","specific_heat:0.19863537","weight:0.0"] +2 ["reflectance:0.5950446","specific_heat:0.09166764","weight:0.052084323"] +3 ["reflectance:0.6797837","specific_heat:0.12567581","weight:0.13255163"] +``` + +# Outlier Detection using Local Outlier Facotor (LOF) + +```sql +-- workaround to deal with a bug in Hive/Tez +-- https://issues.apache.org/jira/browse/HIVE-10729 +-- set hive.auto.convert.join=false; +set hive.mapjoin.optimized.hashtable=false; + +-- parameter of LoF +set hivevar:k=12; + +-- find topk outliers +set hivevar:topk=3; +``` + +```sql +create table list_neighbours +as +select + each_top_k( + -${k}, t1.rowid, euclid_distance(t1.features, t2.features), + t1.rowid, + t2.rowid + ) as (rank, distance, target, neighbour) +from + train_normalized t1 + LEFT OUTER JOIN train_normalized t2 +where + t1.rowid != t2.rowid +; +``` + +_Note: `list_neighbours` table SHOULD be created because `list_neighbours` is used multiple times._ + +_Note: [`each_top_k`](https://github.com/myui/hivemall/pull/196) is supported from Hivemall v0.3.2-3 or later._ + +_Note: To parallelize a top-k computation, break LEFT-hand table into piece as describe in [this page](https://github.com/myui/hivemall/wiki/Efficient-Top-k-computation-on-Apache-Hive-using-Hivemall-UDTF#parallelization-of-similarity-computation-using-with-clause)._ + +```sql +WITH k_distance as ( +select + target, + max(distance) as k_distance +from + list_neighbours +group by + target +), +reach_distance as ( +select + t1.target, + max2(t2.k_distance, t1.distance) as reach_distance +from + list_neighbours t1 JOIN + k_distance t2 ON (t1.neighbour = t2.target) +), +lrd as ( +select + target, + 1.0 / avg(reach_distance) as lrd +from + reach_distance +group by + target +), +neighbours_lrd as ( +select + t1.target, + t2.lrd +from + list_neighbours t1 JOIN + lrd t2 on (t1.neighbour = t2.target) +) +select + t1.target, + sum(t2.lrd / t1.lrd) / count(1) as lof +from + lrd t1 JOIN + neighbours_lrd t2 on 
(t1.target = t2.target) +group by + t1.target +order by lof desc +limit ${topk}; +``` + +``` +> 87 3.031143749957831 +> 16 1.9755564408378874 +> 1 1.8415763570939774 +``` http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/binaryclass/a9a.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/a9a.md b/docs/gitbook/binaryclass/a9a.md new file mode 100644 index 0000000..e69de29 http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/binaryclass/a9a_dataset.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/a9a_dataset.md b/docs/gitbook/binaryclass/a9a_dataset.md new file mode 100644 index 0000000..28bcd57 --- /dev/null +++ b/docs/gitbook/binaryclass/a9a_dataset.md @@ -0,0 +1,43 @@ +a9a +=== +http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#a9a + +--- + +preparation +========= + +[conv.awk](https://raw.githubusercontent.com/myui/hivemall/master/resources/misc/conv.awk) + +``` +cd /mnt/archive/datasets/classification/a9a +awk -f conv.awk a9a | sed -e "s/+1/1/" | sed -e "s/-1/0/" > a9a.train +awk -f conv.awk a9a.t | sed -e "s/+1/1/" | sed -e "s/-1/0/" > a9a.test +``` + +## Putting data on HDFS +``` +hadoop fs -mkdir -p /dataset/a9a/train +hadoop fs -mkdir -p /dataset/a9a/test + +hadoop fs -copyFromLocal a9a.train /dataset/a9a/train +hadoop fs -copyFromLocal a9a.test /dataset/a9a/test +``` + +## Training/test data prepareation +```sql +create database a9a; +use a9a; + +create external table a9atrain ( + rowid int, + label float, + features ARRAY<STRING> +) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY "," STORED AS TEXTFILE LOCATION '/dataset/a9a/train'; + +create external table a9atest ( + rowid int, + label float, + features ARRAY<STRING> +) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY "," STORED AS TEXTFILE 
LOCATION '/dataset/a9a/test'; +``` \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/binaryclass/a9a_lr.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/a9a_lr.md b/docs/gitbook/binaryclass/a9a_lr.md new file mode 100644 index 0000000..5029c49 --- /dev/null +++ b/docs/gitbook/binaryclass/a9a_lr.md @@ -0,0 +1,79 @@ +a9a +=== +http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#a9a + +_Training with iterations is OBSOLUTE in Hivemall._ +_Using amplifier and shuffling inputs is RECOMMENDED in Hivemall._ + +--- + +## UDF preparation + +```sql +select count(1) from a9atrain; +-- set total_steps ideally be "count(1) / #map tasks" +set hivevar:total_steps=32561; + +select count(1) from a9atest; +set hivevar:num_test_instances=16281; +``` + +## training +```sql +create table a9a_model1 +as +select + cast(feature as int) as feature, + avg(weight) as weight +from + (select + logress(addBias(features),label,"-total_steps ${total_steps}") as (feature,weight) + from + a9atrain + ) t +group by feature; +``` +_"-total_steps" option is optional for logress() function._ +_I recommend you NOT to use options (e.g., total_steps and eta0) if you are not familiar with those options. 
Hivemall then uses an autonomic ETA (learning rate) estimator._ + +## prediction +```sql +create or replace view a9a_predict1 +as +WITH a9atest_exploded as ( +select + rowid, + label, + extract_feature(feature) as feature, + extract_weight(feature) as value +from + a9atest LATERAL VIEW explode(addBias(features)) t AS feature +) +select + t.rowid, + sigmoid(sum(m.weight * t.value)) as prob, + CAST((case when sigmoid(sum(m.weight * t.value)) >= 0.5 then 1.0 else 0.0 end) as FLOAT) as label +from + a9atest_exploded t LEFT OUTER JOIN + a9a_model1 m ON (t.feature = m.feature) +group by + t.rowid; +``` + +## evaluation +```sql +create or replace view a9a_submit1 as +select + t.label as actual, + pd.label as predicted, + pd.prob as probability +from + a9atest t JOIN a9a_predict1 pd + on (t.rowid = pd.rowid); +``` + +```sql +select count(1) / ${num_test_instances} from a9a_submit1 +where actual == predicted; +``` +> 0.8430071862907684 \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/binaryclass/a9a_minibatch.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/a9a_minibatch.md b/docs/gitbook/binaryclass/a9a_minibatch.md new file mode 100644 index 0000000..714db6a --- /dev/null +++ b/docs/gitbook/binaryclass/a9a_minibatch.md @@ -0,0 +1,37 @@ +This page explains how to apply [Mini-Batch Gradient Descent](https://class.coursera.org/ml-003/lecture/106) for the training of logistic regression explained in [this example](https://github.com/myui/hivemall/wiki/a9a-binary-classification-(logistic-regression)). + +See [this page](https://github.com/myui/hivemall/wiki/a9a-binary-classification-(logistic-regression)) first. This content depends on it. + +# Training + +Replace `a9a_model1` of [this example](https://github.com/myui/hivemall/wiki/a9a-binary-classification-(logistic-regression)). 
+ +```sql +set hivevar:total_steps=32561; +set hivevar:mini_batch_size=10; + +create table a9a_model1 +as +select + cast(feature as int) as feature, + avg(weight) as weight +from + (select + logress(addBias(features),label,"-total_steps ${total_steps} -mini_batch ${mini_batch_size}") as (feature,weight) + from + a9atrain + ) t +group by feature; +``` + +# Evaluation + +```sql +select count(1) / ${num_test_instances} from a9a_submit1 +where actual == predicted; +``` + + +| Stochastic Gradient Descent | Minibatch Gradient Descent | +| ------------- | ------------- | +| 0.8430071862907684 | 0.8463239358761747 | \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/binaryclass/kdd2010a.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/kdd2010a.md b/docs/gitbook/binaryclass/kdd2010a.md new file mode 100644 index 0000000..e69de29 http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/binaryclass/kdd2010a_dataset.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/kdd2010a_dataset.md b/docs/gitbook/binaryclass/kdd2010a_dataset.md new file mode 100644 index 0000000..731d68b --- /dev/null +++ b/docs/gitbook/binaryclass/kdd2010a_dataset.md @@ -0,0 +1,72 @@ +[http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#kdd2010 (algebra)](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#kdd2010 (algebra)) + +* # of classes: 2 +* # of data: 8,407,752 (training) / 510,302 (testing) +* # of features: 20,216,830 in about 2.73 GB (training) / 20,216,830 (testing) + +--- +# Define training/testing tables +```sql +add jar ./tmp/hivemall.jar; +source ./tmp/define-all.hive; + +create database kdd2010; +use kdd2010; + +create external table kdd10a_train ( + rowid int, + label int, + features ARRAY<STRING> +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' 
COLLECTION ITEMS TERMINATED BY "," +STORED AS TEXTFILE LOCATION '/dataset/kdd10a/train'; + +create external table kdd10a_test ( + rowid int, + label int, + features ARRAY<STRING> +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY "," +STORED AS TEXTFILE LOCATION '/dataset/kdd10a/test'; +``` + +# Putting data into HDFS +[conv.awk](https://raw.githubusercontent.com/myui/hivemall/master/scripts/misc/conv.awk) +```sh +awk -f conv.awk kdda | hadoop fs -put - /dataset/kdd10a/train/kdda +awk -f conv.awk kdda.t | hadoop fs -put - /dataset/kdd10a/test/kdda.t +``` + +# Make auxiliary tables +```sql +create table kdd10a_train_orcfile ( + rowid bigint, + label int, + features array<string> +) STORED AS orc tblproperties ("orc.compress"="SNAPPY"); + +-- SET mapred.reduce.tasks=64; +INSERT OVERWRITE TABLE kdd10a_train_orcfile +select * from kdd10a_train +CLUSTER BY rand(); +-- SET mapred.reduce.tasks=-1; + +create table kdd10a_test_exploded as +select + rowid, + label, + split(feature,":")[0] as feature, + cast(split(feature,":")[1] as float) as value +from + kdd10a_test LATERAL VIEW explode(addBias(features)) t AS feature; + +set hivevar:xtimes=3; +set hivevar:shufflebuffersize=1000; +-- set hivemall.amplify.seed=32; +create or replace view kdd10a_train_x3 +as +select + rand_amplify(${xtimes}, ${shufflebuffersize}, *) as (rowid, label, features) +from + kdd10a_train_orcfile; +``` \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/binaryclass/kdd2010a_scw.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/kdd2010a_scw.md b/docs/gitbook/binaryclass/kdd2010a_scw.md new file mode 100644 index 0000000..ee8fbba --- /dev/null +++ b/docs/gitbook/binaryclass/kdd2010a_scw.md @@ -0,0 +1,185 @@ +# PA1 +## Train +```sql +-- SET mapred.reduce.tasks=32; +drop table kdd10a_pa1_model1; +create table kdd10a_pa1_model1 as +select + feature, + 
voted_avg(weight) as weight +from + (select + train_pa1(addBias(features),label) as (feature,weight) + from + kdd10a_train_x3 + ) t +group by feature; +``` + +## Predict +```sql +create or replace view kdd10a_pa1_predict1 +as +select + t.rowid, + sum(m.weight * t.value) as total_weight, + case when sum(m.weight * t.value) > 0.0 then 1 else -1 end as label +from + kdd10a_test_exploded t LEFT OUTER JOIN + kdd10a_pa1_model1 m ON (t.feature = m.feature) +group by + t.rowid; +``` + +# Evaluate +```sql +create or replace view kdd10a_pa1_submit1 as +select + t.rowid, + t.label as actual, + pd.label as predicted +from + kdd10a_test t JOIN kdd10a_pa1_predict1 pd + on (t.rowid = pd.rowid); + +select count(1)/510302 from kdd10a_pa1_submit1 +where actual = predicted; +``` +> 0.8677782959894337 + +# CW +```sql +-- SET mapred.reduce.tasks=32; +drop table kdd10a_cw_model1; +create table kdd10a_cw_model1 as +select + feature, + argmin_kld(weight, covar) as weight +from + (select + train_cw(addBias(features),label) as (feature,weight,covar) + from + kdd10a_train_x3 + ) t +group by feature; + +create or replace view kdd10a_cw_predict1 +as +select + t.rowid, + sum(m.weight * t.value) as total_weight, + case when sum(m.weight * t.value) > 0.0 then 1 else -1 end as label +from + kdd10a_test_exploded t LEFT OUTER JOIN + kdd10a_cw_model1 m ON (t.feature = m.feature) +group by + t.rowid; + +create or replace view kdd10a_cw_submit1 as +select + t.rowid, + t.label as actual, + pd.label as predicted +from + kdd10a_test t JOIN kdd10a_cw_predict1 pd + on (t.rowid = pd.rowid); + +select count(1)/510302 from kdd10a_cw_submit1 +where actual = predicted; +``` +> 0.8678037711002504 + +# AROW +```sql +-- SET mapred.reduce.tasks=32; +drop table kdd10a_arow_model1; +create table kdd10a_arow_model1 as +select + feature, + -- voted_avg(weight) as weight + argmin_kld(weight, covar) as weight -- [hivemall v0.2alpha3 or later] +from + (select + -- train_arow(addBias(features),label) as (feature,weight) -- 
[hivemall v0.1] + train_arow(addBias(features),label) as (feature,weight,covar) -- [hivemall v0.2 or later] + from + kdd10a_train_x3 + ) t +group by feature; + +create or replace view kdd10a_arow_predict1 +as +select + t.rowid, + sum(m.weight * t.value) as total_weight, + case when sum(m.weight * t.value) > 0.0 then 1 else -1 end as label +from + kdd10a_test_exploded t LEFT OUTER JOIN + kdd10a_arow_model1 m ON (t.feature = m.feature) +group by + t.rowid; + +create or replace view kdd10a_arow_submit1 as +select + t.rowid, + t.label as actual, + pd.label as predicted +from + kdd10a_test t JOIN kdd10a_arow_predict1 pd + on (t.rowid = pd.rowid); + +select count(1)/510302 from kdd10a_arow_submit1 +where actual = predicted; +``` +> 0.8676038894615345 + +# SCW +```sql +-- SET mapred.reduce.tasks=32; +drop table kdd10a_scw_model1; +create table kdd10a_scw_model1 as +select + feature, + argmin_kld(weight, covar) as weight +from + (select + train_scw(addBias(features),label) as (feature,weight,covar) + from + kdd10a_train_x3 + ) t +group by feature; + +create or replace view kdd10a_scw_predict1 +as +select + t.rowid, + sum(m.weight * t.value) as total_weight, + case when sum(m.weight * t.value) > 0.0 then 1 else -1 end as label +from + kdd10a_test_exploded t LEFT OUTER JOIN + kdd10a_scw_model1 m ON (t.feature = m.feature) +group by + t.rowid; + +create or replace view kdd10a_scw_submit1 as +select + t.rowid, + t.label as actual, + pd.label as predicted +from + kdd10a_test t JOIN kdd10a_scw_predict1 pd + on (t.rowid = pd.rowid); + +select count(1)/510302 from kdd10a_scw_submit1 +where actual = predicted; +``` +> 0.8678096499719774 + +--- + +| Algorithm | Accuracy | +|:-----------|------------:| +| AROW | 0.8676038894615345 | +| PA1 | 0.8677782959894337 | +| CW | 0.8678037711002504 | +| SCW1 | 0.8678096499719774 | \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/binaryclass/kdd2010b.md 
---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/kdd2010b.md b/docs/gitbook/binaryclass/kdd2010b.md new file mode 100644 index 0000000..e69de29 http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/binaryclass/kdd2010b_arow.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/kdd2010b_arow.md b/docs/gitbook/binaryclass/kdd2010b_arow.md new file mode 100644 index 0000000..7ac845a --- /dev/null +++ b/docs/gitbook/binaryclass/kdd2010b_arow.md @@ -0,0 +1,51 @@ +## training +```sql +-- SET mapred.reduce.tasks=32; +drop table kdd10b_arow_model1; +create table kdd10b_arow_model1 as +select + feature, + -- voted_avg(weight) as weight + argmin_kld(weight, covar) as weight -- [hivemall v0.2alpha3 or later] +from + (select + -- train_arow(addBias(features),label) as (feature,weight) -- [hivemall v0.1] + train_arow(addBias(features),label) as (feature,weight,covar) -- [hivemall v0.2 or later] + from + kdd10b_train_x3 + ) t +group by feature; +``` + +## prediction +```sql +create or replace view kdd10b_arow_predict1 +as +select + t.rowid, + sum(m.weight * t.value) as total_weight, + case when sum(m.weight * t.value) > 0.0 then 1 else -1 end as label +from + kdd10b_test_exploded t LEFT OUTER JOIN + kdd10b_arow_model1 m ON (t.feature = m.feature) +group by + t.rowid; +``` + +## evaluation +```sql +create or replace view kdd10b_arow_submit1 as +select + t.rowid, + t.label as actual, + pd.label as predicted +from + kdd10b_test t JOIN kdd10b_arow_predict1 pd + on (t.rowid = pd.rowid); +``` + +```sql +select count(1)/748401 from kdd10b_arow_submit1 +where actual = predicted; +``` +> 0.8565808971393678 \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/binaryclass/kdd2010b_dataset.md ---------------------------------------------------------------------- diff --git 
+[kdd2010 (bridge to algebra)](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#kdd2010)
_Note that the current AdaGradRDA implementation can only be applied to classification, not to regression, because it uses hinge loss for the loss function._
_Note that AdaGrad is better suited for a regression problem because the current implementation only supports logistic loss._
+## Training/test data preparation
+ +source /home/myui/tmp/define-all.hive; + +Create external table news20b_train ( + rowid int, + label int, + features ARRAY<STRING> +) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY "," STORED AS TEXTFILE LOCATION '/dataset/news20-binary/train'; + +Create external table news20b_test ( + rowid int, + label int, + features ARRAY<STRING> +) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY "," STORED AS TEXTFILE LOCATION '/dataset/news20-binary/test'; + +set hivevar:seed=31; +create or replace view news20b_train_x3 +as +select + * +from ( +select + amplify(3, *) as (rowid, label, features) +from + news20b_train +) t +CLUSTER BY rand(${seed}); + +create table news20b_test_exploded as +select + rowid, + label, + cast(split(feature,":")[0] as int) as feature, + cast(split(feature,":")[1] as float) as value + -- hivemall v0.3.1 or later + -- extract_feature(feature) as feature, + -- extract_weight(feature) as value +from + news20b_test LATERAL VIEW explode(addBias(features)) t AS feature; +``` \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/binaryclass/news20_pa.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/news20_pa.md b/docs/gitbook/binaryclass/news20_pa.md new file mode 100644 index 0000000..7763a15 --- /dev/null +++ b/docs/gitbook/binaryclass/news20_pa.md @@ -0,0 +1,243 @@ +## UDF preparation +``` +delete jar /home/myui/tmp/hivemall.jar; +add jar /home/myui/tmp/hivemall.jar; + +source /home/myui/tmp/define-all.hive; +``` + +--- +#[Perceptron] + +## model building +```sql +drop table news20b_perceptron_model1; +create table news20b_perceptron_model1 as +select + feature, + voted_avg(weight) as weight +from + (select + perceptron(addBias(features),label) as (feature,weight) + from + news20b_train_x3 + ) t +group by feature; +``` + +## prediction +```sql +create or replace view 
news20b_perceptron_predict1 +as +select + t.rowid, + sum(m.weight * t.value) as total_weight, + case when sum(m.weight * t.value) > 0.0 then 1 else -1 end as label +from + news20b_test_exploded t LEFT OUTER JOIN + news20b_perceptron_model1 m ON (t.feature = m.feature) +group by + t.rowid; +``` + +## evaluation +```sql +create or replace view news20b_perceptron_submit1 as +select + t.label as actual, + pd.label as predicted +from + news20b_test t JOIN news20b_perceptron_predict1 pd + on (t.rowid = pd.rowid); +``` + +```sql +select count(1)/4996 from news20b_perceptron_submit1 +where actual == predicted; +``` +> 0.9459567654123299 + +## Cleaning + +```sql +drop table news20b_perceptron_model1; +drop view news20b_perceptron_predict1; +drop view news20b_perceptron_submit1; +``` + +--- +#[Passive Aggressive] + +## model building +```sql +drop table news20b_pa_model1; +create table news20b_pa_model1 as +select + feature, + voted_avg(weight) as weight +from + (select + train_pa(addBias(features),label) as (feature,weight) + from + news20b_train_x3 + ) t +group by feature; +``` + +## prediction +```sql +create or replace view news20b_pa_predict1 +as +select + t.rowid, + sum(m.weight * t.value) as total_weight, + case when sum(m.weight * t.value) > 0.0 then 1 else -1 end as label +from + news20b_test_exploded t LEFT OUTER JOIN + news20b_pa_model1 m ON (t.feature = m.feature) +group by + t.rowid; +``` + +## evaluation +``` +create or replace view news20b_pa_submit1 as +select + t.label as actual, + pd.label as predicted +from + news20b_test t JOIN news20b_pa_predict1 pd + on (t.rowid = pd.rowid); +``` + +```sql +select count(1)/4996 from news20b_pa_submit1 +where actual == predicted; +``` +> 0.9603682946357086 + +## Cleaning + +```sql +drop table news20b_pa_model1; +drop view news20b_pa_predict1; +drop view news20b_pa_submit1; +``` + +--- +#[Passive Aggressive (PA1)] + +## model building +```sql +drop table news20b_pa1_model1; +create table news20b_pa1_model1 as +select + 
feature, + voted_avg(weight) as weight +from + (select + train_pa1(addBias(features),label) as (feature,weight) + from + news20b_train_x3 + ) t +group by feature; +``` + +## prediction +```sql +create or replace view news20b_pa1_predict1 +as +select + t.rowid, + sum(m.weight * t.value) as total_weight, + case when sum(m.weight * t.value) > 0.0 then 1 else -1 end as label +from + news20b_test_exploded t LEFT OUTER JOIN + news20b_pa1_model1 m ON (t.feature = m.feature) +group by + t.rowid; +``` + +## evaluation +```sql +create or replace view news20b_pa1_submit1 as +select + t.label as actual, + pd.label as predicted +from + news20b_test t JOIN news20b_pa1_predict1 pd + on (t.rowid = pd.rowid); +``` + +```sql +select count(1)/4996 from news20b_pa1_submit1 +where actual == predicted; +``` +> 0.9601681345076061 + +## Cleaning + +```sql +drop table news20b_pa1_model1; +drop view news20b_pa1_predict1; +drop view news20b_pa1_submit1; +``` + +--- +#[Passive Aggressive (PA2)] + +## model building +```sql +drop table news20b_pa2_model1; +create table news20b_pa2_model1 as +select + feature, + voted_avg(weight) as weight +from + (select + train_pa2(addBias(features),label) as (feature,weight) + from + news20b_train_x3 + ) t +group by feature; +``` + +## prediction +```sql +create or replace view news20b_pa2_predict1 +as +select + t.rowid, + sum(m.weight * t.value) as total_weight, + case when sum(m.weight * t.value) > 0.0 then 1 else -1 end as label +from + news20b_test_exploded t LEFT OUTER JOIN + news20b_pa2_model1 m ON (t.feature = m.feature) +group by + t.rowid; +``` + +## evaluation +```sql +create or replace view news20b_pa2_submit1 as +select + t.label as actual, + pd.label as predicted +from + news20b_test t JOIN news20b_pa2_predict1 pd + on (t.rowid = pd.rowid); +``` + +```sql +select count(1)/4996 from news20b_pa2_submit1 +where actual == predicted; +``` +> 0.9597678142514011 + +## Cleaning + +```sql +drop table news20b_pa2_model1; +drop view news20b_pa2_predict1; 
+# Confidence Weighted (CW)
table news20b_arow_model1; +create table news20b_arow_model1 as +select + feature, + -- voted_avg(weight) as weight -- [hivemall v0.1] + argmin_kld(weight, covar) as weight -- [hivemall v0.2 or later] +from + (select + -- train_arow(addBias(features),label) as (feature,weight) -- [hivemall v0.1] + train_arow(addBias(features),label) as (feature,weight,covar) -- [hivemall v0.2 or later] + from + news20b_train_x3 + ) t +group by feature; +``` + +## prediction +```sql +create or replace view news20b_arow_predict1 +as +select + t.rowid, + sum(m.weight * t.value) as total_weight, + case when sum(m.weight * t.value) > 0.0 then 1 else -1 end as label +from + news20b_test_exploded t LEFT OUTER JOIN + news20b_arow_model1 m ON (t.feature = m.feature) +group by + t.rowid; +``` + +## evaluation +```sql +create or replace view news20b_arow_submit1 as +select + t.rowid, + t.label as actual, + pd.label as predicted +from + news20b_test t JOIN news20b_arow_predict1 pd + on (t.rowid = pd.rowid); +``` + +```sql +select count(1)/4996 from news20b_arow_submit1 +where actual = predicted; +``` +> 0.9659727782225781 + +## Cleaning + +```sql +drop table news20b_arow_model1; +drop view news20b_arow_predict1; +drop view news20b_arow_submit1; +``` + +--- +# Soft Confidence-Weighted (SCW1) + +## training +```sql +drop table news20b_scw_model1; +create table news20b_scw_model1 as +select + feature, + -- voted_avg(weight) as weight -- [hivemall v0.1] + argmin_kld(weight, covar) as weight -- [hivemall v0.2 or later] +from + (select + -- train_scw(addBias(features),label) as (feature,weight) -- [hivemall v0.1] + train_scw(addBias(features),label) as (feature,weight,covar) -- [hivemall v0.2 or later] + from + news20b_train_x3 + ) t +group by feature; +``` + +## prediction +```sql +create or replace view news20b_scw_predict1 +as +select + t.rowid, + sum(m.weight * t.value) as total_weight, + case when sum(m.weight * t.value) > 0.0 then 1 else -1 end as label +from + news20b_test_exploded t LEFT 
--- + +| Algorithm | Accuracy |
0.9597678142514011 | +| PA1 | 0.9601681345076061 | +| PA | 0.9603682946357086 | +| CW | 0.9655724579663731 | +| AROW | 0.9659727782225781 | +| SCW1 | 0.9661729383506805 | + +My recommendation is AROW for classification. \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/binaryclass/webspam.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/webspam.md b/docs/gitbook/binaryclass/webspam.md new file mode 100644 index 0000000..e69de29 http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/binaryclass/webspam_dataset.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/webspam_dataset.md b/docs/gitbook/binaryclass/webspam_dataset.md new file mode 100644 index 0000000..4686865 --- /dev/null +++ b/docs/gitbook/binaryclass/webspam_dataset.md @@ -0,0 +1,76 @@ +Get the dataset from +http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#webspam + +# Putting data on HDFS +```sql +hadoop fs -mkdir -p /dataset/webspam/raw + +awk -f conv.awk webspam_wc_normalized_trigram.svm | \ +hadoop fs -put - /dataset/webspam/raw/ +``` + +# Table preparation +```sql +create database webspam; +use webspam; + +delete jar ./tmp/hivemall.jar; +add jar ./tmp/hivemall.jar; +source ./tmp/define-all.hive; + +create external table webspam_raw ( + rowid int, + label int, + features ARRAY<STRING> +) ROW FORMAT +DELIMITED FIELDS TERMINATED BY '\t' +COLLECTION ITEMS TERMINATED BY "," +STORED AS TEXTFILE LOCATION '/dataset/webspam/raw'; + +set hive.sample.seednumber=43; +create table webspam_test +as +select * from webspam_raw TABLESAMPLE(1000 ROWS) s +CLUSTER BY rand(43) +limit 70000; +``` + +# Make auxiliary tables +```sql +create table webspam_train_orcfile ( + rowid int, + label int, + features array<string> +) STORED AS orc tblproperties ("orc.compress"="SNAPPY"); + +-- SET 
mapred.reduce.tasks=128; +INSERT OVERWRITE TABLE webspam_train_orcfile +select + s.rowid, + label, + addBias(features) as features +from webspam_raw s +where not exists (select rowid from webspam_test t where s.rowid = t.rowid) +CLUSTER BY rand(43); +-- SET mapred.reduce.tasks=-1; + +set hivevar:xtimes=3; +set hivevar:shufflebuffersize=100; +set hivemall.amplify.seed=32; +create or replace view webspam_train_x3 +as +select + rand_amplify(${xtimes}, ${shufflebuffersize}, *) as (rowid, label, features) +from + webspam_train_orcfile; + +create table webspam_test_exploded as +select + rowid, + label, + split(feature,":")[0] as feature, + cast(split(feature,":")[1] as float) as value +from + webspam_test LATERAL VIEW explode(addBias(features)) t AS feature; +``` +*Caution:* For this dataset, use small *shufflebuffersize* because each training example has lots of features though (xtimes * shufflebuffersize * N) training examples are cached in memory. \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/binaryclass/webspam_scw.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/binaryclass/webspam_scw.md b/docs/gitbook/binaryclass/webspam_scw.md new file mode 100644 index 0000000..635b32d --- /dev/null +++ b/docs/gitbook/binaryclass/webspam_scw.md @@ -0,0 +1,136 @@ +# Preparation + +``` +use webspam; + +delete jar ./tmp/hivemall.jar; +add jar ./tmp/hivemall.jar; +source ./tmp/define-all.hive; +``` + +# PA1 + +```sql +drop table webspam_pa1_model1; +create table webspam_pa1_model1 as +select + feature, + cast(voted_avg(weight) as float) as weight +from + (select + train_pa1(features,label) as (feature,weight) -- sparse model + -- train_pa1(features,label,"-dense -dims 33554432") as (feature,weight) + from + webspam_train_x3 + ) t +group by feature; + +create or replace view webspam_pa1_predict1 +as +select + t.rowid, + sum(m.weight * t.value) as total_weight, + case 
when sum(m.weight * t.value) > 0.0 then 1 else -1 end as label +from + webspam_test_exploded t LEFT OUTER JOIN + webspam_pa1_model1 m ON (t.feature = m.feature) +group by + t.rowid; + +create or replace view webspam_pa1_submit1 as +select + t.rowid, + t.label as actual, + pd.label as predicted +from + webspam_test t JOIN webspam_pa1_predict1 pd + on (t.rowid = pd.rowid); + +select count(1)/70000 from webspam_pa1_submit1 +where actual = predicted; +``` +> Prediction accuracy: 0.9628428571428571 + +# AROW + +```sql +drop table webspam_arow_model1; +create table webspam_arow_model1 as +select + feature, + argmin_kld(weight,covar)as weight +from + (select + train_arow(features,label) as (feature,weight,covar) + from + webspam_train_x3 + ) t +group by feature; + +create or replace view webspam_arow_predict1 +as +select + t.rowid, + sum(m.weight * t.value) as total_weight, + case when sum(m.weight * t.value) > 0.0 then 1 else -1 end as label +from + webspam_test_exploded t LEFT OUTER JOIN + webspam_arow_model1 m ON (t.feature = m.feature) +group by + t.rowid; + +create or replace view webspam_arow_submit1 as +select + t.rowid, + t.label as actual, + pd.label as predicted +from + webspam_test t JOIN webspam_arow_predict1 pd + on (t.rowid = pd.rowid); + +select count(1)/70000 from webspam_arow_submit1 +where actual = predicted; +``` +> Prediction accuracy: 0.9747428571428571 + +# SCW1 + +```sql +drop table webspam_scw_model1; +create table webspam_scw_model1 as +select + feature, + argmin_kld(weight,covar)as weight +from + (select + train_scw(features,label) as (feature,weight,covar) + from + webspam_train_x3 + ) t +group by feature; + +create or replace view webspam_scw_predict1 +as +select + t.rowid, + sum(m.weight * t.value) as total_weight, + case when sum(m.weight * t.value) > 0.0 then 1 else -1 end as label +from + webspam_test_exploded t LEFT OUTER JOIN + webspam_scw_model1 m ON (t.feature = m.feature) +group by + t.rowid; + +create or replace view 
webspam_scw_submit1 as +select + t.rowid, + t.label as actual, + pd.label as predicted +from + webspam_test t JOIN webspam_scw_predict1 pd + on (t.rowid = pd.rowid); + +select count(1)/70000 from webspam_scw_submit1 +where actual = predicted; +``` +> Prediction accuracy: 0.9778714285714286 \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/book.json ---------------------------------------------------------------------- diff --git a/docs/gitbook/book.json b/docs/gitbook/book.json new file mode 100644 index 0000000..2f70ed9 --- /dev/null +++ b/docs/gitbook/book.json @@ -0,0 +1,64 @@ +{ + "gitbook": "3.x.x", + "title": "Hivemall User Manual", + "description": "User Manual for Apache Hivemall", + "plugins": [ + "theme-api", + "edit-link", + "github", + "splitter", + "sitemap", + "etoc", + "callouts", + "toggle-chapters", + "anchorjs", + "codeblock-filename", + "expandable-chapters", + "multipart", + "codeblock-filename", + "katex", + "emphasize" + ], + "pluginsConfig": { + "theme-default": { + "showLevel": true + }, + "theme-api": { + "theme": "dark", + "split": false + }, + "edit-link": { + "base": "https://github.com/apache/incubator-hivemall/docs/gitbook", + "label": "Edit" + }, + "github": { + "url": "https://github.com/apache/incubator-hivemall/" + }, + "sitemap": { + "hostname": "http://hivemall.incubator.apache.org/" + }, + "etoc": { + "mindepth": 1, + "maxdepth": 3, + "notoc": true + }, + "downloadpdf": { + "base": "https://github.com/apache/incubator-hivemall/docs/gitbook", + "label": "PDF", + "multilingual": false + }, + "fontsettings": { + "theme": "white", + "font": "sans", + "size": 2 + }, + "anchorjs": { + "selector": "h1,h2,h3,*:not(.callout) > h4,h5" + } + }, + "links": { + "sidebar": { + "<i class=\"fa fa-home\"></i> Home": "http://hivemall.incubator.apache.org/" + } + } +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/eval/datagen.md 
You can use the "-cl" option to generate 0/1 labels.
+```sql +select lr_datagen("-cl") as (label,features) +from dual +limit 5; +OK +1.0 ["84:3.4227803","80:3.8875976","58:3.2909582","123:3.1056073","194:3.3360343","199:2.20207","75:3.5469763","74:3.3869767","126:0.9969454","93:2.5352612"] +0.0 ["84:-0.5568947","10:0.621897","6:-0.13126314","190:0.18610542","131:1.7232913","24:-2.7551131","113:-0.9842969","177:0.062993184","176:-0.19020283","21:-0.54811275"] +1.0 ["73:3.4391513","198:4.42387","164:4.248151","66:3.5224934","84:1.9026604","76:0.79803777","18:2.2168183","163:2.248695","119:1.5906067","72:2.0267224"] +1.0 ["34:2.9269936","35:0.37033868","39:3.771989","47:2.2087111","28:2.9445739","55:4.134555","14:2.4297745","164:3.0913055","52:2.0519433","128:2.9108515"] +1.0 ["98:4.2451696","4:3.486905","133:2.4589922","26:2.7301126","103:2.6827147","2:3.6198254","34:3.7042716","47:2.5515237","68:2.4294896","197:4.4958663"] +``` + +# Dense dataset generation +```sql +create table regression_data_dense +as +select lr_datagen("-dense -n_examples 9999 -n_features 100 -n_dims 100") as (label,features) +from dual; + +hive> desc regression_data_dense; +OK +label float None +features array<float> None + +hive> select * from regression_data_dense limit 1; +OK +0.7274741 
[4.061373,3.9373128,3.5195694,3.3604698,3.7698417,4.2518,3.8796813,1.6020582,4.937072,1.5513933,3.0289552,2.6674519,3.432688,2.980945,1.8897587,2.9770515,3.3435504,1.7867403,3.4057906,1.2151588,5.0587463,2.1410913,2.8097973,2.4518871,3.175268,3.3347685,3.728993,3.1443396,3.5506077,3.6357877,4.248151,3.5224934,3.2423255,2.5188355,1.8626233,2.8432152,2.2762651,4.57472,2.2168183,2.248695,3.3636255,2.8359523,2.0327945,1.5917025,2.9269936,0.37033868,2.6151125,4.545956,2.0863252,3.7857852,2.9445739,4.134555,3.0660007,3.4279037,2.0519433,2.9108515,3.5171766,3.4708095,3.161707,2.39229,2.4589922,2.7301126,3.5303073,2.7398396,3.7042716,2.5515237,3.0943663,0.41565156,4.672767,3.1461313,3.0443575,3.4023938,2.2205734,1.8950733,2.1664586,4.8654623,2.787029,4.0460386,2.4455893,3.464298,1.062505,3.0513604,4.382525,2.771433,3.2828436,3.803544,2.178681,4.2466116,3.5440445,3.1546876,3.4248536,0.9067459,3.0134914,1.9528451,1.7175893,2.7029774,2.5759792,3.643847,3.0799,3.735559] +Time taken: 0.044 seconds, Fetched: 1 row(s) +``` + +# Parallel and scalable data generation using multiple reducers (RECOMMENDED) +Dataset generation using (at max) 10 reducers. 
+ +```sql +set hivevar:n_parallel_datagen=10; + +create or replace view seq10 +as +select * from ( + select generate_series(1,${n_parallel_datagen}) + from dual +) t +DISTRIBUTE BY value; + +set mapred.reduce.tasks=${n_parallel_datagen}; +create table lrdata1k +as +select lr_datagen("-n_examples 100") +from seq10; +set mapred.reduce.tasks=-1; -- reset to the default setting + +hive> select count(1) from lrdata1k; +OK +1000 +``` \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/eval/stat_eval.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/eval/stat_eval.md b/docs/gitbook/eval/stat_eval.md new file mode 100644 index 0000000..7f1688b --- /dev/null +++ b/docs/gitbook/eval/stat_eval.md @@ -0,0 +1,56 @@ +Using the [E2006 tfidf regression example](https://github.com/myui/hivemall/wiki/E2006-tfidf-regression-evaluation-(PA,-AROW)), we explain how to evaluate the prediction model on Hive. 
+
+# Scoring by evaluation metrics
+
+```sql
+select avg(actual), avg(predicted) from e2006tfidf_pa2a_submit;
+```
+> -3.8200363760415414 -3.9124877451612488
+
+```sql
+set hivevar:mean_actual=-3.8200363760415414;
+
+select
+-- Root Mean Squared Error
+ rmse(predicted, actual) as RMSE,
+ -- sqrt(sum(pow(predicted - actual,2.0))/count(1)) as RMSE,
+-- Mean Squared Error
+ mse(predicted, actual) as MSE,
+ -- sum(pow(predicted - actual,2.0))/count(1) as MSE,
+-- Mean Absolute Error
+ mae(predicted, actual) as MAE,
+ -- sum(abs(predicted - actual))/count(1) as MAE,
+-- coefficient of determination (R^2)
+ -- 1 - sum(pow(actual - predicted,2.0)) / sum(pow(actual - ${mean_actual},2.0)) as R2
+ r2(actual, predicted) as R2 -- supported since Hivemall v0.4.1-alpha.5
+from
+ e2006tfidf_pa2a_submit;
+```
+> 0.38538660838804495 0.14852283792484033 0.2466732002711477 0.48623913673053565
+
+# Logarithmic Loss
+
+[Logarithmic Loss](https://www.kaggle.com/wiki/LogarithmicLoss) can be computed as follows:
+
+```sql
+WITH t as (
+ select
+ 0 as actual,
+ 0.01 as predicted
+ union all
+ select
+ 1 as actual,
+ 0.02 as predicted
+)
+select
+ -SUM(actual*LN(predicted)+(1-actual)*LN(1-predicted))/count(1) as logloss1,
+ logloss(predicted, actual) as logloss2 -- supported since Hivemall v0.4.2-rc.1
+from t;
+```
+> 1.9610366706408238 1.9610366706408238
+
+--
+**References**
+* R2 http://en.wikipedia.org/wiki/Coefficient_of_determination
+* Evaluation Metrics https://www.kaggle.com/wiki/Metrics
\ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/ft_engineering/ft_trans.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/ft_engineering/ft_trans.md b/docs/gitbook/ft_engineering/ft_trans.md new file mode 100644 index 0000000..e69de29 http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/ft_engineering/hashing.md
---------------------------------------------------------------------- diff --git a/docs/gitbook/ft_engineering/hashing.md b/docs/gitbook/ft_engineering/hashing.md new file mode 100644 index 0000000..09fa1ff --- /dev/null +++ b/docs/gitbook/ft_engineering/hashing.md @@ -0,0 +1,103 @@
+Hivemall supports [Feature Hashing](https://github.com/myui/hivemall/wiki/Feature-hashing) (a.k.a. hashing trick) through `feature_hashing` and `mhash` functions.
+Find the differences in the following examples.
+
+_Note: `feature_hashing` UDF is supported since Hivemall `v0.4.2-rc.1`._
+
+## `feature_hashing` function
+
+`feature_hashing` applies MurmurHash3 hashing to features.
+
+```sql
+select feature_hashing('aaa');
+> 4063537
+
+select feature_hashing('aaa','-features 3');
+> 2
+
+select feature_hashing(array('aaa','bbb'));
+> ["4063537","8459207"]
+
+select feature_hashing(array('aaa','bbb'),'-features 10');
+> ["7","1"]
+
+select feature_hashing(array('aaa:1.0','aaa','bbb:2.0'));
+> ["4063537:1.0","4063537","8459207:2.0"]
+
+select feature_hashing(array(1,2,3));
+> ["11293631","3322224","4331412"]
+
+select feature_hashing(array('1','2','3'));
+> ["11293631","3322224","4331412"]
+
+select feature_hashing(array('1:0.1','2:0.2','3:0.3'));
+> ["11293631:0.1","3322224:0.2","4331412:0.3"]
+
+select feature_hashing(features), features from training_fm limit 2;
+
+> ["1803454","6630176"] ["userid#5689","movieid#3072"]
+> ["1828616","6238429"] ["userid#4505","movieid#2331"]
+
+select feature_hashing(array("userid#4505:3.3","movieid#2331:4.999", "movieid#2331"));
+
+> ["1828616:3.3","6238429:4.999","6238429"]
+```
+
+_Note: The hash value starts from 1 and 0 is system reserved for a bias clause. The default number of features is 16777217 (2^24).
You can control the number of features by `-num_features` (or `-features`) option._
+
+```sql
+select feature_hashing(null,'-help');
+
+usage: feature_hashing(array<string> features [, const string options]) -
+ returns a hashed feature vector in array<string> [-features <arg>]
+ [-help]
+ -features,--num_features <arg> The number of features [default:
+ 16777217 (2^24)]
+ -help Show function help
+```
+
+## `mhash` function
+
+```sql
+describe function extended mhash;
+> mhash(string word) returns a murmurhash3 INT value starting from 1
+```
+
+```sql
+
+select mhash('aaa');
+> 4063537
+```
+
+_Note: The default number of features is `16777216 (2^24)`._
+```sql
+set hivevar:num_features=16777216;
+
+select mhash('aaa',${num_features});
+>4063537
+```
+
+_Note: `mhash` returns a `+1'd` murmurhash3 value starting from 1. Never returns 0 (It's a system reserved number)._
+```sql
+set hivevar:num_features=1;
+
+select mhash('aaa',${num_features});
+> 1
+```
+
+_Note: `mhash` does not consider feature values._
+```sql
+select mhash('aaa:2.0');
+> 2746618
+```
+
+_Note: `mhash` always returns a scalar INT value._
+```sql
+select mhash(array('aaa','bbb'));
+> 9566153
+```
+
+_Note: `mhash` value of an array is element order-sensitive._
+```sql
+select mhash(array('bbb','aaa'));
+> 3874068
+```
\ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/ft_engineering/quantify.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/ft_engineering/quantify.md b/docs/gitbook/ft_engineering/quantify.md new file mode 100644 index 0000000..1d6a223 --- /dev/null +++ b/docs/gitbook/ft_engineering/quantify.md @@ -0,0 +1,164 @@
+`quantified_features` is useful for transforming values of non-number columns to indexed numbers.
+
+*Note: The feature is supported in Hivemall v0.4 or later.*
+
+```sql
+desc train;
+
+id int
+age int
+job string
+marital string
+education string
+default string
+balance int
+housing string
+loan string
+contact string
+day int
+month string
+duration int
+campaign int
+pdays int
+previous int
+poutcome string
+y int
+```
+
+```sql
+select * from train limit 10;
+
+1 39 blue-collar married secondary no 1756 yes no cellular 3 apr 939 1 -1 0 unknown 1
+2 51 entrepreneur married primary no 1443 no no cellular 18 feb 172 10 -1 0 unknown 1
+3 36 management single tertiary no 436 no no cellular 13 apr 567 1 595 2 failure 1
+4 63 retired married secondary no 474 no no cellular 25 jan 423 1 -1 0 unknown 1
+5 31 management single tertiary no 354 no no cellular 30 apr 502 1 9 2 success 1
+6 29 blue-collar single secondary no 260 yes no unknown 2 jun 707 14 -1 0 unknown 1
+7 37 services married secondary no 52 yes no cellular 6 sep 908 1 185 9 success 1
+8 32 technician single secondary no 230 yes no cellular 18 may 442 1 266 8 failure 1
+9 31 admin.
single secondary no 0 yes no cellular 7 may 895 2 295 2 failure 1 +10 32 self-employed single tertiary no 1815 no no telephone 10 aug 235 1 102 2 failure 1 +``` + +```sql +set hivevar:output_row=true; + +select quantify(${output_row}, *) +from ( + select * from train + order by id asc -- force quantify() runs on a single reducer +) t +limit 10; + +1 39 0 0 0 0 1756 0 0 0 3 0 939 1 -1 0 0 1 +2 51 1 0 1 0 1443 1 0 0 18 1 172 10 -1 0 0 1 +3 36 2 1 2 0 436 1 0 0 13 0 567 1 595 2 1 1 +4 63 3 0 0 0 474 1 0 0 25 2 423 1 -1 0 0 1 +5 31 2 1 2 0 354 1 0 0 30 0 502 1 9 2 2 1 +6 29 0 1 0 0 260 0 0 1 2 3 707 14 -1 0 0 1 +7 37 4 0 0 0 52 0 0 0 6 4 908 1 185 9 2 1 +8 32 5 1 0 0 230 0 0 0 18 5 442 1 266 8 1 1 +9 31 6 1 0 0 0 0 0 0 7 5 895 2 295 2 1 1 +10 32 7 1 2 0 1815 1 0 2 10 6 235 1 102 2 1 1 +``` + +```sql +select + quantify( + ${output_row}, id, age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, cast(pdays as string), previous, poutcome, y + ) as (id, age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, pdays, previous, poutcome, y) +from ( + select * from train + order by id asc +) t +limit 10; + +1 39 0 0 0 0 1756 0 0 0 3 0 939 1 0 0 0 1 +2 51 1 0 1 0 1443 1 0 0 18 1 172 10 0 0 0 1 +3 36 2 1 2 0 436 1 0 0 13 0 567 1 1 2 1 1 +4 63 3 0 0 0 474 1 0 0 25 2 423 1 0 0 0 1 +5 31 2 1 2 0 354 1 0 0 30 0 502 1 2 2 2 1 +6 29 0 1 0 0 260 0 0 1 2 3 707 14 0 0 0 1 +7 37 4 0 0 0 52 0 0 0 6 4 908 1 3 9 2 1 +8 32 5 1 0 0 230 0 0 0 18 5 442 1 4 8 1 1 +9 31 6 1 0 0 0 0 0 0 7 5 895 2 5 2 1 1 +10 32 7 1 2 0 1815 1 0 2 10 6 235 1 6 2 1 1 +``` + +```sql +select + quantified_features( + ${output_row}, id, age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, cast(pdays as string), previous, poutcome, y + ) as features +from ( + select * from train + order by id asc +) t +limit 10; + 
+[1.0,39.0,0.0,0.0,0.0,0.0,1756.0,0.0,0.0,0.0,3.0,0.0,939.0,1.0,0.0,0.0,0.0,1.0] +[2.0,51.0,1.0,0.0,1.0,0.0,1443.0,1.0,0.0,0.0,18.0,1.0,172.0,10.0,0.0,0.0,0.0,1.0] +[3.0,36.0,2.0,1.0,2.0,0.0,436.0,1.0,0.0,0.0,13.0,0.0,567.0,1.0,1.0,2.0,1.0,1.0] +[4.0,63.0,3.0,0.0,0.0,0.0,474.0,1.0,0.0,0.0,25.0,2.0,423.0,1.0,0.0,0.0,0.0,1.0] +[5.0,31.0,2.0,1.0,2.0,0.0,354.0,1.0,0.0,0.0,30.0,0.0,502.0,1.0,2.0,2.0,2.0,1.0] +[6.0,29.0,0.0,1.0,0.0,0.0,260.0,0.0,0.0,1.0,2.0,3.0,707.0,14.0,0.0,0.0,0.0,1.0] +[7.0,37.0,4.0,0.0,0.0,0.0,52.0,0.0,0.0,0.0,6.0,4.0,908.0,1.0,3.0,9.0,2.0,1.0] +[8.0,32.0,5.0,1.0,0.0,0.0,230.0,0.0,0.0,0.0,18.0,5.0,442.0,1.0,4.0,8.0,1.0,1.0] +[9.0,31.0,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,5.0,895.0,2.0,5.0,2.0,1.0,1.0] +[10.0,32.0,7.0,1.0,2.0,0.0,1815.0,1.0,0.0,2.0,10.0,6.0,235.0,1.0,6.0,2.0,1.0,1.0] +``` + +## Quantify test dataset + +```sql +select * from test limit 10; + +1 30 management single tertiary no 1028 no no cellular 4 feb 1294 2 -1 0 unknown +2 39 self-employed single tertiary no 426 no no unknown 18 jun 1029 1 -1 0 unknown +3 38 technician single tertiary no -572 yes yes unknown 5 jun 26 24 -1 0 unknown +4 34 technician single secondary no -476 yes no unknown 27 may 92 4 -1 0 unknown +5 37 entrepreneur married primary no 62 no no cellular 31 jul 404 2 -1 0 unknown +6 43 services married primary no 574 yes no cellular 8 may 140 1 -1 0 unknown +7 54 technician married secondary no 324 yes no telephone 13 may 51 1 -1 0 unknown +8 41 blue-collar married secondary no 121 yes no cellular 13 may 16 6 176 5 other +9 52 housemaid married primary no 1466 no yes cellular 20 nov 150 1 -1 0 unknown +10 32 management married secondary no 6217 yes yes cellular 18 nov 486 2 181 2 failure +``` + +```sql +select + id, + array(age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, pdays, previous, poutcome) as features +from ( + select + quantify( + output_row, id, age, job, marital, education, default, balance, housing, 
loan, contact, day, month, duration, campaign, if(pdays==-1,0,pdays), previous, poutcome + ) as (id, age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, pdays, previous, poutcome) + from ( + select * from ( + select + 1 as train_first, false as output_row, id, age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, pdays, previous, poutcome + from + train + union all + select + 2 as train_first, true as output_row, id, age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, pdays, previous, poutcome + from + test + ) t0 + order by train_first, id asc + ) t1 +) t2 +limit 10; + +1 [30,2,1,2,0,1028,1,0,0,4,1,1294,2,0,0,0] +2 [39,7,1,2,0,426,1,0,1,18,3,1029,1,0,0,0] +3 [38,5,1,2,0,-572,0,1,1,5,3,26,24,0,0,0] +4 [34,5,1,0,0,-476,0,0,1,27,5,92,4,0,0,0] +5 [37,1,0,1,0,62,1,0,0,31,8,404,2,0,0,0] +6 [43,4,0,1,0,574,0,0,0,8,5,140,1,0,0,0] +7 [54,5,0,0,0,324,0,0,2,13,5,51,1,0,0,0] +8 [41,0,0,0,0,121,0,0,0,13,5,16,6,176,5,3] +9 [52,8,0,1,0,1466,1,1,0,20,9,150,1,0,0,0] +10 [32,2,0,0,0,6217,0,1,0,18,9,486,2,181,2,1] +``` \ No newline at end of file
