http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/ft_engineering/scaling.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/ft_engineering/scaling.md b/docs/gitbook/ft_engineering/scaling.md new file mode 100644 index 0000000..6e7d312 --- /dev/null +++ b/docs/gitbook/ft_engineering/scaling.md @@ -0,0 +1,173 @@ +# Min-Max Normalization +http://en.wikipedia.org/wiki/Feature_scaling#Rescaling +```sql +select min(target), max(target) +from ( +select target from e2006tfidf_train +-- union all +-- select target from e2006tfidf_test +) t; +``` + +> -7.899578 -0.51940954 + +```sql +set hivevar:min_target=-7.899578; +set hivevar:max_target=-0.51940954; + +create or replace view e2006tfidf_train_scaled +as +select + rowid, + rescale(target, ${min_target}, ${max_target}) as target, + features +from + e2006tfidf_train; +``` + +# Feature scaling by zscore +http://en.wikipedia.org/wiki/Standard_score + +```sql +select avg(target), stddev_pop(target) +from ( +select target from e2006tfidf_train +-- union all +-- select target from e2006tfidf_test +) t; +``` +> -3.566241460963296 0.6278076335455348 + +```sql +set hivevar:mean_target=-3.566241460963296; +set hivevar:stddev_target=0.6278076335455348; + +create or replace view e2006tfidf_train_scaled +as +select + rowid, + zscore(target, ${mean_target}, ${stddev_target}) as target, + features +from + e2006tfidf_train; +``` + +# Apply Normalization to more complex feature vector + +Apply normalization to the following data. 
+ +```sql +select rowid, features from train limit 3; +``` + +``` +1 ["weight:69.613","specific_heat:129.07","reflectance:52.111"] +2 ["weight:70.67","specific_heat:128.161","reflectance:52.446"] +3 ["weight:72.303","specific_heat:128.45","reflectance:52.853"] +``` + +We can create a normalized table as follows: + +```sql +create table train_normalized +as +WITH fv as ( +select + rowid, + extract_feature(feature) as feature, + extract_weight(feature) as value +from + train + LATERAL VIEW explode(features) exploded AS feature +), +stats as ( +select + feature, + -- avg(value) as mean, stddev_pop(value) as stddev + min(value) as min, max(value) as max +from + fv +group by + feature +), +norm as ( +select + rowid, + t1.feature, + -- zscore(t1.value, t2.mean, t2.stddev) as zscore + rescale(t1.value, t2.min, t2.max) as minmax +from + fv t1 JOIN + stats t2 ON (t1.feature = t2.feature) +), +norm_fv as ( +select + rowid, + -- concat(feature, ":", zscore) as feature + -- concat(feature, ":", minmax) as feature -- Before Hivemall v0.3.2-1 + feature(feature, minmax) as feature -- Hivemall v0.3.2-1 or later +from + norm +) +select + rowid, + collect_list(feature) as features +from + norm_fv +group by + rowid +; +``` + +``` +1 ["reflectance:0.5252967","specific_heat:0.19863537","weight:0.0"] +2 ["reflectance:0.5950446","specific_heat:0.09166764","weight:0.052084323"] +3 ["reflectance:0.6797837","specific_heat:0.12567581","weight:0.13255163"] +... 
+``` + +# Tips for using both min-max and zscore normalization + +```sql +WITH quantative as ( + select id, true as minmax, "age" as feature, age as value from train + union all + select id, false as minmax, "balance" as feature, balance as value from train + union all + select id, true as minmax, "day" as feature, day as value from train + union all + select id, false as minmax, "duration" as feature, duration as value from train + union all + select id, false as minmax, "campaign" as feature, campaign as value from train + union all + select id, false as minmax, "pdays" as feature, if(pdays = -1, 0, pdays) as value from train + union all + select id, false as minmax, "previous" as feature, previous as value from train +), +quantative_stats as ( +select + feature, + avg(value) as mean, stddev_pop(value) as stddev, + min(value) as min, max(value) as max +from + quantative +group by + feature +), +quantative_norm as ( +select + t1.id, + collect_list( + feature( + t1.feature, + if(t1.minmax,rescale(t1.value, t2.min, t2.max),zscore(t1.value, t2.mean, t2.stddev)) + ) + ) as features +from + quantative t1 + JOIN quantative_stats t2 ON (t1.feature = t2.feature) +group by + t1.id +) +... +``` \ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/ft_engineering/tfidf.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/ft_engineering/tfidf.md b/docs/gitbook/ft_engineering/tfidf.md new file mode 100644 index 0000000..e881e10 --- /dev/null +++ b/docs/gitbook/ft_engineering/tfidf.md @@ -0,0 +1,149 @@ +This document explains how to compute [TF-IDF](http://en.wikipedia.org/wiki/Tf%E2%80%93idf) with Apache Hive/Hivemall. + +What you need to compute TF-IDF is a table/view composing (docid, word) pair, 2 views, and 1 query. + +_Note that this feature is supported since Hivemall v0.3-beta3 or later. Macro is supported since Hive 0.12 or later._ + +# Define macros used in the TF-IDF computation +```sql +create temporary macro max2(x INT, y INT) +if(x>y,x,y); + +-- create temporary macro idf(df_t INT, n_docs INT) +-- (log(10, CAST(n_docs as FLOAT)/max2(1,df_t)) + 1.0); + +create temporary macro tfidf(tf FLOAT, df_t INT, n_docs INT) +tf * (log(10, CAST(n_docs as FLOAT)/max2(1,df_t)) + 1.0); +``` + +# Data preparation +To calculate TF-IDF, you need to prepare a relation consists of (docid,word) tuples. +```sql +create external table wikipage ( + docid int, + page string +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' +STORED AS TEXTFILE; + +cd ~/tmp +wget https://gist.githubusercontent.com/myui/190b91a3a792ccfceda0/raw/327acd192da4f96da8276dcdff01b19947a4373c/tfidf_test.tsv + +LOAD DATA LOCAL INPATH '/home/myui/tmp/tfidf_test.tsv' INTO TABLE wikipage; + +create or replace view wikipage_exploded +as +select + docid, + word +from + wikipage LATERAL VIEW explode(tokenize(page,true)) t as word +where + not is_stopword(word); +``` +You can download the data of the wikipage table from [this link]( https://gist.githubusercontent.com/myui/190b91a3a792ccfceda0/raw/327acd192da4f96da8276dcdff01b19947a4373c/tfidf_test.tsv). 
+ +# Define views of TF/DF +```sql +create or replace view term_frequency +as +select + docid, + word, + freq +from ( +select + docid, + tf(word) as word2freq +from + wikipage_exploded +group by + docid +) t +LATERAL VIEW explode(word2freq) t2 as word, freq; + +create or replace view document_frequency +as +select + word, + count(distinct docid) docs +from + wikipage_exploded +group by + word; +``` + +# TF-IDF calculation for each docid/word pair +```sql +-- set the total number of documents +select count(distinct docid) from wikipage; +set hivevar:n_docs=3; + +create or replace view tfidf +as +select + tf.docid, + tf.word, + -- tf.freq * (log(10, CAST(${n_docs} as FLOAT)/max2(1,df.docs)) + 1.0) as tfidf + tfidf(tf.freq, df.docs, ${n_docs}) as tfidf +from + term_frequency tf + JOIN document_frequency df ON (tf.word = df.word) +order by + tfidf desc; +``` + +The result will be as follows: +``` +docid word tfidf +1 justice 0.1641245850805637 +3 knowledge 0.09484606645205085 +2 action 0.07033910867777095 +1 law 0.06564983513276658 +1 found 0.06564983513276658 +1 religion 0.06564983513276658 +1 discussion 0.06564983513276658 + ... + ... +2 act 0.017584777169442737 +2 virtues 0.017584777169442737 +2 well 0.017584777169442737 +2 willingness 0.017584777169442737 +2 find 0.017584777169442737 +2 1 0.014001086678120098 +2 experience 0.014001086678120098 +2 often 0.014001086678120098 +``` +The above result is considered to be appropriate as docid 1, 2, and 3 are the Wikipedia entries of Justice, Wisdom, and Knowledge, respectively. 
+ +# Feature Vector with TF-IDF values + +```sql +select + docid, + -- collect_list(concat(word, ":", tfidf)) as features -- Hive 0.13 or later + collect_list(feature(word, tfidf)) as features -- Hivemall v0.3.4 & Hive 0.13 or later + -- collect_all(concat(word, ":", tfidf)) as features -- before Hive 0.13 +from + tfidf +group by + docid; +``` + +``` +1 ["justice:0.1641245850805637","found:0.06564983513276658","discussion:0.06564983513276658","law:0.065 +64983513276658","based:0.06564983513276658","religion:0.06564983513276658","viewpoints:0.03282491756638329"," +rationality:0.03282491756638329","including:0.03282491756638329","context:0.03282491756638329","concept:0.032 +82491756638329","rightness:0.03282491756638329","general:0.03282491756638329","many:0.03282491756638329","dif +fering:0.03282491756638329","fairness:0.03282491756638329","social:0.03282491756638329","broadest:0.032824917 +56638329","equity:0.03282491756638329","includes:0.03282491756638329","theology:0.03282491756638329","ethics: +0.03282491756638329","moral:0.03282491756638329","numerous:0.03282491756638329","philosophical:0.032824917566 +38329","application:0.03282491756638329","perspectives:0.03282491756638329","procedural:0.03282491756638329", +"realm:0.03282491756638329","divided:0.03282491756638329","concepts:0.03282491756638329","attainment:0.032824 +91756638329","fields:0.03282491756638329","often:0.026135361945200226","philosophy:0.026135361945200226","stu +dy:0.026135361945200226"] +2 ["action:0.07033910867777095","wisdom:0.05275433288400458","one:0.05275433288400458","understanding:0 +.04200326112968063","judgement:0.035169554338885474","apply:0.035169554338885474","disposition:0.035169554338 +885474","given:0.035169554338885474" +... 
+``` \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/ft_engineering/vectorizer.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/ft_engineering/vectorizer.md b/docs/gitbook/ft_engineering/vectorizer.md new file mode 100644 index 0000000..bc929a5 --- /dev/null +++ b/docs/gitbook/ft_engineering/vectorizer.md @@ -0,0 +1,42 @@ +## Feature Vectorizer + +`array<string> vectorize_feature(array<string> featureNames, ...)` is useful to generate a feature vector for each row, from a table. + +```sql +select vectorize_features(array("a","b"),"0.2","0.3") from dual; +>["a:0.2","b:0.3"] + +-- avoid zero weight +select vectorize_features(array("a","b"),"0.2",0) from dual; +> ["a:0.2"] + +-- true boolean value is treated as 1.0 (a categorical value w/ its column name) +select vectorize_features(array("a","b","bool"),0.2,0.3,true) from dual; +> ["a:0.2","b:0.3","bool:1.0"] + +-- an example to generate feature vectors from table +select * from dual; +> 1 +select vectorize_features(array("a"),*) from dual; +> ["a:1.0"] + +-- has categorical feature +select vectorize_features(array("a","b","wheather"),"0.2","0.3","sunny") from dual; +> ["a:0.2","b:0.3","whether#sunny"] +``` + +```sql +select + id, + vectorize_features( + array("age","job","marital","education","default","balance","housing","loan","contact","day","month","duration","campaign","pdays","previous","poutcome"), + age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome + ) as features, + y +from + train +limit 2; + +> 1 ["age:39.0","job#blue-collar","marital#married","education#secondary","default#no","balance:1756.0","housing#yes","loan#no","contact#cellular","day:3.0","month#apr","duration:939.0","campaign:1.0","pdays:-1.0","poutcome#unknown"] 1 +> 2 
["age:51.0","job#entrepreneur","marital#married","education#primary","default#no","balance:1443.0","housing#no","loan#no","contact#cellular","day:18.0","month#feb","duration:172.0","campaign:10.0","pdays:-1.0","poutcome#unknown"] 1 +``` \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/getting_started/README.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/getting_started/README.md b/docs/gitbook/getting_started/README.md new file mode 100644 index 0000000..27870e5 --- /dev/null +++ b/docs/gitbook/getting_started/README.md @@ -0,0 +1 @@ +# Summary \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/getting_started/input-format.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/getting_started/input-format.md b/docs/gitbook/getting_started/input-format.md new file mode 100644 index 0000000..272d3eb --- /dev/null +++ b/docs/gitbook/getting_started/input-format.md @@ -0,0 +1,214 @@ +This page explains the input format of training data in Hivemall. +Here, we use [EBNF](http://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_Form)-like notation for describing the format. + +<!-- toc --> + +# Input Format for Classification + +The classifiers of Hivemall takes 2 (or 3) arguments: *features*, *label*, and *options* (a.k.a. [hyperparameters](http://en.wikipedia.org/wiki/Hyperparameter)). The first two arguments of training functions (e.g., [logress](https://github.com/myui/hivemall/wiki/a9a-binary-classification-(logistic-regression)) and [train_scw](https://github.com/myui/hivemall/wiki/news20-binary-classification-%232-(CW,-AROW,-SCW))) represents training examples. + +In Statistics, *features* and *label* are called [Explanatory variable and Response Variable](http://www.oswego.edu/~srp/stats/variable_types.htm), respectively. 
+ +# Features format (for classification and regression) + +The format of *features* is common between (binary and multi-class) classification and regression. +Hivemall accepts ARRAY<INT|BIGINT|TEXT> for the type of *features* column. + +Hivemall uses a *sparse* data format (cf. [Compressed Row Storage](http://netlib.org/linalg/html_templates/node91.html)) which is similar to [LIBSVM](http://stackoverflow.com/questions/12112558/read-write-data-in-libsvm-format) and [Vowpal Wabbit](https://github.com/JohnLangford/vowpal_wabbit/wiki/Input-format). + +The format of each feature in an array is as follows: +``` +feature ::= <index>:<weight> or <index> +``` + +Each element of *index* or *weight* then accepts the following format: +``` +index ::= <INT | BIGINT | TEXT> +weight ::= <FLOAT> +``` + +The *index* are usually a number (INT or BIGINT) starting from 1. +Here is an instance of a features. +``` +10:3.4 123:0.5 34567:0.231 +``` + +*Note:* As mentioned later, *index* "0" is reserved for a [Bias/Dummy variable](https://github.com/myui/hivemall/wiki/Using-explicit-addBias()-for-a-better-prediction). + +In addition to numbers, you can use a TEXT value for an index. For example, you can use array("height:1.5", "length:2.0") for the features. +``` +"height:1.5" "length:2.0" +``` + +## Quantitative and Categorical variables + +A [quantitative variable](http://www.oswego.edu/~srp/stats/variable_types.htm) must have an *index* entry. + +Hivemall (v0.3.1 or later) provides *add_feature_index* function which is useful for adding indexes to quantitative variables. + +```sql +select add_feature_index(array(3,4.0,5)) from dual; +``` +> ["1:3.0","2:4.0","3:5.0"] + +You can omit specifying *weight* for each feature e.g. for [Categorical variables](http://www.oswego.edu/~srp/stats/variable_types.htm) as follows: +``` +feature ::= <index> +``` +Note 1.0 is used for the weight when omitting *weight*. 
+ +## Bias/Dummy Variable in features + +Note that "0" is reserved for a Bias variable (called dummy variable in Statistics). + +The [addBias](https://github.com/myui/hivemall/wiki/Using-explicit-addBias()-for-a-better-prediction) function is Hivemall appends "0:1.0" as an element of array in *features*. + +## Feature hashing + +Hivemall supports [feature hashing/hashing trick](http://en.wikipedia.org/wiki/Feature_hashing) through [mhash function](https://github.com/myui/hivemall/wiki/KDDCup-2012-track-2-CTR-prediction-dataset#converting-feature-representation-by-feature-hashing). + +The mhash function takes a feature (i.e., *index*) of TEXT format and generates a hash number of a range from 1 to 2^24 (=16777216) by the default setting. + +Feature hashing is useful where the dimension of feature vector (i.e., the number of elements in *features*) is so large. Consider applying [mhash function]((https://github.com/myui/hivemall/wiki/KDDCup-2012-track-2-CTR-prediction-dataset#converting-feature-representation-by-feature-hashing)) when a prediction model does not fit in memory and OutOfMemory exception happens. + +In general, you don't need to use mhash when the dimension of feature vector is less than 16777216. +If feature *index* is very long TEXT (e.g., "xxxxxxx-yyyyyy-weight:55.3") and uses huge memory spaces, consider using mhash as follows: +```sql +-- feature is v0.3.2 or before +concat(mhash(extract_feature("xxxxxxx-yyyyyy-weight:55.3")), ":", extract_weight("xxxxxxx-yyyyyy-weight:55.3")) + +-- feature is v0.3.2-1 or later +feature(mhash(extract_feature("xxxxxxx-yyyyyy-weight:55.3")), extract_weight("xxxxxxx-yyyyyy-weight:55.3")) +``` +> 43352:55.3 + +## Feature Normalization + +Feature (weight) normalization is important in machine learning. Please refer [https://github.com/myui/hivemall/wiki/Feature-scaling](https://github.com/myui/hivemall/wiki/Feature-scaling) for detail. 
+
+***
+
+# Label format in Binary Classification
+
+The *label* must be an *INT* typed column and the values are positive (+1) or negative (-1) as follows:
+```
+<label> ::= 1 | -1
+```
+
+Alternatively, you can use the following format that represents 1 for a positive example and 0 for a negative example:
+```
+<label> ::= 0 | 1
+```
+
+# Label format in Multi-class Classification
+
+You can use any PRIMITIVE type in the multi-class *label*.
+
+```
+<label> ::= <primitive type>
+```
+
+Typically, the type of label column will be INT, BIGINT, or TEXT.
+
+***
+
+# Input format in Regression
+
+In regression, the response/predictor variable (we denote it as *target*) is a real number.
+
+Before Hivemall v0.3, we accepted only the FLOAT type for *target*.
+```
+<target> ::= <FLOAT>
+```
+
+You need to explicitly cast a double value of *target* to float as follows:
+```sql
+CAST(target as FLOAT)
+```
+
+On the other hand, Hivemall v0.3 or later accepts double-compatible numbers in *target*.
+```
+<target> ::= <FLOAT | DOUBLE | INT | TINYINT | SMALLINT | BIGINT>
+```
+
+## Target in Logistic Regression
+
+Logistic regression is actually a binary classification scheme, while it can produce the probability that a training example is positive.
+
+A *target* value of a training input must be in the range 0.0 to 1.0, specifically 0.0 or 1.0.
+ +*** + +# Helper functions + +```sql +-- hivemall v0.3.2 and before +select concat("weight",":",55.0); + +-- hivemall v0.3.2-1 and later +select feature("weight", 55.0); +``` +> weight:55.0 + +```sql +select extract_feature("weight:55.0"), extract_weight("weight:55.0"); +``` +> weight | 55.0 + +```sql +-- hivemall v0.4.0 and later +select feature_index(array("10:0.2","7:0.3","9")); +``` +> [10,7,9] + +```sql +select + convert_label(-1), convert_label(1), convert_label(0.0f), convert_label(1.0f) +from + dual; +``` +> 0.0f | 1.0f | -1 | 1 + +## Quantitative Features + +`array<string> quantitative_features(array<string> featureNames, ...)` is a helper function to create sparse quantitative features from a table. + +```sql +select quantitative_features(array("apple","value"),1,120.3); +``` +> ["apple:1.0","value:120.3"] + +## Categorical Features + +`array<string> categorical_features(array<string> featureNames, ...)` is a helper function to create sparse categorical features from a table. 
+ +```sql +select categorical_features( + array("is_cat","is_dog","is_lion","is_pengin","species"), + 1, 0, 1.0, true, "dog" +); +``` +> ["is_cat#1","is_dog#0","is_lion#1.0","is_pengin#true","species#dog"] + +## Preparing training data table + +You can create a training data table as follows: + +```sql +select + rowid() as rowid, + concat_array( + array("bias:1.0"), + categorical_features( + array("id", "name"), + id, name + ), + quantitative_features( + array("height", "weight"), + height, weight + ) + ) as features, + click_or_not as label +from + table; +``` \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/getting_started/installation.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/getting_started/installation.md b/docs/gitbook/getting_started/installation.md new file mode 100644 index 0000000..bb1920e --- /dev/null +++ b/docs/gitbook/getting_started/installation.md @@ -0,0 +1,25 @@ +Prerequisites +============ + +* Hive v0.12 or later +* Java 7 or later +* [hivemall-core-xxx-with-dependencies.jar](https://github.com/myui/hivemall/releases) +* [define-all.hive](https://github.com/myui/hivemall/releases) + +Installation +============ + +Add the following two lines to your `$HOME/.hiverc` file. + +``` +add jar /home/myui/tmp/hivemall-core-xxx-with-dependencies.jar; +source /home/myui/tmp/define-all.hive; +``` + +This automatically loads all Hivemall functions every time you start a Hive session. Alternatively, you can run the following command each time. 
+ +``` +$ hive +add jar /tmp/hivemall-core-xxx-with-dependencies.jar; +source /tmp/define-all.hive; +``` \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/getting_started/permanent-functions.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/getting_started/permanent-functions.md b/docs/gitbook/getting_started/permanent-functions.md new file mode 100644 index 0000000..aab399b --- /dev/null +++ b/docs/gitbook/getting_started/permanent-functions.md @@ -0,0 +1,42 @@ +Hive v0.13 or later supports [permanent functions](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-Create/DropFunction) that live across sessions. + +Permanent functions are useful when you are using Hive through Hiveserver or to avoid hivemall installation for each session. + +_Note: This feature is supported since hivemall-0.3 beta 3 or later._ + +<!-- toc --> + +# Put hivemall jar to HDFS + +First, put hivemall jar to HDFS as follows: +```sh +hadoop fs -mkdir -p /apps/hivemall +hadoop fs -put hivemall-with-dependencies.jar /apps/hivemall +``` + +# Create permanent functions + +_The following is an auxiliary step to define functions for hivemall databases, not for the default database._ +```sql +CREATE DATABASE IF NOT EXISTS hivemall; +USE hivemall; +``` + +Then, create permanent functions using [define-all-as-permanent.hive](https://github.com/myui/hivemall/blob/master/resources/ddl/define-all-as-permanent.hive), a DDL script to define permanent UDFs. +```sql +set hivevar:hivemall_jar=hdfs:///apps/hivemall/hivemall-with-dependencies.jar; + +source /tmp/define-all-as-permanent.hive; +``` + +# Confirm functions + +```sql +show functions "hivemall.*"; + +> hivemall.adadelta +> hivemall.adagrad +``` + +> #### Caution +You need to specify "hivemall." 
prefix to call hivemall UDFs in your queries if UDFs are loaded into non-default scheme, in this case _hivemall_. \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/misc/generic_funcs.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/misc/generic_funcs.md b/docs/gitbook/misc/generic_funcs.md new file mode 100644 index 0000000..1769699 --- /dev/null +++ b/docs/gitbook/misc/generic_funcs.md @@ -0,0 +1,210 @@ +This page describes a list of useful Hivemall generic functions. + +# Array functions + +## Array UDFs + +- `array_concat(array<ANY> x1, array<ANY> x2, ..)` - Returns a concatenated array + +```sql +select array_concat(array(1),array(2,3)); +> [1,2,3] +``` + +- `array_intersect(array<ANY> x1, array<ANY> x2, ..)` - Returns an intersect of given arrays + +```sql +select array_intersect(array(1,3,4),array(2,3,4),array(3,5)); +> [3] +``` + +- `array_remove(array<int|text> original, int|text|array<int> target)` - Returns an array that the target is removed from the original array + +```sql +select array_remove(array(1,null,3),array(null)); +> [3] + +select array_remove(array("aaa","bbb"),"bbb"); +> ["aaa"] +``` + +- `sort_and_uniq_array(array<int>)` - Takes an array of type int and returns a sorted array in a natural order with duplicate elements eliminated + +```sql +select sort_and_uniq_array(array(3,1,1,-2,10)); +> [-2,1,3,10] +``` + +- `subarray_endwith(array<int|text> original, int|text key)` - Returns an array that ends with the specified key + +```sql +select subarray_endwith(array(1,2,3,4), 3); +> [1,2,3] +``` + +- `subarray_startwith(array<int|text> original, int|text key)` - Returns an array that starts with the specified key + +```sql +select subarray_startwith(array(1,2,3,4), 2); +> [2,3,4] +``` + +- `subarray(array<int> orignal, int fromIndex, int toIndex)` - Returns a slice of the original array between the inclusive fromIndex and the 
exclusive toIndex
+
+```sql
+select subarray(array(1,2,3,4,5,6), 2,4);
+> [3,4]
+```
+
+## Array UDAFs
+
+- `array_avg(array<NUMBER>)` - Returns an array<double> in which each element is the mean of a set of numbers
+
+- `array_sum(array<NUMBER>)` - Returns an array<double> in which each element is the sum of a set of numbers
+
+# Bitset functions
+
+## Bitset UDF
+
+- `to_bits(int[] indexes)` - Returns a bitset representation of the given indexes in long[]
+
+```sql
+select to_bits(array(1,2,3,128));
+>[14,-9223372036854775808]
+```
+
+- `unbits(long[] bitset)` - Returns a long array of the given bitset representation
+
+```sql
+select unbits(to_bits(array(1,4,2,3)));
+> [1,2,3,4]
+```
+
+- `bits_or(array<long> b1, array<long> b2, ..)` - Returns a logical OR of the given bitsets
+
+```sql
+select unbits(bits_or(to_bits(array(1,4)),to_bits(array(2,3))));
+> [1,2,3,4]
+```
+
+## Bitset UDAF
+
+- `bits_collect(int|long x)` - Returns a bitset in array<long>
+
+
+# Compression functions
+
+- `deflate(TEXT data [, const int compressionLevel])` - Returns a compressed BINARY object by using Deflater. 
+The compression level must be in range [-1,9] + +```sql +select base91(deflate('aaaaaaaaaaaaaaaabbbbccc')); +> AA+=kaIM|WTt!+wbGAA +``` + +- `inflate(BINARY compressedData)` - Returns a decompressed STRING by using Inflater + + +```sql +select inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc')))); +> aaaaaaaaaaaaaaaabbbbccc +``` + +# Map functions + +## Map UDFs + +- `map_get_sum(map<int,float> src, array<int> keys)` - Returns sum of values that are retrieved by keys + +- `map_tail_n(map SRC, int N)` - Returns the last N elements from a sorted array of SRC + +## MAP UDAFs + +- `to_map(key, value)` - Convert two aggregated columns into a key-value map + +- `to_ordered_map(key, value [, const boolean reverseOrder=false])` - Convert two aggregated columns into an ordered key-value map + + +# MapReduce functions + +- `rowid()` - Returns a generated row id of a form {TASK_ID}-{SEQUENCE_NUMBER} + +- `taskid()` - Returns the value of mapred.task.partition + +# Math functions + +- `sigmoid(x)` - Returns 1.0 / (1.0 + exp(-x)) + +# Text processing functions + +- `base91(binary)` - Convert the argument from binary to a BASE91 string + +```sql +select base91(deflate('aaaaaaaaaaaaaaaabbbbccc')); +> AA+=kaIM|WTt!+wbGAA +``` + +- `unbase91(string)` - Convert a BASE91 string to a binary + +```sql +select inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc')))); +> aaaaaaaaaaaaaaaabbbbccc +``` + +- `normalize_unicode(string str [, string form])` - Transforms `str` with the specified normalization form. 
The `form` takes one of NFC (default), NFD, NFKC, or NFKD
+
+```sql
+select normalize_unicode('ﾊﾝｶｸｶﾅ','NFKC');
+> ハンカクカナ
+
+select normalize_unicode('㈱㌧㌦Ⅲ','NFKC');
+> (株)トンドルIII
+```
+
+- `split_words(string query [, string regex])` - Returns an array<text> containing split strings
+
+- `is_stopword(string word)` - Returns whether the given word is an English stopword or not
+
+- `tokenize(string englishText [, boolean toLowerCase])` - Returns words in array<string>
+
+- `tokenize_ja(String line [, const string mode = "normal", const list<string> stopWords, const list<string> stopTags])` - Returns tokenized strings in array<string>
+
+```sql
+select tokenize_ja("kuromojiを使った分かち書きのテストです。第二引数にはnormal/search/extendedを指定できます。デフォルトではnormalモードです。");
+
+> ["kuromoji","使う","分かち書き","テスト","第","二","引数","normal","search","extended","指定","デフォルト","normal","モード"]
+```
+
+https://github.com/myui/hivemall/wiki/Tokenizer
+
+# Other functions
+
+- `convert_label(const int|const float)` - Convert from -1|1 to 0.0f|1.0f, or from 0.0f|1.0f to -1|1
+
+- `each_top_k(int K, Object group, double cmpKey, *)` - Returns top-K values (or tail-K values when k is less than 0)
+
+https://github.com/myui/hivemall/wiki/Efficient-Top-k-computation-on-Apache-Hive-using-Hivemall-UDTF
+
+- `generate_series(const int|bigint start, const int|bigint end)` - Generate a series of values, from start to end
+
+```sql
+WITH dual as (
+  select 1
+)
+select generate_series(1,9)
+from dual;
+
+1
+2
+3
+4
+5
+6
+7
+8
+9
+```
+
+A similar function to PostgreSQL's `generate_series`. 
+http://www.postgresql.org/docs/current/static/functions-srf.html +- `x_rank(KEY)` - Generates a pseudo sequence number starting from 1 for each key \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/misc/tokenizer.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/misc/tokenizer.md b/docs/gitbook/misc/tokenizer.md new file mode 100644 index 0000000..cd2ce08 --- /dev/null +++ b/docs/gitbook/misc/tokenizer.md @@ -0,0 +1,30 @@ +# Tokenizer for English Texts + +Hivemall provides simple English text tokenizer UDF that has following syntax: +```sql +tokenize(text input, optional boolean toLowerCase = false) +``` + +# Tokenizer for Japanese Texts + +Hivemall-NLP module provides a Japanese text tokenizer UDF using [Kuromoji](https://github.com/atilika/kuromoji). + +First of all, you need to issue the following DDLs to use the NLP module. Note NLP module is not included in [hivemall-with-dependencies.jar](https://github.com/myui/hivemall/releases). + +> add jar /tmp/[hivemall-nlp-xxx-with-dependencies.jar](https://github.com/myui/hivemall/releases); + +> source /tmp/[define-additional.hive](https://github.com/myui/hivemall/releases); + +The signature of the UDF is as follows: +```sql +tokenize_ja(text input, optional const text mode = "normal", optional const array<string> stopWords, optional const array<string> stopTags) +``` +_Caution: `tokenize_ja` is supported since Hivemall v0.4.1 and later._ + +It's basic usage is as follows: +```sql +select tokenize_ja("kuromojiã使ã£ãåãã¡æ¸ãã®ãã¹ãã§ãã第äºå¼æ°ã«ã¯normal/search/extendedãæå®ã§ãã¾ããããã©ã«ãã§ã¯normalã¢ã¼ãã§ãã"); +``` +> ["kuromoji","使ã","åãã¡æ¸ã","ãã¹ã","第","äº","弿°","normal","search","extended","æå®","ããã©ã«ã","normal","ã¢ã¼ã"] + +For detailed APIs, please refer Javadoc of [JapaneseAnalyzer](https://lucene.apache.org/core/5_3_1/analyzers-kuromoji/org/apache/lucene/analysis/ja/JapaneseAnalyzer.html) as well. 
\ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/misc/topk.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/misc/topk.md b/docs/gitbook/misc/topk.md new file mode 100644 index 0000000..dcd545a --- /dev/null +++ b/docs/gitbook/misc/topk.md @@ -0,0 +1,288 @@ +`each_top_k(int k, ANY group, double value, arg1, arg2, ..., argN)` returns a top-k records for each `group`. It returns a relation consists of `(int rank, double value, arg1, arg2, .., argN)`. + +This function is particularly useful for applying a similarity/distance function where the computation complexity is **O(nm)**. + +`each_top_k` is very fast when compared to other methods running top-k queries (e.g., [`rank/distributed by`](https://ragrawal.wordpress.com/2011/11/18/extract-top-n-records-in-each-group-in-hadoophive/)) in Hive. + +## Caution +* `each_top_k` is supported from Hivemall v0.3.2-3 or later. +* This UDTF assumes that input records are sorted by `group`. Use `DISTRIBUTED BY group SORTED BY group` to ensure that. Or, you can use `LEFT OUTER JOIN` for certain cases. +* It takes variable lengths arguments in `argN`. +* The third argument `value` is used for the comparison. +* `Any number types` or `timestamp` are accepted for the type of `value`. +* If k is less than 0, reverse order is used and `tail-K` records are returned for each `group`. +* Note that this function returns [a pseudo ranking](http://www.michaelpollmeier.com/selecting-top-k-items-from-a-list-efficiently-in-java-groovy/) for top-k. It always returns `at-most K` records for each group. The ranking scheme is similar to `dense_rank` but slightly different in certain cases. 
+ +# Usage + +## top-k clicks + +http://stackoverflow.com/questions/9390698/hive-getting-top-n-records-in-group-by-query/32559050#32559050 + +```sql +set hivevar:k=5; + +select + page-id, + user-id, + clicks +from ( + select + each_top_k(${k}, page-id, clicks, page-id, user-id) + as (rank, clicks, page-id, user-id) + from ( + select + page-id, user-id, clicks + from + mytable + DISTRIBUTE BY page-id SORT BY page-id + ) t1 +) t2 +order by page-id ASC, clicks DESC; +``` + +## Top-k similarity computation + +```sql +set hivevar:k=10; + +SELECT + each_top_k( + ${k}, t2.id, angular_similarity(t2.features, t1.features), + t2.id, + t1.id, + t1.y + ) as (rank, similarity, base_id, neighbor_id, y) +FROM + test_hivemall t2 + LEFT OUTER JOIN train_hivemall t1; +``` + +``` +1 0.8594650626182556 12 10514 0 +2 0.8585299849510193 12 11719 0 +3 0.856602132320404 12 21009 0 +4 0.8562054634094238 12 17582 0 +5 0.8516314029693604 12 22006 0 +6 0.8499397039413452 12 25364 0 +7 0.8467264771461487 12 900 0 +8 0.8463355302810669 12 8018 0 +9 0.8439178466796875 12 7041 0 +10 0.8438876867294312 12 21595 0 +1 0.8390793800354004 25 21125 0 +2 0.8344510793685913 25 14073 0 +3 0.8340602517127991 25 9008 0 +4 0.8328862190246582 25 6598 0 +5 0.8301891088485718 25 943 0 +6 0.8271955251693726 25 20400 0 +7 0.8255619406700134 25 10922 0 +8 0.8241575956344604 25 8477 0 +9 0.822281539440155 25 25977 0 +10 0.8205751180648804 25 21115 0 +1 0.9761330485343933 34 2513 0 +2 0.9536819458007812 34 8697 0 +3 0.9531533122062683 34 7326 0 +4 0.9493276476860046 34 15173 0 +5 0.9480557441711426 34 19468 0 +... 
+``` + +### Explicit grouping using `distribute by` and `sort by` + +```sql +SELECT + each_top_k( + 10, id1, angular_similarity(features1, features2), + id1, + id2, + y + ) as (rank, similarity, id, other_id, y) +FROM ( +select + t1.id as id1, + t2.id as id2, + t1.features as features1, + t2.features as features2, + t1.y +from + train_hivemall t1 + CROSS JOIN test_hivemall t2 +DISTRIBUTE BY id1 SORT BY id1 +) t; +``` + +### Parallelization of similarity computation using WITH clause + +```sql +create table similarities +as +WITH test_rnd as ( +select + rand(31) as rnd, + id, + features +from + test_hivemall +), +t01 as ( +select + id, + features +from + test_rnd +where + rnd < 0.2 +), +t02 as ( +select + id, + features +from + test_rnd +where + rnd >= 0.2 and rnd < 0.4 +), +t03 as ( +select + id, + features +from + test_rnd +where + rnd >= 0.4 and rnd < 0.6 +), +t04 as ( +select + id, + features +from + test_rnd +where + rnd >= 0.6 and rnd < 0.8 +), +t05 as ( +select + id, + features +from + test_rnd +where + rnd >= 0.8 +), +s01 as ( +SELECT + each_top_k( + 10, t2.id, angular_similarity(t2.features, t1.features), + t2.id, + t1.id, + t1.y + ) as (rank, similarity, base_id, neighbor_id, y) +FROM + t01 t2 + LEFT OUTER JOIN train_hivemall t1 +), +s02 as ( +SELECT + each_top_k( + 10, t2.id, angular_similarity(t2.features, t1.features), + t2.id, + t1.id, + t1.y + ) as (rank, similarity, base_id, neighbor_id, y) +FROM + t02 t2 + LEFT OUTER JOIN train_hivemall t1 +), +s03 as ( +SELECT + each_top_k( + 10, t2.id, angular_similarity(t2.features, t1.features), + t2.id, + t1.id, + t1.y + ) as (rank, similarity, base_id, neighbor_id, y) +FROM + t03 t2 + LEFT OUTER JOIN train_hivemall t1 +), +s04 as ( +SELECT + each_top_k( + 10, t2.id, angular_similarity(t2.features, t1.features), + t2.id, + t1.id, + t1.y + ) as (rank, similarity, base_id, neighbor_id, y) +FROM + t04 t2 + LEFT OUTER JOIN train_hivemall t1 +), +s05 as ( +SELECT + each_top_k( + 10, t2.id, 
angular_similarity(t2.features, t1.features), + t2.id, + t1.id, + t1.y + ) as (rank, similarity, base_id, neighbor_id, y) +FROM + t05 t2 + LEFT OUTER JOIN train_hivemall t1 +) +select * from s01 +union all +select * from s02 +union all +select * from s03 +union all +select * from s04 +union all +select * from s05; +``` + +## tail-K + +```sql +set hivevar:k=-10; + +SELECT + each_top_k( + ${k}, t2.id, angular_similarity(t2.features, t1.features), + t2.id, + t1.id, + t1.y + ) as (rank, similarity, base_id, neighbor_id, y) +FROM + test_hivemall t2 + LEFT OUTER JOIN train_hivemall t1 +-- limit 25 +``` + +``` +1 0.4383084177970886 1 7503 0 +2 0.44166821241378784 1 10143 0 +3 0.4424300789833069 1 11073 0 +4 0.44254064559936523 1 17782 0 +5 0.4442034363746643 1 18556 0 +6 0.45163780450820923 1 3786 0 +7 0.45244503021240234 1 10242 0 +8 0.4525672197341919 1 21657 0 +9 0.4527127146720886 1 17218 0 +10 0.45314133167266846 1 25141 0 +1 0.44030147790908813 2 3786 0 +2 0.4408798813819885 2 23386 0 +3 0.44112563133239746 2 11073 0 +4 0.4415401816368103 2 22853 0 +5 0.4422193765640259 2 21657 0 +6 0.4429032802581787 2 10143 0 +7 0.4435907006263733 2 24413 0 +8 0.44569307565689087 2 7503 0 +9 0.4460843801498413 2 25141 0 +10 0.4464914798736572 2 24289 0 +1 0.43862903118133545 3 23150 1 +2 0.4398220181465149 3 9881 1 +3 0.44283604621887207 3 27121 0 +4 0.4432108402252197 3 26220 1 +5 0.44323229789733887 3 18541 0 +... 
+``` \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/multiclass/iris.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/multiclass/iris.md b/docs/gitbook/multiclass/iris.md new file mode 100644 index 0000000..e69de29 http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/multiclass/iris_dataset.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/multiclass/iris_dataset.md b/docs/gitbook/multiclass/iris_dataset.md new file mode 100644 index 0000000..86f89ad --- /dev/null +++ b/docs/gitbook/multiclass/iris_dataset.md @@ -0,0 +1,203 @@ +# Dataset preparation +Iris Dataset: https://archive.ics.uci.edu/ml/datasets/Iris + +```sh +$ wget http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data +$ less iris.data + + ... +5.3,3.7,1.5,0.2,Iris-setosa +5.0,3.3,1.4,0.2,Iris-setosa +7.0,3.2,4.7,1.4,Iris-versicolor + ... +``` + +# Create training/test table in Hive + +```sql +create database iris; +use iris; + +create external table iris_raw ( + rowid int, + label string, + features array<float> +) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' COLLECTION ITEMS TERMINATED BY "," STORED AS TEXTFILE LOCATION '/dataset/iris/raw'; +``` + +# Loading data into HDFS + +```sh +$ awk -F"," 'NF >0 {OFS="|"; print NR,$5,$1","$2","$3","$4}' iris.data | head -3 + +1|Iris-setosa|5.1,3.5,1.4,0.2 +2|Iris-setosa|4.9,3.0,1.4,0.2 +3|Iris-setosa|4.7,3.2,1.3,0.2 +``` + +```sh +$ awk -F"," 'NF >0 {OFS="|"; print NR,$5,$1","$2","$3","$4}' iris.data | hadoop fs -put - /dataset/iris/raw/iris.data +``` + +```sql +select count(1) from iris_raw; + +> 150 +``` + +# Feature scaling + +Normalization of feature weights is very important to get a good prediction in machine learning. 
+ +```sql +select + min(features[0]), max(features[0]), + min(features[1]), max(features[1]), + min(features[2]), max(features[2]), + min(features[3]), max(features[3]) +from + iris_raw; + +> 4.3 7.9 2.0 4.4 1.0 6.9 0.1 2.5 +``` + +```sql +set hivevar:f0_min=4.3; +set hivevar:f0_max=7.9; +set hivevar:f1_min=2.0; +set hivevar:f1_max=4.4; +set hivevar:f2_min=1.0; +set hivevar:f2_max=6.9; +set hivevar:f3_min=0.1; +set hivevar:f3_max=2.5; + +create or replace view iris_scaled +as +select + rowid, + label, + add_bias(array( + concat("1:", rescale(features[0],${f0_min},${f0_max})), + concat("2:", rescale(features[1],${f1_min},${f1_max})), + concat("3:", rescale(features[2],${f2_min},${f2_max})), + concat("4:", rescale(features[3],${f3_min},${f3_max})) + )) as features +from + iris_raw; +``` + +```sql +select * from iris_scaled limit 3; + +> 1 Iris-setosa ["1:0.22222215","2:0.625","3:0.0677966","4:0.041666664","0:1.0"] +> 2 Iris-setosa ["1:0.16666664","2:0.41666666","3:0.0677966","4:0.041666664","0:1.0"] +> 3 Iris-setosa ["1:0.11111101","2:0.5","3:0.05084745","4:0.041666664","0:1.0"] +``` + +_[LibSVM web page](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.html#iris) provides a normalized (using [ZScore](https://github.com/myui/hivemall/wiki/Feature-scaling)) version of Iris dataset._ + +# Create training/test data + +```sql +set hivevar:rand_seed=31; + +create table iris_shuffled +as +select rand(${rand_seed}) as rnd, * from iris_scaled; + +-- 80% for training +create table train80p as +select * from iris_shuffled +order by rnd DESC +limit 120; + +-- 20% for testing +create table test20p as +select * from iris_shuffled +order by rnd ASC +limit 30; + +create table test20p_exploded +as +select + rowid, + label, + extract_feature(feature) as feature, + extract_weight(feature) as value +from + test20p LATERAL VIEW explode(features) t AS feature; +``` + +# Define an amplified view for the training input +```sql +set hivevar:xtimes=10; +set 
hivevar:shufflebuffersize=1000; + +create or replace view training_x10 +as +select + rand_amplify(${xtimes}, ${shufflebuffersize}, rowid, label, features) as (rowid, label, features) +from + train80p; +``` + +# Training (multiclass classification) + +```sql +create table model_scw1 as +select + label, + feature, + argmin_kld(weight, covar) as weight +from + (select + train_multiclass_scw(features, label) as (label, feature, weight, covar) + from + training_x10 + ) t +group by label, feature; +``` + +# Predict + +```sql +create or replace view predict_scw1 +as +select + rowid, + m.col0 as score, + m.col1 as label +from ( +select + rowid, + maxrow(score, label) as m +from ( + select + t.rowid, + m.label, + sum(m.weight * t.value) as score + from + test20p_exploded t LEFT OUTER JOIN + model_scw1 m ON (t.feature = m.feature) + group by + t.rowid, m.label +) t1 +group by rowid +) t2; +``` + +# Evaluation + +```sql +create or replace view eval_scw1 as +select + t.label as actual, + p.label as predicted +from + test20p t JOIN predict_scw1 p + on (t.rowid = p.rowid); + +select count(1)/30 from eval_scw1 +where actual = predicted; +``` +> 0.9666666666666667 http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/multiclass/iris_randomforest.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/multiclass/iris_randomforest.md b/docs/gitbook/multiclass/iris_randomforest.md new file mode 100644 index 0000000..bafa338 --- /dev/null +++ b/docs/gitbook/multiclass/iris_randomforest.md @@ -0,0 +1,307 @@ +*NOTE: RandomForest is being supported from Hivemall v0.4 or later.* + +# Dataset + +* https://archive.ics.uci.edu/ml/datasets/Iris + +``` +Attribute Information: + 1. sepal length in cm + 2. sepal width in cm + 3. petal length in cm + 4. petal width in cm + 5. 
class: + -- Iris Setosa + -- Iris Versicolour + -- Iris Virginica +``` + +# Table preparation + +```sql +create database iris; +use iris; + +create external table raw ( + sepal_length int, + sepal_width int, + petal_length int, + petak_width int, + class string +) +ROW FORMAT DELIMITED + FIELDS TERMINATED BY ',' + LINES TERMINATED BY '\n' +STORED AS TEXTFILE LOCATION '/dataset/iris/raw'; + +$ sed '/^$/d' iris.data | hadoop fs -put - /dataset/iris/raw/iris.data +``` + +```sql +create table label_mapping +as +select + class, + rank - 1 as label +from ( +select + distinct class, + dense_rank() over (order by class) as rank +from + raw +) t +; +``` + +```sql +create table training +as +select + rowid() as rowid, + array(t1.sepal_length, t1.sepal_width, t1.petal_length, t1.petak_width) as features, + t2.label +from + raw t1 + JOIN label_mapping t2 ON (t1.class = t2.class) +; +``` + +# Training + +`train_randomforest_classifier` takes a dense `features` in double[] and a `label` starting from 0. + +```sql +CREATE TABLE model +STORED AS SEQUENCEFILE +AS +select + train_randomforest_classifier(features, label) + -- hivemall v0.4.1-alpha.2 and before + -- train_randomforest_classifier(features, label) as (pred_model, var_importance, oob_errors, oob_tests) + -- hivemall v0.4.1 and later + -- train_randomforest_classifier(features, label) as (model_id, model_type, pred_model, var_importance, oob_errors, oob_tests) +from + training; +``` +*Note: The default TEXTFILE should not be used for model table when using Javascript output through "-output javascript" option.* + +``` +hive> desc model; +model_id int +model_type int +pred_model string +var_importance array<double> +oob_errors int +oob_tests int +``` + +## Training options + +"-help" option shows usage of the function. 
+ +``` +select train_randomforest_classifier(features, label, "-help") from training; + +> FAILED: UDFArgumentException +usage: train_randomforest_classifier(double[] features, int label [, + string options]) - Returns a relation consists of <int model_id, + int model_type, string pred_model, array<double> var_importance, + int oob_errors, int oob_tests> [-attrs <arg>] [-depth <arg>] + [-disable_compression] [-help] [-leafs <arg>] [-output <arg>] + [-rule <arg>] [-seed <arg>] [-splits <arg>] [-trees <arg>] [-vars + <arg>] + -attrs,--attribute_types <arg> Comma separated attribute types (Q for + quantitative variable and C for + categorical variable. e.g., [Q,C,Q,C]) + -depth,--max_depth <arg> The maximum number of the tree depth + [default: Integer.MAX_VALUE] + -disable_compression Whether to disable compression of the + output script [default: false] + -help Show function help + -leafs,--max_leaf_nodes <arg> The maximum number of leaf nodes + [default: Integer.MAX_VALUE] + -output,--output_type <arg> The output type (serialization/ser or + opscode/vm or javascript/js) [default: + serialization] + -rule,--split_rule <arg> Split algorithm [default: GINI, ENTROPY] + -seed <arg> seed value in long [default: -1 + (random)] + -splits,--min_split <arg> A node that has greater than or equals + to `min_split` examples will split + [default: 2] + -trees,--num_trees <arg> The number of trees for each task + [default: 50] + -vars,--num_variables <arg> The number of random selected features + [default: ceil(sqrt(x[0].length))]. 
+ int(num_variables * x[0].length) is + considered if num_variable is (0,1] +``` +*Caution: "-num_trees" controls the number of trees for each task, not the total number of trees.* + +### Parallelize Training + +To parallelize RandomForest training, you can use UNION ALL as follows: + +```sql +CREATE TABLE model +STORED AS SEQUENCEFILE +AS +select + train_randomforest_classifier(features, label, '-trees 25') +from + training +UNION ALL +select + train_randomforest_classifier(features, label, '-trees 25') +from + training +; +``` + +### Learning stats + +[`Variable importance`](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#varimp) and [`Out Of Bag (OOB) error rate`](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#ooberr) of RandomForest can be shown as follows: + +```sql +select + array_sum(var_importance) as var_importance, + sum(oob_errors) / sum(oob_tests) as oob_err_rate +from + model; +``` +> [2.81010338879605,0.4970357753626371,23.790369091407698,14.315316390235273] 0.05333333333333334 + +### Output prediction model by Javascipt + +```sql +CREATE TABLE model_javascript +STORED AS SEQUENCEFILE +AS +select train_randomforest_classifier(features, label, "-output_type js -disable_compression") +from training; + +select model from model_javascript limit 1; +``` + +```js +if(x[3] <= 0.5) { + 0; +} else { + if(x[2] <= 4.5) { + if(x[3] <= 1.5) { + if(x[0] <= 4.5) { + 1; + } else { + if(x[0] <= 5.5) { + 1; + } else { + if(x[1] <= 2.5) { + 1; + } else { + 1; + } + } + } + } else { + 2; + } + } else { + if(x[3] <= 1.5) { + 2; + } else { + 2; + } + } +} +``` + +# Prediction + +```sql +set hivevar:classification=true; +set hive.auto.convert.join=true; +set hive.mapjoin.optimized.hashtable=false; + +create table predicted_vm +as +SELECT + rowid, + rf_ensemble(predicted) as predicted +FROM ( + SELECT + rowid, + -- hivemall v0.4.1-alpha.2 and before + -- tree_predict(p.model, t.features, ${classification}) as predicted + -- hivemall v0.4.1 
and later + tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted + FROM + model p + LEFT OUTER JOIN -- CROSS JOIN + training t +) t1 +group by + rowid +; +``` +_Note: Javascript outputs can be evaluated by `js_tree_predict`._ + +### Parallelize Prediction + +The following query runs predictions in N-parallel. It would reduce elapsed time for prediction almost by N. + +```sql +SET hivevar:classification=true; +set hive.auto.convert.join=true; +SET hive.mapjoin.optimized.hashtable=false; +SET mapred.reduce.tasks=8; + +create table predicted_vm +as +SELECT + rowid, + rf_ensemble(predicted) as predicted +FROM ( + SELECT + t.rowid, + -- hivemall v0.4.1-alpha.2 and before + -- tree_predict(p.pred_model, t.features, ${classification}) as predicted + -- hivemall v0.4.1 and later + tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted + FROM ( + SELECT model_id, model_type, pred_model + FROM model + DISTRIBUTE BY rand(1) + ) p + LEFT OUTER JOIN training t +) t1 +group by + rowid +; +``` + +# Evaluation + +```sql +select count(1) from training; +> 150 + +set hivevar:total_cnt=150; + +WITH t1 as ( +SELECT + t.rowid, + t.label as actual, + p.predicted.label as predicted +FROM + predicted_vm p + LEFT OUTER JOIN training t ON (t.rowid = p.rowid) +) +SELECT + count(1) / ${total_cnt} +FROM + t1 +WHERE + actual = predicted +; +``` +> 0.9533333333333334 \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/multiclass/iris_scw.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/multiclass/iris_scw.md b/docs/gitbook/multiclass/iris_scw.md new file mode 100644 index 0000000..bafa338 --- /dev/null +++ b/docs/gitbook/multiclass/iris_scw.md @@ -0,0 +1,307 @@ +*NOTE: RandomForest is being supported from Hivemall v0.4 or later.* + +# Dataset + +* https://archive.ics.uci.edu/ml/datasets/Iris + +``` 
+Attribute Information: + 1. sepal length in cm + 2. sepal width in cm + 3. petal length in cm + 4. petal width in cm + 5. class: + -- Iris Setosa + -- Iris Versicolour + -- Iris Virginica +``` + +# Table preparation + +```sql +create database iris; +use iris; + +create external table raw ( + sepal_length int, + sepal_width int, + petal_length int, + petak_width int, + class string +) +ROW FORMAT DELIMITED + FIELDS TERMINATED BY ',' + LINES TERMINATED BY '\n' +STORED AS TEXTFILE LOCATION '/dataset/iris/raw'; + +$ sed '/^$/d' iris.data | hadoop fs -put - /dataset/iris/raw/iris.data +``` + +```sql +create table label_mapping +as +select + class, + rank - 1 as label +from ( +select + distinct class, + dense_rank() over (order by class) as rank +from + raw +) t +; +``` + +```sql +create table training +as +select + rowid() as rowid, + array(t1.sepal_length, t1.sepal_width, t1.petal_length, t1.petak_width) as features, + t2.label +from + raw t1 + JOIN label_mapping t2 ON (t1.class = t2.class) +; +``` + +# Training + +`train_randomforest_classifier` takes a dense `features` in double[] and a `label` starting from 0. + +```sql +CREATE TABLE model +STORED AS SEQUENCEFILE +AS +select + train_randomforest_classifier(features, label) + -- hivemall v0.4.1-alpha.2 and before + -- train_randomforest_classifier(features, label) as (pred_model, var_importance, oob_errors, oob_tests) + -- hivemall v0.4.1 and later + -- train_randomforest_classifier(features, label) as (model_id, model_type, pred_model, var_importance, oob_errors, oob_tests) +from + training; +``` +*Note: The default TEXTFILE should not be used for model table when using Javascript output through "-output javascript" option.* + +``` +hive> desc model; +model_id int +model_type int +pred_model string +var_importance array<double> +oob_errors int +oob_tests int +``` + +## Training options + +"-help" option shows usage of the function. 
+ +``` +select train_randomforest_classifier(features, label, "-help") from training; + +> FAILED: UDFArgumentException +usage: train_randomforest_classifier(double[] features, int label [, + string options]) - Returns a relation consists of <int model_id, + int model_type, string pred_model, array<double> var_importance, + int oob_errors, int oob_tests> [-attrs <arg>] [-depth <arg>] + [-disable_compression] [-help] [-leafs <arg>] [-output <arg>] + [-rule <arg>] [-seed <arg>] [-splits <arg>] [-trees <arg>] [-vars + <arg>] + -attrs,--attribute_types <arg> Comma separated attribute types (Q for + quantitative variable and C for + categorical variable. e.g., [Q,C,Q,C]) + -depth,--max_depth <arg> The maximum number of the tree depth + [default: Integer.MAX_VALUE] + -disable_compression Whether to disable compression of the + output script [default: false] + -help Show function help + -leafs,--max_leaf_nodes <arg> The maximum number of leaf nodes + [default: Integer.MAX_VALUE] + -output,--output_type <arg> The output type (serialization/ser or + opscode/vm or javascript/js) [default: + serialization] + -rule,--split_rule <arg> Split algorithm [default: GINI, ENTROPY] + -seed <arg> seed value in long [default: -1 + (random)] + -splits,--min_split <arg> A node that has greater than or equals + to `min_split` examples will split + [default: 2] + -trees,--num_trees <arg> The number of trees for each task + [default: 50] + -vars,--num_variables <arg> The number of random selected features + [default: ceil(sqrt(x[0].length))]. 
+ int(num_variables * x[0].length) is + considered if num_variable is (0,1] +``` +*Caution: "-num_trees" controls the number of trees for each task, not the total number of trees.* + +### Parallelize Training + +To parallelize RandomForest training, you can use UNION ALL as follows: + +```sql +CREATE TABLE model +STORED AS SEQUENCEFILE +AS +select + train_randomforest_classifier(features, label, '-trees 25') +from + training +UNION ALL +select + train_randomforest_classifier(features, label, '-trees 25') +from + training +; +``` + +### Learning stats + +[`Variable importance`](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#varimp) and [`Out Of Bag (OOB) error rate`](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#ooberr) of RandomForest can be shown as follows: + +```sql +select + array_sum(var_importance) as var_importance, + sum(oob_errors) / sum(oob_tests) as oob_err_rate +from + model; +``` +> [2.81010338879605,0.4970357753626371,23.790369091407698,14.315316390235273] 0.05333333333333334 + +### Output prediction model by Javascipt + +```sql +CREATE TABLE model_javascript +STORED AS SEQUENCEFILE +AS +select train_randomforest_classifier(features, label, "-output_type js -disable_compression") +from training; + +select model from model_javascript limit 1; +``` + +```js +if(x[3] <= 0.5) { + 0; +} else { + if(x[2] <= 4.5) { + if(x[3] <= 1.5) { + if(x[0] <= 4.5) { + 1; + } else { + if(x[0] <= 5.5) { + 1; + } else { + if(x[1] <= 2.5) { + 1; + } else { + 1; + } + } + } + } else { + 2; + } + } else { + if(x[3] <= 1.5) { + 2; + } else { + 2; + } + } +} +``` + +# Prediction + +```sql +set hivevar:classification=true; +set hive.auto.convert.join=true; +set hive.mapjoin.optimized.hashtable=false; + +create table predicted_vm +as +SELECT + rowid, + rf_ensemble(predicted) as predicted +FROM ( + SELECT + rowid, + -- hivemall v0.4.1-alpha.2 and before + -- tree_predict(p.model, t.features, ${classification}) as predicted + -- hivemall v0.4.1 
and later + tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted + FROM + model p + LEFT OUTER JOIN -- CROSS JOIN + training t +) t1 +group by + rowid +; +``` +_Note: Javascript outputs can be evaluated by `js_tree_predict`._ + +### Parallelize Prediction + +The following query runs predictions in N-parallel. It would reduce elapsed time for prediction almost by N. + +```sql +SET hivevar:classification=true; +set hive.auto.convert.join=true; +SET hive.mapjoin.optimized.hashtable=false; +SET mapred.reduce.tasks=8; + +create table predicted_vm +as +SELECT + rowid, + rf_ensemble(predicted) as predicted +FROM ( + SELECT + t.rowid, + -- hivemall v0.4.1-alpha.2 and before + -- tree_predict(p.pred_model, t.features, ${classification}) as predicted + -- hivemall v0.4.1 and later + tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted + FROM ( + SELECT model_id, model_type, pred_model + FROM model + DISTRIBUTE BY rand(1) + ) p + LEFT OUTER JOIN training t +) t1 +group by + rowid +; +``` + +# Evaluation + +```sql +select count(1) from training; +> 150 + +set hivevar:total_cnt=150; + +WITH t1 as ( +SELECT + t.rowid, + t.label as actual, + p.predicted.label as predicted +FROM + predicted_vm p + LEFT OUTER JOIN training t ON (t.rowid = p.rowid) +) +SELECT + count(1) / ${total_cnt} +FROM + t1 +WHERE + actual = predicted +; +``` +> 0.9533333333333334 \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/multiclass/news20.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/multiclass/news20.md b/docs/gitbook/multiclass/news20.md new file mode 100644 index 0000000..e69de29 http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/multiclass/news20_dataset.md ---------------------------------------------------------------------- diff --git 
a/docs/gitbook/multiclass/news20_dataset.md b/docs/gitbook/multiclass/news20_dataset.md new file mode 100644 index 0000000..35ada12 --- /dev/null +++ b/docs/gitbook/multiclass/news20_dataset.md @@ -0,0 +1,77 @@ +Get the news20 dataset. +http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.html#news20 + +```sh +$ cat <<EOF > conv.awk +BEGIN{ FS=" " } +{ + label=\$1; + features=\$2; + for(i=3;i<=NF;i++) + { + features = features "," \$i; + } + print NR "\t" label "\t" features; +} +END{} +EOF + +$ gawk -f conv.awk news20.scale > news20_scale.train +$ gawk -f conv.awk news20.t.scale > news20_scale.test +``` + +## Putting data on HDFS +```sh +hadoop fs -mkdir -p /dataset/news20-multiclass/train +hadoop fs -mkdir -p /dataset/news20-multiclass/test + +hadoop fs -copyFromLocal news20_scale.train /dataset/news20-multiclass/train +hadoop fs -copyFromLocal news20_scale.test /dataset/news20-multiclass/test +``` + +## Training/test data preparation +```sql +use news20; + +delete jar /home/myui/tmp/hivemall.jar; +add jar /home/myui/tmp/hivemall.jar; + +source /home/myui/tmp/define-all.hive; + +Create external table news20mc_train ( + rowid int, + label int, + features ARRAY<STRING> +) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY "," STORED AS TEXTFILE LOCATION '/dataset/news20-multiclass/train'; + +Create external table news20mc_test ( + rowid int, + label int, + features ARRAY<STRING> +) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY "," STORED AS TEXTFILE LOCATION '/dataset/news20-multiclass/test'; + +set hivevar:seed=31; +create or replace view news20mc_train_x3 +as +select + * +from ( +select + amplify(3, *) as (rowid, label, features) +from + news20mc_train +) t +CLUSTER BY rand(${seed}); + +create table news20mc_test_exploded as +select + rowid, + label, + cast(split(feature,":")[0] as int) as feature, + cast(split(feature,":")[1] as float) as value + -- hivemall v0.3.1 or later + -- 
cast(extract_feature(feature) as int) as feature, + -- extract_weight(feature) as value +from + news20mc_test LATERAL VIEW explode(addBias(features)) t AS feature; +``` \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/multiclass/news20_ensemble.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/multiclass/news20_ensemble.md b/docs/gitbook/multiclass/news20_ensemble.md new file mode 100644 index 0000000..9cfd35d --- /dev/null +++ b/docs/gitbook/multiclass/news20_ensemble.md @@ -0,0 +1,180 @@ +This example explains how to run ensemble learning in Hivemall. +Two heads are better than one? Let's verify it by ensemble learning. + +--- + +## UDF preparation +```sql +delete jar /home/myui/tmp/hivemall.jar; +add jar /home/myui/tmp/hivemall.jar; + +source /home/myui/tmp/define-all.hive; +``` + +[Case1] Model ensemble/mixing +======================= + +## training +```sql +SET hive.exec.parallel=true; +SET hive.exec.parallel.thread.number=8; +SET mapred.reduce.tasks=4; + +drop table news20mc_ensemble_model1; +create table news20mc_ensemble_model1 as +select + label, + -- cast(feature as int) as feature, -- hivemall v0.1 + argmin_kld(feature, covar) as feature, -- hivemall v0.2 or later + voted_avg(weight) as weight +from + (select + -- train_multiclass_cw(addBias(features),label) as (label,feature,weight) -- hivemall v0.1 + train_multiclass_cw(addBias(features),label) as (label,feature,weight,covar) -- hivemall v0.2 or later + from + news20mc_train_x3 + union all + select + -- train_multiclass_arow(addBias(features),label) as (label,feature,weight) -- hivemall v0.1 + train_multiclass_arow(addBias(features),label) as (label,feature,weight,covar) -- hivemall v0.2 or later + from + news20mc_train_x3 + union all + select + -- train_multiclass_scw(addBias(features),label) as (label,feature,weight) -- hivemall v0.1 + train_multiclass_scw(addBias(features),label) as 
(label,feature,weight,covar) -- hivemall v0.2 or later + from + news20mc_train_x3 + ) t +group by label, feature; + +-- reset to the default +SET hive.exec.parallel=false; +SET mapred.reduce.tasks=-1; +``` + +## prediction +```sql +create or replace view news20mc_ensemble_predict1 +as +select + rowid, + m.col0 as score, + m.col1 as label +from ( +select + rowid, + maxrow(score, label) as m +from ( + select + t.rowid, + m.label, + sum(m.weight * t.value) as score + from + news20mc_test_exploded t LEFT OUTER JOIN + news20mc_ensemble_model1 m ON (t.feature = m.feature) + group by + t.rowid, m.label +) t1 +group by rowid +) t2; +``` + +## evaluation +```sql +create or replace view news20mc_ensemble_submit1 as +select + t.label as actual, + pd.label as predicted +from + news20mc_test t JOIN news20mc_ensemble_predict1 pd + on (t.rowid = pd.rowid); +``` + +``` +select count(1)/3993 from news20mc_ensemble_submit1 +where actual == predicted; +``` + +> 0.8494866015527173 + +## Cleaning + +```sql +drop table news20mc_ensemble_model1; +drop view news20mc_ensemble_predict1; +drop view news20mc_ensemble_submit1; +``` +--- + +Unfortunately, too many cooks spoil the broth in this case :-( + +| Algorithm | Accuracy | +|:-----------|------------:| +| AROW | 0.8474830954169797 | +| SCW2 | 0.8482344102178813 | +| Ensemble(model) | 0.8494866015527173 | +| CW | 0.850488354620586 | + + +--- + +[Case2] Prediction ensemble +================= + +## prediction +```sql +create or replace view news20mc_pred_ensemble_predict1 +as +select + rowid, + m.col1 as label +from ( + select + rowid, + maxrow(cnt, label) as m + from ( + select + rowid, + label, + count(1) as cnt + from ( + select * from news20mc_arow_predict1 + union all + select * from news20mc_scw2_predict1 + union all + select * from news20mc_cw_predict1 + ) t1 + group by rowid, label + ) t2 + group by rowid +) t3; +``` + +## evaluation +```sql +create or replace view news20mc_pred_ensemble_submit1 as +select + t.label as actual, + 
pd.label as predicted +from + news20mc_test t JOIN news20mc_pred_ensemble_predict1 pd + on (t.rowid = pd.rowid); +``` + +``` +select count(1)/3993 from news20mc_pred_ensemble_submit1 +where actual == predicted; +``` + +> 0.8499874780866516 + +Unfortunately, too many cooks spoil the broth in this case too :-( + +| Algorithm | Accuracy | +|:-----------|------------:| +| AROW | 0.8474830954169797 | +| SCW2 | 0.8482344102178813 | +| Ensemble(model) | 0.8494866015527173 | +| Ensemble(prediction) | 0.8499874780866516 | +| CW | 0.850488354620586 | \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/multiclass/news20_one-vs-the-rest.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/multiclass/news20_one-vs-the-rest.md b/docs/gitbook/multiclass/news20_one-vs-the-rest.md new file mode 100644 index 0000000..4c611d0 --- /dev/null +++ b/docs/gitbook/multiclass/news20_one-vs-the-rest.md @@ -0,0 +1,330 @@ +A one-vs-the-rest classifier use the binary classifier for each class. 
+ +## UDF preparation +```sql +delete jar /home/myui/tmp/hivemall.jar; +add jar /home/myui/tmp/hivemall.jar; + +source /home/myui/tmp/define-all.hive; +``` + +## training +```sql +SET mapred.reduce.tasks=4; + +drop table news20_onevsrest_arow_model; +create table news20_onevsrest_arow_model +as +select + label, + feature, + -- voted_avg(weight) as weight -- [hivemall v0.1] + argmin_kld(weight, covar) as weight -- [hivemall v0.2 or later] +from ( +select + 1 as label, + * +from ( +select + -- train_arow(features, target) as (feature, weight) -- [hivemall v0.1] + train_arow(features, target) as (feature, weight, covar) -- [hivemall v0.2 or later] +from + news20_onevsrest_train_x3 +where + label = 1 +) t1 +union all +select + 2 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 2 +) t2 +union all +select + 3 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 3 +) t3 +union all +select + 4 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 4 +) t4 +union all +select + 5 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 5 +) t5 +union all +select + 6 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 6 +) t6 +union all +select + 7 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 7 +) t7 +union all +select + 8 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 8 +) t8 +union all +select + 9 as label, + * +from ( +select + train_arow(features, target) as 
(feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 9 +) t9 +union all +select + 10 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 10 +) t10 +union all +select + 11 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 11 +) t11 +union all +select + 12 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 12 +) t12 +union all +select + 13 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 13 +) t13 +union all +select + 14 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 14 +) t14 +union all +select + 15 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 15 +) t15 +union all +select + 16 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 16 +) t16 +union all +select + 17 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 17 +) t17 +union all +select + 18 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 18 +) t18 +union all +select + 19 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 19 +) t19 +union all +select + 20 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 20 +) t20 
+) t +group by + label, feature; + +-- reset to the default +SET mapred.reduce.tasks=-1; +``` +Note that the above query is optimized to scan news20_onevsrest_train_x3 once! + +## prediction +```sql +create or replace view news20_onevsrest_arow_predict +as +select + rowid, + m.col0 as score, + m.col1 as label +from ( +select + rowid, + maxrow(score, label) as m +from ( + select + t.rowid, + m.label, + sum(m.weight * t.value) as score + from + news20mc_test_exploded t LEFT OUTER JOIN + news20_onevsrest_arow_model m ON (t.feature = m.feature) + group by + t.rowid, m.label +) t1 +group by rowid +) t2; +``` + +## evaluation +```sql +create or replace view news20_onevsrest_arow_submit as +select + t.label as actual, + pd.label as predicted +from + news20mc_test t JOIN news20_onevsrest_arow_predict pd + on (t.rowid = pd.rowid); +``` + +``` +select count(1)/3993 from news20_onevsrest_arow_submit +where actual == predicted; +``` + +> 0.8567493112947658 + +## Cleaning + +```sql +drop table news20_onevsrest_arow_model; +drop view news20_onevsrest_arow_predict; +drop view news20_onevsrest_arow_submit; +``` + +| Algorithm | Accuracy | +|:-----------|------------:| +| AROW(multi-class) | 0.8474830954169797 | +| CW | 0.850488354620586 | +| AROW(one-vs-rest) | 0.8567493112947658 | http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/multiclass/news20_one-vs-the-rest_dataset.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/multiclass/news20_one-vs-the-rest_dataset.md b/docs/gitbook/multiclass/news20_one-vs-the-rest_dataset.md new file mode 100644 index 0000000..2a69615 --- /dev/null +++ b/docs/gitbook/multiclass/news20_one-vs-the-rest_dataset.md @@ -0,0 +1,52 @@ +*One-vs-the-rest* is a multiclass classification method that uses binary classifiers independently for each class. 
+http://en.wikipedia.org/wiki/Multiclass_classification#one_vs_all + +## UDF preparation +```sql +delete jar /home/myui/tmp/hivemall.jar; +add jar /home/myui/tmp/hivemall.jar; + +source /home/myui/tmp/define-all.hive; +``` + +## Dataset preparation for one-vs-the-rest classifiers + +```sql +select collect_set(label) from news20mc_train; +``` +> [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,16,19,18,20] + +```sql +SET hivevar:possible_labels="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,16,19,18,20"; +``` + +[one-vs-rest.awk](https://github.com/myui/hivemall/blob/master/resources/misc/one-vs-rest.awk) + +``` +create or replace view news20_onevsrest_train +as +select transform(${possible_labels}, rowid, label, addBias(features)) + ROW FORMAT DELIMITED + FIELDS TERMINATED BY "\t" + COLLECTION ITEMS TERMINATED BY "," + LINES TERMINATED BY "\n" +using 'gawk -f one-vs-rest.awk' + as (rowid BIGINT, label INT, target INT, features ARRAY<STRING>) + ROW FORMAT DELIMITED + FIELDS TERMINATED BY "\t" + COLLECTION ITEMS TERMINATED BY "," + LINES TERMINATED BY "\n" +from news20mc_train; + +create or replace view news20_onevsrest_train_x3 +as +select + * +from ( + select + amplify(3, *) as (rowid, label, target, features) + from + news20_onevsrest_train +) t +CLUSTER BY rand(); +``` http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/multiclass/news20_pa.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/multiclass/news20_pa.md b/docs/gitbook/multiclass/news20_pa.md new file mode 100644 index 0000000..8e69beb --- /dev/null +++ b/docs/gitbook/multiclass/news20_pa.md @@ -0,0 +1,90 @@ +Preparation +========= + +## UDF preparation +``` +delete jar /home/myui/tmp/hivemall.jar; +add jar /home/myui/tmp/hivemall.jar; + +source /home/myui/tmp/define-all.hive; +``` + +--- +#[Passive Aggressive (PA2)] + +Training +====== + +## model building +```sql +drop table news20mc_pa2_model1; +create table news20mc_pa2_model1 as 
+select + label, + cast(feature as int) as feature, + voted_avg(weight) as weight +from + (select + train_multiclass_pa2(addBias(features),label) as (label,feature,weight) + from + news20mc_train_x3 + ) t +group by label, feature; +``` + +## prediction +``` +create or replace view news20mc_pa2_predict1 +as +select + rowid, + m.col0 as score, + m.col1 as label +from ( +select + rowid, + maxrow(score, label) as m +from ( + select + t.rowid, + m.label, + sum(m.weight * t.value) as score + from + news20mc_test_exploded t LEFT OUTER JOIN + news20mc_pa2_model1 m ON (t.feature = m.feature) + group by + t.rowid, m.label +) t1 +group by rowid +) t2; +``` + +## evaluation +```sql +create or replace view news20mc_pa2_submit1 as +select + t.label as actual, + pd.label as predicted +from + news20mc_test t JOIN news20mc_pa2_predict1 pd + on (t.rowid = pd.rowid); +``` + +```sql +select count(1)/3993 from news20mc_pa2_submit1 +where actual == predicted; +``` + +> 0.7478086651640371 (plain) + +> 0.8204357625845229 (x3) + +> 0.8204357625845229 (x3 + bagging) + +## Cleaning + +```sql +drop table news20mc_pa2_model1; +drop view news20mc_pa2_predict1; +drop view news20mc_pa2_submit1; +``` \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/multiclass/news20_scw.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/multiclass/news20_scw.md b/docs/gitbook/multiclass/news20_scw.md new file mode 100644 index 0000000..330c163 --- /dev/null +++ b/docs/gitbook/multiclass/news20_scw.md @@ -0,0 +1,319 @@ +| Algorithm | Accuracy | +|:-----------|------------:| +| PA2 | 0.8204357625845229 | +| SCW1 | 0.8314550463310794 | +| AROW | 0.8474830954169797 | +| SCW2 | 0.8482344102178813 | +| CW | 0.850488354620586 | +--- + +Preparation +========= + +## UDF preparation +```sql +delete jar /home/myui/tmp/hivemall.jar; +add jar /home/myui/tmp/hivemall.jar; + +source /home/myui/tmp/define-all.hive; 
+``` + +--- +#[CW] + +## training +```sql +drop table news20mc_cw_model1; +create table news20mc_cw_model1 as +select + label, + cast(feature as int) as feature, + -- voted_avg(weight) as weight -- [hivemall v0.1] + argmin_kld(weight, covar) as weight -- [hivemall v0.2 or later] +from + (select + -- train_multiclass_cw(addBias(features),label) as (label,feature,weight) -- [hivemall v0.1] + train_multiclass_cw(addBias(features),label) as (label,feature,weight,covar) -- [hivemall v0.2 or later] + from + news20mc_train_x3 + ) t +group by label, feature; +``` + +## prediction +```sql +create or replace view news20mc_cw_predict1 +as +select + rowid, + m.col0 as score, + m.col1 as label +from ( +select + rowid, + maxrow(score, label) as m +from ( + select + t.rowid, + m.label, + sum(m.weight * t.value) as score + from + news20mc_test_exploded t LEFT OUTER JOIN + news20mc_cw_model1 m ON (t.feature = m.feature) + group by + t.rowid, m.label +) t1 +group by rowid +) t2; +``` + +## evaluation +```sql +create or replace view news20mc_cw_submit1 as +select + t.label as actual, + pd.label as predicted +from + news20mc_test t JOIN news20mc_cw_predict1 pd + on (t.rowid = pd.rowid); +``` + +``` +select count(1)/3993 from news20mc_cw_submit1 +where actual == predicted; +``` + +> 0.850488354620586 + +## Cleaning + +```sql +drop table news20mc_cw_model1; +drop view news20mc_cw_predict1; +drop view news20mc_cw_submit1; +``` + +--- +#[AROW] + +## training +```sql +drop table news20mc_arow_model1; +create table news20mc_arow_model1 as +select + label, + cast(feature as int) as feature, + -- voted_avg(weight) as weight -- [hivemall v0.1] + argmin_kld(weight, covar) as weight -- [hivemall v0.2 or later] +from + (select + -- train_multiclass_arow(addBias(features),label) as (label,feature,weight) -- [hivemall v0.1] + train_multiclass_arow(addBias(features),label) as (label,feature,weight,covar) -- [hivemall v0.2 or later] + from + news20mc_train_x3 + ) t +group by label, feature; +``` + 
+## prediction +```sql +create or replace view news20mc_arow_predict1 +as +select + rowid, + m.col0 as score, + m.col1 as label +from ( +select + rowid, + maxrow(score, label) as m +from ( + select + t.rowid, + m.label, + sum(m.weight * t.value) as score + from + news20mc_test_exploded t LEFT OUTER JOIN + news20mc_arow_model1 m ON (t.feature = m.feature) + group by + t.rowid, m.label +) t1 +group by rowid +) t2; +``` + +## evaluation +```sql +create or replace view news20mc_arow_submit1 as +select + t.label as actual, + pd.label as predicted +from + news20mc_test t JOIN news20mc_arow_predict1 pd + on (t.rowid = pd.rowid); +``` + +``` +select count(1)/3993 from news20mc_arow_submit1 +where actual == predicted; +``` + +> 0.8474830954169797 + +## Cleaning + +```sql +drop table news20mc_arow_model1; +drop view news20mc_arow_predict1; +drop view news20mc_arow_submit1; +``` + +--- +#[SCW1] + +## training +```sql +drop table news20mc_scw_model1; +create table news20mc_scw_model1 as +select + label, + cast(feature as int) as feature, + -- voted_avg(weight) as weight -- [hivemall v0.1] + argmin_kld(weight, covar) as weight -- [hivemall v0.2 or later] +from + (select + -- train_multiclass_scw(addBias(features),label) as (label,feature,weight) -- [hivemall v0.1] + train_multiclass_scw(addBias(features),label) as (label,feature,weight,covar) -- [hivemall v0.2 or later] + from + news20mc_train_x3 + ) t +group by label, feature; +``` + +## prediction +```sql +create or replace view news20mc_scw_predict1 +as +select + rowid, + m.col0 as score, + m.col1 as label +from ( +select + rowid, + maxrow(score, label) as m +from ( + select + t.rowid, + m.label, + sum(m.weight * t.value) as score + from + news20mc_test_exploded t LEFT OUTER JOIN + news20mc_scw_model1 m ON (t.feature = m.feature) + group by + t.rowid, m.label +) t1 +group by rowid +) t2; +``` + +## evaluation +```sql +create or replace view news20mc_scw_submit1 as +select + t.label as actual, + pd.label as predicted +from + 
news20mc_test t JOIN news20mc_scw_predict1 pd + on (t.rowid = pd.rowid); +``` + +``` +select count(1)/3993 from news20mc_scw_submit1 +where actual == predicted; +``` + +> 0.8314550463310794 + +## Cleaning + +```sql +drop table news20mc_scw_model1; +drop view news20mc_scw_predict1; +drop view news20mc_scw_submit1; +``` + +--- +#[SCW2] + +## training +```sql +drop table news20mc_scw2_model1; +create table news20mc_scw2_model1 as +select + label, + cast(feature as int) as feature, + -- voted_avg(weight) as weight -- [hivemall v0.1] + argmin_kld(weight, covar) as weight -- [hivemall v0.2 or later] +from + (select + -- train_multiclass_scw2(addBias(features),label) as (label,feature,weight) -- [hivemall v0.1] + train_multiclass_scw2(addBias(features),label) as (label,feature,weight,covar) -- [hivemall v0.2 or later] + from + news20mc_train_x3 + ) t +group by label, feature; +``` + +## prediction +```sql +create or replace view news20mc_scw2_predict1 +as +select + rowid, + m.col0 as score, + m.col1 as label +from ( +select + rowid, + maxrow(score, label) as m +from ( + select + t.rowid, + m.label, + sum(m.weight * t.value) as score + from + news20mc_test_exploded t LEFT OUTER JOIN + news20mc_scw2_model1 m ON (t.feature = m.feature) + group by + t.rowid, m.label +) t1 +group by rowid +) t2; +``` + +## evaluation +```sql +create or replace view news20mc_scw2_submit1 as +select + t.label as actual, + pd.label as predicted +from + news20mc_test t JOIN news20mc_scw2_predict1 pd + on (t.rowid = pd.rowid); +``` + +``` +select count(1)/3993 from news20mc_scw2_submit1 +where actual == predicted; +``` + +> 0.8482344102178813 + +## Cleaning + +```sql +drop table news20mc_scw2_model1; +drop view news20mc_scw2_predict1; +drop view news20mc_scw2_submit1; +``` \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/pig/.gitkeep ---------------------------------------------------------------------- diff --git 
a/docs/gitbook/pig/.gitkeep b/docs/gitbook/pig/.gitkeep new file mode 100644 index 0000000..e69de29 http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/recommend/cf.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/recommend/cf.md b/docs/gitbook/recommend/cf.md new file mode 100644 index 0000000..e69de29
