http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/ft_engineering/scaling.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/ft_engineering/scaling.md b/docs/gitbook/ft_engineering/scaling.md new file mode 100644 index 0000000..6e7d312 --- /dev/null +++ b/docs/gitbook/ft_engineering/scaling.md @@ -0,0 +1,173 @@ +# Min-Max Normalization +http://en.wikipedia.org/wiki/Feature_scaling#Rescaling +```sql +select min(target), max(target) +from ( +select target from e2006tfidf_train +-- union all +-- select target from e2006tfidf_test +) t; +``` + +> -7.899578 -0.51940954 + +```sql +set hivevar:min_target=-7.899578; +set hivevar:max_target=-0.51940954; + +create or replace view e2006tfidf_train_scaled +as +select + rowid, + rescale(target, ${min_target}, ${max_target}) as target, + features +from + e2006tfidf_train; +``` + +# Feature scaling by zscore +http://en.wikipedia.org/wiki/Standard_score + +```sql +select avg(target), stddev_pop(target) +from ( +select target from e2006tfidf_train +-- union all +-- select target from e2006tfidf_test +) t; +``` +> -3.566241460963296 0.6278076335455348 + +```sql +set hivevar:mean_target=-3.566241460963296; +set hivevar:stddev_target=0.6278076335455348; + +create or replace view e2006tfidf_train_scaled +as +select + rowid, + zscore(target, ${mean_target}, ${stddev_target}) as target, + features +from + e2006tfidf_train; +``` + +# Apply Normalization to more complex feature vector + +Apply normalization to the following data. 
+ +```sql +select rowid, features from train limit 3; +``` + +``` +1 ["weight:69.613","specific_heat:129.07","reflectance:52.111"] +2 ["weight:70.67","specific_heat:128.161","reflectance:52.446"] +3 ["weight:72.303","specific_heat:128.45","reflectance:52.853"] +``` + +We can create a normalized table as follows: + +```sql +create table train_normalized +as +WITH fv as ( +select + rowid, + extract_feature(feature) as feature, + extract_weight(feature) as value +from + train + LATERAL VIEW explode(features) exploded AS feature +), +stats as ( +select + feature, + -- avg(value) as mean, stddev_pop(value) as stddev + min(value) as min, max(value) as max +from + fv +group by + feature +), +norm as ( +select + rowid, + t1.feature, + -- zscore(t1.value, t2.mean, t2.stddev) as zscore + rescale(t1.value, t2.min, t2.max) as minmax +from + fv t1 JOIN + stats t2 ON (t1.feature = t2.feature) +), +norm_fv as ( +select + rowid, + -- concat(feature, ":", zscore) as feature + -- concat(feature, ":", minmax) as feature -- Before Hivemall v0.3.2-1 + feature(feature, minmax) as feature -- Hivemall v0.3.2-1 or later +from + norm +) +select + rowid, + collect_list(feature) as features +from + norm_fv +group by + rowid +; +``` + +``` +1 ["reflectance:0.5252967","specific_heat:0.19863537","weight:0.0"] +2 ["reflectance:0.5950446","specific_heat:0.09166764","weight:0.052084323"] +3 ["reflectance:0.6797837","specific_heat:0.12567581","weight:0.13255163"] +... 
+``` + +# Tips for using both min-max and zscore normalization + +```sql +WITH quantative as ( + select id, true as minmax, "age" as feature, age as value from train + union all + select id, false as minmax, "balance" as feature, balance as value from train + union all + select id, true as minmax, "day" as feature, day as value from train + union all + select id, false as minmax, "duration" as feature, duration as value from train + union all + select id, false as minmax, "campaign" as feature, campaign as value from train + union all + select id, false as minmax, "pdays" as feature, if(pdays = -1, 0, pdays) as value from train + union all + select id, false as minmax, "previous" as feature, previous as value from train +), +quantative_stats as ( +select + feature, + avg(value) as mean, stddev_pop(value) as stddev, + min(value) as min, max(value) as max +from + quantative +group by + feature +), +quantative_norm as ( +select + t1.id, + collect_list( + feature( + t1.feature, + if(t1.minmax,rescale(t1.value, t2.min, t2.max),zscore(t1.value, t2.mean, t2.stddev)) + ) + ) as features +from + quantative t1 + JOIN quantative_stats t2 ON (t1.feature = t2.feature) +group by + t1.id +) +... +``` \ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/ft_engineering/tfidf.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/ft_engineering/tfidf.md b/docs/gitbook/ft_engineering/tfidf.md new file mode 100644 index 0000000..e881e10 --- /dev/null +++ b/docs/gitbook/ft_engineering/tfidf.md @@ -0,0 +1,149 @@ +This document explains how to compute [TF-IDF](http://en.wikipedia.org/wiki/Tf%E2%80%93idf) with Apache Hive/Hivemall. + +What you need to compute TF-IDF is a table/view composing (docid, word) pair, 2 views, and 1 query. + +_Note that this feature is supported since Hivemall v0.3-beta3 or later. Macro is supported since Hive 0.12 or later._ + +# Define macros used in the TF-IDF computation +```sql +create temporary macro max2(x INT, y INT) +if(x>y,x,y); + +-- create temporary macro idf(df_t INT, n_docs INT) +-- (log(10, CAST(n_docs as FLOAT)/max2(1,df_t)) + 1.0); + +create temporary macro tfidf(tf FLOAT, df_t INT, n_docs INT) +tf * (log(10, CAST(n_docs as FLOAT)/max2(1,df_t)) + 1.0); +``` + +# Data preparation +To calculate TF-IDF, you need to prepare a relation consists of (docid,word) tuples. +```sql +create external table wikipage ( + docid int, + page string +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' +STORED AS TEXTFILE; + +cd ~/tmp +wget https://gist.githubusercontent.com/myui/190b91a3a792ccfceda0/raw/327acd192da4f96da8276dcdff01b19947a4373c/tfidf_test.tsv + +LOAD DATA LOCAL INPATH '/home/myui/tmp/tfidf_test.tsv' INTO TABLE wikipage; + +create or replace view wikipage_exploded +as +select + docid, + word +from + wikipage LATERAL VIEW explode(tokenize(page,true)) t as word +where + not is_stopword(word); +``` +You can download the data of the wikipage table from [this link]( https://gist.githubusercontent.com/myui/190b91a3a792ccfceda0/raw/327acd192da4f96da8276dcdff01b19947a4373c/tfidf_test.tsv). 
+ +# Define views of TF/DF +```sql +create or replace view term_frequency +as +select + docid, + word, + freq +from ( +select + docid, + tf(word) as word2freq +from + wikipage_exploded +group by + docid +) t +LATERAL VIEW explode(word2freq) t2 as word, freq; + +create or replace view document_frequency +as +select + word, + count(distinct docid) docs +from + wikipage_exploded +group by + word; +``` + +# TF-IDF calculation for each docid/word pair +```sql +-- set the total number of documents +select count(distinct docid) from wikipage; +set hivevar:n_docs=3; + +create or replace view tfidf +as +select + tf.docid, + tf.word, + -- tf.freq * (log(10, CAST(${n_docs} as FLOAT)/max2(1,df.docs)) + 1.0) as tfidf + tfidf(tf.freq, df.docs, ${n_docs}) as tfidf +from + term_frequency tf + JOIN document_frequency df ON (tf.word = df.word) +order by + tfidf desc; +``` + +The result will be as follows: +``` +docid word tfidf +1 justice 0.1641245850805637 +3 knowledge 0.09484606645205085 +2 action 0.07033910867777095 +1 law 0.06564983513276658 +1 found 0.06564983513276658 +1 religion 0.06564983513276658 +1 discussion 0.06564983513276658 + ... + ... +2 act 0.017584777169442737 +2 virtues 0.017584777169442737 +2 well 0.017584777169442737 +2 willingness 0.017584777169442737 +2 find 0.017584777169442737 +2 1 0.014001086678120098 +2 experience 0.014001086678120098 +2 often 0.014001086678120098 +``` +The above result is considered to be appropriate as docid 1, 2, and 3 are the Wikipedia entries of Justice, Wisdom, and Knowledge, respectively. 
+ +# Feature Vector with TF-IDF values + +```sql +select + docid, + -- collect_list(concat(word, ":", tfidf)) as features -- Hive 0.13 or later + collect_list(feature(word, tfidf)) as features -- Hivemall v0.3.4 & Hive 0.13 or later + -- collect_all(concat(word, ":", tfidf)) as features -- before Hive 0.13 +from + tfidf +group by + docid; +``` + +``` +1 ["justice:0.1641245850805637","found:0.06564983513276658","discussion:0.06564983513276658","law:0.065 +64983513276658","based:0.06564983513276658","religion:0.06564983513276658","viewpoints:0.03282491756638329"," +rationality:0.03282491756638329","including:0.03282491756638329","context:0.03282491756638329","concept:0.032 +82491756638329","rightness:0.03282491756638329","general:0.03282491756638329","many:0.03282491756638329","dif +fering:0.03282491756638329","fairness:0.03282491756638329","social:0.03282491756638329","broadest:0.032824917 +56638329","equity:0.03282491756638329","includes:0.03282491756638329","theology:0.03282491756638329","ethics: +0.03282491756638329","moral:0.03282491756638329","numerous:0.03282491756638329","philosophical:0.032824917566 +38329","application:0.03282491756638329","perspectives:0.03282491756638329","procedural:0.03282491756638329", +"realm:0.03282491756638329","divided:0.03282491756638329","concepts:0.03282491756638329","attainment:0.032824 +91756638329","fields:0.03282491756638329","often:0.026135361945200226","philosophy:0.026135361945200226","stu +dy:0.026135361945200226"] +2 ["action:0.07033910867777095","wisdom:0.05275433288400458","one:0.05275433288400458","understanding:0 +.04200326112968063","judgement:0.035169554338885474","apply:0.035169554338885474","disposition:0.035169554338 +885474","given:0.035169554338885474" +... 
+``` \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/ft_engineering/vectorizer.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/ft_engineering/vectorizer.md b/docs/gitbook/ft_engineering/vectorizer.md new file mode 100644 index 0000000..bc929a5 --- /dev/null +++ b/docs/gitbook/ft_engineering/vectorizer.md @@ -0,0 +1,42 @@ +## Feature Vectorizer + +`array<string> vectorize_feature(array<string> featureNames, ...)` is useful to generate a feature vector for each row, from a table. + +```sql +select vectorize_features(array("a","b"),"0.2","0.3") from dual; +>["a:0.2","b:0.3"] + +-- avoid zero weight +select vectorize_features(array("a","b"),"0.2",0) from dual; +> ["a:0.2"] + +-- true boolean value is treated as 1.0 (a categorical value w/ its column name) +select vectorize_features(array("a","b","bool"),0.2,0.3,true) from dual; +> ["a:0.2","b:0.3","bool:1.0"] + +-- an example to generate feature vectors from table +select * from dual; +> 1 +select vectorize_features(array("a"),*) from dual; +> ["a:1.0"] + +-- has categorical feature +select vectorize_features(array("a","b","wheather"),"0.2","0.3","sunny") from dual; +> ["a:0.2","b:0.3","whether#sunny"] +``` + +```sql +select + id, + vectorize_features( + array("age","job","marital","education","default","balance","housing","loan","contact","day","month","duration","campaign","pdays","previous","poutcome"), + age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome + ) as features, + y +from + train +limit 2; + +> 1 ["age:39.0","job#blue-collar","marital#married","education#secondary","default#no","balance:1756.0","housing#yes","loan#no","contact#cellular","day:3.0","month#apr","duration:939.0","campaign:1.0","pdays:-1.0","poutcome#unknown"] 1 +> 2 
["age:51.0","job#entrepreneur","marital#married","education#primary","default#no","balance:1443.0","housing#no","loan#no","contact#cellular","day:18.0","month#feb","duration:172.0","campaign:10.0","pdays:-1.0","poutcome#unknown"] 1 +``` \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/getting_started/README.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/getting_started/README.md b/docs/gitbook/getting_started/README.md new file mode 100644 index 0000000..27870e5 --- /dev/null +++ b/docs/gitbook/getting_started/README.md @@ -0,0 +1 @@ +# Summary \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/getting_started/input-format.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/getting_started/input-format.md b/docs/gitbook/getting_started/input-format.md new file mode 100644 index 0000000..272d3eb --- /dev/null +++ b/docs/gitbook/getting_started/input-format.md @@ -0,0 +1,214 @@ +This page explains the input format of training data in Hivemall. +Here, we use [EBNF](http://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_Form)-like notation for describing the format. + +<!-- toc --> + +# Input Format for Classification + +The classifiers of Hivemall takes 2 (or 3) arguments: *features*, *label*, and *options* (a.k.a. [hyperparameters](http://en.wikipedia.org/wiki/Hyperparameter)). The first two arguments of training functions (e.g., [logress](https://github.com/myui/hivemall/wiki/a9a-binary-classification-(logistic-regression)) and [train_scw](https://github.com/myui/hivemall/wiki/news20-binary-classification-%232-(CW,-AROW,-SCW))) represents training examples. + +In Statistics, *features* and *label* are called [Explanatory variable and Response Variable](http://www.oswego.edu/~srp/stats/variable_types.htm), respectively. 
+ +# Features format (for classification and regression) + +The format of *features* is common between (binary and multi-class) classification and regression. +Hivemall accepts ARRAY<INT|BIGINT|TEXT> for the type of *features* column. + +Hivemall uses a *sparse* data format (cf. [Compressed Row Storage](http://netlib.org/linalg/html_templates/node91.html)) which is similar to [LIBSVM](http://stackoverflow.com/questions/12112558/read-write-data-in-libsvm-format) and [Vowpal Wabbit](https://github.com/JohnLangford/vowpal_wabbit/wiki/Input-format). + +The format of each feature in an array is as follows: +``` +feature ::= <index>:<weight> or <index> +``` + +Each element of *index* or *weight* then accepts the following format: +``` +index ::= <INT | BIGINT | TEXT> +weight ::= <FLOAT> +``` + +The *index* are usually a number (INT or BIGINT) starting from 1. +Here is an instance of a features. +``` +10:3.4 123:0.5 34567:0.231 +``` + +*Note:* As mentioned later, *index* "0" is reserved for a [Bias/Dummy variable](https://github.com/myui/hivemall/wiki/Using-explicit-addBias()-for-a-better-prediction). + +In addition to numbers, you can use a TEXT value for an index. For example, you can use array("height:1.5", "length:2.0") for the features. +``` +"height:1.5" "length:2.0" +``` + +## Quantitative and Categorical variables + +A [quantitative variable](http://www.oswego.edu/~srp/stats/variable_types.htm) must have an *index* entry. + +Hivemall (v0.3.1 or later) provides *add_feature_index* function which is useful for adding indexes to quantitative variables. + +```sql +select add_feature_index(array(3,4.0,5)) from dual; +``` +> ["1:3.0","2:4.0","3:5.0"] + +You can omit specifying *weight* for each feature e.g. for [Categorical variables](http://www.oswego.edu/~srp/stats/variable_types.htm) as follows: +``` +feature ::= <index> +``` +Note 1.0 is used for the weight when omitting *weight*. 
+ +## Bias/Dummy Variable in features + +Note that "0" is reserved for a Bias variable (called dummy variable in Statistics). + +The [addBias](https://github.com/myui/hivemall/wiki/Using-explicit-addBias()-for-a-better-prediction) function is Hivemall appends "0:1.0" as an element of array in *features*. + +## Feature hashing + +Hivemall supports [feature hashing/hashing trick](http://en.wikipedia.org/wiki/Feature_hashing) through [mhash function](https://github.com/myui/hivemall/wiki/KDDCup-2012-track-2-CTR-prediction-dataset#converting-feature-representation-by-feature-hashing). + +The mhash function takes a feature (i.e., *index*) of TEXT format and generates a hash number of a range from 1 to 2^24 (=16777216) by the default setting. + +Feature hashing is useful where the dimension of feature vector (i.e., the number of elements in *features*) is so large. Consider applying [mhash function]((https://github.com/myui/hivemall/wiki/KDDCup-2012-track-2-CTR-prediction-dataset#converting-feature-representation-by-feature-hashing)) when a prediction model does not fit in memory and OutOfMemory exception happens. + +In general, you don't need to use mhash when the dimension of feature vector is less than 16777216. +If feature *index* is very long TEXT (e.g., "xxxxxxx-yyyyyy-weight:55.3") and uses huge memory spaces, consider using mhash as follows: +```sql +-- feature is v0.3.2 or before +concat(mhash(extract_feature("xxxxxxx-yyyyyy-weight:55.3")), ":", extract_weight("xxxxxxx-yyyyyy-weight:55.3")) + +-- feature is v0.3.2-1 or later +feature(mhash(extract_feature("xxxxxxx-yyyyyy-weight:55.3")), extract_weight("xxxxxxx-yyyyyy-weight:55.3")) +``` +> 43352:55.3 + +## Feature Normalization + +Feature (weight) normalization is important in machine learning. Please refer [https://github.com/myui/hivemall/wiki/Feature-scaling](https://github.com/myui/hivemall/wiki/Feature-scaling) for detail. 
+
+***
+
+# Label format in Binary Classification
+
+The *label* must be an *INT* typed column and the values are positive (+1) or negative (-1) as follows:
+```
+<label> ::= 1 | -1
+```
+
+Alternatively, you can use the following format that represents 1 for a positive example and 0 for a negative example:
+```
+<label> ::= 0 | 1
+```
+
+# Label format in Multi-class Classification
+
+You can use any PRIMITIVE type in the multi-class *label*.
+
+```
+<label> ::= <primitive type>
+```
+
+Typically, the type of label column will be INT, BIGINT, or TEXT.
+
+***
+
+# Input format in Regression
+
+In regression, the response/predictor variable (we denote it as *target*) is a real number.
+
+Before Hivemall v0.3, we accepted only the FLOAT type for *target*.
+```
+<target> ::= <FLOAT>
+```
+
+You need to explicitly cast a double value of *target* to float as follows:
+```sql
+CAST(target as FLOAT)
+```
+
+On the other hand, Hivemall v0.3 or later accepts double-compatible numbers in *target*.
+```
+<target> ::= <FLOAT | DOUBLE | INT | TINYINT | SMALLINT | BIGINT>
+```
+
+## Target in Logistic Regression
+
+Logistic regression is actually a binary classification scheme, while it can produce the probability that a training example is positive.
+
+A *target* value of a training input must be in the range 0.0 to 1.0, specifically 0.0 or 1.0.
+ +*** + +# Helper functions + +```sql +-- hivemall v0.3.2 and before +select concat("weight",":",55.0); + +-- hivemall v0.3.2-1 and later +select feature("weight", 55.0); +``` +> weight:55.0 + +```sql +select extract_feature("weight:55.0"), extract_weight("weight:55.0"); +``` +> weight | 55.0 + +```sql +-- hivemall v0.4.0 and later +select feature_index(array("10:0.2","7:0.3","9")); +``` +> [10,7,9] + +```sql +select + convert_label(-1), convert_label(1), convert_label(0.0f), convert_label(1.0f) +from + dual; +``` +> 0.0f | 1.0f | -1 | 1 + +## Quantitative Features + +`array<string> quantitative_features(array<string> featureNames, ...)` is a helper function to create sparse quantitative features from a table. + +```sql +select quantitative_features(array("apple","value"),1,120.3); +``` +> ["apple:1.0","value:120.3"] + +## Categorical Features + +`array<string> categorical_features(array<string> featureNames, ...)` is a helper function to create sparse categorical features from a table. 
+ +```sql +select categorical_features( + array("is_cat","is_dog","is_lion","is_pengin","species"), + 1, 0, 1.0, true, "dog" +); +``` +> ["is_cat#1","is_dog#0","is_lion#1.0","is_pengin#true","species#dog"] + +## Preparing training data table + +You can create a training data table as follows: + +```sql +select + rowid() as rowid, + concat_array( + array("bias:1.0"), + categorical_features( + array("id", "name"), + id, name + ), + quantitative_features( + array("height", "weight"), + height, weight + ) + ) as features, + click_or_not as label +from + table; +``` \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/getting_started/installation.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/getting_started/installation.md b/docs/gitbook/getting_started/installation.md new file mode 100644 index 0000000..bb1920e --- /dev/null +++ b/docs/gitbook/getting_started/installation.md @@ -0,0 +1,25 @@ +Prerequisites +============ + +* Hive v0.12 or later +* Java 7 or later +* [hivemall-core-xxx-with-dependencies.jar](https://github.com/myui/hivemall/releases) +* [define-all.hive](https://github.com/myui/hivemall/releases) + +Installation +============ + +Add the following two lines to your `$HOME/.hiverc` file. + +``` +add jar /home/myui/tmp/hivemall-core-xxx-with-dependencies.jar; +source /home/myui/tmp/define-all.hive; +``` + +This automatically loads all Hivemall functions every time you start a Hive session. Alternatively, you can run the following command each time. 
+ +``` +$ hive +add jar /tmp/hivemall-core-xxx-with-dependencies.jar; +source /tmp/define-all.hive; +``` \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/getting_started/permanent-functions.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/getting_started/permanent-functions.md b/docs/gitbook/getting_started/permanent-functions.md new file mode 100644 index 0000000..aab399b --- /dev/null +++ b/docs/gitbook/getting_started/permanent-functions.md @@ -0,0 +1,42 @@ +Hive v0.13 or later supports [permanent functions](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-Create/DropFunction) that live across sessions. + +Permanent functions are useful when you are using Hive through Hiveserver or to avoid hivemall installation for each session. + +_Note: This feature is supported since hivemall-0.3 beta 3 or later._ + +<!-- toc --> + +# Put hivemall jar to HDFS + +First, put hivemall jar to HDFS as follows: +```sh +hadoop fs -mkdir -p /apps/hivemall +hadoop fs -put hivemall-with-dependencies.jar /apps/hivemall +``` + +# Create permanent functions + +_The following is an auxiliary step to define functions for hivemall databases, not for the default database._ +```sql +CREATE DATABASE IF NOT EXISTS hivemall; +USE hivemall; +``` + +Then, create permanent functions using [define-all-as-permanent.hive](https://github.com/myui/hivemall/blob/master/resources/ddl/define-all-as-permanent.hive), a DDL script to define permanent UDFs. +```sql +set hivevar:hivemall_jar=hdfs:///apps/hivemall/hivemall-with-dependencies.jar; + +source /tmp/define-all-as-permanent.hive; +``` + +# Confirm functions + +```sql +show functions "hivemall.*"; + +> hivemall.adadelta +> hivemall.adagrad +``` + +> #### Caution +You need to specify "hivemall." 
prefix to call hivemall UDFs in your queries if UDFs are loaded into non-default scheme, in this case _hivemall_. \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/misc/generic_funcs.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/misc/generic_funcs.md b/docs/gitbook/misc/generic_funcs.md new file mode 100644 index 0000000..1769699 --- /dev/null +++ b/docs/gitbook/misc/generic_funcs.md @@ -0,0 +1,210 @@ +This page describes a list of useful Hivemall generic functions. + +# Array functions + +## Array UDFs + +- `array_concat(array<ANY> x1, array<ANY> x2, ..)` - Returns a concatenated array + +```sql +select array_concat(array(1),array(2,3)); +> [1,2,3] +``` + +- `array_intersect(array<ANY> x1, array<ANY> x2, ..)` - Returns an intersect of given arrays + +```sql +select array_intersect(array(1,3,4),array(2,3,4),array(3,5)); +> [3] +``` + +- `array_remove(array<int|text> original, int|text|array<int> target)` - Returns an array that the target is removed from the original array + +```sql +select array_remove(array(1,null,3),array(null)); +> [3] + +select array_remove(array("aaa","bbb"),"bbb"); +> ["aaa"] +``` + +- `sort_and_uniq_array(array<int>)` - Takes an array of type int and returns a sorted array in a natural order with duplicate elements eliminated + +```sql +select sort_and_uniq_array(array(3,1,1,-2,10)); +> [-2,1,3,10] +``` + +- `subarray_endwith(array<int|text> original, int|text key)` - Returns an array that ends with the specified key + +```sql +select subarray_endwith(array(1,2,3,4), 3); +> [1,2,3] +``` + +- `subarray_startwith(array<int|text> original, int|text key)` - Returns an array that starts with the specified key + +```sql +select subarray_startwith(array(1,2,3,4), 2); +> [2,3,4] +``` + +- `subarray(array<int> orignal, int fromIndex, int toIndex)` - Returns a slice of the original array between the inclusive fromIndex and the 
exclusive toIndex
+
+```sql
+select subarray(array(1,2,3,4,5,6), 2,4);
+> [3,4]
+```
+
+## Array UDAFs
+
+- `array_avg(array<NUMBER>)` - Returns an array<double> in which each element is the mean of a set of numbers
+
+- `array_sum(array<NUMBER>)` - Returns an array<double> in which each element is the sum of a set of numbers
+
+# Bitset functions
+
+## Bitset UDF
+
+- `to_bits(int[] indexes)` - Returns a bitset representation of the given indexes in long[]
+
+```sql
+select to_bits(array(1,2,3,128));
+>[14,-9223372036854775808]
+```
+
+- `unbits(long[] bitset)` - Returns a long array of the given bitset representation
+
+```sql
+select unbits(to_bits(array(1,4,2,3)));
+> [1,2,3,4]
+```
+
+- `bits_or(array<long> b1, array<long> b2, ..)` - Returns a logical OR of the given bitsets
+
+```sql
+select unbits(bits_or(to_bits(array(1,4)),to_bits(array(2,3))));
+> [1,2,3,4]
+```
+
+## Bitset UDAF
+
+- `bits_collect(int|long x)` - Returns a bitset in array<long>
+
+
+# Compression functions
+
+- `deflate(TEXT data [, const int compressionLevel])` - Returns a compressed BINARY object by using Deflater. 
+The compression level must be in range [-1,9] + +```sql +select base91(deflate('aaaaaaaaaaaaaaaabbbbccc')); +> AA+=kaIM|WTt!+wbGAA +``` + +- `inflate(BINARY compressedData)` - Returns a decompressed STRING by using Inflater + + +```sql +select inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc')))); +> aaaaaaaaaaaaaaaabbbbccc +``` + +# Map functions + +## Map UDFs + +- `map_get_sum(map<int,float> src, array<int> keys)` - Returns sum of values that are retrieved by keys + +- `map_tail_n(map SRC, int N)` - Returns the last N elements from a sorted array of SRC + +## MAP UDAFs + +- `to_map(key, value)` - Convert two aggregated columns into a key-value map + +- `to_ordered_map(key, value [, const boolean reverseOrder=false])` - Convert two aggregated columns into an ordered key-value map + + +# MapReduce functions + +- `rowid()` - Returns a generated row id of a form {TASK_ID}-{SEQUENCE_NUMBER} + +- `taskid()` - Returns the value of mapred.task.partition + +# Math functions + +- `sigmoid(x)` - Returns 1.0 / (1.0 + exp(-x)) + +# Text processing functions + +- `base91(binary)` - Convert the argument from binary to a BASE91 string + +```sql +select base91(deflate('aaaaaaaaaaaaaaaabbbbccc')); +> AA+=kaIM|WTt!+wbGAA +``` + +- `unbase91(string)` - Convert a BASE91 string to a binary + +```sql +select inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc')))); +> aaaaaaaaaaaaaaaabbbbccc +``` + +- `normalize_unicode(string str [, string form])` - Transforms `str` with the specified normalization form. 
The `form` takes one of NFC (default), NFD, NFKC, or NFKD
+
+```sql
+select normalize_unicode('ﾊﾝｶｸｶﾅ','NFKC');
+> ハンカクカナ
+
+select normalize_unicode('㈱㌧㌦Ⅲ','NFKC');
+> (株)トンドルIII
+```
+
+- `split_words(string query [, string regex])` - Returns an array<text> containing split strings
+
+- `is_stopword(string word)` - Returns whether the given word is an English stopword or not
+
+- `tokenize(string englishText [, boolean toLowerCase])` - Returns words in array<string>
+
+- `tokenize_ja(String line [, const string mode = "normal", const list<string> stopWords, const list<string> stopTags])` - Returns tokenized strings in array<string>
+
+```sql
+select tokenize_ja("kuromojiを使った分かち書きのテストです。第二引数にはnormal/search/extendedを指定できます。デフォルトではnormalモードです。");
+
+> ["kuromoji","使う","分かち書き","テスト","第","二","引数","normal","search","extended","指定","デフォルト","normal","モード"]
+```
+
+https://github.com/myui/hivemall/wiki/Tokenizer
+
+# Other functions
+
+- `convert_label(const int|const float)` - Convert from -1|1 to 0.0f|1.0f, or from 0.0f|1.0f to -1|1
+
+- `each_top_k(int K, Object group, double cmpKey, *)` - Returns top-K values (or tail-K values when k is less than 0)
+
+https://github.com/myui/hivemall/wiki/Efficient-Top-k-computation-on-Apache-Hive-using-Hivemall-UDTF
+
+- `generate_series(const int|bigint start, const int|bigint end)` - Generate a series of values, from start to end
+
+```sql
+WITH dual as (
+  select 1
+)
+select generate_series(1,9)
+from dual;
+
+1
+2
+3
+4
+5
+6
+7
+8
+9
+```
+
+A similar function to PostgreSQL's `generate_series`. 
+http://www.postgresql.org/docs/current/static/functions-srf.html +- `x_rank(KEY)` - Generates a pseudo sequence number starting from 1 for each key \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/misc/tokenizer.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/misc/tokenizer.md b/docs/gitbook/misc/tokenizer.md new file mode 100644 index 0000000..cd2ce08 --- /dev/null +++ b/docs/gitbook/misc/tokenizer.md @@ -0,0 +1,30 @@ +# Tokenizer for English Texts + +Hivemall provides simple English text tokenizer UDF that has following syntax: +```sql +tokenize(text input, optional boolean toLowerCase = false) +``` + +# Tokenizer for Japanese Texts + +Hivemall-NLP module provides a Japanese text tokenizer UDF using [Kuromoji](https://github.com/atilika/kuromoji). + +First of all, you need to issue the following DDLs to use the NLP module. Note NLP module is not included in [hivemall-with-dependencies.jar](https://github.com/myui/hivemall/releases). + +> add jar /tmp/[hivemall-nlp-xxx-with-dependencies.jar](https://github.com/myui/hivemall/releases); + +> source /tmp/[define-additional.hive](https://github.com/myui/hivemall/releases); + +The signature of the UDF is as follows: +```sql +tokenize_ja(text input, optional const text mode = "normal", optional const array<string> stopWords, optional const array<string> stopTags) +``` +_Caution: `tokenize_ja` is supported since Hivemall v0.4.1 and later._ + +It's basic usage is as follows: +```sql +select tokenize_ja("kuromojiã使ã£ãåãã¡æ¸ãã®ãã¹ãã§ãã第äºå¼æ°ã«ã¯normal/search/extendedãæå®ã§ãã¾ããããã©ã«ãã§ã¯normalã¢ã¼ãã§ãã"); +``` +> ["kuromoji","使ã","åãã¡æ¸ã","ãã¹ã","第","äº","弿°","normal","search","extended","æå®","ããã©ã«ã","normal","ã¢ã¼ã"] + +For detailed APIs, please refer Javadoc of [JapaneseAnalyzer](https://lucene.apache.org/core/5_3_1/analyzers-kuromoji/org/apache/lucene/analysis/ja/JapaneseAnalyzer.html) as well. 
\ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/misc/topk.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/misc/topk.md b/docs/gitbook/misc/topk.md new file mode 100644 index 0000000..dcd545a --- /dev/null +++ b/docs/gitbook/misc/topk.md @@ -0,0 +1,288 @@ +`each_top_k(int k, ANY group, double value, arg1, arg2, ..., argN)` returns a top-k records for each `group`. It returns a relation consists of `(int rank, double value, arg1, arg2, .., argN)`. + +This function is particularly useful for applying a similarity/distance function where the computation complexity is **O(nm)**. + +`each_top_k` is very fast when compared to other methods running top-k queries (e.g., [`rank/distributed by`](https://ragrawal.wordpress.com/2011/11/18/extract-top-n-records-in-each-group-in-hadoophive/)) in Hive. + +## Caution +* `each_top_k` is supported from Hivemall v0.3.2-3 or later. +* This UDTF assumes that input records are sorted by `group`. Use `DISTRIBUTED BY group SORTED BY group` to ensure that. Or, you can use `LEFT OUTER JOIN` for certain cases. +* It takes variable lengths arguments in `argN`. +* The third argument `value` is used for the comparison. +* `Any number types` or `timestamp` are accepted for the type of `value`. +* If k is less than 0, reverse order is used and `tail-K` records are returned for each `group`. +* Note that this function returns [a pseudo ranking](http://www.michaelpollmeier.com/selecting-top-k-items-from-a-list-efficiently-in-java-groovy/) for top-k. It always returns `at-most K` records for each group. The ranking scheme is similar to `dense_rank` but slightly different in certain cases. 
+ +# Usage + +## top-k clicks + +http://stackoverflow.com/questions/9390698/hive-getting-top-n-records-in-group-by-query/32559050#32559050 + +```sql +set hivevar:k=5; + +select + page-id, + user-id, + clicks +from ( + select + each_top_k(${k}, page-id, clicks, page-id, user-id) + as (rank, clicks, page-id, user-id) + from ( + select + page-id, user-id, clicks + from + mytable + DISTRIBUTE BY page-id SORT BY page-id + ) t1 +) t2 +order by page-id ASC, clicks DESC; +``` + +## Top-k similarity computation + +```sql +set hivevar:k=10; + +SELECT + each_top_k( + ${k}, t2.id, angular_similarity(t2.features, t1.features), + t2.id, + t1.id, + t1.y + ) as (rank, similarity, base_id, neighbor_id, y) +FROM + test_hivemall t2 + LEFT OUTER JOIN train_hivemall t1; +``` + +``` +1 0.8594650626182556 12 10514 0 +2 0.8585299849510193 12 11719 0 +3 0.856602132320404 12 21009 0 +4 0.8562054634094238 12 17582 0 +5 0.8516314029693604 12 22006 0 +6 0.8499397039413452 12 25364 0 +7 0.8467264771461487 12 900 0 +8 0.8463355302810669 12 8018 0 +9 0.8439178466796875 12 7041 0 +10 0.8438876867294312 12 21595 0 +1 0.8390793800354004 25 21125 0 +2 0.8344510793685913 25 14073 0 +3 0.8340602517127991 25 9008 0 +4 0.8328862190246582 25 6598 0 +5 0.8301891088485718 25 943 0 +6 0.8271955251693726 25 20400 0 +7 0.8255619406700134 25 10922 0 +8 0.8241575956344604 25 8477 0 +9 0.822281539440155 25 25977 0 +10 0.8205751180648804 25 21115 0 +1 0.9761330485343933 34 2513 0 +2 0.9536819458007812 34 8697 0 +3 0.9531533122062683 34 7326 0 +4 0.9493276476860046 34 15173 0 +5 0.9480557441711426 34 19468 0 +... 
+``` + +### Explicit grouping using `distribute by` and `sort by` + +```sql +SELECT + each_top_k( + 10, id1, angular_similarity(features1, features2), + id1, + id2, + y + ) as (rank, similarity, id, other_id, y) +FROM ( +select + t1.id as id1, + t2.id as id2, + t1.features as features1, + t2.features as features2, + t1.y +from + train_hivemall t1 + CROSS JOIN test_hivemall t2 +DISTRIBUTE BY id1 SORT BY id1 +) t; +``` + +### Parallelization of similarity computation using WITH clause + +```sql +create table similarities +as +WITH test_rnd as ( +select + rand(31) as rnd, + id, + features +from + test_hivemall +), +t01 as ( +select + id, + features +from + test_rnd +where + rnd < 0.2 +), +t02 as ( +select + id, + features +from + test_rnd +where + rnd >= 0.2 and rnd < 0.4 +), +t03 as ( +select + id, + features +from + test_rnd +where + rnd >= 0.4 and rnd < 0.6 +), +t04 as ( +select + id, + features +from + test_rnd +where + rnd >= 0.6 and rnd < 0.8 +), +t05 as ( +select + id, + features +from + test_rnd +where + rnd >= 0.8 +), +s01 as ( +SELECT + each_top_k( + 10, t2.id, angular_similarity(t2.features, t1.features), + t2.id, + t1.id, + t1.y + ) as (rank, similarity, base_id, neighbor_id, y) +FROM + t01 t2 + LEFT OUTER JOIN train_hivemall t1 +), +s02 as ( +SELECT + each_top_k( + 10, t2.id, angular_similarity(t2.features, t1.features), + t2.id, + t1.id, + t1.y + ) as (rank, similarity, base_id, neighbor_id, y) +FROM + t02 t2 + LEFT OUTER JOIN train_hivemall t1 +), +s03 as ( +SELECT + each_top_k( + 10, t2.id, angular_similarity(t2.features, t1.features), + t2.id, + t1.id, + t1.y + ) as (rank, similarity, base_id, neighbor_id, y) +FROM + t03 t2 + LEFT OUTER JOIN train_hivemall t1 +), +s04 as ( +SELECT + each_top_k( + 10, t2.id, angular_similarity(t2.features, t1.features), + t2.id, + t1.id, + t1.y + ) as (rank, similarity, base_id, neighbor_id, y) +FROM + t04 t2 + LEFT OUTER JOIN train_hivemall t1 +), +s05 as ( +SELECT + each_top_k( + 10, t2.id, 
angular_similarity(t2.features, t1.features), + t2.id, + t1.id, + t1.y + ) as (rank, similarity, base_id, neighbor_id, y) +FROM + t05 t2 + LEFT OUTER JOIN train_hivemall t1 +) +select * from s01 +union all +select * from s02 +union all +select * from s03 +union all +select * from s04 +union all +select * from s05; +``` + +## tail-K + +```sql +set hivevar:k=-10; + +SELECT + each_top_k( + ${k}, t2.id, angular_similarity(t2.features, t1.features), + t2.id, + t1.id, + t1.y + ) as (rank, similarity, base_id, neighbor_id, y) +FROM + test_hivemall t2 + LEFT OUTER JOIN train_hivemall t1 +-- limit 25 +``` + +``` +1 0.4383084177970886 1 7503 0 +2 0.44166821241378784 1 10143 0 +3 0.4424300789833069 1 11073 0 +4 0.44254064559936523 1 17782 0 +5 0.4442034363746643 1 18556 0 +6 0.45163780450820923 1 3786 0 +7 0.45244503021240234 1 10242 0 +8 0.4525672197341919 1 21657 0 +9 0.4527127146720886 1 17218 0 +10 0.45314133167266846 1 25141 0 +1 0.44030147790908813 2 3786 0 +2 0.4408798813819885 2 23386 0 +3 0.44112563133239746 2 11073 0 +4 0.4415401816368103 2 22853 0 +5 0.4422193765640259 2 21657 0 +6 0.4429032802581787 2 10143 0 +7 0.4435907006263733 2 24413 0 +8 0.44569307565689087 2 7503 0 +9 0.4460843801498413 2 25141 0 +10 0.4464914798736572 2 24289 0 +1 0.43862903118133545 3 23150 1 +2 0.4398220181465149 3 9881 1 +3 0.44283604621887207 3 27121 0 +4 0.4432108402252197 3 26220 1 +5 0.44323229789733887 3 18541 0 +... 
+``` \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/multiclass/iris.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/multiclass/iris.md b/docs/gitbook/multiclass/iris.md new file mode 100644 index 0000000..e69de29 http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/multiclass/iris_dataset.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/multiclass/iris_dataset.md b/docs/gitbook/multiclass/iris_dataset.md new file mode 100644 index 0000000..86f89ad --- /dev/null +++ b/docs/gitbook/multiclass/iris_dataset.md @@ -0,0 +1,203 @@ +# Dataset preparation +Iris Dataset: https://archive.ics.uci.edu/ml/datasets/Iris + +```sh +$ wget http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data +$ less iris.data + + ... +5.3,3.7,1.5,0.2,Iris-setosa +5.0,3.3,1.4,0.2,Iris-setosa +7.0,3.2,4.7,1.4,Iris-versicolor + ... +``` + +# Create training/test table in Hive + +```sql +create database iris; +use iris; + +create external table iris_raw ( + rowid int, + label string, + features array<float> +) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' COLLECTION ITEMS TERMINATED BY "," STORED AS TEXTFILE LOCATION '/dataset/iris/raw'; +``` + +# Loading data into HDFS + +```sh +$ awk -F"," 'NF >0 {OFS="|"; print NR,$5,$1","$2","$3","$4}' iris.data | head -3 + +1|Iris-setosa|5.1,3.5,1.4,0.2 +2|Iris-setosa|4.9,3.0,1.4,0.2 +3|Iris-setosa|4.7,3.2,1.3,0.2 +``` + +```sh +$ awk -F"," 'NF >0 {OFS="|"; print NR,$5,$1","$2","$3","$4}' iris.data | hadoop fs -put - /dataset/iris/raw/iris.data +``` + +```sql +select count(1) from iris_raw; + +> 150 +``` + +# Feature scaling + +Normalization of feature weights is very important to get a good prediction in machine learning. 
+ +```sql +select + min(features[0]), max(features[0]), + min(features[1]), max(features[1]), + min(features[2]), max(features[2]), + min(features[3]), max(features[3]) +from + iris_raw; + +> 4.3 7.9 2.0 4.4 1.0 6.9 0.1 2.5 +``` + +```sql +set hivevar:f0_min=4.3; +set hivevar:f0_max=7.9; +set hivevar:f1_min=2.0; +set hivevar:f1_max=4.4; +set hivevar:f2_min=1.0; +set hivevar:f2_max=6.9; +set hivevar:f3_min=0.1; +set hivevar:f3_max=2.5; + +create or replace view iris_scaled +as +select + rowid, + label, + add_bias(array( + concat("1:", rescale(features[0],${f0_min},${f0_max})), + concat("2:", rescale(features[1],${f1_min},${f1_max})), + concat("3:", rescale(features[2],${f2_min},${f2_max})), + concat("4:", rescale(features[3],${f3_min},${f3_max})) + )) as features +from + iris_raw; +``` + +```sql +select * from iris_scaled limit 3; + +> 1 Iris-setosa ["1:0.22222215","2:0.625","3:0.0677966","4:0.041666664","0:1.0"] +> 2 Iris-setosa ["1:0.16666664","2:0.41666666","3:0.0677966","4:0.041666664","0:1.0"] +> 3 Iris-setosa ["1:0.11111101","2:0.5","3:0.05084745","4:0.041666664","0:1.0"] +``` + +_[LibSVM web page](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.html#iris) provides a normalized (using [ZScore](https://github.com/myui/hivemall/wiki/Feature-scaling)) version of Iris dataset._ + +# Create training/test data + +```sql +set hivevar:rand_seed=31; + +create table iris_shuffled +as +select rand(${rand_seed}) as rnd, * from iris_scaled; + +-- 80% for training +create table train80p as +select * from iris_shuffled +order by rnd DESC +limit 120; + +-- 20% for testing +create table test20p as +select * from iris_shuffled +order by rnd ASC +limit 30; + +create table test20p_exploded +as +select + rowid, + label, + extract_feature(feature) as feature, + extract_weight(feature) as value +from + test20p LATERAL VIEW explode(features) t AS feature; +``` + +# Define an amplified view for the training input +```sql +set hivevar:xtimes=10; +set 
hivevar:shufflebuffersize=1000; + +create or replace view training_x10 +as +select + rand_amplify(${xtimes}, ${shufflebuffersize}, rowid, label, features) as (rowid, label, features) +from + train80p; +``` + +# Training (multiclass classification) + +```sql +create table model_scw1 as +select + label, + feature, + argmin_kld(weight, covar) as weight +from + (select + train_multiclass_scw(features, label) as (label, feature, weight, covar) + from + training_x10 + ) t +group by label, feature; +``` + +# Predict + +```sql +create or replace view predict_scw1 +as +select + rowid, + m.col0 as score, + m.col1 as label +from ( +select + rowid, + maxrow(score, label) as m +from ( + select + t.rowid, + m.label, + sum(m.weight * t.value) as score + from + test20p_exploded t LEFT OUTER JOIN + model_scw1 m ON (t.feature = m.feature) + group by + t.rowid, m.label +) t1 +group by rowid +) t2; +``` + +# Evaluation + +```sql +create or replace view eval_scw1 as +select + t.label as actual, + p.label as predicted +from + test20p t JOIN predict_scw1 p + on (t.rowid = p.rowid); + +select count(1)/30 from eval_scw1 +where actual = predicted; +``` +> 0.9666666666666667 http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/multiclass/iris_randomforest.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/multiclass/iris_randomforest.md b/docs/gitbook/multiclass/iris_randomforest.md new file mode 100644 index 0000000..bafa338 --- /dev/null +++ b/docs/gitbook/multiclass/iris_randomforest.md @@ -0,0 +1,307 @@ +*NOTE: RandomForest is being supported from Hivemall v0.4 or later.* + +# Dataset + +* https://archive.ics.uci.edu/ml/datasets/Iris + +``` +Attribute Information: + 1. sepal length in cm + 2. sepal width in cm + 3. petal length in cm + 4. petal width in cm + 5. 
class: + -- Iris Setosa + -- Iris Versicolour + -- Iris Virginica +``` + +# Table preparation + +```sql +create database iris; +use iris; + +create external table raw ( + sepal_length int, + sepal_width int, + petal_length int, + petak_width int, + class string +) +ROW FORMAT DELIMITED + FIELDS TERMINATED BY ',' + LINES TERMINATED BY '\n' +STORED AS TEXTFILE LOCATION '/dataset/iris/raw'; + +$ sed '/^$/d' iris.data | hadoop fs -put - /dataset/iris/raw/iris.data +``` + +```sql +create table label_mapping +as +select + class, + rank - 1 as label +from ( +select + distinct class, + dense_rank() over (order by class) as rank +from + raw +) t +; +``` + +```sql +create table training +as +select + rowid() as rowid, + array(t1.sepal_length, t1.sepal_width, t1.petal_length, t1.petak_width) as features, + t2.label +from + raw t1 + JOIN label_mapping t2 ON (t1.class = t2.class) +; +``` + +# Training + +`train_randomforest_classifier` takes a dense `features` in double[] and a `label` starting from 0. + +```sql +CREATE TABLE model +STORED AS SEQUENCEFILE +AS +select + train_randomforest_classifier(features, label) + -- hivemall v0.4.1-alpha.2 and before + -- train_randomforest_classifier(features, label) as (pred_model, var_importance, oob_errors, oob_tests) + -- hivemall v0.4.1 and later + -- train_randomforest_classifier(features, label) as (model_id, model_type, pred_model, var_importance, oob_errors, oob_tests) +from + training; +``` +*Note: The default TEXTFILE should not be used for model table when using Javascript output through "-output javascript" option.* + +``` +hive> desc model; +model_id int +model_type int +pred_model string +var_importance array<double> +oob_errors int +oob_tests int +``` + +## Training options + +"-help" option shows usage of the function. 
+ +``` +select train_randomforest_classifier(features, label, "-help") from training; + +> FAILED: UDFArgumentException +usage: train_randomforest_classifier(double[] features, int label [, + string options]) - Returns a relation consists of <int model_id, + int model_type, string pred_model, array<double> var_importance, + int oob_errors, int oob_tests> [-attrs <arg>] [-depth <arg>] + [-disable_compression] [-help] [-leafs <arg>] [-output <arg>] + [-rule <arg>] [-seed <arg>] [-splits <arg>] [-trees <arg>] [-vars + <arg>] + -attrs,--attribute_types <arg> Comma separated attribute types (Q for + quantitative variable and C for + categorical variable. e.g., [Q,C,Q,C]) + -depth,--max_depth <arg> The maximum number of the tree depth + [default: Integer.MAX_VALUE] + -disable_compression Whether to disable compression of the + output script [default: false] + -help Show function help + -leafs,--max_leaf_nodes <arg> The maximum number of leaf nodes + [default: Integer.MAX_VALUE] + -output,--output_type <arg> The output type (serialization/ser or + opscode/vm or javascript/js) [default: + serialization] + -rule,--split_rule <arg> Split algorithm [default: GINI, ENTROPY] + -seed <arg> seed value in long [default: -1 + (random)] + -splits,--min_split <arg> A node that has greater than or equals + to `min_split` examples will split + [default: 2] + -trees,--num_trees <arg> The number of trees for each task + [default: 50] + -vars,--num_variables <arg> The number of random selected features + [default: ceil(sqrt(x[0].length))]. 
+ int(num_variables * x[0].length) is + considered if num_variable is (0,1] +``` +*Caution: "-num_trees" controls the number of trees for each task, not the total number of trees.* + +### Parallelize Training + +To parallelize RandomForest training, you can use UNION ALL as follows: + +```sql +CREATE TABLE model +STORED AS SEQUENCEFILE +AS +select + train_randomforest_classifier(features, label, '-trees 25') +from + training +UNION ALL +select + train_randomforest_classifier(features, label, '-trees 25') +from + training +; +``` + +### Learning stats + +[`Variable importance`](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#varimp) and [`Out Of Bag (OOB) error rate`](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#ooberr) of RandomForest can be shown as follows: + +```sql +select + array_sum(var_importance) as var_importance, + sum(oob_errors) / sum(oob_tests) as oob_err_rate +from + model; +``` +> [2.81010338879605,0.4970357753626371,23.790369091407698,14.315316390235273] 0.05333333333333334 + +### Output prediction model by Javascipt + +```sql +CREATE TABLE model_javascript +STORED AS SEQUENCEFILE +AS +select train_randomforest_classifier(features, label, "-output_type js -disable_compression") +from training; + +select model from model_javascript limit 1; +``` + +```js +if(x[3] <= 0.5) { + 0; +} else { + if(x[2] <= 4.5) { + if(x[3] <= 1.5) { + if(x[0] <= 4.5) { + 1; + } else { + if(x[0] <= 5.5) { + 1; + } else { + if(x[1] <= 2.5) { + 1; + } else { + 1; + } + } + } + } else { + 2; + } + } else { + if(x[3] <= 1.5) { + 2; + } else { + 2; + } + } +} +``` + +# Prediction + +```sql +set hivevar:classification=true; +set hive.auto.convert.join=true; +set hive.mapjoin.optimized.hashtable=false; + +create table predicted_vm +as +SELECT + rowid, + rf_ensemble(predicted) as predicted +FROM ( + SELECT + rowid, + -- hivemall v0.4.1-alpha.2 and before + -- tree_predict(p.model, t.features, ${classification}) as predicted + -- hivemall v0.4.1 
and later + tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted + FROM + model p + LEFT OUTER JOIN -- CROSS JOIN + training t +) t1 +group by + rowid +; +``` +_Note: Javascript outputs can be evaluated by `js_tree_predict`._ + +### Parallelize Prediction + +The following query runs predictions in N-parallel. It would reduce elapsed time for prediction almost by N. + +```sql +SET hivevar:classification=true; +set hive.auto.convert.join=true; +SET hive.mapjoin.optimized.hashtable=false; +SET mapred.reduce.tasks=8; + +create table predicted_vm +as +SELECT + rowid, + rf_ensemble(predicted) as predicted +FROM ( + SELECT + t.rowid, + -- hivemall v0.4.1-alpha.2 and before + -- tree_predict(p.pred_model, t.features, ${classification}) as predicted + -- hivemall v0.4.1 and later + tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted + FROM ( + SELECT model_id, model_type, pred_model + FROM model + DISTRIBUTE BY rand(1) + ) p + LEFT OUTER JOIN training t +) t1 +group by + rowid +; +``` + +# Evaluation + +```sql +select count(1) from training; +> 150 + +set hivevar:total_cnt=150; + +WITH t1 as ( +SELECT + t.rowid, + t.label as actual, + p.predicted.label as predicted +FROM + predicted_vm p + LEFT OUTER JOIN training t ON (t.rowid = p.rowid) +) +SELECT + count(1) / ${total_cnt} +FROM + t1 +WHERE + actual = predicted +; +``` +> 0.9533333333333334 \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/multiclass/iris_scw.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/multiclass/iris_scw.md b/docs/gitbook/multiclass/iris_scw.md new file mode 100644 index 0000000..bafa338 --- /dev/null +++ b/docs/gitbook/multiclass/iris_scw.md @@ -0,0 +1,307 @@ +*NOTE: RandomForest is being supported from Hivemall v0.4 or later.* + +# Dataset + +* https://archive.ics.uci.edu/ml/datasets/Iris + +``` 
+Attribute Information: + 1. sepal length in cm + 2. sepal width in cm + 3. petal length in cm + 4. petal width in cm + 5. class: + -- Iris Setosa + -- Iris Versicolour + -- Iris Virginica +``` + +# Table preparation + +```sql +create database iris; +use iris; + +create external table raw ( + sepal_length int, + sepal_width int, + petal_length int, + petak_width int, + class string +) +ROW FORMAT DELIMITED + FIELDS TERMINATED BY ',' + LINES TERMINATED BY '\n' +STORED AS TEXTFILE LOCATION '/dataset/iris/raw'; + +$ sed '/^$/d' iris.data | hadoop fs -put - /dataset/iris/raw/iris.data +``` + +```sql +create table label_mapping +as +select + class, + rank - 1 as label +from ( +select + distinct class, + dense_rank() over (order by class) as rank +from + raw +) t +; +``` + +```sql +create table training +as +select + rowid() as rowid, + array(t1.sepal_length, t1.sepal_width, t1.petal_length, t1.petak_width) as features, + t2.label +from + raw t1 + JOIN label_mapping t2 ON (t1.class = t2.class) +; +``` + +# Training + +`train_randomforest_classifier` takes a dense `features` in double[] and a `label` starting from 0. + +```sql +CREATE TABLE model +STORED AS SEQUENCEFILE +AS +select + train_randomforest_classifier(features, label) + -- hivemall v0.4.1-alpha.2 and before + -- train_randomforest_classifier(features, label) as (pred_model, var_importance, oob_errors, oob_tests) + -- hivemall v0.4.1 and later + -- train_randomforest_classifier(features, label) as (model_id, model_type, pred_model, var_importance, oob_errors, oob_tests) +from + training; +``` +*Note: The default TEXTFILE should not be used for model table when using Javascript output through "-output javascript" option.* + +``` +hive> desc model; +model_id int +model_type int +pred_model string +var_importance array<double> +oob_errors int +oob_tests int +``` + +## Training options + +"-help" option shows usage of the function. 
+ +``` +select train_randomforest_classifier(features, label, "-help") from training; + +> FAILED: UDFArgumentException +usage: train_randomforest_classifier(double[] features, int label [, + string options]) - Returns a relation consists of <int model_id, + int model_type, string pred_model, array<double> var_importance, + int oob_errors, int oob_tests> [-attrs <arg>] [-depth <arg>] + [-disable_compression] [-help] [-leafs <arg>] [-output <arg>] + [-rule <arg>] [-seed <arg>] [-splits <arg>] [-trees <arg>] [-vars + <arg>] + -attrs,--attribute_types <arg> Comma separated attribute types (Q for + quantitative variable and C for + categorical variable. e.g., [Q,C,Q,C]) + -depth,--max_depth <arg> The maximum number of the tree depth + [default: Integer.MAX_VALUE] + -disable_compression Whether to disable compression of the + output script [default: false] + -help Show function help + -leafs,--max_leaf_nodes <arg> The maximum number of leaf nodes + [default: Integer.MAX_VALUE] + -output,--output_type <arg> The output type (serialization/ser or + opscode/vm or javascript/js) [default: + serialization] + -rule,--split_rule <arg> Split algorithm [default: GINI, ENTROPY] + -seed <arg> seed value in long [default: -1 + (random)] + -splits,--min_split <arg> A node that has greater than or equals + to `min_split` examples will split + [default: 2] + -trees,--num_trees <arg> The number of trees for each task + [default: 50] + -vars,--num_variables <arg> The number of random selected features + [default: ceil(sqrt(x[0].length))]. 
+ int(num_variables * x[0].length) is + considered if num_variable is (0,1] +``` +*Caution: "-num_trees" controls the number of trees for each task, not the total number of trees.* + +### Parallelize Training + +To parallelize RandomForest training, you can use UNION ALL as follows: + +```sql +CREATE TABLE model +STORED AS SEQUENCEFILE +AS +select + train_randomforest_classifier(features, label, '-trees 25') +from + training +UNION ALL +select + train_randomforest_classifier(features, label, '-trees 25') +from + training +; +``` + +### Learning stats + +[`Variable importance`](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#varimp) and [`Out Of Bag (OOB) error rate`](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#ooberr) of RandomForest can be shown as follows: + +```sql +select + array_sum(var_importance) as var_importance, + sum(oob_errors) / sum(oob_tests) as oob_err_rate +from + model; +``` +> [2.81010338879605,0.4970357753626371,23.790369091407698,14.315316390235273] 0.05333333333333334 + +### Output prediction model by Javascipt + +```sql +CREATE TABLE model_javascript +STORED AS SEQUENCEFILE +AS +select train_randomforest_classifier(features, label, "-output_type js -disable_compression") +from training; + +select model from model_javascript limit 1; +``` + +```js +if(x[3] <= 0.5) { + 0; +} else { + if(x[2] <= 4.5) { + if(x[3] <= 1.5) { + if(x[0] <= 4.5) { + 1; + } else { + if(x[0] <= 5.5) { + 1; + } else { + if(x[1] <= 2.5) { + 1; + } else { + 1; + } + } + } + } else { + 2; + } + } else { + if(x[3] <= 1.5) { + 2; + } else { + 2; + } + } +} +``` + +# Prediction + +```sql +set hivevar:classification=true; +set hive.auto.convert.join=true; +set hive.mapjoin.optimized.hashtable=false; + +create table predicted_vm +as +SELECT + rowid, + rf_ensemble(predicted) as predicted +FROM ( + SELECT + rowid, + -- hivemall v0.4.1-alpha.2 and before + -- tree_predict(p.model, t.features, ${classification}) as predicted + -- hivemall v0.4.1 
and later + tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted + FROM + model p + LEFT OUTER JOIN -- CROSS JOIN + training t +) t1 +group by + rowid +; +``` +_Note: Javascript outputs can be evaluated by `js_tree_predict`._ + +### Parallelize Prediction + +The following query runs predictions in N-parallel. It would reduce elapsed time for prediction almost by N. + +```sql +SET hivevar:classification=true; +set hive.auto.convert.join=true; +SET hive.mapjoin.optimized.hashtable=false; +SET mapred.reduce.tasks=8; + +create table predicted_vm +as +SELECT + rowid, + rf_ensemble(predicted) as predicted +FROM ( + SELECT + t.rowid, + -- hivemall v0.4.1-alpha.2 and before + -- tree_predict(p.pred_model, t.features, ${classification}) as predicted + -- hivemall v0.4.1 and later + tree_predict(p.model_id, p.model_type, p.pred_model, t.features, ${classification}) as predicted + FROM ( + SELECT model_id, model_type, pred_model + FROM model + DISTRIBUTE BY rand(1) + ) p + LEFT OUTER JOIN training t +) t1 +group by + rowid +; +``` + +# Evaluation + +```sql +select count(1) from training; +> 150 + +set hivevar:total_cnt=150; + +WITH t1 as ( +SELECT + t.rowid, + t.label as actual, + p.predicted.label as predicted +FROM + predicted_vm p + LEFT OUTER JOIN training t ON (t.rowid = p.rowid) +) +SELECT + count(1) / ${total_cnt} +FROM + t1 +WHERE + actual = predicted +; +``` +> 0.9533333333333334 \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/multiclass/news20.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/multiclass/news20.md b/docs/gitbook/multiclass/news20.md new file mode 100644 index 0000000..e69de29 http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/multiclass/news20_dataset.md ---------------------------------------------------------------------- diff --git 
a/docs/gitbook/multiclass/news20_dataset.md b/docs/gitbook/multiclass/news20_dataset.md new file mode 100644 index 0000000..35ada12 --- /dev/null +++ b/docs/gitbook/multiclass/news20_dataset.md @@ -0,0 +1,77 @@ +Get the news20 dataset. +http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.html#news20 + +```sh +$ cat <<EOF > conv.awk +BEGIN{ FS=" " } +{ + label=\$1; + features=\$2; + for(i=3;i<=NF;i++) + { + features = features "," \$i; + } + print NR "\t" label "\t" features; +} +END{} +EOF + +$ gawk -f conv.awk news20.scale > news20_scale.train +$ gawk -f conv.awk news20.t.scale > news20_scale.test +``` + +## Putting data on HDFS +```sh +hadoop fs -mkdir -p /dataset/news20-multiclass/train +hadoop fs -mkdir -p /dataset/news20-multiclass/test + +hadoop fs -copyFromLocal news20_scale.train /dataset/news20-multiclass/train +hadoop fs -copyFromLocal news20_scale.test /dataset/news20-multiclass/test +``` + +## Training/test data preparation +```sql +use news20; + +delete jar /home/myui/tmp/hivemall.jar; +add jar /home/myui/tmp/hivemall.jar; + +source /home/myui/tmp/define-all.hive; + +Create external table news20mc_train ( + rowid int, + label int, + features ARRAY<STRING> +) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY "," STORED AS TEXTFILE LOCATION '/dataset/news20-multiclass/train'; + +Create external table news20mc_test ( + rowid int, + label int, + features ARRAY<STRING> +) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY "," STORED AS TEXTFILE LOCATION '/dataset/news20-multiclass/test'; + +set hivevar:seed=31; +create or replace view news20mc_train_x3 +as +select + * +from ( +select + amplify(3, *) as (rowid, label, features) +from + news20mc_train +) t +CLUSTER BY rand(${seed}); + +create table news20mc_test_exploded as +select + rowid, + label, + cast(split(feature,":")[0] as int) as feature, + cast(split(feature,":")[1] as float) as value + -- hivemall v0.3.1 or later + -- 
cast(extract_feature(feature) as int) as feature, + -- extract_weight(feature) as value +from + news20mc_test LATERAL VIEW explode(addBias(features)) t AS feature; +``` \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/multiclass/news20_ensemble.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/multiclass/news20_ensemble.md b/docs/gitbook/multiclass/news20_ensemble.md new file mode 100644 index 0000000..9cfd35d --- /dev/null +++ b/docs/gitbook/multiclass/news20_ensemble.md @@ -0,0 +1,180 @@ +This example explains how to run ensemble learning in Hivemall. +Two heads are better than one? Let's verify it by ensemble learning. + +--- + +## UDF preparation +```sql +delete jar /home/myui/tmp/hivemall.jar; +add jar /home/myui/tmp/hivemall.jar; + +source /home/myui/tmp/define-all.hive; +``` + +[Case1] Model ensemble/mixing +======================= + +## training +```sql +SET hive.exec.parallel=true; +SET hive.exec.parallel.thread.number=8; +SET mapred.reduce.tasks=4; + +drop table news20mc_ensemble_model1; +create table news20mc_ensemble_model1 as +select + label, + -- cast(feature as int) as feature, -- hivemall v0.1 + argmin_kld(feature, covar) as feature, -- hivemall v0.2 or later + voted_avg(weight) as weight +from + (select + -- train_multiclass_cw(addBias(features),label) as (label,feature,weight) -- hivemall v0.1 + train_multiclass_cw(addBias(features),label) as (label,feature,weight,covar) -- hivemall v0.2 or later + from + news20mc_train_x3 + union all + select + -- train_multiclass_arow(addBias(features),label) as (label,feature,weight) -- hivemall v0.1 + train_multiclass_arow(addBias(features),label) as (label,feature,weight,covar) -- hivemall v0.2 or later + from + news20mc_train_x3 + union all + select + -- train_multiclass_scw(addBias(features),label) as (label,feature,weight) -- hivemall v0.1 + train_multiclass_scw(addBias(features),label) as 
(label,feature,weight,covar) -- hivemall v0.2 or later + from + news20mc_train_x3 + ) t +group by label, feature; + +-- reset to the default +SET hive.exec.parallel=false; +SET mapred.reduce.tasks=-1; +``` + +## prediction +```sql +create or replace view news20mc_ensemble_predict1 +as +select + rowid, + m.col0 as score, + m.col1 as label +from ( +select + rowid, + maxrow(score, label) as m +from ( + select + t.rowid, + m.label, + sum(m.weight * t.value) as score + from + news20mc_test_exploded t LEFT OUTER JOIN + news20mc_ensemble_model1 m ON (t.feature = m.feature) + group by + t.rowid, m.label +) t1 +group by rowid +) t2; +``` + +## evaluation +```sql +create or replace view news20mc_ensemble_submit1 as +select + t.label as actual, + pd.label as predicted +from + news20mc_test t JOIN news20mc_ensemble_predict1 pd + on (t.rowid = pd.rowid); +``` + +``` +select count(1)/3993 from news20mc_ensemble_submit1 +where actual == predicted; +``` + +> 0.8494866015527173 + +## Cleaning + +```sql +drop table news20mc_ensemble_model1; +drop view news20mc_ensemble_predict1; +drop view news20mc_ensemble_submit1; +``` +--- + +Unfortunately, too many cooks spoil the broth in this case :-( + +| Algorithm | Accuracy | +|:-----------|------------:| +| AROW | 0.8474830954169797 | +| SCW2 | 0.8482344102178813 | +| Ensemble(model) | 0.8494866015527173 | +| CW | 0.850488354620586 | + + +--- + +[Case2] Prediction ensemble +================= + +## prediction +```sql +create or replace view news20mc_pred_ensemble_predict1 +as +select + rowid, + m.col1 as label +from ( + select + rowid, + maxrow(cnt, label) as m + from ( + select + rowid, + label, + count(1) as cnt + from ( + select * from news20mc_arow_predict1 + union all + select * from news20mc_scw2_predict1 + union all + select * from news20mc_cw_predict1 + ) t1 + group by rowid, label + ) t2 + group by rowid +) t3; +``` + +## evaluation +```sql +create or replace view news20mc_pred_ensemble_submit1 as +select + t.label as actual, + 
pd.label as predicted +from + news20mc_test t JOIN news20mc_pred_ensemble_predict1 pd + on (t.rowid = pd.rowid); +``` + +``` +select count(1)/3993 from news20mc_pred_ensemble_submit1 +where actual == predicted; +``` + +> 0.8499874780866516 + +Unfortunately, too many cooks spoil the broth in this case too :-( + +| Algorithm | Accuracy | +|:-----------|------------:| +| AROW | 0.8474830954169797 | +| SCW2 | 0.8482344102178813 | +| Ensemble(model) | 0.8494866015527173 | +| Ensemble(prediction) | 0.8499874780866516 | +| CW | 0.850488354620586 | \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/multiclass/news20_one-vs-the-rest.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/multiclass/news20_one-vs-the-rest.md b/docs/gitbook/multiclass/news20_one-vs-the-rest.md new file mode 100644 index 0000000..4c611d0 --- /dev/null +++ b/docs/gitbook/multiclass/news20_one-vs-the-rest.md @@ -0,0 +1,330 @@ +A one-vs-the-rest classifier use the binary classifier for each class. 
+ +## UDF preparation +```sql +delete jar /home/myui/tmp/hivemall.jar; +add jar /home/myui/tmp/hivemall.jar; + +source /home/myui/tmp/define-all.hive; +``` + +## training +```sql +SET mapred.reduce.tasks=4; + +drop table news20_onevsrest_arow_model; +create table news20_onevsrest_arow_model +as +select + label, + feature, + -- voted_avg(weight) as weight -- [hivemall v0.1] + argmin_kld(weight, covar) as weight -- [hivemall v0.2 or later] +from ( +select + 1 as label, + * +from ( +select + -- train_arow(features, target) as (feature, weight) -- [hivemall v0.1] + train_arow(features, target) as (feature, weight, covar) -- [hivemall v0.2 or later] +from + news20_onevsrest_train_x3 +where + label = 1 +) t1 +union all +select + 2 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 2 +) t2 +union all +select + 3 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 3 +) t3 +union all +select + 4 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 4 +) t4 +union all +select + 5 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 5 +) t5 +union all +select + 6 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 6 +) t6 +union all +select + 7 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 7 +) t7 +union all +select + 8 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 8 +) t8 +union all +select + 9 as label, + * +from ( +select + train_arow(features, target) as 
(feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 9 +) t9 +union all +select + 10 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 10 +) t10 +union all +select + 11 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 11 +) t11 +union all +select + 12 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 12 +) t12 +union all +select + 13 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 13 +) t13 +union all +select + 14 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 14 +) t14 +union all +select + 15 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 15 +) t15 +union all +select + 16 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 16 +) t16 +union all +select + 17 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 17 +) t17 +union all +select + 18 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 18 +) t18 +union all +select + 19 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 19 +) t19 +union all +select + 20 as label, + * +from ( +select + train_arow(features, target) as (feature, weight, covar) +from + news20_onevsrest_train_x3 +where + label = 20 +) t20 
+) t +group by + label, feature; + +-- reset to the default +SET mapred.reduce.tasks=-1; +``` +Note that the above query is optimized to scan news20_onevsrest_train_x3 once! + +## prediction +```sql +create or replace view news20_onevsrest_arow_predict +as +select + rowid, + m.col0 as score, + m.col1 as label +from ( +select + rowid, + maxrow(score, label) as m +from ( + select + t.rowid, + m.label, + sum(m.weight * t.value) as score + from + news20mc_test_exploded t LEFT OUTER JOIN + news20_onevsrest_arow_model m ON (t.feature = m.feature) + group by + t.rowid, m.label +) t1 +group by rowid +) t2; +``` + +## evaluation +```sql +create or replace view news20_onevsrest_arow_submit as +select + t.label as actual, + pd.label as predicted +from + news20mc_test t JOIN news20_onevsrest_arow_predict pd + on (t.rowid = pd.rowid); +``` + +``` +select count(1)/3993 from news20_onevsrest_arow_submit +where actual == predicted; +``` + +> 0.8567493112947658 + +## Cleaning + +```sql +drop table news20_onevsrest_arow_model; +drop view news20_onevsrest_arow_predict; +drop view news20_onevsrest_arow_submit; +``` + +| Algorithm | Accuracy | +|:-----------|------------:| +| AROW(multi-class) | 0.8474830954169797 | +| CW | 0.850488354620586 | +| AROW(one-vs-rest) | 0.8567493112947658 | http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/multiclass/news20_one-vs-the-rest_dataset.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/multiclass/news20_one-vs-the-rest_dataset.md b/docs/gitbook/multiclass/news20_one-vs-the-rest_dataset.md new file mode 100644 index 0000000..2a69615 --- /dev/null +++ b/docs/gitbook/multiclass/news20_one-vs-the-rest_dataset.md @@ -0,0 +1,52 @@ +*One-vs-the-rest* is a multiclass classification method that uses binary classifiers independently for each class. 
+http://en.wikipedia.org/wiki/Multiclass_classification#one_vs_all + +## UDF preparation +```sql +delete jar /home/myui/tmp/hivemall.jar; +add jar /home/myui/tmp/hivemall.jar; + +source /home/myui/tmp/define-all.hive; +``` + +## Dataset preparation for one-vs-the-rest classifiers + +```sql +select collect_set(label) from news20mc_train; +``` +> [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,16,19,18,20] + +```sql +SET hivevar:possible_labels="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,16,19,18,20"; +``` + +[one-vs-rest.awk](https://github.com/myui/hivemall/blob/master/resources/misc/one-vs-rest.awk) + +``` +create or replace view news20_onevsrest_train +as +select transform(${possible_labels}, rowid, label, addBias(features)) + ROW FORMAT DELIMITED + FIELDS TERMINATED BY "\t" + COLLECTION ITEMS TERMINATED BY "," + LINES TERMINATED BY "\n" +using 'gawk -f one-vs-rest.awk' + as (rowid BIGINT, label INT, target INT, features ARRAY<STRING>) + ROW FORMAT DELIMITED + FIELDS TERMINATED BY "\t" + COLLECTION ITEMS TERMINATED BY "," + LINES TERMINATED BY "\n" +from news20mc_train; + +create or replace view news20_onevsrest_train_x3 +as +select + * +from ( + select + amplify(3, *) as (rowid, label, target, features) + from + news20_onevsrest_train +) t +CLUSTER BY rand(); +``` http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/multiclass/news20_pa.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/multiclass/news20_pa.md b/docs/gitbook/multiclass/news20_pa.md new file mode 100644 index 0000000..8e69beb --- /dev/null +++ b/docs/gitbook/multiclass/news20_pa.md @@ -0,0 +1,90 @@ +Preparation +========= + +## UDF preparation +``` +delete jar /home/myui/tmp/hivemall.jar; +add jar /home/myui/tmp/hivemall.jar; + +source /home/myui/tmp/define-all.hive; +``` + +--- +#[Passive Aggressive (PA2)] + +Training +====== + +## model building +```sql +drop table news20mc_pa2_model1; +create table news20mc_pa2_model1 as 
+select + label, + cast(feature as int) as feature, + voted_avg(weight) as weight +from + (select + train_multiclass_pa2(addBias(features),label) as (label,feature,weight) + from + news20mc_train_x3 + ) t +group by label, feature; +``` + +## prediction +``` +create or replace view news20mc_pa2_predict1 +as +select + rowid, + m.col0 as score, + m.col1 as label +from ( +select + rowid, + maxrow(score, label) as m +from ( + select + t.rowid, + m.label, + sum(m.weight * t.value) as score + from + news20mc_test_exploded t LEFT OUTER JOIN + news20mc_pa2_model1 m ON (t.feature = m.feature) + group by + t.rowid, m.label +) t1 +group by rowid +) t2; +``` + +## evaluation +```sql +create or replace view news20mc_pa2_submit1 as +select + t.label as actual, + pd.label as predicted +from + news20mc_test t JOIN news20mc_pa2_predict1 pd + on (t.rowid = pd.rowid); +``` + +```sql +select count(1)/3993 from news20mc_pa2_submit1 +where actual == predicted; +``` + +> 0.7478086651640371 (plain) + +> 0.8204357625845229 (x3) + +> 0.8204357625845229 (x3 + bagging) + +## Cleaning + +```sql +drop table news20mc_pa2_model1; +drop view news20mc_pa2_predict1; +drop view news20mc_pa2_submit1; +``` \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/multiclass/news20_scw.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/multiclass/news20_scw.md b/docs/gitbook/multiclass/news20_scw.md new file mode 100644 index 0000000..330c163 --- /dev/null +++ b/docs/gitbook/multiclass/news20_scw.md @@ -0,0 +1,319 @@ +| Algorithm | Accuracy | +|:-----------|------------:| +| PA2 | 0.8204357625845229 | +| SCW1 | 0.8314550463310794 | +| AROW | 0.8474830954169797 | +| SCW2 | 0.8482344102178813 | +| CW | 0.850488354620586 | +--- + +Preparation +========= + +## UDF preparation +```sql +delete jar /home/myui/tmp/hivemall.jar; +add jar /home/myui/tmp/hivemall.jar; + +source /home/myui/tmp/define-all.hive; 
+``` + +--- +#[CW] + +## training +```sql +drop table news20mc_cw_model1; +create table news20mc_cw_model1 as +select + label, + cast(feature as int) as feature, + -- voted_avg(weight) as weight -- [hivemall v0.1] + argmin_kld(weight, covar) as weight -- [hivemall v0.2 or later] +from + (select + -- train_multiclass_cw(addBias(features),label) as (label,feature,weight) -- [hivemall v0.1] + train_multiclass_cw(addBias(features),label) as (label,feature,weight,covar) -- [hivemall v0.2 or later] + from + news20mc_train_x3 + ) t +group by label, feature; +``` + +## prediction +```sql +create or replace view news20mc_cw_predict1 +as +select + rowid, + m.col0 as score, + m.col1 as label +from ( +select + rowid, + maxrow(score, label) as m +from ( + select + t.rowid, + m.label, + sum(m.weight * t.value) as score + from + news20mc_test_exploded t LEFT OUTER JOIN + news20mc_cw_model1 m ON (t.feature = m.feature) + group by + t.rowid, m.label +) t1 +group by rowid +) t2; +``` + +## evaluation +```sql +create or replace view news20mc_cw_submit1 as +select + t.label as actual, + pd.label as predicted +from + news20mc_test t JOIN news20mc_cw_predict1 pd + on (t.rowid = pd.rowid); +``` + +``` +select count(1)/3993 from news20mc_cw_submit1 +where actual == predicted; +``` + +> 0.850488354620586 + +## Cleaning + +```sql +drop table news20mc_cw_model1; +drop view news20mc_cw_predict1; +drop view news20mc_cw_submit1; +``` + +--- +#[AROW] + +## training +```sql +drop table news20mc_arow_model1; +create table news20mc_arow_model1 as +select + label, + cast(feature as int) as feature, + -- voted_avg(weight) as weight -- [hivemall v0.1] + argmin_kld(weight, covar) as weight -- [hivemall v0.2 or later] +from + (select + -- train_multiclass_arow(addBias(features),label) as (label,feature,weight) -- [hivemall v0.1] + train_multiclass_arow(addBias(features),label) as (label,feature,weight,covar) -- [hivemall v0.2 or later] + from + news20mc_train_x3 + ) t +group by label, feature; +``` + 
+## prediction +```sql +create or replace view news20mc_arow_predict1 +as +select + rowid, + m.col0 as score, + m.col1 as label +from ( +select + rowid, + maxrow(score, label) as m +from ( + select + t.rowid, + m.label, + sum(m.weight * t.value) as score + from + news20mc_test_exploded t LEFT OUTER JOIN + news20mc_arow_model1 m ON (t.feature = m.feature) + group by + t.rowid, m.label +) t1 +group by rowid +) t2; +``` + +## evaluation +```sql +create or replace view news20mc_arow_submit1 as +select + t.label as actual, + pd.label as predicted +from + news20mc_test t JOIN news20mc_arow_predict1 pd + on (t.rowid = pd.rowid); +``` + +``` +select count(1)/3993 from news20mc_arow_submit1 +where actual == predicted; +``` + +> 0.8474830954169797 + +## Cleaning + +```sql +drop table news20mc_arow_model1; +drop view news20mc_arow_predict1; +drop view news20mc_arow_submit1; +``` + +--- +#[SCW1] + +## training +```sql +drop table news20mc_scw_model1; +create table news20mc_scw_model1 as +select + label, + cast(feature as int) as feature, + -- voted_avg(weight) as weight -- [hivemall v0.1] + argmin_kld(weight, covar) as weight -- [hivemall v0.2 or later] +from + (select + -- train_multiclass_scw(addBias(features),label) as (label,feature,weight) -- [hivemall v0.1] + train_multiclass_scw(addBias(features),label) as (label,feature,weight,covar) -- [hivemall v0.2 or later] + from + news20mc_train_x3 + ) t +group by label, feature; +``` + +## prediction +```sql +create or replace view news20mc_scw_predict1 +as +select + rowid, + m.col0 as score, + m.col1 as label +from ( +select + rowid, + maxrow(score, label) as m +from ( + select + t.rowid, + m.label, + sum(m.weight * t.value) as score + from + news20mc_test_exploded t LEFT OUTER JOIN + news20mc_scw_model1 m ON (t.feature = m.feature) + group by + t.rowid, m.label +) t1 +group by rowid +) t2; +``` + +## evaluation +```sql +create or replace view news20mc_scw_submit1 as +select + t.label as actual, + pd.label as predicted +from + 
news20mc_test t JOIN news20mc_scw_predict1 pd + on (t.rowid = pd.rowid); +``` + +``` +select count(1)/3993 from news20mc_scw_submit1 +where actual == predicted; +``` + +> 0.8314550463310794 + +## Cleaning + +```sql +drop table news20mc_scw_model1; +drop view news20mc_scw_predict1; +drop view news20mc_scw_submit1; +``` + +--- +#[SCW2] + +## training +```sql +drop table news20mc_scw2_model1; +create table news20mc_scw2_model1 as +select + label, + cast(feature as int) as feature, + -- voted_avg(weight) as weight -- [hivemall v0.1] + argmin_kld(weight, covar) as weight -- [hivemall v0.2 or later] +from + (select + -- train_multiclass_scw2(addBias(features),label) as (label,feature,weight) -- [hivemall v0.1] + train_multiclass_scw2(addBias(features),label) as (label,feature,weight,covar) -- [hivemall v0.2 or later] + from + news20mc_train_x3 + ) t +group by label, feature; +``` + +## prediction +```sql +create or replace view news20mc_scw2_predict1 +as +select + rowid, + m.col0 as score, + m.col1 as label +from ( +select + rowid, + maxrow(score, label) as m +from ( + select + t.rowid, + m.label, + sum(m.weight * t.value) as score + from + news20mc_test_exploded t LEFT OUTER JOIN + news20mc_scw2_model1 m ON (t.feature = m.feature) + group by + t.rowid, m.label +) t1 +group by rowid +) t2; +``` + +## evaluation +```sql +create or replace view news20mc_scw2_submit1 as +select + t.label as actual, + pd.label as predicted +from + news20mc_test t JOIN news20mc_scw2_predict1 pd + on (t.rowid = pd.rowid); +``` + +``` +select count(1)/3993 from news20mc_scw2_submit1 +where actual == predicted; +``` + +> 0.8482344102178813 + +## Cleaning + +```sql +drop table news20mc_scw2_model1; +drop view news20mc_scw2_predict1; +drop view news20mc_scw2_submit1; +``` \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/pig/.gitkeep ---------------------------------------------------------------------- diff --git 
a/docs/gitbook/pig/.gitkeep b/docs/gitbook/pig/.gitkeep new file mode 100644 index 0000000..e69de29 http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/370e2aa3/docs/gitbook/recommend/cf.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/recommend/cf.md b/docs/gitbook/recommend/cf.md new file mode 100644 index 0000000..e69de29
