http://git-wip-us.apache.org/repos/asf/incubator-hivemall-site/blob/6c6a1b42/userguide/supervised_learning/tutorial.html
----------------------------------------------------------------------
diff --git a/userguide/supervised_learning/tutorial.html 
b/userguide/supervised_learning/tutorial.html
new file mode 100644
index 0000000..0863296
--- /dev/null
+++ b/userguide/supervised_learning/tutorial.html
@@ -0,0 +1,2982 @@
+
+<!DOCTYPE HTML>
+<html lang="en">
+    <head>
+        <meta charset="UTF-8">
+        <meta content="text/html; charset=utf-8" http-equiv="Content-Type">
+        <title>Step-by-Step Tutorial on Supervised Learning · Hivemall User 
Manual</title>
+        <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+        <meta name="description" content="">
+        <meta name="generator" content="GitBook 3.2.3">
+        
+        
+        
+    
+    <link rel="stylesheet" href="../gitbook/style.css">
+
+    
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-splitter/splitter.css">
+                
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-etoc/plugin.css">
+                
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-callouts/plugin.css">
+                
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-toggle-chapters/toggle.css">
+                
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-codeblock-filename/block.css">
+                
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-expandable-chapters/expandable-chapters.css">
+                
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-multipart/multipart.css">
+                
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-katex/katex.min.css">
+                
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-emphasize/plugin.css">
+                
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-highlight/website.css">
+                
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-search/search.css">
+                
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-fontsettings/website.css">
+                
+            
+                
+                <link rel="stylesheet" 
href="../gitbook/gitbook-plugin-theme-api/theme-api.css">
+                
+            
+        
+
+    
+
+    
+        
+    
+        
+    
+        
+    
+        
+    
+        
+    
+        
+    
+
+        
+    
+    
+    <meta name="HandheldFriendly" content="true"/>
+    <meta name="viewport" content="width=device-width, initial-scale=1, 
user-scalable=no">
+    <meta name="apple-mobile-web-app-capable" content="yes">
+    <meta name="apple-mobile-web-app-status-bar-style" content="black">
+    <link rel="apple-touch-icon-precomposed" sizes="152x152" 
href="../gitbook/images/apple-touch-icon-precomposed-152.png">
+    <link rel="shortcut icon" href="../gitbook/images/favicon.ico" 
type="image/x-icon">
+
+    
+    <link rel="next" href="../binaryclass/general.html" />
+    
+    
+    <link rel="prev" href="prediction.html" />
+    
+
+    </head>
+    <body>
+        
+<div class="book">
+    <div class="book-summary">
+        
+            
+<div id="book-search-input" role="search">
+    <input type="text" placeholder="Type to search" />
+</div>
+
+            
+                <nav role="navigation">
+                
+
+
+<ul class="summary">
+    
+    
+    
+        
+        <li>
+            <a href="http://hivemall.incubator.apache.org/" target="_blank" 
class="custom-link"><i class="fa fa-home"></i> Home</a>
+        </li>
+    
+    
+
+    
+    <li class="divider"></li>
+    
+
+    
+        
+        <li class="header">TABLE OF CONTENTS</li>
+        
+        
+    
+        <li class="chapter " data-level="1.1" data-path="../">
+            
+                <a href="../">
+            
+                    
+                        <b>1.1.</b>
+                    
+                    Introduction
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.2" data-path="../getting_started/">
+            
+                <a href="../getting_started/">
+            
+                    
+                        <b>1.2.</b>
+                    
+                    Getting Started
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.2.1" 
data-path="../getting_started/installation.html">
+            
+                <a href="../getting_started/installation.html">
+            
+                    
+                        <b>1.2.1.</b>
+                    
+                    Installation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.2.2" 
data-path="../getting_started/permanent-functions.html">
+            
+                <a href="../getting_started/permanent-functions.html">
+            
+                    
+                        <b>1.2.2.</b>
+                    
+                    Install as permanent functions
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.2.3" 
data-path="../getting_started/input-format.html">
+            
+                <a href="../getting_started/input-format.html">
+            
+                    
+                        <b>1.2.3.</b>
+                    
+                    Input Format
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="1.3" data-path="../misc/funcs.html">
+            
+                <a href="../misc/funcs.html">
+            
+                    
+                        <b>1.3.</b>
+                    
+                    List of Functions
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.4" data-path="../tips/">
+            
+                <a href="../tips/">
+            
+                    
+                        <b>1.4.</b>
+                    
+                    Tips for Effective Hivemall
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.4.1" 
data-path="../tips/addbias.html">
+            
+                <a href="../tips/addbias.html">
+            
+                    
+                        <b>1.4.1.</b>
+                    
+                    Explicit add_bias() for better prediction
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.4.2" 
data-path="../tips/rand_amplify.html">
+            
+                <a href="../tips/rand_amplify.html">
+            
+                    
+                        <b>1.4.2.</b>
+                    
+                    Use rand_amplify() for better prediction results
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.4.3" 
data-path="../tips/rt_prediction.html">
+            
+                <a href="../tips/rt_prediction.html">
+            
+                    
+                        <b>1.4.3.</b>
+                    
+                    Real-time prediction on RDBMS
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.4.4" 
data-path="../tips/ensemble_learning.html">
+            
+                <a href="../tips/ensemble_learning.html">
+            
+                    
+                        <b>1.4.4.</b>
+                    
+                    Ensemble learning for stable prediction
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.4.5" 
data-path="../tips/mixserver.html">
+            
+                <a href="../tips/mixserver.html">
+            
+                    
+                        <b>1.4.5.</b>
+                    
+                    Mixing models for a better prediction convergence (MIX 
server)
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.4.6" data-path="../tips/emr.html">
+            
+                <a href="../tips/emr.html">
+            
+                    
+                        <b>1.4.6.</b>
+                    
+                    Run Hivemall on Amazon Elastic MapReduce
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="1.5" 
data-path="../tips/general_tips.html">
+            
+                <a href="../tips/general_tips.html">
+            
+                    
+                        <b>1.5.</b>
+                    
+                    General Hive/Hadoop Tips
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.5.1" data-path="../tips/rowid.html">
+            
+                <a href="../tips/rowid.html">
+            
+                    
+                        <b>1.5.1.</b>
+                    
+                    Adding rowid for each row
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.5.2" 
data-path="../tips/hadoop_tuning.html">
+            
+                <a href="../tips/hadoop_tuning.html">
+            
+                    
+                        <b>1.5.2.</b>
+                    
+                    Hadoop tuning for Hivemall
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="1.6" data-path="../troubleshooting/">
+            
+                <a href="../troubleshooting/">
+            
+                    
+                        <b>1.6.</b>
+                    
+                    Troubleshooting
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.6.1" 
data-path="../troubleshooting/oom.html">
+            
+                <a href="../troubleshooting/oom.html">
+            
+                    
+                        <b>1.6.1.</b>
+                    
+                    OutOfMemoryError in training
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.6.2" 
data-path="../troubleshooting/mapjoin_task_error.html">
+            
+                <a href="../troubleshooting/mapjoin_task_error.html">
+            
+                    
+                        <b>1.6.2.</b>
+                    
+                    SemanticException generate map join task error: Cannot 
serialize object
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.6.3" 
data-path="../troubleshooting/asterisk.html">
+            
+                <a href="../troubleshooting/asterisk.html">
+            
+                    
+                        <b>1.6.3.</b>
+                    
+                    Asterisk argument for UDTF does not work
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.6.4" 
data-path="../troubleshooting/num_mappers.html">
+            
+                <a href="../troubleshooting/num_mappers.html">
+            
+                    
+                        <b>1.6.4.</b>
+                    
+                    The number of mappers is less than input splits in Hadoop 
2.x
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.6.5" 
data-path="../troubleshooting/mapjoin_classcastex.html">
+            
+                <a href="../troubleshooting/mapjoin_classcastex.html">
+            
+                    
+                        <b>1.6.5.</b>
+                    
+                    Map-side join causes ClassCastException on Tez
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part II - Generic Features</li>
+        
+        
+    
+        <li class="chapter " data-level="2.1" 
data-path="../misc/generic_funcs.html">
+            
+                <a href="../misc/generic_funcs.html">
+            
+                    
+                        <b>2.1.</b>
+                    
+                    List of Generic Hivemall Functions
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="2.2" data-path="../misc/topk.html">
+            
+                <a href="../misc/topk.html">
+            
+                    
+                        <b>2.2.</b>
+                    
+                    Efficient Top-K Query Processing
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="2.3" 
data-path="../misc/tokenizer.html">
+            
+                <a href="../misc/tokenizer.html">
+            
+                    
+                        <b>2.3.</b>
+                    
+                    Text Tokenizer
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="2.4" data-path="../misc/approx.html">
+            
+                <a href="../misc/approx.html">
+            
+                    
+                        <b>2.4.</b>
+                    
+                    Approximate Aggregate Functions
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part III - Feature Engineering</li>
+        
+        
+    
+        <li class="chapter " data-level="3.1" 
data-path="../ft_engineering/scaling.html">
+            
+                <a href="../ft_engineering/scaling.html">
+            
+                    
+                        <b>3.1.</b>
+                    
+                    Feature Scaling
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="3.2" 
data-path="../ft_engineering/hashing.html">
+            
+                <a href="../ft_engineering/hashing.html">
+            
+                    
+                        <b>3.2.</b>
+                    
+                    Feature Hashing
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="3.3" 
data-path="../ft_engineering/selection.html">
+            
+                <a href="../ft_engineering/selection.html">
+            
+                    
+                        <b>3.3.</b>
+                    
+                    Feature Selection
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="3.4" 
data-path="../ft_engineering/binning.html">
+            
+                <a href="../ft_engineering/binning.html">
+            
+                    
+                        <b>3.4.</b>
+                    
+                    Feature Binning
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="3.5" 
data-path="../ft_engineering/pairing.html">
+            
+                <a href="../ft_engineering/pairing.html">
+            
+                    
+                        <b>3.5.</b>
+                    
+                    Feature Pairing
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="3.5.1" 
data-path="../ft_engineering/polynomial.html">
+            
+                <a href="../ft_engineering/polynomial.html">
+            
+                    
+                        <b>3.5.1.</b>
+                    
+                    Polynomial features
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="3.6" 
data-path="../ft_engineering/ft_trans.html">
+            
+                <a href="../ft_engineering/ft_trans.html">
+            
+                    
+                        <b>3.6.</b>
+                    
+                    Feature Transformation
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="3.6.1" 
data-path="../ft_engineering/vectorization.html">
+            
+                <a href="../ft_engineering/vectorization.html">
+            
+                    
+                        <b>3.6.1.</b>
+                    
+                    Feature vectorization
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="3.6.2" 
data-path="../ft_engineering/quantify.html">
+            
+                <a href="../ft_engineering/quantify.html">
+            
+                    
+                        <b>3.6.2.</b>
+                    
+                    Quantify non-number features
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="3.7" 
data-path="../ft_engineering/tfidf.html">
+            
+                <a href="../ft_engineering/tfidf.html">
+            
+                    
+                        <b>3.7.</b>
+                    
+                    TF-IDF Calculation
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part IV - Evaluation</li>
+        
+        
+    
+        <li class="chapter " data-level="4.1" 
data-path="../eval/binary_classification_measures.html">
+            
+                <a href="../eval/binary_classification_measures.html">
+            
+                    
+                        <b>4.1.</b>
+                    
+                    Binary Classification Metrics
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="4.1.1" data-path="../eval/auc.html">
+            
+                <a href="../eval/auc.html">
+            
+                    
+                        <b>4.1.1.</b>
+                    
+                    Area under the ROC curve
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="4.2" 
data-path="../eval/multilabel_classification_measures.html">
+            
+                <a href="../eval/multilabel_classification_measures.html">
+            
+                    
+                        <b>4.2.</b>
+                    
+                    Multi-label Classification Metrics
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="4.3" 
data-path="../eval/regression.html">
+            
+                <a href="../eval/regression.html">
+            
+                    
+                        <b>4.3.</b>
+                    
+                    Regression Metrics
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="4.4" data-path="../eval/rank.html">
+            
+                <a href="../eval/rank.html">
+            
+                    
+                        <b>4.4.</b>
+                    
+                    Ranking Measures
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="4.5" data-path="../eval/datagen.html">
+            
+                <a href="../eval/datagen.html">
+            
+                    
+                        <b>4.5.</b>
+                    
+                    Data Generation
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="4.5.1" 
data-path="../eval/lr_datagen.html">
+            
+                <a href="../eval/lr_datagen.html">
+            
+                    
+                        <b>4.5.1.</b>
+                    
+                    Logistic Regression data generation
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part V - Supervised Learning</li>
+        
+        
+    
+        <li class="chapter " data-level="5.1" data-path="prediction.html">
+            
+                <a href="prediction.html">
+            
+                    
+                        <b>5.1.</b>
+                    
+                    How Prediction Works
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter active" data-level="5.2" data-path="tutorial.html">
+            
+                <a href="tutorial.html">
+            
+                    
+                        <b>5.2.</b>
+                    
+                    Step-by-Step Tutorial on Supervised Learning
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part VI - Binary Classification</li>
+        
+        
+    
+        <li class="chapter " data-level="6.1" 
data-path="../binaryclass/general.html">
+            
+                <a href="../binaryclass/general.html">
+            
+                    
+                        <b>6.1.</b>
+                    
+                    Binary Classification
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="6.2" 
data-path="../binaryclass/a9a.html">
+            
+                <a href="../binaryclass/a9a.html">
+            
+                    
+                        <b>6.2.</b>
+                    
+                    a9a Tutorial
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="6.2.1" 
data-path="../binaryclass/a9a_dataset.html">
+            
+                <a href="../binaryclass/a9a_dataset.html">
+            
+                    
+                        <b>6.2.1.</b>
+                    
+                    Data preparation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="6.2.2" 
data-path="../binaryclass/a9a_lr.html">
+            
+                <a href="../binaryclass/a9a_lr.html">
+            
+                    
+                        <b>6.2.2.</b>
+                    
+                    Logistic Regression
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="6.2.3" 
data-path="../binaryclass/a9a_minibatch.html">
+            
+                <a href="../binaryclass/a9a_minibatch.html">
+            
+                    
+                        <b>6.2.3.</b>
+                    
+                    Mini-batch gradient descent
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="6.3" 
data-path="../binaryclass/news20.html">
+            
+                <a href="../binaryclass/news20.html">
+            
+                    
+                        <b>6.3.</b>
+                    
+                    News20 Tutorial
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="6.3.1" 
data-path="../binaryclass/news20_dataset.html">
+            
+                <a href="../binaryclass/news20_dataset.html">
+            
+                    
+                        <b>6.3.1.</b>
+                    
+                    Data preparation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="6.3.2" 
data-path="../binaryclass/news20_pa.html">
+            
+                <a href="../binaryclass/news20_pa.html">
+            
+                    
+                        <b>6.3.2.</b>
+                    
+                    Perceptron, Passive Aggressive
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="6.3.3" 
data-path="../binaryclass/news20_scw.html">
+            
+                <a href="../binaryclass/news20_scw.html">
+            
+                    
+                        <b>6.3.3.</b>
+                    
+                    CW, AROW, SCW
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="6.3.4" 
data-path="../binaryclass/news20_adagrad.html">
+            
+                <a href="../binaryclass/news20_adagrad.html">
+            
+                    
+                        <b>6.3.4.</b>
+                    
+                    AdaGradRDA, AdaGrad, AdaDelta
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="6.3.5" 
data-path="../binaryclass/news20_rf.html">
+            
+                <a href="../binaryclass/news20_rf.html">
+            
+                    
+                        <b>6.3.5.</b>
+                    
+                    Random Forest
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="6.4" 
data-path="../binaryclass/kdd2010a.html">
+            
+                <a href="../binaryclass/kdd2010a.html">
+            
+                    
+                        <b>6.4.</b>
+                    
+                    KDD2010a Tutorial
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="6.4.1" 
data-path="../binaryclass/kdd2010a_dataset.html">
+            
+                <a href="../binaryclass/kdd2010a_dataset.html">
+            
+                    
+                        <b>6.4.1.</b>
+                    
+                    Data preparation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="6.4.2" 
data-path="../binaryclass/kdd2010a_scw.html">
+            
+                <a href="../binaryclass/kdd2010a_scw.html">
+            
+                    
+                        <b>6.4.2.</b>
+                    
+                    PA, CW, AROW, SCW
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="6.5" 
data-path="../binaryclass/kdd2010b.html">
+            
+                <a href="../binaryclass/kdd2010b.html">
+            
+                    
+                        <b>6.5.</b>
+                    
+                    KDD2010b Tutorial
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="6.5.1" 
data-path="../binaryclass/kdd2010b_dataset.html">
+            
+                <a href="../binaryclass/kdd2010b_dataset.html">
+            
+                    
+                        <b>6.5.1.</b>
+                    
+                    Data preparation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="6.5.2" 
data-path="../binaryclass/kdd2010b_arow.html">
+            
+                <a href="../binaryclass/kdd2010b_arow.html">
+            
+                    
+                        <b>6.5.2.</b>
+                    
+                    AROW
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="6.6" 
data-path="../binaryclass/webspam.html">
+            
+                <a href="../binaryclass/webspam.html">
+            
+                    
+                        <b>6.6.</b>
+                    
+                    Webspam Tutorial
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="6.6.1" 
data-path="../binaryclass/webspam_dataset.html">
+            
+                <a href="../binaryclass/webspam_dataset.html">
+            
+                    
+                        <b>6.6.1.</b>
+                    
+                    Data preparation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="6.6.2" 
data-path="../binaryclass/webspam_scw.html">
+            
+                <a href="../binaryclass/webspam_scw.html">
+            
+                    
+                        <b>6.6.2.</b>
+                    
+                    PA1, AROW, SCW
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="6.7" 
data-path="../binaryclass/titanic_rf.html">
+            
+                <a href="../binaryclass/titanic_rf.html">
+            
+                    
+                        <b>6.7.</b>
+                    
+                    Kaggle Titanic Tutorial
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="6.8" 
data-path="../binaryclass/criteo.html">
+            
+                <a href="../binaryclass/criteo.html">
+            
+                    
+                        <b>6.8.</b>
+                    
+                    Criteo Tutorial
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="6.8.1" 
data-path="../binaryclass/criteo_dataset.html">
+            
+                <a href="../binaryclass/criteo_dataset.html">
+            
+                    
+                        <b>6.8.1.</b>
+                    
+                    Data preparation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="6.8.2" 
data-path="../binaryclass/criteo_ffm.html">
+            
+                <a href="../binaryclass/criteo_ffm.html">
+            
+                    
+                        <b>6.8.2.</b>
+                    
+                    Field-Aware Factorization Machines
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part VII - Multiclass Classification</li>
+        
+        
+    
+        <li class="chapter " data-level="7.1" 
data-path="../multiclass/news20.html">
+            
+                <a href="../multiclass/news20.html">
+            
+                    
+                        <b>7.1.</b>
+                    
+                    News20 Multiclass Tutorial
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="7.1.1" 
data-path="../multiclass/news20_dataset.html">
+            
+                <a href="../multiclass/news20_dataset.html">
+            
+                    
+                        <b>7.1.1.</b>
+                    
+                    Data preparation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="7.1.2" 
data-path="../multiclass/news20_one-vs-the-rest_dataset.html">
+            
+                <a href="../multiclass/news20_one-vs-the-rest_dataset.html">
+            
+                    
+                        <b>7.1.2.</b>
+                    
+                    Data preparation for one-vs-the-rest classifiers
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="7.1.3" 
data-path="../multiclass/news20_pa.html">
+            
+                <a href="../multiclass/news20_pa.html">
+            
+                    
+                        <b>7.1.3.</b>
+                    
+                    PA
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="7.1.4" 
data-path="../multiclass/news20_scw.html">
+            
+                <a href="../multiclass/news20_scw.html">
+            
+                    
+                        <b>7.1.4.</b>
+                    
+                    CW, AROW, SCW
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="7.1.5" 
data-path="../multiclass/news20_ensemble.html">
+            
+                <a href="../multiclass/news20_ensemble.html">
+            
+                    
+                        <b>7.1.5.</b>
+                    
+                    Ensemble learning
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="7.1.6" 
data-path="../multiclass/news20_one-vs-the-rest.html">
+            
+                <a href="../multiclass/news20_one-vs-the-rest.html">
+            
+                    
+                        <b>7.1.6.</b>
+                    
+                    one-vs-the-rest classifier
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="7.2" 
data-path="../multiclass/iris.html">
+            
+                <a href="../multiclass/iris.html">
+            
+                    
+                        <b>7.2.</b>
+                    
+                    Iris Tutorial
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="7.2.1" 
data-path="../multiclass/iris_dataset.html">
+            
+                <a href="../multiclass/iris_dataset.html">
+            
+                    
+                        <b>7.2.1.</b>
+                    
+                    Data preparation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="7.2.2" 
data-path="../multiclass/iris_scw.html">
+            
+                <a href="../multiclass/iris_scw.html">
+            
+                    
+                        <b>7.2.2.</b>
+                    
+                    SCW
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="7.2.3" 
data-path="../multiclass/iris_randomforest.html">
+            
+                <a href="../multiclass/iris_randomforest.html">
+            
+                    
+                        <b>7.2.3.</b>
+                    
+                    Random Forest
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part VIII - Regression</li>
+        
+        
+    
+        <li class="chapter " data-level="8.1" 
data-path="../regression/general.html">
+            
+                <a href="../regression/general.html">
+            
+                    
+                        <b>8.1.</b>
+                    
+                    Regression
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="8.2" 
data-path="../regression/e2006.html">
+            
+                <a href="../regression/e2006.html">
+            
+                    
+                        <b>8.2.</b>
+                    
+                    E2006-tfidf Regression Tutorial
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="8.2.1" 
data-path="../regression/e2006_dataset.html">
+            
+                <a href="../regression/e2006_dataset.html">
+            
+                    
+                        <b>8.2.1.</b>
+                    
+                    Data preparation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="8.2.2" 
data-path="../regression/e2006_arow.html">
+            
+                <a href="../regression/e2006_arow.html">
+            
+                    
+                        <b>8.2.2.</b>
+                    
+                    Passive Aggressive, AROW
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="8.3" 
data-path="../regression/kddcup12tr2.html">
+            
+                <a href="../regression/kddcup12tr2.html">
+            
+                    
+                        <b>8.3.</b>
+                    
+                    KDDCup 2012 Track 2 CTR Prediction Tutorial
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="8.3.1" 
data-path="../regression/kddcup12tr2_dataset.html">
+            
+                <a href="../regression/kddcup12tr2_dataset.html">
+            
+                    
+                        <b>8.3.1.</b>
+                    
+                    Data preparation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="8.3.2" 
data-path="../regression/kddcup12tr2_lr.html">
+            
+                <a href="../regression/kddcup12tr2_lr.html">
+            
+                    
+                        <b>8.3.2.</b>
+                    
+                    Logistic Regression, Passive Aggressive
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="8.3.3" 
data-path="../regression/kddcup12tr2_lr_amplify.html">
+            
+                <a href="../regression/kddcup12tr2_lr_amplify.html">
+            
+                    
+                        <b>8.3.3.</b>
+                    
+                    Logistic Regression with amplifier
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="8.3.4" 
data-path="../regression/kddcup12tr2_adagrad.html">
+            
+                <a href="../regression/kddcup12tr2_adagrad.html">
+            
+                    
+                        <b>8.3.4.</b>
+                    
+                    AdaGrad, AdaDelta
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part IX - Recommendation</li>
+        
+        
+    
+        <li class="chapter " data-level="9.1" data-path="../recommend/cf.html">
+            
+                <a href="../recommend/cf.html">
+            
+                    
+                        <b>9.1.</b>
+                    
+                    Collaborative Filtering
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="9.1.1" 
data-path="../recommend/item_based_cf.html">
+            
+                <a href="../recommend/item_based_cf.html">
+            
+                    
+                        <b>9.1.1.</b>
+                    
+                    Item-based collaborative filtering
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="9.2" 
data-path="../recommend/news20.html">
+            
+                <a href="../recommend/news20.html">
+            
+                    
+                        <b>9.2.</b>
+                    
+                    News20 Related Article Recommendation Tutorial
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="9.2.1" 
data-path="../multiclass/news20_dataset.html">
+            
+                <a href="../multiclass/news20_dataset.html">
+            
+                    
+                        <b>9.2.1.</b>
+                    
+                    Data preparation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="9.2.2" 
data-path="../recommend/news20_jaccard.html">
+            
+                <a href="../recommend/news20_jaccard.html">
+            
+                    
+                        <b>9.2.2.</b>
+                    
+                    LSH/MinHash and Jaccard similarity
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="9.2.3" 
data-path="../recommend/news20_knn.html">
+            
+                <a href="../recommend/news20_knn.html">
+            
+                    
+                        <b>9.2.3.</b>
+                    
+                    LSH/MinHash and brute-force search
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="9.2.4" 
data-path="../recommend/news20_bbit_minhash.html">
+            
+                <a href="../recommend/news20_bbit_minhash.html">
+            
+                    
+                        <b>9.2.4.</b>
+                    
+                    kNN search using b-Bits MinHash
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="9.3" 
data-path="../recommend/movielens.html">
+            
+                <a href="../recommend/movielens.html">
+            
+                    
+                        <b>9.3.</b>
+                    
+                    MovieLens Movie Recommendation Tutorial
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="9.3.1" 
data-path="../recommend/movielens_dataset.html">
+            
+                <a href="../recommend/movielens_dataset.html">
+            
+                    
+                        <b>9.3.1.</b>
+                    
+                    Data preparation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="9.3.2" 
data-path="../recommend/movielens_cf.html">
+            
+                <a href="../recommend/movielens_cf.html">
+            
+                    
+                        <b>9.3.2.</b>
+                    
+                    Item-based collaborative filtering
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="9.3.3" 
data-path="../recommend/movielens_mf.html">
+            
+                <a href="../recommend/movielens_mf.html">
+            
+                    
+                        <b>9.3.3.</b>
+                    
+                    Matrix Factorization
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="9.3.4" 
data-path="../recommend/movielens_fm.html">
+            
+                <a href="../recommend/movielens_fm.html">
+            
+                    
+                        <b>9.3.4.</b>
+                    
+                    Factorization Machine
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="9.3.5" 
data-path="../recommend/movielens_slim.html">
+            
+                <a href="../recommend/movielens_slim.html">
+            
+                    
+                        <b>9.3.5.</b>
+                    
+                    SLIM for fast top-k recommendation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="9.3.6" 
data-path="../recommend/movielens_cv.html">
+            
+                <a href="../recommend/movielens_cv.html">
+            
+                    
+                        <b>9.3.6.</b>
+                    
+                    10-fold cross validation (Matrix Factorization)
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part X - Anomaly Detection</li>
+        
+        
+    
+        <li class="chapter " data-level="10.1" data-path="../anomaly/lof.html">
+            
+                <a href="../anomaly/lof.html">
+            
+                    
+                        <b>10.1.</b>
+                    
+                    Outlier Detection using Local Outlier Factor (LOF)
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="10.2" data-path="../anomaly/sst.html">
+            
+                <a href="../anomaly/sst.html">
+            
+                    
+                        <b>10.2.</b>
+                    
+                    Change-Point Detection using Singular Spectrum 
Transformation (SST)
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="10.3" 
data-path="../anomaly/changefinder.html">
+            
+                <a href="../anomaly/changefinder.html">
+            
+                    
+                        <b>10.3.</b>
+                    
+                    ChangeFinder: Detecting Outlier and Change-Point 
Simultaneously
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part XI - Clustering</li>
+        
+        
+    
+        <li class="chapter " data-level="11.1" 
data-path="../clustering/lda.html">
+            
+                <a href="../clustering/lda.html">
+            
+                    
+                        <b>11.1.</b>
+                    
+                    Latent Dirichlet Allocation
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="11.2" 
data-path="../clustering/plsa.html">
+            
+                <a href="../clustering/plsa.html">
+            
+                    
+                        <b>11.2.</b>
+                    
+                    Probabilistic Latent Semantic Analysis
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part XII - GeoSpatial Functions</li>
+        
+        
+    
+        <li class="chapter " data-level="12.1" 
data-path="../geospatial/latlon.html">
+            
+                <a href="../geospatial/latlon.html">
+            
+                    
+                        <b>12.1.</b>
+                    
+                    Lat/Lon functions
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part XIII - Hivemall on Spark</li>
+        
+        
+    
+        <li class="chapter " data-level="13.1" 
data-path="../spark/getting_started/">
+            
+                <a href="../spark/getting_started/">
+            
+                    
+                        <b>13.1.</b>
+                    
+                    Getting Started
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="13.1.1" 
data-path="../spark/getting_started/installation.html">
+            
+                <a href="../spark/getting_started/installation.html">
+            
+                    
+                        <b>13.1.1.</b>
+                    
+                    Installation
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="13.2" 
data-path="../spark/binaryclass/">
+            
+                <a href="../spark/binaryclass/">
+            
+                    
+                        <b>13.2.</b>
+                    
+                    Binary Classification
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="13.2.1" 
data-path="../spark/binaryclass/a9a_df.html">
+            
+                <a href="../spark/binaryclass/a9a_df.html">
+            
+                    
+                        <b>13.2.1.</b>
+                    
+                    a9a tutorial for DataFrame
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="13.2.2" 
data-path="../spark/binaryclass/a9a_sql.html">
+            
+                <a href="../spark/binaryclass/a9a_sql.html">
+            
+                    
+                        <b>13.2.2.</b>
+                    
+                    a9a tutorial for SQL
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="13.3" 
data-path="../spark/binaryclass/">
+            
+                <a href="../spark/binaryclass/">
+            
+                    
+                        <b>13.3.</b>
+                    
+                    Regression
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="13.3.1" 
data-path="../spark/regression/e2006_df.html">
+            
+                <a href="../spark/regression/e2006_df.html">
+            
+                    
+                        <b>13.3.1.</b>
+                    
+                    E2006-tfidf regression tutorial for DataFrame
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="13.3.2" 
data-path="../spark/regression/e2006_sql.html">
+            
+                <a href="../spark/regression/e2006_sql.html">
+            
+                    
+                        <b>13.3.2.</b>
+                    
+                    E2006-tfidf regression tutorial for SQL
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="13.4" 
data-path="../spark/misc/misc.html">
+            
+                <a href="../spark/misc/misc.html">
+            
+                    
+                        <b>13.4.</b>
+                    
+                    Generic features
+            
+                </a>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="13.4.1" 
data-path="../spark/misc/topk_join.html">
+            
+                <a href="../spark/misc/topk_join.html">
+            
+                    
+                        <b>13.4.1.</b>
+                    
+                    Top-k join processing
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="13.4.2" 
data-path="../spark/misc/functions.html">
+            
+                <a href="../spark/misc/functions.html">
+            
+                    
+                        <b>13.4.2.</b>
+                    
+                    Other utility functions
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part XIV - Hivemall on Docker</li>
+        
+        
+    
+        <li class="chapter " data-level="14.1" 
data-path="../docker/getting_started.html">
+            
+                <a href="../docker/getting_started.html">
+            
+                    
+                        <b>14.1.</b>
+                    
+                    Getting Started
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+    
+        
+        <li class="header">Part XV - External References</li>
+        
+        
+    
+        <li class="chapter " data-level="15.1" >
+            
+                <a target="_blank" 
href="https://github.com/daijyc/hivemall/wiki/PigHome">
+            
+                    
+                        <b>15.1.</b>
+                    
+                    Hivemall on Apache Pig
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+    
+
+    <li class="divider"></li>
+
+    <li>
+        <a href="https://www.gitbook.com" target="_blank" class="gitbook-link">
+            Published with GitBook
+        </a>
+    </li>
+</ul>
+
+
+                </nav>
+            
+        
+    </div>
+
+    <div class="book-body">
+        
+            <div class="body-inner">
+                
+                    
+
+<div class="book-header" role="navigation">
+    
+
+    <!-- Title -->
+    <h1>
+        <i class="fa fa-circle-o-notch fa-spin"></i>
+        <a href=".." >Step-by-Step Tutorial on Supervised Learning</a>
+    </h1>
+</div>
+
+
+
+
+                    <div class="page-wrapper" tabindex="-1" role="main">
+                        <div class="page-inner">
+                            
+<div id="book-search-results">
+    <div class="search-noresults">
+    
+                                <section class="normal markdown-section">
+                                
+                                <!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<h1 id="step-by-step-tutorial-on-supervised-learning">Step-by-Step Tutorial on 
Supervised Learning</h1>
+<!-- toc --><div id="toc" class="toc">
+
+<ul>
+<li><a href="#what-is-hivemall">What is Hivemall?</a></li>
+<li><a href="#binary-classification">Binary classification</a><ul>
+<li><a href="#step-1-feature-representation">Step 1. Feature 
representation</a></li>
+<li><a href="#step-2-training">Step 2. Training</a></li>
+<li><a href="#step-3-prediction">Step 3. Prediction</a></li>
+<li><a href="#evaluation">Evaluation</a></li>
+</ul>
+</li>
+<li><a href="#regression">Regression</a><ul>
+<li><a href="#step-1-feature-representation-1">Step 1. Feature 
representation</a></li>
+<li><a href="#step-2-training-1">Step 2. Training</a></li>
+<li><a href="#step-3-prediction-1">Step 3. Prediction</a></li>
+<li><a href="#evaluation-1">Evaluation</a></li>
+</ul>
+</li>
+<li><a href="#next-steps">Next steps</a></li>
+</ul>
+
+</div><!-- tocstop -->
+<h2 id="what-is-hivemall">What is Hivemall?</h2>
+<p><a href="https://github.com/apache/incubator-hivemall" 
target="_blank">Apache Hivemall</a> is a collection of user-defined functions 
(UDFs) for HiveQL which is strongly optimized for machine learning (ML) and 
data science. To give an example, you can efficiently build a logistic 
regression model with the stochastic gradient descent (SGD) optimization by 
issuing the following ~10 lines of query:</p>
+<pre><code class="lang-sql"><span class="hljs-keyword">SELECT</span>
+  train_classifier(
+    features,
+    label,
+    <span class="hljs-string">&apos;-loss_function logloss -optimizer 
SGD&apos;</span>
+  ) <span class="hljs-keyword">as</span> (feature, weight)
+<span class="hljs-keyword">FROM</span>
+  training
+;
+</code></pre>
+<p>The Hivemall function <a 
href="../misc/funcs.html#others"><code>hivemall_version()</code></a> shows 
the current Hivemall version, for example:</p>
+<pre><code class="lang-sql"><span class="hljs-keyword">select</span> 
hivemall_version();
+</code></pre>
+<blockquote>
+<p>&quot;0.5.1-incubating-SNAPSHOT&quot;</p>
+</blockquote>
+<p>Below we list ML and relevant problems that Hivemall can solve:</p>
+<ul>
+<li><a href="../binaryclass/general.html">Binary and multi-class 
classification</a></li>
+<li><a href="../regression/general.html">Regression</a></li>
+<li><a href="../recommend/cf.html">Recommendation</a></li>
+<li><a href="../anomaly/lof.html">Anomaly detection</a></li>
+<li><a href="../misc/tokenizer.html">Natural language processing</a></li>
+<li><a href="../clustering/lda.html">Clustering</a> (i.e., topic modeling)</li>
+<li><a href="../misc/funcs.html#sketching">Data sketching</a></li>
+<li>Evaluation</li>
+</ul>
+<p>Our <a href="https://www.youtube.com/watch?v=cMUsuA9KZ_c" 
target="_blank">YouTube demo video</a> gives a helpful overview of 
Hivemall.</p>
+<p>This tutorial explains the basic usage of Hivemall with examples of 
supervised learning of a simple regressor and a binary classifier.</p>
+<h2 id="binary-classification">Binary classification</h2>
+<p>Imagine a scenario in which we would like to build a binary classifier 
from the mock <code>purchase_history</code> data and predict unforeseen 
purchases to conduct a new campaign effectively:</p>
+<table>
+<thead>
+<tr>
+<th style="text-align:center">day_of_week</th>
+<th style="text-align:center">gender</th>
+<th style="text-align:center">price</th>
+<th style="text-align:center">category</th>
+<th style="text-align:left">label</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td style="text-align:center">Saturday</td>
+<td style="text-align:center">male</td>
+<td style="text-align:center">600</td>
+<td style="text-align:center">book</td>
+<td style="text-align:left">1</td>
+</tr>
+<tr>
+<td style="text-align:center">Friday</td>
+<td style="text-align:center">female</td>
+<td style="text-align:center">4800</td>
+<td style="text-align:center">sports</td>
+<td style="text-align:left">0</td>
+</tr>
+<tr>
+<td style="text-align:center">Friday</td>
+<td style="text-align:center">other</td>
+<td style="text-align:center">18000</td>
+<td style="text-align:center">entertainment</td>
+<td style="text-align:left">0</td>
+</tr>
+<tr>
+<td style="text-align:center">Thursday</td>
+<td style="text-align:center">male</td>
+<td style="text-align:center">200</td>
+<td style="text-align:center">food</td>
+<td style="text-align:left">0</td>
+</tr>
+<tr>
+<td style="text-align:center">Wednesday</td>
+<td style="text-align:center">female</td>
+<td style="text-align:center">1000</td>
+<td style="text-align:center">electronics</td>
+<td style="text-align:left">1</td>
+</tr>
+</tbody>
+</table>
+<p>You can create this table as follows:</p>
+<pre><code class="lang-sql"><span class="hljs-keyword">create</span> <span 
class="hljs-keyword">table</span> <span class="hljs-keyword">if</span> <span 
class="hljs-keyword">not</span> <span class="hljs-keyword">exists</span> 
purchase_history <span class="hljs-keyword">as</span>
+<span class="hljs-keyword">select</span> <span class="hljs-number">1</span> 
<span class="hljs-keyword">as</span> <span class="hljs-keyword">id</span>, 
<span class="hljs-string">&quot;Saturday&quot;</span> <span 
class="hljs-keyword">as</span> day_of_week, <span 
class="hljs-string">&quot;male&quot;</span> <span 
class="hljs-keyword">as</span> gender, <span class="hljs-number">600</span> 
<span class="hljs-keyword">as</span> price, <span 
class="hljs-string">&quot;book&quot;</span> <span 
class="hljs-keyword">as</span> <span class="hljs-keyword">category</span>, 
<span class="hljs-number">1</span> <span class="hljs-keyword">as</span> label
+<span class="hljs-keyword">union</span> all
+<span class="hljs-keyword">select</span> <span class="hljs-number">2</span> 
<span class="hljs-keyword">as</span> <span class="hljs-keyword">id</span>, 
<span class="hljs-string">&quot;Friday&quot;</span> <span 
class="hljs-keyword">as</span> day_of_week, <span 
class="hljs-string">&quot;female&quot;</span> <span 
class="hljs-keyword">as</span> gender, <span class="hljs-number">4800</span> 
<span class="hljs-keyword">as</span> price, <span 
class="hljs-string">&quot;sports&quot;</span> <span 
class="hljs-keyword">as</span> <span class="hljs-keyword">category</span>, 
<span class="hljs-number">0</span> <span class="hljs-keyword">as</span> label
+<span class="hljs-keyword">union</span> all
+<span class="hljs-keyword">select</span> <span class="hljs-number">3</span> 
<span class="hljs-keyword">as</span> <span class="hljs-keyword">id</span>, 
<span class="hljs-string">&quot;Friday&quot;</span> <span 
class="hljs-keyword">as</span> day_of_week, <span 
class="hljs-string">&quot;other&quot;</span> <span 
class="hljs-keyword">as</span> gender, <span class="hljs-number">18000</span> 
<span class="hljs-keyword">as</span> price, <span 
class="hljs-string">&quot;entertainment&quot;</span> <span 
class="hljs-keyword">as</span> <span class="hljs-keyword">category</span>, 
<span class="hljs-number">0</span> <span class="hljs-keyword">as</span> label
+<span class="hljs-keyword">union</span> all
+<span class="hljs-keyword">select</span> <span class="hljs-number">4</span> 
<span class="hljs-keyword">as</span> <span class="hljs-keyword">id</span>, 
<span class="hljs-string">&quot;Thursday&quot;</span> <span 
class="hljs-keyword">as</span> day_of_week, <span 
class="hljs-string">&quot;male&quot;</span> <span 
class="hljs-keyword">as</span> gender, <span class="hljs-number">200</span> 
<span class="hljs-keyword">as</span> price, <span 
class="hljs-string">&quot;food&quot;</span> <span 
class="hljs-keyword">as</span> <span class="hljs-keyword">category</span>, 
<span class="hljs-number">0</span> <span class="hljs-keyword">as</span> label
+<span class="hljs-keyword">union</span> all
+<span class="hljs-keyword">select</span> <span class="hljs-number">5</span> 
<span class="hljs-keyword">as</span> <span class="hljs-keyword">id</span>, 
<span class="hljs-string">&quot;Wednesday&quot;</span> <span 
class="hljs-keyword">as</span> day_of_week, <span 
class="hljs-string">&quot;female&quot;</span> <span 
class="hljs-keyword">as</span> gender, <span class="hljs-number">1000</span> 
<span class="hljs-keyword">as</span> price, <span 
class="hljs-string">&quot;electronics&quot;</span> <span 
class="hljs-keyword">as</span> <span class="hljs-keyword">category</span>, 
<span class="hljs-number">1</span> <span class="hljs-keyword">as</span> label
+;
+</code></pre>
+<p>Use Hivemall <a 
href="../misc/funcs.html#binary-classification"><code>train_classifier()</code></a>
 UDF to tackle the problem as follows.</p>
+<h3 id="step-1-feature-representation">Step 1. Feature representation</h3>
+<p>First of all, we have to convert the records into pairs of the feature 
vector and corresponding target value. Here, Hivemall requires you to represent 
input features in a specific format.</p>
+<p>To be more precise, Hivemall represents a single feature as a concatenation 
of <strong>index</strong> (i.e., <strong>name</strong>) and its 
<strong>value</strong>:</p>
+<ul>
+<li>Quantitative feature: <code>&lt;index&gt;:&lt;value&gt;</code><ul>
+<li>e.g., <code>price:600.0</code></li>
+</ul>
+</li>
+<li>Categorical feature: <code>&lt;index&gt;#&lt;value&gt;</code><ul>
+<li>e.g., <code>gender#male</code></li>
+</ul>
+</li>
+</ul>
+<p>Feature index and feature value are separated by a colon. When the colon is 
omitted, the value is considered to be <code>1.0</code>. So, a categorical 
feature <code>gender#male</code> is a <a 
href="https://www.quora.com/What-is-one-hot-encoding-and-when-is-it-used-in-data-science"
 target="_blank">one-hot representation</a> of <code>index := 
gender#male</code> and <code>value := 1.0</code>. Note that <code>#</code> is 
not a special character for categorical feature.</p>
+<p>Each of those features is a string value in Hive, and &quot;feature 
vector&quot; means an array of string values like:</p>
+<pre><code>[&quot;price:600.0&quot;, &quot;day of week#Saturday&quot;, 
&quot;gender#male&quot;, &quot;category#book&quot;]
+</code></pre><p>See also more detailed <a 
href="../getting_started/input-format.html">document for input format</a>.</p>
+<p>Therefore, what we first need to do is to convert the records into an array 
of feature strings, and Hivemall functions <a 
href="../getting_started/input-format.html#quantitative-features"><code>quantitative_features()</code></a>,
 <a 
href="../getting_started/input-format.html#categorical-features"><code>categorical_features()</code></a>
 and <a href="../misc/generic_funcs.html#array"><code>array_concat()</code></a> 
provide a simple way to create the pairs of feature vector and target value:</p>
+<pre><code class="lang-sql"><span class="hljs-keyword">create</span> <span 
class="hljs-keyword">table</span> <span class="hljs-keyword">if</span> <span 
class="hljs-keyword">not</span> <span class="hljs-keyword">exists</span> 
training <span class="hljs-keyword">as</span>
+<span class="hljs-keyword">select</span>
+  <span class="hljs-keyword">id</span>,
+  array_concat( <span class="hljs-comment">-- concatenate two arrays of 
quantitative and categorical features into single array</span>
+    quantitative_features(
+      <span class="hljs-built_in">array</span>(<span 
class="hljs-string">&quot;price&quot;</span>), <span class="hljs-comment">-- 
quantitative feature names</span>
+      price <span class="hljs-comment">-- corresponding column names</span>
+    ),
+    categorical_features(
+      <span class="hljs-built_in">array</span>(<span 
class="hljs-string">&quot;day of week&quot;</span>, <span 
class="hljs-string">&quot;gender&quot;</span>, <span 
class="hljs-string">&quot;category&quot;</span>), <span class="hljs-comment">-- 
categorical feature names</span>
+      day_of_week, gender, <span class="hljs-keyword">category</span> <span 
class="hljs-comment">-- corresponding column names</span>
+    )
+  ) <span class="hljs-keyword">as</span> features,
+  label
+<span class="hljs-keyword">from</span>
+  purchase_history
+;
+</code></pre>
+<p>The training table is as follows:</p>
+<table>
+<thead>
+<tr>
+<th style="text-align:center">id</th>
+<th style="text-align:left">features</th>
+<th style="text-align:left">label</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td style="text-align:center">1</td>
+<td style="text-align:left">[&quot;price:600.0&quot;,&quot;day of 
week#Saturday&quot;,&quot;gender#male&quot;,&quot;category#book&quot;]</td>
+<td style="text-align:left">1</td>
+</tr>
+<tr>
+<td style="text-align:center">2</td>
+<td style="text-align:left">[&quot;price:4800.0&quot;,&quot;day of 
week#Friday&quot;,&quot;gender#female&quot;,&quot;category#sports&quot;]</td>
+<td style="text-align:left">0</td>
+</tr>
+<tr>
+<td style="text-align:center">3</td>
+<td style="text-align:left">[&quot;price:18000.0&quot;,&quot;day of 
week#Friday&quot;,&quot;gender#other&quot;,&quot;category#entertainment&quot;]</td>
+<td style="text-align:left">0</td>
+</tr>
+<tr>
+<td style="text-align:center">4</td>
+<td style="text-align:left">[&quot;price:200.0&quot;,&quot;day of 
week#Thursday&quot;,&quot;gender#male&quot;,&quot;category#food&quot;]</td>
+<td style="text-align:left">0</td>
+</tr>
+<tr>
+<td style="text-align:center">5</td>
+<td style="text-align:left">[&quot;price:1000.0&quot;,&quot;day of 
week#Wednesday&quot;,&quot;gender#female&quot;,&quot;category#electronics&quot;]</td>
+<td style="text-align:left">1</td>
+</tr>
+</tbody>
+</table>
+<p>The output table <code>training</code> will be directly used as an input to 
Hivemall&apos;s ML functions in the next step.</p>
+<div class="panel panel-primary"><div class="panel-heading"><h3 
class="panel-title" id="note"><i class="fa fa-edit"></i> Note</h3></div><div 
class="panel-body"><p>You can apply extra Hivemall functions (e.g., <a 
href="../misc/funcs.html#feature-scaling"><code>rescale()</code></a>, <a 
href="../misc/funcs.html#feature-hashing"><code>feature_hashing()</code></a>, 
<a href="../misc/funcs.html#feature-scaling"><code>l1_normalize()</code></a>) 
for the features in this step to make your prediction model more accurate and 
stable; it is known as <em>feature engineering</em> in the context of ML. See 
our <a href="../ft_engineering/scaling.html">documentation</a> for more 
information.</p></div></div>
+<h3 id="step-2-training">Step 2. Training</h3>
+<p>Once the original table <code>purchase_history</code> has been converted 
into pairs of <code>features</code> and <code>label</code>, you can build a 
binary classifier by running the following query:</p>
+<pre><code class="lang-sql"><span class="hljs-keyword">create</span> <span 
class="hljs-keyword">table</span> <span class="hljs-keyword">if</span> <span 
class="hljs-keyword">not</span> <span class="hljs-keyword">exists</span> 
classifier <span class="hljs-keyword">as</span>
+<span class="hljs-keyword">select</span>
+  train_classifier(
+    features, <span class="hljs-comment">-- feature vector</span>
+    label, <span class="hljs-comment">-- target value</span>
+    <span class="hljs-string">&apos;-loss_function logloss -optimizer SGD 
-regularization l1&apos;</span> <span class="hljs-comment">-- 
hyper-parameters</span>
+  ) <span class="hljs-keyword">as</span> (feature, weight)
+<span class="hljs-keyword">from</span>
+  training
+;
+</code></pre>
+<p>What the above query does is to build a binary classifier with:</p>
+<ul>
+<li><code>-loss_function logloss</code><ul>
+<li>Use logistic loss i.e., logistic regression</li>
+</ul>
+</li>
+<li><code>-optimizer SGD</code><ul>
+<li>Learn model parameters with the SGD optimization</li>
+</ul>
+</li>
+<li><code>-regularization l1</code><ul>
+<li>Apply L1 regularization</li>
+</ul>
+</li>
+</ul>
+<p>Eventually, the output table <code>classifier</code> stores model 
parameters as:</p>
+<table>
+<thead>
+<tr>
+<th style="text-align:center">feature</th>
+<th style="text-align:center">weight</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td style="text-align:center">day of week#Wednesday</td>
+<td style="text-align:center">0.7443372011184692</td>
+</tr>
+<tr>
+<td style="text-align:center">day of week#Thursday</td>
+<td style="text-align:center">1.415687620465178e-07</td>
+</tr>
+<tr>
+<td style="text-align:center">day of week#Friday</td>
+<td style="text-align:center">-0.2697019577026367</td>
+</tr>
+<tr>
+<td style="text-align:center">day of week#Saturday</td>
+<td style="text-align:center">0.7337419390678406</td>
+</tr>
+<tr>
+<td style="text-align:center">category#book</td>
+<td style="text-align:center">0.7337419390678406</td>
+</tr>
+<tr>
+<td style="text-align:center">category#electronics</td>
+<td style="text-align:center">0.7443372011184692</td>
+</tr>
+<tr>
+<td style="text-align:center">category#entertainment</td>
+<td style="text-align:center">5.039264578954317e-07</td>
+</tr>
+<tr>
+<td style="text-align:center">category#food</td>
+<td style="text-align:center">1.415687620465178e-07</td>
+</tr>
+<tr>
+<td style="text-align:center">category#sports</td>
+<td style="text-align:center">-0.2697771489620209</td>
+</tr>
+<tr>
+<td style="text-align:center">gender#male</td>
+<td style="text-align:center">0.7336684465408325</td>
+</tr>
+<tr>
+<td style="text-align:center">gender#female</td>
+<td style="text-align:center">0.47442761063575745</td>
+</tr>
+<tr>
+<td style="text-align:center">gender#other</td>
+<td style="text-align:center">5.039264578954317e-07</td>
+</tr>
+<tr>
+<td style="text-align:center">price</td>
+<td style="text-align:center">-110.62307739257812</td>
+</tr>
+</tbody>
+</table>
+<p>Notice that a weight is learned for each possible value of a categorical 
feature, and for every single quantitative feature.</p>
+<p>Of course, you can optimize hyper-parameters to build a more accurate 
prediction model. Check the output of the following query to see all available 
options, including learning rate, number of iterations and regularization 
parameters, and their default values:</p>
+<pre><code class="lang-sql"><span class="hljs-keyword">select</span> 
train_classifier(<span class="hljs-built_in">array</span>(), <span 
class="hljs-number">0</span>, <span 
class="hljs-string">&apos;-help&apos;</span>);
+</code></pre>
+<h3 id="step-3-prediction">Step 3. Prediction</h3>
+<p>Now, the table <code>classifier</code> has linear coefficients for given 
features, and we can predict unforeseen samples by computing a weighted sum of 
their features.</p>
+<p>How about the probability of purchase by a <code>male</code> customer who 
sees a <code>food</code> product priced at <code>120</code> on 
<code>Friday</code>? Which product is more likely to be purchased by the 
customer on <code>Friday</code>?</p>
+<p>To differentiate potential purchases, create an 
<code>unforeseen_samples</code> table with these unknown combinations of 
features:</p>
+<pre><code class="lang-sql"><span class="hljs-keyword">create</span> <span 
class="hljs-keyword">table</span> <span class="hljs-keyword">if</span> <span 
class="hljs-keyword">not</span> <span class="hljs-keyword">exists</span> 
unforeseen_samples <span class="hljs-keyword">as</span>
+<span class="hljs-keyword">select</span> <span class="hljs-number">1</span> 
<span class="hljs-keyword">as</span> <span class="hljs-keyword">id</span>, 
<span class="hljs-built_in">array</span>(<span 
class="hljs-string">&quot;gender#male&quot;</span>, <span 
class="hljs-string">&quot;category#food&quot;</span>, <span 
class="hljs-string">&quot;day of week#Friday&quot;</span>, <span 
class="hljs-string">&quot;price:120&quot;</span>) <span 
class="hljs-keyword">as</span> features
+<span class="hljs-keyword">union</span> all
+<span class="hljs-keyword">select</span> <span class="hljs-number">2</span> 
<span class="hljs-keyword">as</span> <span class="hljs-keyword">id</span>, 
<span class="hljs-built_in">array</span>(<span 
class="hljs-string">&quot;gender#male&quot;</span>, <span 
class="hljs-string">&quot;category#sports&quot;</span>, <span 
class="hljs-string">&quot;day of week#Friday&quot;</span>, <span 
class="hljs-string">&quot;price:1000&quot;</span>) <span 
class="hljs-keyword">as</span> features
+<span class="hljs-keyword">union</span> all
+<span class="hljs-keyword">select</span> <span class="hljs-number">3</span> 
<span class="hljs-keyword">as</span> <span class="hljs-keyword">id</span>, 
<span class="hljs-built_in">array</span>(<span 
class="hljs-string">&quot;gender#male&quot;</span>, <span 
class="hljs-string">&quot;category#electronics&quot;</span>, <span 
class="hljs-string">&quot;day of week#Friday&quot;</span>, <span 
class="hljs-string">&quot;price:540&quot;</span>) <span 
class="hljs-keyword">as</span> features
+;
+</code></pre>
+<p>Prediction for the feature vectors can be made by a join operation between 
<code>unforeseen_samples</code> and <code>classifier</code> on each feature 
as:</p>
+<pre><code class="lang-sql">with features_exploded as (
+  <span class="hljs-keyword">select</span>
+    <span class="hljs-keyword">id</span>,
+    <span class="hljs-comment">-- split feature string into its name and 
value</span>
+    <span class="hljs-comment">-- to join with a model table</span>
+    extract_feature(fv) <span class="hljs-keyword">as</span> feature,
+    extract_weight(fv) <span class="hljs-keyword">as</span> <span 
class="hljs-keyword">value</span>
+  <span class="hljs-keyword">from</span> unforeseen_samples t1 LATERAL <span 
class="hljs-keyword">VIEW</span> explode(features) t2 <span 
class="hljs-keyword">as</span> fv
+)
+<span class="hljs-keyword">select</span>
+  t1.<span class="hljs-keyword">id</span>,
+  sigmoid( <span class="hljs-keyword">sum</span>(p1.weight * t1.<span 
class="hljs-keyword">value</span>) ) <span class="hljs-keyword">as</span> 
probability
+<span class="hljs-keyword">from</span>
+  features_exploded t1
+  <span class="hljs-keyword">LEFT</span> <span 
class="hljs-keyword">OUTER</span> <span class="hljs-keyword">JOIN</span> 
classifier p1 <span class="hljs-keyword">ON</span> (t1.feature = p1.feature)
+<span class="hljs-keyword">group</span> <span class="hljs-keyword">by</span>
+  t1.<span class="hljs-keyword">id</span>
+;
+</code></pre>
+<div class="panel panel-primary"><div class="panel-heading"><h3 
class="panel-title" id="note"><i class="fa fa-edit"></i> Note</h3></div><div 
class="panel-body"><p><code>sigmoid()</code> should be applied only for 
logistic loss and you can&apos;t get a probability with other loss functions 
for a classification. See also <a 
href="https://www.coursera.org/lecture/machine-learning/decision-boundary-WuL1H";
 target="_blank">this video</a>.</p></div></div>
+<p>Output for a single sample can be:</p>
+<table>
+<thead>
+<tr>
+<th style="text-align:right">id</th>
+<th style="text-align:right">probability</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td style="text-align:right">1</td>
+<td style="text-align:right">1.0261879540562902e-10</td>
+</tr>
+</tbody>
+</table>
+<h3 id="evaluation">Evaluation</h3>
+<p>If you have test samples for evaluation, use Hivemall&apos;s <a 
href="../eval/binary_classification_measures.html">evaluation UDFs</a> to 
measure the accuracy of prediction.</p>
+<p>For instance, prediction accuracy over the <code>training</code> samples 
can be measured as:</p>
+<pre><code class="lang-sql">with features_exploded as (
+  <span class="hljs-keyword">select</span>
+    <span class="hljs-keyword">id</span>,
+    extract_feature(fv) <span class="hljs-keyword">as</span> feature,
+    extract_weight(fv) <span class="hljs-keyword">as</span> <span 
class="hljs-keyword">value</span>
+  <span class="hljs-keyword">from</span> training t1 LATERAL <span 
class="hljs-keyword">VIEW</span> explode(features) t2 <span 
class="hljs-keyword">as</span> fv
+),
+predictions <span class="hljs-keyword">as</span> (
+  <span class="hljs-keyword">select</span>
+    t1.<span class="hljs-keyword">id</span>,
+    sigmoid( <span class="hljs-keyword">sum</span>(p1.weight * t1.<span 
class="hljs-keyword">value</span>) ) <span class="hljs-keyword">as</span> 
probability
+  <span class="hljs-keyword">from</span>
+    features_exploded t1
+    <span class="hljs-keyword">LEFT</span> <span 
class="hljs-keyword">OUTER</span> <span class="hljs-keyword">JOIN</span> 
classifier p1 <span class="hljs-keyword">ON</span> (t1.feature = p1.feature)
+  <span class="hljs-keyword">group</span> <span class="hljs-keyword">by</span>
+    t1.<span class="hljs-keyword">id</span>
+)
+<span class="hljs-keyword">select</span>
+  auc(probability, label) <span class="hljs-keyword">as</span> auc,
+  logloss(probability, label) <span class="hljs-keyword">as</span> logloss
+<span class="hljs-keyword">from</span> (
+  <span class="hljs-keyword">select</span> t1.probability, t2.label
+  <span class="hljs-keyword">from</span> predictions t1
+  <span class="hljs-keyword">join</span> training t2 <span 
class="hljs-keyword">on</span> (t1.<span class="hljs-keyword">id</span> = 
t2.<span class="hljs-keyword">id</span>)
+  <span class="hljs-keyword">ORDER</span> <span class="hljs-keyword">BY</span> 
probability <span class="hljs-keyword">DESC</span>
+) t
+;
+</code></pre>
+<table>
+<thead>
+<tr>
+<th style="text-align:right">auc</th>
+<th style="text-align:right">logloss</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td style="text-align:right">0.5</td>
+<td style="text-align:right">9.200000003614099</td>
+</tr>
+</tbody>
+</table>
+<p>Since we are trying to solve the binary classification problem, the 
accuracy is measured by <a 
href="https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve";
 target="_blank">Area Under the ROC Curve</a> <a 
href="../eval/auc.html"><code>auc()</code></a> and/or <a 
href="http://wiki.fast.ai/index.php/Log_Loss"; target="_blank">Logarithmic 
Loss</a> <a 
href="../eval/regression.html#logarithmic-loss"><code>logloss()</code></a>.</p>
+<h2 id="regression">Regression</h2>
+<p>If you use <a 
href="../misc/funcs.html#regression"><code>train_regressor()</code></a> instead 
of <a 
href="../misc/funcs.html#binary-classification"><code>train_classifier()</code></a>,
 you can also solve a regression problem with almost the same queries.</p>
+<p>Imagine the following <code>customers</code> table:</p>
+<pre><code class="lang-sql"><span class="hljs-keyword">create</span> <span 
class="hljs-keyword">table</span> <span class="hljs-keyword">if</span> <span 
class="hljs-keyword">not</span> <span class="hljs-keyword">exists</span> 
customers <span class="hljs-keyword">as</span>
+<span class="hljs-keyword">select</span> <span class="hljs-number">1</span> 
<span class="hljs-keyword">as</span> <span class="hljs-keyword">id</span>, 
<span class="hljs-string">&quot;male&quot;</span> <span 
class="hljs-keyword">as</span> gender, <span class="hljs-number">23</span> 
<span class="hljs-keyword">as</span> age, <span 
class="hljs-string">&quot;Japan&quot;</span> <span 
class="hljs-keyword">as</span> country, <span class="hljs-number">12</span> 
<span class="hljs-keyword">as</span> num_purchases
+<span class="hljs-keyword">union</span> all
+<span class="hljs-keyword">select</span> <span class="hljs-number">2</span> 
<span class="hljs-keyword">as</span> <span class="hljs-keyword">id</span>, 
<span class="hljs-string">&quot;female&quot;</span> <span 
class="hljs-keyword">as</span> gender, <span class="hljs-number">43</span> 
<span class="hljs-keyword">as</span> age, <span 
class="hljs-string">&quot;US&quot;</span> <span class="hljs-keyword">as</span> 
country, <span class="hljs-number">4</span> <span 
class="hljs-keyword">as</span> num_purchases
+<span class="hljs-keyword">union</span> all
+<span class="hljs-keyword">select</span> <span class="hljs-number">3</span> 
<span class="hljs-keyword">as</span> <span class="hljs-keyword">id</span>, 
<span class="hljs-string">&quot;other&quot;</span> <span 
class="hljs-keyword">as</span> gender, <span class="hljs-number">19</span> 
<span class="hljs-keyword">as</span> age, <span 
class="hljs-string">&quot;UK&quot;</span> <span class="hljs-keyword">as</span> 
country, <span class="hljs-number">2</span> <span 
class="hljs-keyword">as</span> num_purchases
+<span class="hljs-keyword">union</span> all
+<span class="hljs-keyword">select</span> <span class="hljs-number">4</span> 
<span class="hljs-keyword">as</span> <span class="hljs-keyword">id</span>, 
<span class="hljs-string">&quot;male&quot;</span> <span 
class="hljs-keyword">as</span> gender, <span class="hljs-number">31</span> 
<span class="hljs-keyword">as</span> age, <span 
class="hljs-string">&quot;US&quot;</span> <span class="hljs-keyword">as</span> 
country, <span class="hljs-number">20</span> <span 
class="hljs-keyword">as</span> num_purchases
+<span class="hljs-keyword">union</span> all
+<span class="hljs-keyword">select</span> <span class="hljs-number">5</span> 
<span class="hljs-keyword">as</span> <span class="hljs-keyword">id</span>, 
<span class="hljs-string">&quot;female&quot;</span> <span 
class="hljs-keyword">as</span> gender, <span class="hljs-number">37</span> 
<span class="hljs-keyword">as</span> age, <span 
class="hljs-string">&quot;Australia&quot;</span> <span 
class="hljs-keyword">as</span> country, <span class="hljs-number">9</span> 
<span class="hljs-keyword">as</span> num_purchases
+;
+</code></pre>
+<table>
+<thead>
+<tr>
+<th style="text-align:center">gender</th>
+<th style="text-align:left">age</th>
+<th style="text-align:center">country</th>
+<th style="text-align:left">num_purchases</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td style="text-align:center">male</td>
+<td style="text-align:left">23</td>
+<td style="text-align:center">Japan</td>
+<td style="text-align:left">12</td>
+</tr>
+<tr>
+<td style="text-align:center">female</td>
+<td style="text-align:left">43</td>
+<td style="text-align:center">US</td>
+<td style="text-align:left">4</td>
+</tr>
+<tr>
+<td style="text-align:center">other</td>
+<td style="text-align:left">19</td>
+<td style="text-align:center">UK</td>
+<td style="text-align:left">2</td>
+</tr>
+<tr>
+<td style="text-align:center">male</td>
+<td style="text-align:left">31</td>
+<td style="text-align:center">US</td>
+<td style="text-align:left">20</td>
+</tr>
+<tr>
+<td style="text-align:center">female</td>
+<td style="text-align:left">37</td>
+<td style="text-align:center">Australia</td>
+<td style="text-align:left">9</td>
+</tr>
+</tbody>
+</table>
+<p>Now, our goal is to build a regression model to predict the number of 
purchases potentially done by new customers.</p>
+<h3 id="step-1-feature-representation">Step 1. Feature representation</h3>
+<p>Same as the classification example:</p>
+<pre><code class="lang-sql"><span class="hljs-keyword">insert</span> overwrite 
<span class="hljs-keyword">table</span> training
+<span class="hljs-keyword">select</span>
+  <span class="hljs-keyword">id</span>,
+  array_concat(
+    quantitative_features(
+      <span class="hljs-built_in">array</span>(<span 
class="hljs-string">&quot;age&quot;</span>),
+      age
+    ),
+    categorical_features(
+      <span class="hljs-built_in">array</span>(<span 
class="hljs-string">&quot;country&quot;</span>, <span 
class="hljs-string">&quot;gender&quot;</span>),
+      country, gender
+    )
+  ) <span class="hljs-keyword">as</span> features,
+  num_purchases
+<span class="hljs-keyword">from</span>
+  customers
+;
+</code></pre>
+<h3 id="step-2-training">Step 2. Training</h3>
+<p><a href="../misc/funcs.html#regression"><code>train_regressor()</code></a> 
requires you to specify an appropriate loss function. One option is to replace 
the classifier-specific loss function <code>logloss</code> with 
<code>squared</code> as:</p>
+<pre><code class="lang-sql"><span class="hljs-keyword">create</span> <span 
class="hljs-keyword">table</span> <span class="hljs-keyword">if</span> <span 
class="hljs-keyword">not</span> <span class="hljs-keyword">exists</span> 
regressor <span class="hljs-keyword">as</

<TRUNCATED>

Reply via email to