http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/b5b51c69/docs/v1.11/group__grp__crf.html
----------------------------------------------------------------------
diff --git a/docs/v1.11/group__grp__crf.html b/docs/v1.11/group__grp__crf.html
new file mode 100644
index 0000000..9b149c1
--- /dev/null
+++ b/docs/v1.11/group__grp__crf.html
@@ -0,0 +1,619 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Conditional Random Field</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.incubator.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.incubator.apache.org";><img 
alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ 
></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.11</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__crf.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Conditional Random Field<div class="ingroups"><a class="el" 
href="group__grp__super.html">Supervised Learning</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> <ul>
+<li>
+<a href="#train_feature">Training Feature Generation</a> </li>
+<li>
+<a href="#train">CRF Training Function</a> </li>
+<li>
+<a href="#test_feature">Testing Feature Generation</a> </li>
+<li>
+<a href="#inference">Inference using Viterbi</a> </li>
+<li>
+<a href="#usage">Using CRF</a> </li>
+<li>
+<a href="#examples">Examples</a> </li>
+<li>
+<a href="#background">Technical Background</a> </li>
+<li>
+<a href="#literature">Literature</a> </li>
+<li>
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>A conditional random field (CRF) is a type of discriminative, 
undirected probabilistic graphical model. A linear-chain CRF is a special type 
of CRF that assumes the current state depends only on the previous state.</p>
+<p>Feature extraction modules are provided for text-analysis tasks such as 
part-of-speech (POS) tagging and named-entity resolution (NER). Currently, six 
feature types are implemented:</p>
+<ul>
+<li>Edge Feature: a transition feature that encodes the transition weight 
from the current label to the next label.</li>
+<li>Start Feature: fired when the current token is the first token in a 
sequence.</li>
+<li>End Feature: fired when the current token is the last token in a 
sequence.</li>
+<li>Word Feature: fired when the current token is observed in the trained 
dictionary.</li>
+<li>Unknown Feature: fired when the current token has not been observed at 
least a certain number of times (default 1) in the trained dictionary.</li>
+<li>Regex Feature: fired when the current token can be matched by a regular 
expression.</li>
+</ul>
+<p>A Viterbi implementation is also provided to get the best label sequence 
and the conditional probability <img class="formulaInl" alt="$ \Pr( \text{best 
label sequence} \mid \text{sequence}) $" src="form_56.png"/>.</p>
+<p>The following steps are required for CRF learning and inference:</p><ol 
type="1">
+<li><a href="#train_feature">Training Feature Generation</a></li>
+<li><a href="#train">CRF Training</a></li>
+<li><a href="#test_feature">Testing Feature Generation</a></li>
+<li><a href="#inference">Inference using Viterbi</a></li>
+</ol>
+<p><a class="anchor" id="train_feature"></a></p><dl class="section 
user"><dt>Training Feature Generation</dt><dd>The function takes 
<code>train_segment_tbl</code> and <code>regex_tbl</code> as input and does 
feature generation generating three tables <code>dictionary_tbl</code>, 
<code>train_feature_tbl</code> and <code>train_featureset_tbl</code>, that are 
required as an input for CRF training. <pre class="syntax">
+crf_train_fgen(train_segment_tbl,
+               regex_tbl,
+               label_tbl,
+               dictionary_tbl,
+               train_feature_tbl,
+               train_featureset_tbl)
+</pre> <b>Arguments</b> <dl class="arglist">
+<dt>train_segment_tbl </dt>
+<dd>TEXT. Name of the training segment table. The table is expected to have 
the following columns: <table class="output">
+<tr>
+<th>doc_id </th><td>INTEGER. Document id column  </td></tr>
+<tr>
+<th>start_pos </th><td>INTEGER. Index of a particular term in the respective 
document  </td></tr>
+<tr>
+<th>seg_text </th><td>TEXT. Term at the respective <code>start_pos</code> in 
the document  </td></tr>
+<tr>
+<th>label </th><td>INTEGER. Label id for the term corresponding to the actual 
label from <code>label_tbl</code>   </td></tr>
+</table>
+</dd>
+<dt>regex_tbl </dt>
+<dd>TEXT. Name of the regular expression table. The table is expected to have 
the following columns: <table class="output">
+<tr>
+<th>pattern </th><td>TEXT. Regular Expression  </td></tr>
+<tr>
+<th>name </th><td>TEXT. Regular Expression name  </td></tr>
+</table>
+</dd>
+<dt>label_tbl </dt>
+<dd>TEXT. Name of the table containing unique labels and their ids. The table 
is expected to have the following columns: <table class="output">
+<tr>
+<th>id </th><td>INTEGER. Unique label id. NOTE: Ids must range from 0 to 
(total number of labels in the table - 1).   </td></tr>
+<tr>
+<th>label </th><td>TEXT. Label name  </td></tr>
+</table>
+</dd>
+<dt>dictionary_tbl </dt>
+<dd>TEXT. Name of the dictionary table to be created containing unique terms 
along with their counts. The table will have the following columns: <table 
class="output">
+<tr>
+<th>token </th><td>TEXT. Contains all the unique terms found in 
<code>train_segment_tbl</code>   </td></tr>
+<tr>
+<th>total </th><td>INTEGER. Respective counts for the terms  </td></tr>
+</table>
+</dd>
+<dt>train_feature_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the training feature table to be created. 
The table will have the following columns: </p><table class="output">
+<tr>
+<th>doc_id </th><td>INTEGER. Document id  </td></tr>
+<tr>
+<th>f_size </th><td>INTEGER. Feature set size. This value will be the same for 
all the tuples in the table  </td></tr>
+<tr>
+<th>sparse_r </th><td>DOUBLE PRECISION[]. Array union of individual single 
state features (previous label, label, feature index, start position, training 
existence indicator), ordered by their start position.  </td></tr>
+<tr>
+<th>dense_m </th><td>DOUBLE PRECISION[]. Array union of (previous label, 
label, feature index, start position, training existence indicator) of edge 
features ordered by start position.  </td></tr>
+<tr>
+<th>sparse_m </th><td>DOUBLE PRECISION[]. Array union of (feature index, 
previous label, label) of edge features ordered by feature index.  </td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>train_featureset_tbl </dt>
+<dd>TEXT. Name of the table to be created containing distinct featuresets 
generated from training feature extraction. The table will have the following 
columns: <table class="output">
+<tr>
+<th>f_index </th><td>INTEGER. Column containing distinct featureset ids  
</td></tr>
+<tr>
+<th>f_name </th><td>TEXT. Feature name   </td></tr>
+<tr>
+<th>feature </th><td>ARRAY. Feature value. The value is of the form [L1, L2] 
<br />
+ - If L1 = -1: represents a single state feature, with L2 being the current 
label id. <br />
+ - If L1 != -1: represents a transition feature, with L1 being the previous 
label and L2 being the current label.    </td></tr>
+</table>
+</dd>
+</dl>
+</dd></dl>
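+<p>For reference, a minimal sketch of input tables matching the schemas 
described above; the table names are placeholders (the Examples section uses 
<code>train_segmenttbl</code>, <code>crf_regex</code> and 
<code>crf_label</code>):</p>
+<pre class="example">
+CREATE TABLE train_segmenttbl (
+    doc_id    INTEGER,   -- document id
+    start_pos INTEGER,   -- index of the term within the document
+    seg_text  TEXT,      -- the term at start_pos
+    label     INTEGER    -- label id, matching an id in the label table
+);
+CREATE TABLE crf_regex (
+    pattern   TEXT,      -- regular expression
+    name      TEXT       -- regular expression name
+);
+CREATE TABLE crf_label (
+    id        INTEGER,   -- unique label id, 0 to (number of labels - 1)
+    label     TEXT       -- label name
+);
+</pre>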
+<p><a class="anchor" id="train"></a></p><dl class="section user"><dt>Linear 
Chain CRF Training Function</dt><dd>The function takes 
<code>train_feature_tbl</code> and <code>train_featureset_tbl</code> tables 
generated in the training feature generation step as input, along with other 
required parameters and produces two output tables <code>crf_stats_tbl</code> 
and <code>crf_weights_tbl</code>.</dd></dl>
+<pre class="syntax">
+lincrf_train(train_feature_tbl,
+             train_featureset_tbl,
+             label_tbl,
+             crf_stats_tbl,
+             crf_weights_tbl,
+             max_iterations
+            )
+</pre><p> <b>Arguments</b> </p><dl class="arglist">
+<dt>train_feature_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the feature table generated during 
training feature generation</p>
+<p class="enddd"></p>
+</dd>
+<dt>train_featureset_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the featureset table generated during 
training feature generation</p>
+<p class="enddd"></p>
+</dd>
+<dt>label_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the label table used</p>
+<p class="enddd"></p>
+</dd>
+<dt>crf_stats_tbl </dt>
+<dd>TEXT. Name of the table to be created containing statistics for CRF 
training. The table has the following columns: <table class="output">
+<tr>
+<th>coef </th><td>DOUBLE PRECISION[]. Array of coefficients  </td></tr>
+<tr>
+<th>log_likelihood </th><td>DOUBLE PRECISION. Log-likelihood  </td></tr>
+<tr>
+<th>num_iterations </th><td>INTEGER. The number of iterations at which the 
algorithm terminated  </td></tr>
+</table>
+</dd>
+<dt>crf_weights_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the table to be created creating learned 
feature weights. The table has the following columns: </p><table class="output">
+<tr>
+<th>id </th><td>INTEGER. Feature set id  </td></tr>
+<tr>
+<th>name </th><td>TEXT. Feature name  </td></tr>
+<tr>
+<th>prev_label_id </th><td>INTEGER. Label for the previous token encountered  
</td></tr>
+<tr>
+<th>label_id </th><td>INTEGER. Label of the token with the respective feature  
</td></tr>
+<tr>
+<th>weight </th><td>DOUBLE PRECISION. Weight for the respective feature set  
</td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>max_iterations </dt>
+<dd>INTEGER. The maximum number of iterations </dd>
+</dl>
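+<p>Once training completes, the statistics table can be inspected to check 
convergence; a small sketch, assuming the table names used above:</p>
+<pre class="example">
+-- number of L-BFGS iterations run and the final log-likelihood
+SELECT num_iterations, log_likelihood FROM crf_stats_tbl;
+</pre>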
+<p><a class="anchor" id="test_feature"></a></p><dl class="section 
user"><dt>Testing Feature Generation</dt><dd></dd></dl>
+<pre class="syntax">
+crf_test_fgen(test_segment_tbl,
+              dictionary_tbl,
+              label_tbl,
+              regex_tbl,
+              crf_weights_tbl,
+              viterbi_mtbl,
+              viterbi_rtbl
+             )
+</pre><p> <b>Arguments</b> </p><dl class="arglist">
+<dt>test_segment_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the testing segment table. The table is 
expected to have the following columns: </p><table class="output">
+<tr>
+<th>doc_id </th><td>INTEGER. Document id column  </td></tr>
+<tr>
+<th>start_pos </th><td>INTEGER. Index of a particular term in the respective 
document  </td></tr>
+<tr>
+<th>seg_text </th><td>TEXT. Term at the respective <code>start_pos</code> in 
the document  </td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>dictionary_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the dictionary table created during 
training feature generation (<code>crf_train_fgen</code>)</p>
+<p class="enddd"></p>
+</dd>
+<dt>label_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the label table</p>
+<p class="enddd"></p>
+</dd>
+<dt>regex_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the regular expression table</p>
+<p class="enddd"></p>
+</dd>
+<dt>crf_weights_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the weights table generated during CRF 
training (<code>lincrf_train</code>)</p>
+<p class="enddd"></p>
+</dd>
+<dt>viterbi_mtbl </dt>
+<dd><p class="startdd">TEXT. Name of the Viterbi M table to be created</p>
+<p class="enddd"></p>
+</dd>
+<dt>viterbi_rtbl </dt>
+<dd>TEXT. Name of the Viterbi R table to be created </dd>
+</dl>
+<p><a class="anchor" id="inference"></a></p><dl class="section 
user"><dt>Inference using Viterbi</dt><dd><pre class="syntax">
+vcrf_label(test_segment_tbl,
+           viterbi_mtbl,
+           viterbi_rtbl,
+           label_tbl,
+           result_tbl)
+</pre> <b>Arguments</b> <dl class="arglist">
+<dt>test_segment_tbl </dt>
+<dd>TEXT. Name of the testing segment table. For the required table schema, 
please refer to the arguments in the previous section. </dd>
+<dt>viterbi_mtbl </dt>
+<dd>TEXT. Name of the table <code>viterbi_mtbl</code> generated from testing 
feature generation <code>crf_test_fgen</code>. </dd>
+<dt>viterbi_rtbl </dt>
+<dd>TEXT. Name of the table <code>viterbi_rtbl</code> generated from testing 
feature generation <code>crf_test_fgen</code>. </dd>
+<dt>label_tbl </dt>
+<dd>TEXT. Name of the label table. </dd>
+<dt>result_tbl </dt>
+<dd>TEXT. Name of the result table to be created, containing the extracted 
best label sequences. </dd>
+</dl>
+</dd></dl>
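+<p>Once <code>vcrf_label</code> has run, the extracted labels can be read 
straight from the result table; a sketch, assuming the table name used above:</p>
+<pre class="example">
+SELECT * FROM result_tbl ORDER BY doc_id, start_pos;
+</pre>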
+<p><a class="anchor" id="usage"></a></p><dl class="section user"><dt>Using 
CRF</dt><dd></dd></dl>
+<p>Generate text features, calculate their weights, and output the best label 
sequence for test data:<br />
+</p><ol type="1">
+<li>Perform feature generation on the training data, i.e. 
<code>train_segment_tbl</code>, generating <code>train_feature_tbl</code> and 
<code>train_featureset_tbl</code>. <pre>SELECT madlib.crf_train_fgen(
+         '<em>train_segment_tbl</em>',
+         '<em>regex_tbl</em>',
+         '<em>label_tbl</em>',
+         '<em>dictionary_tbl</em>',
+         '<em>train_feature_tbl</em>',
+         '<em>train_featureset_tbl</em>');</pre></li>
+<li>Use linear-chain CRF for training, providing <code>train_feature_tbl</code> 
and <code>train_featureset_tbl</code> generated in the previous step as input. 
<pre>SELECT madlib.lincrf_train(
+         '<em>train_feature_tbl</em>',
+         '<em>train_featureset_tbl</em>',
+         '<em>label_tbl</em>',
+         '<em>crf_stats_tbl</em>',
+         '<em>crf_weights_tbl</em>',
+         <em>max_iterations</em>);</pre></li>
+<li>Perform feature generation on the testing data <code>test_segment_tbl</code>, 
generating <code>viterbi_mtbl</code> and <code>viterbi_rtbl</code> required for 
inference. <pre>SELECT madlib.crf_test_fgen(
+         '<em>test_segment_tbl</em>',
+         '<em>dictionary_tbl</em>',
+         '<em>label_tbl</em>',
+         '<em>regex_tbl</em>',
+         '<em>crf_weights_tbl</em>',
+         '<em>viterbi_mtbl</em>',
+         '<em>viterbi_rtbl</em>');</pre></li>
+<li>Run the Viterbi function to get the best label sequence and the 
conditional probability <img class="formulaInl" alt="$ \Pr( \text{best label 
sequence} \mid \text{sequence}) $" src="form_56.png"/>. <pre>SELECT 
madlib.vcrf_label(
+         '<em>test_segment_tbl</em>',
+         '<em>viterbi_mtbl</em>',
+         '<em>viterbi_rtbl</em>',
+         '<em>label_tbl</em>',
+         '<em>result_tbl</em>');</pre></li>
+</ol>
+<p><a class="anchor" id="examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd>This example uses a trivial training and test data 
set.</dd></dl>
+<ol type="1">
+<li>Load the label table, the regular expressions table, and the training 
segment table: <pre class="example">
+SELECT * FROM crf_label ORDER BY id;
+</pre> Result: <pre class="result">
+ id | label
+&#160;---+-------
+  0 | #
+  1 | $
+  2 | ''
+...
+  8 | CC
+  9 | CD
+ 10 | DT
+ 11 | EX
+ 12 | FW
+ 13 | IN
+ 14 | JJ
+...
+</pre> The regular expressions table: <pre class="example">
+SELECT * from crf_regex;
+</pre> <pre class="result">
+    pattern    |         name
+&#160;--------------+----------------------
+ ^.+ing$       | endsWithIng
+ ^[A-Z][a-z]+$ | InitCapital
+ ^[A-Z]+$      | isAllCapital
+ ^.*[0-9]+.*$  | containsDigit
+...
+</pre> The training segment table: <pre class="example">
+SELECT * from train_segmenttbl ORDER BY doc_id, start_pos;
+</pre> <pre class="result">
+ doc_id | start_pos |  seg_text  | label
+&#160;-------+-----------+------------+-------
+      0 |         0 | Confidence |    18
+      0 |         1 | in         |    13
+      0 |         2 | the        |    10
+      0 |         3 | pound      |    18
+      0 |         4 | is         |    38
+      0 |         5 | widely     |    26
+...
+      1 |         0 | Chancellor |    19
+      1 |         1 | of         |    13
+      1 |         2 | the        |    10
+      1 |         3 | Exchequer  |    19
+      1 |         4 | Nigel      |    19
+...
+</pre></li>
+<li>Generate the training features: <pre class="example">
+SELECT crf_train_fgen( 'train_segmenttbl',
+                       'crf_regex',
+                       'crf_label',
+                       'crf_dictionary',
+                       'train_featuretbl',
+                       'train_featureset'
+                     );
+SELECT * from crf_dictionary;
+</pre> Result: <pre class="result">
+     token       | total
+&#160;----------------+-------
+ Hawthorne       |     1
+ Mercedes-Benzes |     1
+ Wolf            |     3
+ best-known      |     1
+ hairline        |     1
+ accepting       |     2
+ purchases       |    14
+ trash           |     5
+ co-venture      |     1
+ restaurants     |     7
+...
+</pre> <pre class="example">
+SELECT * from train_featuretbl;
+</pre> Result: <pre class="result">
+ doc_id | f_size |            sparse_r           |             dense_m             |       sparse_m
+&#160;-------+--------+-------------------------------+---------------------------------+-----------------------
+      2 |     87 | {-1,13,12,0,1,-1,13,9,0,1,..} | {13,31,79,1,1,31,29,70,2,1,...} | {51,26,2,69,29,17,...}
+      1 |     87 | {-1,13,0,0,1,-1,13,9,0,1,...} | {13,0,62,1,1,0,13,54,2,1,13,..} | {51,26,2,69,29,17,...}
+</pre> <pre class="example">
+SELECT * from train_featureset;
+</pre> <pre class="result">
+ f_index |    f_name     | feature
+&#160;--------+---------------+---------
+       1 | R_endsWithED  | {-1,29}
+      13 | W_outweigh    | {-1,26}
+      29 | U             | {-1,5}
+      31 | U             | {-1,29}
+      33 | U             | {-1,12}
+      35 | W_a           | {-1,2}
+      37 | W_possible    | {-1,6}
+      15 | W_signaled    | {-1,29}
+      17 | End.          | {-1,43}
+      49 | W_'s          | {-1,16}
+      63 | W_acquire     | {-1,26}
+      51 | E.            | {26,2}
+      69 | E.            | {29,17}
+      71 | E.            | {2,11}
+      83 | W_the         | {-1,2}
+      85 | E.            | {16,11}
+       4 | W_return      | {-1,11}
+...
+</pre></li>
+<li>Train using linear CRF: <pre class="example">
+SELECT lincrf_train( 'train_featuretbl',
+                     'train_featureset',
+                     'crf_label',
+                     'crf_stats_tbl',
+                     'crf_weights_tbl',
+                     20
+             );
+</pre> <pre class="result">
+                                lincrf_train
+&#160;-----------------------------------------------------------------------------------
+ CRF Train successful. Results stored in the specified CRF stats and weights table
+ lincrf
+</pre> View the feature weight table. <pre class="example">
+SELECT * from crf_weights_tbl;
+</pre> Result: <pre class="result">
+ id |     name      | prev_label_id | label_id |      weight
+&#160;---+---------------+---------------+----------+-------------------
+  1 | R_endsWithED  |            -1 |       29 |  1.54128249293937
+ 13 | W_outweigh    |            -1 |       26 |  1.70691232223653
+ 29 | U             |            -1 |        5 |  1.40708515869008
+ 31 | U             |            -1 |       29 | 0.830356200936407
+ 33 | U             |            -1 |       12 | 0.769587378281239
+ 35 | W_a           |            -1 |        2 |  2.68470625883726
+ 37 | W_possible    |            -1 |        6 |  3.41773107604468
+ 15 | W_signaled    |            -1 |       29 |  1.68187039165771
+ 17 | End.          |            -1 |       43 |  3.07687845517082
+ 49 | W_'s          |            -1 |       16 |  2.61430312229883
+ 63 | W_acquire     |            -1 |       26 |  1.67247047385797
+ 51 | E.            |            26 |        2 |   3.0114240119435
+ 69 | E.            |            29 |       17 |  2.82385531733866
+ 71 | E.            |             2 |       11 |  3.00970493772732
+ 83 | W_the         |            -1 |        2 |  2.58742315259326
+...
+</pre></li>
+<li>To find the best labels for a test set using the trained linear CRF model, 
generate the test features as in steps 1 and 2, except that instead of creating 
a new dictionary, the dictionary generated from the training set is reused. <pre 
class="example">
+SELECT * from test_segmenttbl ORDER BY doc_id, start_pos;
+</pre> Result: <pre class="result">
+ doc_id | start_pos |   seg_text
+&#160;-------+-----------+---------------
+      0 |         0 | Rockwell
+      0 |         1 | International
+      0 |         2 | Corp.
+      0 |         3 | 's
+      0 |         4 | Tulsa
+      0 |         5 | unit
+      0 |         6 | said
+...
+      1 |         0 | Rockwell
+      1 |         1 | said
+      1 |         2 | the
+      1 |         3 | agreement
+      1 |         4 | calls
+...
+</pre> <pre class="example">
+SELECT crf_test_fgen( 'test_segmenttbl',
+                      'crf_dictionary',
+                      'crf_label',
+                      'crf_regex',
+                      'crf_weights_tbl',
+                      'viterbi_mtbl',
+                      'viterbi_rtbl'
+                    );
+</pre></li>
+<li>Calculate the best label sequence and save in the table 
<code>extracted_best_labels</code>. <pre class="example">
+SELECT vcrf_label( 'test_segmenttbl',
+                   'viterbi_mtbl',
+                   'viterbi_rtbl',
+                   'crf_label',
+                   'extracted_best_labels'
+                 );
+</pre> View the best labels. <pre class="example">
+SELECT * FROM extracted_best_labels;
+</pre> Result: <pre class="result">
+ doc_id | start_pos |   seg_text    | label | id | max_pos |   prob
+&#160;-------+-----------+---------------+-------+----+---------+----------
+      0 |         0 | Rockwell      | NNP   | 19 |      27 | 0.000269
+      0 |         1 | International | NNP   | 19 |      27 | 0.000269
+      0 |         2 | Corp.         | NNP   | 19 |      27 | 0.000269
+      0 |         3 | 's            | NNP   | 19 |      27 | 0.000269
+...
+      1 |         0 | Rockwell      | NNP   | 19 |      16 | 0.000168
+      1 |         1 | said          | NNP   | 19 |      16 | 0.000168
+      1 |         2 | the           | DT    | 10 |      16 | 0.000168
+      1 |         3 | agreement     | JJ    | 14 |      16 | 0.000168
+      1 |         4 | calls         | NNS   | 21 |      16 | 0.000168
+...
+</pre></li>
+</ol>
+<p><a class="anchor" id="background"></a></p><dl class="section 
user"><dt>Technical Background</dt><dd></dd></dl>
+<p>A linear-chain CRF is a distribution defined by </p><p 
class="formulaDsp">
+<img class="formulaDsp" alt="\[ p_\lambda(\boldsymbol y | \boldsymbol x) = 
\frac{\exp{\sum_{m=1}^M \lambda_m F_m(\boldsymbol x, \boldsymbol 
y)}}{Z_\lambda(\boldsymbol x)} \,. \]" src="form_57.png"/>
+</p>
+<p>where</p><ul>
+<li><img class="formulaInl" alt="$ F_m(\boldsymbol x, \boldsymbol y) = 
\sum_{i=1}^n f_m(y_i,y_{i-1},x_i) $" src="form_58.png"/> is a global feature 
function that is a sum along a sequence <img class="formulaInl" alt="$ 
\boldsymbol x $" src="form_59.png"/> of length <img class="formulaInl" alt="$ n 
$" src="form_11.png"/></li>
+<li><img class="formulaInl" alt="$ f_m(y_i,y_{i-1},x_i) $" src="form_60.png"/> 
is a local feature function dependent on the current token label <img 
class="formulaInl" alt="$ y_i $" src="form_61.png"/>, the previous token label 
<img class="formulaInl" alt="$ y_{i-1} $" src="form_62.png"/>, and the 
observation <img class="formulaInl" alt="$ x_i $" src="form_63.png"/></li>
+<li><img class="formulaInl" alt="$ \lambda_m $" src="form_64.png"/> is the 
corresponding feature weight</li>
+<li><img class="formulaInl" alt="$ Z_\lambda(\boldsymbol x) $" 
src="form_65.png"/> is an instance-specific normalizer <p class="formulaDsp">
+<img class="formulaDsp" alt="\[ Z_\lambda(\boldsymbol x) = \sum_{\boldsymbol 
y'} \exp{\sum_{m=1}^M \lambda_m F_m(\boldsymbol x, \boldsymbol y')} \]" 
src="form_66.png"/>
+</p>
+</li>
+</ul>
+<p>A linear-chain CRF estimates the weights <img class="formulaInl" alt="$ 
\lambda_m $" src="form_64.png"/> by maximizing the log-likelihood of a given 
training set <img class="formulaInl" alt="$ T=\{(x_k,y_k)\}_{k=1}^N $" 
src="form_67.png"/>.</p>
+<p>The log-likelihood is defined as </p><p class="formulaDsp">
+<img class="formulaDsp" alt="\[ \ell_{\lambda}=\sum_k \log p_\lambda(y_k|x_k) 
=\sum_k[\sum_{m=1}^M \lambda_m F_m(x_k,y_k) - \log Z_\lambda(x_k)] \]" 
src="form_68.png"/>
+</p>
+<p>and the zero of its gradient </p><p class="formulaDsp">
+<img class="formulaDsp" alt="\[ \nabla 
\ell_{\lambda}=\sum_k[F(x_k,y_k)-E_{p_\lambda(Y|x_k)}[F(x_k,Y)]] \]" 
src="form_69.png"/>
+</p>
+<p>is sought, since the maximum likelihood is reached when the empirical average 
of the global feature vector equals its model expectation. The MADlib 
implementation uses limited-memory BFGS (L-BFGS), a limited-memory variation of 
the Broyden–Fletcher–Goldfarb–Shanno (BFGS) update, a quasi-Newton method 
for unconstrained optimization.</p>
+<p><img class="formulaInl" alt="$E_{p_\lambda(Y|x)}[F(x,Y)]$" 
src="form_70.png"/> is found by using a variant of the forward-backward 
algorithm: </p><p class="formulaDsp">
+<img class="formulaDsp" alt="\[ E_{p_\lambda(Y|x)}[F(x,Y)] = \sum_y 
p_\lambda(y|x)F(x,y) = 
\sum_i\frac{\alpha_{i-1}(f_i*M_i)\beta_i^T}{Z_\lambda(x)} \]" 
src="form_71.png"/>
+</p>
+ <p class="formulaDsp">
+<img class="formulaDsp" alt="\[ Z_\lambda(x) = \alpha_n.1^T \]" 
src="form_72.png"/>
+</p>
+<p> where <img class="formulaInl" alt="$\alpha_i$" src="form_73.png"/> and 
<img class="formulaInl" alt="$ \beta_i$" src="form_74.png"/> are the forward 
and backward state cost vectors defined by </p><p class="formulaDsp">
+<img class="formulaDsp" alt="\[ \alpha_i = \begin{cases} \alpha_{i-1}M_i, 
&amp; 0&lt;i&lt;=n\\ 1, &amp; i=0 \end{cases}\\ \]" src="form_75.png"/>
+</p>
+ <p class="formulaDsp">
+<img class="formulaDsp" alt="\[ \beta_i^T = \begin{cases} 
M_{i+1}\beta_{i+1}^T, &amp; 1&lt;=i&lt;n\\ 1, &amp; i=n \end{cases} \]" 
src="form_76.png"/>
+</p>
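+<p>Here M<sub>i</sub> is the matrix of transition costs at position 
<em>i</em>, with entries M<sub>i</sub>[y', y] = 
exp(&sum;<sub>m</sub> &lambda;<sub>m</sub> f<sub>m</sub>(y, y', 
x<sub>i</sub>)), and f<sub>i</sub> is the corresponding vector of local 
feature values, following the notation of Sha and Pereira [1].</p>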
+<p>To avoid overfitting, we penalize the likelihood with a spherical Gaussian 
weight prior: </p><p class="formulaDsp">
+<img class="formulaDsp" alt="\[ \ell_{\lambda}^\prime=\sum_k[\sum_{m=1}^M 
\lambda_m F_m(x_k,y_k) - \log Z_\lambda(x_k)] - \frac{\lVert \lambda 
\rVert^2}{2\sigma ^2} \]" src="form_77.png"/>
+</p>
+<p class="formulaDsp">
+<img class="formulaDsp" alt="\[ \nabla \ell_{\lambda}^\prime=\sum_k[F(x_k,y_k) 
- E_{p_\lambda(Y|x_k)}[F(x_k,Y)]] - \frac{\lambda}{\sigma ^2} \]" 
src="form_78.png"/>
+</p>
+<dl class="section user"><dt>Literature</dt><dd>[1] F. Sha, F. Pereira. 
Shallow Parsing with Conditional Random Fields, <a 
href="http://www-bcf.usc.edu/~feisha/pubs/shallow03.pdf";>http://www-bcf.usc.edu/~feisha/pubs/shallow03.pdf</a></dd></dl>
+<p>[2] Wikipedia, Conditional Random Field, <a 
href="http://en.wikipedia.org/wiki/Conditional_random_field";>http://en.wikipedia.org/wiki/Conditional_random_field</a></p>
+<p>[3] A. Jaiswal, S.Tawari, I. Mansuri, K. Mittal, C. Tiwari (2012), CRF, <a 
href="http://crf.sourceforge.net/";>http://crf.sourceforge.net/</a></p>
+<p>[4] D. Wang, ViterbiCRF, <a 
href="http://www.cs.berkeley.edu/~daisyw/ViterbiCRF.html";>http://www.cs.berkeley.edu/~daisyw/ViterbiCRF.html</a></p>
+<p>[5] Wikipedia, Viterbi Algorithm, <a 
href="http://en.wikipedia.org/wiki/Viterbi_algorithm";>http://en.wikipedia.org/wiki/Viterbi_algorithm</a></p>
+<p>[6] J. Nocedal. Updating Quasi-Newton Matrices with Limited Storage (1980), 
Mathematics of Computation 35, pp. 773-782</p>
+<p>[7] J. Nocedal, Software for Large-scale Unconstrained Optimization, <a 
href="http://users.eecs.northwestern.edu/~nocedal/lbfgs.html";>http://users.eecs.northwestern.edu/~nocedal/lbfgs.html</a></p>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
Topics</dt><dd></dd></dl>
+<p>File <a class="el" href="crf_8sql__in.html" title="SQL functions for 
conditional random field. ">crf.sql_in</a> <a class="el" 
href="crf__feature__gen_8sql__in.html" title="SQL function for POS/NER feature 
extraction. ">crf_feature_gen.sql_in</a> <a class="el" 
href="viterbi_8sql__in.html" title="concatenate a set of input values into 
arrays to feed into viterbi c function and create a human 
read...">viterbi.sql_in</a> (documenting the SQL functions) </p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Tue May 16 2017 13:24:38 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/b5b51c69/docs/v1.11/group__grp__datatrans.html
----------------------------------------------------------------------
diff --git a/docs/v1.11/group__grp__datatrans.html 
b/docs/v1.11/group__grp__datatrans.html
new file mode 100644
index 0000000..4865a54
--- /dev/null
+++ b/docs/v1.11/group__grp__datatrans.html
@@ -0,0 +1,145 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Data Types and Transformations</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.incubator.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.incubator.apache.org";><img 
alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ 
></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.11</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__datatrans.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="summary">
+<a href="#groups">Modules</a>  </div>
+  <div class="headertitle">
+<div class="title">Data Types and Transformations</div>  </div>
+</div><!--header-->
+<div class="contents">
+<a name="details" id="details"></a><h2 class="groupheader">Detailed 
Description</h2>
+<p>Data types and transformation operations </p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a 
name="groups"></a>
+Modules</h2></td></tr>
+<tr class="memitem:group__grp__arraysmatrix"><td class="memItemLeft" 
align="right" valign="top">&#160;</td><td class="memItemRight" 
valign="bottom"><a class="el" href="group__grp__arraysmatrix.html">Arrays and 
Matrices</a></td></tr>
+<tr class="memdesc:group__grp__arraysmatrix"><td 
class="mdescLeft">&#160;</td><td class="mdescRight">Mathematical operations for 
arrays and matrices. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__pca"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__pca.html">Dimensionality Reduction</a></td></tr>
+<tr class="memdesc:group__grp__pca"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">A collection of methods for dimensionality reduction. <br 
/></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__encode__categorical"><td class="memItemLeft" 
align="right" valign="top">&#160;</td><td class="memItemRight" 
valign="bottom"><a class="el" 
href="group__grp__encode__categorical.html">Encoding Categorical 
Variables</a></td></tr>
+<tr class="memdesc:group__grp__encode__categorical"><td 
class="mdescLeft">&#160;</td><td class="mdescRight">Provides functions to 
encode categorical variables. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__pivot"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__pivot.html">Pivot</a></td></tr>
+<tr class="memdesc:group__grp__pivot"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Provides pivoting functions helpful for data preparation 
before modeling. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__stemmer"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__stemmer.html">Stemming</a></td></tr>
+<tr class="memdesc:group__grp__stemmer"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Provides porter stemmer operations supporting other MADlib 
modules. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Tue May 16 2017 13:24:38 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/b5b51c69/docs/v1.11/group__grp__datatrans.js
----------------------------------------------------------------------
diff --git a/docs/v1.11/group__grp__datatrans.js 
b/docs/v1.11/group__grp__datatrans.js
new file mode 100644
index 0000000..484bba6
--- /dev/null
+++ b/docs/v1.11/group__grp__datatrans.js
@@ -0,0 +1,8 @@
+var group__grp__datatrans =
+[
+    [ "Arrays and Matrices", "group__grp__arraysmatrix.html", 
"group__grp__arraysmatrix" ],
+    [ "Dimensionality Reduction", "group__grp__pca.html", "group__grp__pca" ],
+    [ "Encoding Categorical Variables", 
"group__grp__encode__categorical.html", null ],
+    [ "Pivot", "group__grp__pivot.html", null ],
+    [ "Stemming", "group__grp__stemmer.html", null ]
+];
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/b5b51c69/docs/v1.11/group__grp__decision__tree.html
----------------------------------------------------------------------
diff --git a/docs/v1.11/group__grp__decision__tree.html 
b/docs/v1.11/group__grp__decision__tree.html
new file mode 100644
index 0000000..725d187
--- /dev/null
+++ b/docs/v1.11/group__grp__decision__tree.html
@@ -0,0 +1,728 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Decision Tree</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.incubator.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.incubator.apache.org";><img 
alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ 
></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.11</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__decision__tree.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Decision Tree<div class="ingroups"><a class="el" 
href="group__grp__super.html">Supervised Learning</a> &raquo; <a class="el" 
href="group__grp__tree.html">Tree Methods</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b><ul>
+<li class="level1">
+<a href="#train">Training Function</a> </li>
+<li class="level1">
+<a href="#predict">Prediction Function</a> </li>
+<li class="level1">
+<a href="#display">Display Function</a> </li>
+<li class="level1">
+<a href="#examples">Examples</a> </li>
+<li class="level1">
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>A decision tree is a supervised learning method that can be used for 
classification and regression. It consists of a structure in which internal 
nodes represent tests on attributes, and the branches from nodes represent the 
result of those tests. Each leaf node holds a class label, and the paths from 
root to leaf define the set of classification or regression rules.</p>
+<p><a class="anchor" id="train"></a></p><dl class="section user"><dt>Training 
Function</dt><dd>We implement the decision tree using the CART algorithm, 
introduced by Breiman et al. [1]. The training function has the following 
syntax: <pre class="syntax">
+tree_train(
+    training_table_name,
+    output_table_name,
+    id_col_name,
+    dependent_variable,
+    list_of_features,
+    list_of_features_to_exclude,
+    split_criterion,
+    grouping_cols,
+    weights,
+    max_depth,
+    min_split,
+    min_bucket,
+    num_splits,
+    pruning_params,
+    surrogate_params,
+    verbosity
+    )
+</pre> <b>Arguments</b> <dl class="arglist">
+<dt>training_table_name </dt>
+<dd><p class="startdd">TEXT. The name of the table containing the training 
data.</p>
+<p class="enddd"></p>
+</dd>
+<dt>output_table_name </dt>
+<dd><p class="startdd">TEXT. The name of the generated table containing the 
model. If a table with the same name already exists, then the function will 
return an error.</p>
+<p>The model table produced by the training function contains the following 
columns:</p>
+<table class="output">
+<tr>
+<th>&lt;...&gt; </th><td>Grouping columns, if provided as input, with the same 
types as in the training table. This could be multiple columns depending on the 
<code>grouping_cols</code> input.  </td></tr>
+<tr>
+<th>tree </th><td>BYTEA8. Trained decision tree model stored in a binary 
format.  </td></tr>
+<tr>
+<th>cat_levels_in_text </th><td>TEXT[]. Ordered levels of categorical 
variables.  </td></tr>
+<tr>
+<th>cat_n_levels </th><td><p class="starttd">INTEGER[]. Number of levels for 
each categorical variable. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>tree_depth </th><td><p class="starttd">INTEGER. The maximum depth of the 
tree obtained after training (the root has depth 0). </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>pruning_cp </th><td><p class="starttd">DOUBLE PRECISION. The 
cost-complexity parameter used for pruning the trained tree(s). This can differ 
from the 'input_cp' value if cross-validation is used.  </p>
+<p class="endtd"></p>
+</td></tr>
+</table>
+<p>A summary table named <em>&lt;model_table&gt;_summary</em> is also created 
at the same time, which has the following columns: </p><table class="output">
+<tr>
+<th>method </th><td><p class="starttd">TEXT. 'tree_train' </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>is_classification </th><td><p class="starttd">BOOLEAN. TRUE if the 
decision trees are for classification, FALSE if for regression. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>source_table </th><td><p class="starttd">TEXT. The data source table name. 
</p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>model_table </th><td><p class="starttd">TEXT. The model table name. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>id_col_name </th><td><p class="starttd">TEXT. The ID column name. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>dependent_varname </th><td><p class="starttd">TEXT. The dependent 
variable. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>independent_varname </th><td><p class="starttd">TEXT. The independent 
variables. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>cat_features </th><td><p class="starttd">TEXT. The list of categorical 
feature names as a comma-separated string. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>con_features </th><td><p class="starttd">TEXT. The list of continuous 
feature names as a comma-separated string. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>grouping_col </th><td><p class="starttd">TEXT. Names of grouping columns. 
</p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_all_groups </th><td><p class="starttd">INTEGER. Number of groups in 
decision tree training. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_failed_groups </th><td><p class="starttd">INTEGER. Number of failed 
groups in decision tree training. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>total_rows_processed </th><td><p class="starttd">BIGINT. Total number of 
rows processed in all groups. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>total_rows_skipped </th><td><p class="starttd">BIGINT. Total number of 
rows skipped in all groups due to missing values or failures. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>dependent_var_levels </th><td><p class="starttd">TEXT. For classification, 
the distinct levels of the dependent variable. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>dependent_var_type </th><td><p class="starttd">TEXT. The type of dependent 
variable. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>input_cp </th><td><p class="starttd">DOUBLE PRECISION. The complexity 
parameter (cp) used for pruning the trained tree(s) before cross-validation is 
run. This is the same as the cp value provided in <em>pruning_params</em>. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>independent_var_types </th><td><p class="starttd">TEXT. A comma-separated 
string giving the types of the independent variables. </p>
+<p class="endtd"></p>
+</td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>id_col_name </dt>
+<dd><p class="startdd">TEXT. Name of the column containing id information in 
the training data. This is a mandatory argument and is used for prediction and 
cross-validation. The values are expected to be unique for each row. </p>
+<p class="enddd"></p>
+</dd>
+<dt>dependent_variable </dt>
+<dd><p class="startdd">TEXT. Name of the column that contains the output 
(response) for training. Boolean, integer and text types are considered 
classification outputs, while double precision values are considered regression 
outputs. The response variable for a classification tree can be multinomial, 
but the time and space complexity of the training function increases linearly 
as the number of response classes increases.</p>
+<p class="enddd"></p>
+</dd>
+<dt>list_of_features </dt>
+<dd><p class="startdd">TEXT. Comma-separated string of column names to use as 
predictors. Can also be a '*' implying all columns are to be used as predictors 
(except the ones included in the next argument). The types of the features can 
be mixed where boolean, integer, and text columns are considered categorical 
and double precision columns are considered continuous. The categorical 
variables are not encoded and are used as-is for training.</p>
+<p>It is important to note that we don't test for every combination of levels 
of a categorical variable when evaluating a split. We order the levels of the 
non-integer categorical variable by the entropy of the variable in predicting 
the response. The split at each node is evaluated between these ordered levels. 
Integer categorical variables are ordered by their value. </p>
+<p class="enddd"></p>
+</dd>
+<dt>list_of_features_to_exclude </dt>
+<dd><p class="startdd">TEXT. Comma-separated string of column names to exclude 
from the predictors list. If the <em>dependent_variable</em> is an expression 
(including cast of a column name), then this list should include all columns 
present in the <em>dependent_variable</em> expression, otherwise those columns 
will be included in the features. The names in this parameter should be 
identical to the names used in the table and quoted appropriately.</p>
+<p class="enddd"></p>
+</dd>
+<dt>split_criterion </dt>
+<dd><p class="startdd">TEXT, default = 'gini' for classification, 'mse' for 
regression. Impurity function to compute the feature to use for the split. 
Supported criteria are 'gini', 'entropy', 'misclassification' for 
classification trees. For regression trees, split_criterion of 'mse' is always 
used (irrespective of the input for this argument). </p>
+<p class="enddd"></p>
+</dd>
+<dt>grouping_cols (optional) </dt>
+<dd><p class="startdd">TEXT, default: NULL. Comma-separated list of column 
names to group the data by. This will result in multiple decision trees, one 
for each group.</p>
+<p class="enddd"></p>
+</dd>
+<dt>weights (optional) </dt>
+<dd><p class="startdd">TEXT. Column name containing weights for each 
observation.</p>
+<p class="enddd"></p>
+</dd>
+<dt>max_depth (optional) </dt>
+<dd><p class="startdd">INTEGER, default: 7. Maximum depth of any node of the 
final tree, with the root node counted as depth 0. A deeper tree can lead to 
better prediction but will also result in longer processing time and higher 
memory usage.</p>
+<p class="enddd"></p>
+</dd>
+<dt>min_split (optional) </dt>
+<dd><p class="startdd">INTEGER, default: 20. Minimum number of observations 
that must exist in a node for a split to be attempted. The best value for this 
parameter depends on the number of tuples in the dataset.</p>
+<p class="enddd"></p>
+</dd>
+<dt>min_bucket (optional) </dt>
+<dd><p class="startdd">INTEGER, default: min_split/3. Minimum number of 
observations in any terminal node. If only one of min_bucket or min_split is 
specified, min_split is set to min_bucket*3 or min_bucket to min_split/3, as 
appropriate.</p>
+<p class="enddd"></p>
+</dd>
+<dt>num_splits (optional) </dt>
+<dd><p class="startdd">INTEGER, default: 20. Continuous-valued features are 
binned into discrete quantiles to compute split boundaries. This global 
parameter is used to compute the resolution of splits for continuous features. 
Higher number of bins will lead to better prediction, but will also result in 
longer processing time and higher memory usage.</p>
+<p class="enddd"></p>
+</dd>
+<dt>pruning_params (optional) </dt>
+<dd><p class="startdd">TEXT. Comma-separated string of key-value pairs giving 
the parameters for pruning the tree. The parameters currently accepted are: 
</p><table class="output">
+<tr>
+<th>cp </th><td><p class="starttd">Default: 0. A split on a node is attempted 
only if it decreases the overall lack of fit by a factor of 'cp', else the 
split is pruned away. This value is used to create an initial tree before 
running cross-validation (see below).</p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>n_folds </th><td><p class="starttd">Default: 0 (i.e. no cross-validation). Number of cross-validation folds used to compute the best value of <em>cp</em>. To perform cross-validation, a positive value of <em>n_folds</em> (greater than 2) should be given. An additional output table <em>&lt;model_table&gt;_cv</em> is created containing the evaluated values of <em>cp</em> and the corresponding cross-validation errors. The tree returned in the output table corresponds to the <em>cp</em> with the lowest cross-validation error (we pick the maximum <em>cp</em> if multiple values have the same error).</p>
+<p>The list of <em>cp</em> values is automatically computed by parsing through 
the tree initially trained on the complete dataset. The tree output is a subset 
of this initial tree corresponding to the best computed <em>cp</em>.</p>
+<p class="endtd"></p>
+</td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>surrogate_params </dt>
+<dd><p class="startdd">TEXT. Comma-separated string of key-value pairs 
controlling the behavior of surrogate splits for each node. A surrogate 
variable is another predictor variable that is associated (correlated) with the 
primary predictor variable for a split. The surrogate variable comes into use 
when the primary predictior value is NULL. This parameter currently accepts one 
argument: </p><table class="output">
+<tr>
+<th>max_surrogates </th><td>Default: 0. Number of surrogates to store for each 
node.  </td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>verbosity (optional) </dt>
+<dd>BOOLEAN, default: FALSE. Provides verbose output of the training result. 
</dd>
+</dl>
+</dd></dl>
+<dl class="section note"><dt>Note</dt><dd><ul>
+<li>Many of the parameters are designed to be similar to the popular R package 'rpart'. An important distinction between rpart and the MADlib function is that for both response and feature variables, MADlib considers integer values as categorical, while rpart considers them continuous. To use integers as continuous, please cast them to double precision (see the sketch after this note).</li>
+<li>Integer values are ordered by value for computing the split boundaries. 
Please cast to TEXT if the entropy-based ordering method is desired.</li>
+<li>When using no surrogates (<em>max_surrogates</em>=0), all rows containing 
NULL values for any of the features used for training will be ignored from 
training and prediction.</li>
+<li>When cross-validation is not used (<em>n_folds</em>=0), each tree output is pruned by the input cost-complexity (<em>cp</em>). With cross-validation, the input <em>cp</em> is the minimum of all the explored values of 'cp'. During cross-validation, we train an initial tree using the provided <em>cp</em> and explore all possible sub-trees (down to a single-node tree) to compute the optimal sub-tree. The optimal sub-tree and the 'cp' corresponding to this optimal sub-tree are placed in the <em>output_table</em>, in the columns named <em>tree</em> and <em>pruning_cp</em> respectively.</li>
+<li>The main parameters that affect memory usage are: depth of the tree (<em>max_depth</em>), number of features, number of values per categorical feature, and number of bins for continuous features (<em>num_splits</em>). If you are hitting memory limits, consider reducing one or more of these parameters.</li>
+</ul>
+</dd></dl>
+<p><a class="anchor" id="predict"></a></p><dl class="section 
user"><dt>Prediction Function</dt><dd>The prediction function estimates the 
conditional mean given a new predictor. It has the following syntax: <pre 
class="syntax">
+tree_predict(tree_model,
+             new_data_table,
+             output_table,
+             type)
+</pre></dd></dl>
+<p><b>Arguments</b> </p><dl class="arglist">
+<dt>tree_model </dt>
+<dd><p class="startdd">TEXT. Name of the table containing the decision tree 
model. This should be the output table returned from <em>tree_train.</em></p>
+<p class="enddd"></p>
+</dd>
+<dt>new_data_table </dt>
+<dd><p class="startdd">TEXT. Name of the table containing prediction data. 
This table is expected to contain the same features that were used during 
training. The table should also contain <em>id_col_name</em> used for 
identifying each row.</p>
+<p class="enddd"></p>
+</dd>
+<dt>output_table </dt>
+<dd><p class="startdd">TEXT. Name of the table to output prediction results. 
If this table already exists, an error is returned. The table contains the 
<em>id_col_name</em> column giving the 'id' for each prediction and the 
prediction columns for the dependent variable.</p>
+<p>If <em>type</em> = 'response', then the table has a single additional 
column with the prediction value of the response. The type of this column 
depends on the type of the response variable used during training.</p>
+<p>If <em>type</em> = 'prob', then the table has multiple additional columns, 
one for each possible value of the response variable. The columns are labeled 
as 'estimated_prob_<em>dep_value</em>', where <em>dep_value</em> represents 
each value of the response variable.</p>
+<p class="enddd"></p>
+</dd>
+<dt>type </dt>
+<dd>TEXT, optional, default: 'response'. For regression trees, the output is always the predicted value of the dependent variable. For classification trees, the <em>type</em> variable can be 'response', giving the classification prediction as output, or 'prob', giving the class probabilities as output. For each value of the dependent variable, a column with the probabilities is added to the output table (see the sketch after the note below).  </dd>
+</dl>
+<dl class="section note"><dt>Note</dt><dd>If the <em>new_data_table</em> 
contains categories of categorical variables not seen in the training data, the 
prediction for that row will be NULL.</dd></dl>
+<p><a class="anchor" id="display"></a></p><dl class="section user"><dt>Display 
Function</dt><dd>The display function outputs a graph representation of the 
decision tree. The output can either be in the popular 'dot' format that can be 
visualized using various programs including those in the GraphViz package, or 
in a simple text format. The details of the text format are output with the 
tree. <pre class="syntax">
+tree_display(tree_model, dot_format, verbosity)
+</pre></dd></dl>
+<p>An additional display function is provided to output the surrogate splits chosen for each internal node: </p><pre class="syntax">
+tree_surr_display(tree_model)
+</pre><p>The output contains the list of surrogate splits for each internal node. The nodes are sorted in ascending order by id, which is equivalent to viewing the tree in a breadth-first manner. For each surrogate, we output the surrogate split (variable and threshold) and the number of rows that were common between the primary split and the surrogate split. Finally, the number of rows present in the majority branch of the primary split is also shown. Only surrogates that perform better than this majority branch are included in the surrogate list. When the primary variable has a NULL value, the surrogate variables are used in order to compute the split for that node. If all surrogate variables are NULL, then the majority branch is used to compute the split for a tuple.</p>
+<p><b>Arguments</b> </p><dl class="arglist">
+<dt>tree_model </dt>
+<dd>TEXT. Name of the table containing the decision tree model. </dd>
+<dt>dot_format </dt>
+<dd>BOOLEAN, default = TRUE. Output can either be in a dot format or a text 
format. If TRUE, the result is in the dot format, else output is in text 
format. </dd>
+<dt>verbosity </dt>
+<dd>BOOLEAN, default = FALSE. If set to TRUE, the dot format output will 
contain additional information (impurity, sample size, number of weighted rows 
for each response variable, classification or prediction if the tree was pruned 
at this level) </dd>
+</dl>
+<p>The output is always returned as 'TEXT'. For the dot format, the output can be redirected to a file on the client side and then rendered using visualization programs.</p>
+<p>To export the dot format result to an external file, use the method below: start psql in unaligned table output mode with the '-A' flag, and inside the psql client use both '\t' and '\o':</p>
+<pre class="example">
+&gt; # under bash
+&gt; psql -A my_database
+# -- in psql now
+# \t
+# \o test.dot -- export to a file
+# select madlib.tree_display('tree_out');
+# \o
+# \t
+</pre><p>After the dot file has been generated, use third-party plotting 
software to plot the trees in a nice format: </p><pre class="example">
+&gt; # under bash, convert the dot file into a PDF file
+&gt; dot -Tpdf test.dot &gt; test.pdf
+&gt; xpdf test.pdf&amp;
+</pre><p>Please see the examples below for more details on the contents of the 
tree output formats.</p>
+<p><a class="anchor" id="examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd><h4>Decision Tree Classification Example</h4>
+</dd></dl>
+<ol type="1">
+<li>Prepare input data: <pre class="example">
+DROP TABLE IF EXISTS dt_golf;
+CREATE TABLE dt_golf (
+    id integer NOT NULL,
+    "OUTLOOK" text,
+    temperature double precision,
+    humidity double precision,
+    windy text,
+    class text
+);
+</pre> <pre class="example">
+COPY dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) FROM stdin WITH DELIMITER '|';
+1|sunny|85|85|'false'|'Don't Play'
+2|sunny|80|90|'true'|'Don't Play'
+3|overcast|83|78|'false'|'Play'
+4|rain|70|96|'false'|'Play'
+5|rain|68|80|'false'|'Play'
+6|rain|65|70|'true'|'Don't Play'
+7|overcast|64|65|'true'|'Play'
+8|sunny|72|95|'false'|'Don't Play'
+9|sunny|69|70|'false'|'Play'
+10|rain|75|80|'false'|'Play'
+11|sunny|75|70|'true'|'Play'
+12|overcast|72|90|'true'|'Play'
+13|overcast|81|75|'false'|'Play'
+14|rain|71|80|'true'|'Don't Play'
+\.
+</pre></li>
+<li>Run the decision tree training function: <pre class="example">
+DROP TABLE IF EXISTS train_output, train_output_summary;
+SELECT madlib.tree_train('dt_golf',         -- source table
+                         'train_output',    -- output model table
+                         'id',              -- id column
+                         'class',           -- response
+                         '"OUTLOOK", temperature, humidity, windy',   -- 
features
+                         NULL::text,        -- exclude columns
+                         'gini',            -- split criterion
+                         NULL::text,        -- no grouping
+                         NULL::text,        -- no weights
+                         5,                 -- max depth
+                         3,                 -- min split
+                         1,                 -- min bucket
+                         10                 -- number of bins per continuous variable
+                         );
+</pre></li>
+<li>Predict output categories for the same data that was used for input: <pre 
class="example">
+DROP TABLE IF EXISTS prediction_results;
+SELECT madlib.tree_predict('train_output',          -- tree model
+                           'dt_golf',               -- new data table
+                           'prediction_results',    -- output table
+                           'response');             -- show prediction
+SELECT * FROM prediction_results ORDER BY id;
+</pre> Result: <pre class="result">
+ id | estimated_class
+----+-----------------
+  1 | 'Don't Play'
+  2 | 'Don't Play'
+  3 | 'Play'
+  4 | 'Play'
+  5 | 'Play'
+  6 | 'Don't Play'
+  7 | 'Play'
+  8 | 'Don't Play'
+  9 | 'Play'
+ 10 | 'Play'
+ 11 | 'Play'
+ 12 | 'Play'
+ 13 | 'Play'
+ 14 | 'Don't Play'
+(14 rows)
+</pre></li>
+<li>Create a text display of the tree: <pre class="example">
+SELECT madlib.tree_display('train_output', FALSE);
+</pre> Result: <pre class="result">
+&#160;-------------------------------------
+&#160;- Each node represented by 'id' inside ().
+&#160;- Leaf nodes have a * while internal nodes have the split condition at the end.
+&#160;- For each internal node (i), it's children will be at (2i+1) and (2i+2).
+&#160;- For each split the first indented child (2i+1) is the 'True' node and second indented child (2i+2) is the 'False' node.
+&#160;- Number of (weighted) rows for each response variable inside [].
+&#160;- Order of values = ['"Don\'t Play"', '"Play"']
+&#160;-------------------------------------
+(0)[ 5 9]  "OUTLOOK"&lt;={overcast}
+  (1)[ 0 4]  *
+  (2)[ 5 5]  temperature&lt;=75
+    (5)[ 3 5]  temperature&lt;=65
+      (11)[ 1 0]  *
+      (12)[ 2 5]  temperature&lt;=70
+        (25)[ 0 3]  *
+        (26)[ 2 2]  temperature&lt;=72
+          (53)[ 2 0]  *
+          (54)[ 0 2]  *
+    (6)[ 2 0]  *
+&#160;-------------------------------------
+</pre> Here are some more details on how to interpret the tree display above. Node numbering starts at 0 for the root node and would be contiguous 1,2...n if the tree were completely full (no pruning). Since the tree has been pruned, the node numbering is not contiguous. In general, the children of internal node i appear at ids 2i+1 (the 'True' branch) and 2i+2 (the 'False' branch); for example, the children of node 2 are nodes 5 and 6. The order of values [x y] indicates the number of weighted rows that correspond to ["Don't play" "Play"] <em>before</em> the node test. For example, at the root node 0, there are 5 rows that "Don't play" and 9 rows that "Play" in the raw data. If we apply the test of "OUTLOOK" being overcast, then the True result is leaf node 1, which is "Play". There are 0 "Don't play" rows and 4 "Play" rows that correspond to this case (overcast). The remaining 5 "Don't play" rows and 5 "Play" rows are then tested at node 2 on temperature&lt;=75, and so on down the tree.</li>
+<li>Create a dot format display of the tree: <pre class="example">
+SELECT madlib.tree_display('train_output', TRUE);
+</pre> Result: <pre class="result">
+digraph "Classification tree for dt_golf" {
+         subgraph "cluster0"{
+         label=""
+"g0_0" [label="\"OUTLOOK"&lt;={overcast}", shape=ellipse];
+"g0_0" -&gt; "g0_1"[label="yes"];
+"g0_1" [label=""Play"",shape=box];
+"g0_0" -&gt; "g0_2"[label="no"];
+"g0_2" [label="temperature&lt;=75", shape=ellipse];
+"g0_2" -&gt; "g0_5"[label="yes"];
+"g0_2" -&gt; "g0_6"[label="no"];
+"g0_6" [label=""Don't Play"",shape=box];
+"g0_5" [label="temperature&lt;=65", shape=ellipse];
+"g0_5" -&gt; "g0_11"[label="yes"];
+"g0_11" [label=""Don't Play"",shape=box];
+"g0_5" -&gt; "g0_12"[label="no"];
+"g0_12" [label="temperature&lt;=70", shape=ellipse];
+"g0_12" -&gt; "g0_25"[label="yes"];
+"g0_25" [label=""Play"",shape=box];
+"g0_12" -&gt; "g0_26"[label="no"];
+"g0_26" [label="temperature&lt;=72", shape=ellipse];
+"g0_26" -&gt; "g0_53"[label="yes"];
+"g0_53" [label=""Don't Play"",shape=box];
+"g0_26" -&gt; "g0_54"[label="no"];
+"g0_54" [label=""Play"",shape=box];
+&#160;&#160;&#160;} //--- end of subgraph------------
+&#160;} //---end of digraph---------
+</pre></li>
+<li>Now create a dot format display of the tree with additional information: 
<pre class="example">
+SELECT madlib.tree_display('train_output', TRUE, TRUE);
+</pre> Result: <pre class="result">
+digraph "Classification tree for dt_golf" {
+         subgraph "cluster0"{
+         label=""
+"g0_0" [label="\"OUTLOOK" in {overcast}\n impurity = 0.459184\n samples = 14\n 
value = [5 9]\n class = "'Play'"", shape=ellipse];
+"g0_0" -&gt; "g0_1"[label="yes"];
+"g0_1" [label=""'Play'"\n samples = 4\n value = [0 4]",shape=box];
+"g0_0" -&gt; "g0_2"[label="no"];
+"g0_2" [label="temperature &lt;= 75\n impurity = 0.5\n samples = 10\n value = 
[5 5]\n class = "'Don't Play'"", shape=ellipse];
+"g0_2" -&gt; "g0_5"[label="yes"];
+"g0_2" -&gt; "g0_6"[label="no"];
+"g0_6" [label=""'Don't Play'"\n samples = 2\n value = [2 0]",shape=box];
+"g0_5" [label="temperature &lt;= 65\n impurity = 0.46875\n samples = 8\n value 
= [3 5]\n class = "'Play'"", shape=ellipse];
+"g0_5" -&gt; "g0_11"[label="yes"];
+"g0_11" [label=""'Don't Play'"\n samples = 1\n value = [1 0]",shape=box];
+"g0_5" -&gt; "g0_12"[label="no"];
+"g0_12" [label="temperature &lt;= 70\n impurity = 0.408163\n samples = 7\n 
value = [2 5]\n class = "'Play'"", shape=ellipse];
+"g0_12" -&gt; "g0_25"[label="yes"];
+"g0_25" [label=""'Play'"\n samples = 3\n value = [0 3]",shape=box];
+"g0_12" -&gt; "g0_26"[label="no"];
+"g0_26" [label="temperature &lt;= 72\n impurity = 0.5\n samples = 4\n value = 
[2 2]\n class = "'Don't Play'"", shape=ellipse];
+"g0_26" -&gt; "g0_53"[label="yes"];
+"g0_53" [label=""'Don't Play'"\n samples = 2\n value = [2 0]",shape=box];
+"g0_26" -&gt; "g0_54"[label="no"];
+"g0_54" [label=""'Play'"\n samples = 2\n value = [0 2]",shape=box];
+&#160;&#160;&#160;} //--- end of subgraph------------
+&#160;} //---end of digraph---------
+</pre> The additional information in each node is: impurity, sample size, 
number of weighted rows for each response variable, and classification if the 
tree was pruned at this level.</li>
+</ol>
+<h4>Decision Tree Regression Example</h4>
+<ol type="1">
+<li>Prepare input data. <pre class="example">
+DROP TABLE IF EXISTS mt_cars;
+CREATE TABLE mt_cars (
+    id integer NOT NULL,
+    mpg double precision,
+    cyl integer,
+    disp double precision,
+    hp integer,
+    drat double precision,
+    wt double precision,
+    qsec double precision,
+    vs integer,
+    am integer,
+    gear integer,
+    carb integer
+);
+</pre> <pre class="example">
+COPY mt_cars (id,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb) FROM stdin WITH DELIMITER '|' NULL 'null';
+1|18.7|8|360|175|3.15|3.44|17.02|0|0|3|2
+2|21|6|160|110|3.9|2.62|16.46|0|1|4|4
+3|24.4|4|146.7|62|3.69|3.19|20|1|0|4|2
+4|21|6|160|110|3.9|2.875|17.02|0|1|4|4
+5|17.8|6|167.6|123|3.92|3.44|18.9|1|0|4|4
+6|16.4|8|275.8|180|3.078|4.07|17.4|0|0|3|3
+7|22.8|4|108|93|3.85|2.32|18.61|1|1|4|1
+8|17.3|8|275.8|180|3.078|3.73|17.6|0|0|3|3
+9|21.4|null|258|110|3.08|3.215|19.44|1|0|3|1
+10|15.2|8|275.8|180|3.078|3.78|18|0|0|3|3
+11|18.1|6|225|105|2.768|3.46|20.22|1|0|3|1
+12|32.4|4|78.7|66|4.08|2.20|19.47|1|1|4|1
+13|14.3|8|360|245|3.21|3.578|15.84|0|0|3|4
+14|22.8|4|140.8|95|3.92|3.15|22.9|1|0|4|2
+15|30.4|4|75.7|52|4.93|1.615|18.52|1|1|4|2
+16|19.2|6|167.6|123|3.92|3.44|18.3|1|0|4|4
+17|33.9|4|71.14|65|4.22|1.835|19.9|1|1|4|1
+18|15.2|null|304|150|3.15|3.435|17.3|0|0|3|2
+19|10.4|8|472|205|2.93|5.25|17.98|0|0|3|4
+20|27.3|4|79|66|4.08|1.935|18.9|1|1|4|1
+21|10.4|8|460|215|3|5.424|17.82|0|0|3|4
+22|26|4|120.3|91|4.43|2.14|16.7|0|1|5|2
+23|14.7|8|440|230|3.23|5.345|17.42|0|0|3|4
+24|30.4|4|95.14|113|3.77|1.513|16.9|1|1|5|2
+25|21.5|4|120.1|97|3.70|2.465|20.01|1|0|3|1
+26|15.8|8|351|264|4.22|3.17|14.5|0|1|5|4
+27|15.5|8|318|150|2.768|3.52|16.87|0|0|3|2
+28|15|8|301|335|3.54|3.578|14.6|0|1|5|8
+29|13.3|8|350|245|3.73|3.84|15.41|0|0|3|4
+30|19.2|8|400|175|3.08|3.845|17.05|0|0|3|2
+31|19.7|6|145|175|3.62|2.77|15.5|0|1|5|6
+32|21.4|4|121|109|4.11|2.78|18.6|1|1|4|2
+\.
+</pre></li>
+<li>Run the decision tree training function: <pre class="example">
+DROP TABLE IF EXISTS train_output, train_output_summary;
+SELECT madlib.tree_train('mt_cars',         -- source table
+                         'train_output',    -- output model table
+                         'id',              -- id column
+                         'mpg',             -- dependent variable
+                         '*',               -- features
+                         'id, hp, drat, am, gear, carb',  -- exclude columns
+                         'mse',             -- split criterion
+                         NULL::text,        -- no grouping
+                         NULL::text,        -- no weights
+                         10,                -- max depth
+                         8,                 -- min split
+                         3,                 -- min bucket
+                         10,                -- number of bins per continuous variable
+                         NULL,              -- pruning parameters
+                         'max_surrogates=2' -- number of surrogates
+                         );
+</pre></li>
+<li>Display the decision tree in basic text format: <pre class="example">
+SELECT madlib.tree_display('train_output', FALSE);
+</pre> Result: <pre class="result">
+&#160; -------------------------------------
+&#160;- Each node represented by 'id' inside ().
+&#160;- Each internal nodes has the split condition at the end, while each
+&#160;    leaf node has a * at the end.
+&#160;- For each internal node (i), its child nodes are indented by 1 level
+&#160;    with ids (2i+1) for True node and (2i+2) for False node.
+&#160;- Number of rows and average response value inside []. For a leaf node, this is the prediction.
+&#160;-------------------------------------
+ (0)[32, 20.0906]  cyl in {8,6}
+    (1)[21, 16.6476]  disp &lt;= 258
+       (3)[7, 19.7429]  *
+       (4)[14, 15.1]  qsec &lt;= 17.42
+          (9)[10, 15.81]  qsec &lt;= 16.9
+             (19)[5, 14.78]  *
+             (20)[5, 16.84]  *
+          (10)[4, 13.325]  *
+    (2)[11, 26.6636]  wt &lt;= 2.2
+       (5)[6, 30.0667]  *
+       (6)[5, 22.58]  *
+ &#160;-------------------------------------
+(1 row)
+</pre></li>
+<li>Display the surrogates in the decision tree: <pre class="example">
+SELECT madlib.tree_surr_display('train_output');
+</pre> Result: <pre class="result">
+&#160;-------------------------------------
+       Surrogates for internal nodes
+&#160;-------------------------------------
+ (0) cyl in {8,6}
+      1: disp &gt; 146.7    [common rows = 29]
+      2: vs in {0}    [common rows = 26]
+      [Majority branch = 19 ]
+ (1) disp &lt;= 258
+      1: cyl in {6,4}    [common rows = 19]
+      2: vs in {1}    [common rows = 18]
+      [Majority branch = 14 ]
+ (2) wt &lt;= 2.2
+      1: disp &lt;= 108    [common rows = 9]
+      2: qsec &lt;= 18.52    [common rows = 8]
+      [Majority branch = 6 ]
+ (4) qsec &lt;= 17.42
+      1: disp &gt; 275.8    [common rows = 11]
+      2: vs in {0}    [common rows = 10]
+      [Majority branch = 10 ]
+ (9) qsec &lt;= 16.9
+      1: wt &lt;= 3.84    [common rows = 8]
+      2: disp &lt;= 360    [common rows = 7]
+      [Majority branch = 5 ]
+&#160;-------------------------------------
+(1 row)
+</pre> <dl class="section note"><dt>Note</dt><dd>The 'cyl' parameter above has 
two tuples with null values. In the prediction example below, the surrogate 
splits for the <em>cyl in {8, 6}</em> split are used to predict those two 
tuples (<em>id = 9</em> and <em>id = 18</em>). The splits are used in 
descending order till a surrogate variable is found that is not NULL. In this 
case, the two tuples have non-NULL values for <em>disp</em>, hence the <em>disp 
&gt; 146.7</em> split is used to make the prediction. If all the surrogate 
variables had been NULL then the majority branch would have been 
followed.</dd></dl>
+</li>
+<li>Predict regression output for the same data and compare with original: 
<pre class="example">
+DROP TABLE IF EXISTS prediction_results;
+SELECT madlib.tree_predict('train_output',
+                           'mt_cars',
+                           'prediction_results',
+                           'response');
+SELECT s.id, mpg, estimated_mpg FROM prediction_results p, mt_cars s WHERE s.id = p.id ORDER BY id;
+</pre> Result: <pre class="result">
+  id | mpg  |  estimated_mpg
+----+------+------------------
+  1 | 18.7 |            16.84
+  2 |   21 | 19.7428571428571
+  3 | 24.4 |            22.58
+  4 |   21 | 19.7428571428571
+  5 | 17.8 | 19.7428571428571
+  6 | 16.4 |            16.84
+  7 | 22.8 |            22.58
+  8 | 17.3 |           13.325
+  9 | 21.4 | 19.7428571428571
+ 10 | 15.2 |           13.325
+ 11 | 18.1 | 19.7428571428571
+ 12 | 32.4 | 30.0666666666667
+ 13 | 14.3 |            14.78
+ 14 | 22.8 |            22.58
+ 15 | 30.4 | 30.0666666666667
+ 16 | 19.2 | 19.7428571428571
+ 17 | 33.9 | 30.0666666666667
+ 18 | 15.2 |            16.84
+ 19 | 10.4 |           13.325
+ 20 | 27.3 | 30.0666666666667
+ 21 | 10.4 |           13.325
+ 22 |   26 | 30.0666666666667
+ 23 | 14.7 |            16.84
+ 24 | 30.4 | 30.0666666666667
+ 25 | 21.5 |            22.58
+ 26 | 15.8 |            14.78
+ 27 | 15.5 |            14.78
+ 28 |   15 |            14.78
+ 29 | 13.3 |            14.78
+ 30 | 19.2 |            16.84
+ 31 | 19.7 | 19.7428571428571
+ 32 | 21.4 |            22.58
+(32 rows)
+</pre></li>
+</ol>
+<p><a class="anchor" id="literature"></a></p><dl class="section 
user"><dt>Literature</dt><dd>[1] Breiman, Leo; Friedman, J. H.; Olshen, R. A.; 
Stone, C. J. (1984). Classification and regression trees. Monterey, CA: 
Wadsworth &amp; Brooks/Cole Advanced Books &amp; Software.</dd></dl>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
Topics</dt><dd></dd></dl>
+<p>File <a class="el" 
href="decision__tree_8sql__in.html">decision_tree.sql_in</a> documenting the 
training function</p>
+<p><a class="el" href="group__grp__random__forest.html">Random Forest</a></p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Tue May 16 2017 13:24:38 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
+</div>
+</body>
+</html>
