[26/51] [partial] madlib-site git commit: Doc: Add v1.15.1 documentation

nkak Mon, 15 Oct 2018 12:15:25 -0700

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/af0e5f14/docs/v1.15.1/group__grp__crf.html
----------------------------------------------------------------------
diff --git a/docs/v1.15.1/group__grp__crf.html 
b/docs/v1.15.1/group__grp__crf.html
new file mode 100644
index 0000000..d0c215d
--- /dev/null
+++ b/docs/v1.15.1/group__grp__crf.html
@@ -0,0 +1,632 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.14"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Conditional Random Field</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+  $(document).ready(initResizable);
+/* @license-end */</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+  $(document).ready(function() { init_search(); });
+/* @license-end */
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+});
+</script><script type="text/javascript" async 
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/MathJax.js"></script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.15.1</span>
+   </div>
+   <div id="projectbrief">User Documentation for Apache MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.14 -->
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+/* @license-end */
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+$(document).ready(function(){initNavTree('group__grp__crf.html','');});
+/* @license-end */
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Conditional Random Field<div class="ingroups"><a class="el" 
href="group__grp__super.html">Supervised Learning</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> <ul>
+<li>
+<a href="#train_feature">Training Feature Generation</a> </li>
+<li>
+<a href="#train">CRF Training Function</a> </li>
+<li>
+<a href="#test_feature">Testing Feature Generation</a> </li>
+<li>
+<a href="#inference">Inference using Viterbi</a> </li>
+<li>
+<a href="#usage">Using CRF</a> </li>
+<li>
+<a href="#examples">Examples</a> </li>
+<li>
+<a href="#background">Technical Background</a> </li>
+<li>
+<a href="#literature">Literature</a> </li>
+<li>
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>A conditional random field (CRF) is a type of discriminative, 
undirected probabilistic graphical model. A linear-chain CRF is a special type 
of CRF that assumes the current state depends only on the previous state.</p>
+<p>Feature extraction modules are provided for text-analysis tasks such as 
part-of-speech (POS) tagging and named-entity resolution (NER). Currently, six 
feature types are implemented:</p>
+<ul>
+<li>Edge Feature: transition feature that encodes the transition feature 
weight from current label to next label.</li>
+<li>Start Feature: fired when the current token is the first token in a 
sequence.</li>
+<li>End Feature: fired when the current token is the last token in a 
sequence.</li>
+<li>Word Feature: fired when the current token is observed in the trained 
dictionary.</li>
+<li>Unknown Feature: fired when the current token is not observed in the 
trained dictionary for at least a certain number of times (default 1).</li>
+<li>Regex Feature: fired when the current token can be matched by a regular 
expression.</li>
+</ul>
+<p>A Viterbi implementation is also provided to get the best label sequence 
and the conditional probability \( \Pr( \text{best label sequence} \mid 
\text{sequence}) \).</p>
+<p>The following steps are required for CRF learning and inference:</p><ol 
type="1">
+<li><a href="#train_feature">Training Feature Generation</a></li>
+<li><a href="#train">CRF Training</a></li>
+<li><a href="#test_feature">Testing Feature Generation</a></li>
+<li><a href="#inference">Inference using Viterbi</a></li>
+</ol>
+<p><a class="anchor" id="train_feature"></a></p><dl class="section 
user"><dt>Training Feature Generation</dt><dd>The function takes 
<code>train_segment_tbl</code> and <code>regex_tbl</code> as input and does 
feature generation generating three tables <code>dictionary_tbl</code>, 
<code>train_feature_tbl</code> and <code>train_featureset_tbl</code>, that are 
required as an input for CRF training. <pre class="syntax">
+crf_train_fgen(train_segment_tbl,
+               regex_tbl,
+               label_tbl,
+               dictionary_tbl,
+               train_feature_tbl,
+               train_featureset_tbl)
+</pre> <b>Arguments</b> <dl class="arglist">
+<dt>train_segment_tbl </dt>
+<dd>TEXT. Name of the training segment table. The table is expected to have 
the following columns: <table class="output">
+<tr>
+<th>doc_id </th><td>INTEGER. Document id column  </td></tr>
+<tr>
+<th>start_pos </th><td>INTEGER. Index of a particular term in the respective 
document  </td></tr>
+<tr>
+<th>seg_text </th><td>TEXT. Term at the respective <code>start_pos</code> in 
the document  </td></tr>
+<tr>
+<th>label </th><td>INTEGER. Label id for the term corresponding to the actual 
label from <code>label_tbl</code>   </td></tr>
+</table>
+</dd>
+<dt>regex_tbl </dt>
+<dd>TEXT. Name of the regular expression table. The table is expected to have 
the following columns: <table class="output">
+<tr>
+<th>pattern </th><td>TEXT. Regular Expression  </td></tr>
+<tr>
+<th>name </th><td>TEXT. Regular Expression name  </td></tr>
+</table>
+</dd>
+<dt>label_tbl </dt>
+<dd>TEXT. Name of the table containing unique labels and their ids. The table 
is expected to have the following columns: <table class="output">
+<tr>
+<th>id </th><td>INTEGER. Unique label id. NOTE: Must range from 0 to total 
number of labels in the table - 1.   </td></tr>
+<tr>
+<th>label </th><td>TEXT. Label name  </td></tr>
+</table>
+</dd>
+<dt>dictionary_tbl </dt>
+<dd>TEXT. Name of the dictionary table to be created containing unique terms 
along with their counts. The table will have the following columns: <table 
class="output">
+<tr>
+<th>token </th><td>TEXT. Contains all the unique terms found in 
<code>train_segment_tbl</code>   </td></tr>
+<tr>
+<th>total </th><td>INTEGER. Respective counts for the terms  </td></tr>
+</table>
+</dd>
+<dt>train_feature_tbl</dt>
+<dd></dd>
+<dt></dt>
+<dd><p class="startdd">TEXT. Name of the training feature table to be created. 
The table will have the following columns: </p><table class="output">
+<tr>
+<th>doc_id </th><td>INTEGER. Document id  </td></tr>
+<tr>
+<th>f_size </th><td>INTEGER. Feature set size. This value will be the same for all 
the tuples in the table  </td></tr>
+<tr>
+<th>sparse_r </th><td>DOUBLE PRECISION[]. Array union of individual single 
state features (previous label, label, feature index, start position, training 
existence indicator), ordered by their start position.  </td></tr>
+<tr>
+<th>dense_m </th><td>DOUBLE PRECISION[]. Array union of (previous label, 
label, feature index, start position, training existence indicator) of edge 
features ordered by start position.  </td></tr>
+<tr>
+<th>sparse_m </th><td>DOUBLE PRECISION[]. Array union of (feature index, 
previous label, label) of edge features ordered by feature index.  </td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>train_featureset_tbl </dt>
+<dd>TEXT. Name of the table to be created containing distinct featuresets 
generated from training feature extraction. The table will have the following 
columns: <table class="output">
+<tr>
+<th>f_index </th><td>INTEGER. Column containing distinct featureset ids  
</td></tr>
+<tr>
+<th>f_name </th><td>TEXT. Feature name   </td></tr>
+<tr>
+<th>feature </th><td>ARRAY. Feature value. The value is of the form [L1, L2] 
<br />
+ - If L1 = -1: represents single state feature with L2 being the current label 
id. <br />
+ - If L1 != -1: represents transition feature with L1 being the previous label 
and L2 being the current label.    </td></tr>
+</table>
+</dd>
+</dl>
+</dd></dl>
+<p><a class="anchor" id="train"></a></p><dl class="section user"><dt>Linear 
Chain CRF Training Function</dt><dd>The function takes 
<code>train_feature_tbl</code> and <code>train_featureset_tbl</code> tables 
generated in the training feature generation steps as input along with other 
required parameters and produces two output tables <code>crf_stats_tbl</code> 
and <code>crf_weights_tbl</code>.</dd></dl>
+<pre class="syntax">
+lincrf_train(train_feature_tbl,
+             train_featureset_tbl,
+             label_tbl,
+             crf_stats_tbl,
+             crf_weights_tbl,
+             max_iterations
+            )
+</pre><p> <b>Arguments</b> </p><dl class="arglist">
+<dt>train_feature_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the feature table generated during 
training feature generation</p>
+<p class="enddd"></p>
+</dd>
+<dt>train_featureset_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the featureset table generated during 
training feature generation</p>
+<p class="enddd"></p>
+</dd>
+<dt>label_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the label table used</p>
+<p class="enddd"></p>
+</dd>
+<dt>crf_stats_tbl </dt>
+<dd>TEXT. Name of the table to be created containing statistics for CRF 
training. The table has the following columns: <table class="output">
+<tr>
+<th>coef </th><td>DOUBLE PRECISION[]. Array of coefficients  </td></tr>
+<tr>
+<th>log_likelihood </th><td>DOUBLE. Log-likelihood  </td></tr>
+<tr>
+<th>num_iterations </th><td>INTEGER. The number of iterations at which the 
algorithm terminated  </td></tr>
+</table>
+</dd>
+<dt>crf_weights_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the table to be created containing learned 
feature weights. The table has the following columns: </p><table class="output">
+<tr>
+<th>id </th><td>INTEGER. Feature set id  </td></tr>
+<tr>
+<th>name </th><td>TEXT. Feature name  </td></tr>
+<tr>
+<th>prev_label_id </th><td>INTEGER. Label for the previous token encountered  
</td></tr>
+<tr>
+<th>label_id </th><td>INTEGER. Label of the token with the respective feature  
</td></tr>
+<tr>
+<th>weight </th><td>DOUBLE PRECISION. Weight for the respective feature set  
</td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>max_iterations </dt>
+<dd>INTEGER. The maximum number of iterations </dd>
+</dl>
+<p><a class="anchor" id="test_feature"></a></p><dl class="section 
user"><dt>Testing Feature Generation</dt><dd></dd></dl>
+<pre class="syntax">
+crf_test_fgen(test_segment_tbl,
+              dictionary_tbl,
+              label_tbl,
+              regex_tbl,
+              crf_weights_tbl,
+              viterbi_mtbl,
+              viterbi_rtbl
+             )
+</pre><p> <b>Arguments</b> </p><dl class="arglist">
+<dt>test_segment_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the testing segment table. The table is 
expected to have the following columns: </p><table class="output">
+<tr>
+<th>doc_id </th><td>INTEGER. Document id column  </td></tr>
+<tr>
+<th>start_pos </th><td>INTEGER. Index of a particular term in the respective 
document  </td></tr>
+<tr>
+<th>seg_text </th><td>TEXT. Term at the respective <code>start_pos</code> in 
the document  </td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>dictionary_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the dictionary table created during 
training feature generation (<code>crf_train_fgen</code>)</p>
+<p class="enddd"></p>
+</dd>
+<dt>label_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the label table</p>
+<p class="enddd"></p>
+</dd>
+<dt>regex_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the regular expression table</p>
+<p class="enddd"></p>
+</dd>
+<dt>crf_weights_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the weights table generated during CRF 
training (<code>lincrf_train</code>)</p>
+<p class="enddd"></p>
+</dd>
+<dt>viterbi_mtbl </dt>
+<dd><p class="startdd">TEXT. Name of the Viterbi M table to be created</p>
+<p class="enddd"></p>
+</dd>
+<dt>viterbi_rtbl </dt>
+<dd>TEXT. Name of the Viterbi R table to be created </dd>
+</dl>
+<p><a class="anchor" id="inference"></a></p><dl class="section 
user"><dt>Inference using Viterbi</dt><dd><pre class="syntax">
+vcrf_label(test_segment_tbl,
+           viterbi_mtbl,
+           viterbi_rtbl,
+           label_tbl,
+           result_tbl)
+</pre> <b>Arguments</b> <dl class="arglist">
+<dt>test_segment_tbl </dt>
+<dd>TEXT. Name of the testing segment table. For the required table schema, please 
refer to the arguments in the previous section. </dd>
+<dt>viterbi_mtbl </dt>
+<dd>TEXT. Name of the table <code>viterbi_mtbl</code> generated from testing 
feature generation <code>crf_test_fgen</code>. </dd>
+<dt>viterbi_rtbl </dt>
+<dd>TEXT. Name of the table <code>viterbi_rtbl</code> generated from testing 
feature generation <code>crf_test_fgen</code>. </dd>
+<dt>label_tbl </dt>
+<dd>TEXT. Name of the label table. </dd>
+<dt>result_tbl </dt>
+<dd>TEXT. Name of the result table to be created containing extracted best 
label sequences. </dd>
+</dl>
+</dd></dl>
+<p><a class="anchor" id="usage"></a></p><dl class="section user"><dt>Using 
CRF</dt><dd></dd></dl>
+<p>Generate text features, calculate their weights, and output the best label 
sequence for test data:<br />
+</p><ol type="1">
+<li>Perform feature generation on training data i.e. 
<code>train_segment_tbl</code> generating <code>train_feature_tbl</code> and 
<code>train_featureset_tbl</code>. <pre>SELECT madlib.crf_train_fgen(
+         '<em>train_segment_tbl</em>',
+         '<em>regex_tbl</em>',
+         '<em>label_tbl</em>',
+         '<em>dictionary_tbl</em>',
+         '<em>train_feature_tbl</em>',
+         '<em>train_featureset_tbl</em>');</pre></li>
+<li>Use linear-chain CRF for training providing <code>train_feature_tbl</code> 
and <code>train_featureset_tbl</code> generated from previous step as an input. 
<pre>SELECT madlib.lincrf_train(
+         '<em>train_feature_tbl</em>',
+         '<em>train_featureset_tbl</em>',
+         '<em>label_tbl</em>',
+         '<em>crf_stats_tbl</em>',
+         '<em>crf_weights_tbl</em>',
+         <em>max_iterations</em>);</pre></li>
+<li>Perform feature generation on testing data <code>test_segment_tbl</code> 
generating <code>viterbi_mtbl</code> and <code>viterbi_rtbl</code> required for 
inference. <pre>SELECT madlib.crf_test_fgen(
+         '<em>test_segment_tbl</em>',
+         '<em>dictionary_tbl</em>',
+         '<em>label_tbl</em>',
+         '<em>regex_tbl</em>',
+         '<em>crf_weights_tbl</em>',
+         '<em>viterbi_mtbl</em>',
+         '<em>viterbi_rtbl</em>');</pre></li>
+<li>Run the Viterbi function to get the best label sequence and the 
conditional probability \( \Pr( \text{best label sequence} \mid 
\text{sequence}) \). <pre>SELECT madlib.vcrf_label(
+         '<em>test_segment_tbl</em>',
+         '<em>viterbi_mtbl</em>',
+         '<em>viterbi_rtbl</em>',
+         '<em>label_tbl</em>',
+         '<em>result_tbl</em>');</pre></li>
+</ol>
+<p><a class="anchor" id="examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd>This example uses a trivial training and test data 
set.</dd></dl>
+<ol type="1">
+<li>Load the label table, the regular expressions table, and the training 
segment table: <pre class="example">
+SELECT * FROM crf_label ORDER BY id;
+</pre> Result: <pre class="result">
+ id | label
+&#160;---+-------
+  0 | #
+  1 | $
+  2 | ''
+...
+  8 | CC
+  9 | CD
+ 10 | DT
+ 11 | EX
+ 12 | FW
+ 13 | IN
+ 14 | JJ
+...
+</pre> The regular expressions table: <pre class="example">
+SELECT * from crf_regex;
+</pre> <pre class="result">
+    pattern    |         name
+&#160;--------------+----------------------
+ ^.+ing$       | endsWithIng
+ ^[A-Z][a-z]+$ | InitCapital
+ ^[A-Z]+$      | isAllCapital
+ ^.*[0-9]+.*$  | containsDigit
+...
+</pre> The training segment table: <pre class="example">
+SELECT * from train_segmenttbl ORDER BY doc_id, start_pos;
+</pre> <pre class="result">
+ doc_id | start_pos |  seg_text  | label
+&#160;-------+-----------+------------+-------
+      0 |         0 | Confidence |    18
+      0 |         1 | in         |    13
+      0 |         2 | the        |    10
+      0 |         3 | pound      |    18
+      0 |         4 | is         |    38
+      0 |         5 | widely     |    26
+...
+      1 |         0 | Chancellor |    19
+      1 |         1 | of         |    13
+      1 |         2 | the        |    10
+      1 |         3 | Exchequer  |    19
+      1 |         4 | Nigel      |    19
+...
+</pre></li>
+<li>Generate the training features: <pre class="example">
+SELECT crf_train_fgen( 'train_segmenttbl',
+                       'crf_regex',
+                       'crf_label',
+                       'crf_dictionary',
+                       'train_featuretbl',
+                       'train_featureset'
+                     );
+SELECT * from crf_dictionary;
+</pre> Result: <pre class="result">
+     token       | total
+&#160;----------------+-------
+ Hawthorne       |     1
+ Mercedes-Benzes |     1
+ Wolf            |     3
+ best-known      |     1
+ hairline        |     1
+ accepting       |     2
+ purchases       |    14
+ trash           |     5
+ co-venture      |     1
+ restaurants     |     7
+...
+</pre> <pre class="example">
+SELECT * from train_featuretbl;
+</pre> Result: <pre class="result">
+ doc_id | f_size |            sparse_r           |             dense_m         
    |       sparse_m
+&#160;-------+--------+-------------------------------+---------------------------------+-----------------------
+      2 |     87 | {-1,13,12,0,1,-1,13,9,0,1,..} | 
{13,31,79,1,1,31,29,70,2,1,...} | {51,26,2,69,29,17,...}
+      1 |     87 | {-1,13,0,0,1,-1,13,9,0,1,...} | 
{13,0,62,1,1,0,13,54,2,1,13,..} | {51,26,2,69,29,17,...}
+</pre> <pre class="example">
+SELECT * from train_featureset;
+</pre> <pre class="result">
+ f_index |    f_name     | feature
+&#160;--------+---------------+---------
+       1 | R_endsWithED  | {-1,29}
+      13 | W_outweigh    | {-1,26}
+      29 | U             | {-1,5}
+      31 | U             | {-1,29}
+      33 | U             | {-1,12}
+      35 | W_a           | {-1,2}
+      37 | W_possible    | {-1,6}
+      15 | W_signaled    | {-1,29}
+      17 | End.          | {-1,43}
+      49 | W_'s          | {-1,16}
+      63 | W_acquire     | {-1,26}
+      51 | E.            | {26,2}
+      69 | E.            | {29,17}
+      71 | E.            | {2,11}
+      83 | W_the         | {-1,2}
+      85 | E.            | {16,11}
+       4 | W_return      | {-1,11}
+...
+</pre></li>
+<li>Train using linear CRF: <pre class="example">
+SELECT lincrf_train( 'train_featuretbl',
+                     'train_featureset',
+                     'crf_label',
+                     'crf_stats_tbl',
+                     'crf_weights_tbl',
+                     20
+             );
+</pre> <pre class="result">
+                                lincrf_train
+&#160;-----------------------------------------------------------------------------------
+ CRF Train successful. Results stored in the specified CRF stats and weights 
table
+ lincrf
+</pre> View the feature weight table. <pre class="example">
+SELECT * from crf_weights_tbl;
+</pre> Result: <pre class="result">
+ id |     name      | prev_label_id | label_id |      weight
+&#160;---+---------------+---------------+----------+-------------------
+  1 | R_endsWithED  |            -1 |       29 |  1.54128249293937
+ 13 | W_outweigh    |            -1 |       26 |  1.70691232223653
+ 29 | U             |            -1 |        5 |  1.40708515869008
+ 31 | U             |            -1 |       29 | 0.830356200936407
+ 33 | U             |            -1 |       12 | 0.769587378281239
+ 35 | W_a           |            -1 |        2 |  2.68470625883726
+ 37 | W_possible    |            -1 |        6 |  3.41773107604468
+ 15 | W_signaled    |            -1 |       29 |  1.68187039165771
+ 17 | End.          |            -1 |       43 |  3.07687845517082
+ 49 | W_'s          |            -1 |       16 |  2.61430312229883
+ 63 | W_acquire     |            -1 |       26 |  1.67247047385797
+ 51 | E.            |            26 |        2 |   3.0114240119435
+ 69 | E.            |            29 |       17 |  2.82385531733866
+ 71 | E.            |             2 |       11 |  3.00970493772732
+ 83 | W_the         |            -1 |        2 |  2.58742315259326
+...
+</pre></li>
+<li>To find the best labels for a test set using the trained linear CRF model, 
repeat steps #1-2 and generate the test features, except instead of creating a 
new dictionary, use the dictionary generated from the training set. <pre 
class="example">
+SELECT * from test_segmenttbl ORDER BY doc_id, start_pos;
+</pre> Result: <pre class="result">
+ doc_id | start_pos |   seg_text
+&#160;-------+-----------+---------------
+      0 |         0 | Rockwell
+      0 |         1 | International
+      0 |         2 | Corp.
+      0 |         3 | 's
+      0 |         4 | Tulsa
+      0 |         5 | unit
+      0 |         6 | said
+...
+      1 |         0 | Rockwell
+      1 |         1 | said
+      1 |         2 | the
+      1 |         3 | agreement
+      1 |         4 | calls
+...
+</pre> <pre class="example">
+SELECT crf_test_fgen( 'test_segmenttbl',
+                      'crf_dictionary',
+                      'crf_label',
+                      'crf_regex',
+                      'crf_weights_tbl',
+                      'viterbi_mtbl',
+                      'viterbi_rtbl'
+                    );
+</pre></li>
+<li>Calculate the best label sequence and save in the table 
<code>extracted_best_labels</code>. <pre class="example">
+SELECT vcrf_label( 'test_segmenttbl',
+                   'viterbi_mtbl',
+                   'viterbi_rtbl',
+                   'crf_label',
+                   'extracted_best_labels'
+                 );
+</pre> View the best labels. <pre class="example">
+SELECT * FROM extracted_best_labels;
+</pre> Result: <pre class="result">
+ doc_id | start_pos |   seg_text    | label | id | max_pos |   prob
+&#160;-------+-----------+---------------+-------+----+---------+----------
+      0 |         0 | Rockwell      | NNP   | 19 |      27 | 0.000269
+      0 |         1 | International | NNP   | 19 |      27 | 0.000269
+      0 |         2 | Corp.         | NNP   | 19 |      27 | 0.000269
+      0 |         3 | 's            | NNP   | 19 |      27 | 0.000269
+...
+      1 |         0 | Rockwell      | NNP   | 19 |      16 | 0.000168
+      1 |         1 | said          | NNP   | 19 |      16 | 0.000168
+      1 |         2 | the           | DT    | 10 |      16 | 0.000168
+      1 |         3 | agreement     | JJ    | 14 |      16 | 0.000168
+      1 |         4 | calls         | NNS   | 21 |      16 | 0.000168
+...
+</pre></li>
+</ol>
+<p><a class="anchor" id="background"></a></p><dl class="section 
user"><dt>Technical Background</dt><dd></dd></dl>
+<p>Specifically, a linear-chain CRF is a distribution defined by </p><p 
class="formulaDsp">
+\[ p_\lambda(\boldsymbol y | \boldsymbol x) = \frac{\exp{\sum_{m=1}^M 
\lambda_m F_m(\boldsymbol x, \boldsymbol y)}}{Z_\lambda(\boldsymbol x)} \,. \]
+</p>
+<p>where</p><ul>
+<li>\( F_m(\boldsymbol x, \boldsymbol y) = \sum_{i=1}^n f_m(y_i,y_{i-1},x_i) 
\) is a global feature function that is a sum along a sequence \( \boldsymbol x 
\) of length \( n \)</li>
+<li>\( f_m(y_i,y_{i-1},x_i) \) is a local feature function dependent on the 
current token label \( y_i \), the previous token label \( y_{i-1} \), and the 
observation \( x_i \)</li>
+<li>\( \lambda_m \) is the corresponding feature weight</li>
+<li>\( Z_\lambda(\boldsymbol x) \) is an instance-specific normalizer <p 
class="formulaDsp">
+\[ Z_\lambda(\boldsymbol x) = \sum_{\boldsymbol y&#39;} \exp{\sum_{m=1}^M 
\lambda_m F_m(\boldsymbol x, \boldsymbol y&#39;)} \]
+</p>
+</li>
+</ul>
+<p>A linear-chain CRF estimates the weights \( \lambda_m \) by maximizing the 
log-likelihood of a given training set \( T=\{(x_k,y_k)\}_{k=1}^N \).</p>
+<p>The log-likelihood is defined as </p><p class="formulaDsp">
+\[ \ell_{\lambda}=\sum_k \log p_\lambda(y_k|x_k) =\sum_k[\sum_{m=1}^M 
\lambda_m F_m(x_k,y_k) - \log Z_\lambda(x_k)] \]
+</p>
+<p>and the zero of its gradient </p><p class="formulaDsp">
+\[ \nabla \ell_{\lambda}=\sum_k[F(x_k,y_k)-E_{p_\lambda(Y|x_k)}[F(x_k,Y)]] \]
+</p>
+<p>is found since the maximum likelihood is reached when the empirical average 
of the global feature vector equals its model expectation. The MADlib 
implementation uses limited-memory BFGS (L-BFGS), a limited-memory variation of 
the Broyden–Fletcher–Goldfarb–Shanno (BFGS) update, a quasi-Newton method 
for unconstrained optimization.</p>
+<p>\(E_{p_\lambda(Y|x)}[F(x,Y)]\) is found by using a variant of the 
forward-backward algorithm: </p><p class="formulaDsp">
+\[ E_{p_\lambda(Y|x)}[F(x,Y)] = \sum_y p_\lambda(y|x)F(x,y) = 
\sum_i\frac{\alpha_{i-1}(f_i*M_i)\beta_i^T}{Z_\lambda(x)} \]
+</p>
+ <p class="formulaDsp">
+\[ Z_\lambda(x) = \alpha_n \cdot \mathbf{1}^T \]
+</p>
+<p> where \(\alpha_i\) and \( \beta_i\) are the forward and backward state 
cost vectors defined by </p><p class="formulaDsp">
+\[ \alpha_i = \begin{cases} \alpha_{i-1}M_i, &amp; 0&lt;i&lt;=n\\ 1, &amp; i=0 
\end{cases}\\ \]
+</p>
+ <p class="formulaDsp">
+\[ \beta_i^T = \begin{cases} M_{i+1}\beta_{i+1}^T, &amp; 1&lt;=i&lt;n\\ 1, 
&amp; i=n \end{cases} \]
+</p>
+<p>To avoid overfitting, we penalize the likelihood with a spherical Gaussian 
weight prior: </p><p class="formulaDsp">
+\[ \ell_{\lambda}^\prime=\sum_k[\sum_{m=1}^M \lambda_m F_m(x_k,y_k) - \log 
Z_\lambda(x_k)] - \frac{\lVert \lambda \rVert^2}{2\sigma ^2} \]
+</p>
+<p class="formulaDsp">
+\[ \nabla \ell_{\lambda}^\prime=\sum_k[F(x_k,y_k) - 
E_{p_\lambda(Y|x_k)}[F(x_k,Y)]] - \frac{\lambda}{\sigma ^2} \]
+</p>
+<dl class="section user"><dt>Literature</dt><dd>[1] F. Sha, F. Pereira. 
Shallow Parsing with Conditional Random Fields, <a 
href="http://www-bcf.usc.edu/~feisha/pubs/shallow03.pdf">http://www-bcf.usc.edu/~feisha/pubs/shallow03.pdf</a></dd></dl>
+<p>[2] Wikipedia, Conditional Random Field, <a 
href="http://en.wikipedia.org/wiki/Conditional_random_field">http://en.wikipedia.org/wiki/Conditional_random_field</a></p>
+<p>[3] A. Jaiswal, S.Tawari, I. Mansuri, K. Mittal, C. Tiwari (2012), CRF, <a 
href="http://crf.sourceforge.net/">http://crf.sourceforge.net/</a></p>
+<p>[4] D. Wang, ViterbiCRF, <a 
href="http://www.cs.berkeley.edu/~daisyw/ViterbiCRF.html">http://www.cs.berkeley.edu/~daisyw/ViterbiCRF.html</a></p>
+<p>[5] Wikipedia, Viterbi Algorithm, <a 
href="http://en.wikipedia.org/wiki/Viterbi_algorithm">http://en.wikipedia.org/wiki/Viterbi_algorithm</a></p>
+<p>[6] J. Nocedal. Updating Quasi-Newton Matrices with Limited Storage (1980), 
Mathematics of Computation 35, pp. 773-782</p>
+<p>[7] J. Nocedal, Software for Large-scale Unconstrained Optimization, <a 
href="http://users.eecs.northwestern.edu/~nocedal/lbfgs.html">http://users.eecs.northwestern.edu/~nocedal/lbfgs.html</a></p>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
Topics</dt><dd></dd></dl>
+<p>File <a class="el" href="crf_8sql__in.html" title="SQL functions for 
conditional random field. ">crf.sql_in</a> <a class="el" 
href="crf__feature__gen_8sql__in.html" title="SQL function for POS/NER feature 
extraction. ">crf_feature_gen.sql_in</a> <a class="el" 
href="viterbi_8sql__in.html" title="concatenate a set of input values into 
arrays to feed into viterbi c function and create a human 
read...">viterbi.sql_in</a> (documenting the SQL functions) </p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Mon Oct 15 2018 11:24:30 for MADlib by
+    <a href="http://www.doxygen.org/index.html">
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.14 </li>
+  </ul>
+</div>
+</body>
+</html>


http://git-wip-us.apache.org/repos/asf/madlib-site/blob/af0e5f14/docs/v1.15.1/group__grp__datatrans.html
----------------------------------------------------------------------
diff --git a/docs/v1.15.1/group__grp__datatrans.html 
b/docs/v1.15.1/group__grp__datatrans.html
new file mode 100644
index 0000000..0b7dede
--- /dev/null
+++ b/docs/v1.15.1/group__grp__datatrans.html
@@ -0,0 +1,161 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.14"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Data Types and Transformations</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+  $(document).ready(initResizable);
+/* @license-end */</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+  $(document).ready(function() { init_search(); });
+/* @license-end */
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+});
+</script><script type="text/javascript" async 
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/MathJax.js"></script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.15.1</span>
+   </div>
+   <div id="projectbrief">User Documentation for Apache MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.14 -->
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+/* @license-end */
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+$(document).ready(function(){initNavTree('group__grp__datatrans.html','');});
+/* @license-end */
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="summary">
+<a href="#groups">Modules</a>  </div>
+  <div class="headertitle">
+<div class="title">Data Types and Transformations</div>  </div>
+</div><!--header-->
+<div class="contents">
+<a name="details" id="details"></a><h2 class="groupheader">Detailed 
Description</h2>
+<p>Data types and operations that transform and shape data. </p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a 
name="groups"></a>
+Modules</h2></td></tr>
+<tr class="memitem:group__grp__arraysmatrix"><td class="memItemLeft" 
align="right" valign="top">&#160;</td><td class="memItemRight" 
valign="bottom"><a class="el" href="group__grp__arraysmatrix.html">Arrays and 
Matrices</a></td></tr>
+<tr class="memdesc:group__grp__arraysmatrix"><td 
class="mdescLeft">&#160;</td><td class="mdescRight">Mathematical operations for 
arrays and matrices. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__encode__categorical"><td class="memItemLeft" 
align="right" valign="top">&#160;</td><td class="memItemRight" 
valign="bottom"><a class="el" 
href="group__grp__encode__categorical.html">Encoding Categorical 
Variables</a></td></tr>
+<tr class="memdesc:group__grp__encode__categorical"><td 
class="mdescLeft">&#160;</td><td class="mdescRight">Functions to encode 
categorical variables to prepare data for input into predictive algorithms. <br 
/></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__path"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__path.html">Path</a></td></tr>
+<tr class="memdesc:group__grp__path"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">A function to perform complex pattern matching across rows 
and extract useful information about the matches. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__pivot"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__pivot.html">Pivot</a></td></tr>
+<tr class="memdesc:group__grp__pivot"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Pivoting and data summarization tools for preparing data for 
modeling operations. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__sessionize"><td class="memItemLeft" 
align="right" valign="top">&#160;</td><td class="memItemRight" 
valign="bottom"><a class="el" 
href="group__grp__sessionize.html">Sessionize</a></td></tr>
+<tr class="memdesc:group__grp__sessionize"><td 
class="mdescLeft">&#160;</td><td class="mdescRight">Session reconstruction of 
data consisting of a time stamped sequence of events. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__stemmer"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__stemmer.html">Stemming</a></td></tr>
+<tr class="memdesc:group__grp__stemmer"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Provides Porter stemmer operations supporting other MADlib 
modules. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Mon Oct 15 2018 11:24:30 for MADlib by
+    <a href="http://www.doxygen.org/index.html">
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.14 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/af0e5f14/docs/v1.15.1/group__grp__datatrans.js
----------------------------------------------------------------------
diff --git a/docs/v1.15.1/group__grp__datatrans.js 
b/docs/v1.15.1/group__grp__datatrans.js
new file mode 100644
index 0000000..4900455
--- /dev/null
+++ b/docs/v1.15.1/group__grp__datatrans.js
@@ -0,0 +1,9 @@
+var group__grp__datatrans =
+[
+    [ "Arrays and Matrices", "group__grp__arraysmatrix.html", 
"group__grp__arraysmatrix" ],
+    [ "Encoding Categorical Variables", 
"group__grp__encode__categorical.html", null ],
+    [ "Path", "group__grp__path.html", null ],
+    [ "Pivot", "group__grp__pivot.html", null ],
+    [ "Sessionize", "group__grp__sessionize.html", null ],
+    [ "Stemming", "group__grp__stemmer.html", null ]
+];
\ No newline at end of file

[26/51] [partial] madlib-site git commit: Doc: Add v1.15.1 documentation

Reply via email to