http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/bed9253d/docs/v1.9.1/group__grp__countmin.html
----------------------------------------------------------------------
diff --git a/docs/v1.9.1/group__grp__countmin.html 
b/docs/v1.9.1/group__grp__countmin.html
new file mode 100644
index 0000000..b1e754a
--- /dev/null
+++ b/docs/v1.9.1/group__grp__countmin.html
@@ -0,0 +1,259 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.10"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: CountMin (Cormode-Muthukrishnan)</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+  $(window).load(resizeHeight);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.net');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.net"><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.9.1</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.10 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__countmin.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">CountMin (Cormode-Muthukrishnan)<div class="ingroups"><a 
class="el" href="group__grp__early__stage.html">Early Stage Development</a> 
&raquo; <a class="el" href="group__grp__sketches.html">Cardinality 
Estimators</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> <ul>
+<li>
+<a href="#syntax">Syntax</a> </li>
+<li>
+<a href="#examples">Examples</a> </li>
+<li>
+<a href="#literature">Literature</a> </li>
+<li>
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><dl class="section warning"><dt>Warning</dt><dd><em> This MADlib method 
is still in early stage development. There may be some issues that will be 
addressed in a future version. Interface and implementation are subject to 
change. </em></dd></dl>
+<p>This module implements Cormode-Muthukrishnan <em>CountMin</em> sketches on 
integer values, implemented as a user-defined aggregate. It also provides 
scalar functions over the sketches to produce approximate counts, order 
statistics, and histograms.</p>
+<p><a class="anchor" id="syntax"></a></p><dl class="section 
user"><dt>Syntax</dt><dd><ul>
+<li>Get a sketch of a selected column specified by <em>col_name</em>. <pre 
class="syntax">
+cmsketch( col_name )
+</pre></li>
+<li>Get the number of rows where <em>col_name = p</em>, computed from the 
sketch obtained from <code>cmsketch</code>. <pre class="syntax">
+cmsketch_count( cmsketch,
+                p
+              )
+</pre></li>
+<li>Get the number of rows where <em>col_name</em> is between <em>m</em> and 
<em>n</em> inclusive. <pre class="syntax">
+cmsketch_rangecount( cmsketch,
+                     m,
+                     n
+                   )
+</pre></li>
+<li>Get the <em>k</em>th percentile of <em>col_name</em> where <em>count</em> 
specifies number of rows. <em>k</em> should be an integer between 1 and 99. <pre 
class="syntax">
+cmsketch_centile( cmsketch,
+                  k,
+                  count
+                )
+</pre></li>
+<li>Get the median of col_name where <em>count</em> specifies number of rows. 
This is equivalent to <code><a class="el" 
href="sketch_8sql__in.html#a2f2ab2fe3244515f5f73d49690e73b39">cmsketch_centile</a>(<em>cmsketch</em>,50,<em>count</em>)</code>.
 <pre class="syntax">
+cmsketch_median( cmsketch,
+                 count
+               )
+</pre></li>
+<li>Get an n-bucket histogram for values between min and max for the column 
where each bucket has approximately the same width. The output is a text string 
containing triples {lo, hi, count} representing the buckets; counts are 
approximate. <pre class="syntax">
+cmsketch_width_histogram( cmsketch,
+                          min,
+                          max,
+                          n
+                        )
+</pre></li>
+<li>Get an n-bucket histogram for the column where each bucket has 
approximately the same count. The output is a text string containing triples 
{lo, hi, count} representing the buckets; counts are approximate. Note that an 
equi-depth histogram is equivalent to a spanning set of equi-spaced centiles. 
<pre class="syntax">
+cmsketch_depth_histogram( cmsketch,
+                          n
+                        )
+</pre></li>
+</ul>
+</dd></dl>
+<p><a class="anchor" id="examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd></dd></dl>
+<ol type="1">
+<li>Generate some data. <pre class="example">
+CREATE TABLE data(class INT, a1 INT);
+INSERT INTO data SELECT 1,1 FROM generate_series(1,10000);
+INSERT INTO data SELECT 1,2 FROM generate_series(1,15000);
+INSERT INTO data SELECT 1,3 FROM generate_series(1,10000);
+INSERT INTO data SELECT 2,5 FROM generate_series(1,1000);
+INSERT INTO data SELECT 2,6 FROM generate_series(1,1000);
+</pre></li>
+<li>Count number of rows where a1 = 2 in each class. <pre class="example">
+SELECT class,
+       cmsketch_count(
+                       cmsketch( a1 ),
+                       2
+                      )
+FROM data GROUP BY data.class;
+</pre> Result: <pre class="result">
+ class | cmsketch_count
+&#160;------+----------------
+     2 |              0
+     1 |          15000
+(2 rows)
+</pre></li>
+<li>Count number of rows where a1 is between 3 and 6. <pre class="example">
+SELECT class,
+       cmsketch_rangecount(
+                            cmsketch(a1),
+                            3,
+                            6
+                          )
+FROM data GROUP BY data.class;
+</pre> Result: <pre class="result">
+ class | cmsketch_rangecount
+&#160;------+---------------------
+     2 |                2000
+     1 |               10000
+(2 rows)
+</pre></li>
+<li>Compute the 90th percentile of all of a1. <pre class="example">
+SELECT cmsketch_centile(
+                         cmsketch( a1 ),
+                         90,
+                         count(*)
+                       )
+FROM data;
+</pre> Result: <pre class="result">
+ cmsketch_centile
+&#160;-----------------
+                3
+(1 row)
+</pre></li>
+<li>Produce an equi-width histogram with 2 bins between 0 and 10. <pre 
class="example">
+SELECT cmsketch_width_histogram(
+                                 cmsketch( a1 ),
+                                 0,
+                                 10,
+                                 2
+                               )
+FROM data;
+</pre> Result: <pre class="result">
+      cmsketch_width_histogram
+&#160;-----------------------------------
+ [[0L, 4L, 35000], [5L, 10L, 2000]]
+(1 row)
+</pre></li>
+<li>Produce an equi-depth histogram of a1 with 2 bins of approximately equal 
depth. <pre class="example">
+SELECT cmsketch_depth_histogram(
+                                 cmsketch( a1 ),
+                                 2
+                               )
+FROM data;
+</pre> Result: <pre class="result">
+                       cmsketch_depth_histogram
+&#160;----------------------------------------------------------------------
+ [[-9223372036854775807L, 1, 10000], [2, 9223372036854775807L, 27000]]
+(1 row)
+</pre></li>
+</ol>
+<p><a class="anchor" id="literature"></a></p><dl class="section 
user"><dt>Literature</dt><dd></dd></dl>
+<p>[1] G. Cormode and S. Muthukrishnan. An improved data stream summary: The 
count-min sketch and its applications. LATIN 2004, J. Algorithm 55(1): 58-75 
(2005). <a 
href="http://dimacs.rutgers.edu/~graham/pubs/html/CormodeMuthukrishnan04CMLatin.html">http://dimacs.rutgers.edu/~graham/pubs/html/CormodeMuthukrishnan04CMLatin.html</a></p>
+<p>[2] G. Cormode. Encyclopedia entry on 'Count-Min Sketch'. In L. Liu and M. 
T. Ozsu, editors, Encyclopedia of Database Systems, pages 511-516. Springer, 
2009. <a 
href="http://dimacs.rutgers.edu/~graham/pubs/html/Cormode09b.html">http://dimacs.rutgers.edu/~graham/pubs/html/Cormode09b.html</a></p>
+<p><a class="anchor" id="related"></a>File <a class="el" 
href="sketch_8sql__in.html" title="SQL functions for sketch-based 
approximations of descriptive statistics. ">sketch.sql_in</a> documenting the 
SQL functions.</p>
+<p>Module grp_quantile for a different implementation of quantile function. 
</p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Tue Sep 20 2016 11:27:01 for MADlib by
+    <a href="http://www.doxygen.org/index.html">
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.10 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/bed9253d/docs/v1.9.1/group__grp__cox__prop__hazards.html
----------------------------------------------------------------------
diff --git a/docs/v1.9.1/group__grp__cox__prop__hazards.html 
b/docs/v1.9.1/group__grp__cox__prop__hazards.html
new file mode 100644
index 0000000..c222769
--- /dev/null
+++ b/docs/v1.9.1/group__grp__cox__prop__hazards.html
@@ -0,0 +1,470 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.10"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Cox-Proportional Hazards Regression</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+  $(window).load(resizeHeight);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.net');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.net"><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.9.1</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.10 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__cox__prop__hazards.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Cox-Proportional Hazards Regression<div class="ingroups"><a 
class="el" href="group__grp__super.html">Supervised Learning</a> &raquo; <a 
class="el" href="group__grp__regml.html">Regression Models</a></div></div>  
</div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> <ul>
+<li class="level1">
+<a href="#training">Training Function</a> </li>
+<li class="level1">
+<a href="#cox_zph">PHA Test Function</a> </li>
+<li class="level1">
+<a href="#predict">Prediction Function</a> </li>
+<li class="level1">
+<a href="#examples">Examples</a> </li>
+<li class="level1">
+<a href="#background">Technical Background</a> </li>
+<li class="level1">
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>Proportional-Hazard models enable the comparison of various survival 
models. These survival models are functions describing the probability of a 
one-item event (prototypically, this event is death) with respect to time. The 
interval of time before the occurrence of death can be called the survival 
time. Let T be a random variable representing the survival time, with a 
cumulative probability function P(t). Informally, P(t) is the probability that 
death has happened before time t.</p>
+<p><a class="anchor" id="training"></a></p><dl class="section 
user"><dt>Training Function</dt><dd></dd></dl>
+<p>Following is the syntax for the <a class="el" 
href="cox__prop__hazards_8sql__in.html#a737450bbfe0f10204b0074a9d45b0cef" 
title="Compute cox-regression coefficients and diagnostic statistics. 
">coxph_train()</a> training function: </p><pre class="syntax">
+coxph_train( source_table,
+             output_table,
+             dependent_variable,
+             independent_variable,
+             right_censoring_status,
+             strata,
+             optimizer_params
+           )
+</pre><p> <b>Arguments</b> </p><dl class="arglist">
+<dt>source_table </dt>
+<dd>TEXT. The name of the table containing input data. </dd>
+<dt>output_table </dt>
+<dd><p class="startdd">TEXT. The name of the table where the output model is 
saved. The output is saved in the table named by the <em>output_table</em> 
argument. It has the following columns: </p><table  class="output">
+<tr>
+<th>coef </th><td>FLOAT8[]. Vector of the coefficients.  </td></tr>
+<tr>
+<th>loglikelihood </th><td>FLOAT8. Log-likelihood value of the MLE estimate.  
</td></tr>
+<tr>
+<th>std_err </th><td>FLOAT8[]. Vector of the standard error of the 
coefficients.  </td></tr>
+<tr>
+<th>z_stats </th><td>FLOAT8[]. Vector of the z-statistics of the coefficients.  
</td></tr>
+<tr>
+<th>p_values </th><td>FLOAT8[]. Vector of the p-values of the coefficients.  
</td></tr>
+<tr>
+<th>hessian </th><td>FLOAT8[]. The Hessian matrix computed using the final 
solution.  </td></tr>
+<tr>
+<th>num_iterations </th><td>INTEGER. The number of iterations performed by the 
optimizer.  </td></tr>
+</table>
+<p>Additionally, a summary output table is generated that contains a summary 
of the parameters used for building the Cox model. It is stored in a table 
named &lt;output_table&gt;_summary. It has the following columns: </p><table  
class="output">
+<tr>
+<th>source_table </th><td>The source table name.  </td></tr>
+<tr>
+<th>dependent_variable </th><td>The dependent variable name.  </td></tr>
+<tr>
+<th>independent_variable </th><td>The independent variable name.  </td></tr>
+<tr>
+<th>right_censoring_status </th><td>The right censoring status  </td></tr>
+<tr>
+<th>strata </th><td>The stratification columns  </td></tr>
+<tr>
+<th>num_processed </th><td>The number of rows that were actually used in the 
computation.  </td></tr>
+<tr>
+<th>num_missing_rows_skipped </th><td>The number of rows that were skipped in 
the computation due to NULL values in them.  </td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>dependent_variable </dt>
+<dd>TEXT. The name of the dependent variable. Dependent variables refer to 
the time of death. There is no need to pre-sort the data. </dd>
+<dt>independent_variable </dt>
+<dd>TEXT. A string containing the name of a column that contains an array of 
numeric values, or a string expression in the format 'ARRAY[1, x1, x2, x3]', 
where <em>x1</em>, <em>x2</em> and <em>x3</em> are column names. These are the 
independent variables. </dd>
+<dt>right_censoring_status (optional) </dt>
+<dd>TEXT, default: TRUE for all observations. A string containing an 
expression that evaluates to the right-censoring status for the 
observation&mdash;TRUE if the observation is not censored and FALSE if the 
observation is censored. The string could contain the name of the column 
containing the right-censoring status, a fixed Boolean expression (i.e., 
'true', 'false', '0', '1') that applies to all observations, or a Boolean 
expression such as 'column_name &lt; 10' that can be evaluated for each 
observation. </dd>
+<dt>strata (optional) </dt>
+<dd>VARCHAR, default: NULL, which does not do any stratifications. A string of 
comma-separated column names that are the strata ID variables used to do 
stratification. </dd>
+<dt>optimizer_params (optional) </dt>
+<dd><p class="startdd">VARCHAR, default: NULL, which uses the default values 
of optimizer parameters: max_iter=100, optimizer=newton, tolerance=1e-8, 
array_agg_size=10000000, sample_size=1000000. It should be a string that 
contains 'key=value' pairs separated by commas. The meanings of these 
parameters are:</p>
+<ul>
+<li>max_iter &mdash; The maximum number of iterations. The computation stops 
if the number of iterations exceeds this, which usually means that there is no 
convergence.</li>
+<li>optimizer &mdash; The optimization method. Right now, "newton" is the only 
one supported.</li>
+<li>tolerance &mdash; The stopping criteria. When the difference between the 
log-likelihoods of two consecutive iterations is smaller than this number, the 
computation has already converged and stops.</li>
+<li>array_agg_size &mdash; To speed up the computation, the original data 
table is cut into multiple pieces, and each piece of the data is aggregated 
into one big row. In the process of computation, the whole big row is loaded 
into memory, which speeds up the computation. This parameter controls 
approximately how many numbers we want to put into one big row. A larger value of 
array_agg_size may speed up the computation more, but the size of the big row cannot exceed 1GB 
due to the restriction of PostgreSQL databases.</li>
+<li>sample_size &mdash; To cut the data into approximately equal pieces, we 
first sample the data, and then find out the break points using this sampled 
data. A larger sample_size produces more accurate break points.  </li>
+</ul>
+</dd>
+</dl>
+<p><a class="anchor" id="cox_zph"></a></p><dl class="section 
user"><dt>Proportional Hazards Assumption Test Function</dt><dd></dd></dl>
+<p>The <a class="el" 
href="cox__prop__hazards_8sql__in.html#a682d95d5475ce33e47937067cadc2766" 
title="Test the proportional hazards assumption for a Cox regression model fit 
(coxph_train) ...">cox_zph()</a> function tests the proportional hazards 
assumption (PHA) of a Cox regression.</p>
+<p>Proportional-hazard models enable the comparison of various survival 
models. These PH models, however, assume that the hazard for a given individual 
is a fixed proportion of the hazard for any other individual, and the ratio of 
the hazards is constant across time. MADlib does not currently have support for 
performing any transformation of the time to compute the correlation.</p>
+<p>The <a class="el" 
href="cox__prop__hazards_8sql__in.html#a682d95d5475ce33e47937067cadc2766" 
title="Test the proportional hazards assumption for a Cox regression model fit 
(coxph_train) ...">cox_zph()</a> function is used to test this assumption by 
computing the correlation of the residual of the <a class="el" 
href="cox__prop__hazards_8sql__in.html#a737450bbfe0f10204b0074a9d45b0cef" 
title="Compute cox-regression coefficients and diagnostic statistics. 
">coxph_train()</a> model with time.</p>
+<p>Following is the syntax for the <a class="el" 
href="cox__prop__hazards_8sql__in.html#a682d95d5475ce33e47937067cadc2766" 
title="Test the proportional hazards assumption for a Cox regression model fit 
(coxph_train) ...">cox_zph()</a> function: </p><pre class="syntax">
+cox_zph(cox_model_table, output_table)
+</pre><p> <b>Arguments</b> </p><dl class="arglist">
+<dt>cox_model_table </dt>
+<dd><p class="startdd">TEXT. The name of the table containing the Cox 
Proportional-Hazards model.</p>
+<p class="enddd"></p>
+</dd>
+<dt>output_table </dt>
+<dd>TEXT. The name of the table where the test statistics are saved. The 
output table is named by the <em>output_table</em> argument and has the 
following columns: <table  class="output">
+<tr>
+<th>covariate </th><td>TEXT. The independent variables.  </td></tr>
+<tr>
+<th>rho </th><td>FLOAT8[]. Vector of the correlation coefficients between 
survival time and the scaled Schoenfeld residuals.  </td></tr>
+<tr>
+<th>chi_square </th><td>FLOAT8[]. Chi-square test statistic for the 
correlation analysis.  </td></tr>
+<tr>
+<th>p_value </th><td>FLOAT8[]. Two-sided p-value for the chi-square statistic.  
</td></tr>
+</table>
+</dd>
+</dl>
+<p>Additionally, the residual values are outputted to the table named 
<em>output_table</em>_residual. The table contains the following columns: 
</p><table  class="output">
+<tr>
+<th>&lt;dep_column_name&gt; </th><td>FLOAT8. Time values (dependent variable) 
present in the original source table.   </td></tr>
+<tr>
+<th>residual </th><td>FLOAT8[]. Difference between the original covariate 
values and the expectation of the covariates obtained from the coxph_train 
model.  </td></tr>
+<tr>
+<th>scaled_residual </th><td>Residual values scaled by the variance of the 
coefficients.  </td></tr>
+</table>
+<p><a class="anchor" id="notes"></a></p><dl class="section 
user"><dt>Notes</dt><dd></dd></dl>
+<ul>
+<li>Table names can be optionally schema qualified (current_schemas() is used 
if a schema name is not provided) and table and column names should follow 
case-sensitivity and quoting rules per the database. For instance, 'mytable' 
and 'MyTable' both resolve to the same entity&mdash;'mytable'. If mixed-case or 
multi-byte characters are desired for entity names then the string should be 
double-quoted; in this case the input would be '"MyTable"'.</li>
+<li>The <a class="el" 
href="cox__prop__hazards_8sql__in.html#a3310cf98478b7c1e400e8fb1b3965d30">cox_prop_hazards_regr()</a>
 and <a class="el" 
href="cox__prop__hazards_8sql__in.html#ad778b289eb19ae0bb2b7e02a89bab3bc" 
title="Cox regression training function. ">cox_prop_hazards()</a> functions 
have been deprecated; <a class="el" 
href="cox__prop__hazards_8sql__in.html#a737450bbfe0f10204b0074a9d45b0cef" 
title="Compute cox-regression coefficients and diagnostic statistics. 
">coxph_train()</a> should be used instead.</li>
+</ul>
+<p><a class="anchor" id="predict"></a></p><dl class="section 
user"><dt>Prediction Function</dt><dd>The prediction function is provided to 
calculate the linear predictors, risk or the linear terms for the given 
prediction data. It has the following syntax: <pre class="syntax">
+coxph_predict(model_table,
+              source_table,
+              id_col_name,
+              output_table,
+              pred_type,
+              reference)
+</pre></dd></dl>
+<p><b>Arguments</b> </p><dl class="arglist">
+<dt>model_table </dt>
+<dd><p class="startdd">TEXT. Name of the table containing the cox model.</p>
+<p class="enddd"></p>
+</dd>
+<dt>source_table </dt>
+<dd><p class="startdd">TEXT. Name of the table containing the prediction 
data.</p>
+<p class="enddd"></p>
+</dd>
+<dt>id_col_name </dt>
+<dd><p class="startdd">TEXT. Name of the id column in the source table.</p>
+<p class="enddd"></p>
+</dd>
+<dt>output_table </dt>
+<dd><p class="startdd">TEXT. Name of the table to store the prediction results 
in. The output table is named by the <em>output_table</em> argument and has the 
following columns: </p><table  class="output">
+<tr>
+<th>id </th><td>TEXT. The id column name from the source table.  </td></tr>
+<tr>
+<th>predicted_result </th><td>DOUBLE PRECISION. Result of prediction based on 
the value of the prediction type parameter.  </td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>pred_type </dt>
+<dd><p class="startdd">TEXT, OPTIONAL. Type of prediction. This can be one of 
'linear_predictors', 'risk', or 'terms'. DEFAULT='linear_predictors'.</p><ul>
+<li>'linear_predictors' calculates the dot product of the independent 
variables and the coefficients.</li>
+<li>'risk' is the exponentiated value of the linear prediction.</li>
+<li>'terms' correspond to the linear terms obtained by multiplying the 
independent variables with their corresponding coefficients values (without 
further calculating the sum of these terms) </li>
+</ul>
+<p class="enddd"></p>
+</dd>
+<dt>reference </dt>
+<dd>TEXT, OPTIONAL. Reference level to use for centering predictions. Can be 
one of 'strata', 'overall'. DEFAULT='strata'. Note that R uses 'sample' instead 
of 'overall' when referring to the overall mean value of the covariates as 
being the reference level. </dd>
+</dl>
+<p><a class="anchor" id="examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd></dd></dl>
+<ol type="1">
+<li>View online help for the proportional hazards training method. <pre 
class="example">
+SELECT madlib.coxph_train();
+</pre></li>
+<li>Create an input data set. <pre class="example">
+DROP TABLE IF EXISTS sample_data;
+CREATE TABLE sample_data (
+    id INTEGER NOT NULL,
+    grp DOUBLE PRECISION,
+    wbc DOUBLE PRECISION,
+    timedeath INTEGER,
+    status BOOLEAN
+);
+COPY sample_data FROM STDIN WITH DELIMITER '|';
+  0 |   0 | 1.45 |        35 | t
+  1 |   0 | 1.47 |        34 | t
+  3 |   0 |  2.2 |        32 | t
+  4 |   0 | 1.78 |        25 | t
+  5 |   0 | 2.57 |        23 | t
+  6 |   0 | 2.32 |        22 | t
+  7 |   0 | 2.01 |        20 | t
+  8 |   0 | 2.05 |        19 | t
+  9 |   0 | 2.16 |        17 | t
+ 10 |   0 |  3.6 |        16 | t
+ 11 |   1 |  2.3 |        15 | t
+ 12 |   0 | 2.88 |        13 | t
+ 13 |   1 |  1.5 |        12 | t
+ 14 |   0 |  2.6 |        11 | t
+ 15 |   0 |  2.7 |        10 | t
+ 16 |   0 |  2.8 |         9 | t
+ 17 |   1 | 2.32 |         8 | t
+ 18 |   0 | 4.43 |         7 | t
+ 19 |   0 | 2.31 |         6 | t
+ 20 |   1 | 3.49 |         5 | t
+ 21 |   1 | 2.42 |         4 | t
+ 22 |   1 | 4.01 |         3 | t
+ 23 |   1 | 4.91 |         2 | t
+ 24 |   1 |    5 |         1 | t
+\.
+</pre></li>
+<li>Run the Cox regression function. <pre class="example">
+SELECT madlib.coxph_train( 'sample_data',
+                           'sample_cox',
+                           'timedeath',
+                           'ARRAY[grp,wbc]',
+                           'status'
+                         );
+</pre></li>
+<li>View the results of the regression. <pre class="example">
+\x on
+SELECT * FROM sample_cox;
+</pre> Results: <pre class="result">
+-[ RECORD 1 
]--+----------------------------------------------------------------------------
+coef           | {2.54407073265254,1.67172094779487}
+loglikelihood  | -37.8532498733
+std_err        | {0.677180599294897,0.387195514577534}
+z_stats        | {3.7568570855419,4.31751114064138}
+p_values       | {0.000172060691513886,1.5779844638453e-05}
+hessian        | 
{{2.78043065745617,-2.25848560642414},{-2.25848560642414,8.50472838284472}}
+num_iterations | 5
+</pre></li>
+<li>Compute predictions using the Cox model. (This example uses the original 
data table to perform the prediction. Typically a different test dataset with 
the same features as the original training dataset would be used.) <pre 
class="example">
+\x off
+-- Display the linear predictors for the original dataset
+SELECT madlib.coxph_predict('sample_cox',
+                            'sample_data',
+                            'id',
+                            'sample_pred');
+</pre> <pre class="result">
+SELECT * FROM sample_pred;
+ id |  predicted_value
+----+--------------------
+  0 |  -2.97110918125034
+  4 |  -2.41944126847803
+  6 |   -1.5167119566688
+  8 |  -1.96807661257341
+ 10 |  0.623090856508638
+ 12 |  -0.58054822590367
+ 14 |  -1.04863009128623
+ 16 | -0.714285901727259
+ 18 |   2.01061924317838
+ 20 |   2.98327228490375
+ 22 |   3.85256717775708
+ 24 |     5.507570916074
+  1 |  -2.93767476229444
+  3 |  -1.71731847040418
+  5 |  -1.09878171972008
+  7 |  -2.03494545048521
+  9 |  -1.78418730831598
+ 15 | -0.881457996506747
+ 19 |  -1.53342916614675
+ 11 |  0.993924357027849
+ 13 | -0.343452401208048
+ 17 |   1.02735877598375
+ 21 |   1.19453087076323
+ 23 |   5.35711603077246
+(24 rows)
+</pre> <pre class="example">
+-- Display the relative risk for the original dataset
+SELECT madlib.coxph_predict('sample_cox',
+                            'sample_data',
+                            'id',
+                            'sample_pred',
+                            'risk');
+</pre> <pre class="result">
+ id |  predicted_value
+ ----+--------------------
+  1 | 0.0529887971503509
+  3 |  0.179546963459175
+  5 |   0.33327686110022
+  7 |  0.130687611255372
+  9 |  0.167933483703554
+ 15 |  0.414178600294289
+ 19 |  0.215794402223054
+ 11 |   2.70181658768287
+ 13 |  0.709317242984782
+ 17 |   2.79367735395696
+ 21 |   3.30200833843654
+ 23 |   212.112338046551
+  0 | 0.0512464372091503
+  4 | 0.0889713146524469
+  6 |  0.219432204682557
+  8 |  0.139725343898993
+ 10 |   1.86468261037506
+ 12 |  0.559591499901241
+ 14 |  0.350417460388247
+ 16 |  0.489541567796517
+ 18 |   7.46794038691975
+ 20 |   19.7523463121038
+ 22 |   47.1138577624204
+ 24 |   246.551504798816
+(24 rows)
+</pre></li>
+<li>Run the test for Proportional Hazards assumption to obtain correlation 
between residuals and time. <pre class="example">
+SELECT madlib.cox_zph( 'sample_cox',
+                       'sample_zph_output'
+                     );
+</pre></li>
+<li>View results of the PHA test. <pre class="example">
+SELECT * FROM sample_zph_output;
+</pre> Results: <pre class="result">
+-[ RECORD 1 ]-----------------------------------------
+covariate  | ARRAY[grp,wbc]
+rho        | {0.00237308357328641,0.0375600568840431}
+chi_square | {0.000100675718191977,0.0232317400546175}
+p_value    | {0.991994376850758,0.878855984657948}
+</pre></li>
+</ol>
+<p><a class="anchor" id="background"></a></p><dl class="section 
user"><dt>Technical Background</dt><dd></dd></dl>
+<p>Generally, proportional-hazard models start with a list of <img 
class="formulaInl" alt="$ \boldsymbol n $" src="form_382.png"/> observations, 
each with <img class="formulaInl" alt="$ \boldsymbol m $" src="form_383.png"/> 
covariates and a time of death. From this <img class="formulaInl" alt="$ 
\boldsymbol n \times m $" src="form_384.png"/> matrix, we would like to derive 
the correlation between the covariates and the hazard function. This amounts to 
finding the parameters <img class="formulaInl" alt="$ \boldsymbol \beta $" 
src="form_385.png"/> that best fit the model described below.</p>
+<p>Let us define:</p><ul>
+<li><img class="formulaInl" alt="$ \boldsymbol t \in \mathbf R^{m} $" 
src="form_386.png"/> denote the vector of observed dependent variables, with 
<img class="formulaInl" alt="$ n $" src="form_10.png"/> rows.</li>
+<li><img class="formulaInl" alt="$ X \in \mathbf R^{m} $" src="form_387.png"/> 
denote the design matrix with <img class="formulaInl" alt="$ m $" 
src="form_292.png"/> columns and <img class="formulaInl" alt="$ n $" 
src="form_10.png"/> rows, containing all observed vectors of independent 
variables <img class="formulaInl" alt="$ \boldsymbol x_i $" src="form_99.png"/> 
as rows.</li>
+<li><img class="formulaInl" alt="$ R(t_i) $" src="form_388.png"/> denote the 
set of observations still alive at time <img class="formulaInl" alt="$ t_i $" 
src="form_389.png"/></li>
+</ul>
+<p>Note that this model <b>does not</b> include a <b>constant term</b>, and 
the data cannot contain a column of 1s.</p>
+<p>By definition, </p><p class="formulaDsp">
+<img class="formulaDsp" alt="\[ P[T_k = t_i | \boldsymbol R(t_i)] = 
\frac{e^{\beta^T x_k} }{ \sum_{j \in R(t_i)} e^{\beta^T x_j}}. \,. \]" 
src="form_390.png"/>
+</p>
+<p>The <b>partial likelihood </b>function can now be generated as the product 
of conditional probabilities: </p><p class="formulaDsp">
+<img class="formulaDsp" alt="\[ \mathcal L = \prod_{i = 1}^n \left( 
\frac{e^{\beta^T x_i}}{ \sum_{j \in R(t_i)} e^{\beta^T x_j}} \right). \]" 
src="form_391.png"/>
+</p>
+<p>The log-likelihood form of this equation is </p><p class="formulaDsp">
+<img class="formulaDsp" alt="\[ L = \sum_{i = 1}^n \left[ \beta^T x_i - 
\log\left(\sum_{j \in R(t_i)} e^{\beta^T x_j }\right) \right]. \]" 
src="form_392.png"/>
+</p>
+<p>Using the score function (the gradient of this log-likelihood) and its 
Hessian matrix, the partial likelihood can be 
maximized using the <b> Newton-Raphson algorithm</b>. <b>Breslow's method</b> 
is used to resolve tied times of death. The times of death for two records are 
considered "equal" if they differ by less than 1.0e-6.</p>
+<p>The inverse of the Hessian matrix, evaluated at the estimate of <img 
class="formulaInl" alt="$ \boldsymbol \beta $" src="form_385.png"/>, can be 
used as an <b>approximate variance-covariance matrix </b> for the estimate, and 
used to produce approximate <b>standard errors</b> for the regression 
coefficients.</p>
+<p class="formulaDsp">
+<img class="formulaDsp" alt="\[ \mathit{se}(c_i) = \left( (H)^{-1} 
\right)_{ii} \,. \]" src="form_393.png"/>
+</p>
+<p> The Wald z-statistic is </p><p class="formulaDsp">
+<img class="formulaDsp" alt="\[ z_i = \frac{c_i}{\mathit{se}(c_i)} \,. \]" 
src="form_109.png"/>
+</p>
+<p>The Wald <img class="formulaInl" alt="$ p $" src="form_110.png"/>-value for 
coefficient <img class="formulaInl" alt="$ i $" src="form_32.png"/> gives the 
probability (under the assumptions inherent in the Wald test) of seeing a value 
at least as extreme as the one observed, provided that the null hypothesis ( 
<img class="formulaInl" alt="$ c_i = 0 $" src="form_111.png"/>) is true. 
Letting <img class="formulaInl" alt="$ F $" src="form_112.png"/> denote the 
cumulative distribution function of a standard normal distribution, the Wald <img 
class="formulaInl" alt="$ p $" src="form_110.png"/>-value for coefficient <img 
class="formulaInl" alt="$ i $" src="form_32.png"/> is therefore </p><p 
class="formulaDsp">
+<img class="formulaDsp" alt="\[ p_i = \Pr(|Z| \geq |z_i|) = 2 \cdot (1 - F( 
|z_i| )) \]" src="form_113.png"/>
+</p>
+<p> where <img class="formulaInl" alt="$ Z $" src="form_114.png"/> is a 
standard normally distributed random variable.</p>
+<p>The condition number is computed as <img class="formulaInl" alt="$ 
\kappa(H) $" src="form_394.png"/> during the iteration immediately 
<em>preceding</em> convergence (i.e., <img class="formulaInl" alt="$ A $" 
src="form_13.png"/> is computed using the coefficients of the previous 
iteration). A large condition number (say, more than 1000) indicates the 
presence of significant multicollinearity.</p>
+<p><a class="anchor" id="Literature"></a></p><dl class="section 
user"><dt>Literature</dt><dd></dd></dl>
+<p>A somewhat random selection of nice write-ups, with valuable pointers into 
further literature:</p>
+<p>[1] John Fox: Cox Proportional-Hazards Regression for Survival Data, 
Appendix to An R and S-PLUS companion to Applied Regression Feb 2012, <a 
href="http://cran.r-project.org/doc/contrib/Fox-Companion/appendix-cox-regression.pdf">http://cran.r-project.org/doc/contrib/Fox-Companion/appendix-cox-regression.pdf</a></p>
+<p>[2] Stephen J Walters: What is a Cox model? <a 
href="http://www.medicine.ox.ac.uk/bandolier/painres/download/whatis/cox_model.pdf">http://www.medicine.ox.ac.uk/bandolier/painres/download/whatis/cox_model.pdf</a></p>
+<p><a class="anchor" id="notes"></a></p><dl class="section 
user"><dt>Notes</dt><dd></dd></dl>
+<p>If the number of ties in the source table is very large, a memory allocation 
error may be raised. The limit is about <img class="formulaInl" 
alt="$(10^8 / m)$" src="form_395.png"/>, where <img class="formulaInl" 
alt="$m$" src="form_314.png"/> is the number of features. For instance, if there 
are 100 features, the number of ties should be fewer than 1 million.</p>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
Topics</dt><dd></dd></dl>
+<p>File <a class="el" href="cox__prop__hazards_8sql__in.html" title="SQL 
functions for cox proportional hazards. ">cox_prop_hazards.sql_in</a> 
documenting the functions</p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Tue Sep 20 2016 11:27:01 for MADlib by
+    <a href="http://www.doxygen.org/index.html">
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.10 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/bed9253d/docs/v1.9.1/group__grp__crf.html
----------------------------------------------------------------------
diff --git a/docs/v1.9.1/group__grp__crf.html b/docs/v1.9.1/group__grp__crf.html
new file mode 100644
index 0000000..fa0d65a
--- /dev/null
+++ b/docs/v1.9.1/group__grp__crf.html
@@ -0,0 +1,620 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.10"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Conditional Random Field</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+  $(window).load(resizeHeight);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.net');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.net"><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.9.1</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.10 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__crf.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Conditional Random Field<div class="ingroups"><a class="el" 
href="group__grp__super.html">Supervised Learning</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> <ul>
+<li>
+<a href="#train_feature">Training Feature Generation</a> </li>
+<li>
+<a href="#train">CRF Training Function</a> </li>
+<li>
+<a href="#test_feature">Testing Feature Generation</a> </li>
+<li>
+<a href="#inference">Inference using Viterbi</a> </li>
+<li>
+<a href="#usage">Using CRF</a> </li>
+<li>
+<a href="#examples">Examples</a> </li>
+<li>
+<a href="#background">Technical Background</a> </li>
+<li>
+<a href="#literature">Literature</a> </li>
+<li>
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>A conditional random field (CRF) is a type of discriminative, 
undirected probabilistic graphical model. A linear-chain CRF is a special type 
of CRF that assumes the current state depends only on the previous state.</p>
+<p>Feature extraction modules are provided for text-analysis tasks such as 
part-of-speech (POS) tagging and named-entity resolution (NER). Currently, six 
feature types are implemented:</p>
+<ul>
+<li>Edge Feature: transition feature that encodes the transition feature 
weight from current label to next label.</li>
+<li>Start Feature: fired when the current token is the first token in a 
sequence.</li>
+<li>End Feature: fired when the current token is the last token in a 
sequence.</li>
+<li>Word Feature: fired when the current token is observed in the trained 
dictionary.</li>
+<li>Unknown Feature: fired when the current token is not observed in the 
trained dictionary for at least a certain number of times (default 1).</li>
+<li>Regex Feature: fired when the current token can be matched by a regular 
expression.</li>
+</ul>
+<p>A Viterbi implementation is also provided to get the best label sequence 
and the conditional probability <img class="formulaInl" alt="$ \Pr( \text{best 
label sequence} \mid \text{sequence}) $" src="form_55.png"/>.</p>
+<p>The following steps are required for CRF Learning and Inference:</p><ol 
type="1">
+<li><a href="#train_feature">Training Feature Generation</a></li>
+<li><a href="#train">CRF Training</a></li>
+<li><a href="#test_feature">Testing Feature Generation</a></li>
+<li><a href="#inference">Inference using Viterbi</a></li>
+</ol>
+<p><a class="anchor" id="train_feature"></a></p><dl class="section 
user"><dt>Training Feature Generation</dt><dd>The function takes 
<code>train_segment_tbl</code>, <code>regex_tbl</code> and 
<code>label_tbl</code> as input and generates three tables, 
<code>dictionary_tbl</code>, 
<code>train_feature_tbl</code> and <code>train_featureset_tbl</code>, that are 
required as input for CRF training. <pre class="syntax">
+crf_train_fgen(train_segment_tbl,
+               regex_tbl,
+               label_tbl,
+               dictionary_tbl,
+               train_feature_tbl,
+               train_featureset_tbl)
+</pre> <b>Arguments</b> <dl class="arglist">
+<dt>train_segment_tbl </dt>
+<dd>TEXT. Name of the training segment table. The table is expected to have 
the following columns: <table  class="output">
+<tr>
+<th>doc_id </th><td>INTEGER. Document id column  </td></tr>
+<tr>
+<th>start_pos </th><td>INTEGER. Index of a particular term in the respective 
document  </td></tr>
+<tr>
+<th>seg_text </th><td>TEXT. Term at the respective <code>start_pos</code> in 
the document  </td></tr>
+<tr>
+<th>label </th><td>INTEGER. Label id for the term corresponding to the actual 
label from <code>label_tbl</code>   </td></tr>
+</table>
+</dd>
+<dt>regex_tbl </dt>
+<dd>TEXT. Name of the regular expression table. The table is expected to have 
the following columns: <table  class="output">
+<tr>
+<th>pattern </th><td>TEXT. Regular Expression  </td></tr>
+<tr>
+<th>name </th><td>TEXT. Regular Expression name  </td></tr>
+</table>
+</dd>
+<dt>label_tbl </dt>
+<dd>TEXT. Name of the table containing unique labels and their id's. The table 
is expected to have the following columns: <table  class="output">
+<tr>
+<th>id </th><td>INTEGER. Unique label id. NOTE: Must range from 0 to total 
number of labels in the table - 1.   </td></tr>
+<tr>
+<th>label </th><td>TEXT. Label name  </td></tr>
+</table>
+</dd>
+<dt>dictionary_tbl </dt>
+<dd>TEXT. Name of the dictionary table to be created containing unique terms 
along with their counts. The table will have the following columns: <table  
class="output">
+<tr>
+<th>token </th><td>TEXT. Contains all the unique terms found in 
<code>train_segment_tbl</code>   </td></tr>
+<tr>
+<th>total </th><td>INTEGER. Respective counts for the terms  </td></tr>
+</table>
+</dd>
+<dt>train_feature_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the training feature table to be created. 
The table will have the following columns: </p><table  class="output">
+<tr>
+<th>doc_id </th><td>INTEGER. Document id  </td></tr>
+<tr>
+<th>f_size </th><td>INTEGER. Feature set size. This value will be same for all 
the tuples in the table  </td></tr>
+<tr>
+<th>sparse_r </th><td>DOUBLE PRECISION[]. Array union of individual single 
state features (previous label, label, feature index, start position, training 
existence indicator), ordered by their start position.  </td></tr>
+<tr>
+<th>dense_m </th><td>DOUBLE PRECISION[]. Array union of (previous label, 
label, feature index, start position, training existence indicator) of edge 
features ordered by start position.  </td></tr>
+<tr>
+<th>sparse_m </th><td>DOUBLE PRECISION[]. Array union of (feature index, 
previous label, label) of edge features ordered by feature index.  </td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>train_featureset_tbl </dt>
+<dd>TEXT. Name of the table to be created containing distinct featuresets 
generated from training feature extraction. The table will have the following 
columns: <table  class="output">
+<tr>
+<th>f_index </th><td>INTEGER. Column containing distinct featureset ids  
</td></tr>
+<tr>
+<th>f_name </th><td>TEXT. Feature name   </td></tr>
+<tr>
+<th>feature </th><td>ARRAY. Feature value. The value is of the form [L1, L2] 
<br />
+ - If L1 = -1: represents single state feature with L2 being the current label 
id. <br />
+ - If L1 != -1: represents transition feature with L1 being the previous label 
and L2 being the current label.    </td></tr>
+</table>
+</dd>
+</dl>
+</dd></dl>
+<p><a class="anchor" id="train"></a></p><dl class="section user"><dt>Linear 
Chain CRF Training Function</dt><dd>The function takes 
<code>train_feature_tbl</code> and <code>train_featureset_tbl</code> tables 
generated in the training feature generation steps as input along with other 
required parameters and produces two output tables <code>crf_stats_tbl</code> 
and <code>crf_weights_tbl</code>.</dd></dl>
+<pre class="syntax">
+lincrf_train(train_feature_tbl,
+             train_featureset_tbl,
+             label_tbl,
+             crf_stats_tbl,
+             crf_weights_tbl,
+             max_iterations
+            )
+</pre><p> <b>Arguments</b> </p><dl class="arglist">
+<dt>train_feature_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the feature table generated during 
training feature generation</p>
+<p class="enddd"></p>
+</dd>
+<dt>train_featureset_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the featureset table generated during 
training feature generation</p>
+<p class="enddd"></p>
+</dd>
+<dt>label_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the label table used</p>
+<p class="enddd"></p>
+</dd>
+<dt>crf_stats_tbl </dt>
+<dd>TEXT. Name of the table to be created containing statistics for CRF 
training. The table has the following columns: <table  class="output">
+<tr>
+<th>coef </th><td>DOUBLE PRECISION[]. Array of coefficients  </td></tr>
+<tr>
+<th>log_likelihood </th><td>DOUBLE. Log-likelihood  </td></tr>
+<tr>
+<th>num_iterations </th><td>INTEGER. The number of iterations at which the 
algorithm terminated  </td></tr>
+</table>
+</dd>
+<dt>crf_weights_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the table to be created containing learned 
feature weights. The table has the following columns: </p><table  
class="output">
+<tr>
+<th>id </th><td>INTEGER. Feature set id  </td></tr>
+<tr>
+<th>name </th><td>TEXT. Feature name  </td></tr>
+<tr>
+<th>prev_label_id </th><td>INTEGER. Label for the previous token encountered  
</td></tr>
+<tr>
+<th>label_id </th><td>INTEGER. Label of the token with the respective feature  
</td></tr>
+<tr>
+<th>weight </th><td>DOUBLE PRECISION. Weight for the respective feature set  
</td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>max_iterations </dt>
+<dd>INTEGER. The maximum number of iterations </dd>
+</dl>
+<p><a class="anchor" id="test_feature"></a></p><dl class="section 
user"><dt>Testing Feature Generation</dt><dd></dd></dl>
+<pre class="syntax">
+crf_test_fgen(test_segment_tbl,
+              dictionary_tbl,
+              label_tbl,
+              regex_tbl,
+              crf_weights_tbl,
+              viterbi_mtbl,
+              viterbi_rtbl
+             )
+</pre><p> <b>Arguments</b> </p><dl class="arglist">
+<dt>test_segment_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the testing segment table. The table is 
expected to have the following columns: </p><table  class="output">
+<tr>
+<th>doc_id </th><td>INTEGER. Document id column  </td></tr>
+<tr>
+<th>start_pos </th><td>INTEGER. Index of a particular term in the respective 
document  </td></tr>
+<tr>
+<th>seg_text </th><td>TEXT. Term at the respective <code>start_pos</code> in 
the document  </td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>dictionary_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the dictionary table created during 
training feature generation (<code>crf_train_fgen</code>)</p>
+<p class="enddd"></p>
+</dd>
+<dt>label_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the label table</p>
+<p class="enddd"></p>
+</dd>
+<dt>regex_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the regular expression table</p>
+<p class="enddd"></p>
+</dd>
+<dt>crf_weights_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the weights table generated during CRF 
training (<code>lincrf_train</code>)</p>
+<p class="enddd"></p>
+</dd>
+<dt>viterbi_mtbl </dt>
+<dd><p class="startdd">TEXT. Name of the Viterbi M table to be created</p>
+<p class="enddd"></p>
+</dd>
+<dt>viterbi_rtbl </dt>
+<dd>TEXT. Name of the Viterbi R table to be created </dd>
+</dl>
+<p><a class="anchor" id="inference"></a></p><dl class="section 
user"><dt>Inference using Viterbi</dt><dd><pre class="syntax">
+vcrf_label(test_segment_tbl,
+           viterbi_mtbl,
+           viterbi_rtbl,
+           label_tbl,
+           result_tbl)
+</pre> <b>Arguments</b> <dl class="arglist">
+<dt>test_segment_tbl </dt>
+<dd>TEXT. Name of the testing segment table. For required table schema, please 
refer to arguments in previous section </dd>
+<dt>viterbi_mtbl </dt>
+<dd>TEXT. Name of the table <code>viterbi_mtbl</code> generated from testing 
feature generation <code>crf_test_fgen</code>. </dd>
+<dt>viterbi_rtbl </dt>
+<dd>TEXT. Name of the table <code>viterbi_rtbl</code> generated from testing 
feature generation <code>crf_test_fgen</code>. </dd>
+<dt>label_tbl </dt>
+<dd>TEXT. Name of the label table. </dd>
+<dt>result_tbl </dt>
+<dd>TEXT. Name of the result table to be created containing extracted best 
label sequences. </dd>
+</dl>
+</dd></dl>
+<p><a class="anchor" id="usage"></a></p><dl class="section user"><dt>Using 
CRF</dt><dd></dd></dl>
+<p>Generate text features, calculate their weights, and output the best label 
sequence for test data:<br />
+</p><ol type="1">
+<li>Perform feature generation on training data i.e. 
<code>train_segment_tbl</code> generating <code>train_feature_tbl</code> and 
<code>train_featureset_tbl</code>. <pre>SELECT madlib.crf_train_fgen(
+         '<em>train_segment_tbl</em>',
+         '<em>regex_tbl</em>',
+         '<em>label_tbl</em>',
+         '<em>dictionary_tbl</em>',
+         '<em>train_feature_tbl</em>',
+         '<em>train_featureset_tbl</em>');</pre></li>
+<li>Use linear-chain CRF for training providing <code>train_feature_tbl</code> 
and <code>train_featureset_tbl</code> generated from previous step as an input. 
<pre>SELECT madlib.lincrf_train(
+         '<em>train_feature_tbl</em>',
+         '<em>train_featureset_tbl</em>',
+         '<em>label_tbl</em>',
+         '<em>crf_stats_tbl</em>',
+         '<em>crf_weights_tbl</em>',
+         <em>max_iterations</em>);</pre></li>
+<li>Perform feature generation on testing data <code>test_segment_tbl</code> 
generating <code>viterbi_mtbl</code> and <code>viterbi_rtbl</code> required for 
inferencing. <pre>SELECT madlib.crf_test_fgen(
+         '<em>test_segment_tbl</em>',
+         '<em>dictionary_tbl</em>',
+         '<em>label_tbl</em>',
+         '<em>regex_tbl</em>',
+         '<em>crf_weights_tbl</em>',
+         '<em>viterbi_mtbl</em>',
+         '<em>viterbi_rtbl</em>');</pre></li>
+<li>Run the Viterbi function to get the best label sequence and the 
conditional probability <img class="formulaInl" alt="$ \Pr( \text{best label 
sequence} \mid \text{sequence}) $" src="form_55.png"/>. <pre>SELECT 
madlib.vcrf_label(
+         '<em>test_segment_tbl</em>',
+         '<em>viterbi_mtbl</em>',
+         '<em>viterbi_rtbl</em>',
+         '<em>label_tbl</em>',
+         '<em>result_tbl</em>');</pre></li>
+</ol>
+<p><a class="anchor" id="examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd>This example uses a trivial training and test data 
set.</dd></dl>
+<ol type="1">
+<li>Load the label table, the regular expressions table, and the training 
segment table: <pre class="example">
+SELECT * FROM crf_label ORDER BY id;
+</pre> Result: <pre class="result">
+ id | label
+&#160;---+-------
+  0 | #
+  1 | $
+  2 | ''
+...
+  8 | CC
+  9 | CD
+ 10 | DT
+ 11 | EX
+ 12 | FW
+ 13 | IN
+ 14 | JJ
+...
+</pre> The regular expressions table: <pre class="example">
+SELECT * from crf_regex;
+</pre> <pre class="result">
+    pattern    |         name
+&#160;--------------+----------------------
+ ^.+ing$       | endsWithIng
+ ^[A-Z][a-z]+$ | InitCapital
+ ^[A-Z]+$      | isAllCapital
+ ^.*[0-9]+.*$  | containsDigit
+...
+</pre> The training segment table: <pre class="example">
+SELECT * from train_segmenttbl ORDER BY doc_id, start_pos;
+</pre> <pre class="result">
+ doc_id | start_pos |  seg_text  | label
+&#160;-------+-----------+------------+-------
+      0 |         0 | Confidence |    18
+      0 |         1 | in         |    13
+      0 |         2 | the        |    10
+      0 |         3 | pound      |    18
+      0 |         4 | is         |    38
+      0 |         5 | widely     |    26
+...
+      1 |         0 | Chancellor |    19
+      1 |         1 | of         |    13
+      1 |         2 | the        |    10
+      1 |         3 | Exchequer  |    19
+      1 |         4 | Nigel      |    19
+...
+</pre></li>
+<li>Generate the training features: <pre class="example">
+SELECT crf_train_fgen( 'train_segmenttbl',
+                       'crf_regex',
+                       'crf_label',
+                       'crf_dictionary',
+                       'train_featuretbl',
+                       'train_featureset'
+                     );
+SELECT * from crf_dictionary;
+</pre> Result: <pre class="result">
+     token       | total
+&#160;----------------+-------
+ Hawthorne       |     1
+ Mercedes-Benzes |     1
+ Wolf            |     3
+ best-known      |     1
+ hairline        |     1
+ accepting       |     2
+ purchases       |    14
+ trash           |     5
+ co-venture      |     1
+ restaurants     |     7
+...
+</pre> <pre class="example">
+SELECT * from train_featuretbl;
+</pre> Result: <pre class="result">
+ doc_id | f_size |            sparse_r           |             dense_m             |       sparse_m
+&#160;-------+--------+-------------------------------+---------------------------------+-----------------------
+      2 |     87 | {-1,13,12,0,1,-1,13,9,0,1,..} | {13,31,79,1,1,31,29,70,2,1,...} | {51,26,2,69,29,17,...}
+      1 |     87 | {-1,13,0,0,1,-1,13,9,0,1,...} | {13,0,62,1,1,0,13,54,2,1,13,..} | {51,26,2,69,29,17,...}
+</pre> <pre class="example">
+SELECT * from train_featureset;
+</pre> <pre class="result">
+ f_index |    f_name     | feature
+&#160;--------+---------------+---------
+       1 | R_endsWithED  | {-1,29}
+      13 | W_outweigh    | {-1,26}
+      29 | U             | {-1,5}
+      31 | U             | {-1,29}
+      33 | U             | {-1,12}
+      35 | W_a           | {-1,2}
+      37 | W_possible    | {-1,6}
+      15 | W_signaled    | {-1,29}
+      17 | End.          | {-1,43}
+      49 | W_'s          | {-1,16}
+      63 | W_acquire     | {-1,26}
+      51 | E.            | {26,2}
+      69 | E.            | {29,17}
+      71 | E.            | {2,11}
+      83 | W_the         | {-1,2}
+      85 | E.            | {16,11}
+       4 | W_return      | {-1,11}
+...
+</pre></li>
+<li>Train using linear CRF: <pre class="example">
+SELECT lincrf_train( 'train_featuretbl',
+                     'train_featureset',
+                     'crf_label',
+                     'crf_stats_tbl',
+                     'crf_weights_tbl',
+                     20
+             );
+</pre> <pre class="result">
+                                lincrf_train
+&#160;-----------------------------------------------------------------------------------
+ CRF Train successful. Results stored in the specified CRF stats and weights table
+ lincrf
+</pre> View the feature weight table. <pre class="example">
+SELECT * from crf_weights_tbl;
+</pre> Result: <pre class="result">
+ id |     name      | prev_label_id | label_id |      weight
+&#160;---+---------------+---------------+----------+-------------------
+  1 | R_endsWithED  |            -1 |       29 |  1.54128249293937
+ 13 | W_outweigh    |            -1 |       26 |  1.70691232223653
+ 29 | U             |            -1 |        5 |  1.40708515869008
+ 31 | U             |            -1 |       29 | 0.830356200936407
+ 33 | U             |            -1 |       12 | 0.769587378281239
+ 35 | W_a           |            -1 |        2 |  2.68470625883726
+ 37 | W_possible    |            -1 |        6 |  3.41773107604468
+ 15 | W_signaled    |            -1 |       29 |  1.68187039165771
+ 17 | End.          |            -1 |       43 |  3.07687845517082
+ 49 | W_'s          |            -1 |       16 |  2.61430312229883
+ 63 | W_acquire     |            -1 |       26 |  1.67247047385797
+ 51 | E.            |            26 |        2 |   3.0114240119435
+ 69 | E.            |            29 |       17 |  2.82385531733866
+ 71 | E.            |             2 |       11 |  3.00970493772732
+ 83 | W_the         |            -1 |        2 |  2.58742315259326
+...
+</pre></li>
+<li>To find the best labels for a test set using the trained linear CRF model, 
repeat steps #1-2 and generate the test features, except instead of creating a 
new dictionary, use the dictionary generated from the training set. <pre 
class="example">
+SELECT * from test_segmenttbl ORDER BY doc_id, start_pos;
+</pre> Result: <pre class="result">
+ doc_id | start_pos |   seg_text
+&#160;-------+-----------+---------------
+      0 |         0 | Rockwell
+      0 |         1 | International
+      0 |         2 | Corp.
+      0 |         3 | 's
+      0 |         4 | Tulsa
+      0 |         5 | unit
+      0 |         6 | said
+...
+      1 |         0 | Rockwell
+      1 |         1 | said
+      1 |         2 | the
+      1 |         3 | agreement
+      1 |         4 | calls
+...
+</pre> <pre class="example">
+SELECT crf_test_fgen( 'test_segmenttbl',
+                      'crf_dictionary',
+                      'crf_label',
+                      'crf_regex',
+                      'crf_weights_tbl',
+                      'viterbi_mtbl',
+                      'viterbi_rtbl'
+                    );
+</pre></li>
+<li>Calculate the best label sequence and save in the table 
<code>extracted_best_labels</code>. <pre class="example">
+SELECT vcrf_label( 'test_segmenttbl',
+                   'viterbi_mtbl',
+                   'viterbi_rtbl',
+                   'crf_label',
+                   'extracted_best_labels'
+                 );
+</pre> View the best labels. <pre class="example">
+SELECT * FROM extracted_best_labels;
+</pre> Result: <pre class="result">
+ doc_id | start_pos |   seg_text    | label | id | max_pos |   prob
+&#160;-------+-----------+---------------+-------+----+---------+----------
+      0 |         0 | Rockwell      | NNP   | 19 |      27 | 0.000269
+      0 |         1 | International | NNP   | 19 |      27 | 0.000269
+      0 |         2 | Corp.         | NNP   | 19 |      27 | 0.000269
+      0 |         3 | 's            | NNP   | 19 |      27 | 0.000269
+...
+      1 |         0 | Rockwell      | NNP   | 19 |      16 | 0.000168
+      1 |         1 | said          | NNP   | 19 |      16 | 0.000168
+      1 |         2 | the           | DT    | 10 |      16 | 0.000168
+      1 |         3 | agreement     | JJ    | 14 |      16 | 0.000168
+      1 |         4 | calls         | NNS   | 21 |      16 | 0.000168
+...
+</pre></li>
+</ol>
+<p><a class="anchor" id="background"></a></p><dl class="section 
user"><dt>Technical Background</dt><dd></dd></dl>
+<p>Specifically, a linear-chain CRF is a distribution defined by </p><p 
class="formulaDsp">
+<img class="formulaDsp" alt="\[ p_\lambda(\boldsymbol y | \boldsymbol x) = 
\frac{\exp{\sum_{m=1}^M \lambda_m F_m(\boldsymbol x, \boldsymbol 
y)}}{Z_\lambda(\boldsymbol x)} \,. \]" src="form_56.png"/>
+</p>
+<p>where</p><ul>
+<li><img class="formulaInl" alt="$ F_m(\boldsymbol x, \boldsymbol y) = 
\sum_{i=1}^n f_m(y_i,y_{i-1},x_i) $" src="form_57.png"/> is a global feature 
function that is a sum along a sequence <img class="formulaInl" alt="$ 
\boldsymbol x $" src="form_58.png"/> of length <img class="formulaInl" alt="$ n 
$" src="form_10.png"/></li>
+<li><img class="formulaInl" alt="$ f_m(y_i,y_{i-1},x_i) $" src="form_59.png"/> 
is a local feature function dependent on the current token label <img 
class="formulaInl" alt="$ y_i $" src="form_60.png"/>, the previous token label 
<img class="formulaInl" alt="$ y_{i-1} $" src="form_61.png"/>, and the 
observation <img class="formulaInl" alt="$ x_i $" src="form_62.png"/></li>
+<li><img class="formulaInl" alt="$ \lambda_m $" src="form_63.png"/> is the 
corresponding feature weight</li>
+<li><img class="formulaInl" alt="$ Z_\lambda(\boldsymbol x) $" 
src="form_64.png"/> is an instance-specific normalizer <p class="formulaDsp">
+<img class="formulaDsp" alt="\[ Z_\lambda(\boldsymbol x) = \sum_{\boldsymbol 
y'} \exp{\sum_{m=1}^M \lambda_m F_m(\boldsymbol x, \boldsymbol y')} \]" 
src="form_65.png"/>
+</p>
+</li>
+</ul>
+<p>A linear-chain CRF estimates the weights <img class="formulaInl" alt="$ 
\lambda_m $" src="form_63.png"/> by maximizing the log-likelihood of a given 
training set <img class="formulaInl" alt="$ T=\{(x_k,y_k)\}_{k=1}^N $" 
src="form_66.png"/>.</p>
+<p>The log-likelihood is defined as </p><p class="formulaDsp">
+<img class="formulaDsp" alt="\[ \ell_{\lambda}=\sum_k \log p_\lambda(y_k|x_k) 
=\sum_k[\sum_{m=1}^M \lambda_m F_m(x_k,y_k) - \log Z_\lambda(x_k)] \]" 
src="form_67.png"/>
+</p>
+<p>and the zero of its gradient </p><p class="formulaDsp">
+<img class="formulaDsp" alt="\[ \nabla 
\ell_{\lambda}=\sum_k[F(x_k,y_k)-E_{p_\lambda(Y|x_k)}[F(x_k,Y)]] \]" 
src="form_68.png"/>
+</p>
+<p>is found since the maximum likelihood is reached when the empirical average 
of the global feature vector equals its model expectation. The MADlib 
implementation uses limited-memory BFGS (L-BFGS), a limited-memory variation of 
the Broyden–Fletcher–Goldfarb–Shanno (BFGS) update, a quasi-Newton method 
for unconstrained optimization.</p>
+<p><img class="formulaInl" alt="$E_{p_\lambda(Y|x)}[F(x,Y)]$" 
src="form_69.png"/> is found by using a variant of the forward-backward 
algorithm: </p><p class="formulaDsp">
+<img class="formulaDsp" alt="\[ E_{p_\lambda(Y|x)}[F(x,Y)] = \sum_y 
p_\lambda(y|x)F(x,y) = 
\sum_i\frac{\alpha_{i-1}(f_i*M_i)\beta_i^T}{Z_\lambda(x)} \]" 
src="form_70.png"/>
+</p>
+ <p class="formulaDsp">
+<img class="formulaDsp" alt="\[ Z_\lambda(x) = \alpha_n.1^T \]" 
src="form_71.png"/>
+</p>
+<p> where <img class="formulaInl" alt="$\alpha_i$" src="form_72.png"/> and 
<img class="formulaInl" alt="$ \beta_i$" src="form_73.png"/> are the forward 
and backward state cost vectors defined by </p><p class="formulaDsp">
+<img class="formulaDsp" alt="\[ \alpha_i = \begin{cases} \alpha_{i-1}M_i, 
&amp; 0&lt;i\leq n\\ 1, &amp; i=0 \end{cases} \]" src="form_74.png"/>
+</p>
+ <p class="formulaDsp">
+<img class="formulaDsp" alt="\[ \beta_i^T = \begin{cases} 
M_{i+1}\beta_{i+1}^T, &amp; 1\leq i&lt;n\\ 1, &amp; i=n \end{cases} \]" 
src="form_75.png"/>
+</p>
+<p>To avoid overfitting, we penalize the likelihood with a spherical Gaussian 
weight prior: </p><p class="formulaDsp">
+<img class="formulaDsp" alt="\[ \ell_{\lambda}^\prime=\sum_k[\sum_{m=1}^M 
\lambda_m F_m(x_k,y_k) - \log Z_\lambda(x_k)] - \frac{\lVert \lambda 
\rVert^2}{2\sigma ^2} \]" src="form_76.png"/>
+</p>
+<p class="formulaDsp">
+<img class="formulaDsp" alt="\[ \nabla \ell_{\lambda}^\prime=\sum_k[F(x_k,y_k) 
- E_{p_\lambda(Y|x_k)}[F(x_k,Y)]] - \frac{\lambda}{\sigma ^2} \]" 
src="form_77.png"/>
+</p>
+<dl class="section user"><dt>Literature</dt><dd>[1] F. Sha, F. Pereira. 
Shallow Parsing with Conditional Random Fields, <a 
href="http://www-bcf.usc.edu/~feisha/pubs/shallow03.pdf">http://www-bcf.usc.edu/~feisha/pubs/shallow03.pdf</a></dd></dl>
+<p>[2] Wikipedia, Conditional Random Field, <a 
href="http://en.wikipedia.org/wiki/Conditional_random_field">http://en.wikipedia.org/wiki/Conditional_random_field</a></p>
+<p>[3] A. Jaiswal, S. Tawari, I. Mansuri, K. Mittal, C. Tiwari (2012), CRF, <a 
href="http://crf.sourceforge.net/">http://crf.sourceforge.net/</a></p>
+<p>[4] D. Wang, ViterbiCRF, <a 
href="http://www.cs.berkeley.edu/~daisyw/ViterbiCRF.html">http://www.cs.berkeley.edu/~daisyw/ViterbiCRF.html</a></p>
+<p>[5] Wikipedia, Viterbi Algorithm, <a 
href="http://en.wikipedia.org/wiki/Viterbi_algorithm">http://en.wikipedia.org/wiki/Viterbi_algorithm</a></p>
+<p>[6] J. Nocedal. Updating Quasi-Newton Matrices with Limited Storage (1980), 
Mathematics of Computation 35, pp. 773-782</p>
+<p>[7] J. Nocedal, Software for Large-scale Unconstrained Optimization, <a 
href="http://users.eecs.northwestern.edu/~nocedal/lbfgs.html">http://users.eecs.northwestern.edu/~nocedal/lbfgs.html</a></p>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
Topics</dt><dd></dd></dl>
+<p>Files <a class="el" href="crf_8sql__in.html" title="SQL functions for 
conditional random field. ">crf.sql_in</a>, <a class="el" 
href="crf__feature__gen_8sql__in.html" title="SQL function for POS/NER feature 
extraction. ">crf_feature_gen.sql_in</a>, and <a class="el" 
href="viterbi_8sql__in.html" title="concatenate a set of input values into 
arrays to feed into viterbi c function and create a human 
read...">viterbi.sql_in</a> (documenting the SQL functions) </p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Tue Sep 20 2016 11:27:01 for MADlib by
+    <a href="http://www.doxygen.org/index.html">
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.10 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/bed9253d/docs/v1.9.1/group__grp__data__prep.html
----------------------------------------------------------------------
diff --git a/docs/v1.9.1/group__grp__data__prep.html 
b/docs/v1.9.1/group__grp__data__prep.html
new file mode 100644
index 0000000..a9c02e6
--- /dev/null
+++ b/docs/v1.9.1/group__grp__data__prep.html
@@ -0,0 +1,244 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.10"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Encoding Categorical Variables</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+  $(window).load(resizeHeight);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.net');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.net"><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.9.1</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.10 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__data__prep.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Encoding Categorical Variables<div class="ingroups"><a 
class="el" href="group__grp__datatrans.html">Data Types and 
Transformations</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> <ul>
+<li>
+<a href="#categorical">Coding systems for categorical variables</a> </li>
+<li>
+<a href="#examples">Examples</a> </li>
+</ul>
+</div><p><a class="anchor" id="categorical"></a></p><dl class="section 
user"><dt>Coding systems for categorical variables</dt><dd>Categorical 
variables require special attention in regression analysis because, unlike 
dichotomous or continuous variables, they cannot be entered into the regression 
equation just as they are. For example, if you have a variable called race that 
is coded 1 = Hispanic, 2 = Asian, 3 = Black, 4 = White, then entering race in 
your regression will look at the linear effect of race, which is probably not 
what you intended. Instead, categorical variables like this need to be recoded 
into a series of indicator variables which can then be entered into the 
regression model. There are a variety of coding systems (also called 
contrasts) that can be used when coding categorical variables, including dummy, 
effects, orthogonal, and Helmert coding.</dd></dl>
+<p>We currently only support the dummy coding technique. Dummy coding is used 
when a researcher wants to compare other groups of the predictor variable with 
one specific group of the predictor variable. Often, the specific group to 
compare with is called the reference group.</p>
+<pre class="syntax">
+create_indicator_variables(
+    source_table,
+    output_table,
+    categorical_cols,
+    keep_null,
+    distributed_by
+    )
+</pre><p> <b>Arguments</b> </p><dl class="arglist">
+<dt>source_table </dt>
+<dd>VARCHAR. Name of the source table, containing data for categorical 
variables. </dd>
+<dt>output_table </dt>
+<dd>VARCHAR. Name of result table. The output table has the same columns as 
the original table, adding new indicator variable columns for each categorical 
column. The column name for the indicator variable is <em>'categorical column 
name'</em>_<em>'categorical value'</em>.  </dd>
+<dt>categorical_cols  </dt>
+<dd>VARCHAR. Comma-separated string of column names of categorical variables 
that need to be dummy-coded. </dd>
+<dt>keep_null (optional) </dt>
+<dd>BOOLEAN. default: FALSE. Whether 'NULL' should be treated as one of the 
categories of the categorical variable. If True, then an indicator variable is 
created corresponding to the NULL value. If False, then all indicator variables 
for that record will be set to NULL.  </dd>
+<dt>distributed_by (optional) </dt>
+<dd>VARCHAR. default: NULL. Columns to use for the distribution policy of the 
output table. When NULL, the distribution policy of 'source_table' will be 
used. This argument is not available for POSTGRESQL platforms. </dd>
+</dl>
+<p><a class="anchor" id="examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd></dd></dl>
+<ol type="1">
+<li>Use a subset of the abalone dataset. <pre class="example">
+DROP TABLE IF EXISTS abalone;
+CREATE TABLE abalone (
+    sex character varying,
+    length double precision,
+    diameter double precision,
+    height double precision
+);
+COPY abalone (sex, length, diameter, height) FROM stdin WITH DELIMITER '|' 
NULL as '@';
+M| 0.455 |   0.365 | 0.095
+F| 0.53  |   0.42  | 0.135
+M| 0.35  |   0.265 | 0.09
+F| 0.53  |   0.415 | 0.15
+M| 0.44  |   0.365 | 0.125
+F| 0.545 |   0.425 | 0.125
+I| 0.33  |   0.255 | 0.08
+F| 0.55  |   0.44  | 0.15
+I| 0.425 |   0.30  | 0.095
+F| 0.525 |   0.38  | 0.140
+M| 0.475 |   0.37  | 0.125
+F| 0.535 |   0.405 | 0.145
+M| 0.43  |   0.358 | 0.11
+F| 0.47  |   0.355 | 0.100
+M| 0.49  |   0.38  | 0.135
+F| 0.44  |   0.340 | 0.100
+M| 0.5   |   0.400 | 0.13
+F| 0.565 |   0.44  | 0.155
+I| 0.355 |   0.280 | 0.085
+F| 0.550 |   0.415 | 0.135
+| 0.475 |   0.37  | 0.125
+\.
+</pre></li>
+<li>Create new table with dummy-coded indicator variables <pre class="example">
+drop table if exists abalone_out;
+select madlib.create_indicator_variables ('abalone', 'abalone_out', 'sex');
+select * from abalone_out;
+</pre> <pre class="result">
+ sex  | length | diameter | height | sex_F  | sex_I  | sex_M
+&#160; -----+--------+----------+--------+--------+--------+-------
+ F    |   0.53 |     0.42 |  0.135 |      1 |      0 |     0
+ F    |   0.53 |    0.415 |   0.15 |      1 |      0 |     0
+ F    |  0.545 |    0.425 |  0.125 |      1 |      0 |     0
+ F    |   0.55 |     0.44 |   0.15 |      1 |      0 |     0
+ F    |  0.525 |     0.38 |   0.14 |      1 |      0 |     0
+ F    |  0.535 |    0.405 |  0.145 |      1 |      0 |     0
+ F    |   0.47 |    0.355 |    0.1 |      1 |      0 |     0
+ F    |   0.44 |     0.34 |    0.1 |      1 |      0 |     0
+ F    |  0.565 |     0.44 |  0.155 |      1 |      0 |     0
+ F    |   0.55 |    0.415 |  0.135 |      1 |      0 |     0
+ M    |  0.455 |    0.365 |  0.095 |      0 |      0 |     1
+ M    |   0.35 |    0.265 |   0.09 |      0 |      0 |     1
+ M    |   0.44 |    0.365 |  0.125 |      0 |      0 |     1
+ I    |   0.33 |    0.255 |   0.08 |      0 |      1 |     0
+ I    |  0.425 |      0.3 |  0.095 |      0 |      1 |     0
+ M    |  0.475 |     0.37 |  0.125 |      0 |      0 |     1
+ M    |   0.43 |    0.358 |   0.11 |      0 |      0 |     1
+ M    |   0.49 |     0.38 |  0.135 |      0 |      0 |     1
+ M    |    0.5 |      0.4 |   0.13 |      0 |      0 |     1
+ I    |  0.355 |     0.28 |  0.085 |      0 |      1 |     0
+ NULL |   0.55 |    0.415 |  0.135 |   NULL |   NULL |  NULL
+</pre></li>
+<li>Create indicator variable for 'NULL' value (note the additional column 
'"sex_NULL"') <pre class="example">
+drop table if exists abalone_out;
+select madlib.create_indicator_variables('abalone', 'abalone_out', 'sex', True);
+select * from abalone_out;
+</pre> <pre class="result">
+ sex  | length | diameter | height | sex_F  | sex_I  | sex_M | sex_NULL
+&#160; -----+--------+----------+--------+--------+--------+-------+----------
+ F    |   0.53 |     0.42 |  0.135 |      1 |      0 |     0 |     0
+ F    |   0.53 |    0.415 |   0.15 |      1 |      0 |     0 |     0
+ F    |  0.545 |    0.425 |  0.125 |      1 |      0 |     0 |     0
+ F    |   0.55 |     0.44 |   0.15 |      1 |      0 |     0 |     0
+ F    |  0.525 |     0.38 |   0.14 |      1 |      0 |     0 |     0
+ F    |  0.535 |    0.405 |  0.145 |      1 |      0 |     0 |     0
+ F    |   0.47 |    0.355 |    0.1 |      1 |      0 |     0 |     0
+ F    |   0.44 |     0.34 |    0.1 |      1 |      0 |     0 |     0
+ F    |  0.565 |     0.44 |  0.155 |      1 |      0 |     0 |     0
+ F    |   0.55 |    0.415 |  0.135 |      1 |      0 |     0 |     0
+ M    |  0.455 |    0.365 |  0.095 |      0 |      0 |     1 |     0
+ M    |   0.35 |    0.265 |   0.09 |      0 |      0 |     1 |     0
+ M    |   0.44 |    0.365 |  0.125 |      0 |      0 |     1 |     0
+ I    |   0.33 |    0.255 |   0.08 |      0 |      1 |     0 |     0
+ I    |  0.425 |      0.3 |  0.095 |      0 |      1 |     0 |     0
+ M    |  0.475 |     0.37 |  0.125 |      0 |      0 |     1 |     0
+ M    |   0.43 |    0.358 |   0.11 |      0 |      0 |     1 |     0
+ M    |   0.49 |     0.38 |  0.135 |      0 |      0 |     1 |     0
+ M    |    0.5 |      0.4 |   0.13 |      0 |      0 |     1 |     0
+ I    |  0.355 |     0.28 |  0.085 |      0 |      1 |     0 |     0
+ NULL |   0.55 |    0.415 |  0.135 |      0 |      0 |     0 |     1
+</pre> </li>
+</ol>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Tue Sep 20 2016 11:27:01 for MADlib by
+    <a href="http://www.doxygen.org/index.html">
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.10 </li>
+  </ul>
+</div>
+</body>
+</html>


Reply via email to