http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/bed9253d/docs/v1.9.1/group__grp__association__rules.js
----------------------------------------------------------------------
diff --git a/docs/v1.9.1/group__grp__association__rules.js 
b/docs/v1.9.1/group__grp__association__rules.js
new file mode 100644
index 0000000..e10c849
--- /dev/null
+++ b/docs/v1.9.1/group__grp__association__rules.js
@@ -0,0 +1,4 @@
+var group__grp__association__rules =
+[
+    [ "Apriori Algorithm", "group__grp__assoc__rules.html", null ]
+];
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/bed9253d/docs/v1.9.1/group__grp__bayes.html
----------------------------------------------------------------------
diff --git a/docs/v1.9.1/group__grp__bayes.html 
b/docs/v1.9.1/group__grp__bayes.html
new file mode 100644
index 0000000..f86f8f0
--- /dev/null
+++ b/docs/v1.9.1/group__grp__bayes.html
@@ -0,0 +1,483 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.10"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Naive Bayes Classification</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+  $(window).load(resizeHeight);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.net');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.net";><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.9.1</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.10 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__bayes.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Naive Bayes Classification<div class="ingroups"><a 
class="el" href="group__grp__early__stage.html">Early Stage 
Development</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> </p><ul>
+<li>
+<a href="#train">Training Function(s)</a> </li>
+<li>
+<a href="#classify">Classify Function(s)</a> </li>
+<li>
+<a href="#probabilities">Probabilities Function(s)</a> </li>
+<li>
+<a href="#adhoc">Ad Hoc Computation</a> </li>
+<li>
+<a href="#notes">Implementation Notes</a> </li>
+<li>
+<a href="#examples">Examples</a> </li>
+<li>
+<a href="#background">Technical Background</a> </li>
+<li>
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><dl class="section warning"><dt>Warning</dt><dd><em> This MADlib method 
is still in early stage development. There may be some issues that will be 
addressed in a future version. Interface and implementation are subject to 
change. </em></dd></dl>
+<p>Naive Bayes refers to a stochastic model where all independent variables 
<img class="formulaInl" alt="$ a_1, \dots, a_n $" src="form_18.png"/> (often 
referred to as attributes in this context) independently contribute to the 
probability that a data point belongs to a certain class <img 
class="formulaInl" alt="$ c $" src="form_19.png"/>.</p>
+<p>Naive Bayes classification estimates feature probabilities and class 
priors using maximum likelihood or Laplacian smoothing. For numeric attributes, 
Gaussian smoothing can be used to estimate the feature probabilities. These 
parameters are then used to classify new data.</p>
+<p><a class="anchor" id="train"></a></p><dl class="section user"><dt>Training 
Function(s)</dt><dd></dd></dl>
+<p>For data with only categorical attributes, precompute feature probabilities 
and class priors using the following function:</p>
+<pre class="syntax">
+create_nb_prepared_data_tables ( trainingSource,
+                                 trainingClassColumn,
+                                 trainingAttrColumn,
+                                 numAttrs,
+                                 featureProbsName,
+                                 classPriorsName
+                               )
+</pre><p>For data containing both categorical and numeric attributes, use the 
following form to precompute the Gaussian parameters (mean and variance) for 
numeric attributes alongside the feature probabilities for categorical 
attributes and class priors.</p>
+<pre class="syntax">
+create_nb_prepared_data_tables ( trainingSource,
+                                 trainingClassColumn,
+                                 trainingAttrColumn,
+                                 numericAttrsColumnIndices,
+                                 numAttrs,
+                                 featureProbsName,
+                                 numericAttrParamsName,
+                                 classPriorsName
+                               )
+</pre><p>The <em>trainingSource</em> is expected to be of the following form: 
</p><pre>{TABLE|VIEW} <em>trainingSource</em> (
+    ...
+    <em>trainingClassColumn</em> INTEGER,
+    <em>trainingAttrColumn</em> INTEGER[] OR NUMERIC[] OR FLOAT8[],
+    ...
+)</pre><p><em>numericAttrsColumnIndices</em> should be of type TEXT, specified 
as an array of indices (starting from 1) in the <em>trainingAttrColumn</em> 
attributes-array that correspond to numeric attributes.</p>
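+<p>For illustration, a minimal sketch of a table matching this form (using the column layout of the <em>training</em> table in the example below; any additional columns are allowed): </p><pre class="example">
+CREATE TABLE training (
+    id         SERIAL,      -- extra columns such as a key are permitted
+    class      INTEGER,     -- trainingClassColumn
+    attributes INTEGER[]    -- trainingAttrColumn
+);
+</pre>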
+<p>The two output tables are:</p><ul>
+<li><em>featureProbsName</em> &ndash; stores feature probabilities</li>
+<li><em>classPriorsName</em> &ndash; stores the class priors</li>
+</ul>
+<p>In addition to the above, if the function specifying numeric attributes is 
used, an additional table <em>numericAttrParamsName</em> is created which 
stores the Gaussian parameters for the numeric attributes.</p>
+<p><a class="anchor" id="classify"></a></p><dl class="section 
user"><dt>Classify Function(s)</dt><dd></dd></dl>
+<p>Perform Naive Bayes classification: </p><pre class="syntax">
+create_nb_classify_view ( featureProbsName,
+                          classPriorsName,
+                          classifySource,
+                          classifyKeyColumn,
+                          classifyAttrColumn,
+                          numAttrs,
+                          destName
+                        )
+</pre><p>For data with numeric attributes, use the following version:</p>
+<pre class="syntax">
+create_nb_classify_view ( featureProbsName,
+                          classPriorsName,
+                          classifySource,
+                          classifyKeyColumn,
+                          classifyAttrColumn,
+                          numAttrs,
+                          numericAttrParamsName,
+                          destName
+                        )
+</pre><p>The <b>data to classify</b> is expected to be of the following form: 
</p><pre>{TABLE|VIEW} <em>classifySource</em> (
+    ...
+    <em>classifyKeyColumn</em> ANYTYPE,
+    <em>classifyAttrColumn</em> INTEGER[],
+    ...
+)</pre><p>This function creates the view <code><em>destName</em></code> 
mapping <em>classifyKeyColumn</em> to the Naive Bayes classification. </p><pre 
class="result">
+key | nb_classification
+&#160;---+------------------
+...
+</pre><p><a class="anchor" id="probabilities"></a></p><dl class="section 
user"><dt>Probabilities Function(s)</dt><dd></dd></dl>
+<p>Compute Naive Bayes probabilities. </p><pre class="syntax">
+create_nb_probs_view( featureProbsName,
+                      classPriorsName,
+                      classifySource,
+                      classifyKeyColumn,
+                      classifyAttrColumn,
+                      numAttrs,
+                      destName
+                    )
+</pre><p>For data with numeric attributes, use the following version:</p>
+<pre class="syntax">
+create_nb_probs_view( featureProbsName,
+                      classPriorsName,
+                      classifySource,
+                      classifyKeyColumn,
+                      classifyAttrColumn,
+                      numAttrs,
+                      numericAttrParamsName,
+                      destName
+                    )
+</pre><p>This creates the view <code><em>destName</em></code> mapping 
<em>classifyKeyColumn</em> and every single class to the Naive Bayes 
probability: </p><pre class="result">
+key | class | nb_prob
+&#160;---+-------+--------
+...
+</pre><p><a class="anchor" id="adhoc"></a></p><dl class="section user"><dt>Ad 
Hoc Computation Function</dt><dd></dd></dl>
+<p>With ad hoc execution (no precomputation), the functions <a class="el" 
href="bayes_8sql__in.html#a798402280fc6db710957ae3ab58767e0" title="Create a 
view with columns (key, nb_classification) ">create_nb_classify_view()</a> and 
<a class="el" href="bayes_8sql__in.html#a163afffd0c845d325f060f74bcf02243" 
title="Create view with columns (key, class, nb_prob) 
">create_nb_probs_view()</a> can be used without the 
precomputation step. In this case, replace the function arguments</p>
+<pre>'<em>featureProbsName</em>', '<em>classPriorsName</em>'</pre><p> with 
</p><pre>'<em>trainingSource</em>', '<em>trainingClassColumn</em>', 
'<em>trainingAttrColumn</em>'</pre><p> for data without any numeric 
attributes and with </p><pre>'<em>trainingSource</em>', 
'<em>trainingClassColumn</em>', '<em>trainingAttrColumn</em>', 
'<em>numericAttrsColumnIndices</em>'</pre><p> for data containing numeric 
attributes as well.</p>
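+<p>For example, following this substitution, the first classification example below could be run in a single step without the precomputation (the view name <em>nb_classify_view_adhoc</em> is arbitrary): </p><pre class="example">
+SELECT madlib.create_nb_classify_view( 'training',
+                                        'class',
+                                        'attributes',
+                                        'toclassify',
+                                        'id',
+                                        'attributes',
+                                        3,
+                                        'nb_classify_view_adhoc'
+                                      );
+</pre>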
+<p><a class="anchor" id="notes"></a></p><dl class="section 
user"><dt>Implementation Notes</dt><dd><ul>
+<li>The probabilities computed on PostgreSQL and on Greenplum Database can 
differ slightly due to the nature of floating-point computation. Usually this 
is not important. However, if a data point has <p 
class="formulaDsp">
+<img class="formulaDsp" alt="\[ P(C=c_i \mid A) \approx P(C=c_j \mid A) \]" 
src="form_20.png"/>
+</p>
+ for two classes, this data point might be classified into different classes on 
PostgreSQL and Greenplum. This leads to differences in classification on 
PostgreSQL and Greenplum for some data sets, but it should not affect the 
quality of the results.</li>
+<li>When two classes tie for the highest probability among all classes, the 
classification result is an array of these two classes, but the order of the 
two classes is random.</li>
+<li>The current implementation of Naive Bayes classification is suitable for 
discrete (categorical) attributes as well as continuous (numeric) 
attributes.<br />
+For continuous data, a typical assumption, usually used for small datasets, is 
that the continuous values associated with each class are distributed according 
to a Gaussian distribution, and the probabilities <img class="formulaInl" 
alt="$ P(A_i = a \mid C=c) $" src="form_21.png"/> are estimated using the 
Gaussian Distribution formula: <p class="formulaDsp">
+<img class="formulaDsp" alt="\[ P(A_i=a \mid C=c) = 
\frac{1}{\sqrt{2\pi\sigma^{2}_c}}exp\left(-\frac{(a-\mu_c)^{2}}{2\sigma^{2}_c}\right)
 \]" src="form_22.png"/>
+</p>
+ where <img class="formulaInl" alt="$\mu_c$" src="form_23.png"/> and <img 
class="formulaInl" alt="$\sigma^{2}_c$" src="form_24.png"/> are the population 
mean and variance of the attribute for the class <img class="formulaInl" 
alt="$c$" src="form_25.png"/> (a worked evaluation of this formula appears after these notes).<br />
+Another common technique for handling continuous values, which is better for 
large data sets, is to use binning to discretize the values, and convert the 
continuous data into categorical bins. This approach is currently not 
implemented.</li>
+<li>One can provide floating point data to the Naive Bayes classification 
function. If the corresponding attribute index is not specified in 
<em>numericAttrsColumnIndices</em>, floating point numbers will be used as 
symbolic substitutions for categorical data. In this case, the classification 
would work best if there are sufficient data points for each floating point 
attribute. However, if floating point numbers are used as continuous data 
without the attribute being marked as of type numeric in 
<em>numericAttrsColumnIndices</em>, no warning is raised and the result may not 
be as expected.</li>
+</ul>
+</dd></dl>
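+<p>As an illustration of the Gaussian estimate above, take the height attribute of class 1 from the second example below, where the estimated mean is 5.855 and the variance is approximately 0.035. For a height of 5.8, the estimate is exp(-(5.8-5.855)&sup2;/(2&middot;0.035)) / &radic;(2&pi;&middot;0.035) &asymp; 2.04. Since this is a probability density rather than a probability, values greater than one are possible.</p>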
+<p><a class="anchor" id="examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd></dd></dl>
+<p>The following is an extremely simplified example of the first 
(categorical-only) usage above, which can be verified by hand.</p>
+<ol type="1">
+<li>The training and the classification data. <pre class="example">
+SELECT * FROM training;
+</pre> Result: <pre class="result">
+ id | class | attributes
+&#160;---+-------+------------
+  1 |     1 | {1,2,3}
+  2 |     1 | {1,2,1}
+  3 |     1 | {1,4,3}
+  4 |     2 | {1,2,2}
+  5 |     2 | {0,2,2}
+  6 |     2 | {0,1,3}
+(6 rows)
+</pre> <pre class="example">
+SELECT * FROM toclassify;
+</pre> Result: <pre class="result">
+ id | attributes
+&#160;---+------------
+  1 | {0,2,1}
+  2 | {1,2,3}
+(2 rows)
+</pre></li>
+<li>Precompute feature probabilities and class priors. <pre class="example">
+SELECT madlib.create_nb_prepared_data_tables( 'training',
+                                              'class',
+                                              'attributes',
+                                              3,
+                                              'nb_feature_probs',
+                                              'nb_class_priors'
+                                            );
+</pre></li>
+<li>Optionally check the contents of the precomputed tables. <pre 
class="example">
+SELECT * FROM nb_class_priors;
+</pre> Result: <pre class="result">
+ class | class_cnt | all_cnt
+&#160;------+-----------+---------
+     1 |         3 |       6
+     2 |         3 |       6
+(2 rows)
+</pre> <pre class="example">
+SELECT * FROM nb_feature_probs;
+</pre> Result: <pre class="result">
+ class | attr | value | cnt | attr_cnt
+&#160;------+------+-------+-----+----------
+     1 |    1 |     0 |   0 |        2
+     1 |    1 |     1 |   3 |        2
+     1 |    2 |     1 |   0 |        3
+     1 |    2 |     2 |   2 |        3
+...
+</pre></li>
+<li>Create the view with Naive Bayes classification and check the results. 
<pre class="example">
+SELECT madlib.create_nb_classify_view( 'nb_feature_probs',
+                                       'nb_class_priors',
+                                       'toclassify',
+                                       'id',
+                                       'attributes',
+                                       3,
+                                       'nb_classify_view_fast'
+                                     );
+&#160;
+SELECT * FROM nb_classify_view_fast;
+</pre> Result: <pre class="result">
+ key | nb_classification
+&#160;----+-------------------
+   1 | {2}
+   2 | {1}
+(2 rows)
+</pre></li>
+<li>Look at the probabilities for each class (note that we use "Laplacian 
smoothing"); a hand check of these values follows this example. <pre class="example">
+SELECT madlib.create_nb_probs_view( 'nb_feature_probs',
+                                    'nb_class_priors',
+                                    'toclassify',
+                                    'id',
+                                    'attributes',
+                                    3,
+                                    'nb_probs_view_fast'
+                                  );
+&#160;
+SELECT * FROM nb_probs_view_fast;
+</pre> Result: <pre class="result">
+ key | class | nb_prob
+&#160;----+-------+---------
+   1 |     1 |     0.4
+   1 |     2 |     0.6
+   2 |     1 |    0.75
+   2 |     2 |    0.25
+(4 rows)
+</pre></li>
+</ol>
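+<p>These values can be reproduced by hand from the precomputed tables above (Laplace smoothing with s = 1). For the second point {1,2,3}: </p><pre>
+class 1:  3/6 * (3+1)/(3+2) * (2+1)/(3+3) * (2+1)/(3+3) = 0.1
+class 2:  3/6 * (1+1)/(3+2) * (2+1)/(3+3) * (1+1)/(3+3) = 0.0333...
+</pre><p>Normalizing gives 0.1 / (0.1 + 0.0333...) = 0.75 for class 1 and 0.25 for class 2, matching the nb_prob values shown for key 2.</p>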
+<p>The following is an example of using a dataset with both numeric and 
categorical attributes.</p>
+<ol type="1">
+<li>The training and the classification data. The attributes are 
{height (numeric), weight (numeric), shoe size (categorical)} and the class is 
sex (1=male, 2=female). <pre class="example">
+SELECT * FROM gaussian_data;
+</pre> Result: <pre class="result">
+ id | sex |  attributes   
+&#160;----+-----+---------------
+  1 |   1 | {6,180,12}
+  2 |   1 | {5.92,190,12}
+  3 |   1 | {5.58,170,11}
+  4 |   1 | {5.92,165,11}
+  5 |   2 | {5,100,6}
+  6 |   2 | {5.5,150,6}
+  7 |   2 | {5.42,130,7}
+  8 |   2 | {5.75,150,8}
+(8 rows)
+</pre> <pre class="example">
+SELECT * FROM gaussian_test;
+</pre> Result: <pre class="result">
+ id | sex |  attributes  
+----+-----+--------------
+  9 |   1 | {5.8,180,11}
+ 10 |   2 | {5,160,6}
+(2 rows)
+</pre></li>
+<li>Precompute feature probabilities and class priors. <pre class="example">
+SELECT madlib.create_nb_prepared_data_tables( 'gaussian_data',
+                                              'sex',
+                                              'attributes',
+                                              'ARRAY[1,2]',
+                                              3,
+                                              'categ_feature_probs',
+                                              'numeric_attr_params',
+                                              'class_priors'
+                                            );
+</pre></li>
+<li>Optionally check the contents of the precomputed tables. <pre 
class="example">
+SELECT * FROM class_priors;
+</pre> Result: <pre class="result">
+class | class_cnt | all_cnt 
+&#160;-------+-----------+---------
+     1 |         4 |       8
+     2 |         4 |       8
+(2 rows)
+</pre> <pre class="example">
+SELECT * FROM categ_feature_probs;
+</pre> Result: <pre class="result">
+ class | attr | value | cnt | attr_cnt 
+-------+------+-------+-----+----------
+     2 |    3 |     6 |   2 |        5
+     1 |    3 |    12 |   2 |        5
+     2 |    3 |     7 |   1 |        5
+     1 |    3 |    11 |   2 |        5
+     2 |    3 |     8 |   1 |        5
+     2 |    3 |    12 |   0 |        5
+     1 |    3 |     6 |   0 |        5
+     2 |    3 |    11 |   0 |        5
+     1 |    3 |     8 |   0 |        5
+     1 |    3 |     7 |   0 |        5
+(10 rows)
+</pre> <pre class="example">
+SELECT * FROM numeric_attr_params;
+</pre> Result: <pre class="result">
+class | attr |      attr_mean       |        attr_var        
+-------+------+----------------------+------------------------
+     1 |    1 |   5.8550000000000000 | 0.03503333333333333333
+     1 |    2 | 176.2500000000000000 |   122.9166666666666667
+     2 |    1 |   5.4175000000000000 | 0.09722500000000000000
+     2 |    2 | 132.5000000000000000 |   558.3333333333333333
+(4 rows)
+</pre></li>
+<li>Create the view with Naive Bayes classification and check the results. 
<pre class="example">
+SELECT madlib.create_nb_classify_view( 'categ_feature_probs',
+                                       'class_priors',
+                                       'gaussian_test',
+                                       'id',
+                                       'attributes',
+                                       3,
+                                       'numeric_attr_params',
+                                       'classify_view'
+                                     );
+&#160;
+SELECT * FROM classify_view;
+</pre> Result: <pre class="result">
+ key | nb_classification
+&#160;----+-------------------
+   9 | {1}
+  10 | {2}
+(2 rows)
+</pre></li>
+<li>Look at the probabilities for each class. <pre class="example">
+SELECT madlib.create_nb_probs_view( 'categ_feature_probs',
+                                       'class_priors',
+                                       'gaussian_test',
+                                       'id',
+                                       'attributes',
+                                       3,
+                                       'numeric_attr_params',
+                                       'probs_view'
+                                  );
+&#160;
+SELECT * FROM probs_view;
+</pre> Result: <pre class="result">
+ key | class |       nb_prob        
+-----+-------+----------------------
+   9 |     1 |    0.993556745948775
+   9 |     2 |  0.00644325405122553
+  10 |     1 | 5.74057538627122e-05
+  10 |     2 |    0.999942594246137
+(4 rows)
+</pre></li>
+</ol>
+<p><a class="anchor" id="background"></a></p><dl class="section 
user"><dt>Technical Background</dt><dd></dd></dl>
+<p>In detail, <b>Bayes'</b> theorem states that </p><p class="formulaDsp">
+<img class="formulaDsp" alt="\[ \Pr(C = c \mid A_1 = a_1, \dots, A_n = a_n) = 
\frac{\Pr(C = c) \cdot \Pr(A_1 = a_1, \dots, A_n = a_n \mid C = c)} {\Pr(A_1 = 
a_1, \dots, A_n = a_n)} \,, \]" src="form_26.png"/>
+</p>
+<p> and the <b>naive</b> assumption is that </p><p class="formulaDsp">
+<img class="formulaDsp" alt="\[ \Pr(A_1 = a_1, \dots, A_n = a_n \mid C = c) = 
\prod_{i=1}^n \Pr(A_i = a_i \mid C = c) \,. \]" src="form_27.png"/>
+</p>
+<p> Naive Bayes classification estimates feature probabilities and class 
priors using maximum likelihood or Laplacian smoothing. These parameters are 
then used to classify new data.</p>
+<p>A Naive Bayes classifier computes the following formula: </p><p 
class="formulaDsp">
+<img class="formulaDsp" alt="\[ \text{classify}(a_1, ..., a_n) = \arg\max_c 
\left\{ \Pr(C = c) \cdot \prod_{i=1}^n \Pr(A_i = a_i \mid C = c) \right\} \]" 
src="form_28.png"/>
+</p>
+<p> where <img class="formulaInl" alt="$ c $" src="form_19.png"/> ranges over 
all classes in the training data and probabilities are estimated with relative 
frequencies from the training set. There are different ways to estimate the 
feature probabilities <img class="formulaInl" alt="$ P(A_i = a \mid C = c) $" 
src="form_29.png"/>. The maximum likelihood estimate takes the relative 
frequencies. That is: </p><p class="formulaDsp">
+<img class="formulaDsp" alt="\[ P(A_i = a \mid C = c) = \frac{\#(c,i,a)}{\#c} 
\]" src="form_30.png"/>
+</p>
+<p> where</p><ul>
+<li><img class="formulaInl" alt="$ \#(c,i,a) $" src="form_31.png"/> denotes 
the # of training samples where attribute <img class="formulaInl" alt="$ i $" 
src="form_32.png"/> is <img class="formulaInl" alt="$ a $" src="form_33.png"/> 
and class is <img class="formulaInl" alt="$ c $" src="form_19.png"/></li>
+<li><img class="formulaInl" alt="$ \#c $" src="form_34.png"/> denotes the # of 
training samples where class is <img class="formulaInl" alt="$ c $" 
src="form_19.png"/>.</li>
+</ul>
+<p>Since the maximum likelihood sometimes results in estimates of "0", you 
might want to use a "smoothed" estimate. To do this, you add a number of 
"virtual" samples and make the assumption that these samples are evenly 
distributed among the values assumed by attribute <img class="formulaInl" 
alt="$ i $" src="form_32.png"/> (that is, the set of all values observed for 
attribute <img class="formulaInl" alt="$ i $" src="form_32.png"/> for any 
class):</p>
+<p class="formulaDsp">
+<img class="formulaDsp" alt="\[ P(A_i = a \mid C = c) = \frac{\#(c,i,a) + 
s}{\#c + s \cdot \#i} \]" src="form_35.png"/>
+</p>
+<p> where</p><ul>
+<li><img class="formulaInl" alt="$ \#i $" src="form_36.png"/> denotes the # of 
distinct values for attribute <img class="formulaInl" alt="$ i $" 
src="form_32.png"/> (for all classes)</li>
+<li><img class="formulaInl" alt="$ s \geq 0 $" src="form_37.png"/> denotes the 
smoothing factor.</li>
+</ul>
+<p>The case <img class="formulaInl" alt="$ s = 1 $" src="form_38.png"/> is 
known as "Laplace smoothing". The case <img class="formulaInl" alt="$ s = 0 $" 
src="form_39.png"/> trivially reduces to maximum-likelihood estimates.</p>
+<p><a class="anchor" id="literature"></a></p><dl class="section 
user"><dt>Literature</dt><dd></dd></dl>
+<p>[1] Tom Mitchell: Machine Learning, McGraw Hill, 1997. Book chapter 
<em>Generative and Discriminative Classifiers: Naive Bayes and Logistic 
Regression</em> available at: <a 
href="http://www.cs.cmu.edu/~tom/NewChapters.html";>http://www.cs.cmu.edu/~tom/NewChapters.html</a></p>
+<p>[2] Wikipedia, Naive Bayes classifier, <a 
href="http://en.wikipedia.org/wiki/Naive_Bayes_classifier";>http://en.wikipedia.org/wiki/Naive_Bayes_classifier</a></p>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
Topics</dt><dd>File <a class="el" href="bayes_8sql__in.html" title="SQL 
functions for naive Bayes. ">bayes.sql_in</a> documenting the SQL 
functions.</dd></dl>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Tue Sep 20 2016 11:27:01 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.10 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/bed9253d/docs/v1.9.1/group__grp__cg.html
----------------------------------------------------------------------
diff --git a/docs/v1.9.1/group__grp__cg.html b/docs/v1.9.1/group__grp__cg.html
new file mode 100644
index 0000000..ad239bc
--- /dev/null
+++ b/docs/v1.9.1/group__grp__cg.html
@@ -0,0 +1,180 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.10"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Conjugate Gradient</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+  $(window).load(resizeHeight);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.net');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.net";><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.9.1</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.10 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__cg.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Conjugate Gradient<div class="ingroups"><a class="el" 
href="group__grp__early__stage.html">Early Stage Development</a></div></div>  
</div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> </p><ul>
+<li>
+<a href="#syntax">Function Syntax</a> </li>
+<li>
+<a href="#examples">Examples</a> </li>
+<li>
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><dl class="section warning"><dt>Warning</dt><dd><em> This MADlib method 
is still in early stage development. There may be some issues that will be 
addressed in a future version. Interface and implementation are subject to 
change. </em></dd></dl>
+<p>This function uses the iterative conjugate gradient method [1] to find a 
solution to the linear system: </p><p class="formulaDsp">
+<img class="formulaDsp" alt="\[ \boldsymbol Ax = \boldsymbol b \]" 
src="form_44.png"/>
+</p>
+<p> where <img class="formulaInl" alt="$ \boldsymbol A $" src="form_45.png"/> 
is a symmetric, positive definite matrix and <img class="formulaInl" alt="$x$" 
src="form_42.png"/> and <img class="formulaInl" alt="$ \boldsymbol b $" 
src="form_43.png"/> are vectors.</p>
+<p><a class="anchor" id="syntax"></a></p><dl class="section user"><dt>Function 
Syntax</dt><dd>Conjugate gradient returns x as an array. It has the following 
syntax.</dd></dl>
+<pre class="syntax">
+conjugate_gradient( table_name,
+                    name_of_row_values_col,
+                    name_of_row_number_col,
+                    aray_of_b_values,
+                    desired_precision
+                  )
+</pre><p>Matrix <img class="formulaInl" alt="$ \boldsymbol A $" 
src="form_45.png"/> is assumed to be stored in a table where each row consists 
of at least two columns: array containing values of a given row, row number: 
</p><pre>{TABLE|VIEW} <em>matrix_A</em> (
+    <em>row_number</em> FLOAT,
+    <em>row_values</em> FLOAT[],
+)</pre><p> The number of elements in each row should be the same.</p>
+<p><img class="formulaInl" alt="$ \boldsymbol b $" src="form_43.png"/> is 
passed as a FLOAT[] to the function.</p>
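+<p>As a minimal sketch, the <em>data</em> table used in the example below could be created as follows (the table and column names follow that example): </p><pre class="example">
+CREATE TABLE data (
+    row_num FLOAT,
+    row_val FLOAT[]
+);
+INSERT INTO data VALUES
+    (1, '{2,1}'),
+    (2, '{1,4}');
+</pre>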
+<p><a class="anchor" id="examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd><ol type="1">
+<li>Construct matrix A according to structure. <pre class="example">
+SELECT * FROM data;
+</pre> Result: <pre class="result">
+ row_num | row_val
+&#160;--------+---------
+       1 | {2,1}
+       2 | {1,4}
+(2 rows)
+</pre></li>
+<li>Call the conjugate gradient function. <pre class="example">
+SELECT conjugate_gradient( 'data',
+                           'row_val',
+                           'row_num',
+                           '{2,1}',
+                           1E-6,1
+                         );
+</pre> <pre class="result">
+INFO:  COMPUTE RESIDUAL ERROR 14.5655661859659
+INFO:  ERROR 0.144934004246004
+INFO:  ERROR 3.12963615962926e-31
+INFO:  TEST FINAL ERROR 2.90029642185163e-29
+    conjugate_gradient
+&#160;--------------------------
+ {1,-1.31838984174237e-15}
+(1 row)
+</pre></li>
+</ol>
+</dd></dl>
+<p><a class="anchor" id="literature"></a></p><dl class="section 
user"><dt>Literature</dt><dd>[1] "Conjugate gradient method" Wikipedia - <a 
href="http://en.wikipedia.org/wiki/Conjugate_gradient_method";>http://en.wikipedia.org/wiki/Conjugate_gradient_method</a></dd></dl>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
Topics</dt><dd>File <a class="el" href="conjugate__gradient_8sql__in.html" 
title="SQL function computing Conjugate Gradient. 
">conjugate_gradient.sql_in</a> documenting the SQL function. </dd></dl>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Tue Sep 20 2016 11:27:01 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.10 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/bed9253d/docs/v1.9.1/group__grp__clustered__errors.html
----------------------------------------------------------------------
diff --git a/docs/v1.9.1/group__grp__clustered__errors.html 
b/docs/v1.9.1/group__grp__clustered__errors.html
new file mode 100644
index 0000000..cbe30c2
--- /dev/null
+++ b/docs/v1.9.1/group__grp__clustered__errors.html
@@ -0,0 +1,400 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.10"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Clustered Variance</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+  $(window).load(resizeHeight);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.net');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.net";><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.9.1</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.10 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__clustered__errors.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Clustered Variance<div class="ingroups"><a class="el" 
href="group__grp__super.html">Supervised Learning</a> &raquo; <a class="el" 
href="group__grp__regml.html">Regression Models</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> </p><ul>
+<li>
+<a href="#train_linregr">Clustered Variance Linear Regression Training 
Function</a> </li>
+<li>
+<a href="#train_logregr">Clustered Variance Logistic Regression Training 
Function</a> </li>
+<li>
+<a href="#train_mlogregr">Clustered Variance Multinomial Logistic Regression 
Training Function</a> </li>
+<li>
+<a href="#train_cox">Clustered Variance for Cox Proportional Hazards model</a> 
</li>
+<li>
+<a href="#examples">Examples</a> </li>
+<li>
+<a href="#notes">Notes</a> </li>
+<li>
+<a href="#background">Technical Background</a> </li>
+<li>
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>The Clustered Variance module adjusts standard errors for clustering. 
For example, replicating a dataset 100 times should not increase the precision 
of parameter estimates, but carrying out this procedure under the IID assumption 
will actually do so. Another example arises in the economics of education research, 
where it is reasonable to expect that the error terms for children in the same class 
are not independent. Clustering the standard errors can correct for this.</p>
+<p>The MADlib Clustered Variance module includes functions to calculate 
clustered standard errors for linear, logistic, multinomial logistic, and Cox 
proportional hazards regression models.</p>
+<p><a class="anchor" id="train_linregr"></a></p><dl class="section 
user"><dt>Clustered Variance Linear Regression Training 
Function</dt><dd></dd></dl>
+<p>The clustered variance linear regression training function has the 
following syntax. </p><pre class="syntax">
+clustered_variance_linregr ( source_table,
+                             out_table,
+                             dependent_varname,
+                             independent_varname,
+                             clustervar,
+                             grouping_cols
+                           )
+</pre><p> <b>Arguments</b> </p><dl class="arglist">
+<dt>source_table </dt>
+<dd><p class="startdd">TEXT. The name of the table containing the input 
data.</p>
+<p class="enddd"></p>
+</dd>
+<dt>out_table </dt>
+<dd><p class="startdd">VARCHAR. Name of the generated table containing the 
output model. The output table contains the following columns. </p><table  
class="output">
+<tr>
+<th>coef </th><td>DOUBLE PRECISION[]. Vector of the coefficients of the 
regression.  </td></tr>
+<tr>
+<th>std_err </th><td>DOUBLE PRECISION[]. Vector of the standard error of the 
coefficients.  </td></tr>
+<tr>
+<th>t_stats </th><td>DOUBLE PRECISION[]. Vector of the t-stats of the 
coefficients.  </td></tr>
+<tr>
+<th>p_values </th><td>DOUBLE PRECISION[]. Vector of the p-values of the 
coefficients.  </td></tr>
+</table>
+<p>A summary table named &lt;out_table&gt;_summary is also created, which is 
the same as the summary table created by the linregr_train function. Please refer 
to the documentation for linear regression for details.</p>
+<p></p>
+<p class="enddd"></p>
+</dd>
+<dt>dependent_varname </dt>
+<dd>TEXT. An expression to evaluate for the dependent variable. </dd>
+<dt>independent_varname </dt>
+<dd>TEXT. An expression to evaluate for the independent variables. </dd>
+<dt>clustervar </dt>
+<dd>TEXT. A comma-separated list of the columns to use as cluster variables. 
</dd>
+<dt>grouping_cols (optional) </dt>
+<dd>TEXT, default: NULL. <em>Not currently implemented. Any non-NULL value is 
ignored.</em> An expression list used to group the input dataset into discrete 
groups, running one regression per group. Similar to the SQL GROUP BY clause. 
When this value is null, no grouping is used and a single result model is 
generated. </dd>
+</dl>
+<p><a class="anchor" id="train_logregr"></a></p><dl class="section 
user"><dt>Clustered Variance Logistic Regression Training 
Function</dt><dd></dd></dl>
+<p>The clustered variance logistic regression training function has the 
following syntax. </p><pre class="syntax">
+clustered_variance_logregr( source_table,
+                            out_table,
+                            dependent_varname,
+                            independent_varname,
+                            clustervar,
+                            grouping_cols,
+                            max_iter,
+                            optimizer,
+                            tolerance,
+                            verbose_mode
+                          )
+</pre><p> <b>Arguments</b> </p><dl class="arglist">
+<dt>source_table </dt>
+<dd>TEXT. The name of the table containing the input data. </dd>
+<dt>out_table </dt>
+<dd><p class="startdd">VARCHAR. Name of the generated table containing the 
output model. The output table has the following columns: </p><table  
class="output">
+<tr>
+<th>coef </th><td>Vector of the coefficients of the regression.  </td></tr>
+<tr>
+<th>std_err </th><td>Vector of the standard error of the coefficients.  
</td></tr>
+<tr>
+<th>z_stats </th><td>Vector of the z-stats of the coefficients.  </td></tr>
+<tr>
+<th>p_values </th><td>Vector of the p-values of the coefficients.  </td></tr>
+</table>
+<p>A summary table named &lt;out_table&gt;_summary is also created, which is 
the same as the summary table created by the logregr_train function. Please refer 
to the documentation for logistic regression for details.</p>
+<p class="enddd"></p>
+</dd>
+<dt>dependent_varname </dt>
+<dd>TEXT. An expression to evaluate for the dependent variable. </dd>
+<dt>independent_varname </dt>
+<dd>TEXT. An expression to evaluate for the independent variable. </dd>
+<dt>clustervar </dt>
+<dd>TEXT. A comma-separated list of columns to use as cluster variables. </dd>
+<dt>grouping_cols (optional) </dt>
+<dd>TEXT, default: NULL. <em>Not yet implemented. Any non-NULL values are 
ignored.</em> An expression list used to group the input dataset into discrete 
groups, running one regression per group. Similar to the SQL GROUP BY clause. 
When this value is NULL, no grouping is used and a single result model is 
generated. </dd>
+<dt>max_iter (optional) </dt>
+<dd>INTEGER, default: 20. The maximum number of iterations that are allowed. 
</dd>
+<dt>optimizer (optional) </dt>
+<dd>TEXT, default: 'irls'. The name of the optimizer to use: <ul>
+<li>
+'newton' or 'irls': Iteratively reweighted least squares </li>
+<li>
+'cg': conjugate gradient </li>
+<li>
+'igd': incremental gradient descent. </li>
+</ul>
+</dd>
+<dt>tolerance (optional) </dt>
+<dd>FLOAT8, default: 0.0001. The difference between log-likelihood values in 
successive iterations below which convergence is declared. A value of zero disables 
the convergence criterion, so that execution stops after <em>max_iter</em> iterations 
have completed. </dd>
+<dt>verbose_mode (optional) </dt>
+<dd>BOOLEAN, default FALSE. If TRUE, provides verbose output of the results of 
training. </dd>
+</dl>
+<p><a class="anchor" id="train_mlogregr"></a></p><dl class="section 
user"><dt>Clustered Variance Multinomial Logistic Regression Training 
Function</dt><dd></dd></dl>
+<pre class="syntax">
+clustered_variance_mlogregr( source_table,
+                             out_table,
+                             dependent_varname,
+                             independent_varname,
+                             cluster_varname,
+                             ref_category,
+                             grouping_cols,
+                             optimizer_params,
+                             verbose_mode
+                           )
+</pre><p> <b>Arguments</b> </p><dl class="arglist">
+<dt>source_table </dt>
+<dd>TEXT. The name of the table containing the input data. </dd>
+<dt>out_table </dt>
+<dd><p class="startdd">TEXT. The name of the table where the regression model 
will be stored. The output table has the following columns: </p><table  
class="output">
+<tr>
+<th>category </th><td>The category.  </td></tr>
+<tr>
+<th>ref_category </th><td>The reference category used for modeling.  
</td></tr>
+<tr>
+<th>coef </th><td>Vector of the coefficients of the regression.  </td></tr>
+<tr>
+<th>std_err </th><td>Vector of the standard error of the coefficients.  
</td></tr>
+<tr>
+<th>z_stats </th><td>Vector of the z-stats of the coefficients.  </td></tr>
+<tr>
+<th>p_values </th><td>Vector of the p-values of the coefficients.  </td></tr>
+</table>
+<p class="enddd">A summary table named &lt;out_table&gt;_summary is also 
created, which is the same as the summary table created by the mlogregr_train 
function. Please refer to the documentation for multinomial logistic regression 
for details.  </p>
+</dd>
+<dt>dependent_varname </dt>
+<dd>TEXT. An expression to evaluate for the dependent variable. </dd>
+<dt>independent_varname </dt>
+<dd>TEXT. An expression to evaluate for the independent variable. </dd>
+<dt>cluster_varname </dt>
+<dd>TEXT. A comma-separated list of columns to use as cluster variables. </dd>
+<dt>ref_category (optional) </dt>
+<dd>INTEGER. Reference category in the range [0, num_category). </dd>
+<dt>grouping_cols (optional) </dt>
+<dd>TEXT, default: NULL. <em>Not yet implemented. Any non-NULL values are 
ignored.</em> A comma-separated list of columns to use as grouping variables. 
</dd>
+<dt>optimizer_params (optional) </dt>
+<dd>TEXT, default: NULL, which uses the default values of optimizer 
parameters: max_iter=20, optimizer='newton', tolerance=1e-4. It should be a 
string that contains pairs of 'key=value' separated by commas. </dd>
+<dt>verbose_mode (optional) </dt>
+<dd>BOOLEAN, default FALSE. If TRUE, detailed information is printed when 
computing logistic regression. </dd>
+</dl>
+<p><a class="anchor" id="train_cox"></a></p><dl class="section 
user"><dt>Clustered Variance for Cox Proportional Hazards 
model</dt><dd></dd></dl>
+<p>The clustered robust variance estimator function for the Cox Proportional 
Hazards model has the following syntax. </p><pre class="syntax">
+clustered_variance_coxph(model_table, output_table, clustervar)
+</pre><p><b>Arguments</b> </p><dl class="arglist">
+<dt>model_table </dt>
+<dd>TEXT. The name of the model table, which is exactly the same as the 
'output_table' parameter of <a class="el" 
href="cox__prop__hazards_8sql__in.html#a737450bbfe0f10204b0074a9d45b0cef" 
title="Compute cox-regression coefficients and diagnostic statistics. 
">coxph_train()</a> function. </dd>
+<dt>output_table </dt>
+<dd>TEXT. The name of the table where the output is saved. It has the 
following columns: <table  class="output">
+<tr>
+<th>coef </th><td>FLOAT8[]. Vector of the coefficients.  </td></tr>
+<tr>
+<th>loglikelihood </th><td>FLOAT8. Log-likelihood value of the MLE estimate.  
</td></tr>
+<tr>
+<th>std_err </th><td>FLOAT8[]. Vector of the standard error of the 
coefficients.  </td></tr>
+<tr>
+<th>clustervar </th><td>TEXT. A comma-separated list of columns to use as 
cluster variables.  </td></tr>
+<tr>
+<th>clustered_se </th><td>FLOAT8[]. Vector of the robust standard errors of 
the coefficients.  </td></tr>
+<tr>
+<th>clustered_z </th><td>FLOAT8[]. Vector of the robust z-stats of the 
coefficients.  </td></tr>
+<tr>
+<th>clustered_p </th><td>FLOAT8[]. Vector of the robust p-values of the 
coefficients.  </td></tr>
+<tr>
+<th>hessian </th><td>FLOAT8[]. The Hessian matrix.  </td></tr>
+</table>
+</dd>
+<dt>clustervar </dt>
+<dd>TEXT. A comma-separated list of columns to use as cluster variables. </dd>
+</dl>
+<p><a class="anchor" id="examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd></dd></dl>
+<ol type="1">
+<li>View online help for the clustered variance linear regression function. 
<pre class="example">
+SELECT madlib.clustered_variance_linregr();
+</pre></li>
+<li>Run the linear regression function and view the results. <pre 
class="example">
+DROP TABLE IF EXISTS out_table;
+SELECT madlib.clustered_variance_linregr( 'abalone',
+                                          'out_table',
+                                          'rings',
+                                          'ARRAY[1, diameter, length, width]',
+                                          'sex',
+                                          NULL
+                                        );
+SELECT * FROM out_table;
+</pre></li>
+<li>View online help for the clustered variance logistic regression function. 
<pre class="example">
+SELECT madlib.clustered_variance_logregr();
+</pre></li>
+<li>Run the logistic regression function and view the results. <pre 
class="example">
+DROP TABLE IF EXISTS out_table;
+SELECT madlib.clustered_variance_logregr( 'abalone',
+                                          'out_table',
+                                          'rings &lt; 10',
+                                          'ARRAY[1, diameter, length, width]',
+                                          'sex'
+                                        );
+SELECT * FROM out_table;
+</pre></li>
+<li>View online help for the clustered variance multinomial logistic 
regression function. <pre class="example">
+SELECT madlib.clustered_variance_mlogregr();
+</pre></li>
+<li>Run the multinomial logistic regression and view the results. <pre 
class="example">
+DROP TABLE IF EXISTS out_table;
+SELECT madlib.clustered_variance_mlogregr( 'abalone',
+                                           'out_table',
+                                           'CASE WHEN rings &lt; 10 THEN 1 
ELSE 0 END',
+                                           'ARRAY[1, diameter, length, width]',
+                                           'sex',
+                                           0
+                                         );
+SELECT * FROM out_table;
+</pre></li>
+<li>Run the Cox Proportional Hazards regression and compute the clustered 
robust estimator. <pre class="example">
+DROP TABLE IF EXISTS lung_cl_out;
+DROP TABLE IF EXISTS lung_out;
+DROP TABLE IF EXISTS lung_out_summary;
+SELECT madlib.coxph_train('lung',
+                          'lung_out',
+                          'time',
+                          'array[age, "ph.ecog"]',
+                          'TRUE',
+                          NULL,
+                          NULL);
+SELECT madlib.clustered_variance_coxph('lung_out',
+                                       'lung_cl_out',
+                                       '"ph.karno"');
+SELECT * FROM lung_cl_out;
+</pre></li>
+</ol>
+<p><a class="anchor" id="notes"></a></p><dl class="section 
user"><dt>Notes</dt><dd></dd></dl>
+<ul>
+<li>An intercept term must be included manually in the independent variable 
expression, as the constant 1 in <code>ARRAY[1, diameter, length, width]</code> in the 
examples above. Passing NULL for <em>groupingvar</em> means that no grouping is 
used in the calculation.</li>
+</ul>
+<p><a class="anchor" id="background"></a></p><dl class="section 
user"><dt>Technical Background</dt><dd></dd></dl>
+<p>Assume that the data can be separated into <img class="formulaInl" 
alt="$m$" src="form_314.png"/> clusters. Usually this can be done by grouping 
the data table according to one or multiple columns.</p>
+<p>The estimator has a similar form to the usual sandwich estimator </p><p 
class="formulaDsp">
+<img class="formulaDsp" alt="\[ S(\vec{c}) = B(\vec{c}) M(\vec{c}) B(\vec{c}) 
\]" src="form_315.png"/>
+</p>
+<p>The bread part is the same as in the Huber-White sandwich estimator </p><p 
class="formulaDsp">
+<img class="formulaDsp" alt="\begin{eqnarray} B(\vec{c}) &amp; = &amp; 
\left(-\sum_{i=1}^{n} H(y_i, \vec{x}_i, \vec{c})\right)^{-1}\\ &amp; = &amp; 
\left(-\sum_{i=1}^{n}\frac{\partial^2 l(y_i, \vec{x}_i, \vec{c})}{\partial 
c_\alpha \partial c_\beta}\right)^{-1} \end{eqnarray}" src="form_316.png"/>
+</p>
+<p> where <img class="formulaInl" alt="$H$" src="form_317.png"/> is the 
Hessian matrix, i.e. the matrix of second derivatives of the target function </p><p 
class="formulaDsp">
+<img class="formulaDsp" alt="\[ L(\vec{c}) = \sum_{i=1}^n l(y_i, \vec{x}_i, 
\vec{c})\ . \]" src="form_318.png"/>
+</p>
+<p>The meat part is different </p><p class="formulaDsp">
+<img class="formulaDsp" alt="\[ M(\vec{c}) = \bf{A}^T\bf{A} \]" 
src="form_319.png"/>
+</p>
+<p> where the <img class="formulaInl" alt="$m$" src="form_314.png"/>-th row of 
<img class="formulaInl" alt="$\bf{A}$" src="form_320.png"/> is </p><p 
class="formulaDsp">
+<img class="formulaDsp" alt="\[ A_m = \sum_{i\in G_m}\frac{\partial 
l(y_i,\vec{x}_i,\vec{c})}{\partial \vec{c}} \]" src="form_321.png"/>
+</p>
+<p> where <img class="formulaInl" alt="$G_m$" src="form_322.png"/> is the set 
of rows that belong to the <em>m</em>-th cluster.</p>
+<p>The quantities <img class="formulaInl" alt="$B$" src="form_206.png"/> and <img class="formulaInl" alt="$A$" src="form_41.png"/> can be computed for each cluster in a single scan through the data table inside an aggregate function. The per-cluster results are then summed over all clusters, outside the aggregate function, to obtain the full <img class="formulaInl" alt="$B$" src="form_206.png"/> and <img class="formulaInl" alt="$A$" src="form_41.png"/>. Finally, the matrix multiplications are performed in a separate function on the master node.</p>
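+<p>The following is an illustrative sketch only, not MADlib internals: for a one-predictor linear model, the per-cluster score sums that make up the rows of the meat-part matrix can be computed with a simple grouped aggregate. The table <code>my_data(y, x, cluster_id)</code> and the fitted coefficients (substituted here as the literals 0.5 and 1.2) are hypothetical. </p><pre class="example">
+-- Each returned row is the per-cluster sum of the score (gradient) vector.
+SELECT cluster_id,
+       SUM(y - (0.5 + 1.2 * x))        AS score_intercept,  -- gradient w.r.t. the intercept
+       SUM((y - (0.5 + 1.2 * x)) * x)  AS score_slope       -- gradient w.r.t. the slope
+FROM my_data
+GROUP BY cluster_id;
+</pre><p>Summing the outer products of these per-cluster rows gives the meat part, while the bread part is obtained from the Hessian accumulated over all rows, as described above.</p>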
+<p>When multinomial logistic regression is computed before the multinomial 
clustered variance calculation, it uses a default reference category of zero 
and the regression coefficients are included in the output table. The 
regression coefficients in the output are in the same order as in the multinomial 
logistic regression function, described as follows. For a problem with <img 
class="formulaInl" alt="$ K $" src="form_117.png"/> dependent variables <img 
class="formulaInl" alt="$ (1, ..., K) $" src="form_118.png"/> and <img 
class="formulaInl" alt="$ J $" src="form_119.png"/> categories <img 
class="formulaInl" alt="$ (0, ..., J-1) $" src="form_120.png"/>, let <img 
class="formulaInl" alt="$ {m_{k,j}} $" src="form_121.png"/> denote the 
coefficient for dependent variable <img class="formulaInl" alt="$ k $" 
src="form_97.png"/> and category <img class="formulaInl" alt="$ j $" 
src="form_122.png"/>. The output is <img class="formulaInl" alt="$ {m_{k_1, 
j_0}, m_{k_1, j_1} \ldots m_{k_1, j_{J-1}},
  m_{k_2, j_0}, m_{k_2, j_1} \ldots m_{k_K, j_{J-1}}} $" src="form_323.png"/>. 
For example, with two variables and three categories the coefficients appear in the 
order <em>m<sub>1,0</sub>, m<sub>1,1</sub>, m<sub>1,2</sub>, m<sub>2,0</sub>, m<sub>2,1</sub>, m<sub>2,2</sub></em>. 
Note that this order is NOT consistent with the ordering used by the multinomial 
marginal effects function <em>marginal_mlogregr</em>. This is deliberate, because the 
interfaces of all multinomial regression functions (robust, clustered, ...) will 
eventually be changed to match the interface used by the marginal effects functions.</p>
+<p><a class="anchor" id="literature"></a></p><dl class="section 
user"><dt>Literature</dt><dd></dd></dl>
+<p>[1] Standard, Robust, and Clustered Standard Errors Computed in R, <a 
href="http://diffuseprior.wordpress.com/2012/06/15/standard-robust-and-clustered-standard-errors-computed-in-r/";>http://diffuseprior.wordpress.com/2012/06/15/standard-robust-and-clustered-standard-errors-computed-in-r/</a></p>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
Topics</dt><dd>File <a class="el" 
href="clustered__variance_8sql__in.html">clustered_variance.sql_in</a> 
documenting the clustered variance SQL functions.</dd></dl>
+<p>File <a class="el" href="clustered__variance__coxph_8sql__in.html" 
title="SQL functions for clustered robust cox proportional hazards regression. 
">clustered_variance_coxph.sql_in</a> documenting the clustered variance for 
Cox proportional hazards SQL functions.</p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Tue Sep 20 2016 11:27:01 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.10 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/bed9253d/docs/v1.9.1/group__grp__clustering.html
----------------------------------------------------------------------
diff --git a/docs/v1.9.1/group__grp__clustering.html 
b/docs/v1.9.1/group__grp__clustering.html
new file mode 100644
index 0000000..0cf3a34
--- /dev/null
+++ b/docs/v1.9.1/group__grp__clustering.html
@@ -0,0 +1,134 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.10"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Clustering</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+  $(window).load(resizeHeight);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.net');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.net";><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.9.1</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.10 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__clustering.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="summary">
+<a href="#groups">Modules</a>  </div>
+  <div class="headertitle">
+<div class="title">Clustering<div class="ingroups"><a class="el" 
href="group__grp__unsupervised.html">Unsupervised Learning</a></div></div>  
</div>
+</div><!--header-->
+<div class="contents">
+<a name="details" id="details"></a><h2 class="groupheader">Detailed 
Description</h2>
+<p>A collection of methods for clustering data. </p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a 
name="groups"></a>
+Modules</h2></td></tr>
+<tr class="memitem:group__grp__kmeans"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__kmeans.html">k-Means Clustering</a></td></tr>
+<tr class="memdesc:group__grp__kmeans"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Partitions a set of observations into clusters by finding 
centroids that minimize the sum of observations' distances from their closest 
centroid. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Tue Sep 20 2016 11:27:01 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.10 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/bed9253d/docs/v1.9.1/group__grp__clustering.js
----------------------------------------------------------------------
diff --git a/docs/v1.9.1/group__grp__clustering.js 
b/docs/v1.9.1/group__grp__clustering.js
new file mode 100644
index 0000000..61858da
--- /dev/null
+++ b/docs/v1.9.1/group__grp__clustering.js
@@ -0,0 +1,4 @@
+var group__grp__clustering =
+[
+    [ "k-Means Clustering", "group__grp__kmeans.html", null ]
+];
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/bed9253d/docs/v1.9.1/group__grp__correlation.html
----------------------------------------------------------------------
diff --git a/docs/v1.9.1/group__grp__correlation.html 
b/docs/v1.9.1/group__grp__correlation.html
new file mode 100644
index 0000000..38d945b
--- /dev/null
+++ b/docs/v1.9.1/group__grp__correlation.html
@@ -0,0 +1,262 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.10"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Pearson&#39;s Correlation</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+  $(window).load(resizeHeight);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.net');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.net";><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.9.1</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.10 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__correlation.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Pearson's Correlation<div class="ingroups"><a class="el" 
href="group__grp__stats.html">Statistics</a> &raquo; <a class="el" 
href="group__grp__desc__stats.html">Descriptive Statistics</a></div></div>  
</div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> </p><ul>
+<li>
+<a href="#usage">Correlation Function</a> </li>
+<li>
+<a href="#examples">Examples</a> </li>
+<li>
+<a href="#seealso">See Also</a> </li>
+</ul>
+</div><p>Correlation is a measure of the degree and direction of association between 
two variables&mdash;how well one random variable can be predicted from the 
other. The correlation coefficient ranges from -1 to 1: a coefficient of 1 
implies perfect correlation, 0 means no correlation, and -1 means perfect 
anti-correlation.</p>
+<p>This function provides a cross-correlation matrix for all pairs of numeric 
columns in a <em>source_table</em>. A correlation matrix describes correlation 
among <img class="formulaInl" alt="$ M $" src="form_174.png"/> variables. It is 
a square symmetrical <img class="formulaInl" alt="$ M $" src="form_174.png"/>x 
<img class="formulaInl" alt="$M $" src="form_380.png"/> matrix with the <img 
class="formulaInl" alt="$ (ij) $" src="form_381.png"/>th element equal to the 
correlation coefficient between the <img class="formulaInl" alt="$i$" 
src="form_128.png"/>th and the <img class="formulaInl" alt="$j$" 
src="form_129.png"/>th variable. The diagonal elements (correlations of 
variables with themselves) are always equal to 1.0.</p>
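+<p>As a quick cross-check (a sketch, not part of the MADlib interface), the Pearson correlation between two specific numeric columns can also be computed with the database's built-in <code>corr()</code> aggregate; the query below assumes the <code>example_data</code> table created in the Examples section further down. </p><pre class="example">
+-- Built-in Pearson correlation between two columns, for comparison with
+-- the madlib.correlation() output shown in the Examples section.
+SELECT corr(humidity, temperature) AS pearson_r
+FROM example_data;
+</pre>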
+<p><a class="anchor" id="usage"></a></p><dl class="section 
user"><dt>Correlation Function</dt><dd></dd></dl>
+<p>The correlation function has the following syntax: </p><pre class="syntax">
+correlation( source_table,
+             output_table,
+             target_cols,
+             verbose
+           )
+</pre><p>The covariance function, with a similar syntax, can be used to 
compute the covariance between features. </p><pre class="syntax">
+covariance( source_table,
+             output_table,
+             target_cols,
+             verbose
+           )
+</pre><dl class="arglist">
+<dt>source_table </dt>
+<dd><p class="startdd">TEXT. The name of the data containing the input 
data.</p>
+<p class="enddd"></p>
+</dd>
+<dt>output_table </dt>
+<dd><p class="startdd">TEXT. The name of the table where the cross-correlation 
matrix will be saved. The output is a table with N+2 columns and N rows, where 
N is the number of target columns. It contains the following columns. 
</p><table  class="output">
+<tr>
+<th>column_position </th><td>The first column is a sequential counter 
indicating the position of the variable in the '<em>output_table</em>'.  
</td></tr>
+<tr>
+<th>variable </th><td>The second column contains the row-header for the 
variables.  </td></tr>
+<tr>
+<th>&lt;...&gt; </th><td>The remainder of the table is the NxN correlation 
matrix for the pairs of numeric 'source_table' columns.  </td></tr>
+</table>
+<p>The output table is arranged as a lower-triangular matrix with the upper 
triangle set to NULL and the diagonal elements set to 1.0. To obtain the result 
from the '<em>output_table</em>' in this matrix format, be sure to order the 
rows by '<em>column_position</em>', as shown in the example below. 
</p><pre class="example">
+SELECT * FROM output_table ORDER BY column_position;
+</pre><p>In addition to the output table, a summary table named 
&lt;output_table&gt;_summary is also created at the same time; it has the 
following columns: </p><table  class="output">
+<tr>
+<th>method</th><td>'correlation' </td></tr>
+<tr>
+<th>source_table</th><td>VARCHAR. The data source table name. </td></tr>
+<tr>
+<th>output_table</th><td>VARCHAR. The output table name. </td></tr>
+<tr>
+<th>column_names</th><td>VARCHAR. Column names used for correlation 
computation, comma-separated string. </td></tr>
+<tr>
+<th>mean_vector</th><td>FLOAT8[]. Vector in which each element is the mean of the corresponding column. 
</td></tr>
+<tr>
+<th>total_rows_processed </th><td>BIGINT. Total number of rows processed.  
</td></tr>
+<tr>
+<th>total_rows_skipped </th><td>BIGINT. Total number of rows skipped due to 
missing values.  </td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>target_cols (optional) </dt>
+<dd><p class="startdd">TEXT, default: '*'. A comma-separated list of the 
columns to correlate. If NULL or <code>'*'</code>, results are produced for all 
numeric columns.</p>
+<p class="enddd"></p>
+</dd>
+<dt>verbose (optional) </dt>
+<dd><p class="startdd">BOOLEAN, default: FALSE. Print verbose debugging 
information if TRUE.</p>
+<p class="enddd"></p>
+</dd>
+</dl>
+<p><a class="anchor" id="examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd></dd></dl>
+<ol type="1">
+<li>View online help for the correlation function. <pre class="example">
+SELECT madlib.correlation();
+</pre></li>
+<li>Create an input data set. <pre class="example">
+DROP TABLE IF EXISTS example_data;
+CREATE TABLE example_data(
+    id SERIAL, outlook TEXT,
+    temperature FLOAT8, humidity FLOAT8,
+    windy TEXT, class TEXT);
+INSERT INTO example_data VALUES
+(1, 'sunny', 85, 85, 'false', 'Dont Play'),
+(2, 'sunny', 80, 90, 'true', 'Dont Play'),
+(3, 'overcast', 83, 78, 'false', 'Play'),
+(4, 'rain', 70, 96, 'false', 'Play'),
+(5, 'rain', 68, 80, 'false', 'Play'),
+(6, 'rain', 65, 70, 'true', 'Dont Play'),
+(7, 'overcast', 64, 65, 'true', 'Play'),
+(8, 'sunny', 72, 95, 'false', 'Dont Play'),
+(9, 'sunny', 69, 70, 'false', 'Play'),
+(10, 'rain', 75, 80, 'false', 'Play'),
+(11, 'sunny', 75, 70, 'true', 'Play'),
+(12, 'overcast', 72, 90, 'true', 'Play'),
+(13, 'overcast', 81, 75, 'false', 'Play'),
+(14, 'rain', 71, 80, 'true', 'Dont Play'),
+(15, NULL, 100, 100, 'true', NULL),
+(16, NULL, 110, 100, 'true', NULL);
+</pre></li>
+<li>Run the <a class="el" 
href="correlation_8sql__in.html#ada17a10ea8a6c4580e7413c86ae5345e">correlation()</a>
 function on the data set. <pre class="example">
+-- Correlate all numeric columns
+SELECT madlib.correlation( 'example_data',
+                           'example_data_output'
+                         );
+-- Setting target_cols to NULL or '*' also correlates all numeric columns
+SELECT madlib.correlation( 'example_data',
+                           'example_data_output',
+                           '*'
+                         );
+-- Correlate only the temperature and humidity columns
+SELECT madlib.correlation( 'example_data',
+                           'example_data_output',
+                           'temperature, humidity'
+                         );
+</pre></li>
+<li>View the correlation matrix. <pre class="example">
+SELECT * FROM example_data_output ORDER BY column_position;
+</pre> Result: <pre class="result">
+ column_position |  variable   |    temperature    | humidity
+-----------------+-------------+-------------------+----------
+               1 | temperature |               1.0 |
+               2 | humidity    | 0.616876934548786 |      1.0
+(2 rows)
+</pre></li>
+<li>Compute the covariance of features in the data set. <pre class="example">
+SELECT madlib.covariance( 'example_data',
+                          'cov_output'
+                         );
+</pre></li>
+<li>View the covariance matrix. <pre class="example">
+SELECT * FROM cov_output ORDER BY column_position;
+</pre> Result: <pre class="result">
+ column_position |  variable   |    temperature    | humidity
+-----------------+-------------+-------------------+----------
+               1 | temperature |      146.25       |
+               2 | humidity    |      82.125       | 121.1875
+(2 rows)
+</pre></li>
+</ol>
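+<p>The summary table created alongside the correlation output can be inspected in the same way; for the run above it is named <code>example_data_output_summary</code> (the table name is derived from the output table, as described in the arguments section). </p><pre class="example">
+-- View the summary table produced by madlib.correlation()
+SELECT * FROM example_data_output_summary;
+</pre>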
+<dl class="section user"><dt>Notes</dt><dd>Current implementation ignores a 
row that contains NULL entirely. This means any correlation in such a row (with 
NULLs) does not contribute to the final answer.</dd></dl>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
Topics</dt><dd></dd></dl>
+<p>File <a class="el" href="correlation_8sql__in.html" title="SQL functions 
for correlation computation. ">correlation.sql_in</a> documenting the SQL 
functions.</p>
+<p><a class="el" href="group__grp__summary.html">Summary</a> for general 
descriptive statistics for a table. </p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Tue Sep 20 2016 11:27:01 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.10 </li>
+  </ul>
+</div>
+</body>
+</html>
