http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/b5b51c69/docs/v1.11/group__grp__pred.html
----------------------------------------------------------------------
diff --git a/docs/v1.11/group__grp__pred.html b/docs/v1.11/group__grp__pred.html
new file mode 100644
index 0000000..ac5aaa4
--- /dev/null
+++ b/docs/v1.11/group__grp__pred.html
@@ -0,0 +1,359 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Prediction Metrics</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.incubator.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.incubator.apache.org";><img 
alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ 
></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.11</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__pred.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Prediction Metrics<div class="ingroups"><a class="el" 
href="group__grp__mdl.html">Model Evaluation</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> <ul>
+<li>
+<a href="#list">List of Prediction Metric Functions</a> </li>
+<li>
+<a href="#specs">Function Specific Details</a> </li>
+<li>
+<a href="#examples">Examples</a> </li>
+<li>
+<a href="#literature">Literature</a> </li>
+<li>
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>This module provides a set of metrics to evaluate the quality of 
predictions of a model. A typical function will take a set of "prediction" and 
"observation" values and use them to calculate the desired metric, unless noted 
otherwise. Grouping is supported for all functions (except confusion 
matrix).</p>
+<p><a class="anchor" id="list"></a></p><dl class="section user"><dt>Prediction 
Metrics Functions</dt><dd><table class="output">
+<tr>
+<th>mean_abs_error(table_in, table_out, prediction_col, observed_col, 
grouping_cols)</th><td>Mean absolute error  </td></tr>
+<tr>
+<th>mean_abs_perc_error(table_in, table_out, prediction_col, observed_col, 
grouping_cols)</th><td>Mean absolute percentage error  </td></tr>
+<tr>
+<th>mean_perc_error(table_in, table_out, prediction_col, observed_col, 
grouping_cols)</th><td>Mean percentage error  </td></tr>
+<tr>
+<th>mean_squared_error(table_in, table_out, prediction_col, observed_col, 
grouping_cols)</th><td>Mean squared error </td></tr>
+<tr>
+<th>r2_score(table_in, table_out, prediction_col, observed_col, 
grouping_cols)</th><td>R-squared  </td></tr>
+<tr>
+<th>adjusted_r2_score(table_in, table_out, prediction_col, observed_col, 
num_predictors, training_size, grouping_cols)</th><td>Adjusted R-squared  
</td></tr>
+<tr>
+<th>binary_classifier(table_in, table_out, prediction_col, observed_col, 
grouping_cols)</th><td>Collection of prediction metrics related to binary 
classification </td></tr>
+<tr>
+<th>area_under_roc(table_in, table_out, prediction_col, observed_col, 
grouping_cols)</th><td>Area under the ROC curve (in binary classification)  
</td></tr>
+<tr>
+<th>confusion_matrix(table_in, table_out, prediction_col, observed_col, 
grouping_cols)</th><td>Confusion matrix for a multi-class classifier  </td></tr>
+</table>
+</dd></dl>
+<p><b>Arguments</b> </p><dl class="arglist">
+<dt>table_in </dt>
+<dd>TEXT. Name of the input table. </dd>
+<dt>table_out </dt>
+<dd>TEXT. Name of the output table. For consistency, a table is created for 
all metric outputs even when grouping is not used, which may mean there is only 
a single value in the output table in some cases.  </dd>
+<dt>prediction_col </dt>
+<dd>TEXT. Name of the column of predicted values from input table. </dd>
+<dt>observed_col </dt>
+<dd>TEXT. Name of the column of observed values from input table. </dd>
+<dt>num_predictors (for adjusted R-squared score only) </dt>
+<dd>INTEGER. The number of parameters in the predicting model, not counting 
the constant term. </dd>
+<dt>training_size (for adjusted R-squared score only) </dt>
+<dd>INTEGER. The number of rows used for training, excluding any NULL rows. 
</dd>
+<dt>grouping_cols (optional) </dt>
+<dd>TEXT, default: NULL. Name of the column of grouping values from input 
table. </dd>
+</dl>
+<p><a class="anchor" id="specs"></a></p><dl class="section user"><dt>Function 
Specific Details</dt><dd></dd></dl>
+<p><b>R-squared Score</b></p>
+<p>This function returns the coefficient of determination (R2) between the 
predicted and observed values. An R2 of 1 indicates that the regression line 
perfectly fits the data, while an R2 of 0 indicates that the line does not fit 
the data at all. Negative values of R2 may occur when fitting non-linear 
functions to data. Please refer to reference <a href="#r2">[1]</a> for more 
details.</p>
+<p><b>Adjusted R-squared Score</b></p>
+<p>This function returns the adjusted R2 score in addition to the R-squared 
score described above. Adjusted R2 score is used to counter the problem of the 
R2 automatically increasing when extra explanatory variables are added to the 
model. It takes two additional parameters describing the degrees of freedom of 
the model (num_predictors) and the size of the training set over which it was 
developed (training_size):</p>
+<ul>
+<li>num_predictors: Indicates the number of parameters the model has other 
than the constant term. For example, if it is set to '3' the model may take the 
following form as an example: 7 + 5x + 39y + 0.91z.</li>
+<li>training_size: Indicates the number of rows in the training set (excluding 
any NULL rows).</li>
+</ul>
+<p>Neither of these arguments can be deduced from the predicted values and the 
test data alone, which is why they are explicit inputs. Please refer to 
reference <a href="#r2">[1]</a> for more details.</p>
+<p><a class="anchor" id="bc"></a><b>Binary Classification</b></p>
+<p>This function returns an output table with a number of metrics commonly 
used in binary classification.</p>
+<p>The definitions of the various metrics are as follows:</p>
+<ul>
+<li><img class="formulaInl" alt="$\textit{tp}$" src="form_510.png"/> is the 
count of correctly-classified positives.</li>
+<li><img class="formulaInl" alt="$\textit{tn}$" src="form_511.png"/> is the 
count of correctly-classified negatives.</li>
+<li><img class="formulaInl" alt="$\textit{fp}$" src="form_512.png"/> is the 
count of misclassified negatives.</li>
+<li><img class="formulaInl" alt="$\textit{fn}$" src="form_513.png"/> is the 
count of misclassified positives.</li>
+<li><img class="formulaInl" 
alt="$\textit{tpr}=\textit{tp}/(\textit{tp}+\textit{fn})$" 
src="form_514.png"/>.</li>
+<li><img class="formulaInl" 
alt="$\textit{tnr}=\textit{tn}/(\textit{fp}+\textit{tn})$" 
src="form_515.png"/>.</li>
+<li><img class="formulaInl" 
alt="$\textit{ppv}=\textit{tp}/(\textit{tp}+\textit{fp})$" 
src="form_516.png"/>.</li>
+<li><img class="formulaInl" 
alt="$\textit{npv}=\textit{tn}/(\textit{tn}+\textit{fn})$" 
src="form_517.png"/>.</li>
+<li><img class="formulaInl" 
alt="$\textit{fpr}=\textit{fp}/(\textit{fp}+\textit{tn})$" 
src="form_518.png"/>.</li>
+<li><img class="formulaInl" alt="$\textit{fdr}=1-\textit{ppv}$" 
src="form_519.png"/>.</li>
+<li><img class="formulaInl" 
alt="$\textit{fnr}=\textit{fn}/(\textit{fn}+\textit{tp})$" 
src="form_520.png"/>.</li>
+<li><img class="formulaInl" 
alt="$\textit{acc}=(\textit{tp}+\textit{tn})/(\textit{tp}+\textit{tn}+\textit{fp}
 +\textit{fn})$" src="form_521.png"/>.</li>
+<li><img class="formulaInl" 
alt="$\textit{f1}=2*\textit{tp}/(2*\textit{tp}+\textit{fp}+\textit{fn})$" 
src="form_522.png"/>.</li>
+</ul>
+<p><b>Area Under ROC Curve</b></p>
+<p>This function returns the area under the Receiver Operating Characteristic 
curve for binary classification (the AUC). The ROC curve is the curve relating 
the classifier's TPR and FPR metrics. (See <a href="#bc">Binary 
Classification</a> above for a definition of these metrics). Please refer to 
reference <a href="#aoc">[2]</a> for more details. Note that the binary 
classification function can be used to obtain the data (TPR and FPR values) 
required for drawing the ROC curve.</p>
+<dl class="section note"><dt>Note</dt><dd>For 'binary_classifier' and 
'area_under_roc' functions:<ul>
+<li>The 'observed_col' column is assumed to be a numeric column with two 
values: 0 and 1, or a Boolean column. For the purposes of the metric 
calculation, 0 is considered to be negative and 1 to be positive.</li>
+<li>The 'prediction_col' column is expected to contain numeric values corresponding 
to likelihood/probability. A larger value corresponds to greater certainty that 
the observed value will be '1', and a lower value corresponds to a greater 
certainty that it will be '0'.</li>
+</ul>
+</dd></dl>
+<p><b>Confusion Matrix</b></p>
+<p>This function returns the confusion matrix of a multi-class classification. 
Each column of the matrix represents the instances in a predicted class while 
each row represents the instances in an actual class. This allows more detailed 
analysis than mere proportion of correct guesses (accuracy). Please refer to 
the reference <a href="#cm">[3]</a> for more details. Please note that grouping 
is not supported for the confusion matrix.</p>
+<p><a class="anchor" id="examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd></dd></dl>
+<ol type="1">
+<li>Create the sample data: <pre class="example">
+DROP TABLE IF EXISTS test_set;
+CREATE TABLE test_set(
+                  pred FLOAT8,
+                  obs FLOAT8
+                );
+INSERT INTO test_set VALUES
+  (37.5,53.1), (12.3,34.2), (74.2,65.4), (91.1,82.1);
+</pre></li>
+<li>Run the Mean Absolute Error function: <pre class="example">
+DROP TABLE IF EXISTS table_out;
+SELECT madlib.mean_abs_error( 'test_set', 'table_out', 'pred', 'obs');
+SELECT * FROM table_out;
+</pre> Result <pre class="result">
+ mean_abs_error
+&#160;----------------
+         13.825
+</pre></li>
+<li>Run the Mean Absolute Percentage Error function: <pre class="example">
+DROP TABLE IF EXISTS table_out;
+SELECT madlib.mean_abs_perc_error( 'test_set', 'table_out', 'pred', 'obs');
+SELECT * FROM table_out;
+</pre> Result <pre class="result">
+ mean_abs_perc_error
+&#160;---------------------
+   0.294578793636013
+</pre></li>
+<li>Run the Mean Percentage Error function: <pre class="example">
+DROP TABLE IF EXISTS table_out;
+SELECT madlib.mean_perc_error( 'test_set', 'table_out', 'pred', 'obs');
+SELECT * FROM table_out;
+</pre> Result <pre class="result">
+ mean_perc_error
+&#160;-------------------
+   -0.17248930032771
+</pre></li>
+<li>Run the Mean Squared Error function: <pre class="example">
+DROP TABLE IF EXISTS table_out;
+SELECT madlib.mean_squared_error( 'test_set', 'table_out', 'pred', 'obs');
+SELECT * FROM table_out;
+</pre> Result <pre class="result">
+ mean_squared_error
+&#160;--------------------
+   220.3525
+</pre></li>
+<li>Run the R2 Score function: <pre class="example">
+DROP TABLE IF EXISTS table_out;
+SELECT madlib.r2_score( 'test_set', 'table_out', 'pred', 'obs');
+SELECT * FROM table_out;
+</pre> Result <pre class="result">
+ r2_score
+&#160;------------------------
+   0.27992908844337695865
+</pre></li>
+<li>Run the Adjusted R2 Score function: <pre class="example">
+DROP TABLE IF EXISTS table_out;
+SELECT madlib.adjusted_r2_score( 'test_set', 'table_out', 'pred', 'obs', 3, 
100);
+SELECT * FROM table_out;
+</pre> Result <pre class="result">
+       r2_score      | adjusted_r2_score 
+&#160;--------------------+------------------
+   0.279929088443375 | 0.257426872457231
+</pre></li>
+<li>Create the sample data for binary classifier metrics: <pre class="example">
+DROP TABLE IF EXISTS test_set;
+CREATE TABLE test_set AS
+    SELECT ((a*8)::integer)/8.0 pred,
+        ((a*0.5+random()*0.5)&gt;0.5) obs
+    FROM (select random() as a from generate_series(1,100)) x;
+</pre></li>
+<li>Run the Binary Classifier metrics function: <pre class="example">
+DROP TABLE IF EXISTS table_out;
+SELECT madlib.binary_classifier( 'test_set', 'table_out', 'pred', 'obs');
+</pre></li>
+<li>View the True Positive Rate and the False Positive Rate: <pre 
class="example">
+SELECT threshold, tpr, fpr FROM table_out ORDER BY threshold;
+</pre> Result (your results for this and other functions below will look 
different due to the presence of the random function in sample data generator): 
<pre class="result">
+       threshold        |          tpr           |          fpr
+------------------------+------------------------+------------------------
+ 0.00000000000000000000 | 1.00000000000000000000 | 1.00000000000000000000
+ 0.12500000000000000000 | 1.00000000000000000000 | 0.94915254237288135593
+ 0.25000000000000000000 | 0.92682926829268292683 | 0.64406779661016949153
+ 0.37500000000000000000 | 0.80487804878048780488 | 0.47457627118644067797
+ 0.50000000000000000000 | 0.70731707317073170732 | 0.35593220338983050847
+ 0.62500000000000000000 | 0.63414634146341463415 | 0.25423728813559322034
+ 0.75000000000000000000 | 0.48780487804878048780 | 0.06779661016949152542
+ 0.87500000000000000000 | 0.29268292682926829268 | 0.03389830508474576271
+ 1.00000000000000000000 | 0.12195121951219512195 | 0.00000000000000000000
+</pre></li>
+<li>View all metrics at a given threshold value: <pre class="example">
+-- Set extended display on for easier reading of output
+\x on
+SELECT * FROM table_out WHERE threshold=0.5;
+</pre> Result <pre class="result">
+-[ RECORD 1 ]---------------------
+threshold | 0.50000000000000000000
+tp        | 29
+fp        | 21
+fn        | 12
+tn        | 38
+tpr       | 0.70731707317073170732
+tnr       | 0.64406779661016949153
+ppv       | 0.58000000000000000000
+npv       | 0.76000000000000000000
+fpr       | 0.35593220338983050847
+fdr       | 0.42000000000000000000
+fnr       | 0.29268292682926829268
+acc       | 0.67000000000000000000
+f1        | 0.63736263736263736264
+</pre></li>
+<li>Run the Area Under ROC curve function: <pre class="example">
+DROP TABLE IF EXISTS table_out;
+SELECT madlib.area_under_roc( 'test_set', 'table_out', 'pred', 'obs');
+SELECT * FROM table_out;
+</pre> Result <pre class="result">
+ area_under_roc
+&#160;---------------------------------------------
+0.77428689541132699462698842496899545266640
+</pre></li>
+<li>Create the sample data for confusion matrix. <pre class="example">
+DROP TABLE IF EXISTS test_set;
+CREATE TABLE test_set AS
+    SELECT (x+y)%5+1 AS pred,
+        (x*y)%5 AS obs
+    FROM generate_series(1,5) x,
+        generate_series(1,5) y;
+</pre></li>
+<li>Run the confusion matrix function: <pre class="example">
+DROP TABLE IF EXISTS table_out;
+SELECT madlib.confusion_matrix( 'test_set', 'table_out', 'pred', 'obs');
+SELECT * FROM table_out ORDER BY class;
+</pre> Result <pre class="result">
+ class | confusion_arr
+-------+---------------
+     0 | {0,1,2,2,2,2}
+     1 | {0,2,0,1,1,0}
+     2 | {0,0,0,2,2,0}
+     3 | {0,0,2,0,0,2}
+     4 | {0,2,1,0,0,1}
+     5 | {0,0,0,0,0,0}
+</pre></li>
+</ol>
+<p><a class="anchor" id="literature"></a></p><dl class="section 
user"><dt>Literature</dt><dd></dd></dl>
+<p><a class="anchor" id="r2"></a> [1] <a 
href="https://en.wikipedia.org/wiki/Coefficient_of_determination";>https://en.wikipedia.org/wiki/Coefficient_of_determination</a></p>
+<p><a class="anchor" id="aoc"></a> [2] <a 
href="https://en.wikipedia.org/wiki/Receiver_operating_characteristic";>https://en.wikipedia.org/wiki/Receiver_operating_characteristic</a></p>
+<p><a class="anchor" id="cm"></a> [3] <a 
href="https://en.wikipedia.org/wiki/Confusion_matrix";>https://en.wikipedia.org/wiki/Confusion_matrix</a></p>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
Topics</dt><dd></dd></dl>
+<p>File <a class="el" href="pred__metrics_8sql__in.html" title="A collection 
of summary statistics to gauge model accuracy based on predicted values 
vs...">pred_metrics.sql_in</a> for list of functions and usage. </p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Tue May 16 2017 13:24:38 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/b5b51c69/docs/v1.11/group__grp__prob.html
----------------------------------------------------------------------
diff --git a/docs/v1.11/group__grp__prob.html b/docs/v1.11/group__grp__prob.html
new file mode 100644
index 0000000..6d065d1
--- /dev/null
+++ b/docs/v1.11/group__grp__prob.html
@@ -0,0 +1,158 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Probability Functions</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.incubator.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.incubator.apache.org";><img 
alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ 
></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.11</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__prob.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Probability Functions<div class="ingroups"><a class="el" 
href="group__grp__stats.html">Statistics</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> <ul>
+<li>
+<a href="#syntax">Function Syntax</a> </li>
+<li>
+<a href="#examples">Examples</a> </li>
+<li>
+<a href="#literature">Literature</a> </li>
+<li>
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>The Probability Functions module provides cumulative distribution, 
density/mass, and quantile functions for a wide range of probability 
distributions.</p>
+<p>Unless otherwise documented, all of these functions are wrappers around 
functionality provided by the boost C++ library [1, “<a 
href="http://www.boost.org/doc/libs/1_49_0/libs/math/doc/sf_and_dist/html/math_toolkit/dist.html";>Statistical
 Distributions and Functions</a>”].</p>
+<p>For convenience, all cumulative distribution and density/mass functions 
(CDFs and PDF/PMFs in short) are defined over the range of all floating-point 
numbers including infinity. Inputs that are <code>NULL</code> or 
<code>NaN</code> (not a number) will always produce a <code>NULL</code> or 
<code>NaN</code> result, respectively. Inputs that are plus or minus infinity 
will return the respective limits.</p>
+<p>A quantile function for a probability distribution with CDF <img 
class="formulaInl" alt="$ F $" src="form_113.png"/> takes a probability 
argument <img class="formulaInl" alt="$ p \in [0,1] $" src="form_242.png"/> and 
returns the value <img class="formulaInl" alt="$ x $" src="form_179.png"/> so 
that <img class="formulaInl" alt="$ F(x) = p $" src="form_243.png"/>, provided 
such an <img class="formulaInl" alt="$ x $" src="form_179.png"/> exists and it 
is unique. If it does not, the result will be <img class="formulaInl" alt="$ 
\sup \{ x \in D \mid F(x) \leq p \} $" src="form_244.png"/> (interpreted as 0 
if the supremum is over an empty set) if <img class="formulaInl" alt="$ p &lt; 
0.5 $" src="form_245.png"/>, and <img class="formulaInl" alt="$ \inf \{ x \in D 
\mid F(x) \geq p \} $" src="form_246.png"/> if <img class="formulaInl" alt="$ p 
\geq 0.5 $" src="form_247.png"/>. Here <img class="formulaInl" alt="$ D $" 
src="form_248.png"/> denotes the domain of the distribution, which is the 
 set of reals <img class="formulaInl" alt="$ \mathbb R $" src="form_249.png"/> 
for continuous and the set of nonnegative integers <img class="formulaInl" 
alt="$ \mathbb N_0 $" src="form_250.png"/> for discrete distributions.</p>
+<p>Intuitively, the formulas in the previous paragraph deal with the following 
special cases. The 0-quantile will always be the “left end” of the support, 
and the 1-quantile will be the “right end” of the support of the 
distribution. For discrete distributions, most values of <img 
class="formulaInl" alt="$ p \in [0,1] $" src="form_242.png"/> do not admit an 
<img class="formulaInl" alt="$ x $" src="form_179.png"/> with <img 
class="formulaInl" alt="$ F(x) = p $" src="form_243.png"/>. Instead, there is 
an <img class="formulaInl" alt="$ x \in \mathbb N_0 $" src="form_251.png"/> so 
that <img class="formulaInl" alt="$ F(x) &lt; p &lt; F(x + 1) $" 
src="form_252.png"/>. The above formulas mean that the value returned as <img 
class="formulaInl" alt="$ p $" src="form_111.png"/>-quantile is <img 
class="formulaInl" alt="$ x $" src="form_179.png"/> if <img class="formulaInl" 
alt="$ p &lt; 0.5 $" src="form_245.png"/>, and it is <img class="formulaInl" 
alt="$ x + 1 $" src="form_253.png"/> 
 if <img class="formulaInl" alt="$ p \geq 0.5 $" src="form_247.png"/>. (As a 
special case, in order to ensure that quantiles are always within the support, 
the <img class="formulaInl" alt="$ p $" src="form_111.png"/>-quantile will be 0 
if <img class="formulaInl" alt="$ p &lt; F(0) $" src="form_254.png"/>).</p>
+<p>The rationale for choosing this behavior is that <img class="formulaInl" 
alt="$p$" src="form_255.png"/>-quantiles for <img class="formulaInl" alt="$ p 
&lt; 0.5 $" src="form_245.png"/> are typically requested when interested in the 
value <img class="formulaInl" alt="$ x $" src="form_179.png"/> such that with 
confidence level <b>at least</b> <img class="formulaInl" alt="$ 1 - p $" 
src="form_256.png"/> a random variable will be <img class="formulaInl" alt="$ 
&gt; x $" src="form_257.png"/> (or equivalently, with probability <b>at 
most</b> <img class="formulaInl" alt="$ p $" src="form_111.png"/>, it will be 
<img class="formulaInl" alt="$ \leq x $" src="form_258.png"/>). Likewise, <img 
class="formulaInl" alt="$p$" src="form_255.png"/>-quantiles for <img 
class="formulaInl" alt="$ p \geq 0.5 $" src="form_247.png"/> are typically 
requested when interested in the value <img class="formulaInl" alt="$ x $" 
src="form_179.png"/> such that with confidence level <b>at least</b> <img 
class="formu
 laInl" alt="$ p $" src="form_111.png"/> a random variable will be <img 
class="formulaInl" alt="$ \leq x $" src="form_258.png"/>. See also [1, “<a 
href="http://www.boost.org/doc/libs/1_46_1/libs/math/doc/sf_and_dist/html/math_toolkit/policy/pol_tutorial/understand_dis_quant.html";>Understanding
 Quantiles of Discrete Distributions</a>”].</p>
+<p><a class="anchor" id="syntax"></a></p><dl class="section user"><dt>Function 
Syntax</dt><dd></dd></dl>
+<p>Cumulative distribution functions:</p>
+<pre class="syntax"><em>distribution</em>_cdf(<em>random variate</em>[, 
<em>parameter1</em> [, <em>parameter2</em> [, <em>parameter3</em>] ] 
])</pre><p>Probability density/mass functions: </p><pre 
class="syntax"><em>distribution</em>_{pdf|pmf}(<em>random variate</em>[, 
<em>parameter1</em> [, <em>parameter2</em> [, <em>parameter3</em>] ] 
])</pre><p>Quantile functions: </p><pre 
class="syntax"><em>distribution</em>_quantile(<em>probability</em>[, 
<em>parameter1</em> [, <em>parameter2</em> [, <em>parameter3</em>] ] 
])</pre><p>For concrete function signatures, see <a class="el" 
href="prob_8sql__in.html">prob.sql_in</a>.</p>
+<p><a class="anchor" id="examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd></dd></dl>
+<pre class="example">
+SELECT madlib.normal_cdf(0);
+</pre><p> Result: </p><pre class="result">
+ normal_cdf
+&#160;-----------
+        0.5
+</pre> <pre class="example">
+SELECT madlib.normal_quantile(0.5, 0, 1);
+</pre><p> Result: </p><pre class="result">
+ normal_quantile
+&#160;----------------
+               0
+(1 row)
+</pre><p><a class="anchor" id="literature"></a></p><dl class="section 
user"><dt>Literature</dt><dd></dd></dl>
+<p>[1] John Maddock, Paul A. Bristow, Hubert Holin, Xiaogang Zhang, Bruno 
Lalande, Johan Råde, Gautam Sewani and Thijs van den Berg: <em>Boost Math 
Toolkit</em>, Version 1.49, available at: <a 
href="http://www.boost.org/doc/libs/1_49_0/libs/math/doc/sf_and_dist/html/index.html";>http://www.boost.org/doc/libs/1_49_0/libs/math/doc/sf_and_dist/html/index.html</a></p>
+<dl class="section user"><dt>Related Topics</dt><dd><a class="anchor" 
id="related"></a>File <a class="el" href="prob_8sql__in.html" title="SQL 
functions for evaluating probability functions. ">prob.sql_in</a> documenting 
the SQL functions. </dd></dl>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Tue May 16 2017 13:24:38 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/b5b51c69/docs/v1.11/group__grp__random__forest.html
----------------------------------------------------------------------
diff --git a/docs/v1.11/group__grp__random__forest.html 
b/docs/v1.11/group__grp__random__forest.html
new file mode 100644
index 0000000..4ce89b2
--- /dev/null
+++ b/docs/v1.11/group__grp__random__forest.html
@@ -0,0 +1,754 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Random Forest</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.incubator.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.incubator.apache.org";><img 
alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ 
></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.11</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__random__forest.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Random Forest<div class="ingroups"><a class="el" 
href="group__grp__super.html">Supervised Learning</a> &raquo; <a class="el" 
href="group__grp__tree.html">Tree Methods</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b><ul>
+<li class="level1">
+<a href="#train">Training Function</a> </li>
+<li class="level1">
+<a href="#predict">Prediction Function</a> </li>
+<li class="level1">
+<a href="#get_tree">Display Function</a> </li>
+<li class="level1">
+<a href="#examples">Examples</a> </li>
+<li class="level1">
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>Random forests build an ensemble of classifiers, each of which is a 
tree model constructed using bootstrapped samples from the input data. The 
results of these models are then combined to yield a single prediction, which, 
at the expense of some loss in interpretation, have been found to be highly 
accurate.</p>
+<p>Please also refer to the decision tree user documentation for information 
relevant to the implementation of random forests in MADlib.</p>
+<p><a class="anchor" id="train"></a></p><dl class="section user"><dt>Training 
Function</dt><dd>Random Forest training function has the following format: <pre 
class="syntax">
+forest_train(training_table_name,
+             output_table_name,
+             id_col_name,
+             dependent_variable,
+             list_of_features,
+             list_of_features_to_exclude,
+             grouping_cols,
+             num_trees,
+             num_random_features,
+             importance,
+             num_permutations,
+             max_tree_depth,
+             min_split,
+             min_bucket,
+             num_splits,
+             surrogate_params,
+             verbose,
+             sample_ratio
+             )
+</pre></dd></dl>
+<p><b>Arguments</b> </p><dl class="arglist">
+<dt>training_table_name </dt>
+<dd><p class="startdd">text. Name of the table containing the training 
data.</p>
+<p class="enddd"></p>
+</dd>
+<dt>output_table_name </dt>
+<dd><p class="startdd">text. Name of the generated table containing the 
model.</p>
+<p>The model table produced by the training function contains the following 
columns:</p>
+<table class="output">
+<tr>
+<th>gid </th><td>integer. group id that uniquely identifies a set of grouping 
column values.  </td></tr>
+<tr>
+<th>sample_id </th><td>integer. The id of the bootstrap sample that this tree 
is a part of.  </td></tr>
+<tr>
+<th>tree </th><td>bytea8. Trained tree model stored in binary format.  
</td></tr>
+</table>
+<p>A summary table named <em>&lt;model_table&gt;_summary</em> is also created 
at the same time, which contains the following columns: </p><table 
class="output">
+<tr>
+<th>method </th><td><p class="starttd">'forest_train' </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>is_classification </th><td><p class="starttd">boolean. True if it is a 
classification model. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>source_table </th><td><p class="starttd">text. Data source table name. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>model_table </th><td><p class="starttd">text. Model table name. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>id_col_name </th><td><p class="starttd">text. The ID column name. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>dependent_varname </th><td><p class="starttd">text. Dependent variable. 
</p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>independent_varname </th><td><p class="starttd">text. Independent 
variables </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>cat_features </th><td><p class="starttd">text. Categorical feature names. 
</p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>con_features </th><td><p class="starttd">text. Continuous feature names. 
</p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>grouping_col </th><td><p class="starttd">int. Names of grouping columns. 
</p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_trees </th><td><p class="starttd">int. Number of trees grown by the 
model. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_random_features </th><td><p class="starttd">int. Number of features 
randomly selected for each split. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>max_tree_depth </th><td><p class="starttd">int. Maximum depth of any tree 
in the random forest model_table. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>min_split </th><td><p class="starttd">int. Minimum number of observations 
in a node for it to be split. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>min_bucket </th><td><p class="starttd">int. Minimum number of observations 
in any terminal node. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_splits </th><td><p class="starttd">int. Number of buckets for 
continuous variables. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>verbose </th><td><p class="starttd">boolean. Whether or not to display 
debug info. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>importance </th><td><p class="starttd">boolean. Whether or not to 
calculate variable importance. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_permutations </th><td><p class="starttd">int. Number of times feature 
values are permuted while calculating variable importance. The default value is 
1. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_all_groups </th><td><p class="starttd">int. Number of groups during 
forest training. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_failed_groups </th><td><p class="starttd">int. Number of failed groups 
during forest training. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>total_rows_processed </th><td><p class="starttd">bigint. Total numbers of 
rows processed in all groups. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>total_rows_skipped </th><td><p class="starttd">bigint. Total numbers of 
rows skipped in all groups due to missing values or failures. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>dependent_var_levels </th><td><p class="starttd">text. For 
classification, the distinct levels of the dependent variable. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>dependent_var_type </th><td>text. The type of dependent variable.  
</td></tr>
+</table>
+<p>A group table named <em> &lt;model_table&gt;_group</em> is created, which 
has the following columns: </p><table class="output">
+<tr>
+<th>gid </th><td><p class="starttd">integer. Group id that uniquely identifies 
a set of grouping column values. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>&lt;...&gt; </th><td><p class="starttd">Same type as in the training data 
table. Grouping columns, if provided in input. This could be multiple columns 
depending on the <code>grouping_cols</code> input. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>success </th><td><p class="starttd">boolean. Indicator of the success of 
the group. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>cat_levels_in_text </th><td><p class="starttd">text[]. Ordered levels of 
categorical variables. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>cat_n_levels </th><td><p class="starttd">integer[]. Number of levels for 
each categorical variable. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>oob_error </th><td><p class="starttd">double precision. Out-of-bag error 
for the random forest model. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>cat_var_importance </th><td><p class="starttd">double precision[]. 
Variable importance for categorical features. The order corresponds to the 
order of the variables as found in cat_features in <em> 
&lt;model_table&gt;_summary</em>. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>con_var_importance </th><td><p class="starttd">double precision[]. 
Variable importance for continuous features. The order corresponds to the order 
of the variables as found in con_features in <em> 
&lt;model_table&gt;_summary</em>. </p>
+<p class="endtd"></p>
+</td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>id_col_name </dt>
+<dd><p class="startdd">text. Name of the column containing id information in 
the training data.</p>
+<p class="enddd"></p>
+</dd>
+<dt>dependent_variable </dt>
+<dd><p class="startdd">text. Name of the column that contains the output for 
training. Boolean, integer and text are considered classification outputs, 
while float values are considered regression outputs.</p>
+<p class="enddd"></p>
+</dd>
+<dt>list_of_features </dt>
+<dd><p class="startdd">text. Comma-separated string of column names to use as 
predictors. Can also be a '*' implying all columns are to be used as predictors 
(except the ones included in the next argument). The types of the features can 
be mixed where boolean, integer, and text columns are considered categorical 
and double precision columns are considered continuous. The categorical 
variables are not encoded and used as is for the training.</p>
+<p>It is important to note that we don't test for every combination of levels 
of a categorical variable when evaluating a split. We order the levels of the 
non-integer categorical variable by the entropy of the variable in predicting 
the response. The split at each node is evaluated between these ordered levels. 
Integer categorical variables are ordered by their value.</p>
+<p class="enddd"></p>
+</dd>
+<dt>list_of_features_to_exclude </dt>
+<dd><p class="startdd">text. Comma-separated string of column names to exclude 
from the predictors list. If the <em>dependent_variable</em> argument is an 
expression (including cast of a column name), then this list should include the 
columns that are included in the <em>dependent_variable</em> expression, 
otherwise those columns will be included in the features (resulting in 
meaningless trees).</p>
+<p class="enddd"></p>
+</dd>
+<dt>grouping_cols (optional) </dt>
+<dd><p class="startdd">text, default: NULL. Comma-separated list of column 
names to group the data by. This will lead to creating multiple random forests, 
one for each group.</p>
+<p class="enddd"></p>
+</dd>
+<dt>num_trees (optional) </dt>
+<dd><p class="startdd">integer, default: 100. Maximum number of trees to grow 
in the Random Forest model. Actual number of trees grown may be slightly 
different.</p>
+<p class="enddd"></p>
+</dd>
+<dt>num_random_features (optional) </dt>
+<dd><p class="startdd">integer, default: sqrt(n) if classification tree, 
otherwise n/3. Number of features to randomly select at each split.</p>
+<p class="enddd"></p>
+</dd>
+<dt>importance (optional) </dt>
+<dd><p class="startdd">boolean, default: true. Whether or not to calculate 
variable importance. If set to true, variable importance for categorical and 
continuous features will be output in the group table 
<em>&lt;model_table&gt;_group</em> described above. Will increase run time when 
variable importance is turned on. </p>
+<p class="enddd"></p>
+</dd>
+<dt>num_permutations (optional) </dt>
+<dd><p class="startdd">integer, default: 1. Number of times to permute each 
feature value while calculating variable importance.</p>
+<dl class="section note"><dt>Note</dt><dd>Variable importance for a feature is 
computed by permuting the variable with random values and computing the drop in 
predictive accuracy (using OOB samples). Setting this greater than 1 performs 
an average over multiple importance calculations. This increases the total run 
time and in most cases the default value of 1 is sufficient to compute the 
importance. </dd></dl>
+</dd>
+<dt>max_tree_depth (optional) </dt>
+<dd><p class="startdd">integer, default: 7. Maximum depth of any node of a 
tree, with the root node counted as depth 0. A deeper tree can lead to better 
prediction but will also result in longer processing time and higher memory 
usage.</p>
+<p class="enddd"></p>
+</dd>
+<dt>min_split (optional) </dt>
+<dd><p class="startdd">integer, default: 20. Minimum number of observations 
that must exist in a node for a split to be attempted.</p>
+<p class="enddd"></p>
+</dd>
+<dt>min_bucket (optional) </dt>
+<dd><p class="startdd">integer, default: min_split/3. Minimum number of 
observations in any terminal node. If only one of min_bucket or min_split is 
specified, min_split is set to min_bucket*3 or min_bucket to min_split/3, as 
appropriate.</p>
+<p class="enddd"></p>
+</dd>
+<dt>num_splits (optional) </dt>
+<dd><p class="startdd">integer, default: 20. Continuous-valued features are 
binned into discrete quantiles to compute split boundaries. This global 
parameter is used to compute the resolution of splits for continuous features. 
Higher number of bins will lead to better prediction, but will also result in 
longer processing time and higher memory usage.</p>
+<p class="enddd"></p>
+</dd>
+<dt>surrogate_params (optional) </dt>
+<dd><p class="startdd">text, Comma-separated string of key-value pairs 
controlling the behavior of surrogate splits for each node in a tree. 
</p><table class="output">
+<tr>
+<th>max_surrogates </th><td>Default: 0. Number of surrogates to store for each 
node.  </td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>verbose (optional) </dt>
+<dd><p class="startdd">boolean, default: FALSE. Provides verbose output of the 
results of training.</p>
+<p class="enddd"></p>
+</dd>
+<dt>sample_ratio (optional) </dt>
+<dd>double precision, in the range of (0, 1], default: 1. If sample_ratio is 
less than 1, a bootstrap sample size smaller than the data table is expected to 
be used for training each tree in the forest. A ratio that is close to 0 may 
result in trees with only the root node. This allows users to experiment with 
the function in a speedy fashion. </dd>
+</dl>
+<dl class="section note"><dt>Note</dt><dd>The main parameters that affect 
memory usage are: depth of tree (‘max_tree_depth’), number of features, 
number of values per categorical feature, and number of bins for continuous 
features (‘num_splits’). If you are hitting memory limits, consider 
reducing one or more of these parameters.</dd></dl>
+<p><a class="anchor" id="predict"></a></p><dl class="section 
user"><dt>Prediction Function</dt><dd>The prediction function is provided to 
estimate the conditional mean given a new predictor. It has the following 
syntax: <pre class="syntax">
+forest_predict(random_forest_model,
+               new_data_table,
+               output_table,
+               type)
+</pre></dd></dl>
+<p><b>Arguments</b> </p><dl class="arglist">
+<dt>forest_model </dt>
+<dd><p class="startdd">text. Name of the table containing the Random Forest 
model.</p>
+<p class="enddd"></p>
+</dd>
+<dt>new_data_table </dt>
+<dd><p class="startdd">text. Name of the table containing prediction data.</p>
+<p class="enddd"></p>
+</dd>
+<dt>output_table </dt>
+<dd><p class="startdd">text. Name of the table to output prediction results 
to.</p>
+<p class="enddd"></p>
+</dd>
+<dt>type </dt>
+<dd>text, optional, default: 'response'. For regression models, the output is 
always the predicted value of the dependent variable. For classification 
models, the <em>type</em> variable can be 'response', giving the classification 
prediction as output, or 'prob', giving the class probabilities as output. For 
each value of the dependent variable, a column with the probabilities is added 
to the output table.  </dd>
+</dl>
+<p><a class="anchor" id="get_tree"></a></p><dl class="section 
user"><dt>Display Function</dt><dd>The 'get_tree' function is provided to 
output a graph representation of a single tree of the random forest. The output 
can either be in the popular 'dot' format that can be visualized using various 
programs including those in the GraphViz package, or in a simple text format. 
The details of the text format is outputted with the tree. <pre class="syntax">
+get_tree(forest_model_table,
+         gid,
+         sample_id,
+         dot_format,
+         verbose)
+</pre></dd></dl>
+<p>An additional display function is provided to output the surrogate splits 
chosen for each internal node. </p><pre class="syntax">
+get_tree_surr(forest_model_table,
+              gid,
+              sample_id)
+</pre><p>The output contains the list of surrogate splits for each internal 
node of a tree. The nodes are sorted in ascending order by id. This is 
equivalent to viewing the tree in a breadth-first manner. For each surrogate, 
the output gives the surrogate split (variable and threshold) and also provides 
the number of rows that were common between the primary split and the surrogate 
split. Finally, the number of rows present in the majority branch of the 
primary split is also presented. Only surrogates that perform better than this 
majority branch are used. When the primary variable has a NULL value the 
surrogate variables are used in order to compute the split for that node. If 
all surrogate variables are NULL, then the majority branch is used to compute 
the split for a tuple.</p>
+<p><b>Arguments</b> </p><dl class="arglist">
+<dt>forest_model_table </dt>
+<dd><p class="startdd">text. Name of the table containing the Random Forest 
model.</p>
+<p class="enddd"></p>
+</dd>
+<dt>gid </dt>
+<dd><p class="startdd">integer. Id of the group that this tree is a part 
of.</p>
+<p class="enddd"></p>
+</dd>
+<dt>sample_id </dt>
+<dd><p class="startdd">integer. Id of the bootstrap sample that this tree is a 
part of.</p>
+<p class="enddd"></p>
+</dd>
+<dt>dot_format (optional) </dt>
+<dd><p class="startdd">boolean, default = TRUE. Output can either be in a dot 
format or a text format. If TRUE, the result is in the dot format, else output 
is in text format.</p>
+<p class="enddd"></p>
+</dd>
+<dt>verbose (optional) </dt>
+<dd>boolean, default = FALSE. If true, the dot format output will contain 
additional information (impurity, sample size, number of weighted rows for each 
response variable, classification or prediction if the tree was pruned at this 
level) </dd>
+</dl>
+<p>The output is always returned as a 'TEXT'. For the dot format, the output 
can be redirected to a file on the client side and then rendered using 
visualization programs.</p>
+<p><a class="anchor" id="examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd><b>Note:</b> The output results may vary due the 
random nature of random forests.</dd></dl>
+<p><b>Random Forest Classification Example</b></p>
+<ol type="1">
+<li>Prepare input data: <pre class="example">
+DROP TABLE IF EXISTS dt_golf;
+CREATE TABLE dt_golf (
+    id integer NOT NULL,
+    "OUTLOOK" text,
+    temperature double precision,
+    humidity double precision,
+    windy text,
+    class text
+);
+</pre> <pre class="example">
+INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) VALUES
+(1, 'sunny', 85, 85, 'false', 'Don''t Play'),
+(2, 'sunny', 80, 90, 'true', 'Don''t Play'),
+(3, 'overcast', 83, 78, 'false', 'Play'),
+(4, 'rain', 70, 96, 'false', 'Play'),
+(5, 'rain', 68, 80, 'false', 'Play'),
+(6, 'rain', 65, 70, 'true', 'Don''t Play'),
+(7, 'overcast', 64, 65, 'true', 'Play'),
+(8, 'sunny', 72, 95, 'false', 'Don''t Play'),
+(9, 'sunny', 69, 70, 'false', 'Play'),
+(10, 'rain', 75, 80, 'false', 'Play'),
+(11, 'sunny', 75, 70, 'true', 'Play'),
+(12, 'overcast', 72, 90, 'true', 'Play'),
+(13, 'overcast', 81, 75, 'false', 'Play'),
+(14, 'rain', 71, 80, 'true', 'Don''t Play');
+</pre></li>
+<li>Run the random forest training function and view summary output: <pre 
class="example">
+DROP TABLE IF EXISTS train_output, train_output_group, train_output_summary;
+SELECT madlib.forest_train('dt_golf',         -- source table
+                           'train_output',    -- output model table
+                           'id',              -- id column
+                           'class',           -- response
+                           '"OUTLOOK", temperature, humidity, windy',   -- 
features
+                           NULL,              -- exclude columns
+                           NULL,              -- grouping columns
+                           20::integer,       -- number of trees
+                           2::integer,        -- number of random features
+                           TRUE::boolean,     -- variable importance
+                           1::integer,        -- num_permutations
+                           8::integer,        -- max depth
+                           3::integer,        -- min split
+                           1::integer,        -- min bucket
+                           10::integer        -- number of splits per 
continuous variable
+                           );
+\x on
+SELECT * FROM train_output_summary;
+</pre> Result: <pre class="result">
+-[ RECORD 1 ]---------+-----------------------------------------------
+method                | forest_train
+is_classification     | t
+source_table          | dt_golf
+model_table           | train_output
+id_col_name           | id
+dependent_varname     | class
+independent_varnames  | "OUTLOOK",windy,temperature,humidity
+cat_features          | "OUTLOOK",windy
+con_features          | temperature,humidity
+grouping_cols         | 
+num_trees             | 20
+num_random_features   | 2
+max_tree_depth        | 8
+min_split             | 3
+min_bucket            | 1
+num_splits            | 10
+verbose               | f
+importance            | t
+num_permutations      | 1
+num_all_groups        | 1
+num_failed_groups     | 0
+total_rows_processed  | 14
+total_rows_skipped    | 0
+dependent_var_levels  | "Don't Play","Play"
+dependent_var_type    | text
+independent_var_types | text, text, double precision, double precision
+</pre> View the group table output: <pre class="example">
+SELECT * FROM train_output_group;
+</pre> Result: <pre class="result">
+-[ RECORD 1 ]------+----------------------------------------
+gid                | 1
+success            | t
+cat_n_levels       | {3,2}
+cat_levels_in_text | {overcast,rain,sunny,false,true}
+oob_error          | 0.50000000000000000000
+cat_var_importance | {-0.206309523809524,-0.234345238095238}
+con_var_importance | {-0.308690476190476,-0.272678571428571}
+</pre></li>
+<li>Obtain a dot format display of a single tree within the forest: <pre 
class="example">
+\x off
+SELECT madlib.get_tree('train_output',1,2);
+</pre> Result: <pre class="result">
+ digraph "Classification tree for dt_golf" {                 
+ "0" [label="humidity &lt;= 75", shape=ellipse];                
+ "0" -&gt; "1"[label="yes"];                                    
+ "1" [label="\"Play\"",shape=box];                           
+ "0" -&gt; "2"[label="no"];                                     
+ "2" [label="humidity &lt;= 80", shape=ellipse];                
+ "2" -&gt; "5"[label="yes"];                                    
+ "5" [label="\"Don't Play\"",shape=box];                     
+ "2" -&gt; "6"[label="no"];                                     
+ "6" [label="\"OUTLOOK\" in {overcast,rain}", shape=ellipse];
+ "6" -&gt; "13"[label="yes"];                                   
+ "13" [label="\"Play\"",shape=box];                          
+ "6" -&gt; "14"[label="no"];                                    
+ "14" [label="\"Don't Play\"",shape=box];                                        
+ } //---end of digraph--------- 
+</pre></li>
+<li>Obtain a text display of the tree: <pre class="example">
+SELECT madlib.get_tree('train_output',1,2,FALSE);
+</pre> Result: <pre class="result">
+&#160;-------------------------------------
+&#160;- Each node represented by 'id' inside ().
+&#160;- Leaf nodes have a * while internal nodes have the split condition at 
the end.
+&#160;- For each internal node (i), it's children will be at (2i+1) and (2i+2).
+&#160;- For each split the first indented child (2i+1) is the 'True' node and
+second indented child (2i+2) is the 'False' node.
+&#160;- Number of (weighted) rows for each response variable inside [].
+&#160;- Order of values = ['"Don\'t Play"', '"Play"']
+&#160;-------------------------------------
+ (0)[ 4 10]  humidity &lt;= 75                                               
+    (1)[0 7]  * --&gt; "Play"                                                
+    (2)[4 3]  humidity &lt;= 80                                              
+       (5)[3 1]  * --&gt; "Don't Play"                                       
+       (6)[1 2]  "OUTLOOK" in {overcast,rain}                             
+          (13)[0 2]  * --&gt; "Play"                                        
+          (14)[1 0]  * --&gt; "Don't Play"                                   
+&#160;-------------------------------------
+</pre></li>
+<li>Predict output categories for the same data as was used for input: <pre 
class="example">
+DROP TABLE IF EXISTS prediction_results;
+SELECT madlib.forest_predict('train_output',
+                             'dt_golf',
+                             'prediction_results',
+                             'response');
+\x off
+SELECT id, estimated_class, class
+FROM prediction_results JOIN dt_golf USING (id)
+ORDER BY id;
+</pre> Result: <pre class="result">
+  id | estimated_class |   class    
+----+-----------------+------------
+  1 | Don't Play      | Don't Play
+  2 | Don't Play      | Don't Play
+  3 | Play            | Play
+  4 | Play            | Play
+  5 | Play            | Play
+  6 | Don't Play      | Don't Play
+  7 | Play            | Play
+  8 | Don't Play      | Don't Play
+  9 | Play            | Play
+ 10 | Play            | Play
+ 11 | Play            | Play
+ 12 | Play            | Play
+ 13 | Play            | Play
+ 14 | Don't Play      | Don't Play
+(14 rows)
+</pre></li>
+<li>Predict probabilities of output categories for the same data: <pre 
class="example">
+DROP TABLE IF EXISTS prediction_prob;
+SELECT madlib.forest_predict('train_output',
+                             'dt_golf',
+                             'prediction_prob',
+                             'prob');
+\x off
+SELECT id, "estimated_prob_Play", class
+FROM prediction_prob JOIN dt_golf USING (id)
+ORDER BY id;
+</pre> Result: <pre class="result">
+ id | estimated_prob_Play |   class    
+----+---------------------+------------
+  1 |                0.05 | Don't Play
+  2 |                0.15 | Don't Play
+  3 |                0.95 | Play
+  4 |                0.65 | Play
+  5 |                0.75 | Play
+  6 |                 0.4 | Don't Play
+  7 |                 0.7 | Play
+  8 |                 0.1 | Don't Play
+  9 |                 0.9 | Play
+ 10 |                0.85 | Play
+ 11 |                 0.8 | Play
+ 12 |                 0.7 | Play
+ 13 |                   1 | Play
+ 14 |                 0.4 | Don't Play
+(14 rows)
+</pre></li>
+</ol>
+<p><b>Random Forest Regression Example</b></p>
+<ol type="1">
+<li>Prepare input data: <pre class="example">
+DROP TABLE IF EXISTS mt_cars;
+CREATE TABLE mt_cars (
+    id integer NOT NULL,
+    mpg double precision,
+    cyl integer,
+    disp double precision,
+    hp integer,
+    drat double precision,
+    wt double precision,
+    qsec double precision,
+    vs integer,
+    am integer,
+    gear integer,
+    carb integer
+);
+</pre> <pre class="example">
+INSERT INTO mt_cars (id,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb) VALUES
+(1,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2),
+(2,21,6,160,110,3.9,2.62,16.46,0,1,4,4),
+(3,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2),
+(4,21,6,160,110,3.9,2.875,17.02,0,1,4,4),
+(5,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4),
+(6,16.4,8,275.8,180,3.078,4.07,17.4,0,0,3,3),
+(7,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1),
+(8,17.3,8,275.8,180,3.078,3.73,17.6,0,0,3,3),
+(9,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1),
+(10,15.2,8,275.8,180,3.078,3.78,18,0,0,3,3),
+(11,18.1,6,225,105,2.768,3.46,20.22,1,0,3,1),
+(12,32.4,4,78.7,66,4.08,2.20,19.47,1,1,4,1),
+(13,14.3,8,360,245,3.21,3.578,15.84,0,0,3,4),
+(14,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2),
+(15,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2),
+(16,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4),
+(17,33.9,4,71.14,65,4.22,1.835,19.9,1,1,4,1),
+(18,15.2,8,304,150,3.15,3.435,17.3,0,0,3,2),
+(19,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4),
+(20,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1),
+(21,10.4,8,460,215,3,5.424,17.82,0,0,3,4),
+(22,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2),
+(23,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4),
+(24,30.4,4,95.14,113,3.77,1.513,16.9,1,1,5,2),
+(25,21.5,4,120.1,97,3.70,2.465,20.01,1,0,3,1),
+(26,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4),
+(27,15.5,8,318,150,2.768,3.52,16.87,0,0,3,2),
+(28,15,8,301,335,3.54,3.578,14.6,0,1,5,8),
+(29,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4),
+(30,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2),
+(31,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6),
+(32,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2);
+</pre></li>
+<li>Run the random forest training function: <pre class="example">
+DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group, 
mt_cars_output_summary;
+SELECT madlib.forest_train('mt_cars',
+                           'mt_cars_output',
+                           'id',
+                           'mpg',
+                           '*',
+                           'id, hp, drat, am, gear, carb',  -- exclude columns
+                           'am',
+                           10::integer,
+                           2::integer,
+                           TRUE::boolean,
+                           1,
+                           10,
+                           8,
+                           3,
+                           10
+                           );
+\x on
+SELECT * FROM mt_cars_output_summary;
+SELECT * FROM mt_cars_output_group;
+\x off
+</pre></li>
+<li>Display a single tree of the random forest in dot format: <pre 
class="example">
+SELECT madlib.get_tree('mt_cars_output',1,1);
+</pre> Result: <pre class="result">
+digraph "Regression tree for mt_cars" {
+"0" [label="28.8444",shape=box];
+} //---end of digraph---------
+</pre></li>
+<li>Predict regression output for the same data and compare with original: 
<pre class="example">
+DROP TABLE IF EXISTS prediction_results;
+SELECT madlib.forest_predict('mt_cars_output',
+                             'mt_cars',
+                             'prediction_results',
+                             'response');
+SELECT am, id, estimated_mpg, mpg
+FROM prediction_results JOIN mt_cars USING (id)
+ORDER BY am, id;
+</pre> Result: <pre class="result">
+ am | id |  estimated_mpg   | mpg
+----+----+------------------+------
+  0 |  1 |  15.893525974026 | 18.7
+  0 |  3 | 21.5238492063492 | 24.4
+  0 |  5 | 20.0175396825397 | 17.8
+  0 |  6 | 14.8406818181818 | 16.4
+  0 |  8 | 14.8406818181818 | 17.3
+  0 |  9 | 20.0496825396825 | 21.4
+  0 | 10 | 14.4012272727273 | 15.2
+  0 | 11 | 20.0175396825397 | 18.1
+  0 | 13 | 15.0162878787879 | 14.3
+  0 | 14 | 21.5238492063492 | 22.8
+  0 | 16 | 20.0175396825397 | 19.2
+  0 | 18 | 15.4787532467532 | 15.2
+  0 | 19 | 14.4272987012987 | 10.4
+  0 | 21 | 14.4272987012987 | 10.4
+  0 | 23 | 14.8667532467532 | 14.7
+  0 | 25 | 21.5238492063492 | 21.5
+  0 | 27 |  15.281525974026 | 15.5
+  0 | 29 | 15.0162878787879 | 13.3
+  0 | 30 |  15.281525974026 | 19.2
+  1 |  2 | 20.6527393162393 |   21
+  1 |  4 | 20.6527393162393 |   21
+  1 |  7 | 22.7707393162393 | 22.8
+  1 | 12 | 27.0888266178266 | 32.4
+  1 | 15 | 28.2478650793651 | 30.4
+  1 | 17 | 28.2478650793651 | 33.9
+  1 | 20 | 28.2478650793651 | 27.3
+  1 | 22 | 23.8401984126984 |   26
+  1 | 24 | 26.9748650793651 | 30.4
+  1 | 26 | 20.6527393162393 | 15.8
+  1 | 28 | 20.6527393162393 |   15
+  1 | 31 | 20.6527393162393 | 19.7
+  1 | 32 | 22.7707393162393 | 21.4
+</pre></li>
+</ol>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
Topics</dt><dd></dd></dl>
+<p>File <a class="el" 
href="random__forest_8sql__in.html">random_forest.sql_in</a> documenting the 
training function</p>
+<p><a class="el" href="group__grp__decision__tree.html">Decision Tree</a></p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Tue May 16 2017 13:24:38 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/b5b51c69/docs/v1.11/group__grp__regml.html
----------------------------------------------------------------------
diff --git a/docs/v1.11/group__grp__regml.html 
b/docs/v1.11/group__grp__regml.html
new file mode 100644
index 0000000..27c8eaf
--- /dev/null
+++ b/docs/v1.11/group__grp__regml.html
@@ -0,0 +1,160 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Regression Models</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.incubator.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.incubator.apache.org";><img 
alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ 
></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.11</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__regml.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="summary">
+<a href="#groups">Modules</a>  </div>
+  <div class="headertitle">
+<div class="title">Regression Models<div class="ingroups"><a class="el" 
href="group__grp__super.html">Supervised Learning</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<a name="details" id="details"></a><h2 class="groupheader">Detailed 
Description</h2>
+<p>A collection of methods for modeling conditional expectation of a response 
variable. </p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a 
name="groups"></a>
+Modules</h2></td></tr>
+<tr class="memitem:group__grp__clustered__errors"><td class="memItemLeft" 
align="right" valign="top">&#160;</td><td class="memItemRight" 
valign="bottom"><a class="el" 
href="group__grp__clustered__errors.html">Clustered Variance</a></td></tr>
+<tr class="memdesc:group__grp__clustered__errors"><td 
class="mdescLeft">&#160;</td><td class="mdescRight">Calculates clustered 
variance for linear, logistic, and multinomial logistic regression models, and 
Cox proportional hazards models. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__cox__prop__hazards"><td class="memItemLeft" 
align="right" valign="top">&#160;</td><td class="memItemRight" 
valign="bottom"><a class="el" 
href="group__grp__cox__prop__hazards.html">Cox-Proportional Hazards 
Regression</a></td></tr>
+<tr class="memdesc:group__grp__cox__prop__hazards"><td 
class="mdescLeft">&#160;</td><td class="mdescRight">Models the relationship 
between one or more independent predictor variables and the amount of time 
before an event occurs. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__elasticnet"><td class="memItemLeft" 
align="right" valign="top">&#160;</td><td class="memItemRight" 
valign="bottom"><a class="el" href="group__grp__elasticnet.html">Elastic Net 
Regularization</a></td></tr>
+<tr class="memdesc:group__grp__elasticnet"><td 
class="mdescLeft">&#160;</td><td class="mdescRight">Generates a regularized 
regression model for variable selection in linear and logistic regression 
problems, combining the L1 and L2 penalties of the lasso and ridge methods. <br 
/></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__glm"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__glm.html">Generalized Linear Models</a></td></tr>
+<tr class="memdesc:group__grp__glm"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Estimate generalized linear model (GLM). GLM is a flexible 
generalization of ordinary linear regression that allows for response variables 
that have error distribution models other than a normal distribution. The GLM 
generalizes linear regression by allowing the linear model to be related to the 
response variable via a link function and by allowing the magnitude of the 
variance of each measurement to be a function of its predicted value. <br 
/></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__linreg"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__linreg.html">Linear Regression</a></td></tr>
+<tr class="memdesc:group__grp__linreg"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Also called Ordinary Least Squares Regression, models linear 
relationship between a dependent variable and one or more independent 
variables. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__logreg"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__logreg.html">Logistic Regression</a></td></tr>
+<tr class="memdesc:group__grp__logreg"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Models the relationship between one or more predictor 
variables and a binary categorical dependent variable by predicting the 
probability of the dependent variable using a logistic function. <br 
/></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__marginal"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__marginal.html">Marginal Effects</a></td></tr>
+<tr class="memdesc:group__grp__marginal"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Calculates marginal effects for the coefficients in 
regression problems. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__multinom"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__multinom.html">Multinomial Regression</a></td></tr>
+<tr class="memdesc:group__grp__multinom"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Multinomial regression is to model the conditional 
distribution of the multinomial response variable using a linear combination of 
predictors. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__ordinal"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__ordinal.html">Ordinal Regression</a></td></tr>
+<tr class="memdesc:group__grp__ordinal"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Regression to model data with ordinal response variable. <br 
/></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__robust"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__robust.html">Robust Variance</a></td></tr>
+<tr class="memdesc:group__grp__robust"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Calculates Huber-White variance estimates for linear, 
logistic, and multinomial regression models, and for Cox proportional hazards 
models. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Tue May 16 2017 13:24:38 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/b5b51c69/docs/v1.11/group__grp__regml.js
----------------------------------------------------------------------
diff --git a/docs/v1.11/group__grp__regml.js b/docs/v1.11/group__grp__regml.js
new file mode 100644
index 0000000..76e3c69
--- /dev/null
+++ b/docs/v1.11/group__grp__regml.js
@@ -0,0 +1,13 @@
+var group__grp__regml =
+[
+    [ "Clustered Variance", "group__grp__clustered__errors.html", null ],
+    [ "Cox-Proportional Hazards Regression", 
"group__grp__cox__prop__hazards.html", null ],
+    [ "Elastic Net Regularization", "group__grp__elasticnet.html", null ],
+    [ "Generalized Linear Models", "group__grp__glm.html", null ],
+    [ "Linear Regression", "group__grp__linreg.html", null ],
+    [ "Logistic Regression", "group__grp__logreg.html", null ],
+    [ "Marginal Effects", "group__grp__marginal.html", null ],
+    [ "Multinomial Regression", "group__grp__multinom.html", null ],
+    [ "Ordinal Regression", "group__grp__ordinal.html", null ],
+    [ "Robust Variance", "group__grp__robust.html", null ]
+];
\ No newline at end of file

Reply via email to