diff --git a/docs/v1.14/group__grp__svm.html b/docs/v1.14/group__grp__svm.html
new file mode 100644
index 0000000..46161e0
--- /dev/null
+++ b/docs/v1.14/group__grp__svm.html
@@ -0,0 +1,651 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Support Vector Machines</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+</script><script type="text/javascript" 
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
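+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', '');
+  ga('send', 'pageview');
+</script>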
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.14</span>
+   </div>
+   <div id="projectbrief">User Documentation for Apache MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+ </tr>
+ </tbody>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__svm.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+<div class="header">
+  <div class="headertitle">
+<div class="title">Support Vector Machines<div class="ingroups"><a class="el" 
href="group__grp__super.html">Supervised Learning</a></div></div>  </div>
+<div class="contents">
+<div class="toc"><b>Contents</b><ul>
+<li class="level1">
+<a href="#svm_classification">Classification Function</a> </li>
+<li class="level1">
+<a href="#svm_regression">Regression Function</a> </li>
+<li class="level1">
+<a href="#novelty_detection">Novelty Detection</a> </li>
+<li class="level1">
+<a href="#kernel_params">Kernel Parameters</a> </li>
+<li class="level1">
+<a href="#parameters">Other Parameters</a> </li>
+<li class="level1">
+<a href="#predict">Prediction Functions</a> </li>
+<li class="level1">
+<a href="#example">Examples</a> </li>
+<li class="level1">
+<a href="#background">Technical Background</a> </li>
+<li class="level1">
+<a href="#literature">Literature</a> </li>
+<li class="level1">
+<a href="#related">Related Topics</a> </li>
+</div><p>Support Vector Machines (SVMs) are models for regression and 
classification tasks. SVM models have two particularly desirable features: 
robustness in the presence of noisy data and applicability to a variety of data 
configurations. At its core, a <em>linear</em> SVM model is a hyperplane 
separating two distinct classes of data (in the case of classification 
problems), in such a way that the distance between the hyperplane and the 
nearest training data point (called the <em>margin</em>) is maximized. Vectors 
that lie on this margin are called support vectors. With the support vectors 
fixed, perturbations of vectors beyond the margin will not affect the model; 
this contributes to the model’s robustness. By substituting a kernel function 
for the usual inner product, one can approximate a large variety of decision 
boundaries in addition to linear hyperplanes. <a class="anchor" 
id="svm_classification"></a></p><dl class="section user"><dt>Classification 
Training Function</dt><dd>The SVM classification training function has the following format: <pre class="syntax">
+svm_classification(
+    source_table,
+    model_table,
+    dependent_varname,
+    independent_varname,
+    kernel_func,
+    kernel_params,
+    grouping_col,
+    params,
+    verbose
+    )
+</pre> <b>Arguments</b> <dl class="arglist">
+<dt>source_table </dt>
+<dd><p class="startdd">TEXT. Name of the table containing the training data.</p>
+<p class="enddd"></p>
+<dt>model_table </dt>
+<dd><p class="startdd">TEXT. Name of the output table containing the model. 
Details of the output tables are provided below. </p>
+<p class="enddd"></p>
+<dt>dependent_varname </dt>
+<dd><p class="startdd">TEXT. Name of the dependent variable column. For 
classification, this column can contain values of any type, but must assume 
exactly two distinct values. Otherwise, an error will be thrown. </p>
+<p class="enddd"></p>
+<dt>independent_varname </dt>
+<dd><p class="startdd">TEXT. Expression list to evaluate for the independent 
variables. An intercept variable should not be included as part of this 
expression. See 'fit_intercept' in the kernel params for info on intercepts. 
Please note that expression should be able to be cast to DOUBLE PRECISION[].</p>
+<p class="enddd"></p>
+<dt>kernel_func (optional) </dt>
+<dd><p class="startdd">TEXT, default: 'linear'. Type of kernel. Currently 
three kernel types are supported: 'linear', 'gaussian', and 'polynomial'. The 
text can be any subset of the three strings; e.g., kernel_func='ga' will 
create a Gaussian kernel. </p>
+<p class="enddd"></p>
+<dt>kernel_params (optional) </dt>
+<dd><p class="startdd">TEXT, default: NULL. Parameters for non-linear kernel 
in a comma-separated string of key-value pairs. The actual parameters differ 
depending on the value of <em>kernel_func</em>. See the description below for 
details. </p>
+<p class="enddd"></p>
+<dt>grouping_col (optional) </dt>
+<dd><p class="startdd">TEXT, default: NULL. An expression list used to group 
the input dataset into discrete groups, which results in running one model per 
group. Similar to the SQL "GROUP BY" clause. When this value is NULL, no 
grouping is used and a single model is generated. Please note that cross 
validation is not supported if grouping is used.</p>
+<p class="enddd"></p>
+<dt>params (optional) </dt>
+<dd><p class="startdd">TEXT, default: NULL. Parameters for optimization and 
regularization in a comma-separated string of key-value pairs. If a list of 
values is provided, then cross-validation will be performed to select the 
<em>best</em> value from the list. See the description below for details. </p>
+<p class="enddd"></p>
+<dt>verbose (optional) </dt>
+<dd>BOOLEAN, default: FALSE. Verbose output of the results of training. </dd>
+<p><b>Output tables</b> <br />
+ The model table produced by SVM contains the following columns: </p><table 
+<th>coef </th><td>FLOAT8. Vector of coefficients.  </td></tr>
+<th>grouping_key </th><td>TEXT. Identifies the group to which the datum 
belongs.  </td></tr>
+<th>num_rows_processed </th><td>BIGINT. Number of rows processed.  </td></tr>
+<th>num_rows_skipped </th><td>BIGINT. Number of rows skipped due to missing 
values or failures.  </td></tr>
+<th>num_iterations </th><td>INTEGER. Number of iterations completed by 
stochastic gradient descent algorithm. The algorithm either converged in this 
number of iterations or hit the maximum number specified in the optimization 
parameters.   </td></tr>
+<th>loss </th><td>FLOAT8. Value of the objective function of SVM. See 
Technical Background section below for more details.  </td></tr>
+<th>norm_of_gradient </th><td>FLOAT8. Value of the L2-norm of the 
(sub)-gradient of the objective function.  </td></tr>
+<th>__dep_var_mapping </th><td>TEXT[]. Vector of dependent variable labels. 
The first entry corresponds to -1 and the second to +1. For internal use only.  
+<p>An auxiliary table named &lt;model_table&gt;_random is created if the 
kernel is not linear. It contains data needed to embed test data into a random 
feature space (see references [2,3]). This data is used internally by 
svm_predict and not meaningful on its own to the user, so you can ignore it.</p>
+<p>A summary table named &lt;model_table&gt;_summary is also created, which 
has the following columns: </p><table class="output">
+<th>method </th><td>'svm'  </td></tr>
+<th>version_number </th><td>Version of MADlib which was used to generate the 
model.  </td></tr>
+<th>source_table </th><td>The data source table name.  </td></tr>
+<th>model_table </th><td>The model table name.  </td></tr>
+<th>dependent_varname </th><td>The dependent variable.  </td></tr>
+<th>independent_varname </th><td>The independent variables.  </td></tr>
+<th>kernel_func </th><td>The kernel function.  </td></tr>
+<th>kernel_parameters </th><td>The kernel parameters, as well as random 
feature map data.  </td></tr>
+<th>grouping_col </th><td>Columns on which to group.  </td></tr>
+<th>optim_params </th><td>A string containing the optimization parameters.  </td></tr>
+<th>reg_params </th><td>A string containing the regularization parameters.  </td></tr>
+<th>num_all_groups </th><td>Number of groups in SVM training.  </td></tr>
+<th>num_failed_groups </th><td>Number of failed groups in SVM training.  </td></tr>
+<th>total_rows_processed </th><td>Total number of rows processed in all 
groups.  </td></tr>
+<th>total_rows_skipped </th><td>Total number of rows skipped in all groups 
due to missing values or failures.  </td></tr>
+<p><a class="anchor" id="svm_regression"></a></p><dl class="section 
user"><dt>Regression Training Function</dt><dd>The SVM regression training 
function has the following format: <pre class="syntax">
+    model_table,
+    dependent_varname,
+    independent_varname,
+    kernel_func,
+    kernel_params,
+    grouping_col,
+    params,
+    verbose
+    )
+<p><b>Arguments</b> </p>
+<p>Specifications for regression are largely the same as for classification. 
In the model table, there is no dependent variable mapping. The following 
arguments have specifications which differ from svm_classification: </p><dl 
+<dt>dependent_varname </dt>
+<dd>TEXT. Name of the dependent variable column. For regression, this column 
can contain only values or expressions that can be cast to DOUBLE PRECISION. 
Otherwise, an error will be thrown.  </dd>
+<dt>params (optional) </dt>
+<dd>TEXT, default: NULL. The parameters <em>epsilon</em> and 
<em>eps_table</em> are only meaningful for regression. See description below 
for more details.  </dd>
+<p><a class="anchor" id="novelty_detection"></a></p><dl class="section 
user"><dt>Novelty Detection Training Function</dt><dd>The novelty detection 
function is a one-class SVM classifier, and has the following format: <pre 
+    source_table,
+    model_table,
+    independent_varname,
+    kernel_func,
+    kernel_params,
+    grouping_col,
+    params,
+    verbose
+    )
+</pre> <b>Arguments</b> </dd></dl>
+<p>Specifications for novelty detection are largely the same as for 
classification, except the dependent variable name is not specified. The model 
table is the same as that for classification.</p>
+<p><a class="anchor" id="kernel_params"></a></p><dl class="section 
user"><dt>Kernel Parameters</dt><dd>Kernel parameters are supplied in a string 
containing a comma-delimited list of name-value pairs. All of these named 
parameters are optional, and their order does not matter. You must use the 
format "&lt;param_name&gt; = &lt;value&gt;" to specify the value of a 
parameter, otherwise the parameter is ignored.</dd></dl>
+<dl class="arglist">
+<dt><em>Parameters common to all kernels</em></dt>
+<dt>fit_intercept </dt>
+<dd>Default: True. The parameter <em>fit_intercept</em> is an indicator to add 
an intercept to the <em>independent_varname</em> array expression. The 
intercept is added to the end of the feature list - thus the last element of 
the coefficient list is the intercept.  </dd>
+<dt>n_components </dt>
+<dd>Default: 2*num_features. The dimensionality of the transformed feature 
space. A larger value lowers the variance of the estimate of the kernel but 
requires more memory and takes longer to train. </dd>
+<dt>random_state </dt>
+<dd>Default: 1. Seed used by a random number generator.  </dd>
+<dl class="arglist">
+<dt><em>Parameters for 'gaussian' kernel</em></dt>
+<dt>gamma </dt>
+<dd>Default: 1/num_features. The parameter \(\gamma\) in the Radial Basis 
Function kernel, i.e., \(\exp(-\gamma||x-y||^2)\). Choosing a proper value for 
<em>gamma</em> is critical to the performance of the kernel machine; e.g., while a 
large <em>gamma</em> tends to cause overfitting, a small <em>gamma</em> will 
make the model too constrained to capture the complexity of the data.  </dd>
+<dl class="arglist">
+<dt><em>Parameters for 'polynomial' kernel</em></dt>
+<dt>coef0 </dt>
+<dd>Default: 1.0. The independent term \(q\) in \( (\langle x,y\rangle + q)^r 
\). Must be larger than or equal to 0. When it is 0, the polynomial kernel is 
in homogeneous form.  </dd>
+<dt>degree </dt>
+<dd>Default: 3. The parameter \(r\) in \( (\langle x,y\rangle + q)^r \).  </dd>
+<p><a class="anchor" id="parameters"></a></p><dl class="section 
user"><dt>Other Parameters</dt><dd>Parameters in this section are supplied in 
the <em>params</em> argument as a string containing a comma-delimited list of 
name-value pairs. All of these named parameters are optional, and their order 
does not matter. You must use the format "&lt;param_name&gt; = &lt;value&gt;" 
to specify the value of a parameter, otherwise the parameter is ignored.</dd></dl>
+<p>Hyperparameter optimization can be carried out using the built-in cross 
validation mechanism, which is activated by assigning a value greater than 1 to 
the parameter <em>n_folds</em> in <em>params</em>. Please note that cross 
validation is not supported if grouping is used.</p>
+<p>The values of a parameter to cross validate should be provided in a list. 
For example, if one wanted to regularize with the L1 norm and use a lambda 
value from the set {0.3, 0.4, 0.5}, one might input 'lambda={0.3, 0.4, 0.5}, 
norm=L1, n_folds=10' in <em>params</em>. Note that both '{}' and '[]' are 
valid here. </p><dl class="section note"><dt>Note</dt><dd>Note that not 
all of the parameters below can be cross-validated. For parameters where cross 
validation is allowed, their default values are presented in list format; e.g., [0.01].</dd></dl>
+<pre class="syntax">
+  'init_stepsize = &lt;value&gt;,
+   decay_factor = &lt;value&gt;,
+   max_iter = &lt;value&gt;,
+   tolerance = &lt;value&gt;,
+   lambda = &lt;value&gt;,
+   norm = &lt;value&gt;,
+   epsilon = &lt;value&gt;,
+   eps_table = &lt;value&gt;,
+   validation_result = &lt;value&gt;,
+   n_folds = &lt;value&gt;,
+   class_weight = &lt;value&gt;'
+</pre><p> <b>Parameters</b> </p><dl class="arglist">
+<dt>init_stepsize </dt>
+<dd><p class="startdd">Default: [0.01]. Also known as the initial learning 
rate. A small value is usually desirable to ensure convergence, while a large 
value provides more room for progress during training. Since the best value 
depends on the condition number of the data, in practice one often searches in 
an exponential grid using built-in cross validation; e.g., "init_stepsize = [1, 
0.1, 0.001]". To reduce training time, it is common to run cross validation on 
a subsampled dataset, since this usually provides a good estimate of the 
condition number of the whole dataset. Then the resulting 
<em>init_stepsize</em> can be run on the whole dataset.</p>
+<p class="enddd"></p>
+<dt>decay_factor </dt>
+<dd><p class="startdd">Default: [0.9]. Control the learning rate schedule: 0 
means constant rate; &lt;-1 means inverse scaling, i.e., stepsize = 
init_stepsize / iteration; &gt; 0 means exponential decay, i.e., stepsize = 
init_stepsize * decay_factor^iteration. </p>
+<p class="enddd"></p>
+<dt>max_iter </dt>
+<dd><p class="startdd">Default: [100]. The maximum number of iterations 
allowed. </p>
+<p class="enddd"></p>
+<dt>tolerance </dt>
+<dd><p class="startdd">Default: 1e-10. The criterion to end iterations. The 
training stops whenever the difference between the training models of two 
consecutive iterations is smaller than <em>tolerance</em> or the iteration 
number is larger than <em>max_iter</em>. </p>
+<p class="enddd"></p>
+<dt>lambda </dt>
+<dd><p class="startdd">Default: [0.01]. Regularization parameter. Must be 
non-negative. </p>
+<p class="enddd"></p>
+<dt>norm </dt>
+<dd><p class="startdd">Default: 'L2'. Name of the regularization, either 'L2' 
or 'L1'. </p>
+<p class="enddd"></p>
+<dt>epsilon </dt>
+<dd><p class="startdd">Default: [0.01]. Determines the \(\epsilon\) for 
\(\epsilon\)-regression. Ignored during classification. When training the 
model, differences of less than \(\epsilon\) between estimated labels and 
actual labels are ignored. A larger \(\epsilon\) will yield a model with fewer 
support vectors, but will not generalize as well to future data. Generally, it 
has been suggested that epsilon should increase with noisier data, and decrease 
with the number of samples. See [5]. </p>
+<p class="enddd"></p>
+<dt>eps_table </dt>
+<dd><p class="startdd">Default: NULL. Name of the input table that contains 
values of epsilon for different groups. Ignored when <em>grouping_col</em> is 
NULL. Define this input table if you want different epsilon values for 
different groups. The table consists of a column named <em>epsilon</em> which 
specifies the epsilon values, and one or more columns for 
<em>grouping_col</em>. Extra groups are ignored, and groups not present in this 
table will use the epsilon value specified in parameter <em>epsilon</em>. </p>
+<p class="enddd"></p>
+<dt>validation_result </dt>
+<dd><p class="startdd">Default: NULL. Name of the table to store the cross 
validation results including the values of parameters and their averaged error 
values. For now, a simple metric such as 0-1 loss is used for classification and 
mean square error is used for regression. The table is only created if the name 
is not NULL. </p>
+<p class="enddd"></p>
+<dt>n_folds </dt>
+<dd><p class="startdd">Default: 0. Number of folds (k). Must be at least 2 to 
activate cross validation. If a value of k &gt;= 2 is specified, each fold is 
then used as a validation set once, while the other k - 1 folds form the 
training set. </p>
+<p class="enddd"></p>
+<dt>class_weight </dt>
+<dd><p class="startdd">Default: 1 for classification, 'balanced' for one-class 
novelty detection, n/a for regression.</p>
+<p>Set the weight for the positive and negative classes. If not given, all 
classes are set to have weight one. If class_weight = balanced, values of y are 
automatically adjusted as inversely proportional to class frequencies in the 
input data, i.e., the weights are set as n_samples / (n_classes * bincount(y)).</p>
+<p>Alternatively, class_weight can be a mapping, giving the weight for each 
class. E.g., for dependent variable values 'a' and 'b', the class_weight can be 
{a: 2, b: 3}. This would lead to each 'a' tuple's y value being multiplied by 2 and 
each 'b' tuple's y value being multiplied by 3.</p>
+<p class="enddd">For regression, the class weights are always one.  </p>
+<p><a class="anchor" id="predict"></a></p><dl class="section 
user"><dt>Prediction Function</dt><dd>The prediction function is used to 
estimate the conditional mean given a new predictor. The same syntax is used 
for classification, regression and novelty detection: <pre class="syntax">
+            new_data_table,
+            id_col_name,
+            output_table)
+<p><b>Arguments</b> </p><dl class="arglist">
+<dt>model_table </dt>
+<dd><p class="startdd">TEXT. Model table produced by the training function.</p>
+<p class="enddd"></p>
+<dt>new_data_table </dt>
+<dd><p class="startdd">TEXT. Name of the table containing the prediction data. 
This table is expected to contain the same features that were used during 
training. The table should also contain id_col_name used for identifying each row.</p>
+<p class="enddd"></p>
+<dt>id_col_name </dt>
+<dd><p class="startdd">TEXT. The name of the id column in the input table.</p>
+<p class="enddd"></p>
+<dt>output_table </dt>
+<dd>TEXT. Name of the table where output predictions are written. If this 
table name is already in use, then an error is returned. Table contains: <table 
+<th>id </th><td>Gives the 'id' for each prediction, corresponding to each row 
from the new_data_table.  </td></tr>
+<th>prediction </th><td>Provides the prediction for each row in 
new_data_table. For regression this would be the same as decision_function. For 
classification, this will be one of the dependent variable values.  </td></tr>
+<th>decision_function </th><td>Provides the distance between each point and 
the separating hyperplane.  </td></tr>
+<p><a class="anchor" id="example"></a></p><dl class="section 
user"><dt>Examples</dt><dd><ol type="1">
+<li>Create an input data set. <pre class="example">
+CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT,
+            size INT, lot INT);
+  1 |  590 |       2 |    1 |  50000 |  770 | 22100
+  2 | 1050 |       3 |    2 |  85000 | 1410 | 12000
+  3 |   20 |       3 |    1 |  22500 | 1060 |  3500
+  4 |  870 |       2 |    2 |  90000 | 1300 | 17500
+  5 | 1320 |       3 |    2 | 133000 | 1500 | 30000
+  6 | 1350 |       2 |    1 |  90500 |  820 | 25700
+  7 | 2790 |       3 |  2.5 | 260000 | 2130 | 25000
+  8 |  680 |       2 |    1 | 142500 | 1170 | 22000
+  9 | 1840 |       3 |    2 | 160000 | 1500 | 19000
+ 10 | 3680 |       4 |    2 | 240000 | 2790 | 20000
+ 11 | 1660 |       3 |    1 |  87000 | 1030 | 17500
+ 12 | 1620 |       3 |    2 | 118600 | 1250 | 20000
+ 13 | 3100 |       3 |    2 | 140000 | 1760 | 38000
+ 14 | 2070 |       2 |    3 | 148000 | 1550 | 14000
+ 15 |  650 |       3 |  1.5 |  65000 | 1450 | 12000
+<li>Train a classification model. First, use a linear model. <pre 
+DROP TABLE IF EXISTS houses_svm, houses_svm_summary;
+SELECT madlib.svm_classification('houses',
+                                 'houses_svm',
+                                 'price &lt; 100000',
+                                 'ARRAY[1, tax, bath, size]'
+                           );
+<li>View the result for the linear classification model. <pre class="example">
+-- Set extended display on for easier reading of output
+\x ON
+SELECT * FROM houses_svm;
+</pre> Result: <pre class="result">
+-[ RECORD 1 
+coef               | 
+loss               | 601.279740124
+norm_of_gradient   | 1300.96615851627
+num_iterations     | 100
+num_rows_processed | 15
+num_rows_skipped   | 0
+dep_var_mapping    | {f,t}
+<li>Next generate a nonlinear model using a Gaussian kernel. This time we 
specify the initial step size and maximum number of iterations to run. As part 
of the kernel parameter, we choose 10 as the dimension of the space where we 
train SVM. A larger number will lead to a more powerful model but run the risk 
of overfitting. As a result, the model will be a 10 dimensional vector, instead 
of 4 as in the case of linear model, which we will verify when we examine the 
models. <pre class="example">
+DROP TABLE IF EXISTS houses_svm_gaussian, houses_svm_gaussian_summary, houses_svm_gaussian_random;
+SELECT madlib.svm_classification( 'houses',
+                                  'houses_svm_gaussian',
+                                  'price &lt; 100000',
+                                  'ARRAY[1, tax, bath, size]',
+                                  'gaussian',
+                                  'n_components=10',
+                                  '',
+                                  'init_stepsize=1, max_iter=200'
+                           );
+<li>View the results from kernel SVM for classification. <pre class="example">
+-- Set extended display on for easier reading of output
+\x ON
+SELECT * FROM houses_svm_gaussian;
+</pre> Result: <pre class="result">
+-[ RECORD 1 
+coef               | 
+loss               | 0.998735180388
+norm_of_gradient   | 0.729823950583579
+num_iterations     | 196
+num_rows_processed | 15
+num_rows_skipped   | 0
+dep_var_mapping    | {f,t}
+<li>The regression models have a similar format (model output not shown). 
First, for a linear model: <pre class="example">
+DROP TABLE IF EXISTS houses_svm_regression, houses_svm_regression_summary;
+SELECT madlib.svm_regression('houses',
+                             'houses_svm_regression',
+                             'price',
+                             'ARRAY[1, tax, bath, size]'
+                           );
+</pre> For a non-linear regression model using a Gaussian kernel: <pre 
+DROP TABLE IF EXISTS houses_svm_gaussian_regression, 
houses_svm_gaussian_regression_summary, houses_svm_gaussian_regression_random;
+SELECT madlib.svm_regression( 'houses',
+                              'houses_svm_gaussian_regression',
+                              'price',
+                              'ARRAY[1, tax, bath, size]',
+                              'gaussian',
+                              'n_components=10',
+                              '',
+                              'init_stepsize=1, max_iter=200'
+                           );
+<li>Now train a non-linear one-class SVM for novelty detection, using a 
Gaussian kernel. Note that the dependent variable is not a parameter for 
one-class: <pre class="example">
+DROP TABLE IF EXISTS houses_one_class_gaussian, 
houses_one_class_gaussian_summary, houses_one_class_gaussian_random;
+select madlib.svm_one_class('houses',
+                            'houses_one_class_gaussian',
+                            'ARRAY[1,tax,bedroom,bath,size,lot,price]',
+                            'gaussian',
+                            'gamma=0.5,n_components=55, random_state=3',
+                            NULL,
+                            'max_iter=100, init_stepsize=10,lambda=10, tolerance=0'
+                            );
+<li>View the result for the Gaussian novelty detection model. <pre 
+-- Set extended display on for easier reading of output
+\x ON
+SELECT * FROM houses_one_class_gaussian;
+</pre> Result: <pre class="result">
+-[ RECORD 1 
+coef               | {redacted for brevity}
+loss               | 15.1053343738
+norm_of_gradient   | 13.9133653663837
+num_iterations     | 100
+num_rows_processed | 16
+num_rows_skipped   | -1
+dep_var_mapping    | {-1,1}
+<li>Now let's look at the prediction functions. We want to predict if house 
price is less than $100,000. In the following examples we will use the training 
data set for prediction as well, which is not usual but serves to show the 
syntax. The predicted results are in the <em>prediction</em> column and the 
actual data is in the <em>target</em> column. For the linear model: <pre 
+DROP TABLE IF EXISTS houses_pred;
+SELECT madlib.svm_predict('houses_svm', 'houses', 'id', 'houses_pred');
+SELECT *, price &lt; 100000 AS target FROM houses JOIN houses_pred USING (id) ORDER BY id;
+</pre> Result: <pre class="result">
+ id | tax  | bedroom | bath | price  | size |  lot  | prediction | 
decision_function  | target 
+  1 |  590 |       2 |    1 |  50000 |  770 | 22100 | t          |   
104.685894748292 | t
+  2 | 1050 |       3 |    2 |  85000 | 1410 | 12000 | t          |   
200.592436923938 | t
+  3 |   20 |       3 |    1 |  22500 | 1060 |  3500 | t          |   
378.765847404582 | t
+  4 |  870 |       2 |    2 |  90000 | 1300 | 17500 | t          |   
214.034895129328 | t
+  5 | 1320 |       3 |    2 | 133000 | 1500 | 30000 | t          |   
153.227581012028 | f
+  6 | 1350 |       2 |    1 |  90500 |  820 | 25700 | f          |  
-102.382793811158 | t
+  7 | 2790 |       3 |  2.5 | 260000 | 2130 | 25000 | f          |  
-53.8237999423388 | f
+  8 |  680 |       2 |    1 | 142500 | 1170 | 22000 | t          |   
223.090041223192 | f
+  9 | 1840 |       3 |    2 | 160000 | 1500 | 19000 | f          | 
-0.858545961972027 | f
+ 10 | 3680 |       4 |    2 | 240000 | 2790 | 20000 | f          |   
-78.226279884182 | f
+ 11 | 1660 |       3 |    1 |  87000 | 1030 | 17500 | f          |  
-118.078558954948 | t
+ 12 | 1620 |       3 |    2 | 118600 | 1250 | 20000 | f          |  
-26.3388234857219 | f
+ 13 | 3100 |       3 |    2 | 140000 | 1760 | 38000 | f          |  
-279.923699905712 | f
+ 14 | 2070 |       2 |    3 | 148000 | 1550 | 14000 | f          |  
-50.7810508979155 | f
+ 15 |  650 |       3 |  1.5 |  65000 | 1450 | 12000 | t          |   
333.579085875975 | t
+</pre> Prediction using the Gaussian model: <pre class="example">
+DROP TABLE IF EXISTS houses_pred_gaussian;
+SELECT madlib.svm_predict('houses_svm_gaussian', 'houses', 'id', 'houses_pred_gaussian');
+SELECT *, price &lt; 100000 AS target FROM houses JOIN houses_pred_gaussian USING (id) ORDER BY id;
+</pre> This produces a more accurate result than the linear case for this 
small data set: <pre class="result">
+ id | tax  | bedroom | bath | price  | size |  lot  | prediction | 
decision_function | target 
+  1 |  590 |       2 |    1 |  50000 |  770 | 22100 | t          |  
1.00338548176312 | t
+  2 | 1050 |       3 |    2 |  85000 | 1410 | 12000 | t          |  
1.00000000098154 | t
+  3 |   20 |       3 |    1 |  22500 | 1060 |  3500 | t          | 
0.246566699635389 | t
+  4 |  870 |       2 |    2 |  90000 | 1300 | 17500 | t          |   
1.0000000003367 | t
+  5 | 1320 |       3 |    2 | 133000 | 1500 | 30000 | f          | 
-1.98940593324397 | f
+  6 | 1350 |       2 |    1 |  90500 |  820 | 25700 | t          |  
3.74336995109761 | t
+  7 | 2790 |       3 |  2.5 | 260000 | 2130 | 25000 | f          | 
-1.01574407296086 | f
+  8 |  680 |       2 |    1 | 142500 | 1170 | 22000 | f          |  
-1.0000000002071 | f
+  9 | 1840 |       3 |    2 | 160000 | 1500 | 19000 | f          | 
-3.88267069310101 | f
+ 10 | 3680 |       4 |    2 | 240000 | 2790 | 20000 | f          | 
-3.44507576539002 | f
+ 11 | 1660 |       3 |    1 |  87000 | 1030 | 17500 | t          |   
2.3409866081761 | t
+ 12 | 1620 |       3 |    2 | 118600 | 1250 | 20000 | f          | 
-3.51563221173085 | f
+ 13 | 3100 |       3 |    2 | 140000 | 1760 | 38000 | f          | 
-1.00000000011163 | f
+ 14 | 2070 |       2 |    3 | 148000 | 1550 | 14000 | f          | 
-1.87710363254055 | f
+ 15 |  650 |       3 |  1.5 |  65000 | 1450 | 12000 | t          |  
1.34334834982263 | t
+<li>Prediction using the linear regression model: <pre class="example">
+DROP TABLE IF EXISTS houses_regr;
+SELECT madlib.svm_predict('houses_svm_regression', 'houses', 'id', 'houses_regr');
+SELECT * FROM houses JOIN houses_regr USING (id) ORDER BY id;
+</pre> Result for the linear regression model: <pre class="result">
+  id | tax  | bedroom | bath | price  | size |  lot  |    prediction    | 
+  1 |  590 |       2 |    1 |  50000 |  770 | 22100 | 55288.6992755623 |  
+  2 | 1050 |       3 |    2 |  85000 | 1410 | 12000 | 99978.8137019119 |  
+  3 |   20 |       3 |    1 |  22500 | 1060 |  3500 | 43157.5130381023 |  
+  4 |  870 |       2 |    2 |  90000 | 1300 | 17500 | 88098.9557296729 |  
+  5 | 1320 |       3 |    2 | 133000 | 1500 | 30000 | 114803.884262468 |  
+  6 | 1350 |       2 |    1 |  90500 |  820 | 25700 | 88899.5186193813 |  
+  7 | 2790 |       3 |  2.5 | 260000 | 2130 | 25000 | 201108.397013076 |  
+  8 |  680 |       2 |    1 | 142500 | 1170 | 22000 | 75004.3236915733 |  
+  9 | 1840 |       3 |    2 | 160000 | 1500 | 19000 | 136434.749667136 |  
+ 10 | 3680 |       4 |    2 | 240000 | 2790 | 20000 | 264483.856987395 |  
+ 11 | 1660 |       3 |    1 |  87000 | 1030 | 17500 | 110180.048139857 |  
+ 12 | 1620 |       3 |    2 | 118600 | 1250 | 20000 | 117300.841695563 |  
+ 13 | 3100 |       3 |    2 | 140000 | 1760 | 38000 | 199229.683967752 |  
+ 14 | 2070 |       2 |    3 | 148000 | 1550 | 14000 | 147998.930271016 |  
+ 15 |  650 |       3 |  1.5 |  65000 | 1450 | 12000 | 84936.7661235861 |  
+</pre> For the non-linear Gaussian regression model (output not shown): <pre 
+DROP TABLE IF EXISTS houses_gaussian_regr;
+SELECT madlib.svm_predict('houses_svm_gaussian_regression', 'houses', 'id', 'houses_gaussian_regr');
+SELECT * FROM houses JOIN houses_gaussian_regr USING (id) ORDER BY id;
+<li>For the novelty detection using one-class, let's create a test data set 
using the last 3 values from the training set plus an outlier at the end (10x 
price): <pre class="example">
+DROP TABLE IF EXISTS houses_one_class_test;
+CREATE TABLE houses_one_class_test (id INT, tax INT, bedroom INT, bath FLOAT, 
price INT,
+            size INT, lot INT);
+COPY houses_one_class_test FROM STDIN WITH DELIMITER '|';
+ 1 | 3100 |       3 |    2 | 140000 | 1760 | 38000
+ 2 | 2070 |       2 |    3 | 148000 | 1550 | 14000
+ 3 |  650 |       3 |  1.5 |  65000 | 1450 | 12000
+ 4 |  650 |       3 |  1.5 |  650000 | 1450 | 12000
+</pre> Now run prediction on the Gaussian one-class novelty detection model: 
<pre class="example">
+DROP TABLE IF EXISTS houses_one_class_pred;
+SELECT madlib.svm_predict('houses_one_class_gaussian', 
'houses_one_class_test', 'id', 'houses_one_class_pred');
+SELECT * FROM houses_one_class_test JOIN houses_one_class_pred USING (id) 
+</pre> Result showing the last row predicted to be novel: <pre class="result">
+ id | tax  | bedroom | bath | price  | size |  lot  | prediction |  
+  1 | 3100 |       3 |    2 | 140000 | 1760 | 38000 |          1 |   
+  2 | 2070 |       2 |    3 | 148000 | 1550 | 14000 |          1 |  
+  3 |  650 |       3 |  1.5 |  65000 | 1450 | 12000 |          1 |  
+  4 |  650 |       3 |  1.5 | 650000 | 1450 | 12000 |         -1 | 
+<li>Create a model for an unbalanced class-size dataset, then use the 
'balanced' parameter to classify: <pre class="example">
+DROP TABLE IF EXISTS houses_svm_gaussian, houses_svm_gaussian_summary, 
+SELECT madlib.svm_classification( 'houses',
+                                  'houses_svm_gaussian',
+                                  'price &lt; 150000',
+                                  'ARRAY[1, tax, bath, size]',
+                                  'gaussian',
+                                  'n_components=10',
+                                  '',
+                                  'init_stepsize=1, max_iter=200, 
+                           );
+SELECT * FROM houses_svm_gaussian;
+</pre> <pre class="result">
+-[ RECORD 1 
+coef               | 
+loss               | 1.87657250199
+norm_of_gradient   | 1.41148000266816
+num_iterations     | 174
+num_rows_processed | 15
+num_rows_skipped   | 0
+dep_var_mapping    | {f,t}
+</pre> Note that the results you get for all examples may vary with the 
platform you are using.</li>
+<p><a class="anchor" id="background"></a></p><dl class="section 
user"><dt>Technical Background</dt><dd></dd></dl>
+<p>To solve linear SVM, the following objective function is minimized: </p><p 
+\[ \underset{w,b}{\text{Minimize }} \lambda||w||^2 + \frac{1}{n}\sum_{i=1}^n 
\ell(y_i,f_{w,b}(x_i)) \]
+<p>where \((x_1,y_1),\ldots,(x_n,y_n)\) are labeled training data and 
\(\ell(y,f(x))\) is a loss function. When performing classification, 
\(\ell(y,f(x)) = \max(0,1-yf(x))\) is the <em>hinge loss</em>. For regression, 
the loss function \(\ell(y,f(x)) = \max(0,|y-f(x)|-\epsilon)\) is used.</p>
+<p>If \( f_{w,b}(x) = \langle w, x\rangle + b\) is linear, then the objective 
function is convex and incremental gradient descent (IGD, or SGD) can be 
applied to find a global minimum. See Feng, et al. [1] for more details.</p>
+<p>To learn with Gaussian or polynomial kernels, the training data is first 
mapped via a <em>random feature map</em> in such a way that the usual inner 
product in the feature space approximates the kernel function in the input 
space. The linear SVM training function is then run on the resulting data. See 
the papers [2,3] for more information on random feature maps.</p>
+<p>Also, see the book [4] by Scholkopf and Smola for more details on SVMs in 
+<p><a class="anchor" id="literature"></a></p><dl class="section 
+<p><a class="anchor" id="svm-lit-1"></a>[1] Xixuan Feng, Arun Kumar, Ben 
Recht, and Christopher Re: Towards a Unified Architecture for in-RDBMS 
analytics, in SIGMOD Conference, 2012 <a 
+<p><a class="anchor" id="svm-lit-2"></a>[2] Purushottam Kar and Harish 
Karnick: Random Feature Maps for Dot Product Kernels, Proceedings of the 15th 
International Conference on Artificial Intelligence and Statistics, 2012, <a 
+<p><a class="anchor" id="svm-lit-3"></a>[3] Ali Rahimi and Ben Recht: Random 
Features for Large-Scale Kernel Machines, Neural Information Processing Systems 
2007, <a 
+<p><a class="anchor" id="svm-lit-4"></a>[4] Bernhard Scholkopf and Alexander 
Smola: Learning with Kernels, The MIT Press, Cambridge, MA, 2002.</p>
+<p><a class="anchor" id="svm-lit-5"></a>[5] Vladimir Cherkassky and Yunqian 
Ma: Practical Selection of SVM Parameters and Noise Estimation for SVM 
Regression, Neural Networks, 2004 <a 
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
+<p>File <a class="el" href="svm_8sql__in.html" title="SQL functions for SVM 
(Poisson) ">svm.sql_in</a> documenting the training function</p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Wed May 2 2018 13:00:12 for MADlib by
+    <a href="";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
diff --git a/docs/v1.14/group__grp__text__utilities.html 
new file mode 100644
index 0000000..cf12ebb
--- /dev/null
+++ b/docs/v1.14/group__grp__text__utilities.html
@@ -0,0 +1,358 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
+<html xmlns="";>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Term Frequency</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+</script><script type="text/javascript" 
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
+  })(window,document,'script','//','ga');
+  ga('create', 'UA-45382226-1', '');
+  ga('send', 'pageview');
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="";><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.14</span>
+   </div>
+   <div id="projectbrief">User Documentation for Apache MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+ </tr>
+ </tbody>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+<script type="text/javascript">
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+<div class="header">
+  <div class="headertitle">
+<div class="title">Term Frequency<div class="ingroups"><a class="el" 
href="group__grp__utility__functions.html">Utility Functions</a></div></div>  
+<div class="contents">
+<div class="toc"><b>Contents</b> <ul>
+<a href="#function_syntax">Function Syntax</a> </li>
+<a href="#examples">Examples</a> </li>
+<a href="#related">Related Topics</a> </li>
+</div><p>Term frequency computes the number of times that a word or term 
occurs in a document. Term frequency is often used as part of a larger text 
processing pipeline, which may include operations such as stemming, stop word 
removal and topic modelling.</p>
+<p><a class="anchor" id="function_syntax"></a></p><dl class="section 
user"><dt>Function Syntax</dt><dd></dd></dl>
+<pre class="syntax">
+    term_frequency(input_table,
+                   doc_id_col,
+                   word_col,
+                   output_table,
+                   compute_vocab)
+</pre><p><b>Arguments:</b> </p><dl class="arglist">
+<dt>input_table </dt>
+<dd><p class="startdd">TEXT. The name of the table containing the documents, 
with one document per row. Each row is in the form &lt;doc_id, word_vector&gt; 
where <code>doc_id</code> is an id unique to each document, and 
<code>word_vector</code> is a text array containing the words in the document. 
The <code>word_vector</code> should contain multiple entries of a word if the 
document contains multiple occurrences of that word. </p>
+<p class="enddd"></p>
+<dt>doc_id_col </dt>
+<dd><p class="startdd">TEXT. The name of the column containing the document 
id. </p>
+<p class="enddd"></p>
+<dt>word_col </dt>
+<dd><p class="startdd">TEXT. The name of the column containing the vector of 
words/terms in the document. This column should be of a type that can be cast to 
+<p class="enddd"></p>
+<dt>output_table </dt>
+<dd><p class="startdd">TEXT. The name of the table to store the term frequency 
output. The output table contains the following columns:</p><ul>
+<li><code>doc_id_col:</code> This is the document id column (name will be the same as 
the one provided as input).</li>
+<li><code>word:</code> Word/term present in a document. Depending on the value 
of <code>compute_vocab</code> below, this is either the original word as it 
appears in <code>word_col</code>, or an id representing the word. Note that 
word ids start from 0, not 1.</li>
+<li><code>count:</code> The number of times this word is found in the 
document. </li>
+<p class="enddd"></p>
+<dt>compute_vocab </dt>
+<dd>BOOLEAN. (Optional, Default=FALSE) Flag to indicate if a vocabulary table 
is to be created. If TRUE, an additional output table is created containing the 
vocabulary of all words, with an id assigned to each word in alphabetical 
order. The table is called <em>output_table</em>_vocabulary (i.e., suffix added 
to the <em>output_table</em> name) and contains the following columns:<ul>
+<li><code>wordid:</code> An id for each word in alphabetical order.</li>
+<li><code>word:</code> The word/term corresponding to the id.  </li>
+<p><a class="anchor" id="examples"></a></p><dl class="section 
+<ol type="1">
+<li>First we create a document table with one document per row: <pre 
+CREATE TABLE documents(docid INT4, contents TEXT);
+(0, 'I like to eat broccoli and bananas. I ate a banana and spinach smoothie 
for breakfast.'),
+(1, 'Chinchillas and kittens are cute.'),
+(2, 'My sister adopted two kittens yesterday.'),
+(3, 'Look at this cute hamster munching on a piece of broccoli.');
+</pre> You can apply stemming, stop word removal and tokenization at this 
point in order to prepare the documents for text processing. Depending upon 
your database version, various tools are available. Databases based on more 
recent versions of PostgreSQL may do something like: <pre class="example">
+SELECT tsvector_to_array(to_tsvector('english',contents)) from documents;
+</pre> <pre class="result">
+                    tsvector_to_array                     
+ {ate,banana,breakfast,broccoli,eat,like,smoothi,spinach}
+ {chinchilla,cute,kitten}
+ {adopt,kitten,sister,two,yesterday}
+ {broccoli,cute,hamster,look,munch,piec}
+(4 rows)
+</pre> In this example, we assume a database based on an older version of 
PostgreSQL and just perform basic punctuation removal and tokenization. The 
array of words is added as a new column to the documents table: <pre 
+ALTER TABLE documents ADD COLUMN words TEXT[];
+UPDATE documents SET words = 
+    regexp_split_to_array(lower(
+    regexp_replace(contents, E'[,.;\']','', 'g')
+    ), E'[\\s+]');
+\x on   
+SELECT * FROM documents ORDER BY docid;
+</pre> <pre class="result">
+-[ RECORD 1 
+docid    | 0
+contents | I like to eat broccoli and bananas. I ate a banana and spinach 
smoothie for breakfast.
+words    | 
+-[ RECORD 2 
+docid    | 1
+contents | Chinchillas and kittens are cute.
+words    | {chinchillas,and,kittens,are,cute}
+-[ RECORD 3 
+docid    | 2
+contents | My sister adopted two kittens yesterday.
+words    | {my,sister,adopted,two,kittens,yesterday}
+-[ RECORD 4 
+docid    | 3
+contents | Look at this cute hamster munching on a piece of broccoli.
+words    | {look,at,this,cute,hamster,munching,on,a,piece,of,broccoli}
+<li>Compute the frequency of each word in each document: <pre class="example">
+DROP TABLE IF EXISTS documents_tf, documents_tf_vocabulary;
+SELECT madlib.term_frequency('documents',    -- input table
+                             'docid',        -- document id column
+                             'words',        -- vector of words in document
+                             'documents_tf'  -- output table
+                            );
+\x off
+SELECT * FROM documents_tf ORDER BY docid;
+</pre> <pre class="result">
+ docid |    word     | count 
+     0 | a           |     1
+     0 | breakfast   |     1
+     0 | banana      |     1
+     0 | and         |     2
+     0 | eat         |     1
+     0 | smoothie    |     1
+     0 | to          |     1
+     0 | like        |     1
+     0 | broccoli    |     1
+     0 | bananas     |     1
+     0 | spinach     |     1
+     0 | i           |     2
+     0 | ate         |     1
+     0 | for         |     1
+     1 | are         |     1
+     1 | cute        |     1
+     1 | kittens     |     1
+     1 | chinchillas |     1
+     1 | and         |     1
+     2 | two         |     1
+     2 | yesterday   |     1
+     2 | kittens     |     1
+     2 | sister      |     1
+     2 | my          |     1
+     2 | adopted     |     1
+     3 | this        |     1
+     3 | at          |     1
+     3 | a           |     1
+     3 | broccoli    |     1
+     3 | of          |     1
+     3 | look        |     1
+     3 | hamster     |     1
+     3 | on          |     1
+     3 | piece       |     1
+     3 | cute        |     1
+     3 | munching    |     1
+(36 rows)
+<li>Next we create a vocabulary of the words and store a wordid in the output 
table instead of the actual word: <pre class="example">
+DROP TABLE IF EXISTS documents_tf, documents_tf_vocabulary;
+SELECT madlib.term_frequency('documents',    -- input table
+                             'docid',        -- document id column
+                             'words',        -- vector of words in document
+                             'documents_tf',-- output table
+                             TRUE
+                            );
+SELECT * FROM documents_tf ORDER BY docid;
+</pre>  <pre class="result">
+ docid | wordid | count 
+     0 |     17 |     1
+     0 |      9 |     1
+     0 |     25 |     1
+     0 |     12 |     1
+     0 |     13 |     1
+     0 |     15 |     2
+     0 |      0 |     1
+     0 |      2 |     2
+     0 |     28 |     1
+     0 |      5 |     1
+     0 |      6 |     1
+     0 |      7 |     1
+     0 |      8 |     1
+     0 |     26 |     1
+     1 |     16 |     1
+     1 |     11 |     1
+     1 |     10 |     1
+     1 |      2 |     1
+     1 |      3 |     1
+     2 |     30 |     1
+     2 |      1 |     1
+     2 |     16 |     1
+     2 |     20 |     1
+     2 |     24 |     1
+     2 |     29 |     1
+     3 |      4 |     1
+     3 |     21 |     1
+     3 |     22 |     1
+     3 |     23 |     1
+     3 |      0 |     1
+     3 |     11 |     1
+     3 |      9 |     1
+     3 |     27 |     1
+     3 |     14 |     1
+     3 |     18 |     1
+     3 |     19 |     1
+(36 rows)
+</pre>  Note above that wordid values start at 0, not 1. The vocabulary table maps 
wordid to the actual word: <pre class="example">
+SELECT * FROM documents_tf_vocabulary ORDER BY wordid;
+</pre> <pre class="result">
+ wordid |    word     
+      0 | a
+      1 | adopted
+      2 | and
+      3 | are
+      4 | at
+      5 | ate
+      6 | banana
+      7 | bananas
+      8 | breakfast
+      9 | broccoli
+     10 | chinchillas
+     11 | cute
+     12 | eat
+     13 | for
+     14 | hamster
+     15 | i
+     16 | kittens
+     17 | like
+     18 | look
+     19 | munching
+     20 | my
+     21 | of
+     22 | on
+     23 | piece
+     24 | sister
+     25 | smoothie
+     26 | spinach
+     27 | this
+     28 | to
+     29 | two
+     30 | yesterday
+(31 rows)
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
+<p>See <a class="el" href="text__utilities_8sql__in.html" title="SQL functions 
for carrying out routine text operations. ">text_utilities.sql_in</a> for the 
term frequency SQL function definition and <a class="el" 
href="porter__stemmer_8sql__in.html" title="implementation of porter stemmer 
operations in SQL ">porter_stemmer.sql_in</a> for the stemmer function. </p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Wed May 2 2018 13:00:12 for MADlib by
+    <a href="";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
diff --git a/docs/v1.14/group__grp__topic__modelling.html 
new file mode 100644
index 0000000..30e055d
--- /dev/null
+++ b/docs/v1.14/group__grp__topic__modelling.html
@@ -0,0 +1,139 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
+<html xmlns="";>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Topic Modelling</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+</script><script type="text/javascript" 
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
+  })(window,document,'script','//','ga');
+  ga('create', 'UA-45382226-1', '');
+  ga('send', 'pageview');
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="";><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.14</span>
+   </div>
+   <div id="projectbrief">User Documentation for Apache MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+ </tr>
+ </tbody>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+<script type="text/javascript">
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+<div class="header">
+  <div class="summary">
+<a href="#groups">Modules</a>  </div>
+  <div class="headertitle">
+<div class="title">Topic Modelling<div class="ingroups"><a class="el" 
href="group__grp__unsupervised.html">Unsupervised Learning</a></div></div>  
+<div class="contents">
+<a name="details" id="details"></a><h2 class="groupheader">Detailed 
+<p>A collection of methods to uncover abstract topics in a document corpus </p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a 
+<tr class="memitem:group__grp__lda"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__lda.html">Latent Dirichlet Allocation</a></td></tr>
+<tr class="memdesc:group__grp__lda"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Generates a Latent Dirichlet Allocation predictive model for 
a collection of documents. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Wed May 2 2018 13:00:12 for MADlib by
+    <a href="";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
diff --git a/docs/v1.14/group__grp__topic__modelling.js 
new file mode 100644
index 0000000..249097b
--- /dev/null
+++ b/docs/v1.14/group__grp__topic__modelling.js
@@ -0,0 +1,4 @@
+var group__grp__topic__modelling =
+    [ "Latent Dirichlet Allocation", "group__grp__lda.html", null ]
\ No newline at end of file
diff --git a/docs/v1.14/group__grp__train__test__split.html 
new file mode 100644
index 0000000..fa47ee2
--- /dev/null
+++ b/docs/v1.14/group__grp__train__test__split.html
@@ -0,0 +1,307 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
+<html xmlns="";>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Train-Test Split</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+</script><script type="text/javascript" 
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
+  })(window,document,'script','//','ga');
+  ga('create', 'UA-45382226-1', '');
+  ga('send', 'pageview');
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="";><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.14</span>
+   </div>
+   <div id="projectbrief">User Documentation for Apache MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+ </tr>
+ </tbody>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+<script type="text/javascript">
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+<div class="header">
+  <div class="headertitle">
+<div class="title">Train-Test Split<div class="ingroups"><a class="el" 
href="group__grp__mdl.html">Model Selection</a></div></div>  </div>
+<div class="contents">
+<div class="toc"><b>Contents</b> <ul>
+<a href="#strs">Train-Test Split</a> </li>
+<a href="#examples">Examples</a> </li>
+</div><p>Train-test split is a utility to create training and testing sets 
from a single data set.</p>
+<p><a class="anchor" id="strs"></a></p><dl class="section user"><dt>Train-Test 
+<pre class="syntax">
+train_test_split(   source_table,
+                    output_table,
+                    train_proportion,
+                    test_proportion,
+                    grouping_cols,
+                    target_cols,
+                    with_replacement,
+                    separate_output_tables
+                )
+</pre><p><b>Arguments</b> </p><dl class="arglist">
+<dt>source_table </dt>
+<dd><p class="startdd">TEXT. Name of the table containing the input data.</p>
+<p class="enddd"></p>
+<dt>output_table </dt>
+<dd><p class="startdd">Name of output table. A new INTEGER column on the right 
called 'split' will identify 1 for train set and 0 for test set, unless the 
'separate_output_tables' parameter below is TRUE, in which case two output 
tables will be created using the 'output_table' name with the suffixes '_train' 
and '_test'. The output table contains all the columns present in the source 
table unless otherwise specified in the 'target_cols' parameter below. </p>
+<p class="enddd"></p>
+<dt>train_proportion </dt>
+<dd><p class="startdd">FLOAT8 in the range (0,1). Proportion of the dataset to 
include in the train split. If the 'grouping_cols' parameter is specified below, 
each group will be sampled independently using the train proportion, i.e., in a 
stratified fashion.</p>
+<p class="enddd"></p>
+<dt>test_proportion (optional) </dt>
+<dd><p class="startdd">FLOAT8 in the range (0,1). Proportion of the dataset to 
include in the test split. Default is the complement to the train proportion 
(1-'train_proportion'). If the 'grouping_cols' parameter is specified below, 
each group will be sampled independently using the test proportion, i.e., in a 
stratified fashion.</p>
+<p class="enddd"></p>
+<dt>grouping_cols (optional) </dt>
+<dd><p class="startdd">TEXT, default: NULL. A single column or a list of 
comma-separated columns that defines how to stratify. When this parameter is 
NULL, the train-test split is not stratified.</p>
+<p class="enddd"></p>
+<dt>target_cols (optional) </dt>
+<dd><p class="startdd">TEXT, default NULL. A comma-separated list of columns 
to appear in the 'output_table'. If NULL or '*', all columns from the 
'source_table' will appear in the 'output_table'.</p>
+<p class="enddd"><a class="anchor" id="note"></a></p><dl class="section 
note"><dt>Note</dt><dd>Do not include 'grouping_cols' in the parameter 
'target_cols', because they are always included in the 'output_table'.</dd></dl>
+<dt>with_replacement (optional) </dt>
+<dd><p class="startdd">BOOLEAN, default FALSE. Determines whether to sample 
with replacement or without replacement (default). With replacement means that 
it is possible that the same row may appear in the sample set more than once. 
Without replacement means a given row can be selected only once.</p>
+<p class="enddd"></p>
+<dt>separate_output_tables (optional) </dt>
+<dd>BOOLEAN, default FALSE. If TRUE, two output tables will be created using 
the 'output_table' name with the suffixes '_train' and '_test'. </dd>
+<p><a class="anchor" id="examples"></a></p><dl class="section 
+<p>Please note that due to the random nature of sampling, your results may 
look different from those below.</p>
+<ol type="1">
+<li>Create an input table: <pre class="syntax">
+    id1 INTEGER,
+    id2 INTEGER,
+    gr1 INTEGER,
+    gr2 INTEGER
+<li>Sample without replacement: <pre class="syntax">
+SELECT madlib.train_test_split(
+                                'test',    -- Source table
+                                'out',     -- Output table
+                                0.5,       -- Train proportion
+                                0.5,       -- Test proportion
+                                'gr1,gr2', -- Strata definition
+                                'id1,id2', -- Columns to output
+                                FALSE,     -- Sample without replacement
+                                FALSE);    -- Do not separate output tables
+SELECT * FROM out ORDER BY split,gr1,gr2,id1,id2;
+</pre> <pre class="result">
+ gr1 | gr2 | id1 | id2 | split
+   1 |   1 |   1 |   0 |     0
+   1 |   1 |   4 |   0 |     0
+   1 |   1 |   6 |   0 |     0
+   1 |   1 |   9 |   0 |     0
+   1 |   1 |   9 |   0 |     0
+   1 |   1 |   9 |   0 |     0
+   1 |   2 |   0 |   3 |     0
+   1 |   2 |   0 |   4 |     0
+   1 |   2 |   0 |   5 |     0
+   2 |   2 |  10 |  10 |     0
+   2 |   2 |  30 |  30 |     0
+   2 |   2 |  40 |  40 |     0
+   2 |   2 |  60 |  60 |     0
+   1 |   1 |   2 |   0 |     1
+   1 |   1 |   3 |   0 |     1
+   1 |   1 |   5 |   0 |     1
+   1 |   1 |   7 |   0 |     1
+   1 |   1 |   8 |   0 |     1
+   1 |   1 |   9 |   0 |     1
+   1 |   2 |   0 |   1 |     1
+   1 |   2 |   0 |   2 |     1
+   1 |   2 |   0 |   6 |     1
+   2 |   2 |  20 |  20 |     1
+   2 |   2 |  50 |  50 |     1
+   2 |   2 |  70 |  70 |     1
+(25 rows)
+<li>Sample with replacement and create separate train and test tables: <pre 
+DROP TABLE IF EXISTS out_train, out_test;
+SELECT madlib.train_test_split(
+                                'test',    -- Source table
+                                'out',     -- Output table
+                                0.5,       -- train_proportion
+                                NULL,      -- Default = 1 - train_proportion = 0.5
+                                'gr1,gr2', -- Strata definition
+                                'id1,id2', -- Columns to output
+                                TRUE,      -- Sample with replacement
+                                TRUE);     -- Separate output tables
+SELECT * FROM out_train ORDER BY gr1,gr2,id1,id2;
+</pre> <pre class="result">
+ gr1 | gr2 | id1 | id2
+   1 |   1 |   1 |   0
+   1 |   1 |   2 |   0
+   1 |   1 |   4 |   0
+   1 |   1 |   7 |   0
+   1 |   1 |   8 |   0
+   1 |   1 |   9 |   0
+   1 |   2 |   0 |   4
+   1 |   2 |   0 |   5
+   1 |   2 |   0 |   6
+   2 |   2 |  40 |  40
+   2 |   2 |  50 |  50
+   2 |   2 |  50 |  50
+(12 rows)
+</pre> <pre class="syntax">
+SELECT * FROM out_test ORDER BY gr1,gr2,id1,id2;
+</pre> <pre class="result">
+ gr1 | gr2 | id1 | id2
+   1 |   1 |   1 |   0
+   1 |   1 |   1 |   0
+   1 |   1 |   3 |   0
+   1 |   1 |   4 |   0
+   1 |   1 |   5 |   0
+   1 |   1 |   9 |   0
+   1 |   2 |   0 |   1
+   1 |   2 |   0 |   5
+   1 |   2 |   0 |   6
+   2 |   2 |  20 |  20
+   2 |   2 |  20 |  20
+   2 |   2 |  20 |  20
+   2 |   2 |  70 |  70
+(13 rows)
+</pre> </li>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Wed May 2 2018 13:00:11 for MADlib by
+    <a href="";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
diff --git a/docs/v1.14/group__grp__tree.html b/docs/v1.14/group__grp__tree.html
new file mode 100644
index 0000000..a58a26c
--- /dev/null
+++ b/docs/v1.14/group__grp__tree.html
@@ -0,0 +1,142 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
+<html xmlns="";>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Tree Methods</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+</script><script type="text/javascript" 
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
+  })(window,document,'script','//','ga');
+  ga('create', 'UA-45382226-1', '');
+  ga('send', 'pageview');
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="";><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.14</span>
+   </div>
+   <div id="projectbrief">User Documentation for Apache MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+ </tr>
+ </tbody>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+<script type="text/javascript">
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+<div class="header">
+  <div class="summary">
+<a href="#groups">Modules</a>  </div>
+  <div class="headertitle">
+<div class="title">Tree Methods<div class="ingroups"><a class="el" 
href="group__grp__super.html">Supervised Learning</a></div></div>  </div>
+<div class="contents">
+<a name="details" id="details"></a><h2 class="groupheader">Detailed 
+<p>A collection of recursive partitioning (tree) methods. </p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a 
+<tr class="memitem:group__grp__decision__tree"><td class="memItemLeft" 
align="right" valign="top">&#160;</td><td class="memItemRight" 
valign="bottom"><a class="el" href="group__grp__decision__tree.html">Decision 
+<tr class="memdesc:group__grp__decision__tree"><td 
class="mdescLeft">&#160;</td><td class="mdescRight">Decision trees are 
tree-based supervised learning methods that can be used for classification and 
regression. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__random__forest"><td class="memItemLeft" 
align="right" valign="top">&#160;</td><td class="memItemRight" 
valign="bottom"><a class="el" href="group__grp__random__forest.html">Random 
+<tr class="memdesc:group__grp__random__forest"><td 
class="mdescLeft">&#160;</td><td class="mdescRight">Random forest is an 
ensemble learning method for classification and regression that constructs a 
multitude of decision trees at training time, then produces the class that is 
the mean (regression) or mode (classification) of the prediction produced by 
the individual trees. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Wed May 2 2018 13:00:12 for MADlib by
+    <a href="";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
diff --git a/docs/v1.14/group__grp__tree.js b/docs/v1.14/group__grp__tree.js
new file mode 100644
index 0000000..7e73e8c
--- /dev/null
+++ b/docs/v1.14/group__grp__tree.js
@@ -0,0 +1,5 @@
+var group__grp__tree =
+[
+    [ "Decision Tree", "group__grp__decision__tree.html", null ],
+    [ "Random Forest", "group__grp__random__forest.html", null ]
+];
\ No newline at end of file

Reply via email to