[32/51] [partial] madlib-site git commit: Doc: Add v1.14 documentation

riyer Wed, 02 May 2018 13:54:56 -0700

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/e283664c/docs/v1.14/group__grp__arraysmatrix.html
----------------------------------------------------------------------
diff --git a/docs/v1.14/group__grp__arraysmatrix.html 
b/docs/v1.14/group__grp__arraysmatrix.html
new file mode 100644
index 0000000..85e1798
--- /dev/null
+++ b/docs/v1.14/group__grp__arraysmatrix.html
@@ -0,0 +1,175 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Arrays and Matrices</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+});
+</script><script type="text/javascript" 
src="http://cdn.mathjax.org/mathjax/latest/MathJax.js";></script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.apache.org";><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.14</span>
+   </div>
+   <div id="projectbrief">User Documentation for Apache MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__arraysmatrix.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="summary">
+<a href="#groups">Modules</a>  </div>
+  <div class="headertitle">
+<div class="title">Arrays and Matrices<div class="ingroups"><a class="el" 
href="group__grp__datatrans.html">Data Types and 
Transformations</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<a name="details" id="details"></a><h2 class="groupheader">Detailed 
Description</h2>
+<p>These modules provide basic mathematical operations to be run on array and 
matrices.</p>
+<p>For a distributed system, a matrix cannot simply be represented as a 2D 
array of numbers in memory. <b>We provide two forms of distributed 
representation of a matrix</b>:</p>
+<ul>
+<li>Dense: The matrix is represented as a distributed collection of 1-D 
arrays. An example 3x10 matrix would be the below table: <pre>
+ row_id |         row_vec
+--------+-------------------------
+   1    | {9,6,5,8,5,6,6,3,10,8}
+   2    | {8,2,2,6,6,10,2,1,9,9}
+   3    | {3,9,9,9,8,6,3,9,5,6}
+</pre></li>
+<li>Sparse: The matrix is represented using the row and column indices for 
each non-zero entry of the matrix. Example: <pre>
+ row_id | col_id | value
+--------+--------+-------
+      1 |      1 |     9
+      1 |      5 |     6
+      1 |      6 |     6
+      2 |      1 |     8
+      3 |      1 |     3
+      3 |      2 |     9
+      4 |      7 |     0
+(6 rows)
+</pre> &#160; All matrix operations work with either form of 
representation.</li>
+</ul>
+<p>In many cases, a matrix function can be <b>decomposed to vector operations 
applied independently on each row of a matrix (or corresponding rows of two 
matrices)</b>. We have also provided access to these internal vector operations 
(<a class="el" href="group__grp__array.html">Array Operations</a>) for greater 
flexibility. Matrix operations like <em>matrix_add</em> use the corresponding 
vector operation (<em>array_add</em>) and also include additional validation 
and formating. Other functions like <em>matrix_mult</em> are complex and use a 
combination of such vector operations and other SQL operations.</p>
+<p><b>It's important to note</b> that these array functions are only available 
for the dense format representation of the matrix. In general, the scope of a 
single array function invocation is limited to only an array (1-dimensional or 
2-dimensional) that fits in memory. When such function is executed on a table 
of arrays, the function is called multiple times - once for each array (or pair 
of arrays). On contrary, scope of a single matrix function invocation is the 
complete matrix stored as a distributed table. </p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a 
name="groups"></a>
+Modules</h2></td></tr>
+<tr class="memitem:group__grp__array"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__array.html">Array Operations</a></td></tr>
+<tr class="memdesc:group__grp__array"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Provides fast array operations supporting other MADlib 
modules. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__matrix"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__matrix.html">Matrix Operations</a></td></tr>
+<tr class="memdesc:group__grp__matrix"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Provides fast matrix operations supporting other MADlib 
modules. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__matrix__factorization"><td class="memItemLeft" 
align="right" valign="top">&#160;</td><td class="memItemRight" 
valign="bottom"><a class="el" 
href="group__grp__matrix__factorization.html">Matrix Factorization</a></td></tr>
+<tr class="memdesc:group__grp__matrix__factorization"><td 
class="mdescLeft">&#160;</td><td class="mdescRight">Matrix Factorization 
methods including Singular Value Decomposition and Low-rank Matrix 
Factorization. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__linalg"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__linalg.html">Norms and Distance functions</a></td></tr>
+<tr class="memdesc:group__grp__linalg"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Provides utility functions for basic linear algebra 
operations. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__svec"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__svec.html">Sparse Vectors</a></td></tr>
+<tr class="memdesc:group__grp__svec"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Implements a sparse vector data type that provides 
compressed storage of vectors that may have many duplicate elements. <br 
/></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Wed May 2 2018 13:00:11 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
+</div>
+</body>
+</html>


http://git-wip-us.apache.org/repos/asf/madlib-site/blob/e283664c/docs/v1.14/group__grp__arraysmatrix.js
----------------------------------------------------------------------
diff --git a/docs/v1.14/group__grp__arraysmatrix.js 
b/docs/v1.14/group__grp__arraysmatrix.js
new file mode 100644
index 0000000..5e12094
--- /dev/null
+++ b/docs/v1.14/group__grp__arraysmatrix.js
@@ -0,0 +1,8 @@
+var group__grp__arraysmatrix =
+[
+    [ "Array Operations", "group__grp__array.html", null ],
+    [ "Matrix Operations", "group__grp__matrix.html", null ],
+    [ "Matrix Factorization", "group__grp__matrix__factorization.html", 
"group__grp__matrix__factorization" ],
+    [ "Norms and Distance functions", "group__grp__linalg.html", null ],
+    [ "Sparse Vectors", "group__grp__svec.html", null ]
+];
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/e283664c/docs/v1.14/group__grp__assoc__rules.html
----------------------------------------------------------------------
diff --git a/docs/v1.14/group__grp__assoc__rules.html 
b/docs/v1.14/group__grp__assoc__rules.html
new file mode 100644
index 0000000..1131b62
--- /dev/null
+++ b/docs/v1.14/group__grp__assoc__rules.html
@@ -0,0 +1,368 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Apriori Algorithm</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+});
+</script><script type="text/javascript" 
src="http://cdn.mathjax.org/mathjax/latest/MathJax.js";></script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.apache.org";><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.14</span>
+   </div>
+   <div id="projectbrief">User Documentation for Apache MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__assoc__rules.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Apriori Algorithm<div class="ingroups"><a class="el" 
href="group__grp__unsupervised.html">Unsupervised Learning</a> &raquo; <a 
class="el" href="group__grp__association__rules.html">Association 
Rules</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> <ul>
+<li>
+<a href="#rules">Rules</a> </li>
+<li>
+<a href="#algorithm">Apriori Algorithm</a> </li>
+<li>
+<a href="#syntax">Function Syntax</a> </li>
+<li>
+<a href="#examples">Examples</a> </li>
+<li>
+<a href="#notes">Notes</a> </li>
+<li>
+<a href="#literature">Literature</a> </li>
+<li>
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>This module implements the association rules data mining technique on 
a transactional data set. Given the names of a table and the columns, minimum 
support and confidence values, this function generates all single and 
multidimensional association rules that meet the minimum thresholds.</p>
+<p>Association rule mining is a widely used technique for discovering 
relationships between variables in a large data set (e.g., items in a store 
that are commonly purchased together). The classic market basket analysis 
example using association rules is the "beer and diapers" rule. According to 
data mining urban legend, a study of customer purchase behavior in a 
supermarket found that men often purchased beer and diapers together. After 
making this discovery, the managers strategically placed beer and diapers 
closer together on the shelves and saw a dramatic increase in sales. In 
addition to market basket analysis, association rules are also used in 
bioinformatics, web analytics, and several other fields.</p>
+<p>This type of data mining algorithm uses transactional data. Every 
transaction event has a unique identification, and each transaction consists of 
a set of items (or itemset). Purchases are considered binary (either it was 
purchased or not), and this implementation does not take into consideration the 
quantity of each item. For the MADlib association rules function, it is assumed 
that the data is stored in two columns with one item and transaction id per 
row. Transactions with multiple items will span multiple rows with one row per 
item.</p>
+<pre>
+    trans_id | product
+    ---------+---------
+           1 | 1
+           1 | 2
+           1 | 3
+           1 | 4
+           2 | 3
+           2 | 4
+           2 | 5
+           3 | 1
+           3 | 4
+           3 | 6
+    ...
+</pre><p><a class="anchor" id="rules"></a></p><dl class="section 
user"><dt>Rules</dt><dd></dd></dl>
+<p>Association rules take the form "If X, then Y", where X and Y are non-empty 
itemsets. X and Y are called the antecedent and consequent, or the 
left-hand-side and right-hand-side, of the rule respectively. Using our 
previous example, the association rule may state "If {diapers}, then {beer}" 
with .2 support and .85 confidence.</p>
+<p>The following metrics are defined for any given itemset "X".</p><ul>
+<li>Count: The number of transactions that contain X</li>
+<li>Support: The ratio of transactions that contain X to all transactions, T 
<p class="formulaDsp">
+\[ S (X) = \frac{Total X}{Total transactions} \]
+</p>
+</li>
+</ul>
+<p>Given any association rule "If X, then Y", the association rules function 
will also calculate the following metrics:</p><ul>
+<li>Count: The number of transactions that contain X,Y</li>
+<li>Support: The ratio of transactions that contain X,Y to all transactions, T 
<p class="formulaDsp">
+\[ S (X \Rightarrow Y) = \frac{Total(X \cup Y)}{Total transactions} \]
+</p>
+</li>
+<li>Confidence: The ratio of transactions that contain \( X,Y \) to 
transactions that contain \( X \). One could view this metric as the 
conditional probability of \( Y \) , given \( X \) . \( P(Y|X) \) <p 
class="formulaDsp">
+\[ C (X \Rightarrow Y) = \frac{s(X \cap Y )}{s(X)} \]
+</p>
+</li>
+<li>Lift: The ratio of observed support of \( X,Y \) to the expected support 
of \( X,Y \) , assuming \( X \) and \( Y \) are independent. <p 
class="formulaDsp">
+\[ L (X \Rightarrow Y) = \frac{s(X \cap Y )}{s(X) \cdot s(Y)} \]
+</p>
+</li>
+<li><p class="startli">Conviction: The ratio of expected support of \( X \) 
occurring without \( Y \) assuming \( X \) and \( \neg Y \) are independent, to 
the observed support of \( X \) occuring without \( Y \). If conviction is 
greater than 1, then this metric shows that incorrect predictions ( \( X 
\Rightarrow Y \) ) occur less often than if these two actions were independent. 
This metric can be viewed as the ratio that the association rule would be 
incorrect if the actions were independent (i.e. a conviction of 1.5 indicates 
that if the variables were independent, this rule would be incorrect 50% more 
often.)</p>
+<p class="formulaDsp">
+\[ Conv (X \Rightarrow Y) = \frac{1 - S(Y)}{1 - C(X \Rightarrow Y)} \]
+</p>
+</li>
+</ul>
+<p><a class="anchor" id="algorithm"></a></p><dl class="section 
user"><dt>Apriori Algorithm</dt><dd></dd></dl>
+<p>Although there are many algorithms that generate association rules, the 
classic algorithm is called Apriori [1] which we have implemented in this 
module. It is a breadth-first search, as opposed to depth-first searches like 
Eclat. Frequent itemsets of order \( n \) are generated from sets of order \( n 
- 1 \). Using the downward closure property, all sets must have frequent 
subsets. There are two steps in this algorithm; generating frequent itemsets, 
and using these itemsets to construct the association rules. A simplified 
version of the algorithm is as follows, and assumes a minimum level of support 
and confidence is provided:</p>
+<p><em>Initial</em> <em>step</em> </p><ol type="1">
+<li>Generate all itemsets of order 1.</li>
+<li>Eliminate itemsets that have support less than minimum support.</li>
+</ol>
+<p><em>Main</em> <em>algorithm</em> </p><ol type="1">
+<li>For \( n \ge 2 \), generate itemsets of order \( n \) by combining the 
itemsets of order \( n - 1 \). This is done by doing the union of two itemsets 
that have identical items except one.</li>
+<li>Eliminate itemsets that have (n-1) order subsets with insufficient 
support.</li>
+<li>Eliminate itemsets with insufficient support.</li>
+<li>Repeat until itemsets cannot be generated, or maximum itemset size is 
exceeded.</li>
+</ol>
+<p><em>Association</em> <em>rule</em> <em>generation</em> </p>
+<p>Given a frequent itemset \( A \) generated from the Apriori algorithm, and 
all subsets \( B \) , we generate rules such that \( B \Rightarrow (A - B) \) 
meets minimum confidence requirements.</p>
+<p><a class="anchor" id="syntax"></a></p><dl class="section user"><dt>Function 
Syntax</dt><dd>Association rules has the following syntax: <pre class="syntax">
+assoc_rules( support,
+             confidence,
+             tid_col,
+             item_col,
+             input_table,
+             output_schema,
+             verbose,
+             max_itemset_size
+           );</pre> This generates all association rules that satisfy the 
specified minimum <em>support</em> and <em>confidence</em>.</dd></dl>
+<p><b>Arguments</b> </p><dl class="arglist">
+<dt>support </dt>
+<dd><p class="startdd">Minimum level of support needed for each itemset to be 
included in result.</p>
+<p class="enddd"></p>
+</dd>
+<dt>confidence </dt>
+<dd><p class="startdd">Minimum level of confidence needed for each rule to be 
included in result.</p>
+<p class="enddd"></p>
+</dd>
+<dt>tid_col </dt>
+<dd><p class="startdd">Name of the column storing the transaction ids.</p>
+<p class="enddd"></p>
+</dd>
+<dt>item_col </dt>
+<dd><p class="startdd">Name of the column storing the products.</p>
+<p class="enddd"></p>
+</dd>
+<dt>input_table </dt>
+<dd><p class="startdd">Name of the table containing the input data.</p>
+<p>The input data is expected to be of the following form: 
</p><pre>{TABLE|VIEW} <em>input_table</em> (
+    <em>trans_id</em> INTEGER,
+    <em>product</em> TEXT
+)</pre><p>The algorithm maps the product names to consecutive integer ids 
starting at 1. If they are already structured this way, then the ids will not 
change. </p>
+<p class="enddd"></p>
+</dd>
+<dt>output_schema </dt>
+<dd><p class="startdd">The name of the schema where the final results will be 
stored. The schema must be created before calling the function. Alternatively, 
use <code>NULL</code> to output to the current schema.</p>
+<p>The results containing the rules, support, count, confidence, lift, and 
conviction are stored in the table <code>assoc_rules</code> in the schema 
specified by <code>output_schema</code>.</p>
+<p>The table has the following columns. </p><table class="output">
+<tr>
+<th>ruleid </th><td>integer  </td></tr>
+<tr>
+<th>pre </th><td>text  </td></tr>
+<tr>
+<th>post </th><td>text  </td></tr>
+<tr>
+<th>count </th><td>integer  </td></tr>
+<tr>
+<th>support </th><td>double  </td></tr>
+<tr>
+<th>confidence </th><td>double  </td></tr>
+<tr>
+<th>lift </th><td>double  </td></tr>
+<tr>
+<th>conviction </th><td>double  </td></tr>
+</table>
+<p>On Greenplum Database or Apache HAWQ, the table is distributed by the 
<code>ruleid</code> column.</p>
+<p>The <code>pre</code> and <code>post</code> columns are the itemsets of left 
and right hand sides of the association rule respectively. The 
<code>support</code>, <code>confidence</code>, <code>lift</code>, and 
<code>conviction</code> columns are calculated as described earlier. </p>
+<p class="enddd"></p>
+</dd>
+<dt>verbose </dt>
+<dd><p class="startdd">BOOLEAN, default: FALSE. Determines if details are 
printed for each iteration as the algorithm progresses.</p>
+<p class="enddd"></p>
+</dd>
+<dt>max_itemset_size </dt>
+<dd>INTEGER, default: generate itemsets of all sizes. Determines the maximum 
size of frequent itemsets that are used for generating association rules. Must 
be 2 or more. This parameter can be used to reduce run time for data sets where 
itemset size is large.  </dd>
+</dl>
+<p><a class="anchor" id="examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd></dd></dl>
+<p>Let's look at some sample transactional data and generate association 
rules.</p>
+<ol type="1">
+<li>Create an input dataset: <pre class="example">
+DROP TABLE IF EXISTS test_data;
+CREATE TABLE test_data (
+    trans_id INT,
+    product TEXT
+);
+INSERT INTO test_data VALUES (1, 'beer');
+INSERT INTO test_data VALUES (1, 'diapers');
+INSERT INTO test_data VALUES (1, 'chips');
+INSERT INTO test_data VALUES (2, 'beer');
+INSERT INTO test_data VALUES (2, 'diapers');
+INSERT INTO test_data VALUES (3, 'beer');
+INSERT INTO test_data VALUES (3, 'diapers');
+INSERT INTO test_data VALUES (4, 'beer');
+INSERT INTO test_data VALUES (4, 'chips');
+INSERT INTO test_data VALUES (5, 'beer');
+INSERT INTO test_data VALUES (6, 'beer');
+INSERT INTO test_data VALUES (6, 'diapers');
+INSERT INTO test_data VALUES (6, 'chips');
+INSERT INTO test_data VALUES (7, 'beer');
+INSERT INTO test_data VALUES (7, 'diapers');
+</pre></li>
+<li>Let \( min(support) = .25 \) and \( min(confidence) = .5 \), and the 
output schema is set to <code>NULL</code> indicating output to the current 
schema. In this example we set verbose to TRUE so that we have some insight 
into progress of the function. We can now generate association rules as 
follows: <pre class="example">
+SELECT * FROM madlib.assoc_rules( .25,            -- Support
+                                  .5,             -- Confidence
+                                  'trans_id',     -- Transaction id col
+                                  'product',      -- Product col
+                                  'test_data',    -- Input data
+                                  NULL,           -- Output schema
+                                  TRUE            -- Verbose output
+                                );
+</pre> Result (iteration details not shown): <pre class="result">
+ output_schema | output_table | total_rules |   total_time    
+---------------+--------------+-------------+-----------------
+ public        | assoc_rules  |           7 | 00:00:00.569254
+(1 row)
+</pre> The association rules are stored in the assoc_rules table: <pre 
class="example">
+SELECT * FROM assoc_rules
+ORDER BY support DESC, confidence DESC;
+</pre> Result: <pre class="result">
+ ruleid |       pre       |      post      | count |      support      |    
confidence     |       lift        |    conviction     
+--------+-----------------+----------------+-------+-------------------+-------------------+-------------------+-------------------
+      2 | {diapers}       | {beer}         |     5 | 0.714285714285714 |       
          1 |                 1 |                 0
+      6 | {beer}          | {diapers}      |     5 | 0.714285714285714 | 
0.714285714285714 |                 1 |                 1
+      5 | {chips}         | {beer}         |     3 | 0.428571428571429 |       
          1 |                 1 |                 0
+      4 | {chips,diapers} | {beer}         |     2 | 0.285714285714286 |       
          1 |                 1 |                 0
+      1 | {chips}         | {diapers,beer} |     2 | 0.285714285714286 | 
0.666666666666667 | 0.933333333333333 | 0.857142857142857
+      7 | {chips}         | {diapers}      |     2 | 0.285714285714286 | 
0.666666666666667 | 0.933333333333333 | 0.857142857142857
+      3 | {beer,chips}    | {diapers}      |     2 | 0.285714285714286 | 
0.666666666666667 | 0.933333333333333 | 0.857142857142857
+(7 rows)
+</pre></li>
+<li>Limit association rules generated from itemsets of size at most 2: <pre 
class="example">
+SELECT * FROM madlib.assoc_rules( .25,            -- Support
+                                  .5,             -- Confidence
+                                  'trans_id',     -- Transaction id col
+                                  'product',      -- Product col
+                                  'test_data',    -- Input data
+                                  NULL,           -- Output schema
+                                  TRUE,           -- Verbose output
+                                  2               -- Max itemset size
+                                );
+</pre> Result (iteration details not shown): <pre class="result">
+ output_schema | output_table | total_rules |   total_time    
+---------------+--------------+-------------+-----------------
+ public        | assoc_rules  |           4 | 00:00:00.565176
+(1 row)
+</pre> The association rules are again stored in the assoc_rules table: <pre 
class="example">
+SELECT * FROM assoc_rules
+ORDER BY support DESC, confidence DESC;
+</pre> Result: <pre class="result">
+ ruleid |    pre    |   post    | count |      support      |    confidence    
 |       lift        |    conviction     
+--------+-----------+-----------+-------+-------------------+-------------------+-------------------+-------------------
+      1 | {diapers} | {beer}    |     5 | 0.714285714285714 |                 
1 |                 1 |                 0
+      2 | {beer}    | {diapers} |     5 | 0.714285714285714 | 
0.714285714285714 |                 1 |                 1
+      3 | {chips}   | {beer}    |     3 | 0.428571428571429 |                 
1 |                 1 |                 0
+      4 | {chips}   | {diapers} |     2 | 0.285714285714286 | 
0.666666666666667 | 0.933333333333333 | 0.857142857142857
+(4 rows)
+</pre></li>
+<li>Post-processing can now be done on the output table in the case that you 
want to filter the results. For example, if you want any single item on the 
left hand side and a particular item on the right hand side: <pre 
class="example">
+SELECT * FROM assoc_rules WHERE array_upper(pre,1) = 1 AND post = 
array['beer'];
+</pre> Result: <pre class="result">
+ ruleid |    pre    |  post  | count |      support      | confidence | lift | 
conviction 
+--------+-----------+--------+-------+-------------------+------------+------+------------
+      1 | {diapers} | {beer} |     5 | 0.714285714285714 |          1 |    1 | 
         0
+      3 | {chips}   | {beer} |     3 | 0.428571428571429 |          1 |    1 | 
         0
+(2 rows)
+</pre></li>
+</ol>
+<p><a class="anchor" id="notes"></a></p><dl class="section 
user"><dt>Notes</dt><dd></dd></dl>
+<p>The association rules function always creates a table named 
<code>assoc_rules</code>. Make a copy of this table before running the function 
again if you would like to keep multiple association rule tables.</p>
+<p><a class="anchor" id="literature"></a></p><dl class="section 
user"><dt>Literature</dt><dd></dd></dl>
+<p>[1] <a 
href="https://en.wikipedia.org/wiki/Apriori_algorithm";>https://en.wikipedia.org/wiki/Apriori_algorithm</a></p>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
Topics</dt><dd></dd></dl>
+<p>File <a class="el" href="assoc__rules_8sql__in.html" title="The assoc_rules 
function computes association rules for a given set of data. The data is 
assumed to h...">assoc_rules.sql_in</a> documenting the SQL function. </p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Wed May 2 2018 13:00:12 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/e283664c/docs/v1.14/group__grp__association__rules.html
----------------------------------------------------------------------
diff --git a/docs/v1.14/group__grp__association__rules.html 
b/docs/v1.14/group__grp__association__rules.html
new file mode 100644
index 0000000..dacc247
--- /dev/null
+++ b/docs/v1.14/group__grp__association__rules.html
@@ -0,0 +1,139 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Association Rules</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+});
+</script><script type="text/javascript" 
src="http://cdn.mathjax.org/mathjax/latest/MathJax.js";></script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.apache.org";><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.14</span>
+   </div>
+   <div id="projectbrief">User Documentation for Apache MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__association__rules.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="summary">
+<a href="#groups">Modules</a>  </div>
+  <div class="headertitle">
+<div class="title">Association Rules<div class="ingroups"><a class="el" 
href="group__grp__unsupervised.html">Unsupervised Learning</a></div></div>  
</div>
+</div><!--header-->
+<div class="contents">
+<a name="details" id="details"></a><h2 class="groupheader">Detailed 
Description</h2>
+<p>A collection of methods used to uncover interesting patterns in 
transactional datasets. </p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a 
name="groups"></a>
+Modules</h2></td></tr>
+<tr class="memitem:group__grp__assoc__rules"><td class="memItemLeft" 
align="right" valign="top">&#160;</td><td class="memItemRight" 
valign="bottom"><a class="el" href="group__grp__assoc__rules.html">Apriori 
Algorithm</a></td></tr>
+<tr class="memdesc:group__grp__assoc__rules"><td 
class="mdescLeft">&#160;</td><td class="mdescRight">Computes association rules 
for a given set of data. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Wed May 2 2018 13:00:12 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/e283664c/docs/v1.14/group__grp__association__rules.js
----------------------------------------------------------------------
diff --git a/docs/v1.14/group__grp__association__rules.js 
b/docs/v1.14/group__grp__association__rules.js
new file mode 100644
index 0000000..e10c849
--- /dev/null
+++ b/docs/v1.14/group__grp__association__rules.js
@@ -0,0 +1,4 @@
+var group__grp__association__rules =
+[
+    [ "Apriori Algorithm", "group__grp__assoc__rules.html", null ]
+];
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/e283664c/docs/v1.14/group__grp__balance__sampling.html
----------------------------------------------------------------------
diff --git a/docs/v1.14/group__grp__balance__sampling.html 
b/docs/v1.14/group__grp__balance__sampling.html
new file mode 100644
index 0000000..c5717c7
--- /dev/null
+++ b/docs/v1.14/group__grp__balance__sampling.html
@@ -0,0 +1,600 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Balanced Sampling</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+});
+</script><script type="text/javascript" 
src="http://cdn.mathjax.org/mathjax/latest/MathJax.js";></script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.apache.org";><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.14</span>
+   </div>
+   <div id="projectbrief">User Documentation for Apache MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__balance__sampling.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Balanced Sampling<div class="ingroups"><a class="el" 
href="group__grp__sampling.html">Sampling</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> <ul>
+<li>
+<a href="#strs">Balanced Sampling</a> </li>
+<li>
+<a href="#examples">Examples</a> </li>
+<li>
+<a href="#literature">Literature</a> </li>
+<li>
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>Some classification algorithms only perform optimally when the number 
of samples in each class is roughly the same. Highly skewed datasets are common 
in many domains (e.g., fraud detection), so resampling to offset this imbalance 
can produce a better decision boundary.</p>
+<p>This module offers a number of resampling techniques including 
undersampling majority classes, oversampling minority classes, and combinations 
of the two.</p>
+<p><a class="anchor" id="strs"></a></p><dl class="section user"><dt>Balanced 
Sampling</dt><dd></dd></dl>
+<pre class="syntax">
+balance_sample( source_table,
+                output_table,
+                class_col,
+                class_sizes,
+                output_table_size,
+                grouping_cols,
+                with_replacement,
+                keep_null
+              )
+</pre><p><b>Arguments</b> </p><dl class="arglist">
+<dt>source_table </dt>
+<dd><p class="startdd">TEXT. Name of the table containing the input data.</p>
+<p class="enddd"></p>
+</dd>
+<dt>output_table </dt>
+<dd><p class="startdd">TEXT. Name of output table that contains the sampled 
data. The output table contains all columns present in the source table, plus a 
new generated id called "__madlib_id__" added as the first column. </p>
+<p class="enddd"></p>
+</dd>
+<dt>class_col </dt>
+<dd><p class="startdd">TEXT, Name of the column containing the class to be 
balanced. </p>
+<p class="enddd"></p>
+</dd>
+<dt>class_sizes (optional) </dt>
+<dd><p class="startdd">VARCHAR, default âuniformâ. Parameter to define the 
size of the different class values. (Class values are sometimes also called 
levels). Can be set to the following:</p>
+<ul>
+<li>
+<b>âuniformâ</b>: All class values will be resampled to have the same 
number of rows.  </li>
+<li>
+<b>'undersample'</b>: Undersample such that all class values end up with the 
same number of observations as the minority class. Done without replacement by 
default unless the parameter âwith_replacementâ is set to TRUE.  </li>
+<li>
+<b>'oversample'</b>: Oversample with replacement such that all class values 
end up with the same number of observations as the majority class. Not affected 
by the parameter âwith_replacementâ since oversampling is always done with 
replacement.  Short forms of the above will work too, e.g., 'uni' works the 
same as 'uniform'. </li>
+</ul>
+<p>Alternatively, you can also explicitly set class size in a string 
containing a comma-delimited list. Order does not matter and all class values 
do not need to be specified. Use the format âclass_value_1=x, 
class_value_2=y, â¦â where 'class_value' in the list must exist in the 
column 'class_col'. Set to an integer representing the desired number of 
observations. E.g., âred=3000, blue=4000â means you want to resample the 
dataset to result in exactly 3000 red and 4000 blue rows in the 
âoutput_tableâ.  </p>
+<dl class="section note"><dt>Note</dt><dd>The allowed names for class values 
follows object naming rules in PostgreSQL [1]. Quoted identifiers are allowed 
and should be enclosed in double quotes in the usual way. If for some reason 
the class values in the examples above were âReDâ and âBluEâ then the 
comma delimited list for âclass_sizeâ would be: ââReDâ=3000, 
âBluEâ=4000â. </dd></dl>
+</dd>
+<dt>output_table_size (optional) </dt>
+<dd><p class="startdd">INTEGER, default NULL. Desired size of the output data 
set. This parameter is ignored if âclass_sizeâ parameter is set to either 
âoversampleâ or âundersampleâ since output table size is already 
determined. If NULL, the resulting output table size will depend on the 
settings for the âclass_sizeâ parameter (see table below for more details). 
</p>
+<p class="enddd"></p>
+</dd>
+<dt>grouping_cols (optional) </dt>
+<dd><p class="startdd">TEXT, default: NULL. A single column or a list of 
comma-separated columns that defines the strata. When this parameter is NULL, 
no grouping is used so the sampling is non-stratified, that is, the whole table 
is treated as a single group.</p>
+<dl class="section note"><dt>Note</dt><dd>The 'output_table_size' and the 
'class_sizes' are defined for the whole table. When grouping is used, these 
parameters are split evenly for each group. Further, if a specific class value 
is specified in the 'class_sizes' parameter, that particular class value should 
be present in each group. If not, an error will be thrown. </dd></dl>
+</dd>
+<dt>with_replacement (optional) </dt>
+<dd><p class="startdd">BOOLEAN, default FALSE. Determines whether to sample 
with replacement or without replacement (default). With replacement means that 
it is possible that the same row may appear in the sample set more than once. 
Without replacement means a given row can be selected only once. This parameter 
affects undersampling only since oversampling is always done with 
replacement.</p>
+<p class="enddd"></p>
+</dd>
+<dt>keep_null (optional) </dt>
+<dd>BOOLEAN, default FALSE. Determines whether to sample rows whose class 
values are NULL. By default, all rows with NULL class values are ignored. If 
this is set to TRUE, then NULL is treated as another class value. </dd>
+</dl>
+<p><b>How Output Table Size is Determined</b></p>
+<p>The rule of thumb is that if you specify a value for 'output_table_size', 
then you will generally get an output table of that size, with some minor 
rounding variations. If you set 'output_table_size' to NULL, then the size of 
the output table will be calculated depending on what you put for the 
'class_size' parameter. The following table shows how the parameters 
'class_size' and 'output_table_size' work together:</p>
+<table class="doxtable">
+<tr>
+<th align="left">Case </th><th align="left">'class_size' </th><th 
align="left">'output_table_size' </th><th align="left">Result  </th></tr>
+<tr>
+<td align="left">1 </td><td align="left">'uniform' </td><td align="left">NULL 
</td><td align="left">Resample for uniform class size with output size = input 
size (i.e., balanced). </td></tr>
+<tr>
+<td align="left">2 </td><td align="left">'uniform' </td><td align="left">10000 
</td><td align="left">Resample for uniform class size with output size = 10K 
(i.e., balanced). </td></tr>
+<tr>
+<td align="left">3 </td><td align="left">NULL </td><td align="left">NULL 
</td><td align="left">Resample for uniform class size with output size = input 
size (i.e., balanced). Class_size=NULL has same behavior as âuniformâ. 
</td></tr>
+<tr>
+<td align="left">4 </td><td align="left">NULL </td><td align="left">10000 
</td><td align="left">Resample for uniform class size with output size = 10K 
(i.e., balanced). Class_size=NULL has same behavior as âuniformâ. </td></tr>
+<tr>
+<td align="left">5 </td><td align="left">'undersample' </td><td 
align="left">n/a </td><td align="left">Undersample such that all class values 
end up with the same number of observations as the minority. </td></tr>
+<tr>
+<td align="left">6 </td><td align="left">'oversample' </td><td 
align="left">n/a </td><td align="left">Oversample with replacement (always) 
such that all class values end up with the same number of observations as the 
majority. </td></tr>
+<tr>
+<td align="left">7 </td><td align="left">'red=3000' </td><td align="left">NULL 
</td><td align="left">Resample red to 3K, leave rest of the class values (blue, 
green, etc.) as is. </td></tr>
+<tr>
+<td align="left">8 </td><td align="left">'red=3000, blue=4000' </td><td 
align="left">10000 </td><td align="left">Resample red to 3K and blue to 4K, 
divide remaining class values evenly 3K/(n-2) each, where n=number of class 
values. Note that if red and blue are the only class values, then output table 
size will be 7K not 10K. (This is the only case where specifying a value for 
'output_table_size' may not actually result in an output table of that size.) 
</td></tr>
+</table>
+<p><a class="anchor" id="examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd></dd></dl>
+<p>Note that due to the random nature of sampling, your results may look 
different from those below.</p>
+<ol type="1">
+<li>Create an input table using part of the flags data set from <a 
href="https://archive.ics.uci.edu/ml/datasets/Flags";>https://archive.ics.uci.edu/ml/datasets/Flags</a>
 : <pre class="syntax">
+DROP TABLE IF EXISTS flags;
+CREATE TABLE flags (
+    id INTEGER,
+    name TEXT,
+    landmass INTEGER,
+    zone INTEGER,
+    area INTEGER,
+    population INTEGER,
+    language INTEGER,
+    colours INTEGER,
+    mainhue TEXT
+);
+INSERT INTO flags VALUES
+(1, 'Argentina', 2, 3, 2777, 28, 2, 2, 'blue'),
+(2, 'Australia', 6, 2, 7690, 15, 1, 3, 'blue'),
+(3, 'Austria', 3, 1, 84, 8, 4, 2, 'red'),
+(4, 'Brazil', 2, 3, 8512, 119, 6, 4, 'green'),
+(5, 'Canada', 1, 4, 9976, 24, 1, 2, 'red'),
+(6, 'China', 5, 1, 9561, 1008, 7, 2, 'red'),
+(7, 'Denmark', 3, 1, 43, 5, 6, 2, 'red'),
+(8, 'Greece', 3, 1, 132, 10, 6, 2, 'blue'),
+(9, 'Guatemala', 1, 4, 109, 8, 2, 2, 'blue'),
+(10, 'Ireland', 3, 4, 70, 3, 1, 3, 'white'),
+(11, 'Jamaica', 1, 4, 11, 2, 1, 3, 'green'),
+(12, 'Luxembourg', 3, 1, 3, 0, 4, 3, 'red'),
+(13, 'Mexico', 1, 4, 1973, 77, 2, 4, 'green'),
+(14, 'Norway', 3, 1, 324, 4, 6, 3, 'red'),
+(15, 'Portugal', 3, 4, 92, 10, 6, 5, 'red'),
+(16, 'Spain', 3, 4, 505, 38, 2, 2, 'red'),
+(17, 'Sweden', 3, 1, 450, 8, 6, 2, 'blue'),
+(18, 'Switzerland', 3, 1, 41, 6, 4, 2, 'red'),
+(19, 'UK', 3, 4, 245, 56, 1, 3, 'red'),
+(20, 'USA', 1, 4, 9363, 231, 1, 3, 'white'),
+(21, 'xElba', 3, 1, 1, 1, 6, NULL, NULL),
+(22, 'xPrussia', 3, 1, 249, 61, 4, NULL, NULL);
+</pre></li>
+<li>Uniform sampling. All class values will be resampled so that they have the 
same number of rows. The output data size will be the same as the input data 
size, ignoring NULL values. Uniform sampling is the default for the 
'class_size' parameter so we do not need to explicitly set it: <pre 
class="syntax">
+DROP TABLE IF EXISTS output_table;
+SELECT madlib.balance_sample(
+                              'flags',             -- Source table
+                              'output_table',      -- Output table
+                              'mainhue');          -- Class column
+SELECT * FROM output_table ORDER BY mainhue, name;
+</pre> <pre class="result">
+ __madlib_id__ | id |    name     | landmass | zone | area | population | 
language | colours | mainhue
+---------------+----+-------------+----------+------+------+------------+----------+---------+---------
+             5 |  1 | Argentina   |        2 |    3 | 2777 |         28 |      
  2 |       2 | blue
+             2 |  2 | Australia   |        6 |    2 | 7690 |         15 |      
  1 |       3 | blue
+             3 |  8 | Greece      |        3 |    1 |  132 |         10 |      
  6 |       2 | blue
+             4 |  9 | Guatemala   |        1 |    4 |  109 |          8 |      
  2 |       2 | blue
+             1 | 17 | Sweden      |        3 |    1 |  450 |          8 |      
  6 |       2 | blue
+            11 |  4 | Brazil      |        2 |    3 | 8512 |        119 |      
  6 |       4 | green
+            12 |  4 | Brazil      |        2 |    3 | 8512 |        119 |      
  6 |       4 | green
+            14 | 13 | Mexico      |        1 |    4 | 1973 |         77 |      
  2 |       4 | green
+            15 | 13 | Mexico      |        1 |    4 | 1973 |         77 |      
  2 |       4 | green
+            13 | 13 | Mexico      |        1 |    4 | 1973 |         77 |      
  2 |       4 | green
+             8 |  3 | Austria     |        3 |    1 |   84 |          8 |      
  4 |       2 | red
+            10 |  5 | Canada      |        1 |    4 | 9976 |         24 |      
  1 |       2 | red
+             9 |  7 | Denmark     |        3 |    1 |   43 |          5 |      
  6 |       2 | red
+             6 | 15 | Portugal    |        3 |    4 |   92 |         10 |      
  6 |       5 | red
+             7 | 18 | Switzerland |        3 |    1 |   41 |          6 |      
  4 |       2 | red
+            19 | 10 | Ireland     |        3 |    4 |   70 |          3 |      
  1 |       3 | white
+            20 | 10 | Ireland     |        3 |    4 |   70 |          3 |      
  1 |       3 | white
+            18 | 10 | Ireland     |        3 |    4 |   70 |          3 |      
  1 |       3 | white
+            16 | 20 | USA         |        1 |    4 | 9363 |        231 |      
  1 |       3 | white
+            17 | 20 | USA         |        1 |    4 | 9363 |        231 |      
  1 |       3 | white
+(20 rows)
+</pre> Next we do uniform sampling again, but this time we specify a size for 
the output table: <pre class="syntax">
+DROP TABLE IF EXISTS output_table;
+SELECT madlib.balance_sample(
+                              'flags',             -- Source table
+                              'output_table',      -- Output table
+                              'mainhue',           -- Class column
+                              'uniform',           -- Uniform sample
+                               12);                -- Desired output table size
+SELECT * FROM output_table ORDER BY mainhue, name;
+</pre> <pre class="result">
+ __madlib_id__ | id |   name    | landmass | zone | area | population | 
language | colours | mainhue
+---------------+----+-----------+----------+------+------+------------+----------+---------+---------
+            10 |  1 | Argentina |        2 |    3 | 2777 |         28 |        
2 |       2 | blue
+            12 |  2 | Australia |        6 |    2 | 7690 |         15 |        
1 |       3 | blue
+            11 |  8 | Greece    |        3 |    1 |  132 |         10 |        
6 |       2 | blue
+             2 |  4 | Brazil    |        2 |    3 | 8512 |        119 |        
6 |       4 | green
+             3 | 11 | Jamaica   |        1 |    4 |   11 |          2 |        
1 |       3 | green
+             1 | 13 | Mexico    |        1 |    4 | 1973 |         77 |        
2 |       4 | green
+             5 |  7 | Denmark   |        3 |    1 |   43 |          5 |        
6 |       2 | red
+             6 | 14 | Norway    |        3 |    1 |  324 |          4 |        
6 |       3 | red
+             4 | 15 | Portugal  |        3 |    4 |   92 |         10 |        
6 |       5 | red
+             9 | 10 | Ireland   |        3 |    4 |   70 |          3 |        
1 |       3 | white
+             7 | 20 | USA       |        1 |    4 | 9363 |        231 |        
1 |       3 | white
+             8 | 20 | USA       |        1 |    4 | 9363 |        231 |        
1 |       3 | white
+(12 rows)
+</pre></li>
+<li>Oversampling. Oversample with replacement such that all class values 
except NULLs end up with the same number of observations as the majority class. 
Countries with red flags is the majority class with 10 observations, so other 
class values will be oversampled to 10 observations: <pre class="syntax">
+DROP TABLE IF EXISTS output_table;
+SELECT madlib.balance_sample(
+                              'flags',             -- Source table
+                              'output_table',      -- Output table
+                              'mainhue',           -- Class column
+                              'oversample');       -- Oversample
+SELECT * FROM output_table ORDER BY mainhue, name;
+</pre> <pre class="result">
+ __madlib_id__ | id |    name     | landmass | zone | area | population | 
language | colours | mainhue
+---------------+----+-------------+----------+------+------+------------+----------+---------+---------
+            35 |  1 | Argentina   |        2 |    3 | 2777 |         28 |      
  2 |       2 | blue
+            33 |  1 | Argentina   |        2 |    3 | 2777 |         28 |      
  2 |       2 | blue
+            37 |  1 | Argentina   |        2 |    3 | 2777 |         28 |      
  2 |       2 | blue
+            34 |  1 | Argentina   |        2 |    3 | 2777 |         28 |      
  2 |       2 | blue
+            36 |  1 | Argentina   |        2 |    3 | 2777 |         28 |      
  2 |       2 | blue
+            32 |  1 | Argentina   |        2 |    3 | 2777 |         28 |      
  2 |       2 | blue
+            31 |  2 | Australia   |        6 |    2 | 7690 |         15 |      
  1 |       3 | blue
+            39 |  9 | Guatemala   |        1 |    4 |  109 |          8 |      
  2 |       2 | blue
+            38 |  9 | Guatemala   |        1 |    4 |  109 |          8 |      
  2 |       2 | blue
+            40 | 17 | Sweden      |        3 |    1 |  450 |          8 |      
  6 |       2 | blue
+            19 |  4 | Brazil      |        2 |    3 | 8512 |        119 |      
  6 |       4 | green
+            20 |  4 | Brazil      |        2 |    3 | 8512 |        119 |      
  6 |       4 | green
+            12 | 11 | Jamaica     |        1 |    4 |   11 |          2 |      
  1 |       3 | green
+            11 | 11 | Jamaica     |        1 |    4 |   11 |          2 |      
  1 |       3 | green
+            13 | 11 | Jamaica     |        1 |    4 |   11 |          2 |      
  1 |       3 | green
+            17 | 13 | Mexico      |        1 |    4 | 1973 |         77 |      
  2 |       4 | green
+            15 | 13 | Mexico      |        1 |    4 | 1973 |         77 |      
  2 |       4 | green
+            16 | 13 | Mexico      |        1 |    4 | 1973 |         77 |      
  2 |       4 | green
+            18 | 13 | Mexico      |        1 |    4 | 1973 |         77 |      
  2 |       4 | green
+            14 | 13 | Mexico      |        1 |    4 | 1973 |         77 |      
  2 |       4 | green
+             9 |  3 | Austria     |        3 |    1 |   84 |          8 |      
  4 |       2 | red
+             8 |  5 | Canada      |        1 |    4 | 9976 |         24 |      
  1 |       2 | red
+             1 |  6 | China       |        5 |    1 | 9561 |       1008 |      
  7 |       2 | red
+            10 |  7 | Denmark     |        3 |    1 |   43 |          5 |      
  6 |       2 | red
+             2 | 12 | Luxembourg  |        3 |    1 |    3 |          0 |      
  4 |       3 | red
+             4 | 14 | Norway      |        3 |    1 |  324 |          4 |      
  6 |       3 | red
+             6 | 15 | Portugal    |        3 |    4 |   92 |         10 |      
  6 |       5 | red
+             3 | 16 | Spain       |        3 |    4 |  505 |         38 |      
  2 |       2 | red
+             5 | 18 | Switzerland |        3 |    1 |   41 |          6 |      
  4 |       2 | red
+             7 | 19 | UK          |        3 |    4 |  245 |         56 |      
  1 |       3 | red
+            22 | 10 | Ireland     |        3 |    4 |   70 |          3 |      
  1 |       3 | white
+            26 | 10 | Ireland     |        3 |    4 |   70 |          3 |      
  1 |       3 | white
+            24 | 10 | Ireland     |        3 |    4 |   70 |          3 |      
  1 |       3 | white
+            21 | 10 | Ireland     |        3 |    4 |   70 |          3 |      
  1 |       3 | white
+            27 | 10 | Ireland     |        3 |    4 |   70 |          3 |      
  1 |       3 | white
+            25 | 10 | Ireland     |        3 |    4 |   70 |          3 |      
  1 |       3 | white
+            23 | 10 | Ireland     |        3 |    4 |   70 |          3 |      
  1 |       3 | white
+            29 | 20 | USA         |        1 |    4 | 9363 |        231 |      
  1 |       3 | white
+            30 | 20 | USA         |        1 |    4 | 9363 |        231 |      
  1 |       3 | white
+            28 | 20 | USA         |        1 |    4 | 9363 |        231 |      
  1 |       3 | white
+(40 rows)
+</pre></li>
+<li>Undersampling. Undersample such that all class values except NULLs end up 
with the same number of observations as the minority class. Countries with 
white flags is the minority class with 2 observations, so other class values 
will be undersampled to 2 observations: <pre class="syntax">
+DROP TABLE IF EXISTS output_table;
+SELECT madlib.balance_sample(
+                              'flags',             -- Source table
+                              'output_table',      -- Output table
+                              'mainhue',           -- Class column
+                              'undersample');      -- Undersample
+SELECT * FROM output_table ORDER BY mainhue, name;
+</pre> <pre class="result">
+ __madlib_id__ | id |    name     | landmass | zone | area | population | 
language | colours | mainhue
+---------------+----+-------------+----------+------+------+------------+----------+---------+---------
+             1 |  1 | Argentina   |        2 |    3 | 2777 |         28 |      
  2 |       2 | blue
+             2 |  2 | Australia   |        6 |    2 | 7690 |         15 |      
  1 |       3 | blue
+             4 |  4 | Brazil      |        2 |    3 | 8512 |        119 |      
  6 |       4 | green
+             3 | 13 | Mexico      |        1 |    4 | 1973 |         77 |      
  2 |       4 | green
+             5 | 16 | Spain       |        3 |    4 |  505 |         38 |      
  2 |       2 | red
+             6 | 18 | Switzerland |        3 |    1 |   41 |          6 |      
  4 |       2 | red
+             8 | 10 | Ireland     |        3 |    4 |   70 |          3 |      
  1 |       3 | white
+             7 | 20 | USA         |        1 |    4 | 9363 |        231 |      
  1 |       3 | white
+(8 rows)
+</pre> We may also want to undersample with replacement, so we set the 
'with_replacement' parameter to TRUE: <pre class="syntax">
+DROP TABLE IF EXISTS output_table;
+SELECT madlib.balance_sample(
+                              'flags',             -- Source table
+                              'output_table',      -- Output table
+                              'mainhue',           -- Class column
+                              'undersample',       -- Undersample
+                               NULL,               -- Output table size will 
be calculated
+                               NULL,               -- No grouping
+                              'TRUE');             -- Sample with replacement
+SELECT * FROM output_table ORDER BY mainhue, name;
+</pre> <pre class="result">
+ __madlib_id__ | id |   name    | landmass | zone | area | population | 
language | colours | mainhue
+---------------+----+-----------+----------+------+------+------------+----------+---------+---------
+             2 |  9 | Guatemala |        1 |    4 |  109 |          8 |        
2 |       2 | blue
+             1 |  9 | Guatemala |        1 |    4 |  109 |          8 |        
2 |       2 | blue
+             3 |  4 | Brazil    |        2 |    3 | 8512 |        119 |        
6 |       4 | green
+             4 | 13 | Mexico    |        1 |    4 | 1973 |         77 |        
2 |       4 | green
+             6 |  5 | Canada    |        1 |    4 | 9976 |         24 |        
1 |       2 | red
+             5 | 19 | UK        |        3 |    4 |  245 |         56 |        
1 |       3 | red
+             7 | 20 | USA       |        1 |    4 | 9363 |        231 |        
1 |       3 | white
+             8 | 20 | USA       |        1 |    4 | 9363 |        231 |        
1 |       3 | white
+(8 rows)
+</pre> Note above that some rows may appear multiple times above since we 
sampled with replacement.</li>
+<li>Setting class size by count. Here we set the number of rows for red and 
blue flags, and leave green and white flags unchanged: <pre class="syntax">
+DROP TABLE IF EXISTS output_table;
+SELECT madlib.balance_sample(
+                              'flags',             -- Source table
+                              'output_table',      -- Output table
+                              'mainhue',           -- Class column
+                              'red=7, blue=7');    -- Want 7 reds and 7 blues
+SELECT * FROM output_table ORDER BY mainhue, name;
+</pre> <pre class="result">
+ __madlib_id__ | id |    name    | landmass | zone | area | population | 
language | colours | mainhue
+---------------+----+------------+----------+------+------+------------+----------+---------+---------
+             5 |  2 | Australia  |        6 |    2 | 7690 |         15 |       
 1 |       3 | blue
+             7 |  8 | Greece     |        3 |    1 |  132 |         10 |       
 6 |       2 | blue
+             6 |  8 | Greece     |        3 |    1 |  132 |         10 |       
 6 |       2 | blue
+             1 |  9 | Guatemala  |        1 |    4 |  109 |          8 |       
 2 |       2 | blue
+             3 | 17 | Sweden     |        3 |    1 |  450 |          8 |       
 6 |       2 | blue
+             2 | 17 | Sweden     |        3 |    1 |  450 |          8 |       
 6 |       2 | blue
+             4 | 17 | Sweden     |        3 |    1 |  450 |          8 |       
 6 |       2 | blue
+             8 |  4 | Brazil     |        2 |    3 | 8512 |        119 |       
 6 |       4 | green
+            18 | 11 | Jamaica    |        1 |    4 |   11 |          2 |       
 1 |       3 | green
+            19 | 13 | Mexico     |        1 |    4 | 1973 |         77 |       
 2 |       4 | green
+            13 |  3 | Austria    |        3 |    1 |   84 |          8 |       
 4 |       2 | red
+            14 |  5 | Canada     |        1 |    4 | 9976 |         24 |       
 1 |       2 | red
+            17 |  6 | China      |        5 |    1 | 9561 |       1008 |       
 7 |       2 | red
+            15 | 12 | Luxembourg |        3 |    1 |    3 |          0 |       
 4 |       3 | red
+            16 | 14 | Norway     |        3 |    1 |  324 |          4 |       
 6 |       3 | red
+            11 | 15 | Portugal   |        3 |    4 |   92 |         10 |       
 6 |       5 | red
+            12 | 16 | Spain      |        3 |    4 |  505 |         38 |       
 2 |       2 | red
+             9 | 10 | Ireland    |        3 |    4 |   70 |          3 |       
 1 |       3 | white
+            10 | 20 | USA        |        1 |    4 | 9363 |        231 |       
 1 |       3 | white
+(19 rows)
+</pre> Next we set the number of rows for red and blue flags, and also set an 
output table size. This means that green and white flags will be uniformly 
sampled to get to the desired output table size: <pre class="syntax">
+DROP TABLE IF EXISTS output_table;
+SELECT madlib.balance_sample(
+                              'flags',             -- Source table
+                              'output_table',      -- Output table
+                              'mainhue',           -- Class column
+                              'red=7, blue=7',     -- Want 7 reds and 7 blues
+                               22);                -- Desired output table size
+SELECT * FROM output_table ORDER BY mainhue, name;
+</pre> <pre class="result">
+ __madlib_id__ | id |    name     | landmass | zone | area | population | 
language | colours | mainhue
+---------------+----+-------------+----------+------+------+------------+----------+---------+---------
+            16 |  1 | Argentina   |        2 |    3 | 2777 |         28 |      
  2 |       2 | blue
+            20 |  2 | Australia   |        6 |    2 | 7690 |         15 |      
  1 |       3 | blue
+            21 |  2 | Australia   |        6 |    2 | 7690 |         15 |      
  1 |       3 | blue
+            22 |  8 | Greece      |        3 |    1 |  132 |         10 |      
  6 |       2 | blue
+            18 | 17 | Sweden      |        3 |    1 |  450 |          8 |      
  6 |       2 | blue
+            19 | 17 | Sweden      |        3 |    1 |  450 |          8 |      
  6 |       2 | blue
+            17 | 17 | Sweden      |        3 |    1 |  450 |          8 |      
  6 |       2 | blue
+             9 |  4 | Brazil      |        2 |    3 | 8512 |        119 |      
  6 |       4 | green
+            10 |  4 | Brazil      |        2 |    3 | 8512 |        119 |      
  6 |       4 | green
+             8 | 11 | Jamaica     |        1 |    4 |   11 |          2 |      
  1 |       3 | green
+            11 | 13 | Mexico      |        1 |    4 | 1973 |         77 |      
  2 |       4 | green
+             6 |  3 | Austria     |        3 |    1 |   84 |          8 |      
  4 |       2 | red
+             7 |  5 | Canada      |        1 |    4 | 9976 |         24 |      
  1 |       2 | red
+             2 |  7 | Denmark     |        3 |    1 |   43 |          5 |      
  6 |       2 | red
+             1 | 12 | Luxembourg  |        3 |    1 |    3 |          0 |      
  4 |       3 | red
+             3 | 15 | Portugal    |        3 |    4 |   92 |         10 |      
  6 |       5 | red
+             5 | 16 | Spain       |        3 |    4 |  505 |         38 |      
  2 |       2 | red
+             4 | 18 | Switzerland |        3 |    1 |   41 |          6 |      
  4 |       2 | red
+            14 | 10 | Ireland     |        3 |    4 |   70 |          3 |      
  1 |       3 | white
+            13 | 10 | Ireland     |        3 |    4 |   70 |          3 |      
  1 |       3 | white
+            15 | 10 | Ireland     |        3 |    4 |   70 |          3 |      
  1 |       3 | white
+            12 | 20 | USA         |        1 |    4 | 9363 |        231 |      
  1 |       3 | white
+(22 rows)
+</pre></li>
+<li>To make NULL a valid class value, set the parameter to keep NULLs: <pre 
class="syntax">
+DROP TABLE IF EXISTS output_table;
+SELECT madlib.balance_sample(
+                              'flags',             -- Source table
+                              'output_table',      -- Output table
+                              'mainhue',           -- Class column
+                               NULL,               -- Uniform
+                               NULL,               -- Output table size
+                               NULL,               -- No grouping
+                               NULL,               -- Sample without 
replacement
+                              'TRUE');             -- Make NULLs a valid class 
value
+SELECT * FROM output_table ORDER BY mainhue, name;
+</pre> <pre class="result">
+ __madlib_id__ | id |    name     | landmass | zone | area | population | 
language | colours | mainhue
+---------------+----+-------------+----------+------+------+------------+----------+---------+---------
+            25 |  1 | Argentina   |        2 |    3 | 2777 |         28 |      
  2 |       2 | blue
+            22 |  2 | Australia   |        6 |    2 | 7690 |         15 |      
  1 |       3 | blue
+            24 |  8 | Greece      |        3 |    1 |  132 |         10 |      
  6 |       2 | blue
+            21 |  9 | Guatemala   |        1 |    4 |  109 |          8 |      
  2 |       2 | blue
+            23 | 17 | Sweden      |        3 |    1 |  450 |          8 |      
  6 |       2 | blue
+             7 |  4 | Brazil      |        2 |    3 | 8512 |        119 |      
  6 |       4 | green
+             6 |  4 | Brazil      |        2 |    3 | 8512 |        119 |      
  6 |       4 | green
+            10 | 11 | Jamaica     |        1 |    4 |   11 |          2 |      
  1 |       3 | green
+             8 | 13 | Mexico      |        1 |    4 | 1973 |         77 |      
  2 |       4 | green
+             9 | 13 | Mexico      |        1 |    4 | 1973 |         77 |      
  2 |       4 | green
+             3 |  3 | Austria     |        3 |    1 |   84 |          8 |      
  4 |       2 | red
+             1 |  5 | Canada      |        1 |    4 | 9976 |         24 |      
  1 |       2 | red
+             2 | 16 | Spain       |        3 |    4 |  505 |         38 |      
  2 |       2 | red
+             4 | 18 | Switzerland |        3 |    1 |   41 |          6 |      
  4 |       2 | red
+             5 | 19 | UK          |        3 |    4 |  245 |         56 |      
  1 |       3 | red
+            13 | 10 | Ireland     |        3 |    4 |   70 |          3 |      
  1 |       3 | white
+            11 | 10 | Ireland     |        3 |    4 |   70 |          3 |      
  1 |       3 | white
+            14 | 10 | Ireland     |        3 |    4 |   70 |          3 |      
  1 |       3 | white
+            12 | 10 | Ireland     |        3 |    4 |   70 |          3 |      
  1 |       3 | white
+            15 | 20 | USA         |        1 |    4 | 9363 |        231 |      
  1 |       3 | white
+            17 | 21 | xElba       |        3 |    1 |    1 |          1 |      
  6 |         |
+            18 | 21 | xElba       |        3 |    1 |    1 |          1 |      
  6 |         |
+            16 | 21 | xElba       |        3 |    1 |    1 |          1 |      
  6 |         |
+            20 | 22 | xPrussia    |        3 |    1 |  249 |         61 |      
  4 |         |
+            19 | 22 | xPrussia    |        3 |    1 |  249 |         61 |      
  4 |         |
+(25 rows)
+</pre></li>
+<li>To perform the balance sampling for independent groups, use the 
'grouping_cols' parameter. Note below that each group (zone) has a different 
count of the classes (mainhue), with some groups not containing some class 
values. <pre class="syntax">
+DROP TABLE IF EXISTS output_table;
+SELECT madlib.balance_sample(
+    'flags',          -- Source table
+    'output_table',   -- Output table
+    'mainhue',        -- Class column
+    NULL,             -- Uniform
+    NULL,             -- Output table size
+    'zone'            -- Grouping by zone
+);
+SELECT * FROM output_table ORDER BY zone, mainhue;
+</pre> <pre class="result">
+ __madlib_id__ | id |    name     | landmass | zone | area | population | 
language | colours | mainhue
+---------------+----+-------------+----------+------+------+------------+----------+---------+---------
+             6 |  8 | Greece      |        3 |    1 |  132 |         10 |      
  6 |       2 | blue
+             5 |  8 | Greece      |        3 |    1 |  132 |         10 |      
  6 |       2 | blue
+             8 | 17 | Sweden      |        3 |    1 |  450 |          8 |      
  6 |       2 | blue
+             7 |  8 | Greece      |        3 |    1 |  132 |         10 |      
  6 |       2 | blue
+             2 |  7 | Denmark     |        3 |    1 |   43 |          5 |      
  6 |       2 | red
+             1 |  6 | China       |        5 |    1 | 9561 |       1008 |      
  7 |       2 | red
+             4 | 12 | Luxembourg  |        3 |    1 |    3 |          0 |      
  4 |       3 | red
+             3 | 18 | Switzerland |        3 |    1 |   41 |          6 |      
  4 |       2 | red
+             1 |  2 | Australia   |        6 |    2 | 7690 |         15 |      
  1 |       3 | blue
+             1 |  1 | Argentina   |        2 |    3 | 2777 |         28 |      
  2 |       2 | blue
+             2 |  4 | Brazil      |        2 |    3 | 8512 |        119 |      
  6 |       4 | green
+             6 |  9 | Guatemala   |        1 |    4 |  109 |          8 |      
  2 |       2 | blue
+             5 |  9 | Guatemala   |        1 |    4 |  109 |          8 |      
  2 |       2 | blue
+             4 |  9 | Guatemala   |        1 |    4 |  109 |          8 |      
  2 |       2 | blue
+            12 | 13 | Mexico      |        1 |    4 | 1973 |         77 |      
  2 |       4 | green
+            10 | 13 | Mexico      |        1 |    4 | 1973 |         77 |      
  2 |       4 | green
+            11 | 13 | Mexico      |        1 |    4 | 1973 |         77 |      
  2 |       4 | green
+             1 | 19 | UK          |        3 |    4 |  245 |         56 |      
  1 |       3 | red
+             3 |  5 | Canada      |        1 |    4 | 9976 |         24 |      
  1 |       2 | red
+             2 | 15 | Portugal    |        3 |    4 |   92 |         10 |      
  6 |       5 | red
+             8 | 20 | USA         |        1 |    4 | 9363 |        231 |      
  1 |       3 | white
+             7 | 20 | USA         |        1 |    4 | 9363 |        231 |      
  1 |       3 | white
+             9 | 10 | Ireland     |        3 |    4 |   70 |          3 |      
  1 |       3 | white
+(23 rows)
+</pre></li>
+<li>Grouping can be used with class size specification as well. Note below 
that 'blue=&lt;Integer&gt;' is the only valid class value since 'blue' is the 
only class value that is present in each group. Further, 'blue=8' will be split 
between the four groups, resulting in two blue rows for each group. <pre 
class="syntax">
+DROP TABLE IF EXISTS output_table;
+SELECT madlib.balance_sample(
+    'flags',          -- Source table
+    'output_table',   -- Output table
+    'mainhue',        -- Class column
+    'blue=8',         -- Specified class value size. Rest of the values are 
outputed as is.
+    NULL,             -- Output table size
+    'zone'            -- Group by zone
+);
+SELECT * FROM output_table ORDER BY zone, mainhue;
+</pre> <pre class="result">
+ __madlib_id__ | id |    name     | landmass | zone | area | population | 
language | colours | mainhue
+---------------+----+-------------+----------+------+------+------------+----------+---------+---------
+             2 | 17 | Sweden      |        3 |    1 |  450 |          8 |      
  6 |       2 | blue
+             1 |  8 | Greece      |        3 |    1 |  132 |         10 |      
  6 |       2 | blue
+             3 |  3 | Austria     |        3 |    1 |   84 |          8 |      
  4 |       2 | red
+             5 |  7 | Denmark     |        3 |    1 |   43 |          5 |      
  6 |       2 | red
+             4 |  6 | China       |        5 |    1 | 9561 |       1008 |      
  7 |       2 | red
+             8 | 18 | Switzerland |        3 |    1 |   41 |          6 |      
  4 |       2 | red
+             7 | 14 | Norway      |        3 |    1 |  324 |          4 |      
  6 |       3 | red
+             6 | 12 | Luxembourg  |        3 |    1 |    3 |          0 |      
  4 |       3 | red
+             1 |  2 | Australia   |        6 |    2 | 7690 |         15 |      
  1 |       3 | blue
+             2 |  2 | Australia   |        6 |    2 | 7690 |         15 |      
  1 |       3 | blue
+             1 |  1 | Argentina   |        2 |    3 | 2777 |         28 |      
  2 |       2 | blue
+             2 |  1 | Argentina   |        2 |    3 | 2777 |         28 |      
  2 |       2 | blue
+             3 |  4 | Brazil      |        2 |    3 | 8512 |        119 |      
  6 |       4 | green
+             2 |  9 | Guatemala   |        1 |    4 |  109 |          8 |      
  2 |       2 | blue
+             1 |  9 | Guatemala   |        1 |    4 |  109 |          8 |      
  2 |       2 | blue
+             5 | 11 | Jamaica     |        1 |    4 |   11 |          2 |      
  1 |       3 | green
+             6 | 13 | Mexico      |        1 |    4 | 1973 |         77 |      
  2 |       4 | green
+             3 |  5 | Canada      |        1 |    4 | 9976 |         24 |      
  1 |       2 | red
+             7 | 15 | Portugal    |        3 |    4 |   92 |         10 |      
  6 |       5 | red
+             8 | 16 | Spain       |        3 |    4 |  505 |         38 |      
  2 |       2 | red
+             9 | 19 | UK          |        3 |    4 |  245 |         56 |      
  1 |       3 | red
+            10 | 20 | USA         |        1 |    4 | 9363 |        231 |      
  1 |       3 | white
+             4 | 10 | Ireland     |        3 |    4 |   70 |          3 |      
  1 |       3 | white
+(23 rows)
+</pre></li>
+</ol>
+<p><a class="anchor" id="literature"></a></p><dl class="section 
user"><dt>Literature</dt><dd></dd></dl>
+<p>[1] Object naming in PostgreSQL <a 
href="https://www.postgresql.org/docs/current/static/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS";>https://www.postgresql.org/docs/current/static/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS</a></p>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
Topics</dt><dd></dd></dl>
+<p>File <a class="el" href="balance__sample_8sql__in.html" title="SQL 
functions for balanced data sets sampling. ">balance_sample.sql_in</a> for list 
of functions and usage. </p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Wed May 2 2018 13:00:11 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
+</div>
+</body>
+</html>

[32/51] [partial] madlib-site git commit: Doc: Add v1.14 documentation

Reply via email to