http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/b5b51c69/docs/v1.11/group__grp__assoc__rules.html
----------------------------------------------------------------------
diff --git a/docs/v1.11/group__grp__assoc__rules.html 
b/docs/v1.11/group__grp__assoc__rules.html
new file mode 100644
index 0000000..47b5923
--- /dev/null
+++ b/docs/v1.11/group__grp__assoc__rules.html
@@ -0,0 +1,362 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Apriori Algorithm</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.incubator.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.incubator.apache.org";><img 
alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ 
></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.11</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__assoc__rules.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Apriori Algorithm<div class="ingroups"><a class="el" 
href="group__grp__unsupervised.html">Unsupervised Learning</a> &raquo; <a 
class="el" href="group__grp__association__rules.html">Association 
Rules</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> <ul>
+<li>
+<a href="#rules">Rules</a> </li>
+<li>
+<a href="#algorithm">Apriori Algorithm</a> </li>
+<li>
+<a href="#syntax">Function Syntax</a> </li>
+<li>
+<a href="#examples">Examples</a> </li>
+<li>
+<a href="#notes">Notes</a> </li>
+<li>
+<a href="#literature">Literature</a> </li>
+<li>
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>This module implements the association rules data mining technique on a transactional data set. Given a table name, the relevant column names, and minimum support and confidence values, this function generates all single and multidimensional association rules that meet the minimum thresholds.</p>
+<p>Association rule mining is a widely used technique for discovering 
relationships between variables in a large data set (e.g., items in a store 
that are commonly purchased together). The classic market basket analysis 
example using association rules is the "beer and diapers" rule. According to 
data mining urban legend, a study of customer purchase behavior in a 
supermarket found that men often purchased beer and diapers together. After 
making this discovery, the managers strategically placed beer and diapers 
closer together on the shelves and saw a dramatic increase in sales. In 
addition to market basket analysis, association rules are also used in 
bioinformatics, web analytics, and several other fields.</p>
+<p>This type of data mining algorithm uses transactional data. Every transaction event has a unique identifier, and each transaction consists of a set of items (an itemset). Purchases are treated as binary (an item either was or was not purchased), and this implementation does not take into account the quantity of each item. The MADlib association rules function assumes that the data is stored in two columns, with one item and transaction id per row. Transactions with multiple items span multiple rows, one row per item.</p>
+<pre>
+    trans_id | product
+    ---------+---------
+           1 | 1
+           1 | 2
+           1 | 3
+           1 | 4
+           2 | 3
+           2 | 4
+           2 | 5
+           3 | 1
+           3 | 4
+           3 | 6
+    ...
+</pre><p><a class="anchor" id="rules"></a></p><dl class="section 
user"><dt>Rules</dt><dd></dd></dl>
+<p>Association rules take the form "If X, then Y", where X and Y are non-empty 
itemsets. X and Y are called the antecedent and consequent, or the 
left-hand-side and right-hand-side, of the rule respectively. Using our 
previous example, the association rule may state "If {diapers}, then {beer}" 
with .2 support and .85 confidence.</p>
+<p>The following metrics are defined for any given itemset "X".</p><ul>
+<li>Count: The number of transactions that contain X</li>
+<li>Support: The ratio of transactions that contain X to all transactions, T 
<p class="formulaDsp">
+<img class="formulaDsp" alt="\[ S (X) = \frac{Total X}{Total transactions} \]" 
src="form_0.png"/>
+</p>
+</li>
+</ul>
+<p>Given any association rule "If X, then Y", the association rules function 
will also calculate the following metrics:</p><ul>
+<li>Count: The number of transactions that contain X,Y</li>
+<li>Support: The ratio of transactions that contain X,Y to all transactions, T 
<p class="formulaDsp">
+<img class="formulaDsp" alt="\[ S (X \Rightarrow Y) = \frac{Total(X \cup 
Y)}{Total transactions} \]" src="form_1.png"/>
+</p>
+</li>
+<li>Confidence: The ratio of transactions that contain <img class="formulaInl" 
alt="$ X,Y $" src="form_2.png"/> to transactions that contain <img 
class="formulaInl" alt="$ X $" src="form_3.png"/>. One could view this metric 
as the conditional probability of <img class="formulaInl" alt="$ Y $" 
src="form_4.png"/> , given <img class="formulaInl" alt="$ X $" 
src="form_3.png"/> . <img class="formulaInl" alt="$ P(Y|X) $" 
src="form_5.png"/> <p class="formulaDsp">
+<img class="formulaDsp" alt="\[ C (X \Rightarrow Y) = \frac{s(X \cap Y 
)}{s(X)} \]" src="form_6.png"/>
+</p>
+</li>
+<li>Lift: The ratio of observed support of <img class="formulaInl" alt="$ X,Y 
$" src="form_2.png"/> to the expected support of <img class="formulaInl" alt="$ 
X,Y $" src="form_2.png"/> , assuming <img class="formulaInl" alt="$ X $" 
src="form_3.png"/> and <img class="formulaInl" alt="$ Y $" src="form_4.png"/> 
are independent. <p class="formulaDsp">
+<img class="formulaDsp" alt="\[ L (X \Rightarrow Y) = \frac{s(X \cap Y )}{s(X) 
\cdot s(Y)} \]" src="form_7.png"/>
+</p>
+</li>
+<li><p class="startli">Conviction: The ratio of expected support of <img 
class="formulaInl" alt="$ X $" src="form_3.png"/> occurring without <img 
class="formulaInl" alt="$ Y $" src="form_4.png"/> assuming <img 
class="formulaInl" alt="$ X $" src="form_3.png"/> and <img class="formulaInl" 
alt="$ \neg Y $" src="form_8.png"/> are independent, to the observed support of 
<img class="formulaInl" alt="$ X $" src="form_3.png"/> occuring without <img 
class="formulaInl" alt="$ Y $" src="form_4.png"/>. If conviction is greater 
than 1, then this metric shows that incorrect predictions ( <img 
class="formulaInl" alt="$ X \Rightarrow Y $" src="form_9.png"/> ) occur less 
often than if these two actions were independent. This metric can be viewed as 
the ratio that the association rule would be incorrect if the actions were 
independent (i.e. a conviction of 1.5 indicates that if the variables were 
independent, this rule would be incorrect 50% more often.)</p>
+<p class="formulaDsp">
+<img class="formulaDsp" alt="\[ Conv (X \Rightarrow Y) = \frac{1 - S(Y)}{1 - 
C(X \Rightarrow Y)} \]" src="form_10.png"/>
+</p>
+</li>
+</ul>
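+<p>As an informal illustration (the query below is not part of the MADlib API and assumes the <code>test_data</code> table used in the examples later on this page), these metrics can be computed for the fixed rule "If {diapers}, then {beer}" directly from raw transaction counts:</p>
+<pre class="example">
+-- Hedged sketch only: support, confidence and lift for {diapers} => {beer}
+WITH t  AS (SELECT COUNT(DISTINCT trans_id) AS n   FROM test_data),
+     x  AS (SELECT COUNT(DISTINCT trans_id) AS cnt FROM test_data WHERE product = 'diapers'),
+     y  AS (SELECT COUNT(DISTINCT trans_id) AS cnt FROM test_data WHERE product = 'beer'),
+     xy AS (SELECT COUNT(*) AS cnt
+            FROM (SELECT trans_id
+                  FROM test_data
+                  WHERE product IN ('diapers', 'beer')
+                  GROUP BY trans_id
+                  HAVING COUNT(DISTINCT product) = 2) both_items)
+SELECT xy.cnt::float8 / t.n    AS support,      -- S(X => Y)
+       xy.cnt::float8 / x.cnt  AS confidence,   -- C(X => Y)
+       (xy.cnt::float8 / t.n) / ((x.cnt::float8 / t.n) * (y.cnt::float8 / t.n)) AS lift
+FROM t, x, y, xy;
+</pre>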
+<p><a class="anchor" id="algorithm"></a></p><dl class="section 
user"><dt>Apriori Algorithm</dt><dd></dd></dl>
+<p>Although there are many algorithms that generate association rules, the classic algorithm is Apriori [1], which we have implemented in this module. It is a breadth-first search, as opposed to depth-first searches like Eclat. Frequent itemsets of order <img class="formulaInl" alt="$ n $" src="form_11.png"/> are generated from frequent itemsets of order <img class="formulaInl" alt="$ n - 1 $" src="form_12.png"/>. The algorithm exploits the downward closure property: every subset of a frequent itemset must itself be frequent. There are two steps in this algorithm: generating frequent itemsets, and using these itemsets to construct the association rules. A simplified version of the algorithm is as follows, and assumes minimum levels of support and confidence are provided:</p>
+<p><em>Initial</em> <em>step</em> </p><ol type="1">
+<li>Generate all itemsets of order 1.</li>
+<li>Eliminate itemsets that have support less than minimum support.</li>
+</ol>
+<p><em>Main</em> <em>algorithm</em> </p><ol type="1">
+<li>For <img class="formulaInl" alt="$ n \ge 2 $" src="form_13.png"/>, generate candidate itemsets of order <img class="formulaInl" alt="$ n $" src="form_11.png"/> by combining the frequent itemsets of order <img class="formulaInl" alt="$ n - 1 $" src="form_12.png"/>. This is done by taking the union of two such itemsets that differ in exactly one item (see the sketch after this list).</li>
+<li>Eliminate itemsets that have (n-1) order subsets with insufficient 
support.</li>
+<li>Eliminate itemsets with insufficient support.</li>
+<li>Repeat until no further itemsets can be generated, or the maximum itemset size is exceeded.</li>
+</ol>
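+<p>The candidate-generation step can be pictured with a short, hedged SQL sketch (not part of the MADlib API; it assumes the <code>test_data</code> table and the 0.25 minimum support used in the examples later on this page): frequent 1-itemsets are joined with themselves to form candidate 2-itemsets, which are then filtered by support.</p>
+<pre class="example">
+-- Hedged sketch only: one Apriori iteration (order 1 to order 2)
+WITH total AS (
+    SELECT COUNT(DISTINCT trans_id) AS n FROM test_data
+), f1 AS (                                    -- frequent 1-itemsets
+    SELECT product
+    FROM test_data
+    GROUP BY product
+    HAVING COUNT(DISTINCT trans_id) >= 0.25 * (SELECT n FROM total)
+)
+SELECT a.product AS item1,
+       b.product AS item2,
+       COUNT(DISTINCT ta.trans_id) AS cnt     -- transactions containing both items
+FROM f1 a
+JOIN f1 b ON a.product < b.product            -- union of itemsets differing in one item
+JOIN test_data ta ON ta.product = a.product
+JOIN test_data tb ON tb.product = b.product AND tb.trans_id = ta.trans_id
+GROUP BY a.product, b.product
+HAVING COUNT(DISTINCT ta.trans_id) >= 0.25 * (SELECT n FROM total);
+</pre>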
+<p><em>Association</em> <em>rule</em> <em>generation</em> </p>
+<p>Given a frequent itemset <img class="formulaInl" alt="$ A $" 
src="form_14.png"/> generated from the Apriori algorithm, and all subsets <img 
class="formulaInl" alt="$ B $" src="form_15.png"/> , we generate rules such 
that <img class="formulaInl" alt="$ B \Rightarrow (A - B) $" 
src="form_16.png"/> meets minimum confidence requirements.</p>
+<p><a class="anchor" id="syntax"></a></p><dl class="section user"><dt>Function 
Syntax</dt><dd>Association rules has the following syntax: <pre class="syntax">
+assoc_rules( support,
+             confidence,
+             tid_col,
+             item_col,
+             input_table,
+             output_schema,
+             verbose,
+             max_itemset_size
+           );</pre> This generates all association rules that satisfy the 
specified minimum <em>support</em> and <em>confidence</em>.</dd></dl>
+<p><b>Arguments</b> </p><dl class="arglist">
+<dt>support </dt>
+<dd><p class="startdd">Minimum level of support needed for each itemset to be 
included in result.</p>
+<p class="enddd"></p>
+</dd>
+<dt>confidence </dt>
+<dd><p class="startdd">Minimum level of confidence needed for each rule to be 
included in result.</p>
+<p class="enddd"></p>
+</dd>
+<dt>tid_col </dt>
+<dd><p class="startdd">Name of the column storing the transaction ids.</p>
+<p class="enddd"></p>
+</dd>
+<dt>item_col </dt>
+<dd><p class="startdd">Name of the column storing the products.</p>
+<p class="enddd"></p>
+</dd>
+<dt>input_table </dt>
+<dd><p class="startdd">Name of the table containing the input data.</p>
+<p>The input data is expected to be of the following form: 
</p><pre>{TABLE|VIEW} <em>input_table</em> (
+    <em>trans_id</em> INTEGER,
+    <em>product</em> TEXT
+)</pre><p>The algorithm maps the product names to consecutive integer ids 
starting at 1. If they are already structured this way, then the ids will not 
change. </p>
+<p class="enddd"></p>
+</dd>
+<dt>output_schema </dt>
+<dd><p class="startdd">The name of the schema where the final results will be 
stored. The schema must be created before calling the function. Alternatively, 
use <code>NULL</code> to output to the current schema.</p>
+<p>The results containing the rules, support, count, confidence, lift, and 
conviction are stored in the table <code>assoc_rules</code> in the schema 
specified by <code>output_schema</code>.</p>
+<p>The table has the following columns. </p><table class="output">
+<tr>
+<th>ruleid </th><td>integer  </td></tr>
+<tr>
+<th>pre </th><td>text  </td></tr>
+<tr>
+<th>post </th><td>text  </td></tr>
+<tr>
+<th>count </th><td>integer  </td></tr>
+<tr>
+<th>support </th><td>double  </td></tr>
+<tr>
+<th>confidence </th><td>double  </td></tr>
+<tr>
+<th>lift </th><td>double  </td></tr>
+<tr>
+<th>conviction </th><td>double  </td></tr>
+</table>
+<p>On Greenplum Database or Apache HAWQ, the table is distributed by the 
<code>ruleid</code> column.</p>
+<p>The <code>pre</code> and <code>post</code> columns are the itemsets of left 
and right hand sides of the association rule respectively. The 
<code>support</code>, <code>confidence</code>, <code>lift</code>, and 
<code>conviction</code> columns are calculated as described earlier. </p>
+<p class="enddd"></p>
+</dd>
+<dt>verbose </dt>
+<dd><p class="startdd">BOOLEAN, default: FALSE. Determines if details are 
printed for each iteration as the algorithm progresses.</p>
+<p class="enddd"></p>
+</dd>
+<dt>max_itemset_size </dt>
+<dd>INTEGER, default: generate itemsets of all sizes. Determines the maximum 
size of frequent itemsets that are used for generating association rules. Must 
be 2 or more. This parameter can be used to reduce run time for data sets where 
itemset size is large.  </dd>
+</dl>
+<p><a class="anchor" id="examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd></dd></dl>
+<p>Let's look at some sample transactional data and generate association 
rules.</p>
+<ol type="1">
+<li>Create an input dataset: <pre class="example">
+DROP TABLE IF EXISTS test_data;
+CREATE TABLE test_data (
+    trans_id INT,
+    product TEXT
+);
+INSERT INTO test_data VALUES (1, 'beer');
+INSERT INTO test_data VALUES (1, 'diapers');
+INSERT INTO test_data VALUES (1, 'chips');
+INSERT INTO test_data VALUES (2, 'beer');
+INSERT INTO test_data VALUES (2, 'diapers');
+INSERT INTO test_data VALUES (3, 'beer');
+INSERT INTO test_data VALUES (3, 'diapers');
+INSERT INTO test_data VALUES (4, 'beer');
+INSERT INTO test_data VALUES (4, 'chips');
+INSERT INTO test_data VALUES (5, 'beer');
+INSERT INTO test_data VALUES (6, 'beer');
+INSERT INTO test_data VALUES (6, 'diapers');
+INSERT INTO test_data VALUES (6, 'chips');
+INSERT INTO test_data VALUES (7, 'beer');
+INSERT INTO test_data VALUES (7, 'diapers');
+</pre></li>
+<li>Let <img class="formulaInl" alt="$ min(support) = .25 $" 
src="form_17.png"/> and <img class="formulaInl" alt="$ min(confidence) = .5 $" 
src="form_18.png"/>, and the output schema is set to <code>NULL</code> 
indicating output to the current schema. In this example we set verbose to TRUE 
so that we have some insight into the progress of the function. We can now generate 
association rules as follows: <pre class="example">
+SELECT * FROM madlib.assoc_rules( .25,            -- Support
+                                  .5,             -- Confidence
+                                  'trans_id',     -- Transaction id col
+                                  'product',      -- Product col
+                                  'test_data',    -- Input data
+                                  NULL,           -- Output schema
+                                  TRUE            -- Verbose output
+                                );
+</pre> Result (iteration details not shown): <pre class="result">
+ output_schema | output_table | total_rules |   total_time    
+---------------+--------------+-------------+-----------------
+ public        | assoc_rules  |           7 | 00:00:00.569254
+(1 row)
+</pre> The association rules are stored in the assoc_rules table: <pre 
class="example">
+SELECT * FROM assoc_rules
+ORDER BY support DESC, confidence DESC;
+</pre> Result: <pre class="result">
+ ruleid |       pre       |      post      | count |      support      |    
confidence     |       lift        |    conviction     
+--------+-----------------+----------------+-------+-------------------+-------------------+-------------------+-------------------
+      2 | {diapers}       | {beer}         |     5 | 0.714285714285714 |       
          1 |                 1 |                 0
+      6 | {beer}          | {diapers}      |     5 | 0.714285714285714 | 
0.714285714285714 |                 1 |                 1
+      5 | {chips}         | {beer}         |     3 | 0.428571428571429 |       
          1 |                 1 |                 0
+      4 | {chips,diapers} | {beer}         |     2 | 0.285714285714286 |       
          1 |                 1 |                 0
+      1 | {chips}         | {diapers,beer} |     2 | 0.285714285714286 | 
0.666666666666667 | 0.933333333333333 | 0.857142857142857
+      7 | {chips}         | {diapers}      |     2 | 0.285714285714286 | 
0.666666666666667 | 0.933333333333333 | 0.857142857142857
+      3 | {beer,chips}    | {diapers}      |     2 | 0.285714285714286 | 
0.666666666666667 | 0.933333333333333 | 0.857142857142857
+(7 rows)
+</pre></li>
+<li>Limit association rules generated from itemsets of size at most 2: <pre 
class="example">
+SELECT * FROM madlib.assoc_rules( .25,            -- Support
+                                  .5,             -- Confidence
+                                  'trans_id',     -- Transaction id col
+                                  'product',      -- Product col
+                                  'test_data',    -- Input data
+                                  NULL,           -- Output schema
+                                  TRUE,           -- Verbose output
+                                  2               -- Max itemset size
+                                );
+</pre> Result (iteration details not shown): <pre class="result">
+ output_schema | output_table | total_rules |   total_time    
+---------------+--------------+-------------+-----------------
+ public        | assoc_rules  |           4 | 00:00:00.565176
+(1 row)
+</pre> The association rules are again stored in the assoc_rules table: <pre 
class="example">
+SELECT * FROM assoc_rules
+ORDER BY support DESC, confidence DESC;
+</pre> Result: <pre class="result">
+ ruleid |    pre    |   post    | count |      support      |    confidence    
 |       lift        |    conviction     
+--------+-----------+-----------+-------+-------------------+-------------------+-------------------+-------------------
+      1 | {diapers} | {beer}    |     5 | 0.714285714285714 |                 
1 |                 1 |                 0
+      2 | {beer}    | {diapers} |     5 | 0.714285714285714 | 
0.714285714285714 |                 1 |                 1
+      3 | {chips}   | {beer}    |     3 | 0.428571428571429 |                 
1 |                 1 |                 0
+      4 | {chips}   | {diapers} |     2 | 0.285714285714286 | 
0.666666666666667 | 0.933333333333333 | 0.857142857142857
+(4 rows)
+</pre></li>
+<li>Post-processing can be done on the output table if you want to filter the results. For example, to keep only rules with a single item on the left-hand side and a particular item on the right-hand side: <pre class="example">
+SELECT * FROM assoc_rules WHERE array_upper(pre,1) = 1 AND post = 
array['beer'];
+</pre> Result: <pre class="result">
+ ruleid |    pre    |  post  | count |      support      | confidence | lift | 
conviction 
+--------+-----------+--------+-------+-------------------+------------+------+------------
+      1 | {diapers} | {beer} |     5 | 0.714285714285714 |          1 |    1 | 
         0
+      3 | {chips}   | {beer} |     3 | 0.428571428571429 |          1 |    1 | 
         0
+(2 rows)
+</pre></li>
+</ol>
+<p><a class="anchor" id="notes"></a></p><dl class="section 
user"><dt>Notes</dt><dd></dd></dl>
+<p>The association rules function always creates a table named 
<code>assoc_rules</code>. Make a copy of this table before running the function 
again if you would like to keep multiple association rule tables.</p>
+<p><a class="anchor" id="literature"></a></p><dl class="section 
user"><dt>Literature</dt><dd></dd></dl>
+<p>[1] <a 
href="https://en.wikipedia.org/wiki/Apriori_algorithm";>https://en.wikipedia.org/wiki/Apriori_algorithm</a></p>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
Topics</dt><dd></dd></dl>
+<p>File <a class="el" href="assoc__rules_8sql__in.html" title="The assoc_rules 
function computes association rules for a given set of data. The data is 
assumed to h...">assoc_rules.sql_in</a> documenting the SQL function. </p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Tue May 16 2017 13:24:38 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/b5b51c69/docs/v1.11/group__grp__association__rules.html
----------------------------------------------------------------------
diff --git a/docs/v1.11/group__grp__association__rules.html 
b/docs/v1.11/group__grp__association__rules.html
new file mode 100644
index 0000000..719bc88
--- /dev/null
+++ b/docs/v1.11/group__grp__association__rules.html
@@ -0,0 +1,133 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Association Rules</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.incubator.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.incubator.apache.org";><img 
alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ 
></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.11</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__association__rules.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="summary">
+<a href="#groups">Modules</a>  </div>
+  <div class="headertitle">
+<div class="title">Association Rules<div class="ingroups"><a class="el" 
href="group__grp__unsupervised.html">Unsupervised Learning</a></div></div>  
</div>
+</div><!--header-->
+<div class="contents">
+<a name="details" id="details"></a><h2 class="groupheader">Detailed 
Description</h2>
+<p>A collection of methods used to uncover interesting patterns in 
transactional datasets. </p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a 
name="groups"></a>
+Modules</h2></td></tr>
+<tr class="memitem:group__grp__assoc__rules"><td class="memItemLeft" 
align="right" valign="top">&#160;</td><td class="memItemRight" 
valign="bottom"><a class="el" href="group__grp__assoc__rules.html">Apriori 
Algorithm</a></td></tr>
+<tr class="memdesc:group__grp__assoc__rules"><td 
class="mdescLeft">&#160;</td><td class="mdescRight">Computes association rules 
for a given set of data. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Tue May 16 2017 13:24:38 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/b5b51c69/docs/v1.11/group__grp__association__rules.js
----------------------------------------------------------------------
diff --git a/docs/v1.11/group__grp__association__rules.js 
b/docs/v1.11/group__grp__association__rules.js
new file mode 100644
index 0000000..e10c849
--- /dev/null
+++ b/docs/v1.11/group__grp__association__rules.js
@@ -0,0 +1,4 @@
+var group__grp__association__rules =
+[
+    [ "Apriori Algorithm", "group__grp__assoc__rules.html", null ]
+];
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/b5b51c69/docs/v1.11/group__grp__bayes.html
----------------------------------------------------------------------
diff --git a/docs/v1.11/group__grp__bayes.html 
b/docs/v1.11/group__grp__bayes.html
new file mode 100644
index 0000000..aab5b5b
--- /dev/null
+++ b/docs/v1.11/group__grp__bayes.html
@@ -0,0 +1,482 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Naive Bayes Classification</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.incubator.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.incubator.apache.org";><img 
alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ 
></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.11</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__bayes.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Naive Bayes Classification<div class="ingroups"><a 
class="el" href="group__grp__early__stage.html">Early Stage 
Development</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> <ul>
+<li>
+<a href="#train">Training Function(s)</a> </li>
+<li>
+<a href="#classify">Classify Function(s)</a> </li>
+<li>
+<a href="#probabilities">Probabilities Function(s)</a> </li>
+<li>
+<a href="#adhoc">Ad Hoc Computation</a> </li>
+<li>
+<a href="#notes">Implementation Notes</a> </li>
+<li>
+<a href="#examples">Examples</a> </li>
+<li>
+<a href="#background">Technical Background</a> </li>
+<li>
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><dl class="section warning"><dt>Warning</dt><dd><em> This MADlib method 
is still in early stage development. There may be some issues that will be 
addressed in a future version. Interface and implementation is subject to 
change. </em></dd></dl>
+<p>Naive Bayes refers to a stochastic model where all independent variables 
<img class="formulaInl" alt="$ a_1, \dots, a_n $" src="form_19.png"/> (often 
referred to as attributes in this context) independently contribute to the 
probability that a data point belongs to a certain class <img 
class="formulaInl" alt="$ c $" src="form_20.png"/>.</p>
+<p>Naive Bayes classification estimates feature probabilities and class priors using maximum likelihood or Laplacian smoothing. For numeric attributes, Gaussian smoothing can be used to estimate the feature probabilities. These parameters are then used to classify new data.</p>
+<p><a class="anchor" id="train"></a></p><dl class="section user"><dt>Training 
Function(s)</dt><dd></dd></dl>
+<p>For data with only categorical attributes, precompute feature probabilities 
and class priors using the following function:</p>
+<pre class="syntax">
+create_nb_prepared_data_tables ( trainingSource,
+                                 trainingClassColumn,
+                                 trainingAttrColumn,
+                                 numAttrs,
+                                 featureProbsName,
+                                 classPriorsName
+                               )
+</pre><p>For data containing both categorical and numeric attributes, use the 
following form to precompute the Gaussian parameters (mean and variance) for 
numeric attributes alongside the feature probabilities for categorical 
attributes and class priors.</p>
+<pre class="syntax">
+create_nb_prepared_data_tables ( trainingSource,
+                                 trainingClassColumn,
+                                 trainingAttrColumn,
+                                 numericAttrsColumnIndices,
+                                 numAttrs,
+                                 featureProbsName,
+                                 numericAttrParamsName,
+                                 classPriorsName
+                               )
+</pre><p>The <em>trainingSource</em> is expected to be of the following form: 
</p><pre>{TABLE|VIEW} <em>trainingSource</em> (
+    ...
+    <em>trainingClassColumn</em> INTEGER,
+    <em>trainingAttrColumn</em> INTEGER[] OR NUMERIC[] OR FLOAT8[],
+    ...
+)</pre><p><em>numericAttrsColumnIndices</em> should be of type TEXT, specified 
as an array of indices (starting from 1) in the <em>trainingAttrColumn</em> 
attributes-array that correspond to numeric attributes.</p>
+<p>The two output tables are:</p><ul>
+<li><em>featureProbsName</em> &ndash; stores feature probabilities</li>
+<li><em>classPriorsName</em> &ndash; stores the class priors</li>
+</ul>
+<p>In addition to the above, if the function specifying numeric attributes is 
used, an additional table <em>numericAttrParamsName</em> is created which 
stores the Gaussian parameters for the numeric attributes.</p>
+<p><a class="anchor" id="classify"></a></p><dl class="section 
user"><dt>Classify Function(s)</dt><dd></dd></dl>
+<p>Perform Naive Bayes classification: </p><pre class="syntax">
+create_nb_classify_view ( featureProbsName,
+                          classPriorsName,
+                          classifySource,
+                          classifyKeyColumn,
+                          classifyAttrColumn,
+                          numAttrs,
+                          destName
+                        )
+</pre><p>For data with numeric attributes, use the following version:</p>
+<pre class="syntax">
+create_nb_classify_view ( featureProbsName,
+                          classPriorsName,
+                          classifySource,
+                          classifyKeyColumn,
+                          classifyAttrColumn,
+                          numAttrs,
+                          numericAttrParamsName,
+                          destName
+                        )
+</pre><p>The <b>data to classify</b> is expected to be of the following form: 
</p><pre>{TABLE|VIEW} <em>classifySource</em> (
+    ...
+    <em>classifyKeyColumn</em> ANYTYPE,
+    <em>classifyAttrColumn</em> INTEGER[],
+    ...
+)</pre><p>This function creates the view <code><em>destName</em></code> 
mapping <em>classifyKeyColumn</em> to the Naive Bayes classification. </p><pre 
class="result">
+key | nb_classification
+&#160;---+------------------
+...
+</pre><p><a class="anchor" id="probabilities"></a></p><dl class="section 
user"><dt>Probabilities Function(s)</dt><dd></dd></dl>
+<p>Compute Naive Bayes probabilities. </p><pre class="syntax">
+create_nb_probs_view( featureProbsName,
+                      classPriorsName,
+                      classifySource,
+                      classifyKeyColumn,
+                      classifyAttrColumn,
+                      numAttrs,
+                      destName
+                    )
+</pre><p>For data with numeric attributes, use the following version:</p>
+<pre class="syntax">
+create_nb_probs_view( featureProbsName,
+                      classPriorsName,
+                      classifySource,
+                      classifyKeyColumn,
+                      classifyAttrColumn,
+                      numAttrs,
+                      numericAttrParamsName,
+                      destName
+                    )
+</pre><p>This creates the view <code><em>destName</em></code> mapping 
<em>classifyKeyColumn</em> and every single class to the Naive Bayes 
probability: </p><pre class="result">
+key | class | nb_prob
+&#160;---+-------+--------
+...
+</pre><p><a class="anchor" id="adhoc"></a></p><dl class="section user"><dt>Ad 
Hoc Computation Function</dt><dd></dd></dl>
+<p>With ad hoc execution (no precomputation), the functions <a class="el" 
href="bayes_8sql__in.html#a798402280fc6db710957ae3ab58767e0" title="Create a 
view with columns (key, nb_classification) ">create_nb_classify_view()</a> and 
<a class="el" href="bayes_8sql__in.html#a163afffd0c845d325f060f74bcf02243" 
title="Create view with columns (key, class, nb_prob) 
">create_nb_probs_view()</a> can be used in an ad-hoc fashion without the 
precomputation step. In this case, replace the function arguments</p>
+<pre>'<em>featureProbsName</em>', '<em>classPriorsName</em>'</pre><p> with </p><pre>'<em>trainingSource</em>', '<em>trainingClassColumn</em>', '<em>trainingAttrColumn</em>'</pre><p> for data without any numeric attributes, and with </p><pre>'<em>trainingSource</em>', '<em>trainingClassColumn</em>', '<em>trainingAttrColumn</em>', '<em>numericAttrsColumnIndices</em>'</pre><p> for data containing numeric attributes as well.</p>
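+<p>As a hedged illustration of the ad hoc form (the destination view name <code>nb_classify_view_adhoc</code> is chosen here for illustration; the table and column names come from the first example below), the categorical-only classification can be run directly against the training table:</p>
+<pre class="example">
+-- Sketch only: ad hoc classification without the precomputation step,
+-- following the argument substitution described above.
+SELECT madlib.create_nb_classify_view( 'training',               -- trainingSource
+                                        'class',                  -- trainingClassColumn
+                                        'attributes',             -- trainingAttrColumn
+                                        'toclassify',             -- classifySource
+                                        'id',                     -- classifyKeyColumn
+                                        'attributes',             -- classifyAttrColumn
+                                        3,                        -- numAttrs
+                                        'nb_classify_view_adhoc'  -- destName
+                                      );
+</pre>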
+<p><a class="anchor" id="notes"></a></p><dl class="section 
user"><dt>Implementation Notes</dt><dd><ul>
+<li>The probabilities computed on PostgreSQL and on Greenplum Database may differ slightly because of the nature of floating point computation. Usually this is not important. However, if a data point has <p class="formulaDsp">
+<img class="formulaDsp" alt="\[ P(C=c_i \mid A) \approx P(C=c_j \mid A) \]" src="form_21.png"/>
+</p>
+ for two classes, this data point might be classified into different classes on PostgreSQL and Greenplum. This leads to differences in classifications between PostgreSQL and Greenplum for some data sets, but it should not affect the quality of the results.</li>
+<li>When two classes have equal and highest probability among all classes, the 
classification result is an array of these two classes, but the order of the 
two classes is random.</li>
+<li>The current implementation of Naive Bayes classification is suitable for discontinuous (categorical) attributes as well as continuous (numeric) attributes (see the substitution example after this list).<br />
+For continuous data, a typical assumption, usually used for small datasets, is 
that the continuous values associated with each class are distributed according 
to a Gaussian distribution, and the probabilities <img class="formulaInl" 
alt="$ P(A_i = a \mid C=c) $" src="form_22.png"/> are estimated using the 
Gaussian Distribution formula: <p class="formulaDsp">
+<img class="formulaDsp" alt="\[ P(A_i=a \mid C=c) = 
\frac{1}{\sqrt{2\pi\sigma^{2}_c}}exp\left(-\frac{(a-\mu_c)^{2}}{2\sigma^{2}_c}\right)
 \]" src="form_23.png"/>
+</p>
+ where <img class="formulaInl" alt="$\mu_c$" src="form_24.png"/> and <img 
class="formulaInl" alt="$\sigma^{2}_c$" src="form_25.png"/> are the population 
mean and variance of the attribute for the class <img class="formulaInl" 
alt="$c$" src="form_26.png"/>.<br />
+Another common technique for handling continuous values, which is better for 
large data sets, is to use binning to discretize the values, and convert the 
continuous data into categorical bins. This approach is currently not 
implemented.</li>
+<li>One can provide floating point data to the Naive Bayes classification function. If the corresponding attribute index is not specified in <em>numericAttrsColumnIndices</em>, floating point numbers will be used as symbolic substitutions for categorical data. In this case, the classification works best if there are sufficient data points for each distinct floating point value. However, if floating point numbers are intended as continuous data but the attribute is not marked as numeric in <em>numericAttrsColumnIndices</em>, no warning is raised and the result may not be as expected.</li>
+</ul>
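+<p>For instance, substituting the class-1 height parameters from the <code>numeric_attr_params</code> example below (mean 5.855, variance approximately 0.035) into the Gaussian formula above gives, for a height of 5.8 (shown symbolically, not evaluated here):</p>
+<pre class="example">
+\[ P(height = 5.8 \mid sex = 1)
+     = \frac{1}{\sqrt{2 \pi \cdot 0.035}}
+       \exp\left( -\frac{(5.8 - 5.855)^2}{2 \cdot 0.035} \right) \]
+</pre>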
+</dd></dl>
+<p><a class="anchor" id="examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd></dd></dl>
+<p>The following is an extremely simplified example of the first (categorical-only) option above, which can be verified by hand.</p>
+<ol type="1">
+<li>The training and the classification data. <pre class="example">
+SELECT * FROM training;
+</pre> Result: <pre class="result">
+ id | class | attributes
+&#160;---+-------+------------
+  1 |     1 | {1,2,3}
+  2 |     1 | {1,2,1}
+  3 |     1 | {1,4,3}
+  4 |     2 | {1,2,2}
+  5 |     2 | {0,2,2}
+  6 |     2 | {0,1,3}
+(6 rows)
+</pre> <pre class="example">
+SELECT * FROM toclassify;
+</pre> Result: <pre class="result">
+ id | attributes
+&#160;---+------------
+  1 | {0,2,1}
+  2 | {1,2,3}
+(2 rows)
+</pre></li>
+<li>Precompute feature probabilities and class priors. <pre class="example">
+SELECT madlib.create_nb_prepared_data_tables( 'training',
+                                              'class',
+                                              'attributes',
+                                              3,
+                                              'nb_feature_probs',
+                                              'nb_class_priors'
+                                            );
+</pre></li>
+<li>Optionally check the contents of the precomputed tables. <pre 
class="example">
+SELECT * FROM nb_class_priors;
+</pre> Result: <pre class="result">
+ class | class_cnt | all_cnt
+&#160;------+-----------+---------
+     1 |         3 |       6
+     2 |         3 |       6
+(2 rows)
+</pre> <pre class="example">
+SELECT * FROM nb_feature_probs;
+</pre> Result: <pre class="result">
+ class | attr | value | cnt | attr_cnt
+&#160;------+------+-------+-----+----------
+     1 |    1 |     0 |   0 |        2
+     1 |    1 |     1 |   3 |        2
+     1 |    2 |     1 |   0 |        3
+     1 |    2 |     2 |   2 |        3
+...
+</pre></li>
+<li>Create the view with Naive Bayes classification and check the results. 
<pre class="example">
+SELECT madlib.create_nb_classify_view( 'nb_feature_probs',
+                                       'nb_class_priors',
+                                       'toclassify',
+                                       'id',
+                                       'attributes',
+                                       3,
+                                       'nb_classify_view_fast'
+                                     );
+&#160;
+SELECT * FROM nb_classify_view_fast;
+</pre> Result: <pre class="result">
+ key | nb_classification
+&#160;----+-------------------
+   1 | {2}
+   2 | {1}
+(2 rows)
+</pre></li>
+<li>Look at the probabilities for each class (note that we use "Laplacian smoothing"): <pre class="example">
+SELECT madlib.create_nb_probs_view( 'nb_feature_probs',
+                                    'nb_class_priors',
+                                    'toclassify',
+                                    'id',
+                                    'attributes',
+                                    3,
+                                    'nb_probs_view_fast'
+                                  );
+&#160;
+SELECT * FROM nb_probs_view_fast;
+</pre> Result: <pre class="result">
+ key | class | nb_prob
+&#160;----+-------+---------
+   1 |     1 |     0.4
+   1 |     2 |     0.6
+   2 |     1 |    0.75
+   2 |     2 |    0.25
+(4 rows)
+</pre></li>
+</ol>
+<p>The following is an example of using a dataset with both numeric and categorical attributes.</p>
+<ol type="1">
+<li>The training and the classification data. Attributes: {height (numeric), weight (numeric), shoe size (categorical)}; class: {sex (1=male, 2=female)}. <pre class="example">
+SELECT * FROM gaussian_data;
+</pre> Result: <pre class="result">
+ id | sex |  attributes   
+&#160;----+-----+---------------
+  1 |   1 | {6,180,12}
+  2 |   1 | {5.92,190,12}
+  3 |   1 | {5.58,170,11}
+  4 |   1 | {5.92,165,11}
+  5 |   2 | {5,100,6}
+  6 |   2 | {5.5,150,6}
+  7 |   2 | {5.42,130,7}
+  8 |   2 | {5.75,150,8}
+(8 rows)
+</pre> <pre class="example">
+SELECT * FROM gaussian_test;
+</pre> Result: <pre class="result">
+ id | sex |  attributes  
+----+-----+--------------
+  9 |   1 | {5.8,180,11}
+ 10 |   2 | {5,160,6}
+(2 rows)
+</pre></li>
+<li>Precompute feature probabilities and class priors. <pre class="example">
+SELECT madlib.create_nb_prepared_data_tables( 'gaussian_data',
+                                              'sex',
+                                              'attributes',
+                                              'ARRAY[1,2]',
+                                              3,
+                                              'categ_feature_probs',
+                                              'numeric_attr_params',
+                                              'class_priors'
+                                            );
+</pre></li>
+<li>Optionally check the contents of the precomputed tables. <pre 
class="example">
+SELECT * FROM class_priors;
+</pre> Result: <pre class="result">
+class | class_cnt | all_cnt 
+&#160;-------+-----------+---------
+     1 |         4 |       8
+     2 |         4 |       8
+(2 rows)
+</pre> <pre class="example">
+SELECT * FROM categ_feature_probs;
+</pre> Result: <pre class="result">
+ class | attr | value | cnt | attr_cnt 
+-------+------+-------+-----+----------
+     2 |    3 |     6 |   2 |        5
+     1 |    3 |    12 |   2 |        5
+     2 |    3 |     7 |   1 |        5
+     1 |    3 |    11 |   2 |        5
+     2 |    3 |     8 |   1 |        5
+     2 |    3 |    12 |   0 |        5
+     1 |    3 |     6 |   0 |        5
+     2 |    3 |    11 |   0 |        5
+     1 |    3 |     8 |   0 |        5
+     1 |    3 |     7 |   0 |        5
+(10 rows)
+</pre> <pre class="example">
+SELECT * FROM numeric_attr_params;
+</pre> Result: <pre class="result">
+class | attr |      attr_mean       |        attr_var        
+-------+------+----------------------+------------------------
+     1 |    1 |   5.8550000000000000 | 0.03503333333333333333
+     1 |    2 | 176.2500000000000000 |   122.9166666666666667
+     2 |    1 |   5.4175000000000000 | 0.09722500000000000000
+     2 |    2 | 132.5000000000000000 |   558.3333333333333333
+(4 rows)
+</pre></li>
+<li>Create the view with Naive Bayes classification and check the results. 
<pre class="example">
+SELECT madlib.create_nb_classify_view( 'categ_feature_probs',
+                                       'class_priors',
+                                       'gaussian_test',
+                                       'id',
+                                       'attributes',
+                                       3,
+                                       'numeric_attr_params',
+                                       'classify_view'
+                                     );
+&#160;
+SELECT * FROM classify_view;
+</pre> Result: <pre class="result">
+ key | nb_classification
+&#160;----+-------------------
+   9 | {1}
+   10 | {2}
+(2 rows)
+</pre></li>
+<li>Look at the probabilities for each class <pre class="example">
+SELECT madlib.create_nb_probs_view( 'categ_feature_probs',
+                                       'class_priors',
+                                       'gaussian_test',
+                                       'id',
+                                       'attributes',
+                                       3,
+                                       'numeric_attr_params',
+                                       'probs_view'
+                                  );
+&#160;
+SELECT * FROM probs_view;
+</pre> Result: <pre class="result">
+ key | class |       nb_prob        
+-----+-------+----------------------
+   9 |     1 |    0.993556745948775
+   9 |     2 |  0.00644325405122553
+  10 |     1 | 5.74057538627122e-05
+  10 |     2 |    0.999942594246137
+(4 rows)
+</pre></li>
+</ol>
+<p><a class="anchor" id="background"></a></p><dl class="section 
user"><dt>Technical Background</dt><dd></dd></dl>
+<p>In detail, <b>Bayes'</b> theorem states that </p><p class="formulaDsp">
+<img class="formulaDsp" alt="\[ \Pr(C = c \mid A_1 = a_1, \dots, A_n = a_n) = 
\frac{\Pr(C = c) \cdot \Pr(A_1 = a_1, \dots, A_n = a_n \mid C = c)} {\Pr(A_1 = 
a_1, \dots, A_n = a_n)} \,, \]" src="form_27.png"/>
+</p>
+<p> and the <b>naive</b> assumption is that </p><p class="formulaDsp">
+<img class="formulaDsp" alt="\[ \Pr(A_1 = a_1, \dots, A_n = a_n \mid C = c) = 
\prod_{i=1}^n \Pr(A_i = a_i \mid C = c) \,. \]" src="form_28.png"/>
+</p>
+<p> Naive Bayes classification estimates feature probabilities and class priors using maximum likelihood or Laplacian smoothing. These parameters are then used to classify new data.</p>
+<p>A Naive Bayes classifier computes the following formula: </p><p 
class="formulaDsp">
+<img class="formulaDsp" alt="\[ \text{classify}(a_1, ..., a_n) = \arg\max_c 
\left\{ \Pr(C = c) \cdot \prod_{i=1}^n \Pr(A_i = a_i \mid C = c) \right\} \]" 
src="form_29.png"/>
+</p>
+<p> where <img class="formulaInl" alt="$ c $" src="form_20.png"/> ranges over 
all classes in the training data and probabilities are estimated with relative 
frequencies from the training set. There are different ways to estimate the 
feature probabilities <img class="formulaInl" alt="$ P(A_i = a \mid C = c) $" 
src="form_30.png"/>. The maximum likelihood estimate takes the relative 
frequencies. That is: </p><p class="formulaDsp">
+<img class="formulaDsp" alt="\[ P(A_i = a \mid C = c) = \frac{\#(c,i,a)}{\#c} 
\]" src="form_31.png"/>
+</p>
+<p> where</p><ul>
+<li><img class="formulaInl" alt="$ \#(c,i,a) $" src="form_32.png"/> denotes 
the # of training samples where attribute <img class="formulaInl" alt="$ i $" 
src="form_33.png"/> is <img class="formulaInl" alt="$ a $" src="form_34.png"/> 
and class is <img class="formulaInl" alt="$ c $" src="form_20.png"/></li>
+<li><img class="formulaInl" alt="$ \#c $" src="form_35.png"/> denotes the # of 
training samples where class is <img class="formulaInl" alt="$ c $" 
src="form_20.png"/>.</li>
+</ul>
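+<p>To make the maximum-likelihood estimate concrete, these counts can be computed directly in SQL. The following is only an illustrative sketch against a hypothetical training table <em>training(class INT, attributes INT[])</em>, not a function provided by this module: </p><pre class="example">
+-- Sketch only: relative-frequency estimate of P(A_i = a | C = c) for a
+-- hypothetical table training(class INT, attributes INT[]). Assumes all rows
+-- have the same number of attributes and PostgreSQL 9.4+ (WITH ORDINALITY).
+SELECT t.class AS c,
+       a.i     AS attr_index,
+       a.val   AS attr_value,
+       count(*)::float8
+         / sum(count(*)) OVER (PARTITION BY t.class, a.i) AS ml_estimate
+FROM training t,
+     LATERAL unnest(t.attributes) WITH ORDINALITY AS a(val, i)
+GROUP BY t.class, a.i, a.val;
+</pre>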
+<p>Since the maximum-likelihood estimate sometimes results in probabilities of zero, you might want to use a "smoothed" estimate. To do this, you add a number of "virtual" samples and assume that these samples are evenly distributed among the values assumed by attribute <img class="formulaInl" alt="$ i $" src="form_33.png"/> (that is, the set of all values observed for attribute <img class="formulaInl" alt="$ i $" src="form_33.png"/> for any class):</p>
+<p class="formulaDsp">
+<img class="formulaDsp" alt="\[ P(A_i = a \mid C = c) = \frac{\#(c,i,a) + 
s}{\#c + s \cdot \#i} \]" src="form_36.png"/>
+</p>
+<p> where</p><ul>
+<li><img class="formulaInl" alt="$ \#i $" src="form_37.png"/> denotes the # of 
distinct values for attribute <img class="formulaInl" alt="$ i $" 
src="form_33.png"/> (for all classes)</li>
+<li><img class="formulaInl" alt="$ s \geq 0 $" src="form_38.png"/> denotes the 
smoothing factor.</li>
+</ul>
+<p>The case <img class="formulaInl" alt="$ s = 1 $" src="form_39.png"/> is 
known as "Laplace smoothing". The case <img class="formulaInl" alt="$ s = 0 $" 
src="form_40.png"/> trivially reduces to maximum-likelihood estimates.</p>
+<p><a class="anchor" id="literature"></a></p><dl class="section 
user"><dt>Literature</dt><dd></dd></dl>
+<p>[1] Tom Mitchell: Machine Learning, McGraw Hill, 1997. Book chapter 
<em>Generative and Discriminative Classifiers: Naive Bayes and Logistic 
Regression</em> available at: <a 
href="http://www.cs.cmu.edu/~tom/NewChapters.html";>http://www.cs.cmu.edu/~tom/NewChapters.html</a></p>
+<p>[2] Wikipedia, Naive Bayes classifier, <a 
href="http://en.wikipedia.org/wiki/Naive_Bayes_classifier";>http://en.wikipedia.org/wiki/Naive_Bayes_classifier</a></p>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
Topics</dt><dd>File <a class="el" href="bayes_8sql__in.html" title="SQL 
functions for naive Bayes. ">bayes.sql_in</a> documenting the SQL 
functions.</dd></dl>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Tue May 16 2017 13:24:39 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/b5b51c69/docs/v1.11/group__grp__cg.html
----------------------------------------------------------------------
diff --git a/docs/v1.11/group__grp__cg.html b/docs/v1.11/group__grp__cg.html
new file mode 100644
index 0000000..0b279f1
--- /dev/null
+++ b/docs/v1.11/group__grp__cg.html
@@ -0,0 +1,179 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Conjugate Gradient</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.incubator.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.incubator.apache.org";><img 
alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ 
></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.11</span>
+   </div>
+   <div id="projectbrief">User Documentation for MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__cg.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Conjugate Gradient<div class="ingroups"><a class="el" 
href="group__grp__early__stage.html">Early Stage Development</a></div></div>  
</div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> <ul>
+<li>
+<a href="#syntax">Function Syntax</a> </li>
+<li>
+<a href="#examples">Examples</a> </li>
+<li>
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><dl class="section warning"><dt>Warning</dt><dd><em> This MADlib method 
is still in early stage development. There may be some issues that will be 
addressed in a future version. Interface and implementation are subject to 
change. </em></dd></dl>
+<p>This function uses the iterative conjugate gradient method [1] to find a solution to the linear system </p><p class="formulaDsp">
+<img class="formulaDsp" alt="\[ \boldsymbol Ax = \boldsymbol b \]" 
src="form_45.png"/>
+</p>
+<p> where <img class="formulaInl" alt="$ \boldsymbol A $" src="form_46.png"/> 
is a symmetric, positive definite matrix and <img class="formulaInl" alt="$x$" 
src="form_43.png"/> and <img class="formulaInl" alt="$ \boldsymbol b $" 
src="form_44.png"/> are vectors.</p>
+<p><a class="anchor" id="syntax"></a></p><dl class="section user"><dt>Function 
Syntax</dt><dd>The conjugate gradient function returns the solution vector <em>x</em> as an array. It has the following syntax.</dd></dl>
+<pre class="syntax">
+conjugate_gradient( table_name,
+                    name_of_row_values_col,
+                    name_of_row_number_col,
+                    array_of_b_values,
+                    desired_precision
+                  )
+</pre><p>Matrix <img class="formulaInl" alt="$ \boldsymbol A $" src="form_46.png"/> is assumed to be stored in a table in which each row consists of at least two columns: an array containing the values of that row, and the row number: </p><pre>{TABLE|VIEW} <em>matrix_A</em> (
+    <em>row_number</em> FLOAT,
+    <em>row_values</em> FLOAT[]
+)</pre><p> The number of elements in each row array should be the same.</p>
+<p><img class="formulaInl" alt="$ \boldsymbol b $" src="form_44.png"/> is 
passed as a FLOAT[] to the function.</p>
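+<p>For illustration, a table matching this structure (and the one queried in the example below) could be created as follows; this is only a sketch, the module itself does not create it: </p><pre class="example">
+-- Sketch only: store the symmetric positive definite matrix A = [[2,1],[1,4]]
+-- one row per tuple, as expected by conjugate_gradient().
+DROP TABLE IF EXISTS data;
+CREATE TABLE data (row_num FLOAT, row_val FLOAT[]);
+INSERT INTO data VALUES
+  (1, '{2,1}'),
+  (2, '{1,4}');
+</pre>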
+<p><a class="anchor" id="examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd><ol type="1">
+<li>Construct matrix A according to the structure described above. <pre class="example">
+SELECT * FROM data;
+</pre> Result: <pre class="result">
+ row_num | row_val
+&#160;--------+---------
+       1 | {2,1}
+       2 | {1,4}
+(2 rows)
+</pre></li>
+<li>Call the conjugate gradient function. <pre class="example">
+SELECT conjugate_gradient( 'data',
+                           'row_val',
+                           'row_num',
+                           '{2,1}',
+                           1E-6, 1
+                         );
+</pre> <pre class="result">
+INFO:  COMPUTE RESIDUAL ERROR 14.5655661859659
+INFO:  ERROR 0.144934004246004
+INFO:  ERROR 3.12963615962926e-31
+INFO:  TEST FINAL ERROR 2.90029642185163e-29
+    conjugate_gradient
+&#160;--------------------------
+ {1,-1.31838984174237e-15}
+(1 row)
+</pre></li>
+</ol>
+</dd></dl>
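+<p>A returned solution can be sanity-checked by recomputing the product of each row of A with <em>x</em>; the sketch below assumes the <em>array_dot()</em> helper from the MADlib array operations module (any equivalent dot product works) and should reproduce b = {2,1} up to rounding: </p><pre class="example">
+-- Sketch only: multiply each stored row of A by the returned solution vector.
+SELECT row_num,
+       madlib.array_dot(row_val,
+                        ARRAY[1, -1.31838984174237e-15]::float8[]) AS a_times_x
+FROM data
+ORDER BY row_num;
+</pre>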
+<p><a class="anchor" id="literature"></a></p><dl class="section 
user"><dt>Literature</dt><dd>[1] "Conjugate gradient method" Wikipedia - <a 
href="http://en.wikipedia.org/wiki/Conjugate_gradient_method";>http://en.wikipedia.org/wiki/Conjugate_gradient_method</a></dd></dl>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
Topics</dt><dd>File <a class="el" href="conjugate__gradient_8sql__in.html" 
title="SQL function computing Conjugate Gradient. 
">conjugate_gradient.sql_in</a> documenting the SQL function. </dd></dl>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Tue May 16 2017 13:24:39 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
+</div>
+</body>
+</html>
