http://git-wip-us.apache.org/repos/asf/madlib-site/blob/af0e5f14/docs/v1.15.1/group__grp__svec.html
----------------------------------------------------------------------
diff --git a/docs/v1.15.1/group__grp__svec.html 
b/docs/v1.15.1/group__grp__svec.html
new file mode 100644
index 0000000..cffbba4
--- /dev/null
+++ b/docs/v1.15.1/group__grp__svec.html
@@ -0,0 +1,455 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.14"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Sparse Vectors</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+  $(document).ready(initResizable);
+/* @license-end */</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+  $(document).ready(function() { init_search(); });
+/* @license-end */
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+});
+</script><script type="text/javascript" async 
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/MathJax.js";></script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.apache.org";><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.15.1</span>
+   </div>
+   <div id="projectbrief">User Documentation for Apache MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.14 -->
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+/* @license-end */
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+$(document).ready(function(){initNavTree('group__grp__svec.html','');});
+/* @license-end */
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Sparse Vectors<div class="ingroups"><a class="el" 
href="group__grp__datatrans.html">Data Types and Transformations</a> &raquo; <a 
class="el" href="group__grp__arraysmatrix.html">Arrays and 
Matrices</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> <ul>
+<li>
+<a href="#usage">Using Sparse Vectors</a> </li>
+<li>
+<a href="#vectorization">Document Vectorization into Sparse Vectors</a> </li>
+<li>
+<a href="#examples">Examples</a> </li>
+<li>
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>This module implements a sparse vector data type, named "svec", which 
provides compressed storage of vectors that have many duplicate elements.</p>
+<p>Arrays of floating point numbers for various calculations sometimes have 
long runs of zeros (or some other default value). This is common in 
applications like scientific computing, retail optimization, and text 
processing. Each floating point number takes 8 bytes of storage in memory 
and/or disk, so saving those zeros is often worthwhile. There are also many 
computations that can benefit from skipping over the zeros.</p>
+<p>Consider, for example, the following array of doubles stored as a 
Postgres/Greenplum "float8[]" data type:</p>
+<pre class="example">
+'{0, 33,...40,000 zeros..., 12, 22 }'::float8[]
+</pre><p>This array would occupy slightly more than 320KB of memory or disk, 
most of it zeros. Even if we were to exploit the null bitmap and store the 
zeros as nulls, we would still end up with a 5KB null bitmap, which is still 
not nearly as memory efficient as we'd like. Also, as we perform various 
operations on the array, we do work on 40,000 fields that turn out to be 
unimportant.</p>
+<p>To solve the problems associated with the processing of vectors discussed 
above, the svec type employs a simple Run Length Encoding (RLE) scheme to 
represent sparse vectors as pairs of count-value arrays. For example, the array 
above would be represented as</p>
+<pre class="example">
+'{1,1,40000,1,1}:{0,33,0,12,22}'::madlib.svec
+</pre><p>which says there is 1 occurrence of 0, followed by 1 occurrence of 
33, followed by 40,000 occurrences of 0, etc. This uses just 5 integers and 5 
floating point numbers to store the array. Further, it is easy to implement 
vector operations that can take advantage of the RLE representation to make 
computations faster. The SVEC module provides a library of such functions.</p>
+<p>The current version only supports sparse vectors of float8 values. Future 
versions will support other base types.</p>
+<p><a class="anchor" id="usage"></a></p><dl class="section user"><dt>Using 
Sparse Vectors</dt><dd></dd></dl>
+<p>An SVEC can be constructed directly with a constant expression, as follows: 
</p><pre class="example">
+SELECT '{n1,n2,...,nk}:{v1,v2,...vk}'::madlib.svec;
+</pre><p> where <code>n1,n2,...,nk</code> specifies the counts for the values 
<code>v1,v2,...,vk</code>.</p>
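+<p>For instance, the literal below (an illustrative value, not output from 
the module) denotes the dense vector {0,0,7,0,0,0}: </p><pre class="example">
+SELECT '{2,1,3}:{0,7,0}'::madlib.svec;
+</pre>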
+<p>A float array can be cast to an SVEC: </p><pre class="example">
+SELECT ('{v1,v2,...vk}'::float[])::madlib.svec;
+</pre><p>An SVEC can be created with an aggregation: </p><pre class="example">
+SELECT madlib.svec_agg(v1) FROM generate_series(1,k);
+</pre><p>An SVEC can be created using the 
<code>madlib.svec_cast_positions_float8arr()</code> function by supplying an 
array of positions and an array of values at those positions: </p><pre 
class="example">
+SELECT madlib.svec_cast_positions_float8arr(
+    array[n1,n2,...nk],    -- positions of values in vector
+    array[v1,v2,...vk],    -- values at each position
+    length,                -- length of vector
+    base)                  -- value at unspecified positions
+</pre><p> For example, the following expression: </p><pre class="example">
+SELECT madlib.svec_cast_positions_float8arr(
+    array[1,3,5],
+    array[2,4,6],
+    10,
+    0.0)
+</pre><p> produces this SVEC: </p><pre class="result">
+ svec_cast_positions_float8arr
+ &#160;------------------------------
+ {1,1,1,1,1,5}:{2,0,4,0,6,0}
+</pre><p>Add madlib to the search_path to use the svec operators defined in 
the module.</p>
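+<p>For example, a minimal sketch (assuming MADlib is installed in a schema 
named "madlib"): </p><pre class="example">
+SET search_path TO madlib, public;
+-- the svec type and operators can now be used without schema qualification
+SELECT '{1,2}:{3,4}'::svec + '{3}:{1}'::svec;
+</pre>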
+<p><a class="anchor" id="vectorization"></a></p><dl class="section 
user"><dt>Document Vectorization into Sparse Vectors</dt><dd>This module 
implements an efficient way for document vectorization, converting text 
documents into sparse vector representation (MADlib.svec), required by various 
machine learning algorithms in MADlib.</dd></dl>
+<p>The function accepts two tables as input, dictionary table and documents 
table, and produces the specified output table containing sparse vectors for 
the represented documents (in documents table).</p>
+<pre class="syntax">
+madlib.gen_doc_svecs(output_tbl,
+                     dictionary_tbl,
+                     dict_id_col,
+                     dict_term_col,
+                     documents_tbl,
+                     doc_id_col,
+                     doc_term_col,
+                     doc_term_info_col
+                    )
+</pre><p> <b>Arguments</b> </p><dl class="arglist">
+<dt>output_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the output table to be created containing 
the sparse vector representation of the documents. It has the following 
columns: </p><table class="output">
+<tr>
+<th>doc_id </th><td>__TYPE_DOC__. Document id. <br />
+ __TYPE_DOC__: Column type depends on the type of <code>doc_id_col</code> in 
<code>documents_tbl</code>.   </td></tr>
+<tr>
+<th>sparse_vector </th><td>MADlib.svec. Corresponding sparse vector 
representation.  </td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>dictionary_tbl </dt>
+<dd><p class="startdd">TEXT. Name of the dictionary table containing features. 
</p><table class="output">
+<tr>
+<th>dict_id_col </th><td>TEXT. Name of the id column in the 
<code>dictionary_tbl</code>. <br />
+ Expected Type: INTEGER or BIGINT. <br />
 NOTE: Values must be contiguous, ranging from 0 to (total number of elements 
in the dictionary - 1).  </td></tr>
+<tr>
+<th>dict_term_col </th><td>TEXT. Name of the column containing term (features) 
in <code>dictionary_tbl</code>.  </td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>documents_tbl </dt>
+<dd>TEXT. Name of the documents table representing documents. <table 
class="output">
+<tr>
+<th>doc_id_col </th><td>TEXT. Name of the id column in the 
<code>documents_tbl</code>.  </td></tr>
+<tr>
+<th>doc_term_col </th><td>TEXT. Name of the term column in the 
<code>documents_tbl</code>.  </td></tr>
+<tr>
+<th>doc_term_info_col </th><td>TEXT. Name of the term info column in 
<code>documents_tbl</code>. The expected type of this column should be: <br />
+ - INTEGER, BIGINT or DOUBLE PRECISION: Values directly used to populate 
vector. <br />
+ - ARRAY: Length of the array used to populate the vector. <br />
 ** For an example illustrating the use of both column types, see the example 
below.   </td></tr>
+</table>
+</dd>
+</dl>
+<p><b>Example:</b> <br />
 Consider a corpus of documents, each consisting of features (terms) along 
with a document id: </p><pre class="example">
+1, {this,is,one,document,in,the,corpus}
+2, {i,am,the,second,document,in,the,corpus}
+3, {being,third,never,really,bothered,me,until,now}
+4, {the,document,before,me,is,the,third,document}
+</pre><ol type="1">
+<li>Prepare the documents table in an appropriate format. <br />
 The corpus specified above can be represented by either of the following 
<code>documents_table</code> layouts (a sketch for building these tables 
appears after this list): <pre class="example">
+SELECT * FROM documents_table ORDER BY id;
+</pre> Result: <pre class="result">
+  id |   term   | count                 id |   term   | positions
+&#160;----+----------+-------               ----+----------+-----------
+   1 | is       |     1                  1 | is       | {1}
+   1 | in       |     1                  1 | in       | {4}
+   1 | one      |     1                  1 | one      | {2}
+   1 | this     |     1                  1 | this     | {0}
+   1 | the      |     1                  1 | the      | {5}
+   1 | document |     1                  1 | document | {3}
+   1 | corpus   |     1                  1 | corpus   | {6}
+   2 | second   |     1                  2 | second   | {3}
+   2 | document |     1                  2 | document | {4}
+   2 | corpus   |     1                  2 | corpus   | {7}
+   . | ...      |    ..                  . | ...      | ...
+   4 | document |     2                  4 | document | {1,7}
+...
+</pre></li>
+<li>Prepare dictionary table in appropriate format. <pre class="example">
+SELECT * FROM dictionary_table ORDER BY id;
+</pre> Result: <pre class="result">
+  id |   term
+&#160;----+----------
+   0 | am
+   1 | before
+   2 | being
+   3 | bothered
+   4 | corpus
+   5 | document
+   6 | i
+   7 | in
+   8 | is
+   9 | me
+...
+</pre></li>
+<li>Generate sparse vector for the documents using dictionary_table and 
documents_table. <br />
+ <code>doc_term_info_col</code> (count) of type INTEGER: <pre 
class="example">
+SELECT * FROM madlib.gen_doc_svecs('svec_output', 'dictionary_table', 'id', 
'term',
+                            'documents_table', 'id', 'term', 'count');
+</pre> <code>doc_term_info_col</code> (positions) of type ARRAY: 
<pre class="example">
+SELECT * FROM madlib.gen_doc_svecs('svec_output', 'dictionary_table', 'id', 
'term',
+                            'documents_table', 'id', 'term', 'positions');
+</pre> Result: <pre class="result">
+                                 gen_doc_svecs
+&#160;--------------------------------------------------------------------------------------
+ Created table svec_output (doc_id, sparse_vector) containing sparse vectors
+(1 row)
+</pre></li>
+<li>Analyze the sparse vectors created. <pre class="example">
+SELECT * FROM svec_output ORDER by doc_id;
+</pre> Result: <pre class="result">
+ doc_id |                  sparse_vector
+&#160;--------+-------------------------------------------------
+      1 | {4,2,1,2,3,1,2,1,1,1,1}:{0,1,0,1,0,1,0,1,0,1,0}
+      2 | {1,3,4,6,1,1,3}:{1,0,1,0,1,2,0}
+      3 | {2,2,5,3,1,1,2,1,1,1}:{0,1,0,1,0,1,0,1,0,1}
+      4 | {1,1,3,1,2,2,5,1,1,2}:{0,1,0,2,0,1,0,2,1,0}
+(4 rows)
+</pre></li>
+</ol>
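+<p>As referenced in step 1, here is a hedged sketch of one way to derive the 
two input tables from raw term arrays (it assumes the <code>documents</code> 
table created in the extensive example further below; the derivation itself 
is an illustration, not part of the module): </p><pre class="example">
+-- documents_table in (id, term, count) form
+CREATE TABLE documents_table AS
+    SELECT a AS id, term, count(*)::INTEGER AS count
+    FROM (SELECT a, unnest(b) AS term FROM documents) t
+    GROUP BY a, term;
+-- dictionary_table with contiguous ids starting at 0
+CREATE TABLE dictionary_table AS
+    SELECT row_number() OVER (ORDER BY term) - 1 AS id, term
+    FROM (SELECT DISTINCT unnest(b) AS term FROM documents) d;
+</pre>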
+<p>See the file <a class="el" href="svec_8sql__in.html" title="SQL type 
definitions and functions for sparse vector data type svec ">svec.sql_in</a> 
for complete syntax.</p>
+<p><a class="anchor" id="examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd></dd></dl>
+<p>We can use operations with the svec type, such as &lt;, &gt;, *, **, /, =, 
+, and SUM, and they have the meanings associated with typical vector 
operations. For example, the plus (+) operator adds the corresponding elements 
of two vectors of the same dimension. </p><pre class="example">
+SELECT ('{0,1,5}'::float8[]::madlib.svec + 
'{4,3,2}'::float8[]::madlib.svec)::float8[];
+</pre><p> Result: </p><pre class="result">
+ float8
+&#160;--------
+ {4,4,7}
+</pre><p>Without the casting into float8[] at the end, we get: </p><pre 
class="example">
+SELECT '{0,1,5}'::float8[]::madlib.svec + '{4,3,2}'::float8[]::madlib.svec;
+</pre><p> Result: </p><pre class="result">
+ ?column?
+&#160;---------
+{2,1}:{4,7}
</pre><p>A dot product (%*%) between two vectors yields a scalar of type 
float8. Here the dot product is (0*4 + 1*3 + 5*2) = 13: </p><pre class="example">
+SELECT '{0,1,5}'::float8[]::madlib.svec %*% '{4,3,2}'::float8[]::madlib.svec;
+</pre> <pre class="result">
+ ?column?
+&#160;---------
+    13
+</pre><p>Special vector aggregate functions are also available. SUM is self 
explanatory. SVEC_COUNT_NONZERO evaluates the count of non-zero terms in each 
column found in a set of n-dimensional svecs and returns an svec with the 
counts. For instance, if we have the vectors {0,1,5}, {10,0,3},{0,0,3},{0,1,0}, 
then executing the SVEC_COUNT_NONZERO() aggregate function would result in 
{1,2,3}:</p>
+<pre class="example">
+CREATE TABLE list (a madlib.svec);
+INSERT INTO list VALUES ('{0,1,5}'::float8[]), ('{10,0,3}'::float8[]), 
('{0,0,3}'::float8[]),('{0,1,0}'::float8[]);
+SELECT madlib.svec_count_nonzero(a)::float8[] FROM list;
+</pre><p> Result: </p><pre class="result">
+svec_count_nonzero
+&#160;----------------
+    {1,2,3}
+</pre><p>We do not use null bitmaps in the svec data type. A null value in an 
svec is represented explicitly as an NVP (No Value Present) value. For example, 
we have: </p><pre class="example">
+SELECT '{1,2,3}:{4,null,5}'::madlib.svec;
+</pre><p> Result: </p><pre class="result">
+      svec
+&#160;------------------
+ {1,2,3}:{4,NVP,5}
+</pre><p>Adding svecs with null values results in NVPs in the sum: </p><pre 
class="example">
+SELECT '{1,2,3}:{4,null,5}'::madlib.svec + '{2,2,2}:{8,9,10}'::madlib.svec;
+</pre><p> Result: </p><pre class="result">
+         ?column?
+ &#160;-------------------------
+  {1,2,1,2}:{12,NVP,14,15}
+</pre><p>An element of an svec can be accessed using the <a class="el" 
href="svec__util_8sql__in.html#a8787222aec691f94d9808d1369aa401c">svec_proj()</a>
 function, which takes an svec and the index of the element desired. </p><pre 
class="example">
+SELECT madlib.svec_proj('{1,2,3}:{4,5,6}'::madlib.svec, 1) + 
madlib.svec_proj('{4,5,6}:{1,2,3}'::madlib.svec, 15);
+</pre><p> Result: </p><pre class="result"> ?column?
+&#160;---------
+    7
+</pre><p>A subvector of an svec can be accessed using the <a class="el" 
href="svec__util_8sql__in.html#a5cb3446de5fc117befe88ccb1ebb0e4e">svec_subvec()</a>
 function, which takes an svec and the start and end index of the subvector 
desired. </p><pre class="example">
+SELECT madlib.svec_subvec('{2,4,6}:{1,3,5}'::madlib.svec, 2, 11);
+</pre><p> Result: </p><pre class="result">   svec_subvec
+&#160;----------------
+ {1,4,5}:{1,3,5}
+</pre><p>The elements/subvector of an svec can be changed using the function 
<a class="el" 
href="svec__util_8sql__in.html#a59407764a1cbf1937da39cf39a2f447c">svec_change()</a>.
 It takes three arguments: an m-dimensional svec sv1, a start index j, and an 
n-dimensional svec sv2 such that j + n - 1 &lt;= m, and returns an svec like 
sv1 but with the subvector sv1[j:j+n-1] replaced by sv2. An example follows: 
</p><pre class="example">
+SELECT 
madlib.svec_change('{1,2,3}:{4,5,6}'::madlib.svec,3,'{2}:{3}'::madlib.svec);
+</pre><p> Result: </p><pre class="result">     svec_change
+&#160;--------------------
+ {1,1,2,2}:{4,5,3,6}
+</pre><p>There are also higher-order functions for processing svecs. For 
example, the following is the corresponding function for lapply() in R. 
</p><pre class="example">
+SELECT madlib.svec_lapply('sqrt', '{1,2,3}:{4,5,6}'::madlib.svec);
+</pre><p> Result: </p><pre class="result">
+                  svec_lapply
+&#160;----------------------------------------------
+ {1,2,3}:{2,2.23606797749979,2.44948974278318}
+</pre><p>The full list of functions available for operating on svecs is in 
svec.sql_in.</p>
+<p><b> A More Extensive Example</b></p>
+<p>For a text classification example, let's assume we have a dictionary 
composed of words in a sorted text array: </p><pre class="example">
+CREATE TABLE features (a text[]);
+INSERT INTO features VALUES
+            ('{am,before,being,bothered,corpus,document,i,in,is,me,
+               never,now,one,really,second,the,third,this,until}');
+</pre><p> We have a set of documents, each represented as an array of words: 
</p><pre class="example">
+CREATE TABLE documents(a int,b text[]);
+INSERT INTO documents VALUES
+            (1,'{this,is,one,document,in,the,corpus}'),
+            (2,'{i,am,the,second,document,in,the,corpus}'),
+            (3,'{being,third,never,really,bothered,me,until,now}'),
+            (4,'{the,document,before,me,is,the,third,document}');
+</pre><p>Now that we have a dictionary and some documents, we can do document 
categorization using vector arithmetic on word counts and proportions of 
dictionary words in each document.</p>
+<p>To start this process, we'll need to find the dictionary words in each 
document. We'll prepare what is called a Sparse Feature Vector or SFV for each 
document. An SFV is a vector of dimension N, where N is the number of 
dictionary words, and in each cell of an SFV is a count of each dictionary word 
in the document.</p>
+<p>Inside the sparse vector library, there is a function that creates an SFV 
from a document, so we can just do this (for a more efficient way to convert 
documents into sparse vectors, especially for larger datasets, see <a 
href="#vectorization">Document Vectorization into Sparse 
Vectors</a>):</p>
+<pre class="example">
+SELECT madlib.svec_sfv((SELECT a FROM features LIMIT 1),b)::float8[]
+         FROM documents;
+</pre><p> Result: </p><pre class="result">
+                svec_sfv
+&#160;----------------------------------------
+ {0,0,0,0,1,1,0,1,1,0,0,0,1,0,0,1,0,1,0}
+ {0,0,1,1,0,0,0,0,0,1,1,1,0,1,0,0,1,0,1}
+ {1,0,0,0,1,1,1,1,0,0,0,0,0,0,1,2,0,0,0}
+ {0,1,0,0,0,2,0,0,1,1,0,0,0,0,0,2,1,0,0}
+</pre><p>Note that the output of madlib.svec_sfv() is an svec for each 
document containing the count of each of the dictionary words in the ordinal 
positions of the dictionary. This can more easily be understood by lining up 
the feature vector and text like this:</p>
+<pre class="example">
+SELECT madlib.svec_sfv((SELECT a FROM features LIMIT 1),b)::float8[]
+                , b
+         FROM documents;
+</pre><p> Result: </p><pre class="result">
+                svec_sfv                 |                        b
+&#160;----------------------------------------+--------------------------------------------------
+ {1,0,0,0,1,1,1,1,0,0,0,0,0,0,1,2,0,0,0} | 
{i,am,the,second,document,in,the,corpus}
+ {0,1,0,0,0,2,0,0,1,1,0,0,0,0,0,2,1,0,0} | 
{the,document,before,me,is,the,third,document}
+ {0,0,0,0,1,1,0,1,1,0,0,0,1,0,0,1,0,1,0} | {this,is,one,document,in,the,corpus}
+ {0,0,1,1,0,0,0,0,0,1,1,1,0,1,0,0,1,0,1} | 
{being,third,never,really,bothered,me,until,now}
+</pre> <pre class="example">
+SELECT * FROM features;
+</pre> <pre class="result">
+                                                a
+&#160;-------------------------------------------------------------------------------------------------------
+{am,before,being,bothered,corpus,document,i,in,is,me,never,now,one,really,second,the,third,this,until}
+</pre><p>Now when we look at the document "i am the second document in the 
corpus", its SFV is {1,3*0,1,1,1,1,6*0,1,2,3*0}. The word "am" is the first 
ordinate in the dictionary and there is 1 instance of it in the SFV. The word 
"before" has no instances in the document, so its value is "0", and so on.</p>
+<p>The function madlib.svec_sfv() can process large numbers of documents into 
their SFVs in parallel at high speed.</p>
+<p>The rest of the categorization process is all vector math. The actual count 
is hardly ever used. Instead, it's turned into a weight. The most common weight 
is called tf/idf for Term Frequency / Inverse Document Frequency. The 
calculation for a given term in a given document is</p>
+<pre class="example">
+{#Times in document} * log {#Documents / #Documents the term appears in}.
+</pre><p>For instance, the term "document" in document A would have weight 1 * 
log (4/3). In document D, it would have weight 2 * log (4/3). Terms that appear 
in every document would have tf/idf weight 0, since log (4/4) = log(1) = 0. 
(Our example has no term like that.) That usually sends a lot of values to 
0.</p>
+<p>For this part of the processing, we'll need to have a sparse vector of the 
dictionary dimension (19) with the values </p><pre class="example">
+log(#documents/#Documents each term appears in).
</pre><p> There will be one such vector for the whole list of documents (aka 
the "corpus"). The #documents is just a count of all of the documents, in this 
case 4, but there is one divisor for each dictionary word, and its value is 
the number of documents in which that word appears. This single vector for the 
whole corpus can then be element-wise multiplied by each document SFV to 
produce the Term Frequency/Inverse Document Frequency weights.</p>
+<p>This can be done as follows: </p><pre class="example">
+CREATE TABLE corpus AS
+            (SELECT a, madlib.svec_sfv((SELECT a FROM features LIMIT 1),b) sfv
+         FROM documents);
+CREATE TABLE weights AS
+          (SELECT a docnum, madlib.svec_mult(sfv, logidf) tf_idf
+           FROM (SELECT 
madlib.svec_log(madlib.svec_div(count(sfv)::madlib.svec,madlib.svec_count_nonzero(sfv)))
 logidf
+                FROM corpus) foo, corpus ORDER BY docnum);
+SELECT * FROM weights;
+</pre><p> Result </p><pre class="result">
+docnum |                tf_idf
+&#160;------+----------------------------------------------------------------------
+     1 | {4,1,1,1,2,3,1,2,1,1,1,1}:{0,0.69,0.28,0,0.69,0,1.38,0,0.28,0,1.38,0}
+     2 | {1,3,1,1,1,1,6,1,1,3}:{1.38,0,0.69,0.28,1.38,0.69,0,1.38,0.57,0}
+     3 | {2,2,5,1,2,1,1,2,1,1,1}:{0,1.38,0,0.69,1.38,0,1.38,0,0.69,0,1.38}
+     4 | {1,1,3,1,2,2,5,1,1,2}:{0,1.38,0,0.57,0,0.69,0,0.57,0.69,0}
+</pre><p>We can now get the "angular distance" between one document and the 
rest of the documents using the ACOS of the dot product of the document 
vectors. The following calculates the angular distance between the first 
document and each of the other documents: </p><pre class="example">
+SELECT docnum,
+                180. * ( ACOS( madlib.svec_dmin( 1., madlib.svec_dot(tf_idf, 
testdoc)
+                    / 
(madlib.svec_l2norm(tf_idf)*madlib.svec_l2norm(testdoc))))/3.141592654) 
angular_distance
+         FROM weights,(SELECT tf_idf testdoc FROM weights WHERE docnum = 1 
LIMIT 1) foo
+         ORDER BY 1;
+</pre><p> Result: </p><pre class="result">
+docnum | angular_distance
+&#160;-------+------------------
+     1 |                0
+     2 | 78.8235846096986
+     3 | 89.9999999882484
+     4 | 80.0232034288617
+</pre><p>We can see that the angular distance between document 1 and itself is 
0 degrees and between document 1 and 3 is 90 degrees because they share no 
features at all. The angular distance can now be plugged into machine learning 
algorithms that rely on a distance measure between data points.</p>
+<p>SVEC also provides functionality for declaring array given an array of 
positions and array of values, intermediate values betweens those are declared 
to be base value that user provides in the same function call. In the example 
below the fist array of integers represents the positions for the array two 
(array of floats). Positions do not need to come in the sorted order. Third 
value represents desired maximum size of the array. This assures that array is 
of that size even if last position is not. If max size &lt; 1 that value is 
ignored and array will end at the last position in the position vector. Final 
value is a float representing the base value to be used between the declared 
ones (0 would be a common candidate):</p>
+<pre class="example">
+SELECT 
madlib.svec_cast_positions_float8arr(ARRAY[1,2,7,5,87],ARRAY[.1,.2,.7,.5,.87],90,0.0);
+</pre><p> Result: </p><pre class="result">
+        svec_cast_positions_float8arr
+&#160;----------------------------------------------------
+{1,1,2,1,1,1,79,1,3}:{0.1,0.2,0,0.5,0,0.7,0,0.87,0}
+(1 row)
+</pre><p><a class="anchor" id="related"></a></p><dl class="section 
user"><dt>Related Topics</dt><dd></dd></dl>
+<p>Other examples of svec usage can be found in the k-means module, <a 
class="el" href="group__grp__kmeans.html">k-Means Clustering</a>.</p>
+<p>File <a class="el" href="svec_8sql__in.html" title="SQL type definitions 
and functions for sparse vector data type svec ">svec.sql_in</a> documenting 
the SQL functions.</p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Mon Oct 15 2018 11:24:30 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.14 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/af0e5f14/docs/v1.15.1/group__grp__svm.html
----------------------------------------------------------------------
diff --git a/docs/v1.15.1/group__grp__svm.html 
b/docs/v1.15.1/group__grp__svm.html
new file mode 100644
index 0000000..b7dfe58
--- /dev/null
+++ b/docs/v1.15.1/group__grp__svm.html
@@ -0,0 +1,875 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.14"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Support Vector Machines</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+  $(document).ready(initResizable);
+/* @license-end */</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+  $(document).ready(function() { init_search(); });
+/* @license-end */
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+});
+</script><script type="text/javascript" async 
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/MathJax.js";></script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.apache.org";><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.15.1</span>
+   </div>
+   <div id="projectbrief">User Documentation for Apache MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.14 -->
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+/* @license-end */
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+$(document).ready(function(){initNavTree('group__grp__svm.html','');});
+/* @license-end */
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Support Vector Machines<div class="ingroups"><a class="el" 
href="group__grp__super.html">Supervised Learning</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b><ul>
+<li class="level1">
+<a href="#svm_classification">Classification Function</a> </li>
+<li class="level1">
+<a href="#svm_regression">Regression Function</a> </li>
+<li class="level1">
+<a href="#novelty_detection">Novelty Detection</a> </li>
+<li class="level1">
+<a href="#kernel_params">Kernel Parameters</a> </li>
+<li class="level1">
+<a href="#parameters">Other Parameters</a> </li>
+<li class="level1">
+<a href="#predict">Prediction Functions</a> </li>
+<li class="level1">
+<a href="#example">Examples</a> </li>
+<li class="level1">
+<a href="#background">Technical Background</a> </li>
+<li class="level1">
+<a href="#literature">Literature</a> </li>
+<li class="level1">
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>Support vector machines are models for regression and classification 
tasks. SVM models have two particularly desirable features: robustness in the 
presence of noisy data and applicability to a variety of data configurations. 
At its core, a <em>linear</em> SVM model is a hyperplane separating two 
distinct classes of data (in the case of classification problems), in such a 
way that the distance between the hyperplane and the nearest training data 
point (called the <em>margin</em>) is maximized. Vectors that lie on this 
margin are called support vectors. With the support vectors fixed, 
perturbations of vectors beyond the margin will not affect the model; this 
contributes to the model’s robustness. By substituting a kernel function for 
the usual inner product, one can approximate a large variety of decision 
boundaries in addition to linear hyperplanes. <a class="anchor" 
id="svm_classification"></a></p><dl class="section user"><dt>Classification 
Training Function</dt><dd>The SVM classification training function has the 
following format: <pre 
class="syntax">
+svm_classification(
+    source_table,
+    model_table,
+    dependent_varname,
+    independent_varname,
+    kernel_func,
+    kernel_params,
+    grouping_col,
+    params,
+    verbose
+    )
+</pre> <b>Arguments</b> <dl class="arglist">
+<dt>source_table </dt>
+<dd><p class="startdd">TEXT. Name of the table containing the training 
data.</p>
+<p class="enddd"></p>
+</dd>
+<dt>model_table </dt>
+<dd><p class="startdd">TEXT. Name of the output table containing the model. 
Details of the output tables are provided below. </p>
+<p class="enddd"></p>
+</dd>
+<dt>dependent_varname </dt>
+<dd><p class="startdd">TEXT. Name of the dependent variable column. For 
classification, this column can contain values of any type, but must assume 
exactly two distinct values. Otherwise, an error will be thrown. </p>
+<p class="enddd"></p>
+</dd>
+<dt>independent_varname </dt>
+<dd><p class="startdd">TEXT. Expression list to evaluate for the independent 
variables. An intercept variable should not be included as part of this 
expression. See 'fit_intercept' in the kernel params for info on intercepts. 
Please note that the expression must be castable to DOUBLE PRECISION[].</p>
+<p class="enddd"></p>
+</dd>
+<dt>kernel_func (optional) </dt>
+<dd><p class="startdd">TEXT, default: 'linear'. Type of kernel. Currently 
three kernel types are supported: 'linear', 'gaussian', and 'polynomial'. The 
text can be any prefix of the three strings; e.g., kernel_func='ga' will 
create a Gaussian kernel. </p>
+<p class="enddd"></p>
+</dd>
+<dt>kernel_params (optional) </dt>
+<dd><p class="startdd">TEXT, defaults: NULL. Parameters for non-linear kernel 
in a comma-separated string of key-value pairs. The actual parameters differ 
depending on the value of <em>kernel_func</em>. See the description below for 
details. </p>
+<p class="enddd"></p>
+</dd>
+<dt>grouping_col (optional) </dt>
+<dd><p class="startdd">TEXT, default: NULL. An expression list used to group 
the input dataset into discrete groups, which results in running one model per 
group. Similar to the SQL "GROUP BY" clause. When this value is NULL, no 
grouping is used and a single model is generated. Please note that cross 
validation is not supported if grouping is used.</p>
+<p class="enddd"></p>
+</dd>
+<dt>params (optional) </dt>
+<dd><p class="startdd">TEXT, default: NULL. Parameters for optimization and 
regularization in a comma-separated string of key-value pairs. If a list of 
values is provided, then cross-validation will be performed to select the 
<em>best</em> value from the list. See the description below for details. </p>
+<p class="enddd"></p>
+</dd>
+<dt>verbose (optional) </dt>
+<dd>BOOLEAN default: FALSE. Verbose output of the results of training. </dd>
+</dl>
+</dd></dl>
+<p><b>Output tables</b> <br />
+ The model table produced by SVM contains the following columns: </p><table 
class="output">
+<tr>
+<th>coef </th><td>FLOAT8. Vector of coefficients.  </td></tr>
+<tr>
+<th>grouping_key </th><td>TEXT. Identifies the group to which the datum 
belongs.  </td></tr>
+<tr>
+<th>num_rows_processed </th><td>BIGINT. Number of rows processed.  </td></tr>
+<tr>
+<th>num_rows_skipped </th><td>BIGINT. Number of rows skipped due to missing 
values or failures.  </td></tr>
+<tr>
+<th>num_iterations </th><td>INTEGER. Number of iterations completed by the 
stochastic gradient descent algorithm. The algorithm either converged in this 
number of iterations or hit the maximum number specified in the optimization 
parameters.   </td></tr>
+<tr>
+<th>loss </th><td>FLOAT8. Value of the objective function of SVM, expressed as 
an average loss per row over the <em>source_table</em>. See Technical 
Background section below for more details.  </td></tr>
+<tr>
+<th>norm_of_gradient </th><td>FLOAT8. Value of the L2-norm of the 
(sub)-gradient of the objective function.  </td></tr>
+<tr>
+<th>dep_var_mapping </th><td>TEXT[]. Vector of dependent variable labels. 
The first entry corresponds to -1 and the second to +1. For internal use only.  
</td></tr>
+</table>
+<p>An auxiliary table named &lt;model_table&gt;_random is created if the 
kernel is not linear. It contains data needed to embed test data into a random 
feature space (see references [2,3]). This data is used internally by 
svm_predict and not meaningful on its own to the user, so you can ignore it.</p>
+<p>A summary table named &lt;model_table&gt;_summary is also created, which 
has the following columns: </p><table class="output">
+<tr>
+<th>method </th><td>'svm'  </td></tr>
+<tr>
+<th>version_number </th><td>Version of MADlib which was used to generate the 
model.  </td></tr>
+<tr>
+<th>source_table </th><td>The data source table name.  </td></tr>
+<tr>
+<th>model_table </th><td>The model table name.  </td></tr>
+<tr>
+<th>dependent_varname </th><td>The dependent variable.  </td></tr>
+<tr>
+<th>independent_varname </th><td>The independent variables.  </td></tr>
+<tr>
+<th>kernel_func </th><td>The kernel function.  </td></tr>
+<tr>
+<th>kernel_parameters </th><td>The kernel parameters, as well as random 
feature map data.  </td></tr>
+<tr>
+<th>grouping_col </th><td>Columns on which to group.  </td></tr>
+<tr>
+<th>optim_params </th><td>A string containing the optimization parameters.  
</td></tr>
+<tr>
+<th>reg_params </th><td>A string containing the regularization parameters.  
</td></tr>
+<tr>
+<th>num_all_groups </th><td>Number of groups in SVM training.  </td></tr>
+<tr>
+<th>num_failed_groups </th><td>Number of failed groups in SVM training.  
</td></tr>
+<tr>
+<th>total_rows_processed </th><td>Total number of rows processed in all 
groups.  </td></tr>
+<tr>
+<th>total_rows_skipped </th><td>Total number of rows skipped in all groups 
due to missing values or failures.  </td></tr>
+</table>
+<p>If cross validation is used, a table is created with a user-specified name 
having the following columns: </p><table class="output">
+<tr>
+<th>... </th><td>Names of cross validation parameters  </td></tr>
+<tr>
+<th>mean_score </th><td>Mean value of accuracy when predicted on the 
validation fold, averaged over all folds and all rows.  </td></tr>
+<tr>
+<th>std_dev_score </th><td>Standard deviation of accuracy when predicted on 
the validation fold, averaged over all folds and all rows.  </td></tr>
+</table>
+<p><a class="anchor" id="svm_regression"></a></p><dl class="section 
user"><dt>Regression Training Function</dt><dd>The SVM regression training 
function has the following format: <pre class="syntax">
+svm_regression(source_table,
+    model_table,
+    dependent_varname,
+    independent_varname,
+    kernel_func,
+    kernel_params,
+    grouping_col,
+    params,
+    verbose
+    )
+</pre></dd></dl>
+<p><b>Arguments</b> </p>
+<p>Specifications for regression are largely the same as for classification. 
In the model table, there is no dependent variable mapping. The following 
arguments have specifications which differ from svm_classification: </p><dl 
class="arglist">
+<dt>dependent_varname </dt>
+<dd>TEXT. Name of the dependent variable column. For regression, this column 
can contain only values or expressions that can be cast to DOUBLE PRECISION. 
Otherwise, an error will be thrown.  </dd>
+<dt>params (optional) </dt>
+<dd>TEXT, default: NULL. The parameters <em>epsilon</em> and 
<em>eps_table</em> are only meaningful for regression. See description below 
for more details.  </dd>
+</dl>
+<p><a class="anchor" id="novelty_detection"></a></p><dl class="section 
user"><dt>Novelty Detection Training Function</dt><dd>The novelty detection 
function is a one-class SVM classifier, and has the following format: <pre 
class="syntax">
+svm_one_class(
+    source_table,
+    model_table,
+    independent_varname,
+    kernel_func,
+    kernel_params,
+    grouping_col,
+    params,
+    verbose
+    )
+</pre> <b>Arguments</b> </dd></dl>
+<p>Specifications for novelty detection are largely the same as for 
classification, except the dependent variable name is not specified. The model 
table is the same as that for classification.</p>
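+<p>A hedged sketch of a one-class call (it reuses the houses table from the 
Examples section; the model table name and kernel choice are illustrative 
assumptions): </p><pre class="example">
+DROP TABLE IF EXISTS houses_one_class, houses_one_class_summary,
+                     houses_one_class_random;
+SELECT madlib.svm_one_class('houses',
+                            'houses_one_class',
+                            'ARRAY[1, tax, bedroom, bath]',
+                            'gaussian');
+</pre>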
+<p><a class="anchor" id="kernel_params"></a></p><dl class="section 
user"><dt>Kernel Parameters</dt><dd>Kernel parameters are supplied in a string 
containing a comma-delimited list of name-value pairs. All of these named 
parameters are optional, and their order does not matter. You must use the 
format "&lt;param_name&gt; = &lt;value&gt;" to specify the value of a 
parameter, otherwise the parameter is ignored.</dd></dl>
+<dl class="arglist">
+<dt><em>Parameters common to all kernels</em></dt>
+<dd></dd>
+<dt>fit_intercept </dt>
+<dd>Default: True. The parameter <em>fit_intercept</em> indicates whether to 
add an intercept to the <em>independent_varname</em> array expression. The 
intercept is added to the end of the feature list; thus the last element of 
the coefficient list is the intercept.  </dd>
+<dt>n_components </dt>
+<dd>Default: 2*num_features. The dimensionality of the transformed feature 
space. A larger value lowers the variance of the estimate of the kernel but 
requires more memory and takes longer to train. </dd>
+<dt>random_state </dt>
+<dd>Default: 1. Seed used by a random number generator.  </dd>
+</dl>
+<dl class="arglist">
+<dt><em>Parameters for 'gaussian' kernel</em></dt>
+<dd></dd>
+<dt>gamma </dt>
+<dd>Default: 1/num_features. The parameter \(\gamma\) in the Radial Basis 
Function kernel, i.e., \(\exp(-\gamma||x-y||^2)\). Choosing a proper value for 
<em>gamma</em> is critical to the performance of the kernel machine; e.g., 
while a large <em>gamma</em> tends to cause overfitting, a small 
<em>gamma</em> will make the model too constrained to capture the complexity 
of the data.  </dd>
+</dl>
+<dl class="arglist">
+<dt><em>Parameters for 'polynomial' kernel</em></dt>
+<dd></dd>
+<dt>coef0 </dt>
+<dd>Default: 1.0. The independent term \(q\) in \( (\langle x,y\rangle + q)^r 
\). Must be larger than or equal to 0. When it is 0, the polynomial kernel is 
in homogeneous form.  </dd>
+<dt>degree </dt>
+<dd>Default: 3. The parameter \(r\) in \( (\langle x,y\rangle + q)^r \).  </dd>
+</dl>
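+<p>A hedged sketch of supplying kernel parameters (it reuses the houses table 
from the Examples section; the parameter values are illustrative assumptions, 
not tuned recommendations): </p><pre class="example">
+SELECT madlib.svm_classification('houses',
+                                 'houses_svm_gaussian',
+                                 'price &lt; 100000',
+                                 'ARRAY[1, tax, bath, size]',
+                                 'gaussian',
+                                 'gamma=0.5, n_components=10, random_state=3');
+</pre>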
+<p><a class="anchor" id="parameters"></a></p><dl class="section 
user"><dt>Other Parameters</dt><dd>Parameters in this section are supplied in 
the <em>params</em> argument as a string containing a comma-delimited list of 
name-value pairs. All of these named parameters are optional, and their order 
does not matter. You must use the format "&lt;param_name&gt; = &lt;value&gt;" 
to specify the value of a parameter, otherwise the parameter is 
ignored.</dd></dl>
+<p>Hyperparameter optimization can be carried out using the built-in cross 
validation mechanism, which is activated by assigning a value greater than 1 to 
the parameter <em>n_folds</em> in <em>params</em>. Please note that cross 
validation is not supported if grouping is used.</p>
+<p>The values of a parameter to cross validate should be provided in a list. 
For example, if one wanted to regularize with the L1 norm and use a lambda 
value from the set {0.3, 0.4, 0.5}, one might input 'lambda={0.3, 0.4, 0.5}, 
norm=L1, n_folds=10' in <em>params</em>. Note that both '{}' and '[]' are 
valid here. </p><dl class="section note"><dt>Note</dt><dd>Note that not 
all of the parameters below can be cross-validated. For parameters where cross 
validation is allowed, their default values are presented in list format; e.g., 
[0.01].</dd></dl>
+<pre class="syntax">
+  'init_stepsize = &lt;value&gt;,
+   decay_factor = &lt;value&gt;,
+   max_iter = &lt;value&gt;,
+   tolerance = &lt;value&gt;,
+   lambda = &lt;value&gt;,
+   norm = &lt;value&gt;,
+   epsilon = &lt;value&gt;,
+   eps_table = &lt;value&gt;,
+   validation_result = &lt;value&gt;,
+   n_folds = &lt;value&gt;,
+   class_weight = &lt;value&gt;'
+</pre><p> <b>Parameters</b> </p><dl class="arglist">
+<dt>init_stepsize </dt>
+<dd><p class="startdd">Default: [0.01]. Also known as the initial learning 
rate. A small value is usually desirable to ensure convergence, while a large 
value provides more room for progress during training. Since the best value 
depends on the condition number of the data, in practice one often searches in 
an exponential grid using built-in cross validation; e.g., "init_stepsize = [1, 
0.1, 0.001]". To reduce training time, it is common to run cross validation on 
a subsampled dataset, since this usually provides a good estimate of the 
condition number of the whole dataset. Then the resulting 
<em>init_stepsize</em> can be run on the whole dataset.</p>
+<p></p>
+<p class="enddd"></p>
+</dd>
+<dt>decay_factor </dt>
+<dd><p class="startdd">Default: [0.9]. Control the learning rate schedule: 0 
means constant rate; &lt;-1 means inverse scaling, i.e., stepsize = 
init_stepsize / iteration; &gt; 0 means &lt;exponential decay, i.e., stepsize = 
init_stepsize * decay_factor^iteration. </p>
+<p class="enddd"></p>
+</dd>
+<dt>max_iter </dt>
+<dd><p class="startdd">Default: [100]. The maximum number of iterations 
allowed. </p>
+<p class="enddd"></p>
+</dd>
+<dt>tolerance </dt>
+<dd><p class="startdd">Default: 1e-10. The criterion to end iterations. The 
training stops whenever &lt;the difference between the training models of two 
consecutive iterations is &lt;smaller than <em>tolerance</em> or the iteration 
number is larger than <em>max_iter</em>. </p>
+<p class="enddd"></p>
+</dd>
+<dt>lambda </dt>
+<dd><p class="startdd">Default: [0.01]. Regularization parameter. Must be 
non-negative. </p>
+<p class="enddd"></p>
+</dd>
+<dt>norm </dt>
+<dd><p class="startdd">Default: 'L2'. Name of the regularization, either 'L2' 
or 'L1'. </p>
+<p class="enddd"></p>
+</dd>
+<dt>epsilon </dt>
+<dd><p class="startdd">Default: [0.01]. Determines the \(\epsilon\) for 
\(\epsilon\)-regression. Ignored during classification. When training the 
model, differences of less than \(\epsilon\) between estimated labels and 
actual labels are ignored. A larger \(\epsilon\) will yield a model with fewer 
support vectors, but will not generalize as well to future data. Generally, it 
has been suggested that epsilon should increase with noisier data, and decrease 
with the number of samples. See [5]. </p>
+<p class="enddd"></p>
+</dd>
+<dt>eps_table </dt>
+<dd><p class="startdd">Default: NULL. Name of the input table that contains 
values of epsilon for different groups. Ignored when <em>grouping_col</em> is 
NULL. Define this input table if you want different epsilon values for 
different groups. The table consists of a column named <em>epsilon</em> which 
specifies the epsilon values, and one or more columns for 
<em>grouping_col</em>. Extra groups are ignored, and groups not present in this 
table will use the epsilon value specified in parameter <em>epsilon</em>. </p>
+<p class="enddd"></p>
+</dd>
+<dt>validation_result </dt>
+<dd><p class="startdd">Default: NULL. Name of the table to store the cross 
validation scores. This table is only created if the name is not NULL. The 
cross validation scores are the mean and standard deviation of the accuracy 
when predicted on the validation fold, averaged over all folds and all rows. 
For classification, the accuracy metric used is the ratio of correct 
classifications. For regression, the accuracy metric used is the negative of 
mean squared error (negative to make it a concave problem, thus selecting 
<em>max</em> means the highest accuracy). </p>
+<p class="enddd"></p>
+</dd>
+<dt>n_folds </dt>
+<dd><p class="startdd">Default: 0. Number of folds (k). Must be at least 2 to 
activate cross validation. If a value of k &gt; 2 is specified, each fold is 
then used as a validation set once, while the other k - 1 folds form the 
training set. </p>
+<p class="enddd"></p>
+</dd>
+<dt>class_weight </dt>
+<dd><p class="startdd">Default: 1 for classification, 'balanced' for one-class 
novelty detection, n/a for regression.</p>
+<p>Set the weight for the positive and negative classes. If not given, all 
classes are set to have weight one. If class_weight = balanced, values of y 
are automatically adjusted to be inversely proportional to class frequencies 
in the input data, i.e., the weights are set as n_samples / (n_classes * 
bincount(y)).</p>
+<p>Alternatively, class_weight can be a mapping, giving the weight for each 
class. E.g., for dependent variable values 'a' and 'b', the class_weight can 
be {a: 2, b: 3}. This would lead to each 'a' tuple's y value being multiplied 
by 2 and each 'b' tuple's y value being multiplied by 3.</p>
+<p class="enddd">For regression, the class weights are always one.  </p>
+</dd>
+</dl>
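+<p>A hedged sketch of activating cross validation over a small grid (again 
reusing the houses table from the Examples section; the grid values and table 
name are illustrative assumptions): </p><pre class="example">
+SELECT madlib.svm_classification('houses',
+                                 'houses_svm_cv',
+                                 'price &lt; 100000',
+                                 'ARRAY[1, tax, bath, size]',
+                                 NULL, NULL, NULL,
+                                 'init_stepsize=[0.01,0.1], lambda=[0.01,0.1], n_folds=3');
+</pre>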
+<p><a class="anchor" id="predict"></a></p><dl class="section 
user"><dt>Prediction Function</dt><dd>The prediction function is used to 
estimate the conditional mean given a new predictor. The same syntax is used 
for classification, regression and novelty detection: <pre class="syntax">
+svm_predict(model_table,
+            new_data_table,
+            id_col_name,
+            output_table)
+</pre></dd></dl>
+<p><b>Arguments</b> </p><dl class="arglist">
+<dt>model_table </dt>
+<dd><p class="startdd">TEXT. Model table produced by the training function.</p>
+<p class="enddd"></p>
+</dd>
+<dt>new_data_table </dt>
+<dd><p class="startdd">TEXT. Name of the table containing the prediction data. 
This table is expected to contain the same features that were used during 
training. The table should also contain id_col_name used for identifying each 
row.</p>
+<p class="enddd"></p>
+</dd>
+<dt>id_col_name </dt>
+<dd><p class="startdd">TEXT. The name of the id column in the input table.</p>
+<p class="enddd"></p>
+</dd>
+<dt>output_table </dt>
+<dd>TEXT. Name of the table where output predictions are written. If this 
table name is already in use, then an error is returned. Table contains: <table 
class="output">
+<tr>
+<th>id </th><td>Gives the 'id' for each prediction, corresponding to each row 
from the new_data_table.  </td></tr>
+<tr>
+<th>prediction </th><td>Provides the prediction for each row in 
new_data_table. For regression this would be the same as decision_function. For 
classification, this will be one of the dependent variable values.  </td></tr>
+<tr>
+<th>decision_function </th><td>Provides the distance between each point and 
the separating hyperplane.  </td></tr>
+</table>
+</dd>
+</dl>
+<p><a class="anchor" id="example"></a></p><dl class="section 
user"><dt>Examples</dt><dd></dd></dl>
+<h4>Classification</h4>
+<ol type="1">
+<li>Create an input data set. <pre class="example">
+DROP TABLE IF EXISTS houses;
+CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT,
+            size INT, lot INT);
+INSERT INTO houses VALUES
+  (1 ,  590 ,       2 ,    1 ,  50000 ,  770 , 22100),
+  (2 , 1050 ,       3 ,    2 ,  85000 , 1410 , 12000),
+  (3 ,   20 ,       3 ,    1 ,  22500 , 1060 ,  3500),
+  (4 ,  870 ,       2 ,    2 ,  90000 , 1300 , 17500),
+  (5 , 1320 ,       3 ,    2 , 133000 , 1500 , 30000),
+  (6 , 1350 ,       2 ,    1 ,  90500 ,  820 , 25700),
+  (7 , 2790 ,       3 ,  2.5 , 260000 , 2130 , 25000),
+  (8 ,  680 ,       2 ,    1 , 142500 , 1170 , 22000),
+  (9 , 1840 ,       3 ,    2 , 160000 , 1500 , 19000),
+ (10 , 3680 ,       4 ,    2 , 240000 , 2790 , 20000),
+ (11 , 1660 ,       3 ,    1 ,  87000 , 1030 , 17500),
+ (12 , 1620 ,       3 ,    2 , 118600 , 1250 , 20000),
+ (13 , 3100 ,       3 ,    2 , 140000 , 1760 , 38000),
+ (14 , 2070 ,       2 ,    3 , 148000 , 1550 , 14000),
+ (15 ,  650 ,       3 ,  1.5 ,  65000 , 1450 , 12000);
+</pre></li>
+<li>Train a linear classification model and view it. The binary class 
variable is price &lt; $100,000. <pre class="example">
+DROP TABLE IF EXISTS houses_svm, houses_svm_summary;
+SELECT madlib.svm_classification('houses',
+                                 'houses_svm',
+                                 'price &lt; 100000',
+                                 'ARRAY[1, tax, bath, size]'
+                           );
+-- Set extended display on for easier reading of output
+\x on
+SELECT * FROM houses_svm;
+</pre> <pre class="result">
+-[ RECORD 1 ]------+--------------------------------------------------------------------------------
+coef               | {0.103994021495116,-0.00288252192097756,0.0540748706580464,0.00131729978010033}
+loss               | 0.928463796644648
+norm_of_gradient   | 7849.34910604307
+num_iterations     | 100
+num_rows_processed | 15
+num_rows_skipped   | 0
+dep_var_mapping    | {f,t}
+</pre></li>
+<li>Predict using the linear model. We want to predict whether the house 
price is less than $100,000. Here we use the training data set for prediction 
as well, which is not common practice but serves to illustrate the syntax. The 
predicted results are in the <em>prediction</em> column and the actual data is 
in the <em>actual</em> column. <pre class="example">
+DROP TABLE IF EXISTS houses_pred;
+SELECT madlib.svm_predict('houses_svm',
+                          'houses',
+                          'id',
+                          'houses_pred');
+\x off
+SELECT *, price &lt; 100000 AS actual
+FROM houses JOIN houses_pred USING (id) ORDER BY id;
+</pre> <pre class="result">
+  id | tax  | bedroom | bath | price  | size |  lot  | prediction | decision_function  | actual
+----+------+---------+------+--------+------+-------+------------+--------------------+--------
+  1 |  590 |       2 |    1 |  50000 |  770 | 22100 | t          |  0.211310440574799 | t
+  2 | 1050 |       3 |    2 |  85000 | 1410 | 12000 | t          |   0.37546191651855 | t
+  3 |   20 |       3 |    1 |  22500 | 1060 |  3500 | t          |    2.4021783278516 | t
+  4 |  870 |       2 |    2 |  90000 | 1300 | 17500 | t          |   0.63967342411632 | t
+  5 | 1320 |       3 |    2 | 133000 | 1500 | 30000 | f          | -0.179964783767855 | f
+  6 | 1350 |       2 |    1 |  90500 |  820 | 25700 | f          |  -1.78347623159173 | t
+  7 | 2790 |       3 |  2.5 | 260000 | 2130 | 25000 | f          |  -2.86795504439645 | f
+  8 |  680 |       2 |    1 | 142500 | 1170 | 22000 | t          |  0.811108105668757 | f
+  9 | 1840 |       3 |    2 | 160000 | 1500 | 19000 | f          |  -1.61739505790168 | f
+ 10 | 3680 |       4 |    2 | 240000 | 2790 | 20000 | f          |  -3.96700444824078 | f
+ 11 | 1660 |       3 |    1 |  87000 | 1030 | 17500 | f          |  -2.19489938920329 | t
+ 12 | 1620 |       3 |    2 | 118600 | 1250 | 20000 | f          |  -1.53961627668269 | f
+ 13 | 3100 |       3 |    2 | 140000 | 1760 | 38000 | f          |  -4.54881979553637 | f
+ 14 | 2070 |       2 |    3 | 148000 | 1550 | 14000 | f          |  -2.06911803381861 | f
+ 15 |  650 |       3 |  1.5 |  65000 | 1450 | 12000 | t          |   1.52704061329968 | t
+(15 rows)
+</pre> Count the misclassifications: <pre class="example">
+SELECT COUNT(*) FROM houses_pred JOIN houses USING (id)
+WHERE houses_pred.prediction != (houses.price &lt; 100000);
+</pre> <pre class="result">
+ count
+-------+
+     3
+</pre></li>
+<li>Train using a Gaussian kernel. This time we specify the initial step size 
and the maximum number of iterations to run. In the kernel parameters, we 
choose 10 as the dimension of the feature space in which the SVM is trained. A 
larger number leads to a more expressive model but runs the risk of 
overfitting. As a result, the model is a 10-dimensional vector, instead of 4 
as in the linear case. <pre class="example">
+DROP TABLE IF EXISTS houses_svm_gaussian, houses_svm_gaussian_summary, houses_svm_gaussian_random;
+SELECT madlib.svm_classification( 'houses',
+                                  'houses_svm_gaussian',
+                                  'price &lt; 100000',
+                                  'ARRAY[1, tax, bath, size]',
+                                  'gaussian',
+                                  'n_components=10',
+                                  '',
+                                  'init_stepsize=1, max_iter=200'
+                           );
+\x on
+SELECT * FROM houses_svm_gaussian;
+</pre> <pre class="result">
+-[ RECORD 1 ]------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+coef               | {-1.67275666209207,1.5191640881642,-0.503066422926727,1.33250956564454,2.23009854231314,-0.0602475029497936,1.97466397155921,2.3668779833279,0.577739846910355,2.81255996089824}
+loss               | 0.0571869097340991
+norm_of_gradient   | 1.18281830047046
+num_iterations     | 177
+num_rows_processed | 15
+num_rows_skipped   | 0
+dep_var_mapping    | {f,t}
+</pre></li>
+<li>Predict using the Gaussian model. The predicted results are in the 
<em>prediction</em> column and the actual data is in the <em>actual</em> 
column. <pre class="example">
+DROP TABLE IF EXISTS houses_pred_gaussian;
+SELECT madlib.svm_predict('houses_svm_gaussian',
+                          'houses',
+                          'id',
+                          'houses_pred_gaussian');
+\x off
+SELECT *, price &lt; 100000 AS actual
+FROM houses JOIN houses_pred_gaussian USING (id) ORDER BY id;
+</pre> <pre class="result">
+ id | tax  | bedroom | bath | price  | size |  lot  | prediction | decision_function  | actual
+----+------+---------+------+--------+------+-------+------------+--------------------+--------
+  1 |  590 |       2 |    1 |  50000 |  770 | 22100 | t          |   1.89855833083557 | t
+  2 | 1050 |       3 |    2 |  85000 | 1410 | 12000 | t          |   1.47736856649617 | t
+  3 |   20 |       3 |    1 |  22500 | 1060 |  3500 | t          |  0.999999992995691 | t
+  4 |  870 |       2 |    2 |  90000 | 1300 | 17500 | t          |  0.999999989634351 | t
+  5 | 1320 |       3 |    2 | 133000 | 1500 | 30000 | f          |  -1.03645694166465 | f
+  6 | 1350 |       2 |    1 |  90500 |  820 | 25700 | t          |   1.16430515664766 | t
+  7 | 2790 |       3 |  2.5 | 260000 | 2130 | 25000 | f          | -0.545622670134529 | f
+  8 |  680 |       2 |    1 | 142500 | 1170 | 22000 | f          |  -1.00000000207512 | f
+  9 | 1840 |       3 |    2 | 160000 | 1500 | 19000 | f          |   -1.4748622470053 | f
+ 10 | 3680 |       4 |    2 | 240000 | 2790 | 20000 | f          |  -1.00085274698056 | f
+ 11 | 1660 |       3 |    1 |  87000 | 1030 | 17500 | t          |    1.8614251155696 | t
+ 12 | 1620 |       3 |    2 | 118600 | 1250 | 20000 | f          |  -1.77616417509695 | f
+ 13 | 3100 |       3 |    2 | 140000 | 1760 | 38000 | f          |  -1.07759348149549 | f
+ 14 | 2070 |       2 |    3 | 148000 | 1550 | 14000 | f          |  -3.42475835116536 | f
+ 15 |  650 |       3 |  1.5 |  65000 | 1450 | 12000 | t          |   1.00000008401961 | t
+(15 rows)
+</pre> Count the misclassifications. Note that this produces a more accurate 
result than the linear case for this data set (the overall accuracy ratio can 
also be computed directly; see the sketch after this list): <pre class="example">
+SELECT COUNT(*) FROM houses_pred_gaussian JOIN houses USING (id)
+WHERE houses_pred_gaussian.prediction != (houses.price &lt; 100000);
+</pre> <pre class="result">
+ count
+-------+
+     0
+(1 row)
+</pre></li>
+<li>For a data set with unbalanced class sizes, set class_weight to 
'balanced' when building the model: <pre class="example">
+DROP TABLE IF EXISTS houses_svm_gaussian, houses_svm_gaussian_summary, houses_svm_gaussian_random;
+SELECT madlib.svm_classification( 'houses',
+                                  'houses_svm_gaussian',
+                                  'price &lt; 150000',
+                                  'ARRAY[1, tax, bath, size]',
+                                  'gaussian',
+                                  'n_components=10',
+                                  '',
+                                  'init_stepsize=1, max_iter=200, class_weight=balanced'
+                           );
+\x on
+SELECT * FROM houses_svm_gaussian;
+</pre> <pre class="result">
+-[ RECORD 1 ]------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+coef               | {0.891926151039837,0.169282494673541,-2.26539133689874,0.526518499596676,-0.900664505989526,0.508112011288015,-0.355474591147659,1.23127975981665,1.53694964239487,1.46496058633682}
+loss               | 0.56900274445785
+norm_of_gradient   | 0.989597662458527
+num_iterations     | 183
+num_rows_processed | 15
+num_rows_skipped   | 0
+dep_var_mapping    | {f,t}
+</pre></li>
+</ol>
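+<p>As referenced in the misclassification step above, here is a minimal 
sketch for computing the overall accuracy ratio directly. This is plain SQL 
over the tables created in the classification example; the column alias 
<em>accuracy</em> is illustrative:</p>
+<pre class="example">
+-- Fraction of rows the Gaussian model classifies correctly
+SELECT AVG(CASE WHEN houses_pred_gaussian.prediction = (houses.price &lt; 100000)
+                THEN 1 ELSE 0 END) AS accuracy
+FROM houses_pred_gaussian JOIN houses USING (id);
+</pre>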
+<h4>Regression</h4>
+<ol type="1">
+<li>Create the input data set. For regression we use part of the well-known 
abalone data set <a 
href="https://archive.ics.uci.edu/ml/datasets/abalone";>https://archive.ics.uci.edu/ml/datasets/abalone</a>: <pre class="example">
+DROP TABLE IF EXISTS abalone;
+CREATE TABLE abalone (id INT, sex TEXT, length FLOAT, diameter FLOAT, height FLOAT, rings INT);
+INSERT INTO abalone VALUES
+(1,'M',0.455,0.365,0.095,15),
+(2,'M',0.35,0.265,0.09,7),
+(3,'F',0.53,0.42,0.135,9),
+(4,'M',0.44,0.365,0.125,10),
+(5,'I',0.33,0.255,0.08,7),
+(6,'I',0.425,0.3,0.095,8),
+(7,'F',0.53,0.415,0.15,20),
+(8,'F',0.545,0.425,0.125,16),
+(9,'M',0.475,0.37,0.125,9),
+(10,'F',0.55,0.44,0.15,19),
+(11,'F',0.525,0.38,0.14,14),
+(12,'M',0.43,0.35,0.11,10),
+(13,'M',0.49,0.38,0.135,11),
+(14,'F',0.535,0.405,0.145,10),
+(15,'F',0.47,0.355,0.1,10),
+(16,'M',0.5,0.4,0.13,12),
+(17,'I',0.355,0.28,0.085,7),
+(18,'F',0.44,0.34,0.1,10),
+(19,'M',0.365,0.295,0.08,7),
+(20,'M',0.45,0.32,0.1,9);
+</pre></li>
+<li>Train a linear regression model: <pre class="example">
+DROP TABLE IF EXISTS abalone_svm_regression, abalone_svm_regression_summary;
+SELECT madlib.svm_regression('abalone',
+                             'abalone_svm_regression',
+                             'rings',
+                             'ARRAY[1, length, diameter, height]'
+                           );
+\x on
+SELECT * FROM abalone_svm_regression;
+</pre> <pre class="result">
+-[ RECORD 1 ]------+-----------------------------------------------------------------------
+coef               | {1.998949892503,0.918517478913099,0.712125856084095,0.229379472956877}
+loss               | 8.29033295818392
+norm_of_gradient   | 23.225177785827
+num_iterations     | 100
+num_rows_processed | 20
+num_rows_skipped   | 0
+dep_var_mapping    | {NULL}
+</pre></li>
+<li>Predict using the linear regression model: <pre class="example">
+DROP TABLE IF EXISTS abalone_regr;
+SELECT madlib.svm_predict('abalone_svm_regression',
+                          'abalone',
+                          'id',
+                          'abalone_regr');
+\x off
+SELECT * FROM abalone JOIN abalone_regr USING (id) ORDER BY id;
+</pre> <pre class="result">
+ id | sex | length | diameter | height | rings |    prediction    | decision_function
+----+-----+--------+----------+--------+-------+------------------+-------------------
+  1 | M   |  0.455 |    0.365 |  0.095 |    15 | 2.69859240928376 |  2.69859240928376
+  2 | M   |   0.35 |    0.265 |   0.09 |     7 | 2.52978857282818 |  2.52978857282818
+  3 | F   |   0.53 |     0.42 |  0.135 |     9 | 2.81582333426116 |  2.81582333426116
+  4 | M   |   0.44 |    0.365 |  0.125 |    10 | 2.69169603073001 |  2.69169603073001
+  5 | I   |   0.33 |    0.255 |   0.08 |     7 | 2.50200316683054 |  2.50200316683054
+  6 | I   |  0.425 |      0.3 |  0.095 |     8 | 2.62474869654157 |  2.62474869654157
+  7 | F   |   0.53 |    0.415 |   0.15 |    20 | 2.81570339722408 |  2.81570339722408
+  8 | F   |  0.545 |    0.425 |  0.125 |    16 | 2.83086793257882 |  2.83086793257882
+  9 | M   |  0.475 |     0.37 |  0.125 |     9 | 2.72740477577673 |  2.72740477577673
+ 10 | F   |   0.55 |     0.44 |   0.15 |    19 |  2.8518768970598 |   2.8518768970598
+ 11 | F   |  0.525 |     0.38 |   0.14 |    14 | 2.78389260680315 |  2.78389260680315
+ 12 | M   |   0.43 |     0.35 |   0.11 |    10 | 2.66838827339779 |  2.66838827339779
+ 13 | M   |   0.49 |     0.38 |  0.135 |    11 | 2.75059759385832 |  2.75059759385832
+ 14 | F   |  0.535 |    0.405 |  0.145 |    10 | 2.81202782833915 |  2.81202782833915
+ 15 | F   |   0.47 |    0.355 |    0.1 |    10 | 2.70639581129576 |  2.70639581129576
+ 16 | M   |    0.5 |      0.4 |   0.13 |    12 | 2.77287839069521 |  2.77287839069521
+ 17 | I   |  0.355 |     0.28 |  0.085 |     7 | 2.54391615211472 |  2.54391615211472
+ 18 | F   |   0.44 |     0.34 |    0.1 |    10 | 2.66815839489651 |  2.66815839489651
+ 19 | M   |  0.365 |    0.295 |   0.08 |     7 | 2.56263631931732 |  2.56263631931732
+ 20 | M   |   0.45 |     0.32 |    0.1 |     9 | 2.66310105219146 |  2.66310105219146
+(20 rows)
+</pre> Compute the RMS error: <pre class="example">
+SELECT SQRT(AVG((rings-prediction)*(rings-prediction))) AS rms_error
+FROM abalone JOIN abalone_regr USING (id);
+</pre> <pre class="result">
+    rms_error
+-----------------+
+ 9.0884271818321
+(1 row)
+</pre></li>
+<li>Train a non-linear regression model using a Gaussian kernel: <pre class="example">
+DROP TABLE IF EXISTS abalone_svm_gaussian_regression, abalone_svm_gaussian_regression_summary, abalone_svm_gaussian_regression_random;
+SELECT madlib.svm_regression( 'abalone',
+                              'abalone_svm_gaussian_regression',
+                              'rings',
+                              'ARRAY[1, length, diameter, height]',
+                              'gaussian',
+                              'n_components=10',
+                              '',
+                              'init_stepsize=1, max_iter=200'
+                           );
+\x on
+SELECT * FROM abalone_svm_gaussian_regression;
+</pre> <pre class="result">
+-[ RECORD 1 ]------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+coef               | {4.49016341280977,2.19062972461334,-2.04673653356154,1.11216153651262,2.83478599238881,-4.23122821845785,4.17684533744501,-5.36892552740644,0.775782561685621,-3.62606941016707}
+loss               | 2.66850539541894
+norm_of_gradient   | 0.97440079536379
+num_iterations     | 163
+num_rows_processed | 20
+num_rows_skipped   | 0
+dep_var_mapping    | {NULL}
+</pre></li>
+<li>Predict using the Gaussian regression model: <pre class="example">
+DROP TABLE IF EXISTS abalone_gaussian_regr;
+SELECT madlib.svm_predict('abalone_svm_gaussian_regression',
+                          'abalone',
+                          'id',
+                          'abalone_gaussian_regr');
+\x off
+SELECT * FROM abalone JOIN abalone_gaussian_regr USING (id) ORDER BY id;
+</pre> <pre class="result">
+ id | sex | length | diameter | height | rings |    prediction    | decision_function
+----+-----+--------+----------+--------+-------+------------------+-------------------
+  1 | M   |  0.455 |    0.365 |  0.095 |    15 | 9.92189555675422 |  9.92189555675422
+  2 | M   |   0.35 |    0.265 |   0.09 |     7 | 9.81553107620013 |  9.81553107620013
+  3 | F   |   0.53 |     0.42 |  0.135 |     9 | 10.0847384862759 |  10.0847384862759
+  4 | M   |   0.44 |    0.365 |  0.125 |    10 | 10.0100000075406 |  10.0100000075406
+  5 | I   |   0.33 |    0.255 |   0.08 |     7 | 9.74093262454458 |  9.74093262454458
+  6 | I   |  0.425 |      0.3 |  0.095 |     8 | 9.94807651709641 |  9.94807651709641
+  7 | F   |   0.53 |    0.415 |   0.15 |    20 | 10.1448936105369 |  10.1448936105369
+  8 | F   |  0.545 |    0.425 |  0.125 |    16 | 10.0579420659954 |  10.0579420659954
+  9 | M   |  0.475 |     0.37 |  0.125 |     9 |  10.055724626407 |   10.055724626407
+ 10 | F   |   0.55 |     0.44 |   0.15 |    19 | 10.1225030222559 |  10.1225030222559
+ 11 | F   |  0.525 |     0.38 |   0.14 |    14 |  10.160706707435 |   10.160706707435
+ 12 | M   |   0.43 |     0.35 |   0.11 |    10 | 9.95760174386841 |  9.95760174386841
+ 13 | M   |   0.49 |     0.38 |  0.135 |    11 | 10.0981242315617 |  10.0981242315617
+ 14 | F   |  0.535 |    0.405 |  0.145 |    10 | 10.1501121415596 |  10.1501121415596
+ 15 | F   |   0.47 |    0.355 |    0.1 |    10 | 9.97689437628973 |  9.97689437628973
+ 16 | M   |    0.5 |      0.4 |   0.13 |    12 | 10.0633271219326 |  10.0633271219326
+ 17 | I   |  0.355 |     0.28 |  0.085 |     7 | 9.79492924255328 |  9.79492924255328
+ 18 | F   |   0.44 |     0.34 |    0.1 |    10 | 9.94856833428783 |  9.94856833428783
+ 19 | M   |  0.365 |    0.295 |   0.08 |     7 | 9.78278863173308 |  9.78278863173308
+ 20 | M   |   0.45 |     0.32 |    0.1 |     9 | 9.98822477687532 |  9.98822477687532
+(20 rows)
+</pre> Compute the RMS error. Note that this produces a more accurate result 
than the linear case for this data set: <pre class="example">
+SELECT SQRT(AVG((rings-prediction)*(rings-prediction))) AS rms_error
+FROM abalone JOIN abalone_gaussian_regr USING (id);
+</pre> <pre class="result">
+    rms_error
+------------------+
+ 3.83678516581768
+(1 row)
+</pre></li>
+<li>Cross validation. Let's run cross validation for different initial step 
sizes and lambda values: <pre class="example">
+DROP TABLE IF EXISTS abalone_svm_gaussian_regression, abalone_svm_gaussian_regression_summary,
+abalone_svm_gaussian_regression_random, abalone_svm_gaussian_regression_cv;
+SELECT madlib.svm_regression( 'abalone',
+                              'abalone_svm_gaussian_regression',
+                              'rings',
+                              'ARRAY[1, length, diameter, height]',
+                              'gaussian',
+                              'n_components=10',
+                              '',
+                              'init_stepsize=[0.01,1], n_folds=3, max_iter=200, lambda=[0.01, 0.1, 0.5],
+                               validation_result=abalone_svm_gaussian_regression_cv'
+                           );
+\x on
+SELECT * FROM abalone_svm_gaussian_regression;
+</pre> <pre class="result">
+-[ RECORD 1 ]------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+coef               | {4.46074154389204,2.19335800415975,-2.14775901092668,1.06805891149535,2.91168496475457,-3.95521278459095,4.20496790233169,-5.28144330907061,0.427743633754918,-3.58999505728692}
+loss               | 2.68317592175908
+norm_of_gradient   | 0.69852112502746
+num_iterations     | 169
+num_rows_processed | 20
+num_rows_skipped   | 0
+dep_var_mapping    | {NULL}
+</pre> View the summary table; the final model parameters are those that 
produced the lowest error in the cross-validation runs: <pre class="example">
+SELECT * FROM abalone_svm_gaussian_regression_summary;
+</pre> <pre class="result">
+-[ RECORD 1 ]--------+------------------------------------------------------------------------------------
+method               | SVR
+version_number       | 1.15-dev
+source_table         | abalone
+model_table          | abalone_svm_gaussian_regression
+dependent_varname    | rings
+independent_varname  | ARRAY[1, length, diameter, height]
+kernel_func          | gaussian
+kernel_params        | gamma=0.25, n_components=10, random_state=1, fit_intercept=False, fit_in_memory=True
+grouping_col         | NULL
+optim_params         | init_stepsize=1.0,
+                     | decay_factor=0.9,
+                     | max_iter=200,
+                     | tolerance=1e-10,
+                     | epsilon=0.01,
+                     | eps_table=,
+                     | class_weight=
+reg_params           | lambda=0.01, norm=l2, n_folds=3
+num_all_groups       | 1
+num_failed_groups    | 0
+total_rows_processed | 20
+total_rows_skipped   | 0
+</pre> View the statistics for the various cross-validation values (a query 
for pulling the best combination from this table is sketched after this 
list): <pre class="example">
+\x off
+SELECT * FROM abalone_svm_gaussian_regression_cv;
+</pre> <pre class="result">
+ init_stepsize | lambda |   mean_score   | std_dev_score
+---------------+--------+----------------+----------------
+           1.0 |   0.01 | -4.06711568585 | 0.435966381366
+           1.0 |    0.1 | -4.08068428345 |  0.44660797513
+           1.0 |    0.5 | -4.52576046087 |  0.20597876382
+          0.01 |   0.01 | -11.0231044189 | 0.739956548721
+          0.01 |    0.1 | -11.0244799274 | 0.740029346709
+          0.01 |    0.5 | -11.0305445077 | 0.740350338532
+(6 rows)
+</pre></li>
+<li>Predict using the cross-validated Gaussian regression model: <pre 
class="example">
+DROP TABLE IF EXISTS abalone_gaussian_regr;
+SELECT madlib.svm_predict('abalone_svm_gaussian_regression',
+                          'abalone',
+                          'id',
+                          'abalone_gaussian_regr');
+</pre> Compute the RMS error. Note that for this small data set the result is 
essentially the same as the previous run with the Gaussian kernel: <pre class="example">
+SELECT SQRT(AVG((rings-prediction)*(rings-prediction))) AS rms_error
+FROM abalone JOIN abalone_gaussian_regr USING (id);
+</pre> <pre class="result">
+    rms_error
+------------------+
+ 3.84208909699442
+(1 row)
+</pre></li>
+</ol>
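+<p>As referenced in the cross-validation step above, here is a minimal sketch 
for pulling the best parameter combination out of the validation_result table. 
This is plain SQL; recall that mean_score is a negated error, so the maximum 
score is best:</p>
+<pre class="example">
+-- Best (init_stepsize, lambda) pair from the CV runs
+SELECT init_stepsize, lambda, mean_score
+FROM abalone_svm_gaussian_regression_cv
+ORDER BY mean_score DESC
+LIMIT 1;
+</pre>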
+<h4>Novelty Detection</h4>
+<ol type="1">
+<li>Now train a non-linear one-class SVM for novelty detection, using a 
Gaussian kernel. Note that there is no dependent-variable parameter for 
one-class SVM: <pre class="example">
+DROP TABLE IF EXISTS houses_one_class_gaussian, houses_one_class_gaussian_summary, houses_one_class_gaussian_random;
+SELECT madlib.svm_one_class('houses',
+                            'houses_one_class_gaussian',
+                            'ARRAY[1,tax,bedroom,bath,size,lot,price]',
+                            'gaussian',
+                            'gamma=0.5, n_components=55, random_state=3',
+                            NULL,
+                            'max_iter=100, init_stepsize=10, lambda=10, tolerance=0'
+                            );
+\x on
+SELECT * FROM houses_one_class_gaussian;
+</pre> Result: <pre class="result">
+-[ RECORD 1 ]------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+coef               | {redacted for brevity}
+loss               | 0.944016313708205
+norm_of_gradient   | 14.5271059047443
+num_iterations     | 100
+num_rows_processed | 16
+num_rows_skipped   | -1
+dep_var_mapping    | {-1,1}
+</pre></li>
+<li>To test the one-class novelty detection, create a test data set from the 
last 3 rows of the training set plus an outlier at the end (10x the price): 
<pre class="example">
+DROP TABLE IF EXISTS houses_one_class_test;
+CREATE TABLE houses_one_class_test (id INT, tax INT, bedroom INT, bath FLOAT, price INT,
+            size INT, lot INT);
+INSERT INTO houses_one_class_test VALUES
+ (1 , 3100 ,       3 ,    2 , 140000 , 1760 , 38000),
+ (2 , 2070 ,       2 ,    3 , 148000 , 1550 , 14000),
+ (3 ,  650 ,       3 ,  1.5 ,  65000 , 1450 , 12000),
+ (4 ,  650 ,       3 ,  1.5 ,  650000 , 1450 , 12000);
+</pre> Now run prediction on the Gaussian one-class novelty detection model: 
<pre class="example">
+DROP TABLE IF EXISTS houses_pred;
+SELECT madlib.svm_predict('houses_one_class_gaussian',
+                          'houses_one_class_test',
+                          'id',
+                          'houses_pred');
+\x off
+SELECT * FROM houses_one_class_test JOIN houses_pred USING (id) ORDER BY id;
+</pre> The result shows the last row predicted to be novel (a count query 
over this output is sketched after this list): <pre class="result">
+ id | tax  | bedroom | bath | price  | size |  lot  | prediction |  decision_function
+----+------+---------+------+--------+------+-------+------------+---------------------
+  1 | 3100 |       3 |    2 | 140000 | 1760 | 38000 |          1 |   0.111497008121437
+  2 | 2070 |       2 |    3 | 148000 | 1550 | 14000 |          1 |  0.0996021345169148
+  3 |  650 |       3 |  1.5 |  65000 | 1450 | 12000 |          1 |  0.0435064008756942
+  4 |  650 |       3 |  1.5 | 650000 | 1450 | 12000 |         -1 | -0.0168967845338403
+</pre></li>
+</ol>
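+<p>As referenced in the prediction step above, here is a minimal sketch for 
counting the rows flagged as novel. The cast to text is a defensive assumption 
so the comparison works whether the prediction column is returned as an 
integer or as text:</p>
+<pre class="example">
+-- Number of test rows the one-class model flags as novel
+SELECT COUNT(*) FROM houses_pred WHERE prediction::text = '-1';
+</pre>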
+<p><a class="anchor" id="background"></a></p><dl class="section 
user"><dt>Technical Background</dt><dd></dd></dl>
+<p>To solve the linear SVM problem, the following objective function is 
minimized: </p><p class="formulaDsp">
+\[ \underset{w,b}{\text{Minimize }} \lambda||w||^2 + \frac{1}{n}\sum_{i=1}^n \ell(y_i,f_{w,b}(x_i)) \]
+</p>
+<p>where \((x_1,y_1),\ldots,(x_n,y_n)\) are labeled training data and 
\(\ell(y,f(x))\) is a loss function. When performing classification, 
\(\ell(y,f(x)) = \max(0,1-yf(x))\) is the <em>hinge loss</em>. For regression, 
the loss function \(\ell(y,f(x)) = \max(0,|y-f(x)|-\epsilon)\) is used.</p>
+<p>If \( f_{w,b}(x) = \langle w, x\rangle + b\) is linear, then the objective 
function is convex and incremental gradient descent (IGD, or SGD) can be 
applied to find a global minimum. See Feng, et al. [1] for more details.</p>
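+<p>For concreteness, here is a sketch of one standard subgradient step for 
the hinge-loss objective above, with step size \(\eta\) (this is the generic 
IGD update for this objective, not necessarily the exact step-size schedule 
MADlib uses internally): </p><p class="formulaDsp">
+\[ w \leftarrow (1 - 2\eta\lambda)\, w + \eta\, y_i x_i \, \mathbb{1}\!\left[ y_i \langle w, x_i \rangle \lt 1 \right] \]
+</p>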
+<p>To learn with Gaussian or polynomial kernels, the training data is first 
mapped via a <em>random feature map</em> in such a way that the usual inner 
product in the feature space approximates the kernel function in the input 
space. The linear SVM training function is then run on the resulting data. See 
the papers [2,3] for more information on random feature maps.</p>
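+<p>As a concrete illustration of such a map, here is a sketch of the random 
Fourier features of [3] for the Gaussian kernel 
\(k(x,x') = e^{-\gamma \lVert x - x' \rVert^2}\), with \(D\) the number of 
components (i.e. n_components): </p><p class="formulaDsp">
+\[ \phi(x) = \sqrt{\tfrac{2}{D}}\,\big(\cos(\omega_1^T x + b_1), \ldots, \cos(\omega_D^T x + b_D)\big), \quad \omega_i \sim \mathcal{N}(0,\, 2\gamma I), \quad b_i \sim \mathrm{Uniform}(0, 2\pi), \]
+</p>
+<p>so that \(\langle \phi(x), \phi(x') \rangle \approx k(x,x')\) in 
expectation.</p>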
+<p>Also, see the book [4] by Sch&ouml;lkopf and Smola for more details on 
SVMs in general.</p>
+<p><a class="anchor" id="literature"></a></p><dl class="section 
user"><dt>Literature</dt><dd></dd></dl>
+<p><a class="anchor" id="svm-lit-1"></a>[1] Xixuan Feng, Arun Kumar, Ben 
Recht, and Christopher Re: Towards a Unified Architecture for in-RDBMS 
analytics, in SIGMOD Conference, 2012 <a 
href="http://www.eecs.berkeley.edu/~brecht/papers/12.FengEtAl.SIGMOD.pdf";>http://www.eecs.berkeley.edu/~brecht/papers/12.FengEtAl.SIGMOD.pdf</a></p>
+<p><a class="anchor" id="svm-lit-2"></a>[2] Purushottam Kar and Harish 
Karnick: Random Feature Maps for Dot Product Kernels, Proceedings of the 15th 
International Conference on Artificial Intelligence and Statistics, 2012, <a 
href="http://machinelearning.wustl.edu/mlpapers/paper_files/AISTATS2012_KarK12.pdf";>http://machinelearning.wustl.edu/mlpapers/paper_files/AISTATS2012_KarK12.pdf</a></p>
+<p><a class="anchor" id="svm-lit-3"></a>[3] Ali Rahmini and Ben Recht: Random 
Features for Large-Scale Kernel Machines, Neural Information Processing Systems 
2007, <a 
href="http://www.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf";>http://www.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf</a></p>
+<p><a class="anchor" id="svm-lit-4"></a>[4] Bernhard Scholkopf and Alexander 
Smola: Learning with Kernels, The MIT Press, Cambridge, MA, 2002.</p>
+<p><a class="anchor" id="svm-lit-5"></a>[5] Vladimir Cherkassky and Yunqian 
Ma: Practical Selection of SVM Parameters and Noise Estimation for SVM 
Regression, Neural Networks, 2004 <a 
href="http://www.ece.umn.edu/users/cherkass/N2002-SI-SVM-13-whole.pdf";>http://www.ece.umn.edu/users/cherkass/N2002-SI-SVM-13-whole.pdf</a></p>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
Topics</dt><dd></dd></dl>
+<p>File <a class="el" href="svm_8sql__in.html" title="SQL functions for SVM 
(Poisson) ">svm.sql_in</a> documenting the training function</p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Mon Oct 15 2018 11:24:30 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.14 </li>
+  </ul>
+</div>
+</body>
+</html>
