http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/linalg/svd.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/linalg/svd.sql_in b/src/ports/postgres/modules/linalg/svd.sql_in index b6d763b..070f5e4 100644 --- a/src/ports/postgres/modules/linalg/svd.sql_in +++ b/src/ports/postgres/modules/linalg/svd.sql_in @@ -79,17 +79,17 @@ row2 2 {0, 1} </pre> </dd> <dt>output_table_prefix</dt> - <dd>TEXT. Prefix for output tables. See - <a href="#output">Output Tables</a> below for a description + <dd>TEXT. Prefix for output tables. See + <a href="#output">Output Tables</a> below for a description of the convention used.</dd> <dt>row_id</dt> <dd>TEXT. ID for each row.</dd> <dt>k</dt> <dd>INTEGER. Number of singular values to compute.</dd> <dt>n_iterations (optional). </dd> - <dd>INTEGER. Number of iterations to run. - @note The number of iterations must be - in the range [k, column dimension], where + <dd>INTEGER. Number of iterations to run. + @note The number of iterations must be + in the range [k, column dimension], where k is number of singular values.</dd> <dt>result_summary_table (optional)</dt> <dd>TEXT. The name of the table to store the result summary.</dd> @@ -99,7 +99,7 @@ row2 2 {0, 1} <b>SVD Function for Sparse Matrices</b> Use this function for matrices that are represented in the sparse-matrix -format (example below). <b>Note that the input matrix is converted to a +format (example below). <b>Note that the input matrix is converted to a dense matrix before the SVD operation, for efficient computation reasons. </b> <pre class="syntax"> @@ -142,8 +142,8 @@ matrix, indicating that the 4th row and 7th column contain all zeros. </dd> <dt>output_table_prefix</dt> -<dd>TEXT. Prefix for output tables. See - <a href="#output">Output Tables</a> below for a description +<dd>TEXT. Prefix for output tables. 
See + <a href="#output">Output Tables</a> below for a description of the convention used. </dd> <dt>row_id</dt> <dd>TEXT. Name of the column containing the row index for each entry in sparse matrix.</dd> @@ -158,9 +158,9 @@ matrix, indicating that the 4th row and 7th column contain all zeros. <dt>k</dt> <dd>INTEGER. Number of singular values to compute.</dd> <dt>n_iterations (optional)</dt> -<dd>INTEGER. Number of iterations to run. - @note The number of iterations must be - in the range [k, column dimension], where +<dd>INTEGER. Number of iterations to run. + @note The number of iterations must be + in the range [k, column dimension], where k is number of singular values.</dd> <dt>result_summary_table (optional)</dt> <dd>TEXT. The name of the table to store the result summary.</dd> @@ -171,10 +171,10 @@ matrix, indicating that the 4th row and 7th column contain all zeros. <b>Native Implementation for Sparse Matrices</b> Use this function for matrices that are represented in the sparse-matrix -format (see sparse matrix example above). This function uses the +format (see sparse matrix example above). This function uses the native sparse representation while computing the SVD. -@note Note that this function should be favored if the matrix is -highly sparse, since it computes very sparse matrices +@note Note that this function should be favored if the matrix is +highly sparse, since it computes very sparse matrices efficiently. </b> <pre class="syntax"> @@ -195,8 +195,8 @@ svd_sparse_native( source_table, <dt>source_table</dt> <dd>TEXT. Source table name (sparse matrix - see example above).</dd> <dt>output_table_prefix</dt> - <dd>TEXT. Prefix for output tables. See - <a href="#output">Output Tables</a> below for a description + <dd>TEXT. Prefix for output tables. See + <a href="#output">Output Tables</a> below for a description of the convention used.</dd> <dt>row_id</dt> <dd>TEXT. 
ID for each row.</dd> @@ -211,9 +211,9 @@ svd_sparse_native( source_table, <dt>k</dt> <dd>INTEGER. Number of singular values to compute.</dd> <dt>n_iterations (optional)</dt> - <dd>INTEGER. Number of iterations to run. - @note The number of iterations must be - in the range [k, column dimension], where + <dd>INTEGER. Number of iterations to run. + @note The number of iterations must be + in the range [k, column dimension], where k is number of singular values.</dd> <dt>result_summary_table (optional)</dt> <dd>TEXT. Table name to store result summary.</dd> @@ -307,7 +307,7 @@ CREATE TABLE mat ( ); INSERT INTO mat VALUES (1,'{396,840,353,446,318,886,15,584,159,383}'), -(2,'{691,58,899,163,159,533,604,582,269,390}'), +(2,'{691,58,899,163,159,533,604,582,269,390}'), (3,'{293,742,298,75,404,857,941,662,846,2}'), (4,'{462,532,787,265,982,306,600,608,212,885}'), (5,'{304,151,337,387,643,753,603,531,459,652}'), @@ -328,7 +328,7 @@ INSERT INTO mat VALUES <pre class="example"> SELECT madlib.svd( 'mat', -- Input table 'svd', -- Output table prefix - 'row_id', -- Column name with row index + 'row_id', -- Column name with row index 10, -- Number of singular values to compute NULL, -- Use default number of iterations 'svd_summary_table' -- Result summary table @@ -340,7 +340,7 @@ SELECT * FROM svd_s ORDER BY row_id; </pre> Result: <pre class="result"> - row_id | col_id | value + row_id | col_id | value --------+--------+------------------ 1 | 1 | 6475.67225281804 2 | 2 | 1875.18065580415 @@ -352,7 +352,7 @@ Result: 8 | 8 | 709.086240684469 9 | 9 | 462.473775959371 10 | 10 | 365.875217945698 - 10 | 10 | + 10 | 10 | (11 rows) </pre> For the summary table: @@ -361,14 +361,14 @@ SELECT * FROM svd_summary_table; </pre> Result: <pre class="result"> - rows_used | exec_time (ms) | iter | recon_error | relative_recon_error + rows_used | exec_time (ms) | iter | recon_error | relative_recon_error -----------+----------------+------+-------------------+---------------------- 16 | 1332.47 | 
10 | 4.36920148766e-13 | 7.63134130332e-16 (1 row) </pre> -# Create a sparse matrix by running the \ref matrix_sparsify() utility function on the dense matrix. <pre class="example"> -SELECT madlib.matrix_sparsify('mat', +SELECT madlib.matrix_sparsify('mat', 'row=row_id, val=row_vec', 'mat_sparse', 'row=row_id, col=col_id, val=value'); @@ -378,11 +378,11 @@ SELECT madlib.matrix_sparsify('mat', <pre class="example"> SELECT madlib.svd_sparse( 'mat_sparse', -- Input table 'svd', -- Output table prefix - 'row_id', -- Column name with row index - 'col_id', -- Column name with column index + 'row_id', -- Column name with row index + 'col_id', -- Column name with column index 'value', -- Matrix cell value 16, -- Number of rows in matrix - 10, -- Number of columns in matrix + 10, -- Number of columns in matrix 10 -- Number of singular values to compute ); </pre> @@ -391,11 +391,11 @@ SELECT madlib.svd_sparse( 'mat_sparse', -- Input table <pre class="example"> SELECT madlib.svd_sparse_native ( 'mat_sparse', -- Input table 'svd', -- Output table prefix - 'row_id', -- Column name with row index - 'col_id', -- Column name with column index + 'row_id', -- Column name with row index + 'col_id', -- Column name with column index 'value', -- Matrix cell value 16, -- Number of rows in matrix - 10, -- Number of columns in matrix + 10, -- Number of columns in matrix 10 -- Number of singular values to compute ); </pre>
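[Editorial aside on the svd.sql_in docs above: the summary table reports 'recon_error' and 'relative_recon_error' for the truncated SVD. The standalone Python sketch below (not MADlib code; the helper names are illustrative, and the Frobenius norm is assumed as the error measure) shows what the relative reconstruction error captures: for the best rank-k approximation it equals the Frobenius norm of the discarded singular values divided by the Frobenius norm of them all, so keeping all 10 singular values of a rank-10 matrix yields an error near machine precision, as in the example output.]

```python
import math

def frobenius(M):
    """Frobenius norm of a matrix given as a list of rows."""
    return math.sqrt(sum(x * x for row in M for x in row))

def relative_recon_error(singular_values, k):
    """Relative reconstruction error of the best rank-k approximation:
    sqrt(sum of squared discarded singular values) over sqrt(sum of all)."""
    s = sorted(singular_values, reverse=True)
    return (math.sqrt(sum(v * v for v in s[k:]))
            / math.sqrt(sum(v * v for v in s)))

# For a diagonal matrix the singular values are the |diagonal| entries,
# so the best rank-1 approximation keeps only the largest one.
A       = [[5.0, 0.0], [0.0, 3.0]]
A_rank1 = [[5.0, 0.0], [0.0, 0.0]]
residual = [[a - b for a, b in zip(ra, rb)] for ra, rb in zip(A, A_rank1)]

direct = frobenius(residual) / frobenius(A)   # computed from the residual
via_sv = relative_recon_error([5.0, 3.0], 1)  # computed from singular values
print(direct, via_sv)                         # both equal 3 / sqrt(34)
```

With k equal to the full rank, the discarded set is empty and the error is exactly zero in exact arithmetic; the tiny nonzero values MADlib reports come from floating-point roundoff.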
http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/pca/pca.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/pca/pca.sql_in b/src/ports/postgres/modules/pca/pca.sql_in index 6bcce1c..f7ef305 100644 --- a/src/ports/postgres/modules/pca/pca.sql_in +++ b/src/ports/postgres/modules/pca/pca.sql_in @@ -120,12 +120,12 @@ Please refer to the \ref grp_matrix documentation for more details on defining m </DD> <DT>out_table</DT> -<DD>TEXT. The name of the table that will contain the output. +<DD>TEXT. The name of the table that will contain the output. There are three possible output tables as described below. -The primary output table (<em>out_table</em>) encodes the principal components -with the <em>k</em> highest eigenvalues where <em>k</em> is either directly -provided by the user or computed according to the proportion of variance. +The primary output table (<em>out_table</em>) encodes the principal components +with the <em>k</em> highest eigenvalues where <em>k</em> is either directly +provided by the user or computed according to the proportion of variance. The table has the following columns: <table class="output"> <tr> @@ -165,13 +165,13 @@ should be of type INT (or a type that can be cast to INT) and should only contai describing the full matrix.</DD> <DT>col_id</DT> -<DD>TEXT. Column name containing the column IDs in sparse matrix representation. -The column should be of type INT (or a type that can be cast to INT) -and should only contain values between 1 and <em>M</em>. <em>This parameter applies to +<DD>TEXT. Column name containing the column IDs in sparse matrix representation. +The column should be of type INT (or a type that can be cast to INT) +and should only contain values between 1 and <em>M</em>. <em>This parameter applies to sparse matrices only.</em></DD> <DT>val_id</DT> -<DD>TEXT. 
Name of 'val_id' column in sparse matrix representation defining the values of the nonzero entries. +<DD>TEXT. Name of 'val_id' column in sparse matrix representation defining the values of the nonzero entries. <em>This parameter applies to sparse matrices only.</em></DD> <DT>row_dim</DT> @@ -183,35 +183,35 @@ it would have. <DT>col_dim</DT> <DD>INTEGER. The actual number of columns in the matrix. That is, if the matrix was transformed into dense format, this is the number of columns -it would have. +it would have. <em>This parameter applies to sparse matrices only.</em></DD> -@note The parameters 'row_dim' and 'col_dim' could actually be inferred from the +@note The parameters 'row_dim' and 'col_dim' could actually be inferred from the sparse matrix representation, so they will be removed in the future. For now they are maintained for backward compatibility so you must enter them. -Making 'row_dim' or 'col_dim' larger than the actual matrix has the effect of padding it with +Making 'row_dim' or 'col_dim' larger than the actual matrix has the effect of padding it with zeros, which is probably not useful. <DT>components_param</DT> -<DD>INTEGER or FLOAT. The parameter to control the number of principal -components to calculate from the input data. If 'components_param' is INTEGER, -it is used to denote the number of principal components (<em>k</em>) to -compute. If 'components_param' is FLOAT, the algorithm will return enough -principal vectors so that the ratio of the sum of the eigenvalues collected -thus far to the sum of all eigenvalues is greater than this parameter -(proportion of variance). The value of 'components_param' must be either +<DD>INTEGER or FLOAT. The parameter to control the number of principal +components to calculate from the input data. If 'components_param' is INTEGER, +it is used to denote the number of principal components (<em>k</em>) to +compute.
If 'components_param' is FLOAT, the algorithm will return enough +principal vectors so that the ratio of the sum of the eigenvalues collected +thus far to the sum of all eigenvalues is greater than this parameter +(proportion of variance). The value of 'components_param' must be either a positive INTEGER or a FLOAT in the range (0.0,1.0]</DD> -@note The difference in interpretation between INTEGER and FLOAT was +@note The difference in interpretation between INTEGER and FLOAT was introduced to maintain backward compatibility after the proportion of variance feature was introduced. A special case to be aware of: 'components_param' = 1 (INTEGER) will return 1 principal -component, but 'components_param' = 1.0 (FLOAT) will return all +component, but 'components_param' = 1.0 (FLOAT) will return all principal components, i.e., proportion of variance of 100%. \n \n Also, please note that the number of principal components (<em>k</em>) is global, even in the case where grouping is used (see 'grouping_cols' -below). In the case of grouping, proportion of variance +below). In the case of grouping, proportion of variance might be a better choice; this could result in different numbers of principal components for different groups. @@ -220,28 +220,28 @@ of principal components for different groups. source data grouped using the combination of all the columns. An independent PCA model will be computed for each combination of the grouping columns.</DD> -@note Dense matrices can be different sizes for different groups if desired. +@note Dense matrices can be different sizes for different groups if desired. Sparse matrices cannot be different sizes for different groups, -because the 'row_dim' and 'col_dim' parameters used for sparse matrices +because the 'row_dim' and 'col_dim' parameters used for sparse matrices are global across all groups.
<DT>lanczos_iter (optional)</DT> <DD>INTEGER, default: minimum of {<em>k+40</em>, smallest matrix dimension} -where <em>k</em> is the number of principal components specified in the -parameter 'components_param'. This parameter defines the +where <em>k</em> is the number of principal components specified in the +parameter 'components_param'. This parameter defines the number of Lanczos iterations for the SVD calculation. The Lanczos iteration number roughly corresponds to the accuracy of the SVD calculation, and a higher number of iterations corresponds to greater accuracy but longer computation time. The number of iterations must be at least as -large as the value of <em>k</em>, but no larger than the smallest dimension -of the matrix. If the number of iterations is set to zero, then the default +large as the value of <em>k</em>, but no larger than the smallest dimension +of the matrix. If the number of iterations is set to zero, then the default number of iterations will be used.</DD> -@note If both 'lanczos_iter' and proportion of variance (via -the 'components_param' parameter) are defined, 'lanczos_iter' -will take precedence in determining the number of -principal components (i.e. the number of principal components -will not be greater than 'lanczos_iter' even if the target +@note If both 'lanczos_iter' and proportion of variance (via +the 'components_param' parameter) are defined, 'lanczos_iter' +will take precedence in determining the number of +principal components (i.e. the number of principal components +will not be greater than 'lanczos_iter' even if the target proportion had not been reached). 
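[Editorial aside on the pca.sql_in docs above: the selection rule described for 'components_param' and its interaction with 'lanczos_iter' can be sketched as follows. This is a standalone Python illustration, not MADlib's implementation; the function name is hypothetical, and reading "greater than this parameter" as >= (so that 1.0 returns all components, as the @note states) is an assumption.]

```python
def choose_k(eigenvalues, components_param, lanczos_iter=None):
    """Pick the number of principal components as the docs describe:
    an INTEGER asks for exactly that many components, while a FLOAT in
    (0.0, 1.0] asks for the smallest k whose cumulative share of the
    total eigenvalue sum reaches that proportion of variance.
    (Hypothetical helper, not a MADlib function.)"""
    ev = sorted(eigenvalues, reverse=True)
    if isinstance(components_param, int):
        k = components_param
    else:
        total = sum(ev)
        running, k = 0.0, 0
        for v in ev:
            running += v
            k += 1
            if running / total >= components_param:  # >= so 1.0 keeps all
                break
    # 'lanczos_iter' takes precedence: the number of components returned
    # can never exceed the number of Lanczos iterations.
    if lanczos_iter is not None:
        k = min(k, lanczos_iter)
    return k

ev = [4.0, 2.0, 1.0, 1.0]                    # example eigenvalue spectrum
print(choose_k(ev, 2))                       # INTEGER: exactly 2 components
print(choose_k(ev, 0.7))                     # FLOAT: 4/8=0.5, 6/8=0.75 -> k=2
print(choose_k(ev, 1.0))                     # FLOAT 1.0: all 4 components
print(choose_k(ev, 1.0, lanczos_iter=3))     # capped at 3 by lanczos_iter
```

This also makes the special case in the @note concrete: `choose_k(ev, 1)` returns one component, while `choose_k(ev, 1.0)` returns them all.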
<DT>use_correlation (optional)</DT> @@ -315,7 +315,7 @@ SELECT madlib.pca_train('mat', -- Source table SELECT * FROM result_table ORDER BY row_id; </pre> <pre class="result"> - row_id | principal_components | std_dev | proportion + row_id | principal_components | std_dev | proportion --------+--------------------------------------------------------------+-------------------+------------------- 1 | {0.707106781186547,-6.93889390390723e-18,-0.707106781186548} | 1.41421356237309 | 0.857142857142244 2 | {0,1,0} | 0.577350269189626 | 0.142857142857041 @@ -333,7 +333,7 @@ SELECT madlib.pca_train('mat', -- Source table SELECT * FROM result_table ORDER BY row_id; </pre> <pre class="result"> - row_id | principal_components | std_dev | proportion + row_id | principal_components | std_dev | proportion --------+--------------------------------------------------------------+-------------------+------------------- 1 | {0.707106781186548,-2.77555756156289e-17,-0.707106781186548} | 1.4142135623731 | 0.857142857142245 2 | {-1.11022302462516e-16,-1,0} | 0.577350269189626 | 0.142857142857041 @@ -342,7 +342,7 @@ SELECT * FROM result_table ORDER BY row_id; -# Now we use grouping in dense form to learn different models for different groups. First, we create sample data in dense matrix form with a grouping column. 
-Note we actually have different matrix sizes for the different groups, which +Note we actually have different matrix sizes for the different groups, which is allowed for dense: <pre class="example"> DROP TABLE IF EXISTS mat_group; @@ -371,7 +371,7 @@ SELECT madlib.pca_train('mat_group', -- Source table SELECT * FROM result_table_group ORDER BY matrix_id, row_id; </pre> <pre class="result"> - row_id | principal_components | std_dev | proportion | matrix_id + row_id | principal_components | std_dev | proportion | matrix_id --------+------------------------------------------------------------------------------------------------+-----------------+-------------------+----------- 1 | {0.707106781186548,0,-0.707106781186547} | 1.4142135623731 | 0.857142857142245 | 1 1 | {-0.555378486712784,-0.388303582074091,0.0442457354870796,0.255566375612852,0.688115693174023} | 3.2315220311722 | 0.764102534485173 | 2 @@ -397,17 +397,17 @@ INSERT INTO mat_sparse VALUES (3, 2, 7.0), (4, 3, 8.0); </pre> -As an aside, this is what the sparse matrix above looks like when +As an aside, this is what the sparse matrix above looks like when put in dense form: <pre class="example"> DROP TABLE IF EXISTS mat_dense; -SELECT madlib.matrix_densify('mat_sparse', - 'row=row_id, col=col_id, val=value', +SELECT madlib.matrix_densify('mat_sparse', + 'row=row_id, col=col_id, val=value', 'mat_dense'); SELECT * FROM mat_dense ORDER BY row_id; </pre> <pre class="result"> - row_id | value + row_id | value --------+------------- 1 | {1,0,0,0,5} 2 | {0,2,0,6,0} @@ -425,12 +425,12 @@ SELECT madlib.pca_sparse_train( 'mat_sparse', -- Source table 'value', -- Value of matrix at row_id, col_id 4, -- Actual number of rows in the matrix 5, -- Actual number of columns in the matrix - 3); -- Number of principal components + 3); -- Number of principal components SELECT * FROM result_table ORDER BY row_id; </pre> Result (with principal components truncated for readability): <pre class="result"> - row_id | 
principal_components | std_dev | proportion + row_id | principal_components | std_dev | proportion --------+----------------------------------------------+------------------+------------------- 1 | {-0.0876046030186158,-0.0968983772909994,... | 4.21362803829554 | 0.436590030617467 2 | {-0.0647272661608605,0.877639526308692,... | 3.68408023747461 | 0.333748701544697 @@ -475,7 +475,7 @@ SELECT * FROM result_table_group ORDER BY matrix_id, row_id; </pre> Result (with principal components truncated for readability): <pre class="result"> - row_id | principal_components | std_dev | proportion | matrix_id + row_id | principal_components | std_dev | proportion | matrix_id --------+--------------------------------------------+------------------+-------------------+----------- 1 | {-0.17805696611353,0.0681313257646983,... | 2.73659933165925 | 0.544652792875481 | 1 2 | {-0.0492086814863993,0.149371585357526,... | 2.06058314533194 | 0.308800210823714 | 1 @@ -490,7 +490,7 @@ Result (with principal components truncated for readability): - Table names can be optionally schema qualified (current_schemas() would be searched if a schema name is not provided) and all table and column names should follow case-sensitivity and quoting rules per the database. -(For instance, 'mytable' and 'MyTable' both resolve to the same entity, i.e. +(For instance, 'mytable' and 'MyTable' both resolve to the same entity, i.e. 'mytable'. If mixed-case or multi-byte characters are desired for entity names then the string should be double-quoted; in this case the input would be '"MyTable"'). @@ -504,14 +504,14 @@ performance improvement in using sparse matrix input over dense matrix input. - For the parameter 'components_param', INTEGER and FLOAT are interpreted differently. 
A special case to be aware of: 'components_param' = 1 (INTEGER) will return 1 principal -component, but 'components_param' = 1.0 (FLOAT) will return all +component, but 'components_param' = 1.0 (FLOAT) will return all principal components, i.e., proportion of variance of 100%. -- If both 'lanczos_iter' and proportion of variance (via -the 'components_param' parameter) are defined, 'lanczos_iter' -will take precedence in determining the number of -principal components (i.e. the number of principal components -will not be greater than 'lanczos_iter' even if the target +- If both 'lanczos_iter' and proportion of variance (via +the 'components_param' parameter) are defined, 'lanczos_iter' +will take precedence in determining the number of +principal components (i.e. the number of principal components +will not be greater than 'lanczos_iter' even if the target proportion had not been reached). @anchor background_pca @@ -802,7 +802,7 @@ MADLIB_SCHEMA.pca_sparse_train( grouping_cols TEXT, -- Comma-separated list of grouping columns (Default: NULL) lanczos_iter INTEGER, -- The number of Lanczos iterations for the SVD calculation (Default: min(k+40, smallest Matrix dimension)) use_correlation BOOLEAN, -- If True correlation matrix is used for principal components (Default: False) - result_summary_table TEXT -- Table name to store summary of results (Default: NULL) + result_summary_table TEXT -- Table name to store summary of results (Default: NULL) ) RETURNS VOID AS $$ SELECT MADLIB_SCHEMA.pca_sparse_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, NULL) @@ -895,7 +895,7 @@ MADLIB_SCHEMA.pca_sparse_train( grouping_cols TEXT, -- Comma-separated list of grouping columns (Default: NULL) lanczos_iter INTEGER, -- The number of Lanczos iterations for the SVD calculation (Default: min(k+40, smallest Matrix dimension)) use_correlation BOOLEAN, -- If True correlation matrix is used for principal components (Default: False) - result_summary_table TEXT -- Table name to store 
summary of results (Default: NULL) + result_summary_table TEXT -- Table name to store summary of results (Default: NULL) ) RETURNS VOID AS $$ SELECT MADLIB_SCHEMA.pca_sparse_train($1, $2, $3, $4, $5, $6, $7, NULL, $9, $10, $11, $12, $8) http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/pca/pca_project.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/pca/pca_project.sql_in b/src/ports/postgres/modules/pca/pca_project.sql_in index 1023569..27dbd01 100644 --- a/src/ports/postgres/modules/pca/pca_project.sql_in +++ b/src/ports/postgres/modules/pca/pca_project.sql_in @@ -33,7 +33,7 @@ procedure. Principal component projection is a mathematical procedure that projects high dimensional data onto a lower dimensional space. This lower dimensional space is defined by the \f$ k \f$ principal components with the highest variance in -the training data. +the training data. More details on the mathematics of PCA can be found in \ref grp_pca_train and some details about principal component projection calculations @@ -109,13 +109,13 @@ should be of type INT (or a type that can be cast to INT) and should only contai describing the full matrix.</DD> <DT>col_id</DT> -<DD>TEXT. Column name containing the column IDs in sparse matrix representation. -The column should be of type INT (or a type that can be cast to INT) -and should only contain values between 1 and <em>M</em>. <em>This parameter applies to +<DD>TEXT. Column name containing the column IDs in sparse matrix representation. +The column should be of type INT (or a type that can be cast to INT) +and should only contain values between 1 and <em>M</em>. <em>This parameter applies to sparse matrices only.</em></DD> <DT>val_id</DT> -<DD>TEXT. Name of 'val_id' column in sparse matrix representation defining the values of the nonzero entries. +<DD>TEXT. 
Name of 'val_id' column in sparse matrix representation defining the values of the nonzero entries. <em>This parameter applies to sparse matrices only.</em></DD> <DT>row_dim</DT> @@ -127,13 +127,13 @@ it would have. <DT>col_dim</DT> <DD>INTEGER. The actual number of columns in the matrix. That is, if the matrix was transformed into dense format, this is the number of columns -it would have. +it would have. <em>This parameter applies to sparse matrices only.</em></DD> -@note The parameters 'row_dim' and 'col_dim' could actually be inferred from the +@note The parameters 'row_dim' and 'col_dim' could actually be inferred from the sparse matrix representation, so they will be removed in the future. For now they are maintained for backward compatibility so you must enter them. -Making 'row_dim' or 'col_dim' larger than the actual matrix has the effect of padding it with +Making 'row_dim' or 'col_dim' larger than the actual matrix has the effect of padding it with zeros, which is probably not useful.
<DT>residual_table (optional)</DT> @@ -202,7 +202,7 @@ SELECT madlib.pca_train('mat', -- Source table SELECT * FROM result_table ORDER BY row_id; </pre> <pre class="result"> - row_id | principal_components | std_dev | proportion + row_id | principal_components | std_dev | proportion --------+--------------------------------------------------------------+-------------------+------------------- 1 | {0.707106781186547,-6.93889390390723e-18,-0.707106781186548} | 1.41421356237309 | 0.857142857142244 2 | {0,1,0} | 0.577350269189626 | 0.142857142857041 @@ -222,7 +222,7 @@ SELECT madlib.pca_project( 'mat', SELECT * FROM out_table ORDER BY row_id; </pre> <pre class="result"> - row_id | row_vec + row_id | row_vec --------+-------------------------------------- 1 | {-1.41421356237309,-0.33333333333} 2 | {2.77555756157677e-17,0.66666666667} @@ -234,7 +234,7 @@ Check the error in the projection: SELECT * FROM result_summary_table; </pre> <pre class="result"> - exec_time | residual_norm | relative_residual_norm + exec_time | residual_norm | relative_residual_norm ---------------+-------------------+------------------------ 331.792116165 | 5.89383520611e-16 | 9.68940539229e-17 (1 row) @@ -244,7 +244,7 @@ Check the residuals: SELECT * FROM residual_table ORDER BY row_id; </pre> <pre class="result"> - row_id | row_vec + row_id | row_vec --------+-------------------------------------------------------------------- 1 | {-2.22044604925031e-16,-1.11022302462516e-16,3.33066907387547e-16} 2 | {-1.12243865646685e-18,0,4.7381731349413e-17} @@ -254,7 +254,7 @@ SELECT * FROM residual_table ORDER BY row_id; -# Now we use grouping in dense form to learn different models for different groups. First, we create sample data in dense matrix form with a grouping column. 
-Note we actually have different matrix sizes for the different groups, which +Note we actually have different matrix sizes for the different groups, which is allowed for dense: <pre class="example"> DROP TABLE IF EXISTS mat_group; @@ -283,7 +283,7 @@ SELECT madlib.pca_train('mat_group', -- Source table SELECT * FROM result_table_group ORDER BY matrix_id, row_id; </pre> <pre class="result"> - row_id | principal_components | std_dev | proportion | matrix_id + row_id | principal_components | std_dev | proportion | matrix_id --------+------------------------------------------------------------------------------------------------+-----------------+-------------------+----------- 1 | {0.707106781186548,0,-0.707106781186547} | 1.4142135623731 | 0.857142857142245 | 1 1 | {-0.555378486712784,-0.388303582074091,0.0442457354870796,0.255566375612852,0.688115693174023} | 3.2315220311722 | 0.764102534485173 | 2 @@ -291,9 +291,9 @@ SELECT * FROM result_table_group ORDER BY matrix_id, row_id; (3 rows) </pre> --# Run the PCA projection on subsets of an input table based on grouping columns. -Note that the parameter 'pc_table' used for projection must be generated in training -using the same grouping columns. +-# Run the PCA projection on subsets of an input table based on grouping columns. +Note that the parameter 'pc_table' used for projection must be generated in training +using the same grouping columns. 
<pre class="example"> DROP TABLE IF EXISTS mat_group_projected; SELECT madlib.pca_project('mat_group', @@ -303,7 +303,7 @@ SELECT madlib.pca_project('mat_group', SELECT * FROM mat_group_projected ORDER BY matrix_id, row_id; </pre> <pre class="result"> - row_id | row_vec | matrix_id + row_id | row_vec | matrix_id --------+---------------------------------------+----------- 1 | {1.4142135623731} | 1 2 | {7.40148683087139e-17} | 1 @@ -332,17 +332,17 @@ INSERT INTO mat_sparse VALUES (3, 2, 7.0), (4, 3, 8.0); </pre> -As an aside, this is what the sparse matrix above looks like when +As an aside, this is what the sparse matrix above looks like when put in dense form: <pre class="example"> DROP TABLE IF EXISTS mat_dense; -SELECT madlib.matrix_densify('mat_sparse', - 'row=row_id, col=col_id, val=value', +SELECT madlib.matrix_densify('mat_sparse', + 'row=row_id, col=col_id, val=value', 'mat_dense'); SELECT * FROM mat_dense ORDER BY row_id; </pre> <pre class="result"> - row_id | value + row_id | value --------+------------- 1 | {1,0,0,0,5} 2 | {0,2,0,6,0} @@ -360,12 +360,12 @@ SELECT madlib.pca_sparse_train( 'mat_sparse', -- Source table 'value', -- Value of matrix at row_id, col_id 4, -- Actual number of rows in the matrix 5, -- Actual number of columns in the matrix - 3); -- Number of principal components + 3); -- Number of principal components SELECT * FROM result_table ORDER BY row_id; </pre> Result (with principal components truncated for readability): <pre class="result"> - row_id | principal_components | std_dev | proportion + row_id | principal_components | std_dev | proportion --------+----------------------------------------------+------------------+------------------- 1 | {-0.0876046030186158,-0.0968983772909994,... | 4.21362803829554 | 0.436590030617467 2 | {-0.0647272661608605,0.877639526308692,... 
| 3.68408023747461 | 0.333748701544697 @@ -389,7 +389,7 @@ SELECT madlib.pca_sparse_project( SELECT * FROM mat_sparse_out ORDER BY row_id; </pre> <pre class="result"> - row_id | row_vec + row_id | row_vec --------+--------------------------------------------------------- 1 | {4.66617015032369,-2.63552220635847,2.1865220849604} 2 | {0.228360685652383,-1.21616275892926,-4.46864627611561} @@ -435,7 +435,7 @@ SELECT * FROM result_table_group ORDER BY matrix_id, row_id; </pre> Result (with principal components truncated for readability): <pre class="result"> - row_id | principal_components | std_dev | proportion | matrix_id + row_id | principal_components | std_dev | proportion | matrix_id --------+--------------------------------------------+------------------+-------------------+----------- 1 | {-0.17805696611353,0.0681313257646983,... | 2.73659933165925 | 0.544652792875481 | 1 2 | {-0.0492086814863993,0.149371585357526,... | 2.06058314533194 | 0.308800210823714 | 1 @@ -460,7 +460,7 @@ SELECT madlib.pca_sparse_project( SELECT * FROM mat_sparse_group_projected ORDER BY matrix_id, row_id; </pre> <pre class="result"> - row_id | row_vec | matrix_id + row_id | row_vec | matrix_id --------+-----------------------------------------+----------- 1 | {-4.00039298524261,-0.626820612715982} | 1 2 | {0.765350785238575,0.951348276645455} | 1 http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in index 5926152..c015556 100644 --- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in +++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in @@ -293,8 +293,8 @@ tree_train( Importance values are displayed as raw values as per the 'split_criterion' 
parameter. To see importance values normalized to sum to 100 across - all variables, use the importance display helper function - described later on this page. + all variables, use the importance display helper function + described later on this page. Please refer to [1] for more information on variable importance. </td> </tr> http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in index 5b5a0f0..795cc48 100644 --- a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in +++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in @@ -164,9 +164,9 @@ forest_train(training_table_name, Due to nature of permutation, the importance value can end up being negative if the number of levels for a categorical variable is small and is unbalanced. In such a scenario, the importance values are shifted to ensure - that the lowest importance value is 0. To see importance values normalized - to sum to 100 across all variables, use the importance display helper function - described later on this page. + that the lowest importance value is 0. To see importance values normalized + to sum to 100 across all variables, use the importance display helper function + described later on this page. 
</DD> http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/regress/linear.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/regress/linear.sql_in b/src/ports/postgres/modules/regress/linear.sql_in index 68c9020..afc14d5 100644 --- a/src/ports/postgres/modules/regress/linear.sql_in +++ b/src/ports/postgres/modules/regress/linear.sql_in @@ -157,7 +157,7 @@ in fact the same. <DD>TEXT. Expression list to evaluate for the independent variables. An intercept variable is not assumed. It is common to provide an explicit intercept term by including a single constant <tt>1</tt> term in the independent variable list.</DD> <DT>grouping_cols (optional)</DT> -<DD>TEXT, default: NULL. An expression list used to group the input dataset into discrete groups, running one regression per group. Similar to the SQL <tt>GROUP BY</tt> clause. When this value is null, no grouping is used and a +<DD>TEXT, default: NULL. An expression list used to group the input dataset into discrete groups, running one regression per group. Similar to the SQL <tt>GROUP BY</tt> clause. When this value is null, no grouping is used and a single result model is generated for the whole data set.</DD> <DT>heteroskedasticity_option (optional)</DT> @@ -200,7 +200,7 @@ as was used for the training. </dd> DROP TABLE IF EXISTS houses; CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT, size INT, lot INT); -INSERT INTO houses VALUES +INSERT INTO houses VALUES (1 , 590 , 2 , 1 , 50000 , 770 , 22100), (2 , 1050 , 3 , 2 , 85000 , 1410 , 12000), (3 , 20 , 3 , 1 , 22500 , 1060 , 3500), @@ -217,7 +217,7 @@ INSERT INTO houses VALUES (14 , 2070 , 2 , 3 , 148000 , 1550 , 14000), (15 , 650 , 3 , 1.5 , 65000 , 1450 , 12000); </pre> --# Train a regression model. First, we generate a single regression for all data. +-# Train a regression model. First, we generate a single regression for all data. 
<pre class="example"> DROP TABLE IF EXISTS houses_linregr, houses_linregr_summary; SELECT madlib.linregr_train( 'houses', @@ -226,9 +226,9 @@ SELECT madlib.linregr_train( 'houses', 'ARRAY[1, tax, bath, size]' ); </pre> -(Note that in this example we are dynamically creating the array of independent variables -from column names. If you have large numbers of independent variables beyond the PostgreSQL -limit of maximum columns per table, you would pre-build the arrays and store them in a +(Note that in this example we are dynamically creating the array of independent variables +from column names. If you have large numbers of independent variables beyond the PostgreSQL +limit of maximum columns per table, you would pre-build the arrays and store them in a single column.) -# Next we generate three output models, one for each value of "bedroom". <pre class="example"> @@ -271,7 +271,7 @@ FROM houses_linregr; </pre> Result: <pre class="result"> - attribute | coefficient | standard_error | t_stat | pvalue + attribute | coefficient | standard_error | t_stat | pvalue -----------+-------------------+------------------+-------------------+-------------------- intercept | -12849.4168959872 | 33453.0344331391 | -0.38410317968819 | 0.708223134615422 tax | 28.9613922651765 | 15.8992104963997 | 1.82156166004184 | 0.0958005827189772 @@ -292,7 +292,7 @@ coef | {0.0112536020318378,41.4132554771633,0.02250720406367 r2 | 1 std_err | {0,0,0,0} t_stats | {Infinity,Infinity,Infinity,Infinity} -p_values | +p_values | condition_no | Infinity num_rows_processed | 1 num_missing_rows_skipped | 0 @@ -336,7 +336,7 @@ FROM houses, houses_linregr m ORDER BY id; </pre> Result: <pre class="result"> - id | tax | bedroom | bath | price | size | lot | predict | residual + id | tax | bedroom | bath | price | size | lot | predict | residual ----+------+---------+------+--------+------+-------+------------------+------------------- 1 | 590 | 2 | 1 | 50000 | 770 | 22100 | 53317.4426965542 | 
-3317.44269655424 2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 | 109152.124955627 | -24152.1249556268 @@ -356,7 +356,7 @@ Result: (15 rows) </pre> --# Compare predicted price with actual with grouping. +-# Compare predicted price with actual with grouping. It means a different model is used depending on the number of bedrooms. <pre class="example"> \\x OFF @@ -366,7 +366,7 @@ SELECT houses.*, ) as predict, price - madlib.linregr_predict( m.coef, - ARRAY[1,tax,bath,size] + ARRAY[1,tax,bath,size] ) as residual FROM houses, houses_linregr_bedroom m WHERE houses.bedroom = m.bedroom @@ -374,7 +374,7 @@ ORDER BY id; </pre> Result: <pre class="result"> - id | tax | bedroom | bath | price | size | lot | predict | residual + id | tax | bedroom | bath | price | size | lot | predict | residual ----+------+---------+------+--------+------+-------+------------------+------------------- 1 | 590 | 2 | 1 | 50000 | 770 | 22100 | 43223.5393423978 | 6776.46065760222 2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 | 111527.609949684 | -26527.609949684 http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/regress/logistic.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/regress/logistic.sql_in b/src/ports/postgres/modules/regress/logistic.sql_in index 6ce347a..e15b416 100644 --- a/src/ports/postgres/modules/regress/logistic.sql_in +++ b/src/ports/postgres/modules/regress/logistic.sql_in @@ -115,8 +115,8 @@ logregr_train( source_table, <tr> <th>num_missing_rows_skipped</th> - <td>INTEGER. The number of rows skipped during the training. - A row will be skipped if the independent_varname is NULL or + <td>INTEGER. The number of rows skipped during the training. 
+ A row will be skipped if the independent_varname is NULL or contains NULL values.</td> </tr> @@ -162,7 +162,7 @@ logregr_train( source_table, <tr> <th>optimizer_params</th> - <td>A string that contains all the optimizer parameters, and + <td>A string that contains all the optimizer parameters, and has the form of 'optimizer=..., max_iter=..., tolerance=...'</td> </tr> @@ -234,7 +234,7 @@ logregr_train( source_table, <DD>FLOAT8, default: 0.0001. The difference between log-likelihood values in successive iterations that indicate convergence. A zero disables the convergence criterion, so that execution - stops after the maximum iterations have completed, as set in the 'max_iter' + stops after the maximum iterations have completed, as set in the 'max_iter' parameter above.</DD> <DT>verbose (optional)</DT> @@ -292,8 +292,8 @@ DROP TABLE IF EXISTS patients; CREATE TABLE patients( id INTEGER NOT NULL, second_attack INTEGER, treatment INTEGER, - trait_anxiety INTEGER); -INSERT INTO patients VALUES + trait_anxiety INTEGER); +INSERT INTO patients VALUES (1, 1, 1, 70), (2, 1, 1, 80), (3, 1, 1, 50), @@ -327,9 +327,9 @@ SELECT madlib.logregr_train( 'patients', -- Source t 'irls' -- Optimizer to use ); </pre> -Note that in the example above we are dynamically creating the array of independent variables -from column names. If you have large numbers of independent variables beyond the PostgreSQL -limit of maximum columns per table, you would typically pre-build the arrays and store them in a +Note that in the example above we are dynamically creating the array of independent variables +from column names. If you have large numbers of independent variables beyond the PostgreSQL +limit of maximum columns per table, you would typically pre-build the arrays and store them in a single column. -# View the regression results. 
<pre class="example"> @@ -365,7 +365,7 @@ SELECT unnest(array['intercept', 'treatment', 'trait_anxiety']) as attribute, </pre> Result: <pre class="result"> - attribute | coefficient | standard_error | z_stat | pvalue | odds_ratio + attribute | coefficient | standard_error | z_stat | pvalue | odds_ratio ---------------+-------------------+--------------------+--------------------+--------------------+--------------------- intercept | -6.36346994178192 | 3.21389766375099 | -1.97998524145757 | 0.0477051870698145 | 0.00172337630923221 treatment | -1.02410605239327 | 1.17107844860319 | -0.874498248699539 | 0.381846973530455 | 0.359117354054956 @@ -387,7 +387,7 @@ ORDER BY p.id; </pre> Result: <pre class="result"> - id | logregr_predict | second_attack + id | logregr_predict | second_attack ----+-----------------+--------------- 1 | t | t 2 | t | t @@ -423,7 +423,7 @@ ORDER BY p.id; </pre> Result: <pre class="result"> - id | logregr_predict_prob | second_attack + id | logregr_predict_prob | second_attack ----+----------------------+--------------- 1 | 0.720223028941527 | t 2 | 0.894354902502048 | t @@ -547,7 +547,7 @@ than 1000) indicates the presence of significant multicollinearity. @anchor literature @literature -A selection of references pertaining to logistic regression, +A selection of references pertaining to logistic regression, with some good pointers to other literature. 
[1] Cosma Shalizi: Statistics 36-350: Data Mining, Lecture Notes, 18 November http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/regress/test/clustered.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/regress/test/clustered.sql_in b/src/ports/postgres/modules/regress/test/clustered.sql_in index c8bc74b..4687516 100644 --- a/src/ports/postgres/modules/regress/test/clustered.sql_in +++ b/src/ports/postgres/modules/regress/test/clustered.sql_in @@ -571,8 +571,8 @@ INSERT INTO abalone1 VALUES DROP TABLE IF EXISTS abalone2; create table abalone2 as -select - length, diameter, height, whole, sex, +select + length, diameter, height, whole, sex, (case when rings < 10 then 1 else 0 end)::integer as y from abalone1; @@ -647,8 +647,8 @@ INSERT INTO abalone1_with_null SELECT * FROM abalone1_all_null; DROP TABLE IF EXISTS abalone2_with_null; create table abalone2_with_null as -select - length, diameter, height, whole, sex, +select + length, diameter, height, whole, sex, (rings < 10)::integer as y from abalone1_with_null; http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/stats/correlation.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/stats/correlation.sql_in b/src/ports/postgres/modules/stats/correlation.sql_in index 3bf3e46..a8a7a7e 100644 --- a/src/ports/postgres/modules/stats/correlation.sql_in +++ b/src/ports/postgres/modules/stats/correlation.sql_in @@ -25,17 +25,17 @@ m4_include(`SQLCommon.m4') </ul> </div> -@brief Generates a covariance or Pearson correlation matrix for pairs +@brief Generates a covariance or Pearson correlation matrix for pairs of numeric columns in a table. A correlation function is the degree and direction of association of two -variables—how well one random variable can be predicted from the other. 
+variables—how well one random variable can be predicted from the other. It is a normalized version of covariance. The Pearson correlation coefficient is used here, which has a value between -1 and 1, -where 1 implies total positive linear correlation, 0 means no linear correlation, +where 1 implies total positive linear correlation, 0 means no linear correlation, and -1 means total negative linear correlation. -This function generates an \f$N\f$x\f$N\f$ cross correlation matrix for +This function generates an \f$N\f$x\f$N\f$ cross correlation matrix for pairs of numeric columns in a <em>source_table</em>. It is square symmetrical with the \f$ (i,j) \f$th element equal to the correlation coefficient between the \f$i\f$th and the \f$j\f$th variable. The diagonal elements (correlations of @@ -74,14 +74,14 @@ covariance( source_table, <dt>output_table</dt> <dd>TEXT. Name of the table containing the cross correlation matrix. -The output table has N rows, where N is the number -of '<em>target_cols</em>' in the '<em>source_table'</em> for which -correlation or covariance is being computed. +The output table has N rows, where N is the number +of '<em>target_cols</em>' in the '<em>source_table'</em> for which +correlation or covariance is being computed. It has the following columns: <table class="output"> <tr> <th>column_position</th> -<td>An automatically generated sequential counter indicating the order of the +<td>An automatically generated sequential counter indicating the order of the variable in the '<em>output_table</em>'.</td> </tr> <tr> @@ -111,11 +111,11 @@ is also created, which has the following columns: <tr><th>method</th><td>'Correlation' or 'Covariance'</td></tr> <tr><th>source_table</th><td>VARCHAR. Data source table name.</td></tr> <tr><th>output_table</th><td>VARCHAR. Output table name.</td></tr> -<tr><th>column_names</th><td>VARCHAR. Column names used for correlation +<tr><th>column_names</th><td>VARCHAR. 
Column names used for correlation computation, as a comma-separated string.</td></tr> <tr><th>grouping_cols</th> <td>Contains the grouping columns, if any.</td></tr> -<tr><th>mean_vector</th><td>FLOAT8[]. Mean value of column +<tr><th>mean_vector</th><td>FLOAT8[]. Mean value of column for variables of interest.</td></tr> <tr> <th>total_rows_processed</th> @@ -142,12 +142,12 @@ If NULL or <tt>'*'</tt>, results are produced for all numeric columns.</dd> -# Create an input dataset. <pre class="example"> DROP TABLE IF EXISTS example_data CASCADE; -CREATE TABLE example_data( - id SERIAL, +CREATE TABLE example_data( + id SERIAL, outlook TEXT, - temperature FLOAT8, + temperature FLOAT8, humidity FLOAT8, - windy TEXT, + windy TEXT, class TEXT, day TEXT ); @@ -217,19 +217,19 @@ SELECT madlib.correlation( 'example_data', </pre> View the correlation matrix: <pre class="example"> -SELECT * FROM example_data_output ORDER BY column_position; +SELECT * FROM example_data_output ORDER BY column_position; </pre> <pre class="result"> - column_position | variable | temperature | humidity + column_position | variable | temperature | humidity -----------------+-------------+---------------------+---------- - 1 | temperature | 1 | + 1 | temperature | 1 | 2 | humidity | 0.00607993890408995 | 1 (2 rows) </pre> View the summary table: <pre class="example"> \\x on -SELECT * FROM example_data_output_summary; +SELECT * FROM example_data_output_summary; </pre> <pre class="result"> -[ RECORD 1 ]--------+----------------------------------- @@ -257,20 +257,20 @@ View the correlation matrix by group: SELECT * FROM example_data_output ORDER BY day, column_position; </pre> <pre class="result"> - column_position | variable | day | temperature | humidity + column_position | variable | day | temperature | humidity -----------------+-------------+------+-------------------+---------- - 1 | temperature | Mon | 1 | + 1 | temperature | Mon | 1 | 2 | humidity | Mon | 0.616876934548786 | 1 - 1 | temperature | 
Tues | 1 | + 1 | temperature | Tues | 1 | 2 | humidity | Tues | 0.616876934548786 | 1 - 1 | temperature | Wed | 1 | + 1 | temperature | Wed | 1 | 2 | humidity | Wed | -0.28969669368457 | 1 (6 rows) </pre> View the summary table: <pre class="example"> \\x on -SELECT * FROM example_data_output_summary ORDER BY day; +SELECT * FROM example_data_output_summary ORDER BY day; </pre> <pre class="result"> -[ RECORD 1 ]--------+------------------------------------ @@ -310,19 +310,19 @@ SELECT madlib.covariance( 'example_data', </pre> View the covariance matrix: <pre class="example"> -SELECT * FROM example_data_output ORDER BY column_position; +SELECT * FROM example_data_output ORDER BY column_position; </pre> <pre class="result"> - column_position | variable | temperature | humidity + column_position | variable | temperature | humidity -----------------+-------------+------------------+------------------ - 1 | temperature | 507.926664293343 | + 1 | temperature | 507.926664293343 | 2 | humidity | 2.40227839088644 | 307.359914560342 (2 rows) </pre> View the summary table: <pre class="example"> \\x on -SELECT * FROM example_data_output_summary; +SELECT * FROM example_data_output_summary; </pre> <pre class="result"> -[ RECORD 1 ]--------+----------------------------------- http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/stats/hypothesis_tests.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/stats/hypothesis_tests.sql_in b/src/ports/postgres/modules/stats/hypothesis_tests.sql_in index 18f9462..61dbb89 100644 --- a/src/ports/postgres/modules/stats/hypothesis_tests.sql_in +++ b/src/ports/postgres/modules/stats/hypothesis_tests.sql_in @@ -61,7 +61,7 @@ singleton set), the \f$ p \f$-value is the supremum, over all possible distributions according to the null hypothesis, of these probabilities. 
@note Please refer to \ref hypothesis_tests.sql_in for additional technical -information on the MADlib implementation of hypothesis tests, and for +information on the MADlib implementation of hypothesis tests, and for detailed function signatures for all tests. @anchor input @@ -205,7 +205,7 @@ SELECT * FROM auto83b_one_sample; </pre> <pre class="result"> -mpg +mpg 18 15 18 @@ -250,7 +250,7 @@ SELECT TRUE AS is_us, mpg_us AS mpg SELECT * FROM auto83b_two_sample; </pre> <pre class="result"> - is_us | mpg + is_us | mpg -------+----- t | 18 t | 15 http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/stats/pred_metrics.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/stats/pred_metrics.sql_in b/src/ports/postgres/modules/stats/pred_metrics.sql_in index 5358263..3f62746 100644 --- a/src/ports/postgres/modules/stats/pred_metrics.sql_in +++ b/src/ports/postgres/modules/stats/pred_metrics.sql_in @@ -45,7 +45,7 @@ m4_include(`SQLCommon.m4') @brief Provides various prediction accuracy metrics. -This module provides a set of metrics to evaluate the quality of predictions of a model. +This module provides a set of metrics to evaluate the quality of predictions of a model. A typical function will take a set of "prediction" and "observation" values and use them to calculate the desired metric, unless noted otherwise. Grouping is supported for all functions (except confusion matrix). @@ -114,7 +114,7 @@ This function returns the adjusted R2 score in addition to the R-squared score described above. Adjusted R2 score is used to counter the problem of the R2 automatically increasing when extra explanatory variables are added to the model. 
It takes two additional parameters describing -the degrees of freedom of the model (num_predictors) and the size of the +the degrees of freedom of the model (num_predictors) and the size of the training set over which it was developed (training_size): - num_predictors: Indicates the number of parameters the model has other than @@ -124,7 +124,7 @@ following form as an example: 7 + 5x + 39y + 0.91z. any NULL rows). Neither of these arguments can be deduced from the predicted values and the test -data alone which is why they are explicit inputs. Please refer to +data alone which is why they are explicit inputs. Please refer to reference <a href="#r2">[1]</a> for more details. @anchor bc @@ -266,7 +266,7 @@ SELECT * FROM table_out; </pre> Result <pre class="result"> - r2_score | adjusted_r2_score + r2_score | adjusted_r2_score --------------------+------------------ 0.279929088443375 | 0.257426872457231 </pre> http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/stats/test/f_test.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/stats/test/f_test.sql_in b/src/ports/postgres/modules/stats/test/f_test.sql_in index d820a09..1fe1fdf 100644 --- a/src/ports/postgres/modules/stats/test/f_test.sql_in +++ b/src/ports/postgres/modules/stats/test/f_test.sql_in @@ -1,6 +1,6 @@ /* ----------------------------------------------------------------------------- * Test F-test. 
- * + * * Example ("JAHANMI2.DAT") taken from: * http://www.itl.nist.gov/div898/handbook/eda/section3/eda359.htm * -------------------------------------------------------------------------- */ http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/stats/test/ks_test.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/stats/test/ks_test.sql_in b/src/ports/postgres/modules/stats/test/ks_test.sql_in index ed56755..19ead2c 100644 --- a/src/ports/postgres/modules/stats/test/ks_test.sql_in +++ b/src/ports/postgres/modules/stats/test/ks_test.sql_in @@ -1,6 +1,6 @@ /* ----------------------------------------------------------------------------- * Test Kolmogorov-Smirnov test. - * + * * Example taken from: * http://www.physics.csbsju.edu/stats/KS-test.html * -------------------------------------------------------------------------- */ http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/stats/test/mw_test.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/stats/test/mw_test.sql_in b/src/ports/postgres/modules/stats/test/mw_test.sql_in index 0e29cbd..e05c3d2 100644 --- a/src/ports/postgres/modules/stats/test/mw_test.sql_in +++ b/src/ports/postgres/modules/stats/test/mw_test.sql_in @@ -1,6 +1,6 @@ /* ----------------------------------------------------------------------------- * Test Mann-Whitney-test. 
- * + * * Example taken from: * http://www.itl.nist.gov/div898/handbook/prc/section3/prc35.htm * -------------------------------------------------------------------------- */ http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/stats/test/t_test.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/stats/test/t_test.sql_in b/src/ports/postgres/modules/stats/test/t_test.sql_in index a1b328c..b937677 100644 --- a/src/ports/postgres/modules/stats/test/t_test.sql_in +++ b/src/ports/postgres/modules/stats/test/t_test.sql_in @@ -1,6 +1,6 @@ /* ----------------------------------------------------------------------------- * Test one-sample t-test. - * + * * Example ("ZARR13.DAT") taken from: * http://www.itl.nist.gov/div898/handbook/eda/section3/eda352.htm * -------------------------------------------------------------------------- */ @@ -220,7 +220,7 @@ SELECT assert( /* ----------------------------------------------------------------------------- * Test one-sample t-test. - * + * * Example ("AUTO83B.DAT") taken from: * http://www.itl.nist.gov/div898/handbook/eda/section3/eda353.htm * -------------------------------------------------------------------------- */ http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/stats/test/wsr_test.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/stats/test/wsr_test.sql_in b/src/ports/postgres/modules/stats/test/wsr_test.sql_in index fc7693a..896d601 100644 --- a/src/ports/postgres/modules/stats/test/wsr_test.sql_in +++ b/src/ports/postgres/modules/stats/test/wsr_test.sql_in @@ -1,6 +1,6 @@ /* ----------------------------------------------------------------------------- * Test Wilcoxon signed-rank test. 
- * + * * Example taken from: * http://www-stat.stanford.edu/~susan/courses/s141/hononpara.pdf * -------------------------------------------------------------------------- */ http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/summary/summary.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/summary/summary.sql_in b/src/ports/postgres/modules/summary/summary.sql_in index a220da8..73f5a0e 100644 --- a/src/ports/postgres/modules/summary/summary.sql_in +++ b/src/ports/postgres/modules/summary/summary.sql_in @@ -96,10 +96,10 @@ This table contains the following columns: </tr> <tr> <th>distinct_values</th> - <td>Number of distinct values in the target column. - If the summary() function is called with the <em>get_estimates</em> - argument set to TRUE (default), then this is an estimated statistic based on the - Flajolet-Martin distinct count estimator. If the <em>get_estimates</em> + <td>Number of distinct values in the target column. + If the summary() function is called with the <em>get_estimates</em> + argument set to TRUE (default), then this is an estimated statistic based on the + Flajolet-Martin distinct count estimator. If the <em>get_estimates</em> argument set to FALSE, will use PostgreSQL COUNT DISTINCT.</td> </tr> <tr> @@ -129,8 +129,8 @@ This table contains the following columns: <tr> <th>zero_values</th> <td>Number of zero values in the target column if target is numeric, otherwise NULL. 
- Note that we are reporting exact equality to 0.0 here, so even if you have a - float value that is extremely small (say due to rounding), it will not be + Note that we are reporting exact equality to 0.0 here, so even if you have a + float value that is extremely small (say due to rounding), it will not be reported as a zero value.</td> </tr> <tr> @@ -157,22 +157,22 @@ This table contains the following columns: </tr> <tr> <th>first_quartile</th> - <td>First quartile (25th percentile), only for numeric columns. + <td>First quartile (25th percentile), only for numeric columns. (Unavailable for PostgreSQL 9.3 or lower.)</td> </tr> <tr> <th>median</th> - <td>Median value of target column, if target is numeric, otherwise NULL. + <td>Median value of target column, if target is numeric, otherwise NULL. (Unavailable for PostgreSQL 9.3 or lower.)</td> </tr> <tr> <th>third_quartile</th> - <td>Third quartile (25th percentile), only for numeric columns. + <td>Third quartile (25th percentile), only for numeric columns. (Unavailable for PostgreSQL 9.3 or lower.)</td> </tr> <tr> <th>quantile_array</th> - <td>Percentile values corresponding to \e ntile_array. + <td>Percentile values corresponding to \e ntile_array. (Unavailable for PostgreSQL 9.3 or lower.)</td> </tr> <tr> @@ -192,7 +192,7 @@ This table contains the following columns: </table></dd> <dt>target_columns (optional)</dt> -<dd>TEXT, default NULL. A comma-separated list of columns to summarize. +<dd>TEXT, default NULL. A comma-separated list of columns to summarize. If NULL, summaries are produced for all columns.</dd> <dt>grouping_cols (optional)</dt> @@ -207,14 +207,14 @@ target_cols specified.) <dt>get_distinct (optional)</dt> <dd>BOOLEAN, default TRUE. If true, distinct values are counted. 
-The method for computing distinct values depends on the setting of +The method for computing distinct values depends on the setting of the 'get_estimates' parameter below.</dd> <dt>get_quartiles (optional)</dt> <dd>BOOLEAN, default TRUE. If TRUE, quartiles are computed.</dd> <dt>ntile_array (optional)</dt> -<dd>FLOAT8[], default NULL. An array of quantile values to compute. +<dd>FLOAT8[], default NULL. An array of quantile values to compute. If NULL, quantile values are not computed.</dd> @note Quartile and quantile functions are not available in PostgreSQL 9.3 or lower. If you are using PostgreSQL 9.3 or lower, the output table will not @@ -223,13 +223,13 @@ provide an array of quantile values for the parameter 'ntile_array'. <dt>how_many_mfv (optional)</dt> <dd>INTEGER, default: 10. The number of most-frequent-values to compute. -The method for computing MFV depends on the setting of +The method for computing MFV depends on the setting of the 'get_estimates' parameter below.</dd> <dt>get_estimates (optional)</dt> -<dd>BOOLEAN, default TRUE. If TRUE, estimated values are produced for -distinct values and most frequent values. If FALSE, exact values are -calculated which will take longer to run, with the impact depending on +<dd>BOOLEAN, default TRUE. If TRUE, estimated values are produced for +distinct values and most frequent values. If FALSE, exact values are +calculated which will take longer to run, with the impact depending on data size.</dd> <dt>n_cols_per_run (optional)</dt> @@ -261,9 +261,9 @@ iris data set. 
<pre class="example"> DROP TABLE IF EXISTS iris; CREATE TABLE iris (id INT, sepal_length FLOAT, sepal_width FLOAT, - petal_length FLOAT, petal_width FLOAT, - class_name text); -INSERT INTO iris VALUES + petal_length FLOAT, petal_width FLOAT, + class_name text); +INSERT INTO iris VALUES (1,5.1,3.5,1.4,0.2,'Iris-setosa'), (2,4.9,3.0,1.4,0.2,'Iris-setosa'), (3,4.7,3.2,1.3,0.2,'Iris-setosa'), @@ -372,7 +372,7 @@ most_frequent_values | {Iris-setosa,Iris-versicolor,Iris-virginica} mfv_frequencies | {10,10,10} </pre> Note that for the text column in record 6, some statistics are n/a, -and the min and max values represent the length of the shortest and +and the min and max values represent the length of the shortest and longest strings respectively. -# Now group by the class of iris: @@ -514,7 +514,7 @@ should follow case-sensitivity and quoting rules per the database. (For instance, 'mytable' and 'MyTable' both resolve to the same entity, i.e. 'mytable'. If mixed-case or multi-byte characters are desired for entity names then the string should be double-quoted; in this case the input would be '"MyTable"'). -- The <em>get_estimates</em> parameter controls computation for both distinct +- The <em>get_estimates</em> parameter controls computation for both distinct count and most frequent values: - If <em>get_estimates</em> is TRUE then the distinct value computation is estimated using Flajolet-Martin. MFV is computed using a http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/svm/svm.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/svm/svm.sql_in b/src/ports/postgres/modules/svm/svm.sql_in index 733dee4..ccf2c26 100644 --- a/src/ports/postgres/modules/svm/svm.sql_in +++ b/src/ports/postgres/modules/svm/svm.sql_in @@ -79,7 +79,7 @@ svm_classification( <DD>TEXT. Expression list to evaluate for the independent variables. 
An intercept variable should not be included as part of this expression. See 'fit_intercept' in the kernel params for info on - intercepts. Please note that expression should be able to be cast + intercepts. Please note that expression should be able to be cast to DOUBLE PRECISION[]. <DT>kernel_func (optional)</DT> @@ -144,8 +144,8 @@ svm_classification( </tr> <tr> <th>loss</th> - <td>FLOAT8. Value of the objective function of SVM, expressed as an - average loss per row over the \e source_table. See Technical Background + <td>FLOAT8. Value of the objective function of SVM, expressed as an + average loss per row over the \e source_table. See Technical Background section below for more details.</td> </tr> <tr> @@ -341,7 +341,7 @@ the parameter is ignored. Hyperparameter optimization can be carried out using the built-in cross validation mechanism, which is activated by assigning a value greater than 1 to -the parameter \e n_folds in \e params. +the parameter \e n_folds in \e params. Please note that cross validation is not supported if grouping is used. @@ -430,14 +430,14 @@ will use the epsilon value specified in parameter \e epsilon. <DT>validation_result</dt> <DD>Default: NULL. -Name of the table to store the cross validation scores. +Name of the table to store the cross validation scores. This table is only created if the name is not NULL. The cross validation scores are the mean and standard deviation -of the accuracy when predicted on the validation fold, +of the accuracy when predicted on the validation fold, averaged over all folds and all rows. For classification, the accuracy -metric used is the ratio of correct classifications. For regression, the -accuracy metric used is the negative of mean squared error (negative to -make it a concave problem, thus selecting \e max means the highest accuracy). +metric used is the ratio of correct classifications. 
For regression, the +accuracy metric used is the negative of mean squared error (negative to +make it a concave problem, thus selecting \e max means the highest accuracy). </DD> <DT>n_folds</dt> @@ -448,7 +448,7 @@ while the other k - 1 folds form the training set. </DD> <DT>class_weight</dt> -<DD>Default: 1 for classification, 'balanced' for one-class novelty detection, +<DD>Default: 1 for classification, 'balanced' for one-class novelty detection, n/a for regression. Set the weight for the positive and negative classes. If not given, all classes @@ -501,8 +501,8 @@ table name is already in use, then an error is returned. Table contains:</DD> </tr> <tr> <th>prediction</th> - <td>Provides the prediction for each row in new_data_table. - For regression this would be the same as decision_function. For classification, + <td>Provides the prediction for each row in new_data_table. + For regression this would be the same as decision_function. For classification, this will be one of the dependent variable values.</td> </tr> <tr> @@ -521,7 +521,7 @@ table name is already in use, then an error is returned. Table contains:</DD> DROP TABLE IF EXISTS houses; CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT, size INT, lot INT); -INSERT INTO houses VALUES +INSERT INTO houses VALUES (1 , 590 , 2 , 1 , 50000 , 770 , 22100), (2 , 1050 , 3 , 2 , 85000 , 1410 , 12000), (3 , 20 , 3 , 1 , 22500 , 1060 , 3500), @@ -538,7 +538,7 @@ INSERT INTO houses VALUES (14 , 2070 , 2 , 3 , 148000 , 1550 , 14000), (15 , 650 , 3 , 1.5 , 65000 , 1450 , 12000); </pre> --# Train linear classification model and view the model. +-# Train linear classification model and view the model. Categorical variable is price < $100,000. <pre class="example"> DROP TABLE IF EXISTS houses_svm, houses_svm_summary; @@ -561,22 +561,22 @@ num_rows_processed | 15 num_rows_skipped | 0 dep_var_mapping | {f,t} </pre> --# Predict using linear model. 
We want to predict if house price -is less than $100,000. We use the training data set for prediction -as well, which is not usual but serves to show the syntax. The -predicted results are in the \e prediction column and the actual +-# Predict using linear model. We want to predict if house price +is less than $100,000. We use the training data set for prediction +as well, which is not usual but serves to show the syntax. The +predicted results are in the \e prediction column and the actual data is in the \e actual column. <pre class="example"> DROP TABLE IF EXISTS houses_pred; -SELECT madlib.svm_predict('houses_svm', - 'houses', - 'id', +SELECT madlib.svm_predict('houses_svm', + 'houses', + 'id', 'houses_pred'); \\x off SELECT *, price < 100000 AS actual FROM houses JOIN houses_pred USING (id) ORDER BY id; </pre> <pre class="result"> - id | tax | bedroom | bath | price | size | lot | prediction | decision_function | actual + id | tax | bedroom | bath | price | size | lot | prediction | decision_function | actual ----+------+---------+------+--------+------+-------+------------+--------------------+-------- 1 | 590 | 2 | 1 | 50000 | 770 | 22100 | t | 0.211310440574799 | t 2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 | t | 0.37546191651855 | t @@ -597,11 +597,11 @@ SELECT *, price < 100000 AS actual FROM houses JOIN houses_pred USING (id) ORDER </pre> Count the misclassifications: <pre class="example"> -SELECT COUNT(*) FROM houses_pred JOIN houses USING (id) +SELECT COUNT(*) FROM houses_pred JOIN houses USING (id) WHERE houses_pred.prediction != (houses.price < 100000); </pre> <pre class="result"> - count + count -------+ 3 </pre> @@ -635,19 +635,19 @@ num_rows_processed | 15 num_rows_skipped | 0 dep_var_mapping | {f,t} </pre> --# Prediction using the Gaussian model. The predicted results are +-# Prediction using the Gaussian model. The predicted results are in the \e prediction column and the actual data is in the \e actual column. 
<pre class="example"> DROP TABLE IF EXISTS houses_pred_gaussian; -SELECT madlib.svm_predict('houses_svm_gaussian', - 'houses', - 'id', +SELECT madlib.svm_predict('houses_svm_gaussian', + 'houses', + 'id', 'houses_pred_gaussian'); \\x off SELECT *, price < 100000 AS actual FROM houses JOIN houses_pred_gaussian USING (id) ORDER BY id; </pre> <pre class="result"> - id | tax | bedroom | bath | price | size | lot | prediction | decision_function | actual + id | tax | bedroom | bath | price | size | lot | prediction | decision_function | actual ----+------+---------+------+--------+------+-------+------------+--------------------+-------- 1 | 590 | 2 | 1 | 50000 | 770 | 22100 | t | 1.89855833083557 | t 2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 | t | 1.47736856649617 | t @@ -666,19 +666,19 @@ SELECT *, price < 100000 AS actual FROM houses JOIN houses_pred_gaussian USING ( 15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000 | t | 1.00000008401961 | t (15 rows) </pre> -Count the miss-classifications. Note this produces a more accurate +Count the misclassifications. Note this produces a more accurate result than the linear case for this data set: <pre class="example"> -SELECT COUNT(*) FROM houses_pred_gaussian JOIN houses USING (id) +SELECT COUNT(*) FROM houses_pred_gaussian JOIN houses USING (id) WHERE houses_pred_gaussian.prediction != (houses.price < 100000); </pre> <pre class="result"> - count + count -------+ 0 (1 row) </pre> --# In the case of an unbalanced class-size dataset, use the 'balanced' +-# In the case of an unbalanced class-size dataset, use the 'balanced' parameter to classify when building the model: <pre class="example"> DROP TABLE IF EXISTS houses_svm_gaussian, houses_svm_gaussian_summary, houses_svm_gaussian_random; @@ -707,7 +707,7 @@ dep_var_mapping | {f,t} <h4>Regression</h4> --# Create input data set. For regression we use part of the well +-# Create input data set. 
For regression we use part of the well known abalone data set https://archive.ics.uci.edu/ml/datasets/abalone : <pre class="example"> DROP TABLE IF EXISTS abalone; @@ -759,14 +759,14 @@ dep_var_mapping | {NULL} <pre class="example"> DROP TABLE IF EXISTS abalone_regr; SELECT madlib.svm_predict('abalone_svm_regression', - 'abalone', - 'id', + 'abalone', + 'id', 'abalone_regr'); \\x off SELECT * FROM abalone JOIN abalone_regr USING (id) ORDER BY id; </pre> <pre class="result"> - id | sex | length | diameter | height | rings | prediction | decision_function + id | sex | length | diameter | height | rings | prediction | decision_function ----+-----+--------+----------+--------+-------+------------------+------------------- 1 | M | 0.455 | 0.365 | 0.095 | 15 | 2.69859240928376 | 2.69859240928376 2 | M | 0.35 | 0.265 | 0.09 | 7 | 2.52978857282818 | 2.52978857282818 @@ -792,11 +792,11 @@ SELECT * FROM abalone JOIN abalone_regr USING (id) ORDER BY id; </pre> RMS error: <pre class="example"> -SELECT SQRT(AVG((rings-prediction)*(rings-prediction))) as rms_error FROM abalone +SELECT SQRT(AVG((rings-prediction)*(rings-prediction))) as rms_error FROM abalone JOIN abalone_regr USING (id); </pre> <pre class="result"> - rms_error + rms_error -----------------+ 9.0884271818321 (1 row) @@ -828,15 +828,15 @@ dep_var_mapping | {NULL} -# Predict using Gaussian regression model: <pre class="example"> DROP TABLE IF EXISTS abalone_gaussian_regr; -SELECT madlib.svm_predict('abalone_svm_gaussian_regression', - 'abalone', - 'id', +SELECT madlib.svm_predict('abalone_svm_gaussian_regression', + 'abalone', + 'id', 'abalone_gaussian_regr'); \\x off SELECT * FROM abalone JOIN abalone_gaussian_regr USING (id) ORDER BY id; </pre> <pre class="result"> - id | sex | length | diameter | height | rings | prediction | decision_function + id | sex | length | diameter | height | rings | prediction | decision_function ----+-----+--------+----------+--------+-------+------------------+------------------- 1 | 
M | 0.455 | 0.365 | 0.095 | 15 | 9.92189555675422 | 9.92189555675422 2 | M | 0.35 | 0.265 | 0.09 | 7 | 9.81553107620013 | 9.81553107620013 @@ -860,21 +860,21 @@ SELECT * FROM abalone JOIN abalone_gaussian_regr USING (id) ORDER BY id; 20 | M | 0.45 | 0.32 | 0.1 | 9 | 9.98822477687532 | 9.98822477687532 (20 rows) </pre> -Compute the RMS error. Note this produces a more accurate result than +Compute the RMS error. Note this produces a more accurate result than the linear case for this data set: <pre class="example"> -SELECT SQRT(AVG((rings-prediction)*(rings-prediction))) as rms_error FROM abalone +SELECT SQRT(AVG((rings-prediction)*(rings-prediction))) as rms_error FROM abalone JOIN abalone_gaussian_regr USING (id); </pre> <pre class="result"> - rms_error + rms_error ------------------+ 3.83678516581768 (1 row) </pre> -# Cross validation. Let's run cross validation for different initial step sizes and lambda values: <pre class="example"> -DROP TABLE IF EXISTS abalone_svm_gaussian_regression, abalone_svm_gaussian_regression_summary, +DROP TABLE IF EXISTS abalone_svm_gaussian_regression, abalone_svm_gaussian_regression_summary, abalone_svm_gaussian_regression_random, abalone_svm_gaussian_regression_cv; SELECT madlib.svm_regression( 'abalone', 'abalone_svm_gaussian_regression', @@ -883,7 +883,7 @@ SELECT madlib.svm_regression( 'abalone', 'gaussian', 'n_components=10', '', - 'init_stepsize=[0.01,1], n_folds=3, max_iter=200, lambda=[0.01, 0.1, 0.5], + 'init_stepsize=[0.01,1], n_folds=3, max_iter=200, lambda=[0.01, 0.1, 0.5], validation_result=abalone_svm_gaussian_regression_cv' ); \\x on @@ -899,7 +899,7 @@ num_rows_processed | 20 num_rows_skipped | 0 dep_var_mapping | {NULL} </pre> -View the summary table showing the final model parameters are those that produced +View the summary table showing the final model parameters are those that produced the lowest error in the cross validation runs: <pre class="example"> SELECT * FROM abalone_svm_gaussian_regression_summary; @@ 
-921,7 +921,7 @@ optim_params | init_stepsize=1.0, | tolerance=1e-10, | epsilon=0.01, | eps_table=, - | class_weight= + | class_weight= reg_params | lambda=0.01, norm=l2, n_folds=3 num_all_groups | 1 num_failed_groups | 0 @@ -935,7 +935,7 @@ View the statistics for the various cross validation values: SELECT * FROM abalone_svm_gaussian_regression_cv; </pre> <pre class="result"> - init_stepsize | lambda | mean_score | std_dev_score + init_stepsize | lambda | mean_score | std_dev_score ---------------+--------+----------------+---------------- 1.0 | 0.01 | -4.06711568585 | 0.435966381366 1.0 | 0.1 | -4.08068428345 | 0.44660797513 @@ -948,19 +948,19 @@ SELECT * FROM abalone_svm_gaussian_regression_cv; -# Predict using the cross-validated Gaussian regression model: <pre class="example"> DROP TABLE IF EXISTS abalone_gaussian_regr; -SELECT madlib.svm_predict('abalone_svm_gaussian_regression', - 'abalone', - 'id', +SELECT madlib.svm_predict('abalone_svm_gaussian_regression', + 'abalone', + 'id', 'abalone_gaussian_regr'); </pre> -Compute the RMS error. Note this produces a more accurate result than +Compute the RMS error. 
Note this produces nearly the same RMS error as the previous run with the Gaussian kernel (in fact slightly higher for this data set): <pre class="example"> -SELECT SQRT(AVG((rings-prediction)*(rings-prediction))) as rms_error FROM abalone +SELECT SQRT(AVG((rings-prediction)*(rings-prediction))) as rms_error FROM abalone JOIN abalone_gaussian_regr USING (id); </pre> <pre class="result"> - rms_error + rms_error ------------------+ 3.84208909699442 (1 row) @@ -994,33 +994,33 @@ num_rows_processed | 16 num_rows_skipped | -1 dep_var_mapping | {-1,1} </pre> --# For the novelty detection using one-class, let's create a test data -set using the last 3 values from the training set plus an outlier at +-# For the novelty detection using one-class, let's create a test data +set using the last 3 values from the training set plus an outlier at the end (10x price): <pre class="example"> DROP TABLE IF EXISTS houses_one_class_test; CREATE TABLE houses_one_class_test (id INT, tax INT, bedroom INT, bath FLOAT, price INT, size INT, lot INT); -INSERT INTO houses_one_class_test VALUES +INSERT INTO houses_one_class_test VALUES (1 , 3100 , 3 , 2 , 140000 , 1760 , 38000), (2 , 2070 , 2 , 3 , 148000 , 1550 , 14000), (3 , 650 , 3 , 1.5 , 65000 , 1450 , 12000), - (4 , 650 , 3 , 1.5 , 650000 , 1450 , 12000); + (4 , 650 , 3 , 1.5 , 650000 , 1450 , 12000); </pre> Now run prediction on the Gaussian one-class novelty detection model. 
Result shows the last row predicted to be novel: <pre class="example"> DROP TABLE IF EXISTS houses_pred; -SELECT madlib.svm_predict('houses_one_class_gaussian', - 'houses_one_class_test', - 'id', +SELECT madlib.svm_predict('houses_one_class_gaussian', + 'houses_one_class_test', + 'id', 'houses_pred'); \\x off SELECT * FROM houses_one_class_test JOIN houses_pred USING (id) ORDER BY id; </pre> Result showing the last row predicted to be novel: <pre class="result"> - id | tax | bedroom | bath | price | size | lot | prediction | decision_function + id | tax | bedroom | bath | price | size | lot | prediction | decision_function ----+------+---------+------+--------+------+-------+------------+--------------------- 1 | 3100 | 3 | 2 | 140000 | 1760 | 38000 | 1 | 0.111497008121437 2 | 2070 | 2 | 3 | 148000 | 1550 | 14000 | 1 | 0.0996021345169148

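The regression examples in the diff above repeatedly compute the RMS error in SQL as SQRT(AVG((rings-prediction)*(rings-prediction))) over the joined actual/predicted tables. As a minimal standalone sketch of that same computation (illustrative Python, not part of MADlib; the sample values are made up, not the abalone data):

```python
import math

def rms_error(actual, predicted):
    # Equivalent of the SQL pattern: SQRT(AVG((a - p) * (a - p)))
    assert len(actual) == len(predicted) and actual
    return math.sqrt(
        sum((a - p) ** 2 for a, p in zip(actual, predicted)) / len(actual)
    )

# Illustrative values only: differences are 3, 0, 0, so the
# mean squared error is 9/3 = 3 and the RMS error is sqrt(3).
print(rms_error([15.0, 7.0, 9.0], [12.0, 7.0, 9.0]))  # ~1.7320508
```

Comparing this number across models (as done for the linear vs. Gaussian kernels above) is how the examples judge relative regression accuracy.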