Multiple: Remove trailing whitespace from all SQL

Markdown syntax specifies that two trailing whitespace characters should
be interpreted as a line break (<br>), a behavior implemented by
Doxygen 1.8+. This commit removes all such instances since the trailing
whitespace is inadvertent in most cases. If a line break is required,
it should be added explicitly (using the HTML tag <br>).

Closes #317

Co-authored-by: Domino Valdano <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/35818fa3
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/35818fa3
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/35818fa3

Branch: refs/heads/master
Commit: 35818fa395f965191b59ddbfcd1469470f44271b
Parents: 92bdf8c
Author: Rahul Iyer <[email protected]>
Authored: Fri Sep 7 15:12:49 2018 -0700
Committer: Rahul Iyer <[email protected]>
Committed: Fri Sep 7 15:12:49 2018 -0700

----------------------------------------------------------------------
 methods/array_ops/src/pg_gp/array_ops.sql_in    |   7 +-
 src/ports/postgres/modules/bayes/bayes.sql_in   |  46 ++---
 .../postgres/modules/bayes/test/bayes.sql_in    | 114 ++++++------
 .../conjugate_gradient/test/conj_grad.sql_in    |  24 +--
 src/ports/postgres/modules/convex/mlp.sql_in    |   4 +-
 .../modules/crf/test/crf_test_large.sql_in      |   6 +-
 .../modules/crf/test/crf_train_small.sql_in     |  10 +-
 .../modules/elastic_net/elastic_net.sql_in      |  12 +-
 src/ports/postgres/modules/glm/glm.sql_in       |   4 +-
 src/ports/postgres/modules/glm/ordinal.sql_in   |  38 ++--
 .../postgres/modules/glm/test/ordinal.sql_in    |  12 +-
 src/ports/postgres/modules/graph/bfs.sql_in     |  88 +++++-----
 src/ports/postgres/modules/graph/hits.sql_in    |  36 ++--
 .../postgres/modules/graph/pagerank.sql_in      |  12 +-
 src/ports/postgres/modules/graph/wcc.sql_in     |  10 +-
 src/ports/postgres/modules/knn/knn.sql_in       |  14 +-
 src/ports/postgres/modules/lda/lda.sql_in       | 172 +++++++++----------
 src/ports/postgres/modules/linalg/svd.sql_in    |  62 +++----
 src/ports/postgres/modules/pca/pca.sql_in       | 104 +++++------
 .../postgres/modules/pca/pca_project.sql_in     |  54 +++---
 .../recursive_partitioning/decision_tree.sql_in |   4 +-
 .../recursive_partitioning/random_forest.sql_in |   6 +-
 .../postgres/modules/regress/linear.sql_in      |  24 +--
 .../postgres/modules/regress/logistic.sql_in    |  26 +--
 .../modules/regress/test/clustered.sql_in       |   8 +-
 .../postgres/modules/stats/correlation.sql_in   |  54 +++---
 .../modules/stats/hypothesis_tests.sql_in       |   6 +-
 .../postgres/modules/stats/pred_metrics.sql_in  |   8 +-
 .../postgres/modules/stats/test/f_test.sql_in   |   2 +-
 .../postgres/modules/stats/test/ks_test.sql_in  |   2 +-
 .../postgres/modules/stats/test/mw_test.sql_in  |   2 +-
 .../postgres/modules/stats/test/t_test.sql_in   |   4 +-
 .../postgres/modules/stats/test/wsr_test.sql_in |   2 +-
 .../postgres/modules/summary/summary.sql_in     |  44 ++---
 src/ports/postgres/modules/svm/svm.sql_in       | 130 +++++++-------
 .../modules/tsa/test/arima_train.sql_in         |  54 +++---
 .../postgres/modules/utilities/cols2vec.sql_in  |  12 +-
 .../postgres/modules/utilities/path.sql_in      |  24 +--
 .../postgres/modules/utilities/pivot.sql_in     |   4 +-
 .../modules/utilities/sessionize.sql_in         |  34 ++--
 .../modules/utilities/text_utilities.sql_in     |  48 +++---
 .../postgres/modules/utilities/utilities.sql_in |   2 +-
 .../postgres/modules/utilities/vec2cols.sql_in  |  42 ++---
 43 files changed, 685 insertions(+), 686 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/methods/array_ops/src/pg_gp/array_ops.sql_in
----------------------------------------------------------------------
diff --git a/methods/array_ops/src/pg_gp/array_ops.sql_in 
b/methods/array_ops/src/pg_gp/array_ops.sql_in
index 3c905ce..e1aa368 100644
--- a/methods/array_ops/src/pg_gp/array_ops.sql_in
+++ b/methods/array_ops/src/pg_gp/array_ops.sql_in
@@ -275,12 +275,11 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
  * @brief Aggregate, element-wise sum of arrays. It requires that all the 
values are NON-NULL. Return type is the same as the input type.
  *
  * @param x Array x
- * @param y Array y
- * @returns Sum of x and y.
+ * @returns Sum of x
  *
  */
-DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.sum(anyarray) CASCADE;
-CREATE AGGREGATE MADLIB_SCHEMA.sum(anyarray) (
+DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.sum(/* x */ anyarray) CASCADE;
+CREATE AGGREGATE MADLIB_SCHEMA.sum(/* x */ anyarray) (
     SFUNC     = MADLIB_SCHEMA.array_add,
     STYPE     = anyarray
    m4_ifdef( `__POSTGRESQL__', `', `, PREFUNC   = MADLIB_SCHEMA.array_add')

http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/bayes/bayes.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/bayes/bayes.sql_in 
b/src/ports/postgres/modules/bayes/bayes.sql_in
index 769b35c..40b71d2 100644
--- a/src/ports/postgres/modules/bayes/bayes.sql_in
+++ b/src/ports/postgres/modules/bayes/bayes.sql_in
@@ -43,8 +43,8 @@ certain class \f$ c \f$.
 
 Naives Bayes classification estimates feature probabilities and class priors
 using maximum likelihood or Laplacian smoothing. For numeric attributes,
-Gaussian smoothing can be used to estimate the feature probabilities.These 
-parameters are then used to classify new data. 
+Gaussian smoothing can be used to estimate the feature probabilities.These
+parameters are then used to classify new data.
 
 
 @anchor train
@@ -64,7 +64,7 @@ create_nb_prepared_data_tables ( trainingSource,
                                )
 </pre>
 
-For data containing both categorical and numeric attributes, use the following 
form to 
+For data containing both categorical and numeric attributes, use the following 
form to
 precompute the Gaussian parameters (mean and variance) for numeric attributes 
alongside
 the feature probabilities for categorical attributes and class priors.
 
@@ -88,8 +88,8 @@ The \e trainingSource is expected to be of the following form:
     ...
 )</pre>
 
-\e numericAttrsColumnIndices should be of type TEXT, specified as an array of 
-indices (starting from 1) in the \e trainingAttrColumn attributes-array that 
+\e numericAttrsColumnIndices should be of type TEXT, specified as an array of
+indices (starting from 1) in the \e trainingAttrColumn attributes-array that
 correspond to numeric attributes.
 
 The two output tables are:
@@ -97,7 +97,7 @@ The two output tables are:
 - \e classPriorsName &ndash; stores the class priors
 
 In addition to the above, if the function specifying numeric attributes is 
used,
-an additional table \e numericAttrParamsName is created which stores the 
+an additional table \e numericAttrParamsName is created which stores the
 Gaussian parameters for the numeric attributes.
 
 @anchor classify
@@ -225,24 +225,24 @@ attributes.\n
 For continuous data, a typical assumption, usually used for small datasets,
 is that the continuous values associated with each class are distributed
 according to a Gaussian distribution,
-and the probabilities \f$ P(A_i = a \mid C=c) \f$ are estimated using the 
+and the probabilities \f$ P(A_i = a \mid C=c) \f$ are estimated using the
 Gaussian Distribution formula:
 \f[
 P(A_i=a \mid C=c) = 
\frac{1}{\sqrt{2\pi\sigma^{2}_c}}exp\left(-\frac{(a-\mu_c)^{2}}{2\sigma^{2}_c}\right)
 \f]
-where \f$\mu_c\f$ and \f$\sigma^{2}_c\f$ are the population mean and variance 
+where \f$\mu_c\f$ and \f$\sigma^{2}_c\f$ are the population mean and variance
 of the attribute for the class \f$c\f$.\n
-Another common technique for handling continuous values, which is better for 
-large data sets, is to use binning to discretize the values, and convert the 
+Another common technique for handling continuous values, which is better for
+large data sets, is to use binning to discretize the values, and convert the
 continuous data into categorical bins. This approach is currently not 
implemented.
 
 - One can provide floating point data to the Naive Bayes
-classification function. If the corresponding attribute index is not specified 
-in \e numericAttrsColumnIndices, floating point numbers will be used as 
symbolic 
-substitutions for categorial data. In this case, the classification would work 
-best if there are sufficient data points for each floating point attribute. 
However, 
-if floating point numbers are used as continuous data without the attribute 
being 
-marked as of type numeric in \e numericAttrsColumnIndices, no warning is 
raised 
+classification function. If the corresponding attribute index is not specified
+in \e numericAttrsColumnIndices, floating point numbers will be used as 
symbolic
+substitutions for categorial data. In this case, the classification would work
+best if there are sufficient data points for each floating point attribute. 
However,
+if floating point numbers are used as continuous data without the attribute 
being
+marked as of type numeric in \e numericAttrsColumnIndices, no warning is raised
 and the result may not be as expected.
 
 @anchor examples
@@ -362,7 +362,7 @@ Result:
 (4 rows)
 </pre>
 
-The following is an example of using a dataset with both numeric and 
+The following is an example of using a dataset with both numeric and
 categorical attributes
 
 -#  The training and the classification data. Attributes 
{height(numeric),weight(numeric),shoe size(categorical)},
@@ -372,7 +372,7 @@ SELECT * FROM gaussian_data;
 </pre>
 Result:
 <pre class="result">
- id | sex |  attributes   
+ id | sex |  attributes
 &nbsp;----+-----+---------------
   1 |   1 | {6,180,12}
   2 |   1 | {5.92,190,12}
@@ -389,7 +389,7 @@ SELECT * FROM gaussian_test;
 </pre>
 Result:
 <pre class="result">
- id | sex |  attributes  
+ id | sex |  attributes
 ----+-----+--------------
   9 |   1 | {5.8,180,11}
  10 |   2 | {5,160,6}
@@ -415,7 +415,7 @@ SELECT * FROM class_priors;
 </pre>
 Result:
 <pre class="result">
-class | class_cnt | all_cnt 
+class | class_cnt | all_cnt
 &nbsp;-------+-----------+---------
      1 |         4 |       8
      2 |         4 |       8
@@ -426,7 +426,7 @@ SELECT * FROM categ_feature_probs;
 </pre>
 Result:
 <pre class="result">
- class | attr | value | cnt | attr_cnt 
+ class | attr | value | cnt | attr_cnt
 -------+------+-------+-----+----------
      2 |    3 |     6 |   2 |        5
      1 |    3 |    12 |   2 |        5
@@ -445,7 +445,7 @@ SELECT * FROM numeric_attr_params;
 </pre>
 Result:
 <pre class="result">
-class | attr |      attr_mean       |        attr_var        
+class | attr |      attr_mean       |        attr_var
 -------+------+----------------------+------------------------
      1 |    1 |   5.8550000000000000 | 0.03503333333333333333
      1 |    2 | 176.2500000000000000 |   122.9166666666666667
@@ -493,7 +493,7 @@ SELECT * FROM probs_view;
 </pre>
 Result:
 <pre class="result">
- key | class |       nb_prob        
+ key | class |       nb_prob
 -----+-------+----------------------
    9 |     1 |    0.993556745948775
    9 |     2 |  0.00644325405122553

http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/bayes/test/bayes.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/bayes/test/bayes.sql_in 
b/src/ports/postgres/modules/bayes/test/bayes.sql_in
index ec49d4b..88604a6 100644
--- a/src/ports/postgres/modules/bayes/test/bayes.sql_in
+++ b/src/ports/postgres/modules/bayes/test/bayes.sql_in
@@ -1,5 +1,5 @@
 ---------------------------------------------------------------------------
--- Rules: 
+-- Rules:
 -- ------
 -- 1) Any DB objects should be created w/o schema prefix,
 --    since this file is executed in a separate schema context.
@@ -8,7 +8,7 @@
 ---------------------------------------------------------------------------
 
 ---------------------------------------------------------------------------
--- Setup: 
+-- Setup:
 ---------------------------------------------------------------------------
 CREATE OR REPLACE FUNCTION fill_feature(pre_class INT, p FLOAT, total INT)
 RETURNS FLOAT AS $$
@@ -51,18 +51,18 @@ $$ language plpgsql;
 -- ----------------
 -- install_test_1()
 -- ----------------
-CREATE FUNCTION install_test_1() RETURNS VOID AS $$ 
+CREATE FUNCTION install_test_1() RETURNS VOID AS $$
 declare
        num1 INT := 10;
-       num2 INT := 10; 
+       num2 INT := 10;
 
-       result1 INT;    
+       result1 INT;
        count1 INT;
        tempvar INT[];
-       
+
 begin
        -- prepare training data: equal priors
-       --DROP TABLE IF EXISTS data CASCADE;    
+       --DROP TABLE IF EXISTS data CASCADE;
        CREATE TABLE data_1( class INT, attrib FLOAT[] );
        INSERT INTO data_1 SELECT 1, 
ARRAY[fill_feature(id,0.3,num1),fill_feature(id,0.8,num1)] FROM 
generate_series(1,num1) as id;
        INSERT INTO data_1 SELECT 2, 
ARRAY[fill_feature(id,0.5,num2),fill_feature(id,0.5,num2)] FROM 
generate_series(1,num2) as id;
@@ -97,7 +97,7 @@ begin
 
        -- Check the results
        SELECT * INTO result1,count1 FROM
-               (SELECT sum( abs( (data_results.prob*100)::INT - 
(probs_view.nb_prob*100)::INT) ),count(*) 
+               (SELECT sum( abs( (data_results.prob*100)::INT - 
(probs_view.nb_prob*100)::INT) ),count(*)
                FROM data_results_1 data_results INNER JOIN probs_view_1 as 
probs_view
                ON (data_results.id = probs_view.key AND data_results.class = 
probs_view.class)) AS t;
 
@@ -114,7 +114,7 @@ begin
 
        IF (result1 != 2) THEN
                RAISE EXCEPTION 'Incorrect classification';
-       END IF;         
+       END IF;
 
        -- Repeat using function w/out preprocessing priors
        -- Classify
@@ -124,21 +124,21 @@ begin
        --DROP VIEW IF EXISTS probs_view;
        --PERFORM 
MADLIB_SCHEMA.create_nb_probs_view('data','class','attrib','data_test','id','attrib',2,'probs_view');
 
-end 
+end
 $$ language plpgsql;
 
 -- ----------------
 -- install_test_2()
 -- ----------------
-CREATE FUNCTION install_test_2() RETURNS VOID AS $$ 
+CREATE FUNCTION install_test_2() RETURNS VOID AS $$
 declare
        num1 INT := 10;
-       num2 INT := 10; 
+       num2 INT := 10;
 
-       result1 INT;    
+       result1 INT;
        count1 INT;
        tempvar INT[];
-       
+
 begin
        -- 3 class case
        CREATE TABLE data_2( class INT, attrib FLOAT[] );
@@ -166,16 +166,16 @@ begin
        --DROP TABLE IF EXISTS probs CASCADE;
        --DROP TABLE IF EXISTS priors CASCADE;
        PERFORM 
create_nb_prepared_data_tables('data_2','class','attrib',2,'probs_2','priors_2');
-       
+
        --DROP VIEW IF EXISTS results;
        PERFORM 
create_nb_classify_view('probs_2','priors_2','data_test_2','id','attrib',2,'results_2');
-       
+
        --DROP VIEW IF EXISTS probs_view;
        PERFORM 
create_nb_probs_view('probs_2','priors_2','data_test_2','id','attrib',2,'probs_view_2');
-       
+
        SELECT * INTO result1,count1 FROM
-               (SELECT sum( abs( (data_results.prob*100)::INT - 
(probs_view.nb_prob*100)::INT) ),count(*) 
-               FROM data_results_2 as data_results INNER JOIN probs_view_2 as 
probs_view 
+               (SELECT sum( abs( (data_results.prob*100)::INT - 
(probs_view.nb_prob*100)::INT) ),count(*)
+               FROM data_results_2 as data_results INNER JOIN probs_view_2 as 
probs_view
                ON (data_results.id = probs_view.key AND data_results.class = 
probs_view.class)) AS t;
 
        IF (result1 != 0) OR (count1 != 18) THEN
@@ -183,25 +183,25 @@ begin
        END IF;
 
        RAISE INFO 'Naive Bayes install checks passed';
-       RETURN; 
-end 
+       RETURN;
+end
 $$ language plpgsql;
 
 -- ----------------
 -- install_test_3()
 -- ----------------
-CREATE FUNCTION install_test_3() RETURNS VOID AS $$ 
+CREATE FUNCTION install_test_3() RETURNS VOID AS $$
 declare
        num1 INT := 10;
-       num2 INT := 10; 
+       num2 INT := 10;
 
-       result1 INT;    
+       result1 INT;
        count1 INT;
        tempvar INT[];
-       
+
 begin
        -- Unequal prior probabilities
-       --DROP TABLE IF EXISTS data CASCADE;    
+       --DROP TABLE IF EXISTS data CASCADE;
        CREATE TABLE data_3( class INT, attrib FLOAT[] );
        INSERT INTO data_3 SELECT 1, 
ARRAY[fill_feature(id,0.3,num1*2),fill_feature(id,0.8,num1*2)] FROM 
generate_series(1,num1*2) as id;
        INSERT INTO data_3 SELECT 2, 
ARRAY[fill_feature(id,0.5,num2),fill_feature(id,0.5,num2)] FROM 
generate_series(1,num2) as id;
@@ -236,41 +236,41 @@ begin
 
        -- Check the results
        SELECT * INTO result1,count1 FROM
-               (SELECT sum( abs( (data_results.prob*100)::INT - 
(probs_view.nb_prob*100)::INT) ),count(*) 
-               FROM data_results_3 as data_results INNER JOIN probs_view_3 as 
probs_view 
+               (SELECT sum( abs( (data_results.prob*100)::INT - 
(probs_view.nb_prob*100)::INT) ),count(*)
+               FROM data_results_3 as data_results INNER JOIN probs_view_3 as 
probs_view
                ON (data_results.id = probs_view.key AND data_results.class = 
probs_view.class)) AS t;
 
        IF (result1 != 0) OR (count1 != 8) THEN
                RAISE EXCEPTION 'Incorrect probabilities';
        END IF;
-       
+
        RAISE INFO 'Naive Bayes install checks passed';
        RETURN;
-end 
+end
 $$ language plpgsql;
 
 -----------------------------------------------------------
---Test where numeric probabilities cancel out, leaving 
+--Test where numeric probabilities cancel out, leaving
 --only nominal probabilities to decide the class
 -----------------------------------------------------------
-CREATE FUNCTION install_test_4() RETURNS VOID AS $$ 
+CREATE FUNCTION install_test_4() RETURNS VOID AS $$
 declare
        num1 INT := 10;
-       num2 INT := 10; 
+       num2 INT := 10;
 
-       result1 INT;    
+       result1 INT;
        count1 INT;
        tempvar INT[];
-       
+
 begin
-        
+
         --equal class priors and numerical attr probabilities
-       --DROP TABLE IF EXISTS data CASCADE;    
+       --DROP TABLE IF EXISTS data CASCADE;
        CREATE TABLE data_4( class INT, attrib FLOAT[] );
        INSERT INTO data_4 SELECT 1, 
ARRAY[fill_feature(id,0.3,num1),fill_feature(id,0.8,num1),id] FROM 
generate_series(1,num1) as id;
        INSERT INTO data_4 SELECT 2, 
ARRAY[fill_feature(id,0.5,num2),fill_feature(id,0.5,num2),id] FROM 
generate_series(1,num2) as id;
 
-        
+
        -- prepare testing data
        --DROP TABLE IF EXISTS data_test CASCADE;
        CREATE TABLE data_test_4( id INT, attrib NUMERIC[], prob FLOAT[] );
@@ -302,45 +302,45 @@ begin
 
        -- Check the results
        SELECT * INTO result1,count1 FROM
-               (SELECT sum( abs( (data_results.prob*100)::INT - 
(probs_view.nb_prob*100)::INT) ),count(*) 
-               FROM data_results_4 as data_results INNER JOIN probs_view_4 as 
probs_view 
+               (SELECT sum( abs( (data_results.prob*100)::INT - 
(probs_view.nb_prob*100)::INT) ),count(*)
+               FROM data_results_4 as data_results INNER JOIN probs_view_4 as 
probs_view
                ON (data_results.id = probs_view.key AND data_results.class = 
probs_view.class)) AS t;
 
        IF (result1 != 0) OR (count1 != 8) THEN
                RAISE EXCEPTION 'Incorrect probabilities';
        END IF;
-       
+
        RAISE INFO 'Naive Bayes install checks passed';
        RETURN;
 
 
-end 
+end
 $$ language plpgsql;
 ---------------------------------------------------------------------------
--- Test: 
+-- Test:
 ---------------------------------------------------------------------------
 ---------------------------------------------------------------------------
--- Test where nominal probabilities cancel out,leaving only numeric 
--- probabilities to decide the classes. 
+-- Test where nominal probabilities cancel out,leaving only numeric
+-- probabilities to decide the classes.
 ---------------------------------------------------------------------------
-CREATE FUNCTION install_test_5() RETURNS VOID AS $$ 
+CREATE FUNCTION install_test_5() RETURNS VOID AS $$
 declare
        num1 INT := 10;
-       num2 INT := 10; 
+       num2 INT := 10;
 
-       result1 INT;    
+       result1 INT;
        count1 INT;
        tempvar INT[];
-       
+
 begin
-        
+
         --equal class priors and numerical attr probabilities
-       --DROP TABLE IF EXISTS data CASCADE;    
+       --DROP TABLE IF EXISTS data CASCADE;
        CREATE TABLE data_5( class INT, attrib FLOAT[] );
        INSERT INTO data_5 SELECT 1, 
ARRAY[fill_feature(id,0.5,num1),fill_feature(id,0.8,num1),id] FROM 
generate_series(1,num1) as id;
        INSERT INTO data_5 SELECT 2, 
ARRAY[fill_feature(id,0.5,num2),fill_feature(id,0.8,num2),id%5] FROM 
generate_series(1,num2) as id;
 
-        
+
        -- prepare testing data
        --DROP TABLE IF EXISTS data_test CASCADE;
        CREATE TABLE data_test_5( id INT, attrib float8[], prob FLOAT8[] );
@@ -372,22 +372,22 @@ begin
 
        -- Check the results
        SELECT * INTO result1,count1 FROM
-               (SELECT sum( abs( (data_results.prob*100)::INT - 
(probs_view.nb_prob*100)::INT) ),count(*) 
-               FROM data_results_5 as data_results INNER JOIN probs_view_5 as 
probs_view 
+               (SELECT sum( abs( (data_results.prob*100)::INT - 
(probs_view.nb_prob*100)::INT) ),count(*)
+               FROM data_results_5 as data_results INNER JOIN probs_view_5 as 
probs_view
                ON (data_results.id = probs_view.key AND data_results.class = 
probs_view.class)) AS t;
 
        IF (result1 != 0) OR (count1 != 8) THEN
                RAISE EXCEPTION 'Incorrect probabilities';
        END IF;
-       
+
        RAISE INFO 'Naive Bayes install checks passed';
        RETURN;
 
 
-end 
+end
 $$ language plpgsql;
 ---------------------------------------------------------------------------
--- Test: 
+-- Test:
 ---------------------------------------------------------------------------
 SELECT install_test_1();
 SELECT install_test_2();

http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/conjugate_gradient/test/conj_grad.sql_in
----------------------------------------------------------------------
diff --git 
a/src/ports/postgres/modules/conjugate_gradient/test/conj_grad.sql_in 
b/src/ports/postgres/modules/conjugate_gradient/test/conj_grad.sql_in
index a3f02f4..52e56ef 100644
--- a/src/ports/postgres/modules/conjugate_gradient/test/conj_grad.sql_in
+++ b/src/ports/postgres/modules/conjugate_gradient/test/conj_grad.sql_in
@@ -1,5 +1,5 @@
 ---------------------------------------------------------------------------
--- Rules: 
+-- Rules:
 -- ------
 -- 1) Any DB objects should be created w/o schema prefix,
 --    since this file is executed in a separate schema context.
@@ -8,14 +8,14 @@
 ---------------------------------------------------------------------------
 
 ---------------------------------------------------------------------------
--- Setup: 
+-- Setup:
 ---------------------------------------------------------------------------
-CREATE FUNCTION install_test() RETURNS VOID AS $$ 
+CREATE FUNCTION install_test() RETURNS VOID AS $$
 declare
-       
+
        result float;
        x FLOAT[];
-       
+
 begin
        CREATE TABLE A(
                row INT,
@@ -34,28 +34,28 @@ begin
        INSERT INTO A VALUES(10,ARRAY[0.1108536, 0.08383503, 0.06882649, 
0.05882206, 0.05154975, 0.04597544, 0.04154376, 0.03792426, 0.03490583, 
0.23234632]);
 
        SELECT INTO x MADLIB_SCHEMA.conjugate_gradient('A', 'val', 'row', 
ARRAY(SELECT random() FROM generate_series(1,10)), .000001,2);
-       result = x[1]*x[1];     
-               
+       result = x[1]*x[1];
+
 
        IF (result > .00000001) THEN
                RAISE EXCEPTION 'Incorrect results, got %',result;
-       END IF; 
+       END IF;
 
        -- simple symmetric +ve def case
        CREATE TABLE data(row_num INT, row_val FLOAT[]);
        INSERT INTO data VALUES (1,'{2,1}');
        INSERT INTO data VALUES (2,'{1,4}');
 
-       SELECT INTO x 
MADLIB_SCHEMA.conjugate_gradient('data','row_val','row_num','{2,1}',1E-6,1);    
  
+       SELECT INTO x 
MADLIB_SCHEMA.conjugate_gradient('data','row_val','row_num','{2,1}',1E-6,1);
 
        IF (round(x[1]) != 1) OR (round(x[2]) != 0) THEN
                RAISE EXCEPTION 'Incorrect multivariate results, got %',x;
        END IF;
-       
+
        RAISE INFO 'Conjugate gradient install checks passed';
        RETURN;
-       
-end 
+
+end
 $$ language plpgsql;
 
 ---------------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/convex/mlp.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/convex/mlp.sql_in 
b/src/ports/postgres/modules/convex/mlp.sql_in
index 3e2468f..07d7dbc 100644
--- a/src/ports/postgres/modules/convex/mlp.sql_in
+++ b/src/ports/postgres/modules/convex/mlp.sql_in
@@ -1165,7 +1165,7 @@ SELECT * FROM lin_housing JOIN mlp_regress_prediction 
USING (id) ORDER BY id;
 </pre>
 RMS error:
 <pre class="example">
-SELECT SQRT(AVG((y-estimated_y)*(y-estimated_y))) as rms_error FROM 
lin_housing 
+SELECT SQRT(AVG((y-estimated_y)*(y-estimated_y))) as rms_error FROM lin_housing
 JOIN mlp_regress_prediction USING (id);
 </pre>
 <pre class="result">
@@ -1257,7 +1257,7 @@ SELECT *, ABS(y-estimated_y) as abs_diff FROM lin_housing 
JOIN mlp_regress_predi
 </pre>
 RMS error:
 <pre class="example">
-SELECT SQRT(AVG((y-estimated_y)*(y-estimated_y))) as rms_error FROM 
lin_housing 
+SELECT SQRT(AVG((y-estimated_y)*(y-estimated_y))) as rms_error FROM lin_housing
 JOIN mlp_regress_prediction USING (id);
 </pre>
 <pre class="result">

http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/crf/test/crf_test_large.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/crf/test/crf_test_large.sql_in 
b/src/ports/postgres/modules/crf/test/crf_test_large.sql_in
index 9ededaa..6658545 100644
--- a/src/ports/postgres/modules/crf/test/crf_test_large.sql_in
+++ b/src/ports/postgres/modules/crf/test/crf_test_large.sql_in
@@ -6,7 +6,7 @@
 -- 2) There should be no DROP statements in this script, since
 --    all objects created in the default schema will be cleaned-up outside.
 ---------------------------------------------------------------------------
-               
+
        -- Features table produced by  Dr. Sunita's CRF java package (as 
provided by Kun, "CRF2" package on github) using the training data in the file 
us50.train.tagged
        CREATE TABLE crf_feature_test_new (id integer,name text,prev_label_id 
integer,label_id integer,weight float);
         INSERT INTO crf_feature_test_new VALUES
@@ -585,7 +585,7 @@ INSERT INTO test_segmenttbl_new VALUES
 (17, 4, '.',  17);
        analyze test_segmenttbl_new;
 
-       -- extract features for tokens stored in segmenttbl 
+       -- extract features for tokens stored in segmenttbl
        SELECT 
crf_test_fgen('test_segmenttbl_new','crf_dictionary_new','crf_label_new','crf_regex_new','crf_feature_test_new','viterbi_mtbl_new','viterbi_rtbl_new');
 
 
@@ -597,7 +597,7 @@ INSERT INTO test_segmenttbl_new VALUES
                'extraction_new');
 
         -- Expected viterbi labeling result
-        -- The result is produced from Dr. Sunita's CRF java package with the 
same input 
+        -- The result is produced from Dr. Sunita's CRF java package with the 
same input
         CREATE TABLE expected_extraction_new(doc_id integer, start_pos 
integer, seg_text text, label character varying);
         INSERT INTO expected_extraction_new VALUES
 (1, 0, 'that', 'DT'),

http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/crf/test/crf_train_small.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/crf/test/crf_train_small.sql_in 
b/src/ports/postgres/modules/crf/test/crf_train_small.sql_in
index 9dca05e..1dbd6fa 100644
--- a/src/ports/postgres/modules/crf/test/crf_train_small.sql_in
+++ b/src/ports/postgres/modules/crf/test/crf_train_small.sql_in
@@ -16,7 +16,7 @@ m4_changequote(<!,!>)
 m4_ifdef(<!__HAS_ORDERED_AGGREGATES__!>,<!
 
        -- Regex table
-        CREATE TABLE train_regex(pattern text,name text); 
+        CREATE TABLE train_regex(pattern text,name text);
         INSERT INTO train_regex VALUES
         ('^[A-Z][a-z]+$','InitCapital'), ('^[A-Z]+$','isAllCapital'),
         ('^.*[0-9]+.*$','containsDigit'),('^.+[.]$','endsWithDot'),
@@ -125,8 +125,8 @@ m4_ifdef(<!__HAS_ORDERED_AGGREGATES__!>,<!
        (114,'E.',42,6,2.79544657270948);
 
        SELECT assert(
-               SUM(abs(c1.weight-c2.weight)) < 0.1,  
-               'Total difference between extracted feature weights and 
expected feature weights is > 0.1.') 
+               SUM(abs(c1.weight-c2.weight)) < 0.1,
+               'Total difference between extracted feature weights and 
expected feature weights is > 0.1.')
        FROM  expected_crf_feature c1, train_crf_feature c2
         WHERE c1.name = c2.name AND c1.prev_label = c2.prev_label_id AND 
c1.label = c2.label_id;;
 
@@ -136,7 +136,7 @@ m4_ifdef(<!__HAS_ORDERED_AGGREGATES__!>,<!
        FROM (
                SELECT count(*) FROM(
                        SELECT name, prev_label, label
-                       FROM expected_crf_feature 
+                       FROM expected_crf_feature
                        EXCEPT ALL
                        SELECT name, prev_label_id, label_id
                        FROM  train_crf_feature
@@ -148,7 +148,7 @@ m4_ifdef(<!__HAS_ORDERED_AGGREGATES__!>,<!
                        FROM  train_crf_feature
                        EXCEPT ALL
                        SELECT name, prev_label, label
-                       FROM expected_crf_feature 
+                       FROM expected_crf_feature
                ) AS U
        )s2;
 

http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net.sql_in 
b/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
index 838a6bd..55f5b25 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
@@ -231,14 +231,14 @@ cross validation is used.  Also, cross validation is not 
supported if grouping i
 
 Hyperparameter optimization can be carried out using the built-in cross
 validation mechanism, which is activated by assigning a value greater than 1 to
-the parameter \e n_folds. 
+the parameter \e n_folds.
 
 The cross validation scores are the mean and standard deviation
-of the accuracy when predicted on the validation fold, 
+of the accuracy when predicted on the validation fold,
 averaged over all folds and all rows.  For classification, the accuracy
-metric used is the ratio of correct classifications.  For regression, the 
-accuracy metric used is the negative of mean squared error (negative to 
-make it a concave problem, thus selecting \e max means the highest accuracy). 
+metric used is the ratio of correct classifications.  For regression, the
+accuracy metric used is the negative of mean squared error (negative to
+make it a concave problem, thus selecting \e max means the highest accuracy).
 
 The values of a parameter to cross validate should be provided in a list. For
 example, to regularize with the L1 norm and use a lambda value
@@ -784,7 +784,7 @@ iteration_run     | 10000
 SELECT * FROM houses_en3_cv ORDER BY mean_neg_loss DESC;
 </pre>
 <pre class="result">
- alpha | lambda_value |  mean_neg_loss     |   std_neg_loss                    
   
+ alpha | lambda_value |  mean_neg_loss     |   std_neg_loss
 -------+--------------+------------------------------------------+
    0.0 |          0.1 |     -36094.4685768 |      10524.4473253
    0.1 |          0.1 |     -36136.2448004 |      10682.4136993

http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/glm/glm.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/glm/glm.sql_in 
b/src/ports/postgres/modules/glm/glm.sql_in
index b0a8da1..377dfea 100644
--- a/src/ports/postgres/modules/glm/glm.sql_in
+++ b/src/ports/postgres/modules/glm/glm.sql_in
@@ -1247,7 +1247,7 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
  * @param coef Coefficients obtained by running generalized linear model.
  * @param col_ind Predictor variable array
  * @param link    Link function used in training
- * @returns Numeric value of the predicted count, obtained by rounding the 
predicted mean 
+ * @returns Numeric value of the predicted count, obtained by rounding the 
predicted mean
  *          to the nearest integral value.
  *
  * This function computes the dot product of the independent variables and the
@@ -1263,7 +1263,7 @@ AS 'MODULE_PATHNAME', 'glm_predict_poisson'
 LANGUAGE C STRICT IMMUTABLE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
 
- 
+
 -- Help messages -------------------------------------------------------
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm_predict(
     message    TEXT

http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/glm/ordinal.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/glm/ordinal.sql_in 
b/src/ports/postgres/modules/glm/ordinal.sql_in
index aba9e93..7cda294 100644
--- a/src/ports/postgres/modules/glm/ordinal.sql_in
+++ b/src/ports/postgres/modules/glm/ordinal.sql_in
@@ -26,11 +26,11 @@ m4_include(`SQLCommon.m4')
 
 @brief Regression to model data with ordinal response variable.
 
-In statistics, ordinal regression is a type of regression analysis 
-used for predicting an ordinal variable, i.e. a variable whose 
-value exists on an arbitrary scale where only the relative ordering 
-between different values is significant. The two most common types 
-of ordinal regression models are ordered logit, which applies to 
+In statistics, ordinal regression is a type of regression analysis
+used for predicting an ordinal variable, i.e. a variable whose
+value exists on an arbitrary scale where only the relative ordering
+between different values is significant. The two most common types
+of ordinal regression models are ordered logit, which applies to
 data that meet the proportional odds assumption, and ordered probit.
 
 @anchor train
@@ -62,25 +62,25 @@ ordinal(source_table,
     <table class="output">
       <tr>
         <th>&lt;...&gt;</th>
-        <td>Grouping columns, if provided in input. This could be 
+        <td>Grouping columns, if provided in input. This could be
             multiple columns depending on the \c grouping_col input.</td>
       </tr>
 
       <tr>
         <th>coef_threshold</th>
-        <td>FLOAT8[]. Vector of the threshold coefficients in linear 
predictor. 
+        <td>FLOAT8[]. Vector of the threshold coefficients in linear predictor.
             The threshold coefficients are the intercepts specific to each 
categorical level</td>
       </tr>
 
       <tr>
         <th>std_err_threshold</th>
-        <td>FLOAT8[]. Vector of the threshold standard errors 
+        <td>FLOAT8[]. Vector of the threshold standard errors
             of the threshold coefficients.</td>
       </tr>
 
       <tr>
         <th>z_stats_threshold</th>
-        <td>FLOAT8[]. Vector of the threshold z-statistics of 
+        <td>FLOAT8[]. Vector of the threshold z-statistics of
            the threshold coefficients.</td>
       </tr>
 
@@ -91,14 +91,14 @@ ordinal(source_table,
 
       <tr>
         <th>log_likelihood</th>
-        <td>FLOAT8. The log-likelihood \f$ l(\boldsymbol \beta) \f$. The value 
will 
+        <td>FLOAT8. The log-likelihood \f$ l(\boldsymbol \beta) \f$. The value 
will
             be the same across categories within the same group.</td>
       </tr>
 
       <tr>
         <th>coef_feature</th>
-        <td>FLOAT8[]. Vector of the feature coefficients in linear predictor. 
The 
-            feature coefficients are the coefficients for the independent 
variables. 
+        <td>FLOAT8[]. Vector of the feature coefficients in linear predictor. 
The
+            feature coefficients are the coefficients for the independent 
variables.
             They are the same across categories.</td>
       </tr>
 
@@ -160,9 +160,9 @@ ordinal(source_table,
 
     <tr>
       <th>independent_varname</th>
-      <td>VARCHAR. Expression for independent variables. The independent 
variables 
-          should not include intercept term. Otherwise there will be an error 
message 
-          indicating Hessian matrix is not finite. In that case, the user 
should 
+      <td>VARCHAR. Expression for independent variables. The independent 
variables
+          should not include intercept term. Otherwise there will be an error 
message
+          indicating Hessian matrix is not finite. In that case, the user 
should
           drop the intercept and rerun the function again.</td>
     </tr>
 
@@ -183,7 +183,7 @@ ordinal(source_table,
 
     <tr>
       <th>optimizer_params</th>
-      <td>VARCHAR. String that contains optimizer parameters, and has the form 
+      <td>VARCHAR. String that contains optimizer parameters, and has the form
           of 'optimizer=..., max_iter=..., tolerance=...'.</td>
     </tr>
 
@@ -299,7 +299,7 @@ ordinal_predict(
   <DD>TEXT. Either 'response' or 'probability'. Using 'response' will give the 
predicted category with
             the largest probability. Using probability will give the predicted 
probabilities for all
             categories</DD>
-  
+
   <DT>verbose</DT>
   <DD>BOOLEAN. Whether verbose is displayed</DD>
 </DL>
@@ -421,7 +421,7 @@ would be used for prediction.)
 \\x off
 -- Add the id column for prediction function
 ALTER TABLE test3 ADD COLUMN id SERIAL;
--- Predict probabilities for all categories using the original data 
+-- Predict probabilities for all categories using the original data
 SELECT ordinal_predict('test3_output','test3', 'test3_prd_prob', 
'probability');
 -- Display the predicted value
 SELECT * FROM test3_prd_prob;
@@ -432,7 +432,7 @@ SELECT * FROM test3_prd_prob;
 
 The function ordinal() fits the ordinal response model using a cumulative link 
model. The ordinal response variable, denoted by \f$ Y_i \f$, can fall in \f$ j 
= 1, \ldots, J\f$ categories. Then \f$ Y_i \f$ follows a multinomial distribution 
with parameter \f$\pi\f$ where \f$\pi_{ij}\f$ denotes the probability that the 
\f$i\f$th observation falls in response category \f$j\f$. We define the 
cumulative probabilities as
 \f[
-\gamma_{ij} = \Pr(Y_i \le j)= \pi_{i1} +...+ \pi_{ij} . 
+\gamma_{ij} = \Pr(Y_i \le j)= \pi_{i1} +...+ \pi_{ij} .
 \f]
 Next we will consider the logit link for illustration purpose. The logit 
function is defined as \f$ \mbox{logit}(\pi) = \log[\pi/(1-\pi)] \f$ and 
cumulative logits are defined as:
 \f[

http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/glm/test/ordinal.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/glm/test/ordinal.sql_in 
b/src/ports/postgres/modules/glm/test/ordinal.sql_in
index ffcb131..2998f22 100644
--- a/src/ports/postgres/modules/glm/test/ordinal.sql_in
+++ b/src/ports/postgres/modules/glm/test/ordinal.sql_in
@@ -239,7 +239,7 @@ SELECT * FROM ordinal_out_summary;
 SELECT assert(
        relative_error(coef_threshold, ARRAY[2.6350, 3.8995]) < 1e-4 AND
        relative_error(coef_feature, ARRAY[.53958, .05595]) < 1e-4 AND
-       relative_error(log_likelihood, -190.92) < 1e-4, 
+       relative_error(log_likelihood, -190.92) < 1e-4,
        'Ordinal regression with IRLS optimizer (test): Wrong results'
 )
 FROM ordinal_out;
@@ -268,7 +268,7 @@ SELECT ordinal_predict(
        );
 
 SELECT assert(
-    relative_error(prob, ARRAY[0.5342011, 0.4278446, 0.2375883, 0.5062776])< 
1e-4, 
+    relative_error(prob, ARRAY[0.5342011, 0.4278446, 0.2375883, 0.5062776])< 
1e-4,
        'Ordinal regression prediction: Wrong results'
 )
 FROM (SELECT array_agg("0" ORDER BY id) as prob FROM ordinal_prd)subq;
@@ -293,7 +293,7 @@ SELECT * FROM ordinal_out_summary;
 SELECT assert(
        relative_error(coef_threshold, ARRAY[2.3497, 3.7680]) < 1e-4 AND
        relative_error(coef_feature, ARRAY[0.59758, 0.04845]) < 1e-4 AND
-       relative_error(log_likelihood, -154.6991) < 1e-4, 
+       relative_error(log_likelihood, -154.6991) < 1e-4,
        'Ordinal regression with IRLS optimizer (test grouping case): Wrong 
results'
 )
 FROM ordinal_out
@@ -308,7 +308,7 @@ SELECT ordinal_predict(
        );
 
 SELECT assert(
-    relative_error(prob, ARRAY[0.5141240, 0.3907468])< 1e-4, 
+    relative_error(prob, ARRAY[0.5141240, 0.3907468])< 1e-4,
        'Ordinal regression prediction: Wrong results'
 )
 FROM (SELECT array_agg("0" ORDER BY id) as prob FROM ordinal_prd where id=1 OR 
id=2)subq;
@@ -332,7 +332,7 @@ SELECT * FROM ordinal_out_summary;
 SELECT assert(
        relative_error(coef_threshold, ARRAY[1.5104, 2.2577]) < 1e-4 AND
        relative_error(coef_feature, ARRAY[.31431, .03215]) < 1e-4 AND
-       relative_error(log_likelihood, -191.9912) < 1e-4, 
+       relative_error(log_likelihood, -191.9912) < 1e-4,
        'Ordinal regression with IRLS optimizer (test): Wrong results'
 )
 FROM ordinal_out;
@@ -346,7 +346,7 @@ SELECT ordinal_predict(
     );
 
 SELECT assert(
-    relative_error(prob, ARRAY[0.5281595, 0.4288294, 0.2461692, 0.5025270])< 
1e-4, 
+    relative_error(prob, ARRAY[0.5281595, 0.4288294, 0.2461692, 0.5025270])< 
1e-4,
        'Ordinal regression prediction: Wrong results'
 )
 FROM (SELECT array_agg("0" ORDER BY id) as prob FROM ordinal_prd)subq;

http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/graph/bfs.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/bfs.sql_in 
b/src/ports/postgres/modules/graph/bfs.sql_in
index 067c8b6..c1c27fe 100644
--- a/src/ports/postgres/modules/graph/bfs.sql_in
+++ b/src/ports/postgres/modules/graph/bfs.sql_in
@@ -42,7 +42,7 @@ m4_include(`SQLCommon.m4')
 @brief Finds the nodes reachable from a given source vertex using a 
breadth-first approach.
 
 Given a graph and a source vertex, the breadth-first search (BFS) algorithm
-finds all nodes reachable from the source vertex by searching / traversing the 
graph 
+finds all nodes reachable from the source vertex by searching / traversing the 
graph
 in a breadth-first manner.
 
 @anchor bfs
@@ -72,9 +72,9 @@ vertex ids.  The vertex ids are of type INTEGER with no 
duplicates.
 They do not need to be contiguous.</dd>
 
 <dt>edge_table</dt>
-<dd>TEXT. Name of the table containing the edge data. The edge table must 
contain 
-columns for source vertex and destination vertex. Column naming convention is 
-described below in the 'edge_args' parameter. 
+<dd>TEXT. Name of the table containing the edge data. The edge table must 
contain
+columns for source vertex and destination vertex. Column naming convention is
+described below in the 'edge_args' parameter.
 In addition to vertex columns, if grouping is used then the columns specified
 in the 'grouping_cols' parameter must be present. </dd>
 
@@ -82,12 +82,12 @@ in the 'grouping_cols' parameter must be present. </dd>
 <dd>TEXT. A comma-delimited string containing multiple named arguments of
 the form "name=value". The following parameters are supported for
 this string argument:
-  - src (INTEGER): Name of the column containing the source vertex ids in the 
edge table. 
+  - src (INTEGER): Name of the column containing the source vertex ids in the 
edge table.
   Default column name is 'src'.
   (This is not to be confused with the 'source_vertex' argument passed to the 
BFS function.)
-  - dest (INTEGER): Name of the column containing the destination vertex ids 
in 
+  - dest (INTEGER): Name of the column containing the destination vertex ids in
   the edge table. Default column name is 'dest'.
-  
+
 <dt>source_vertex</dt>
 <dd>INTEGER. The source vertex id for the algorithm to start. This vertex id 
must
 exist in the 'vertex_id' column of 'vertex_table'.</dd>
@@ -95,37 +95,37 @@ exist in the 'vertex_id' column of 'vertex_table'.</dd>
 <dt>out_table</dt>
 <dd>TEXT. Name of the table to store the result of BFS.
 It contains a row for every vertex that is reachable from the source_vertex.
-In the presence of grouping columns, only those edges are used for which there 
are no NULL values 
+In the presence of grouping columns, only those edges are used for which there 
are no NULL values
 in any grouping column.
 The output table will have the following columns (in addition to the grouping 
columns):
-  - vertex_id : The id for any node reachable from source_vertex in addition 
to 
-                the source_vertex. Will use the input parameter 'vertex_id' 
for 
+  - vertex_id : The id for any node reachable from source_vertex in addition to
+                the source_vertex. Will use the input parameter 'vertex_id' for
                 column naming.
-  - dist      : The distance in number of edges (or hops) from the 
source_vertex 
-                to where this vertex is located. 
-  - parent    : The parent of this vertex in BFS traversal of the graph from 
source_vertex. 
-                Will use 'parent' for column naming. For the 
+  - dist      : The distance in number of edges (or hops) from the 
source_vertex
+                to where this vertex is located.
+  - parent    : The parent of this vertex in BFS traversal of the graph from 
source_vertex.
+                Will use 'parent' for column naming. For the
                 case where vertex_id = source_vertex, the value for parent is 
NULL.
 
 A summary table named <out_table>_summary is also created. This is an internal 
table that keeps a record of the input parameters.
 </dd>
 
 <dt>max_distance (optional)</dt>
-<dd>INT, default = NULL. Maximum distance to traverse 
-from the source vertex.  When this value is null, 
-traverses until reaches leaf node.  E.g., if set 
-to 1 will return only adjacent vertices, if set 
-to 7 will return vertices up to a maximum distance 
+<dd>INT, default = NULL. Maximum distance to traverse
+from the source vertex.  When this value is null,
+traverses until it reaches a leaf node.  E.g., if set
+to 1 will return only adjacent vertices, if set
+to 7 will return vertices up to a maximum distance
 of 7 vertices away.
 
 <dt>directed (optional)</dt>
 <dd>BOOLEAN, default = FALSE. If TRUE the graph will be treated as directed, 
else it will be treated as an undirected graph.</dd>
 
 <dt>grouping_cols (optional)</dt>
-<dd>TEXT, default = NULL. A comma-separated list of columns used to group the 
-input into discrete subgraphs. 
-These columns must exist in the edge table. When this value is NULL, no 
grouping is used 
-and a single BFS result is generated. 
+<dd>TEXT, default = NULL. A comma-separated list of columns used to group the
+input into discrete subgraphs.
+These columns must exist in the edge table. When this value is NULL, no 
grouping is used
+and a single BFS result is generated.
 @note Expressions are not currently supported for 'grouping_cols'.</dd>
 
 </dl>
@@ -133,11 +133,11 @@ and a single BFS result is generated.
 @anchor notes
 @par Notes
 
-The graph_bfs function is a SQL implementation of the well-known breadth-first 
-search algorithm [1] modified appropriately for a relational database. It will 
-find any node in the graph reachable from the source_vertex only once. If a 
node 
-is reachable by many different paths from the source_vertex (i.e. has more 
than 
-one parent), then only one of those parents is present in the output table. 
+The graph_bfs function is a SQL implementation of the well-known breadth-first
+search algorithm [1] modified appropriately for a relational database. It will
+find any node in the graph reachable from the source_vertex only once. If a 
node
+is reachable by many different paths from the source_vertex (i.e. has more than
+one parent), then only one of those parents is present in the output table.
 The BFS result will, in general, be different for different choices of 
source_vertex.
 
 @anchor examples
@@ -195,9 +195,9 @@ SELECT madlib.graph_bfs(
 SELECT * FROM out ORDER BY dist,id;
 </pre>
 <pre class="result">
- id | dist | parent 
+ id | dist | parent
 ----+------+--------
-  3 |    0 |       
+  3 |    0 |
   1 |    1 |      3
   4 |    1 |      3
   5 |    1 |      3
@@ -210,7 +210,7 @@ SELECT * FROM out ORDER BY dist,id;
 SELECT * FROM out_summary;
 </pre>
 <pre class="result">
- vertex_table | vertex_id | edge_table | edge_args | source_vertex | out_table 
| max_distance | directed | grouping_cols 
+ vertex_table | vertex_id | edge_table | edge_args | source_vertex | out_table 
| max_distance | directed | grouping_cols
 
--------------+-----------+------------+-----------+---------------+-----------+--------------+----------+---------------
  vertex       | NULL      | edge       | NULL      |             3 | out       
|              |          | NULL
 (1 row)
@@ -226,14 +226,14 @@ SELECT madlib.graph_bfs(
                          NULL,          -- Edge arguments (NULL means use 
default naming)
                          3,             -- Source vertex for BFS
                          'out_max',     -- Output table of nodes reachable 
from source_vertex
-                         2);            -- Maximum distance to traverse from 
source_vertex        
+                         2);            -- Maximum distance to traverse from 
source_vertex
                          -- Default values used for the other arguments
 SELECT * FROM out_max ORDER BY dist,id;
 </pre>
 <pre class="result">
- id | dist | parent 
+ id | dist | parent
 ----+------+--------
-  3 |    0 |       
+  3 |    0 |
   1 |    1 |      3
   4 |    1 |      3
   5 |    1 |      3
@@ -264,9 +264,9 @@ SELECT madlib.graph_bfs(
 SELECT * FROM out_alt ORDER BY v_id;
 </pre>
 <pre class="result">
- v_id | dist | parent 
+ v_id | dist | parent
 ------+------+--------
-    8 |    0 |       
+    8 |    0 |
     9 |    1 |      8
    10 |    1 |      8
    11 |    2 |      9
@@ -287,15 +287,15 @@ SELECT madlib.graph_bfs(
 SELECT * FROM out_alt_dir ORDER BY v_id;
 </pre>
 <pre class="result">
- v_id | dist | parent 
+ v_id | dist | parent
 ------+------+--------
-    8 |    0 |       
+    8 |    0 |
     9 |    1 |      8
    10 |    2 |      9
    11 |    2 |      9
 (4 rows)
 </pre>
-Notice that, with the graph being treated as directed, the parent of v_id=10 
+Notice that, with the graph being treated as directed, the parent of v_id=10
 is now vertex 9 and not 8 as in the undirected case.
 
 -# Create a graph with 2 groups:
@@ -343,13 +343,13 @@ SELECT madlib.graph_bfs(
 SELECT * FROM out_gr ORDER BY g1,g2,dist,id;
 </pre>
 <pre class="result">
- g1  | g2 | id | dist | parent 
+ g1  | g2 | id | dist | parent
 -----+----+----+------+--------
- 100 | a  |  8 |    0 |       
+ 100 | a  |  8 |    0 |
  100 | a  |  9 |    1 |      8
  100 | a  | 10 |    1 |      8
  100 | a  | 11 |    2 |      9
- 202 | c  |  8 |    0 |       
+ 202 | c  |  8 |    0 |
  202 | c  |  9 |    1 |      8
  202 | c  | 10 |    1 |      8
  202 | c  | 11 |    2 |      9
@@ -373,9 +373,9 @@ SELECT madlib.graph_bfs(
 SELECT * FROM out_gr ORDER BY g1,g2,dist,id;
 </pre>
 <pre class="result">
- g1  | g2 | id | dist | parent 
+ g1  | g2 | id | dist | parent
 -----+----+----+------+--------
- 100 | a  |  3 |    0 |       
+ 100 | a  |  3 |    0 |
  100 | a  |  1 |    1 |      3
  100 | a  |  4 |    1 |      3
  100 | a  |  5 |    1 |      3

http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/graph/hits.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/hits.sql_in 
b/src/ports/postgres/modules/graph/hits.sql_in
index bf4b414..96a507c 100644
--- a/src/ports/postgres/modules/graph/hits.sql_in
+++ b/src/ports/postgres/modules/graph/hits.sql_in
@@ -43,7 +43,7 @@ m4_include(`SQLCommon.m4')
 @brief Find the HITS scores(authority and hub) of all vertices in a directed
 graph.
 
-Given a graph, the HITS (Hyperlink-Induced Topic Search) algorithm outputs the 
+Given a graph, the HITS (Hyperlink-Induced Topic Search) algorithm outputs the
 authority score and hub score of every vertex, where authority estimates the
 value of the content of the page and hub estimates the value of its links to
 other pages. This algorithm was originally developed to rate web pages [1].
@@ -65,61 +65,61 @@ hits( vertex_table,
 \b Arguments
 <dl class="arglist">
 <dt>vertex_table</dt>
-<dd>TEXT. Name of the table containing the vertex data for the graph. Must 
+<dd>TEXT. Name of the table containing the vertex data for the graph. Must
     contain the column specified in the 'vertex_id' parameter below.</dd>
 
 <dt>vertex_id</dt>
-<dd>TEXT, default = 'id'. Name of the column in 'vertex_table' containing 
-    vertex ids. The vertex ids are of type INTEGER with no duplicates. They 
+<dd>TEXT, default = 'id'. Name of the column in 'vertex_table' containing
+    vertex ids. The vertex ids are of type INTEGER with no duplicates. They
     do not need to be contiguous.</dd>
 
 <dt>edge_table</dt>
-<dd>TEXT. Name of the table containing the edge data. The edge table must   
+<dd>TEXT. Name of the table containing the edge data. The edge table must
 contain columns for source vertex and destination vertex.</dd>
 
 <dt>edge_args</dt>
 <dd>TEXT. A comma-delimited string containing multiple named arguments of
 the form "name=value". The following parameters are supported for
 this string argument:
-  - src (INTEGER): Name of the column containing the source vertex ids in 
+  - src (INTEGER): Name of the column containing the source vertex ids in
                    the edge table. Default column name is 'src'.
-  - dest (INTEGER): Name of the column containing the destination vertex 
+  - dest (INTEGER): Name of the column containing the destination vertex
                     ids in the edge table. Default column name is 'dest'.</dd>
 
 <dt>out_table</dt>
-<dd>TEXT. Name of the table to store the result of HITS. It will contain 
+<dd>TEXT. Name of the table to store the result of HITS. It will contain
     a row for every vertex from 'vertex_table' with the following columns:
-    - vertex_id : The id of a vertex. Will use the input parameter 'vertex_id' 
+    - vertex_id : The id of a vertex. Will use the input parameter 'vertex_id'
                   for column naming.
     - authority : The vertex authority score.
     - hub : The vertex hub score.
     - grouping_cols : Grouping column values (if any) associated with the 
vertex_id.
 </dd>
 
-A summary table is also created that contains information 
+A summary table is also created that contains information
 regarding the number of iterations required for convergence.
-It is named by adding the suffix '_summary' to the 'out_table' 
+It is named by adding the suffix '_summary' to the 'out_table'
 parameter.
 
 <dt>max_iter (optional) </dt>
-<dd>INTEGER, default: 100. The maximum number of iterations allowed. Each 
+<dd>INTEGER, default: 100. The maximum number of iterations allowed. Each
     iteration consists of both authority and hub phases.</dd>
 
 <dt>threshold (optional) </dt>
 <dd>FLOAT8, default: (1/number of vertices * 1000).
     Threshold must be set to a value between 0 and 1, inclusive
-    of end points. 
+    of end points.
     If the difference between two consecutive iterations of authority AND two
     consecutive iterations of hub is smaller than 'threshold', then the
-    computation stops. That is, both authority and hub value differences 
+    computation stops. That is, both authority and hub value differences
     must be below the specified threshold for the algorithm to stop.
     If you set the threshold to 0, then you will force the
     algorithm to run for the full number of iterations specified in 'max_iter'.
 </dd>
 
 <dt>grouping_cols (optional)</dt>
-<dd>TEXT, default: NULL. A single column or a list of comma-separated columns 
-    that divides the input data into discrete groups, resulting in one 
+<dd>TEXT, default: NULL. A single column or a list of comma-separated columns
+    that divides the input data into discrete groups, resulting in one
     distribution per group. When this value is NULL, no grouping is used and a
     single model is generated for all data.
     @note Expressions are not currently supported for 'grouping_cols'.
@@ -369,8 +369,8 @@ SELECT * FROM hits_out_summary order by user_id;
 @anchor literature
 @par Literature
 
-[1] Kleinerg, Jon M., "Authoritative Sources in a Hyperlinked 
-Environment", Journal of the ACM, Sept. 1999.  
+[1] Kleinberg, Jon M., "Authoritative Sources in a Hyperlinked
+Environment", Journal of the ACM, Sept. 1999.
 https://www.cs.cornell.edu/home/kleinber/auth.pdf
 */
 

http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/graph/pagerank.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/pagerank.sql_in 
b/src/ports/postgres/modules/graph/pagerank.sql_in
index 30a0b50..b81b58e 100644
--- a/src/ports/postgres/modules/graph/pagerank.sql_in
+++ b/src/ports/postgres/modules/graph/pagerank.sql_in
@@ -48,8 +48,8 @@ This algorithm was originally used by Google to rank websites 
where the World Wi
 modeled as a directed graph with the vertices representing the websites.  The 
PageRank
 algorithm initially proposed by Larry Page and Sergey Brin is implemented here 
[1].
 
-We also implement personalized PageRank, in which a notion of importance 
-provides personalization to a query.  
+We also implement personalized PageRank, in which a notion of importance
+provides personalization to a query.
 For example, importance scores can be biased according
 to a specified set of vertices in the graph that are of interest or special in 
some way [2].
 
@@ -326,13 +326,13 @@ SELECT * FROM pagerank_out_summary;
 @anchor literature
 @par Literature
 
-[1] Brin, S. and Page, L. (1998), "The anatomy of a large-scale hypertextual 
Web search engine", 
-Computer Networks and ISDN Systems. 30: 107–117, 
+[1] Brin, S. and Page, L. (1998), "The anatomy of a large-scale hypertextual 
Web search engine",
+Computer Networks and ISDN Systems. 30: 107–117,
 http://infolab.stanford.edu/pub/papers/google.pdf
 
 [2] Jeh, Glen and Widom, Jennifer. "Scaling Personalized Web Search",
-Proceedings of the 12th international conference on World Wide Web, Pages 
271-279 
-Budapest, Hungary, May 20-24, 2003, 
+Proceedings of the 12th international conference on World Wide Web, Pages 
271-279
+Budapest, Hungary, May 20-24, 2003,
 http://ilpubs.stanford.edu:8090/530/1/2002-12.pdf
 */
 

http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/graph/wcc.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/wcc.sql_in 
b/src/ports/postgres/modules/graph/wcc.sql_in
index d47f06c..f5879b9 100644
--- a/src/ports/postgres/modules/graph/wcc.sql_in
+++ b/src/ports/postgres/modules/graph/wcc.sql_in
@@ -140,8 +140,8 @@ information. It contains one or more rows for every group 
and has the following
 columns:
   - grouping_cols: The grouping columns given in the creation of wcc_table.
   If there are no grouping columns, this column is not created.
-  - component_id: The ID of the largest component.  Recall that we use the 
-  convention where 'component_id' is the id of the first vertex in a 
+  - component_id: The ID of the largest component.  Recall that we use the
+  convention where 'component_id' is the id of the first vertex in a
   particular group. It means that component ids are generally not contiguous.
   If there are multiple components of the same size, a row is created for each
   component. If grouping_cols is specified, the largest
@@ -153,7 +153,7 @@ columns:
 @anchor hist
 @par Retrieve Histogram of Vertices Per Connected Component
 
-This function creates a histogram of the number of vertices 
+This function creates a histogram of the number of vertices
 per connected component.
 
 <pre class="syntax">
@@ -394,7 +394,7 @@ SELECT * FROM wcc_out ORDER BY user_id, component_id, id;
 (13 rows)
 </pre>
 Note that vertex 4 is not identified as a separate component
-above. This is because there is no entry in the 
+above. This is because there is no entry in the
 edge table for vertex 4 indicating which group it belongs to
 (though you could do that if you wanted to).
 
@@ -414,7 +414,7 @@ SELECT * FROM largest_cpt_table ORDER BY component_id;
 (2 rows)
 </pre>
 
--# Retrieve histogram of the number of vertices per 
+-# Retrieve histogram of the number of vertices per
 connected component:
 <pre class="syntax">
 DROP TABLE IF EXISTS histogram_table;

http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/knn/knn.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/knn/knn.sql_in 
b/src/ports/postgres/modules/knn/knn.sql_in
index 1a90652..24f19c7 100644
--- a/src/ports/postgres/modules/knn/knn.sql_in
+++ b/src/ports/postgres/modules/knn/knn.sql_in
@@ -148,8 +148,8 @@ The following distance functions can be used:
 
 <dt>weighted_avg (optional)</dt>
 <dd>BOOLEAN, default: FALSE. Calculates classification or
-regression values using a weighted average.   The idea is to 
-weigh the contribution of each of the k neighbors according 
+regression values using a weighted average.   The idea is to
+weigh the contribution of each of the k neighbors according
 to their distance to the test point, giving greater influence
 to closer neighbors.  The distance function 'fn_dist' specified
 above is used.
@@ -341,7 +341,7 @@ Result, with neighbors sorted from closest to furthest:
 </pre>
 
 
--#   Run KNN for classification using the 
+-#   Run KNN for classification using the
 weighted average:
 <pre class="example">
 DROP TABLE IF EXISTS knn_result_classification;
@@ -362,7 +362,7 @@ SELECT * FROM madlib.knn(
 SELECT * FROM knn_result_classification ORDER BY id;
 </pre>
 <pre class="result">
- id |  data   |     prediction      | k_nearest_neighbours 
+ id |  data   |     prediction      | k_nearest_neighbours
 ----+---------+---------------------+----------------------
   1 | {2,1}   |                 1   | {2,1,3}
   2 | {2,6}   |                 1   | {5,4,3}
@@ -402,8 +402,8 @@ is assigned to the test point.
     
https://ai2-s2-pdfs.s3.amazonaws.com/a7e2/814ec5db800d2f8c4313fd436e9cf8273821.pdf
 
 @anchor knn-lit-4
-[4]    Shepard, Donald (1968). "A two-dimensional interpolation function for 
-irregularly-spaced data". Proceedings of the 1968 ACM National Conference. pp. 
517–524. 
+[4]    Shepard, Donald (1968). "A two-dimensional interpolation function for
+irregularly-spaced data". Proceedings of the 1968 ACM National Conference. pp. 
517–524.
 
 @internal
 @sa namespace knn (documenting the implementation in Python)
@@ -448,7 +448,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
     output_table VARCHAR,
     k INTEGER,
     output_neighbors BOOLEAN,
-    fn_dist TEXT, 
+    fn_dist TEXT,
     weighted_avg BOOLEAN
 ) RETURNS VARCHAR AS $$
     PythonFunctionBodyOnly(`knn', `knn')

http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/lda/lda.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/lda/lda.sql_in 
b/src/ports/postgres/modules/lda/lda.sql_in
index efdb957..2355850 100644
--- a/src/ports/postgres/modules/lda/lda.sql_in
+++ b/src/ports/postgres/modules/lda/lda.sql_in
@@ -35,16 +35,16 @@ Latent Dirichlet Allocation (LDA) is a generative 
probabilistic
 model for natural texts.  It is used in problems such as automated
 topic discovery, collaborative filtering, and document classification.
 
-In addition to an implementation of LDA, this MADlib module also provides a 
+In addition to an implementation of LDA, this MADlib module also provides a
 number of additional helper functions to interpret results of the LDA output.
 
 @note
-Topic modeling is often used as part of a larger text processing 
+Topic modeling is often used as part of a larger text processing
 pipeline, which may include operations such as term frequency, stemming and
-stop word removal.  You can use the 
+stop word removal.  You can use the
 function <a href="group__grp__text__utilities.html">Term Frequency</a>
-to generate the required vocabulary format from raw documents for the 
-LDA training function. See 
+to generate the required vocabulary format from raw documents for the
+LDA training function. See
 the examples later on this page for more details.
 
 @anchor background
@@ -107,29 +107,29 @@ lda_train( data_table,
     <dt>data_table</dt>
     <dd>TEXT. Name of the table storing the training dataset. Each row is
     in the form <tt>&lt;docid, wordid, count&gt;</tt> where \c docid, \c 
wordid, and \c count
-    are non-negative integers.  
+    are non-negative integers.
     The \c docid column refers to the document ID, the \c wordid column is the
     word ID (the index of a word in the vocabulary), and \c count is the
     number of occurrences of the word in the document. Please note:
-    
-    - \c wordid must be 
+
+    - \c wordid must be
     contiguous integers going from from 0 to \c voc_size &minus; \c 1.
-    - column names for \c docid, \c wordid, and \c count are currently fixed, 
-    so you must use these exact names in the data_table.  
-    
+    - column names for \c docid, \c wordid, and \c count are currently fixed,
+    so you must use these exact names in the data_table.
+
     The function <a href="group__grp__text__utilities.html">Term Frequency</a>
     can be used to generate vocabulary in the required format from raw 
documents.
     </dd>
 
     <dt>model_table</dt>
-    <dd>TEXT. This is an output table generated by LDA which contains the 
learned model. 
+    <dd>TEXT. This is an output table generated by LDA which contains the 
learned model.
     It has one row with the following columns:
         <table class="output">
             <tr>
                 <th>voc_size</th>
-                <td>INTEGER. Size of the vocabulary. As mentioned above for 
the input 
-                table, \c wordid consists of contiguous integers going 
-                from 0 to \c voc_size &minus; \c 1.  
+                <td>INTEGER. Size of the vocabulary. As mentioned above for 
the input
+                table, \c wordid consists of contiguous integers going
+                from 0 to \c voc_size &minus; \c 1.
                 </td>
             </tr>
             <tr>
@@ -138,12 +138,12 @@ lda_train( data_table,
             </tr>
             <tr>
                 <th>alpha</th>
-                <td>DOUBLE PRECISION. Dirichlet prior for the per-document 
+                <td>DOUBLE PRECISION. Dirichlet prior for the per-document
                 topic multinomial.</td>
             </tr>
             <tr>
                 <th>beta</th>
-                <td>DOUBLE PRECISION. Dirichlet prior for the per-topic 
+                <td>DOUBLE PRECISION. Dirichlet prior for the per-topic
                 word multinomial.</td>
             </tr>
             <tr>
@@ -153,7 +153,7 @@ lda_train( data_table,
         </table>
     </dd>
     <dt>output_data_table</dt>
-    <dd>TEXT. The name of the table generated by LDA that stores 
+    <dd>TEXT. The name of the table generated by LDA that stores
     the output data. It has the following columns:
         <table class="output">
             <tr>
@@ -162,14 +162,14 @@ lda_train( data_table,
             </tr>
             <tr>
                 <th>wordcount</th>
-                <td>INTEGER. Count of number of words in the document, 
-                including repeats. For example, if a word appears 3 times 
+                <td>INTEGER. Count of number of words in the document,
+                including repeats. For example, if a word appears 3 times
                 in the document, it is counted 3 times.</td>
             </tr>
             <tr>
                 <th>words</th>
                 <td>INTEGER[]. Array of \c wordid in the document, not
-                including repeats.  For example, if a word appears 3 times 
+                including repeats.  For example, if a word appears 3 times
                 in the document, it appears only once in the \c words 
array.</td>
             </tr>
             <tr>
@@ -183,34 +183,34 @@ lda_train( data_table,
             <tr>
                 <th>topic_count</th>
                 <td>INTEGER[]. Array of the count of words in the document
-                that correspond to each topic.  This array is of 
-                length \c topic_num. Topic ids are continuous integers going 
+                that correspond to each topic.  This array is of
+                length \c topic_num. Topic ids are continuous integers going
                 from 0 to \c topic_num &minus; \c 1.</td>
             </tr>
             <tr>
                 <th>topic_assignment</th>
-                <td>INTEGER[]. Array indicating which topic each word in the 
+                <td>INTEGER[]. Array indicating which topic each word in the
                 document corresponds to.  This array is of length \c  
wordcount.
-                Words that are repeated \c n times in the document 
+                Words that are repeated \c n times in the document
                 will show up consecutively \c n times in this array.</td>
             </tr>
         </table>
     </dd>
     <dt>voc_size</dt>
-    <dd>INTEGER. Size of the vocabulary. As mentioned above for the 
-                input 'data_table', \c wordid consists of continuous integers 
going 
-                from 0 to \c voc_size &minus; \c 1.   
+    <dd>INTEGER. Size of the vocabulary. As mentioned above for the
+                input 'data_table', \c wordid consists of continuous integers 
going
+                from 0 to \c voc_size &minus; \c 1.
     </dd>
     <dt>topic_num</dt>
     <dd>INTEGER. Desired number of topics.</dd>
     <dt>iter_num</dt>
     <dd>INTEGER. Desired number of iterations.</dd>
     <dt>alpha</dt>
-    <dd>DOUBLE PRECISION. Dirichlet prior for the per-document topic 
+    <dd>DOUBLE PRECISION. Dirichlet prior for the per-document topic
     multinomial (e.g., 50/topic_num is a reasonable value to start with
     as per Griffiths and Steyvers [2] ).</dd>
     <dt>beta</dt>
-    <dd>DOUBLE PRECISION. Dirichlet prior for the per-topic 
+    <dd>DOUBLE PRECISION. Dirichlet prior for the per-topic
     word multinomial (e.g., 0.01 is a reasonable value to start with).</dd>
 </dl>
 
@@ -227,27 +227,27 @@ lda_predict( data_table,
 \b Arguments
 <dl class="arglist">
 <dt>data_table</dt>
-    <dd>TEXT. Name of the table storing the test dataset 
+    <dd>TEXT. Name of the table storing the test dataset
     (new document to be labeled).
     </dd>
 <dt>model_table</dt>
     <dd>TEXT. The model table generated by the training process.
     </dd>
 <dt>output_predict_table</dt>
-    <dd>TEXT. The prediction output table. 
-    Each row in the table stores the topic 
-    distribution and the topic assignments for a 
-    document in the dataset. This table has the exact 
-    same columns and interpretation as 
-    the 'output_data_table' from the training function above. 
+    <dd>TEXT. The prediction output table.
+    Each row in the table stores the topic
+    distribution and the topic assignments for a
+    document in the dataset. This table has the exact
+    same columns and interpretation as
+    the 'output_data_table' from the training function above.
     </dd>
 </dl>
 
 @anchor perplexity
 @par Perplexity
-Perplexity describes how well the model fits the data by 
+Perplexity describes how well the model fits the data by
 computing word likelihoods averaged over the test documents.
-This function returns a single perplexity value. 
+This function returns a single perplexity value.
 <pre class="syntax">
 lda_get_perplexity( model_table,
                     output_predict_table
@@ -259,7 +259,7 @@ lda_get_perplexity( model_table,
     <dd>TEXT. The model table generated by the training process.
     </dd>
 <dt>output_predict_table</dt>
-    <dd>TEXT. The prediction output table generated by the 
+    <dd>TEXT. The prediction output table generated by the
     predict function above.
     </dd>
 </dl>
@@ -267,7 +267,7 @@ lda_get_perplexity( model_table,
 @anchor helper
 @par Helper Functions
 
-The helper functions can help to interpret the output 
+The helper functions can help to interpret the output
 from LDA training and LDA prediction.
 
 <b>Topic description by top-k words with highest probability</b>
@@ -288,13 +288,13 @@ lda_get_topic_desc( model_table,
     </dd>
 <dt>vocab_table</dt>
     <dd>TEXT. The vocabulary table in the form <wordid, word>.
-    Reminder that this table can be created using the \c term_frequency 
-    function (\ref grp_text_utilities) with the 
-    parameter \c compute_vocab set to TRUE. 
+    Reminder that this table can be created using the \c term_frequency
+    function (\ref grp_text_utilities) with the
+    parameter \c compute_vocab set to TRUE.
     </dd>
 <dt>output_table</dt>
     <dd>TEXT. The output table with per-topic description
-    generated by this helper function.  
+    generated by this helper function.
     It has the following columns:
         <table class="output">
             <tr>
@@ -337,7 +337,7 @@ lda_get_word_topic_count( model_table,
     </dd>
 <dt>output_table</dt>
     <dd>TEXT. The output table with per-word topic counts
-        generated by this helper function. 
+        generated by this helper function.
         It has the following columns:
         <table class="output">
             <tr>
@@ -347,7 +347,7 @@ lda_get_word_topic_count( model_table,
             <tr>
                 <th>topic_count</th>
                 <td>INTEGER[]. Count of word association with each topic, i.e.,
-                shows how many times a given word is 
+                shows how many times a given word is
                 assigned to a topic. Array is of length number of topics.</td>
             </tr>
         </table>
@@ -370,7 +370,7 @@ lda_get_topic_word_count( model_table,
     </dd>
 <dt>output_table</dt>
     <dd>TEXT. The output table with per-topic word counts
-        generated by this helper function. 
+        generated by this helper function.
         It has the following columns:
         <table class="output">
             <tr>
@@ -403,7 +403,7 @@ lda_get_word_topic_mapping( output_data_table,  -- From 
training or prediction
     </dd>
 <dt>output_table</dt>
     <dd>TEXT. The output table with word to topic mappings
-        generated by this helper function. 
+        generated by this helper function.
         It has the following columns:
         <table class="output">
             <tr>
@@ -435,10 +435,10 @@ INSERT INTO documents VALUES
 (2, 'Machine learning is closely related to and often overlaps with 
computational statistics; a discipline that also specializes in 
prediction-making. It has strong ties to mathematical optimization, which 
deliver methods, theory and application domains to the field.'),
 (3, 'California''s diverse geography ranges from the Sierra Nevada in the east 
to the Pacific Coast in the west, from the Redwood–Douglas fir forests of the 
northwest, to the Mojave Desert areas in the southeast. The center of the state 
is dominated by the Central Valley, a major agricultural area.');
 </pre>
-You can apply stemming, stop word removal and tokenization 
-at this point in order to prepare the documents for text 
-processing. Depending upon your database version, various 
-tools are available. Databases based on more recent 
+You can apply stemming, stop word removal and tokenization
+at this point in order to prepare the documents for text
+processing. Depending upon your database version, various
+tools are available. Databases based on more recent
 versions of PostgreSQL may do something like:
 <pre class="example">
 SELECT tsvector_to_array(to_tsvector('english',contents)) from documents;
@@ -452,13 +452,13 @@ SELECT tsvector_to_array(to_tsvector('english',contents)) 
from documents;
  {agricultur,area,california,center,central,coast,desert,divers,...}
 (4 rows)
 </pre>
-In this example, we assume a database based on an older 
-version of PostgreSQL and just perform basic punctuation removal 
-and tokenization. The array of words is added as a new column 
+In this example, we assume a database based on an older
+version of PostgreSQL and just perform basic punctuation removal
+and tokenization. The array of words is added as a new column
 to the documents table:
 <pre class="example">
 ALTER TABLE documents ADD COLUMN words TEXT[];
-UPDATE documents SET words = 
+UPDATE documents SET words =
     regexp_split_to_array(lower(
     regexp_replace(contents, E'[,.;\']','', 'g')
     ), E'[\\\\s+]');
@@ -494,7 +494,7 @@ SELECT madlib.term_frequency('documents',    -- input table
 SELECT * FROM documents_tf ORDER BY docid LIMIT 20;
 </pre>
 <pre class="result">
- docid | wordid | count 
+ docid | wordid | count
 -------+--------+-------
      0 |     71 |     1
      0 |     90 |     1
@@ -523,7 +523,7 @@ Here is the associated vocabulary table. Note that wordid 
starts at 0:
 SELECT * FROM documents_tf_vocabulary ORDER BY wordid LIMIT 20;
 </pre>
 <pre class="result">
- wordid |     word     
+ wordid |     word
 --------+--------------
       0 | 1960s
       1 | 1968
@@ -552,25 +552,25 @@ The total number of words in the vocabulary across all 
documents is:
 SELECT COUNT(*) FROM documents_tf_vocabulary;
 </pre>
 <pre class="result">
- count 
+ count
 +------
    103
 (1 row)
 </pre>
--# Train LDA model.  For Dirichlet priors we use initial 
-rule-of-thumb values of 50/(number of topics) for alpha 
+-# Train LDA model.  For Dirichlet priors we use initial
+rule-of-thumb values of 50/(number of topics) for alpha
 and 0.01 for beta.
-Reminder that column names for docid, wordid, and count 
-are currently fixed, so you must use these exact names 
-in the input table. After a successful run of the LDA 
-training function two tables are generated, one for 
-storing the learned model and the other for storing 
+Reminder that column names for docid, wordid, and count
+are currently fixed, so you must use these exact names
+in the input table. After a successful run of the LDA
+training function two tables are generated, one for
+storing the learned model and the other for storing
 the output data table.
 <pre class="example">
 DROP TABLE IF EXISTS lda_model, lda_output_data;
 SELECT madlib.lda_train( 'documents_tf',     -- documents table in the form of 
term frequency
                          'lda_model',        -- model table created by LDA 
training (not human readable)
-                         'lda_output_data',  -- readable output data table 
+                         'lda_output_data',  -- readable output data table
                          103,                -- vocabulary size
                          5,                  -- number of topics
                          10,                 -- number of iterations
@@ -611,10 +611,10 @@ topic_assignment | 
{4,4,4,0,2,0,0,2,4,4,2,2,2,1,2,4,1,0,2,2,2,2,2,2,2,2,2,2,2,1,
 </pre>
 
 -# Review learned model using helper functions.
-First, we get topic description by top-k words. These are 
+First, we get topic description by top-k words. These are
 the k words with the highest probability for the topic.
-Note that if there are ties in probability, more than k 
-words may actually be reported for each topic. Also note 
+Note that if there are ties in probability, more than k
+words may actually be reported for each topic. Also note
 that topicid starts at 0:
 <pre class="example">
 DROP TABLE IF EXISTS helper_output_table;
@@ -625,7 +625,7 @@ SELECT madlib.lda_get_topic_desc( 'lda_model',              
  -- LDA model gener
 SELECT * FROM helper_output_table ORDER BY topicid, prob DESC LIMIT 40;
 </pre>
 <pre class="result">
- topicid | wordid |        prob        |       word        
+ topicid | wordid |        prob        |       word
 ---------+--------+--------------------+-------------------
        0 |      3 |  0.111357750647429 | a
        0 |     51 |  0.074361820199778 | is
@@ -669,9 +669,9 @@ SELECT * FROM helper_output_table ORDER BY topicid, prob 
DESC LIMIT 40;
        1 |     33 | 0.0438558402084238 | discipline
 (40 rows)
 </pre>
-Get the per-word topic counts.  This mapping shows how 
-many times a given word is assigned to a topic.  E.g., 
-wordid 3 is assigned to topicid 0 three times: 
+Get the per-word topic counts.  This mapping shows how
+many times a given word is assigned to a topic.  E.g.,
+wordid 3 is assigned to topicid 0 three times:
 <pre class="example">
 DROP TABLE IF EXISTS helper_output_table;
 SELECT madlib.lda_get_word_topic_count( 'lda_model',            -- LDA model 
generated in training
@@ -679,7 +679,7 @@ SELECT madlib.lda_get_word_topic_count( 'lda_model',        
    -- LDA model gen
 SELECT * FROM helper_output_table ORDER BY wordid LIMIT 20;
 </pre>
 <pre class="result">
- wordid | topic_count 
+ wordid | topic_count
 --------+-------------
       0 | {0,1,0,0,0}
       1 | {1,0,0,0,0}
@@ -703,7 +703,7 @@ SELECT * FROM helper_output_table ORDER BY wordid LIMIT 20;
      19 | {2,0,0,0,0}
 (20 rows)
 </pre>
-Get the per-topic word counts.   This mapping shows 
+Get the per-topic word counts.   This mapping shows
 which words are associated with each topic by frequency:
 <pre class="example">
 DROP TABLE IF EXISTS topic_word_count;
@@ -736,7 +736,7 @@ SELECT madlib.lda_get_word_topic_mapping('lda_output_data', 
 -- Output table fro
 SELECT * FROM helper_output_table ORDER BY docid LIMIT 40;
 </pre>
 <pre class="result">
- docid | wordid | topicid 
+ docid | wordid | topicid
 -------+--------+---------
      0 |     56 |       1
      0 |     54 |       1
@@ -781,15 +781,15 @@ SELECT * FROM helper_output_table ORDER BY docid LIMIT 40;
 (40 rows)
 </pre>
 
--# Use a learned LDA model for prediction (that is, to label new documents). 
-In this example, we use the same input table as we used to train, just for 
-demonstration purpose. Normally, the test document is a new one that 
+-# Use a learned LDA model for prediction (that is, to label new documents).
+In this example, we use the same input table as we used to train, just for
+demonstration purpose. Normally, the test document is a new one that
 we want to predict on.
 <pre class="example">
 DROP TABLE IF EXISTS outdata_predict;
 SELECT madlib.lda_predict( 'documents_tf',          -- Document to predict
                            'lda_model',             -- LDA model from training
-                           'outdata_predict'        -- Output table for predict results         
+                           'outdata_predict'        -- Output table for predict results
                          );
 SELECT * FROM outdata_predict;
 </pre>
@@ -837,7 +837,7 @@ SELECT madlib.lda_get_word_topic_mapping('outdata_predict', 
 -- Output table fro
 SELECT * FROM helper_output_table ORDER BY docid LIMIT 40;
 </pre>
 <pre class="result">
- docid | wordid | topicid 
+ docid | wordid | topicid
 -------+--------+---------
      0 |     54 |       4
      0 |     42 |       1
@@ -882,8 +882,8 @@ SELECT * FROM helper_output_table ORDER BY docid LIMIT 40;
 (40 rows)
 </pre>
 
--# Call the perplexity function to see how well the model fits 
-the data. Perplexity computes word likelihoods averaged 
+-# Call the perplexity function to see how well the model fits
+the data. Perplexity computes word likelihoods averaged
 over the test documents.
 <pre class="example">
 SELECT madlib.lda_get_perplexity( 'lda_model',        -- LDA model from 
training
@@ -891,7 +891,7 @@ SELECT madlib.lda_get_perplexity( 'lda_model',        -- 
LDA model from training
                                 );
 </pre>
 <pre class="result">
- lda_get_perplexity 
+ lda_get_perplexity
 +--------------------
     79.481894411824
 (1 row)

Reply via email to