Repository: madlib
Updated Branches:
  refs/heads/master 92bdf8cab -> 35818fa39


http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/tsa/test/arima_train.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/tsa/test/arima_train.sql_in 
b/src/ports/postgres/modules/tsa/test/arima_train.sql_in
index e1b2919..e6f5bd2 100644
--- a/src/ports/postgres/modules/tsa/test/arima_train.sql_in
+++ b/src/ports/postgres/modules/tsa/test/arima_train.sql_in
@@ -66,55 +66,55 @@ drop table if exists tsa_out;
 drop table if exists tsa_out_summary;
 drop table if exists tsa_out_residual;
 select arima_train('mini_ts', 'tsa_out', 'id', 'val', NULL, TRUE, 
ARRAY[1,0,1]);
-select assert(relative_error(ar_params, ARRAY[0.685268276058]) < 1e-2, 'ARIMA: 
wrong ar_params') from tsa_out; 
-select assert(relative_error(ar_std_errors, ARRAY[0.103996616127]) < 1e-2, 
'ARIMA: wrong ar_std_errors') from tsa_out; 
-select assert(relative_error(ma_params, ARRAY[0.730629026211]) < 1e-2, 'ARIMA: 
wrong ma_params') from tsa_out; 
-select assert(relative_error(ma_std_errors, ARRAY[0.0979481470864]) < 1e-2, 
'ARIMA: wrong ma_std_errors') from tsa_out; 
-select assert(relative_error(mean, 38.6009250545) < 1e-2, 'ARIMA: wrong mean') 
from tsa_out; 
-select assert(relative_error(mean_std_error, 13.2499230619) < 1e-2, 'ARIMA: 
wrong mean_std_errors') from tsa_out; 
-select assert(relative_error(residual_variance, 281.669418496) < 1e-2, 'ARIMA: 
wrong residual_variance') from tsa_out_summary; 
-select assert(relative_error(log_likelihood, -207.725973784) < 1e-2, 'ARIMA: 
wrong log_likelihood') from tsa_out_summary; 
+select assert(relative_error(ar_params, ARRAY[0.685268276058]) < 1e-2, 'ARIMA: 
wrong ar_params') from tsa_out;
+select assert(relative_error(ar_std_errors, ARRAY[0.103996616127]) < 1e-2, 
'ARIMA: wrong ar_std_errors') from tsa_out;
+select assert(relative_error(ma_params, ARRAY[0.730629026211]) < 1e-2, 'ARIMA: 
wrong ma_params') from tsa_out;
+select assert(relative_error(ma_std_errors, ARRAY[0.0979481470864]) < 1e-2, 
'ARIMA: wrong ma_std_errors') from tsa_out;
+select assert(relative_error(mean, 38.6009250545) < 1e-2, 'ARIMA: wrong mean') 
from tsa_out;
+select assert(relative_error(mean_std_error, 13.2499230619) < 1e-2, 'ARIMA: 
wrong mean_std_errors') from tsa_out;
+select assert(relative_error(residual_variance, 281.669418496) < 1e-2, 'ARIMA: 
wrong residual_variance') from tsa_out_summary;
+select assert(relative_error(log_likelihood, -207.725973784) < 1e-2, 'ARIMA: 
wrong log_likelihood') from tsa_out_summary;
 
 -- FALSE, ARRAY[1,0,1]
 drop table if exists tsa_out;
 drop table if exists tsa_out_summary;
 drop table if exists tsa_out_residual;
 select arima_train('mini_ts', 'tsa_out', 'id', 'val', NULL, FALSE, 
ARRAY[1,0,1]);
-select assert(relative_error(ar_params, ARRAY[0.831752901064]) < 1e-2, 'ARIMA: 
wrong ar_params') from tsa_out; 
-select assert(relative_error(ar_std_errors, ARRAY[0.0695053543058]) < 1e-2, 
'ARIMA: wrong ar_std_errors') from tsa_out; 
-select assert(relative_error(ma_params, ARRAY[0.701393608306]) < 1e-2, 'ARIMA: 
wrong ma_params') from tsa_out; 
-select assert(relative_error(ma_std_errors, ARRAY[0.0969171335486]) < 1e-2, 
'ARIMA: wrong ma_std_errors') from tsa_out; 
-select assert(relative_error(residual_variance, 304.217719576) < 1e-2, 'ARIMA: 
wrong residual_variance') from tsa_out_summary; 
-select assert(relative_error(log_likelihood, -209.61270701) < 1e-2, 'ARIMA: 
wrong log_likelihood') from tsa_out_summary; 
+select assert(relative_error(ar_params, ARRAY[0.831752901064]) < 1e-2, 'ARIMA: 
wrong ar_params') from tsa_out;
+select assert(relative_error(ar_std_errors, ARRAY[0.0695053543058]) < 1e-2, 
'ARIMA: wrong ar_std_errors') from tsa_out;
+select assert(relative_error(ma_params, ARRAY[0.701393608306]) < 1e-2, 'ARIMA: 
wrong ma_params') from tsa_out;
+select assert(relative_error(ma_std_errors, ARRAY[0.0969171335486]) < 1e-2, 
'ARIMA: wrong ma_std_errors') from tsa_out;
+select assert(relative_error(residual_variance, 304.217719576) < 1e-2, 'ARIMA: 
wrong residual_variance') from tsa_out_summary;
+select assert(relative_error(log_likelihood, -209.61270701) < 1e-2, 'ARIMA: 
wrong log_likelihood') from tsa_out_summary;
 
 -- FALSE, ARRAY[1,1,1]
 drop table if exists tsa_out;
 drop table if exists tsa_out_summary;
 drop table if exists tsa_out_residual;
 select arima_train('mini_ts', 'tsa_out', 'id', 'val', NULL, FALSE, 
ARRAY[1,1,1]);
-select assert(relative_error(ar_params, ARRAY[0.16327119476]) < 1e-2, 'ARIMA: 
wrong ar_params') from tsa_out; 
-select assert(relative_error(ar_std_errors, ARRAY[0.211608737666]) < 1e-2, 
'ARIMA: wrong ar_std_errors') from tsa_out; 
-select assert(relative_error(ma_params, ARRAY[0.630297255402]) < 1e-2, 'ARIMA: 
wrong ma_params') from tsa_out; 
-select assert(relative_error(ma_std_errors, ARRAY[0.163395070851]) < 1e-2, 
'ARIMA: wrong ma_std_errors') from tsa_out; 
-select assert(relative_error(residual_variance, 322.217055379) < 1e-2, 'ARIMA: 
wrong residual_variance') from tsa_out_summary; 
-select assert(relative_error(log_likelihood, -206.714459277) < 1e-2, 'ARIMA: 
wrong log_likelihood') from tsa_out_summary; 
+select assert(relative_error(ar_params, ARRAY[0.16327119476]) < 1e-2, 'ARIMA: 
wrong ar_params') from tsa_out;
+select assert(relative_error(ar_std_errors, ARRAY[0.211608737666]) < 1e-2, 
'ARIMA: wrong ar_std_errors') from tsa_out;
+select assert(relative_error(ma_params, ARRAY[0.630297255402]) < 1e-2, 'ARIMA: 
wrong ma_params') from tsa_out;
+select assert(relative_error(ma_std_errors, ARRAY[0.163395070851]) < 1e-2, 
'ARIMA: wrong ma_std_errors') from tsa_out;
+select assert(relative_error(residual_variance, 322.217055379) < 1e-2, 'ARIMA: 
wrong residual_variance') from tsa_out_summary;
+select assert(relative_error(log_likelihood, -206.714459277) < 1e-2, 'ARIMA: 
wrong log_likelihood') from tsa_out_summary;
 
 -- FALSE, ARRAY[1,0,0]
 drop table if exists tsa_out;
 drop table if exists tsa_out_summary;
 drop table if exists tsa_out_residual;
 select arima_train('mini_ts', 'tsa_out', 'id', 'val', NULL, FALSE, 
ARRAY[1,0,0]);
-select assert(relative_error(ar_params, ARRAY[0.90180701677]) < 1e-2, 'ARIMA: 
wrong ar_params') from tsa_out; 
-select assert(relative_error(ar_std_errors, ARRAY[0.0524855716423]) < 1e-2, 
'ARIMA: wrong ar_std_errors') from tsa_out; 
-select assert(relative_error(residual_variance, 501.798665305) < 1e-2, 'ARIMA: 
wrong residual_variance') from tsa_out_summary; 
-select assert(relative_error(log_likelihood, -221.873862993) < 1e-2, 'ARIMA: 
wrong log_likelihood') from tsa_out_summary; 
+select assert(relative_error(ar_params, ARRAY[0.90180701677]) < 1e-2, 'ARIMA: 
wrong ar_params') from tsa_out;
+select assert(relative_error(ar_std_errors, ARRAY[0.0524855716423]) < 1e-2, 
'ARIMA: wrong ar_std_errors') from tsa_out;
+select assert(relative_error(residual_variance, 501.798665305) < 1e-2, 'ARIMA: 
wrong residual_variance') from tsa_out_summary;
+select assert(relative_error(log_likelihood, -221.873862993) < 1e-2, 'ARIMA: 
wrong log_likelihood') from tsa_out_summary;
 
 -- FALSE, ARRAY[0,0,1]
 drop table if exists tsa_out;
 drop table if exists tsa_out_summary;
 drop table if exists tsa_out_residual;
 select arima_train('mini_ts', 'tsa_out', 'id', 'val', NULL, FALSE, 
ARRAY[0,0,1]);
-select assert(relative_error(ma_params, ARRAY[0.794860003635]) < 1e-2, 'ARIMA: 
wrong ma_params') from tsa_out; 
-select assert(relative_error(ma_std_errors, ARRAY[0.0625911839965]) < 1e-2, 
'ARIMA: wrong ma_std_errors') from tsa_out; 
-select assert(relative_error(residual_variance, 1436.32174849) < 1e-2, 'ARIMA: 
wrong residual_variance') from tsa_out_summary; 
+select assert(relative_error(ma_params, ARRAY[0.794860003635]) < 1e-2, 'ARIMA: 
wrong ma_params') from tsa_out;
+select assert(relative_error(ma_std_errors, ARRAY[0.0625911839965]) < 1e-2, 
'ARIMA: wrong ma_std_errors') from tsa_out;
+select assert(relative_error(residual_variance, 1436.32174849) < 1e-2, 'ARIMA: 
wrong residual_variance') from tsa_out_summary;
 select assert(relative_error(log_likelihood, -247.639087319) < 1e-2, 'ARIMA: 
wrong log_likelihood') from tsa_out_summary;

http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/utilities/cols2vec.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/cols2vec.sql_in 
b/src/ports/postgres/modules/utilities/cols2vec.sql_in
index 0c54ab5..bbdcc52 100644
--- a/src/ports/postgres/modules/utilities/cols2vec.sql_in
+++ b/src/ports/postgres/modules/utilities/cols2vec.sql_in
@@ -82,7 +82,7 @@ values.</dd>
 
 <dt>list_of_features_to_exclude (optional)</dt>
 <dd>TEXT. Default NULL.
-Comma-separated string of column names to exclude from the feature array.  
Typically used 
+Comma-separated string of column names to exclude from the feature array.  
Typically used
 when 'list_of_features' is set to '*'.</dd>
 
 <dt>cols_to_output (optional)</dt>
@@ -207,7 +207,7 @@ list_of_features_to_exclude | None
 feature_names               | {temperature,humidity}
 </pre>
 
--# Combine the temperature and humidity columns 
+-# Combine the temperature and humidity columns
 and keep 2 other columns from source_table.
 <pre class="example">
 DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;
@@ -298,7 +298,7 @@ list_of_features_to_exclude | "OUTLOOK", "Temp_Humidity", 
clouds_airquality, win
 feature_names               | {temperature,humidity,observation_weight}
 </pre>
 
--# Combine the temperature and humidity columns, exclude windy, and keep all 
of the 
+-# Combine the temperature and humidity columns, exclude windy, and keep all 
of the
 columns from the source table.
 <pre class="example">
 DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;
@@ -347,8 +347,8 @@ This also shows that you can exclude features in 
'list_of_features_to_exclude'
 that are in the list of 'list_of_features'.  This can be useful if the 
'list_of_features'
 is generated from an expression or subquery.
 
--# Type casting works as per regular rules of the underlying database.  
-E.g., combining integer and double precision columns will create a double 
precision feature vector.  
+-# Type casting works as per regular rules of the underlying database.
+E.g., combining integer and double precision columns will create a double 
precision feature vector.
 For Boolean, do an explicit cast to the target type:
 <pre class="example">
 DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;
@@ -362,7 +362,7 @@ SELECT madlib.cols2vec(
 SELECT * FROM cols2vec_result ORDER BY id;
 </pre>
 <pre class="result">
- id |    feature_vector    
+ id |    feature_vector
 ----+----------------------
   1 | {false,"Don't Play"}
   2 | {true,"Don't Play"}

http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/utilities/path.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/path.sql_in 
b/src/ports/postgres/modules/utilities/path.sql_in
index c90855b..6c102eb 100644
--- a/src/ports/postgres/modules/utilities/path.sql_in
+++ b/src/ports/postgres/modules/utilities/path.sql_in
@@ -153,7 +153,7 @@ path(
 
     <dt>aggregate_func (optional)</dt>
     <dd>VARCHAR, default NULL. A comma-separated list of aggregates to be
-    applied to the pattern matches [3].  
+    applied to the pattern matches [3].
     You can think of this input parameter as being like a SELECT clause.
     Please note that window functions
     cannot currently be used in the parameter 'aggregate_func'.  If you want
@@ -418,20 +418,20 @@ Result:
 There are only 2 different paths.  The wine page is viewed more frequently
 than the beer page just before checkout.
 
--# To demonstrate the use of 'overlapping_patterns', consider 
-a pattern with at least one page followed by and ending with a checkout: 
+-# To demonstrate the use of 'overlapping_patterns', consider
+a pattern with at least one page followed by and ending with a checkout:
 <pre class="example">
 DROP TABLE IF EXISTS path_output, path_output_tuples;
-SELECT madlib.path(                                                            
       
-     'eventlog',                    -- Name of the table                       
                    
-     'path_output',                 -- Table name to store the path results    
                     
-     'session_id',                  -- Partition by session                 
-     'event_timestamp ASC',         -- Order partitions in input table by time 
      
+SELECT madlib.path(
+     'eventlog',                    -- Name of the table
+     'path_output',                 -- Table name to store the path results
+     'session_id',                  -- Partition by session
+     'event_timestamp ASC',         -- Order partitions in input table by time
      $$ nobuy:=page<>'CHECKOUT',
         buy:=page='CHECKOUT'
-     $$,  -- Definition of symbols used in the pattern definition 
+     $$,  -- Definition of symbols used in the pattern definition
      '(nobuy)+(buy)',         -- At least one page followed by and ending with 
a CHECKOUT.
-     'array_agg(page ORDER BY session_id ASC, event_timestamp ASC) as 
page_path',  
+     'array_agg(page ORDER BY session_id ASC, event_timestamp ASC) as 
page_path',
      FALSE,                        -- Don't persist matches
      TRUE                          -- Turn on overlapping patterns
      );
@@ -439,7 +439,7 @@ SELECT * FROM path_output ORDER BY session_id, match_id;
 </pre>
 Result with overlap turned on:
 <pre class="result">
- session_id | match_id |             page_path             
+ session_id | match_id |             page_path
 ------------+----------+-----------------------------------
         100 |        1 | {LANDING,WINE,CHECKOUT}
         100 |        2 | {WINE,CHECKOUT}
@@ -459,7 +459,7 @@ Result with overlap turned on:
 </pre>
 With overlap turned off, the result would be:
 <pre class="result">
- session_id | match_id |             page_path             
+ session_id | match_id |             page_path
 ------------+----------+-----------------------------------
         100 |        1 | {LANDING,WINE,CHECKOUT}
         102 |        1 | {LANDING,WINE,CHECKOUT}

http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/utilities/pivot.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/pivot.sql_in 
b/src/ports/postgres/modules/utilities/pivot.sql_in
index d1aba19..ab34868 100644
--- a/src/ports/postgres/modules/utilities/pivot.sql_in
+++ b/src/ports/postgres/modules/utilities/pivot.sql_in
@@ -41,7 +41,7 @@ m4_include(`SQLCommon.m4')
 </ul>
 </div>
 
-@brief Pivoting and data summarization tools for preparing data 
+@brief Pivoting and data summarization tools for preparing data
 for modeling operations.
 
 @anchor pivoting
@@ -143,7 +143,7 @@ pivot(
     If the total number of output columns exceeds this limit, then make this
     parameter either 'array' (to combine the output columns into an array) or
     'svec' (to cast the array output to <em>'madlib.svec'</em> type).
-    If you have an 'aggregate_func' that has an array return type, 
+    If you have an 'aggregate_func' that has an array return type,
     it cannot be combined with 'output_type'='array' or 'svec'.
 
     A dictionary will be created (<em>output_col_dictionary=TRUE</em>)

http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/utilities/sessionize.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/sessionize.sql_in 
b/src/ports/postgres/modules/utilities/sessionize.sql_in
index f8ac6d7..735eaf6 100644
--- a/src/ports/postgres/modules/utilities/sessionize.sql_in
+++ b/src/ports/postgres/modules/utilities/sessionize.sql_in
@@ -38,12 +38,12 @@ m4_include(`SQLCommon.m4')
 </ul>
 </div>
 
-@brief Session reconstruction of data 
+@brief Session reconstruction of data
 consisting of a time stamped sequence of events.
 @details The MADlib sessionize function performs time-oriented session 
reconstruction
 on a data set comprising a sequence of events.  A defined period of inactivity
-indicates the end of one session and beginning of the next session.  Sessions 
can be 
-useful in many domains including web analytics [1], network security, 
manufacturing, finance, 
+indicates the end of one session and beginning of the next session.  Sessions 
can be
+useful in many domains including web analytics [1], network security, 
manufacturing, finance,
 and operational analytics.
 
 @anchor syntax
@@ -66,8 +66,8 @@ sessionize(
     <dd>VARCHAR. Name of the source table that contains the data to be 
sessionized.</dd>
 
     <dt>output_table</dt>
-    <dd>VARCHAR. Name of the output view or table.  (The parameter create_view 
described below 
-    defines whether the output is actually a view or a table.)  In addition to 
the columns in the 
+    <dd>VARCHAR. Name of the output view or table.  (The parameter create_view 
described below
+    defines whether the output is actually a view or a table.)  In addition to 
the columns in the
     source_table, the output also contains a new column called session_id:
     <ul>
         <li>session_id=1,2,...n where n is the number of the session in the 
partition.</li>
@@ -78,11 +78,11 @@ sessionize(
     <dd>VARCHAR. The 'partition_expr' is a single column or a list of
     comma-separated columns/expressions to divide all rows into groups,
     or partitions. Sessionization is applied across the rows that fall into
-    the same partition.  This parameter can be set to NULL or '' to indicate 
+    the same partition.  This parameter can be set to NULL or '' to indicate
     the sessionization operation is to be applied to the whole input 
table.</dd>
 
     <dt>time_stamp</dt>
-    <dd>VARCHAR. The time stamp column name that is used for sessionization 
calculation.  
+    <dd>VARCHAR. The time stamp column name that is used for sessionization 
calculation.
     Note that the time_stamp column will be sorted in ascending order before
     the session reconstruction is done within a partition.</dd>
 
@@ -97,16 +97,16 @@ sessionize(
 
     <dt>output_cols (optional)</dt>
     <dd>VARCHAR. An optional comma separated list of columns to be written to 
the output_table.
-    Must be a valid SELECT expression.  This is set to '*' by default, which 
means all columns in the 
-    input table will be written to the output_table plus the session_id 
column.  
-    Note that this parameter could include a list containing the 
partition_expr 
-    or any other expressions of interest.  E.g., '*, expr1, expr2, etc.' where 
this means 
+    Must be a valid SELECT expression.  This is set to '*' by default, which 
means all columns in the
+    input table will be written to the output_table plus the session_id column.
+    Note that this parameter could include a list containing the partition_expr
+    or any other expressions of interest.  E.g., '*, expr1, expr2, etc.' where 
this means
     output all columns from the input table plus the expressions listed plus 
the session_id column.</dd>
 
     <dt>create_view (optional)</dt>
-    <dd>BOOLEAN default: TRUE.  Determines whether to create a view or 
materialize the output as a table. 
-    If you only need session info once, creating a view could be significantly 
-    faster than materializing as a table.  Please note that if you set 
create_view to NULL 
+    <dd>BOOLEAN default: TRUE.  Determines whether to create a view or 
materialize the output as a table.
+    If you only need session info once, creating a view could be significantly
+    faster than materializing as a table.  Please note that if you set 
create_view to NULL
     (allowed by PostgreSQL) it will get set to the default value of TRUE.</dd>
 </dl>
 
@@ -166,7 +166,7 @@ SELECT * FROM sessionize_output_view ORDER BY user_id, 
event_timestamp;
 
 Result:
 <pre class="result">
-   event_timestamp   | user_id |   page   | revenue | session_id 
+   event_timestamp   | user_id |   page   | revenue | session_id
 ---------------------+---------+----------+---------+------------
  2015-04-15 01:03:00 |  100821 | LANDING  |       0 |          1
  2015-04-15 01:04:00 |  100821 | WINE     |       0 |          1
@@ -193,7 +193,7 @@ Result:
 </pre>
 
 Now let's say we want to see 3 minute sessions by a group of users
-with a certain range of user IDs.  To do this, we need to sessionize 
+with a certain range of user IDs.  To do this, we need to sessionize
 the table based on a partition expression.  Also, we want to persist
 a table output with a reduced set of columns in the table.
 <pre class="example">
@@ -212,7 +212,7 @@ a table output with a reduced set of columns in the table.
 
 Result showing 2 users and 3 total sessions across the group:
 <pre class="result">
-   event_timestamp   | user_id | Department-A1 | session_id 
+   event_timestamp   | user_id | Department-A1 | session_id
 ---------------------+---------+---------------+------------
  2015-04-15 01:03:00 |  100821 | t             |          1
  2015-04-15 01:04:00 |  100821 | t             |          1

http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/utilities/text_utilities.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/text_utilities.sql_in 
b/src/ports/postgres/modules/utilities/text_utilities.sql_in
index 970dba7..2ae65cb 100644
--- a/src/ports/postgres/modules/utilities/text_utilities.sql_in
+++ b/src/ports/postgres/modules/utilities/text_utilities.sql_in
@@ -25,15 +25,15 @@ m4_include(`SQLCommon.m4')
 @brief Provides a collection of functions for performing common
 tasks related to text analytics.
 
-Term frequency computes the number of times that a word 
-or term occurs in a document.  Term frequency is often 
-used as part of a larger text processing pipeline, which may 
+Term frequency computes the number of times that a word
+or term occurs in a document.  Term frequency is often
+used as part of a larger text processing pipeline, which may
 include operations such as stemming, stop word removal
 and topic modelling.
 
 @anchor function_syntax
 @par Function Syntax
- 
+
 <pre class="syntax">
     term_frequency(input_table,
                    doc_id_col,
@@ -67,9 +67,9 @@ and topic modelling.
     <dd>TEXT.
     The name of the table to store the term frequency output.
     The output table contains the following columns:
-        - \c doc_id_col: This is the document id column 
+        - \c doc_id_col: This is the document id column
         (name will be same as the one provided as input).
-        - \c word: Word/term present in a document. Depending on the value 
+        - \c word: Word/term present in a document. Depending on the value
         of \c compute_vocab below, this is either the original
         word as it appears in \c word_col, or an id representing the word.
         Note that word id's start from 0 not 1.
@@ -80,7 +80,7 @@ and topic modelling.
     <dd>BOOLEAN. (Optional, Default=FALSE)
     Flag to indicate if a vocabulary table is to be created. If TRUE, an 
additional
     output table is created containing the vocabulary of all words, with an id
-    assigned to each word in alphabetical order. 
+    assigned to each word in alphabetical order.
     The table is called <em>output_table</em>_vocabulary
     (i.e., suffix added to the <em>output_table</em> name) and contains the
     following columns:
@@ -102,16 +102,16 @@ INSERT INTO documents VALUES
 (2, 'My sister adopted two kittens yesterday.'),
 (3, 'Look at this cute hamster munching on a piece of broccoli.');
 </pre>
-You can apply stemming, stop word removal and tokenization at this point 
-in order to prepare the documents for text processing. 
-Depending upon your database version, various tools are 
-available. Databases based on more recent versions of 
+You can apply stemming, stop word removal and tokenization at this point
+in order to prepare the documents for text processing.
+Depending upon your database version, various tools are
+available. Databases based on more recent versions of
 PostgreSQL may do something like:
 <pre class="example">
 SELECT tsvector_to_array(to_tsvector('english',contents)) from documents;
 </pre>
 <pre class="result">
-                    tsvector_to_array                     
+                    tsvector_to_array
 +----------------------------------------------------------
  {ate,banana,breakfast,broccoli,eat,like,smoothi,spinach}
  {chinchilla,cute,kitten}
@@ -119,17 +119,17 @@ SELECT tsvector_to_array(to_tsvector('english',contents)) 
from documents;
  {broccoli,cute,hamster,look,munch,piec}
 (4 rows)
 </pre>
-In this example, we assume a database based on an older 
-version of PostgreSQL and just perform basic punctuation 
-removal and tokenization. The array of words is added as 
+In this example, we assume a database based on an older
+version of PostgreSQL and just perform basic punctuation
+removal and tokenization. The array of words is added as
 a new column to the documents table:
 <pre class="example">
 ALTER TABLE documents ADD COLUMN words TEXT[];
-UPDATE documents SET words = 
+UPDATE documents SET words =
     regexp_split_to_array(lower(
     regexp_replace(contents, E'[,.;\\']','', 'g')
     ), E'[\\\\s+]');
-\\x on   
+\\x on
 SELECT * FROM documents ORDER BY docid;
 </pre>
 <pre class="result">
@@ -163,7 +163,7 @@ SELECT madlib.term_frequency('documents',    -- input table
 SELECT * FROM documents_tf ORDER BY docid;
 </pre>
 <pre class="result">
- docid |    word     | count 
+ docid |    word     | count
 -------+-------------+-------
      0 | a           |     1
      0 | breakfast   |     1
@@ -204,8 +204,8 @@ SELECT * FROM documents_tf ORDER BY docid;
 (36 rows)
 </pre>
 
--# Next we create a vocabulary of the words 
-and store a wordid in the output table instead of the 
+-# Next we create a vocabulary of the words
+and store a wordid in the output table instead of the
 actual word:
 <pre class="example">
 DROP TABLE IF EXISTS documents_tf, documents_tf_vocabulary;
@@ -219,7 +219,7 @@ SELECT * FROM documents_tf ORDER BY docid;
 </pre>
 \nbsp
 <pre class="result">
- docid | wordid | count 
+ docid | wordid | count
 -------+--------+-------
      0 |     17 |     1
      0 |      9 |     1
@@ -260,13 +260,13 @@ SELECT * FROM documents_tf ORDER BY docid;
 (36 rows)
 </pre>
 \nbsp
-Note above that wordid's start 
+Note above that wordid's start
 at 0 not 1.  The vocabulary table maps wordid to the actual word:
 <pre class="example">
 SELECT * FROM documents_tf_vocabulary ORDER BY wordid;
 </pre>
 <pre class="result">
- wordid |    word     
+ wordid |    word
 --------+-------------
       0 | a
       1 | adopted
@@ -305,7 +305,7 @@ SELECT * FROM documents_tf_vocabulary ORDER BY wordid;
 @anchor related
 @par Related Topics
 
-See text_utilities.sql_in for the term frequency SQL function definition 
+See text_utilities.sql_in for the term frequency SQL function definition
 and porter_stemmer.sql_in for the stemmer function.
 
 */

http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/utilities/utilities.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/utilities.sql_in 
b/src/ports/postgres/modules/utilities/utilities.sql_in
index 09f971d..9d6538d 100644
--- a/src/ports/postgres/modules/utilities/utilities.sql_in
+++ b/src/ports/postgres/modules/utilities/utilities.sql_in
@@ -25,7 +25,7 @@ m4_include(`SQLCommon.m4')
 
 @brief Provides a collection of user-defined functions for performing common 
tasks in the database.
 
-Database functions are a collection of lower level utilities 
+Database functions are a collection of lower level utilities
 to assist data scientists and others in using MADlib.
 
 @anchor utilities

http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/utilities/vec2cols.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/vec2cols.sql_in 
b/src/ports/postgres/modules/utilities/vec2cols.sql_in
index 115e015..a466d24 100644
--- a/src/ports/postgres/modules/utilities/vec2cols.sql_in
+++ b/src/ports/postgres/modules/utilities/vec2cols.sql_in
@@ -46,7 +46,7 @@ This process can be used to reverse the function cols2vec.
 
 Given a table with a column of type array, this function will create an output
 table that splits this array into multiple columns, one per array element.
-It includes the option to name the new feature columns, and to include 
+It includes the option to name the new feature columns, and to include
 columns from the original table in the output.
 
 @anchor vec2cols_usage
@@ -75,25 +75,25 @@ same name already exists, an error will be returned.</tt>
 <dd>TEXT. Name of the column containing the feature array.  Must be a 
one-dimensional array.</dd>
 
 <dt>feature_names (optional)</dt>
-<dd>TEXT[]. Array of names associated with the feature array.  Note that 
-this array exists in the summary table created by the function 'cols2vec'. If 
+<dd>TEXT[]. Array of names associated with the feature array.  Note that
+this array exists in the summary table created by the function 'cols2vec'. If
 the 'feature_names' array is not specified,
-column names will be automatically generated of 
+column names will be automatically generated of
 the form 'f1, f2, ...fn'.</tt>
-@note If you specify the 'feature_names' parameter, you will get exactly that 
number of 
-feature columns in the 'output_table'.  It means feature arrays from the 
'vector_col' may be 
+@note If you specify the 'feature_names' parameter, you will get exactly that 
number of
+feature columns in the 'output_table'.  It means feature arrays from the 
'vector_col' may be
 padded or truncated, if a particular feature array size does not match the 
target
-number of feature columns.  <br><br>If you do not specify the 'feature names' 
parameter, 
+number of feature columns.  <br><br>If you do not specify the 'feature names' 
parameter,
 the number of feature columns generated
 in the 'output_table' will be the maximum array size from 'vector_col'.
 Feature arrays that are less than this maximum will be padded.
 
 <dt>cols_to_output (optional)</dt>
-<dd>TEXT, default NULL. Comma-separated string of column names 
+<dd>TEXT, default NULL. Comma-separated string of column names
 from the source table to keep in the
-output table, in addition to the feature columns.  
+output table, in addition to the feature columns.
 To keep all columns from the source table, use '*'.
-The total number of columns in a table cannot exceed the 
+The total number of columns in a table cannot exceed the
 PostgreSQL limits.</tt>
 </dd>
 </dl>
@@ -109,8 +109,8 @@ PostgreSQL limits.</tt>
       </tr>
       <tr>
         <th>feature columns</th>
-        <td>Columns for each of the features in 'vector_col'.  Column type 
-        will depend on the feature array type in the source table.  Column 
+        <td>Columns for each of the features in 'vector_col'.  Column type
+        will depend on the feature array type in the source table.  Column
         naming will depend on whether the parameter 'feature_names' is used.
       </tr>
     </table>
@@ -149,9 +149,9 @@ INSERT INTO golf VALUES
 (14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t 
Play', 1.0);
 </pre>
 
--# Split the column "clouds_airquality" into new columns 
-called "clouds" and "air_quality". Also keep columns id 
-and "OUTLOOK" from the source table 
+-# Split the column "clouds_airquality" into new columns
+called "clouds" and "air_quality". Also keep columns id
+and "OUTLOOK" from the source table
 <pre class="example">
 DROP TABLE IF EXISTS vec2cols_result;
 SELECT madlib.vec2cols(
@@ -164,7 +164,7 @@ SELECT madlib.vec2cols(
 SELECT * FROM vec2cols_result ORDER BY id;
 </pre>
 <pre class="result">
- id | OUTLOOK  | clouds | air_quality 
+ id | OUTLOOK  | clouds | air_quality
 ----+----------+--------+-------------
   1 | sunny    | none   | unhealthy
   2 | sunny    | none   | moderate
@@ -184,9 +184,9 @@ SELECT * FROM vec2cols_result ORDER BY id;
 </pre>
 
 -# Similar to the previous example, except now
-we keep all columns from source table and do not 
+we keep all columns from source table and do not
 specify the feature names, so that default names
-are created. 
+are created.
 <pre class="example">
 DROP TABLE IF EXISTS vec2cols_result;
 SELECT madlib.vec2cols(
@@ -199,7 +199,7 @@ SELECT madlib.vec2cols(
 SELECT * FROM vec2cols_result ORDER BY id;
 </pre>
 <pre class="result">
- id | OUTLOOK  | temperature | humidity | Temp_Humidity | clouds_airquality | 
windy |   class    | observation_weight |   f1   |    f2     
+ id | OUTLOOK  | temperature | humidity | Temp_Humidity | clouds_airquality | 
windy |   class    | observation_weight |   f1   |    f2
 
----+----------+-------------+----------+---------------+-------------------+-------+------------+--------------------+--------+-----------
   1 | sunny    |          85 |       85 | {85,85}       | {none,unhealthy}  | 
f     | Don't Play |                  5 | none   | unhealthy
   2 | sunny    |          80 |       90 | {80,90}       | {none,moderate}   | 
t     | Don't Play |                  5 | none   | moderate
@@ -233,7 +233,7 @@ SELECT madlib.cols2vec(
 SELECT * FROM cols2vec_result ORDER BY id;
 </pre>
 <pre class="result">
- id | temperature | humidity | feature_vector 
+ id | temperature | humidity | feature_vector
 ----+-------------+----------+----------------
   1 |          85 |       85 | {85,85}
   2 |          80 |       90 | {80,90}
@@ -278,7 +278,7 @@ SELECT madlib.vec2cols(
 SELECT * FROM vec2cols_result ORDER BY id;
 </pre>
 <pre class="result">
- id | temperature | humidity 
+ id | temperature | humidity
 ----+-------------+----------
   1 |          85 |       85
   2 |          80 |       90

Reply via email to