Multiple: Remove trailing whitespace from all SQL. Markdown markup language states that two trailing whitespace characters should be interpreted as a line break (<br>), which has been implemented by Doxygen 1.8+. This commit removes all such instances, since the trailing whitespace is inadvertent in most cases. If a line break is required, then it should be added explicitly (using the HTML tag <br>).
Closes #317 Co-authored-by: Domino Valdano <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/35818fa3 Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/35818fa3 Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/35818fa3 Branch: refs/heads/master Commit: 35818fa395f965191b59ddbfcd1469470f44271b Parents: 92bdf8c Author: Rahul Iyer <[email protected]> Authored: Fri Sep 7 15:12:49 2018 -0700 Committer: Rahul Iyer <[email protected]> Committed: Fri Sep 7 15:12:49 2018 -0700 ---------------------------------------------------------------------- methods/array_ops/src/pg_gp/array_ops.sql_in | 7 +- src/ports/postgres/modules/bayes/bayes.sql_in | 46 ++--- .../postgres/modules/bayes/test/bayes.sql_in | 114 ++++++------ .../conjugate_gradient/test/conj_grad.sql_in | 24 +-- src/ports/postgres/modules/convex/mlp.sql_in | 4 +- .../modules/crf/test/crf_test_large.sql_in | 6 +- .../modules/crf/test/crf_train_small.sql_in | 10 +- .../modules/elastic_net/elastic_net.sql_in | 12 +- src/ports/postgres/modules/glm/glm.sql_in | 4 +- src/ports/postgres/modules/glm/ordinal.sql_in | 38 ++-- .../postgres/modules/glm/test/ordinal.sql_in | 12 +- src/ports/postgres/modules/graph/bfs.sql_in | 88 +++++----- src/ports/postgres/modules/graph/hits.sql_in | 36 ++-- .../postgres/modules/graph/pagerank.sql_in | 12 +- src/ports/postgres/modules/graph/wcc.sql_in | 10 +- src/ports/postgres/modules/knn/knn.sql_in | 14 +- src/ports/postgres/modules/lda/lda.sql_in | 172 +++++++++---------- src/ports/postgres/modules/linalg/svd.sql_in | 62 +++---- src/ports/postgres/modules/pca/pca.sql_in | 104 +++++------ .../postgres/modules/pca/pca_project.sql_in | 54 +++--- .../recursive_partitioning/decision_tree.sql_in | 4 +- .../recursive_partitioning/random_forest.sql_in | 6 +- .../postgres/modules/regress/linear.sql_in | 24 +-- .../postgres/modules/regress/logistic.sql_in | 26 +-- 
.../modules/regress/test/clustered.sql_in | 8 +- .../postgres/modules/stats/correlation.sql_in | 54 +++--- .../modules/stats/hypothesis_tests.sql_in | 6 +- .../postgres/modules/stats/pred_metrics.sql_in | 8 +- .../postgres/modules/stats/test/f_test.sql_in | 2 +- .../postgres/modules/stats/test/ks_test.sql_in | 2 +- .../postgres/modules/stats/test/mw_test.sql_in | 2 +- .../postgres/modules/stats/test/t_test.sql_in | 4 +- .../postgres/modules/stats/test/wsr_test.sql_in | 2 +- .../postgres/modules/summary/summary.sql_in | 44 ++--- src/ports/postgres/modules/svm/svm.sql_in | 130 +++++++------- .../modules/tsa/test/arima_train.sql_in | 54 +++--- .../postgres/modules/utilities/cols2vec.sql_in | 12 +- .../postgres/modules/utilities/path.sql_in | 24 +-- .../postgres/modules/utilities/pivot.sql_in | 4 +- .../modules/utilities/sessionize.sql_in | 34 ++-- .../modules/utilities/text_utilities.sql_in | 48 +++--- .../postgres/modules/utilities/utilities.sql_in | 2 +- .../postgres/modules/utilities/vec2cols.sql_in | 42 ++--- 43 files changed, 685 insertions(+), 686 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/methods/array_ops/src/pg_gp/array_ops.sql_in ---------------------------------------------------------------------- diff --git a/methods/array_ops/src/pg_gp/array_ops.sql_in b/methods/array_ops/src/pg_gp/array_ops.sql_in index 3c905ce..e1aa368 100644 --- a/methods/array_ops/src/pg_gp/array_ops.sql_in +++ b/methods/array_ops/src/pg_gp/array_ops.sql_in @@ -275,12 +275,11 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); * @brief Aggregate, element-wise sum of arrays. It requires that all the values are NON-NULL. Return type is the same as the input type. * * @param x Array x - * @param y Array y - * @returns Sum of x and y. 
+ * @returns Sum of x * */ -DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.sum(anyarray) CASCADE; -CREATE AGGREGATE MADLIB_SCHEMA.sum(anyarray) ( +DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.sum(/* x */ anyarray) CASCADE; +CREATE AGGREGATE MADLIB_SCHEMA.sum(/* x */ anyarray) ( SFUNC = MADLIB_SCHEMA.array_add, STYPE = anyarray m4_ifdef( `__POSTGRESQL__', `', `, PREFUNC = MADLIB_SCHEMA.array_add') http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/bayes/bayes.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/bayes/bayes.sql_in b/src/ports/postgres/modules/bayes/bayes.sql_in index 769b35c..40b71d2 100644 --- a/src/ports/postgres/modules/bayes/bayes.sql_in +++ b/src/ports/postgres/modules/bayes/bayes.sql_in @@ -43,8 +43,8 @@ certain class \f$ c \f$. Naives Bayes classification estimates feature probabilities and class priors using maximum likelihood or Laplacian smoothing. For numeric attributes, -Gaussian smoothing can be used to estimate the feature probabilities.These -parameters are then used to classify new data. +Gaussian smoothing can be used to estimate the feature probabilities.These +parameters are then used to classify new data. @anchor train @@ -64,7 +64,7 @@ create_nb_prepared_data_tables ( trainingSource, ) </pre> -For data containing both categorical and numeric attributes, use the following form to +For data containing both categorical and numeric attributes, use the following form to precompute the Gaussian parameters (mean and variance) for numeric attributes alongside the feature probabilities for categorical attributes and class priors. @@ -88,8 +88,8 @@ The \e trainingSource is expected to be of the following form: ... 
)</pre> -\e numericAttrsColumnIndices should be of type TEXT, specified as an array of -indices (starting from 1) in the \e trainingAttrColumn attributes-array that +\e numericAttrsColumnIndices should be of type TEXT, specified as an array of +indices (starting from 1) in the \e trainingAttrColumn attributes-array that correspond to numeric attributes. The two output tables are: @@ -97,7 +97,7 @@ The two output tables are: - \e classPriorsName – stores the class priors In addition to the above, if the function specifying numeric attributes is used, -an additional table \e numericAttrParamsName is created which stores the +an additional table \e numericAttrParamsName is created which stores the Gaussian parameters for the numeric attributes. @anchor classify @@ -225,24 +225,24 @@ attributes.\n For continuous data, a typical assumption, usually used for small datasets, is that the continuous values associated with each class are distributed according to a Gaussian distribution, -and the probabilities \f$ P(A_i = a \mid C=c) \f$ are estimated using the +and the probabilities \f$ P(A_i = a \mid C=c) \f$ are estimated using the Gaussian Distribution formula: \f[ P(A_i=a \mid C=c) = \frac{1}{\sqrt{2\pi\sigma^{2}_c}}exp\left(-\frac{(a-\mu_c)^{2}}{2\sigma^{2}_c}\right) \f] -where \f$\mu_c\f$ and \f$\sigma^{2}_c\f$ are the population mean and variance +where \f$\mu_c\f$ and \f$\sigma^{2}_c\f$ are the population mean and variance of the attribute for the class \f$c\f$.\n -Another common technique for handling continuous values, which is better for -large data sets, is to use binning to discretize the values, and convert the +Another common technique for handling continuous values, which is better for +large data sets, is to use binning to discretize the values, and convert the continuous data into categorical bins. This approach is currently not implemented. - One can provide floating point data to the Naive Bayes -classification function. 
If the corresponding attribute index is not specified -in \e numericAttrsColumnIndices, floating point numbers will be used as symbolic -substitutions for categorial data. In this case, the classification would work -best if there are sufficient data points for each floating point attribute. However, -if floating point numbers are used as continuous data without the attribute being -marked as of type numeric in \e numericAttrsColumnIndices, no warning is raised +classification function. If the corresponding attribute index is not specified +in \e numericAttrsColumnIndices, floating point numbers will be used as symbolic +substitutions for categorial data. In this case, the classification would work +best if there are sufficient data points for each floating point attribute. However, +if floating point numbers are used as continuous data without the attribute being +marked as of type numeric in \e numericAttrsColumnIndices, no warning is raised and the result may not be as expected. @anchor examples @@ -362,7 +362,7 @@ Result: (4 rows) </pre> -The following is an example of using a dataset with both numeric and +The following is an example of using a dataset with both numeric and categorical attributes -# The training and the classification data. 
Attributes {height(numeric),weight(numeric),shoe size(categorical)}, @@ -372,7 +372,7 @@ SELECT * FROM gaussian_data; </pre> Result: <pre class="result"> - id | sex | attributes + id | sex | attributes ----+-----+--------------- 1 | 1 | {6,180,12} 2 | 1 | {5.92,190,12} @@ -389,7 +389,7 @@ SELECT * FROM gaussian_test; </pre> Result: <pre class="result"> - id | sex | attributes + id | sex | attributes ----+-----+-------------- 9 | 1 | {5.8,180,11} 10 | 2 | {5,160,6} @@ -415,7 +415,7 @@ SELECT * FROM class_priors; </pre> Result: <pre class="result"> -class | class_cnt | all_cnt +class | class_cnt | all_cnt -------+-----------+--------- 1 | 4 | 8 2 | 4 | 8 @@ -426,7 +426,7 @@ SELECT * FROM categ_feature_probs; </pre> Result: <pre class="result"> - class | attr | value | cnt | attr_cnt + class | attr | value | cnt | attr_cnt -------+------+-------+-----+---------- 2 | 3 | 6 | 2 | 5 1 | 3 | 12 | 2 | 5 @@ -445,7 +445,7 @@ SELECT * FROM numeric_attr_params; </pre> Result: <pre class="result"> -class | attr | attr_mean | attr_var +class | attr | attr_mean | attr_var -------+------+----------------------+------------------------ 1 | 1 | 5.8550000000000000 | 0.03503333333333333333 1 | 2 | 176.2500000000000000 | 122.9166666666666667 @@ -493,7 +493,7 @@ SELECT * FROM probs_view; </pre> Result: <pre class="result"> - key | class | nb_prob + key | class | nb_prob -----+-------+---------------------- 9 | 1 | 0.993556745948775 9 | 2 | 0.00644325405122553 http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/bayes/test/bayes.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/bayes/test/bayes.sql_in b/src/ports/postgres/modules/bayes/test/bayes.sql_in index ec49d4b..88604a6 100644 --- a/src/ports/postgres/modules/bayes/test/bayes.sql_in +++ b/src/ports/postgres/modules/bayes/test/bayes.sql_in @@ -1,5 +1,5 @@ --------------------------------------------------------------------------- 
--- Rules: +-- Rules: -- ------ -- 1) Any DB objects should be created w/o schema prefix, -- since this file is executed in a separate schema context. @@ -8,7 +8,7 @@ --------------------------------------------------------------------------- --------------------------------------------------------------------------- --- Setup: +-- Setup: --------------------------------------------------------------------------- CREATE OR REPLACE FUNCTION fill_feature(pre_class INT, p FLOAT, total INT) RETURNS FLOAT AS $$ @@ -51,18 +51,18 @@ $$ language plpgsql; -- ---------------- -- install_test_1() -- ---------------- -CREATE FUNCTION install_test_1() RETURNS VOID AS $$ +CREATE FUNCTION install_test_1() RETURNS VOID AS $$ declare num1 INT := 10; - num2 INT := 10; + num2 INT := 10; - result1 INT; + result1 INT; count1 INT; tempvar INT[]; - + begin -- prepare training data: equal priors - --DROP TABLE IF EXISTS data CASCADE; + --DROP TABLE IF EXISTS data CASCADE; CREATE TABLE data_1( class INT, attrib FLOAT[] ); INSERT INTO data_1 SELECT 1, ARRAY[fill_feature(id,0.3,num1),fill_feature(id,0.8,num1)] FROM generate_series(1,num1) as id; INSERT INTO data_1 SELECT 2, ARRAY[fill_feature(id,0.5,num2),fill_feature(id,0.5,num2)] FROM generate_series(1,num2) as id; @@ -97,7 +97,7 @@ begin -- Check the results SELECT * INTO result1,count1 FROM - (SELECT sum( abs( (data_results.prob*100)::INT - (probs_view.nb_prob*100)::INT) ),count(*) + (SELECT sum( abs( (data_results.prob*100)::INT - (probs_view.nb_prob*100)::INT) ),count(*) FROM data_results_1 data_results INNER JOIN probs_view_1 as probs_view ON (data_results.id = probs_view.key AND data_results.class = probs_view.class)) AS t; @@ -114,7 +114,7 @@ begin IF (result1 != 2) THEN RAISE EXCEPTION 'Incorrect classification'; - END IF; + END IF; -- Repeat using function w/out preprocessing priors -- Classify @@ -124,21 +124,21 @@ begin --DROP VIEW IF EXISTS probs_view; --PERFORM 
MADLIB_SCHEMA.create_nb_probs_view('data','class','attrib','data_test','id','attrib',2,'probs_view'); -end +end $$ language plpgsql; -- ---------------- -- install_test_2() -- ---------------- -CREATE FUNCTION install_test_2() RETURNS VOID AS $$ +CREATE FUNCTION install_test_2() RETURNS VOID AS $$ declare num1 INT := 10; - num2 INT := 10; + num2 INT := 10; - result1 INT; + result1 INT; count1 INT; tempvar INT[]; - + begin -- 3 class case CREATE TABLE data_2( class INT, attrib FLOAT[] ); @@ -166,16 +166,16 @@ begin --DROP TABLE IF EXISTS probs CASCADE; --DROP TABLE IF EXISTS priors CASCADE; PERFORM create_nb_prepared_data_tables('data_2','class','attrib',2,'probs_2','priors_2'); - + --DROP VIEW IF EXISTS results; PERFORM create_nb_classify_view('probs_2','priors_2','data_test_2','id','attrib',2,'results_2'); - + --DROP VIEW IF EXISTS probs_view; PERFORM create_nb_probs_view('probs_2','priors_2','data_test_2','id','attrib',2,'probs_view_2'); - + SELECT * INTO result1,count1 FROM - (SELECT sum( abs( (data_results.prob*100)::INT - (probs_view.nb_prob*100)::INT) ),count(*) - FROM data_results_2 as data_results INNER JOIN probs_view_2 as probs_view + (SELECT sum( abs( (data_results.prob*100)::INT - (probs_view.nb_prob*100)::INT) ),count(*) + FROM data_results_2 as data_results INNER JOIN probs_view_2 as probs_view ON (data_results.id = probs_view.key AND data_results.class = probs_view.class)) AS t; IF (result1 != 0) OR (count1 != 18) THEN @@ -183,25 +183,25 @@ begin END IF; RAISE INFO 'Naive Bayes install checks passed'; - RETURN; -end + RETURN; +end $$ language plpgsql; -- ---------------- -- install_test_3() -- ---------------- -CREATE FUNCTION install_test_3() RETURNS VOID AS $$ +CREATE FUNCTION install_test_3() RETURNS VOID AS $$ declare num1 INT := 10; - num2 INT := 10; + num2 INT := 10; - result1 INT; + result1 INT; count1 INT; tempvar INT[]; - + begin -- Unequal prior probabilities - --DROP TABLE IF EXISTS data CASCADE; + --DROP TABLE IF EXISTS data CASCADE; 
CREATE TABLE data_3( class INT, attrib FLOAT[] ); INSERT INTO data_3 SELECT 1, ARRAY[fill_feature(id,0.3,num1*2),fill_feature(id,0.8,num1*2)] FROM generate_series(1,num1*2) as id; INSERT INTO data_3 SELECT 2, ARRAY[fill_feature(id,0.5,num2),fill_feature(id,0.5,num2)] FROM generate_series(1,num2) as id; @@ -236,41 +236,41 @@ begin -- Check the results SELECT * INTO result1,count1 FROM - (SELECT sum( abs( (data_results.prob*100)::INT - (probs_view.nb_prob*100)::INT) ),count(*) - FROM data_results_3 as data_results INNER JOIN probs_view_3 as probs_view + (SELECT sum( abs( (data_results.prob*100)::INT - (probs_view.nb_prob*100)::INT) ),count(*) + FROM data_results_3 as data_results INNER JOIN probs_view_3 as probs_view ON (data_results.id = probs_view.key AND data_results.class = probs_view.class)) AS t; IF (result1 != 0) OR (count1 != 8) THEN RAISE EXCEPTION 'Incorrect probabilities'; END IF; - + RAISE INFO 'Naive Bayes install checks passed'; RETURN; -end +end $$ language plpgsql; ----------------------------------------------------------- ---Test where numeric probabilities cancel out, leaving +--Test where numeric probabilities cancel out, leaving --only nominal probabilities to decide the class ----------------------------------------------------------- -CREATE FUNCTION install_test_4() RETURNS VOID AS $$ +CREATE FUNCTION install_test_4() RETURNS VOID AS $$ declare num1 INT := 10; - num2 INT := 10; + num2 INT := 10; - result1 INT; + result1 INT; count1 INT; tempvar INT[]; - + begin - + --equal class priors and numerical attr probabilities - --DROP TABLE IF EXISTS data CASCADE; + --DROP TABLE IF EXISTS data CASCADE; CREATE TABLE data_4( class INT, attrib FLOAT[] ); INSERT INTO data_4 SELECT 1, ARRAY[fill_feature(id,0.3,num1),fill_feature(id,0.8,num1),id] FROM generate_series(1,num1) as id; INSERT INTO data_4 SELECT 2, ARRAY[fill_feature(id,0.5,num2),fill_feature(id,0.5,num2),id] FROM generate_series(1,num2) as id; - + -- prepare testing data --DROP TABLE IF EXISTS 
data_test CASCADE; CREATE TABLE data_test_4( id INT, attrib NUMERIC[], prob FLOAT[] ); @@ -302,45 +302,45 @@ begin -- Check the results SELECT * INTO result1,count1 FROM - (SELECT sum( abs( (data_results.prob*100)::INT - (probs_view.nb_prob*100)::INT) ),count(*) - FROM data_results_4 as data_results INNER JOIN probs_view_4 as probs_view + (SELECT sum( abs( (data_results.prob*100)::INT - (probs_view.nb_prob*100)::INT) ),count(*) + FROM data_results_4 as data_results INNER JOIN probs_view_4 as probs_view ON (data_results.id = probs_view.key AND data_results.class = probs_view.class)) AS t; IF (result1 != 0) OR (count1 != 8) THEN RAISE EXCEPTION 'Incorrect probabilities'; END IF; - + RAISE INFO 'Naive Bayes install checks passed'; RETURN; -end +end $$ language plpgsql; --------------------------------------------------------------------------- --- Test: +-- Test: --------------------------------------------------------------------------- --------------------------------------------------------------------------- --- Test where nominal probabilities cancel out,leaving only numeric --- probabilities to decide the classes. +-- Test where nominal probabilities cancel out,leaving only numeric +-- probabilities to decide the classes. 
--------------------------------------------------------------------------- -CREATE FUNCTION install_test_5() RETURNS VOID AS $$ +CREATE FUNCTION install_test_5() RETURNS VOID AS $$ declare num1 INT := 10; - num2 INT := 10; + num2 INT := 10; - result1 INT; + result1 INT; count1 INT; tempvar INT[]; - + begin - + --equal class priors and numerical attr probabilities - --DROP TABLE IF EXISTS data CASCADE; + --DROP TABLE IF EXISTS data CASCADE; CREATE TABLE data_5( class INT, attrib FLOAT[] ); INSERT INTO data_5 SELECT 1, ARRAY[fill_feature(id,0.5,num1),fill_feature(id,0.8,num1),id] FROM generate_series(1,num1) as id; INSERT INTO data_5 SELECT 2, ARRAY[fill_feature(id,0.5,num2),fill_feature(id,0.8,num2),id%5] FROM generate_series(1,num2) as id; - + -- prepare testing data --DROP TABLE IF EXISTS data_test CASCADE; CREATE TABLE data_test_5( id INT, attrib float8[], prob FLOAT8[] ); @@ -372,22 +372,22 @@ begin -- Check the results SELECT * INTO result1,count1 FROM - (SELECT sum( abs( (data_results.prob*100)::INT - (probs_view.nb_prob*100)::INT) ),count(*) - FROM data_results_5 as data_results INNER JOIN probs_view_5 as probs_view + (SELECT sum( abs( (data_results.prob*100)::INT - (probs_view.nb_prob*100)::INT) ),count(*) + FROM data_results_5 as data_results INNER JOIN probs_view_5 as probs_view ON (data_results.id = probs_view.key AND data_results.class = probs_view.class)) AS t; IF (result1 != 0) OR (count1 != 8) THEN RAISE EXCEPTION 'Incorrect probabilities'; END IF; - + RAISE INFO 'Naive Bayes install checks passed'; RETURN; -end +end $$ language plpgsql; --------------------------------------------------------------------------- --- Test: +-- Test: --------------------------------------------------------------------------- SELECT install_test_1(); SELECT install_test_2(); http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/conjugate_gradient/test/conj_grad.sql_in ---------------------------------------------------------------------- 
diff --git a/src/ports/postgres/modules/conjugate_gradient/test/conj_grad.sql_in b/src/ports/postgres/modules/conjugate_gradient/test/conj_grad.sql_in index a3f02f4..52e56ef 100644 --- a/src/ports/postgres/modules/conjugate_gradient/test/conj_grad.sql_in +++ b/src/ports/postgres/modules/conjugate_gradient/test/conj_grad.sql_in @@ -1,5 +1,5 @@ --------------------------------------------------------------------------- --- Rules: +-- Rules: -- ------ -- 1) Any DB objects should be created w/o schema prefix, -- since this file is executed in a separate schema context. @@ -8,14 +8,14 @@ --------------------------------------------------------------------------- --------------------------------------------------------------------------- --- Setup: +-- Setup: --------------------------------------------------------------------------- -CREATE FUNCTION install_test() RETURNS VOID AS $$ +CREATE FUNCTION install_test() RETURNS VOID AS $$ declare - + result float; x FLOAT[]; - + begin CREATE TABLE A( row INT, @@ -34,28 +34,28 @@ begin INSERT INTO A VALUES(10,ARRAY[0.1108536, 0.08383503, 0.06882649, 0.05882206, 0.05154975, 0.04597544, 0.04154376, 0.03792426, 0.03490583, 0.23234632]); SELECT INTO x MADLIB_SCHEMA.conjugate_gradient('A', 'val', 'row', ARRAY(SELECT random() FROM generate_series(1,10)), .000001,2); - result = x[1]*x[1]; - + result = x[1]*x[1]; + IF (result > .00000001) THEN RAISE EXCEPTION 'Incorrect results, got %',result; - END IF; + END IF; -- simple symmetric +ve def case CREATE TABLE data(row_num INT, row_val FLOAT[]); INSERT INTO data VALUES (1,'{2,1}'); INSERT INTO data VALUES (2,'{1,4}'); - SELECT INTO x MADLIB_SCHEMA.conjugate_gradient('data','row_val','row_num','{2,1}',1E-6,1); + SELECT INTO x MADLIB_SCHEMA.conjugate_gradient('data','row_val','row_num','{2,1}',1E-6,1); IF (round(x[1]) != 1) OR (round(x[2]) != 0) THEN RAISE EXCEPTION 'Incorrect multivariate results, got %',x; END IF; - + RAISE INFO 'Conjugate gradient install checks passed'; RETURN; - -end 
+ +end $$ language plpgsql; --------------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/convex/mlp.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/convex/mlp.sql_in b/src/ports/postgres/modules/convex/mlp.sql_in index 3e2468f..07d7dbc 100644 --- a/src/ports/postgres/modules/convex/mlp.sql_in +++ b/src/ports/postgres/modules/convex/mlp.sql_in @@ -1165,7 +1165,7 @@ SELECT * FROM lin_housing JOIN mlp_regress_prediction USING (id) ORDER BY id; </pre> RMS error: <pre class="example"> -SELECT SQRT(AVG((y-estimated_y)*(y-estimated_y))) as rms_error FROM lin_housing +SELECT SQRT(AVG((y-estimated_y)*(y-estimated_y))) as rms_error FROM lin_housing JOIN mlp_regress_prediction USING (id); </pre> <pre class="result"> @@ -1257,7 +1257,7 @@ SELECT *, ABS(y-estimated_y) as abs_diff FROM lin_housing JOIN mlp_regress_predi </pre> RMS error: <pre class="example"> -SELECT SQRT(AVG((y-estimated_y)*(y-estimated_y))) as rms_error FROM lin_housing +SELECT SQRT(AVG((y-estimated_y)*(y-estimated_y))) as rms_error FROM lin_housing JOIN mlp_regress_prediction USING (id); </pre> <pre class="result"> http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/crf/test/crf_test_large.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/crf/test/crf_test_large.sql_in b/src/ports/postgres/modules/crf/test/crf_test_large.sql_in index 9ededaa..6658545 100644 --- a/src/ports/postgres/modules/crf/test/crf_test_large.sql_in +++ b/src/ports/postgres/modules/crf/test/crf_test_large.sql_in @@ -6,7 +6,7 @@ -- 2) There should be no DROP statements in this script, since -- all objects created in the default schema will be cleaned-up outside. 
--------------------------------------------------------------------------- - + -- Features table produced by Dr. Sunita's CRF java package (as provided by Kun, "CRF2" package on github) using the training data in the file us50.train.tagged CREATE TABLE crf_feature_test_new (id integer,name text,prev_label_id integer,label_id integer,weight float); INSERT INTO crf_feature_test_new VALUES @@ -585,7 +585,7 @@ INSERT INTO test_segmenttbl_new VALUES (17, 4, '.', 17); analyze test_segmenttbl_new; - -- extract features for tokens stored in segmenttbl + -- extract features for tokens stored in segmenttbl SELECT crf_test_fgen('test_segmenttbl_new','crf_dictionary_new','crf_label_new','crf_regex_new','crf_feature_test_new','viterbi_mtbl_new','viterbi_rtbl_new'); @@ -597,7 +597,7 @@ INSERT INTO test_segmenttbl_new VALUES 'extraction_new'); -- Expected viterbi labeling result - -- The result is produced from Dr. Sunita's CRF java package with the same input + -- The result is produced from Dr. Sunita's CRF java package with the same input CREATE TABLE expected_extraction_new(doc_id integer, start_pos integer, seg_text text, label character varying); INSERT INTO expected_extraction_new VALUES (1, 0, 'that', 'DT'), http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/crf/test/crf_train_small.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/crf/test/crf_train_small.sql_in b/src/ports/postgres/modules/crf/test/crf_train_small.sql_in index 9dca05e..1dbd6fa 100644 --- a/src/ports/postgres/modules/crf/test/crf_train_small.sql_in +++ b/src/ports/postgres/modules/crf/test/crf_train_small.sql_in @@ -16,7 +16,7 @@ m4_changequote(<!,!>) m4_ifdef(<!__HAS_ORDERED_AGGREGATES__!>,<! 
-- Regex table - CREATE TABLE train_regex(pattern text,name text); + CREATE TABLE train_regex(pattern text,name text); INSERT INTO train_regex VALUES ('^[A-Z][a-z]+$','InitCapital'), ('^[A-Z]+$','isAllCapital'), ('^.*[0-9]+.*$','containsDigit'),('^.+[.]$','endsWithDot'), @@ -125,8 +125,8 @@ m4_ifdef(<!__HAS_ORDERED_AGGREGATES__!>,<! (114,'E.',42,6,2.79544657270948); SELECT assert( - SUM(abs(c1.weight-c2.weight)) < 0.1, - 'Total difference between extracted feature weights and expected feature weights is > 0.1.') + SUM(abs(c1.weight-c2.weight)) < 0.1, + 'Total difference between extracted feature weights and expected feature weights is > 0.1.') FROM expected_crf_feature c1, train_crf_feature c2 WHERE c1.name = c2.name AND c1.prev_label = c2.prev_label_id AND c1.label = c2.label_id;; @@ -136,7 +136,7 @@ m4_ifdef(<!__HAS_ORDERED_AGGREGATES__!>,<! FROM ( SELECT count(*) FROM( SELECT name, prev_label, label - FROM expected_crf_feature + FROM expected_crf_feature EXCEPT ALL SELECT name, prev_label_id, label_id FROM train_crf_feature @@ -148,7 +148,7 @@ m4_ifdef(<!__HAS_ORDERED_AGGREGATES__!>,<! FROM train_crf_feature EXCEPT ALL SELECT name, prev_label, label - FROM expected_crf_feature + FROM expected_crf_feature ) AS U )s2; http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/elastic_net/elastic_net.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/elastic_net/elastic_net.sql_in b/src/ports/postgres/modules/elastic_net/elastic_net.sql_in index 838a6bd..55f5b25 100644 --- a/src/ports/postgres/modules/elastic_net/elastic_net.sql_in +++ b/src/ports/postgres/modules/elastic_net/elastic_net.sql_in @@ -231,14 +231,14 @@ cross validation is used. Also, cross validation is not supported if grouping i Hyperparameter optimization can be carried out using the built-in cross validation mechanism, which is activated by assigning a value greater than 1 to -the parameter \e n_folds. 
+the parameter \e n_folds. The cross validation scores are the mean and standard deviation -of the accuracy when predicted on the validation fold, +of the accuracy when predicted on the validation fold, averaged over all folds and all rows. For classification, the accuracy -metric used is the ratio of correct classifications. For regression, the -accuracy metric used is the negative of mean squared error (negative to -make it a concave problem, thus selecting \e max means the highest accuracy). +metric used is the ratio of correct classifications. For regression, the +accuracy metric used is the negative of mean squared error (negative to +make it a concave problem, thus selecting \e max means the highest accuracy). The values of a parameter to cross validate should be provided in a list. For example, to regularize with the L1 norm and use a lambda value @@ -784,7 +784,7 @@ iteration_run | 10000 SELECT * FROM houses_en3_cv ORDER BY mean_neg_loss DESC; </pre> <pre class="result"> - alpha | lambda_value | mean_neg_loss | std_neg_loss + alpha | lambda_value | mean_neg_loss | std_neg_loss -------+--------------+------------------------------------------+ 0.0 | 0.1 | -36094.4685768 | 10524.4473253 0.1 | 0.1 | -36136.2448004 | 10682.4136993 http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/glm/glm.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/glm/glm.sql_in b/src/ports/postgres/modules/glm/glm.sql_in index b0a8da1..377dfea 100644 --- a/src/ports/postgres/modules/glm/glm.sql_in +++ b/src/ports/postgres/modules/glm/glm.sql_in @@ -1247,7 +1247,7 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); * @param coef Coefficients obtained by running generalized linear model. 
* @param col_ind Predictor variable array * @param link Link function used in training - * @returns Numeric value of the predicted count, obtained by rounding the predicted mean + * @returns Numeric value of the predicted count, obtained by rounding the predicted mean * to the nearest integral value. * * This function computes the dot product of the independent variables and the @@ -1263,7 +1263,7 @@ AS 'MODULE_PATHNAME', 'glm_predict_poisson' LANGUAGE C STRICT IMMUTABLE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); - + -- Help messages ------------------------------------------------------- CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm_predict( message TEXT http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/glm/ordinal.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/glm/ordinal.sql_in b/src/ports/postgres/modules/glm/ordinal.sql_in index aba9e93..7cda294 100644 --- a/src/ports/postgres/modules/glm/ordinal.sql_in +++ b/src/ports/postgres/modules/glm/ordinal.sql_in @@ -26,11 +26,11 @@ m4_include(`SQLCommon.m4') @brief Regression to model data with ordinal response variable. -In statistics, ordinal regression is a type of regression analysis -used for predicting an ordinal variable, i.e. a variable whose -value exists on an arbitrary scale where only the relative ordering -between different values is significant. The two most common types -of ordinal regression models are ordered logit, which applies to +In statistics, ordinal regression is a type of regression analysis +used for predicting an ordinal variable, i.e. a variable whose +value exists on an arbitrary scale where only the relative ordering +between different values is significant. The two most common types +of ordinal regression models are ordered logit, which applies to data that meet the proportional odds assumption, and ordered probit. 
@anchor train @@ -62,25 +62,25 @@ ordinal(source_table, <table class="output"> <tr> <th><...></th> - <td>Grouping columns, if provided in input. This could be + <td>Grouping columns, if provided in input. This could be multiple columns depending on the \c grouping_col input.</td> </tr> <tr> <th>coef_threshold</th> - <td>FLOAT8[]. Vector of the threshold coefficients in linear predictor. + <td>FLOAT8[]. Vector of the threshold coefficients in linear predictor. The threshold coefficients are the intercepts specific to each categorical levels</td> </tr> <tr> <th>std_err_threshold</th> - <td>FLOAT8[]. Vector of the threshold standard errors + <td>FLOAT8[]. Vector of the threshold standard errors of the threshold coefficients.</td> </tr> <tr> <th>z_stats_threshold</th> - <td>FLOAT8[]. Vector of the threshold z-statistics of + <td>FLOAT8[]. Vector of the threshold z-statistics of the thresholdcoefficients.</td> </tr> @@ -91,14 +91,14 @@ ordinal(source_table, <tr> <th>log_likelihood</th> - <td>FLOAT8. The log-likelihood \f$ l(\boldsymbol \beta) \f$. The value will + <td>FLOAT8. The log-likelihood \f$ l(\boldsymbol \beta) \f$. The value will be the same across categories within the same group.</td> </tr> <tr> <th>coef_feature</th> - <td>FLOAT8[]. Vector of the feature coefficients in linear predictor. The - feature coefficients are the coefficients for the independent variables. + <td>FLOAT8[]. Vector of the feature coefficients in linear predictor. The + feature coefficients are the coefficients for the independent variables. They are the same across categories.</td> </tr> @@ -160,9 +160,9 @@ ordinal(source_table, <tr> <th>independent_varname</th> - <td>VARCHAR. Expression for independent variables. The independent variables - should not include intercept term. Otherwise there will be an error message - indicating Hessian matrix is not finite. In that case, the user should + <td>VARCHAR. Expression for independent variables. 
The independent variables + should not include intercept term. Otherwise there will be an error message + indicating Hessian matrix is not finite. In that case, the user should drop the intercept and rerun the function agian.</td> </tr> @@ -183,7 +183,7 @@ ordinal(source_table, <tr> <th>optimizer_params</th> - <td>VARCHAR. String that contains optimizer parameters, and has the form + <td>VARCHAR. String that contains optimizer parameters, and has the form of 'optimizer=..., max_iter=..., tolerance=...'.</td> </tr> @@ -299,7 +299,7 @@ ordinal_predict( <DD>TEXT. Either 'response' or 'probability'. Using 'response' will give the predicted category with the largest probability. Using probability will give the predicted probabilities for all categories</DD> - + <DT>verbose</DT> <DD>BOOLEAN. Whether verbose is displayed</DD> </DL> @@ -421,7 +421,7 @@ would be used for prediction.) \\x off -- Add the id column for prediction function ALTER TABLE test3 ADD COLUMN id SERIAL; --- Predict probabilities for all categories using the original data +-- Predict probabilities for all categories using the original data SELECT ordinal_predict('test3_output','test3', 'test3_prd_prob', 'probability'); -- Display the predicted value SELECT * FROM test3_prd_prob; @@ -432,7 +432,7 @@ SELECT * FROM test3_prd_prob; The function ordinal() fit the ordinal response model using a cumulative link model. The ordinal reponse variable, denoted by \f$ Y_i \f$, can fall in \f$ j = 1,.. , J\f$ categories. Then \f$ Y_i \f$ follows a multinomial distribution with parameter \f$\pi\f$ where \f$\pi_{ij}\f$ denote the probability that the \f$i\f$th observation falls in response category \f$j\f$. We define the cumulative probabilities as \f[ -\gamma_{ij} = \Pr(Y_i \le j)= \pi_{i1} +...+ \pi_{ij} . +\gamma_{ij} = \Pr(Y_i \le j)= \pi_{i1} +...+ \pi_{ij} . \f] Next we will consider the logit link for illustration purpose. 
The logit function is defined as \f$ \mbox{logit}(\pi) = \log[\pi/(1-\pi)] \f$ and cumulative logits are defined as: \f[ http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/glm/test/ordinal.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/glm/test/ordinal.sql_in b/src/ports/postgres/modules/glm/test/ordinal.sql_in index ffcb131..2998f22 100644 --- a/src/ports/postgres/modules/glm/test/ordinal.sql_in +++ b/src/ports/postgres/modules/glm/test/ordinal.sql_in @@ -239,7 +239,7 @@ SELECT * FROM ordinal_out_summary; SELECT assert( relative_error(coef_threshold, ARRAY[2.6350, 3.8995]) < 1e-4 AND relative_error(coef_feature, ARRAY[.53958, .05595]) < 1e-4 AND - relative_error(log_likelihood, -190.92) < 1e-4, + relative_error(log_likelihood, -190.92) < 1e-4, 'Ordinal regression with IRLS optimizer (test): Wrong results' ) FROM ordinal_out; @@ -268,7 +268,7 @@ SELECT ordinal_predict( ); SELECT assert( - relative_error(prob, ARRAY[0.5342011, 0.4278446, 0.2375883, 0.5062776])< 1e-4, + relative_error(prob, ARRAY[0.5342011, 0.4278446, 0.2375883, 0.5062776])< 1e-4, 'Ordinal regression prediction: Wrong results' ) FROM (SELECT array_agg("0" ORDER BY id) as prob FROM ordinal_prd)subq; @@ -293,7 +293,7 @@ SELECT * FROM ordinal_out_summary; SELECT assert( relative_error(coef_threshold, ARRAY[2.3497, 3.7680]) < 1e-4 AND relative_error(coef_feature, ARRAY[0.59758, 0.04845]) < 1e-4 AND - relative_error(log_likelihood, -154.6991) < 1e-4, + relative_error(log_likelihood, -154.6991) < 1e-4, 'Ordinal regression with IRLS optimizer (test grouping case): Wrong results' ) FROM ordinal_out @@ -308,7 +308,7 @@ SELECT ordinal_predict( ); SELECT assert( - relative_error(prob, ARRAY[0.5141240, 0.3907468])< 1e-4, + relative_error(prob, ARRAY[0.5141240, 0.3907468])< 1e-4, 'Ordinal regression prediction: Wrong results' ) FROM (SELECT array_agg("0" ORDER BY id) as prob FROM ordinal_prd where id=1 OR 
id=2)subq; @@ -332,7 +332,7 @@ SELECT * FROM ordinal_out_summary; SELECT assert( relative_error(coef_threshold, ARRAY[1.5104, 2.2577]) < 1e-4 AND relative_error(coef_feature, ARRAY[.31431, .03215]) < 1e-4 AND - relative_error(log_likelihood, -191.9912) < 1e-4, + relative_error(log_likelihood, -191.9912) < 1e-4, 'Ordinal regression with IRLS optimizer (test): Wrong results' ) FROM ordinal_out; @@ -346,7 +346,7 @@ SELECT ordinal_predict( ); SELECT assert( - relative_error(prob, ARRAY[0.5281595, 0.4288294, 0.2461692, 0.5025270])< 1e-4, + relative_error(prob, ARRAY[0.5281595, 0.4288294, 0.2461692, 0.5025270])< 1e-4, 'Ordinal regression prediction: Wrong results' ) FROM (SELECT array_agg("0" ORDER BY id) as prob FROM ordinal_prd)subq; http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/graph/bfs.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/graph/bfs.sql_in b/src/ports/postgres/modules/graph/bfs.sql_in index 067c8b6..c1c27fe 100644 --- a/src/ports/postgres/modules/graph/bfs.sql_in +++ b/src/ports/postgres/modules/graph/bfs.sql_in @@ -42,7 +42,7 @@ m4_include(`SQLCommon.m4') @brief Finds the nodes reachable from a given source vertex using a breadth-first approach. Given a graph and a source vertex, the breadth-first search (BFS) algorithm -finds all nodes reachable from the source vertex by searching / traversing the graph +finds all nodes reachable from the source vertex by searching / traversing the graph in a breadth-first manner. @anchor bfs @@ -72,9 +72,9 @@ vertex ids. The vertex ids are of type INTEGER with no duplicates. They do not need to be contiguous.</dd> <dt>edge_table</dt> -<dd>TEXT. Name of the table containing the edge data. The edge table must contain -columns for source vertex and destination vertex. Column naming convention is -described below in the 'edge_args' parameter. +<dd>TEXT. Name of the table containing the edge data. 
The edge table must contain +columns for source vertex and destination vertex. Column naming convention is +described below in the 'edge_args' parameter. In addition to vertex columns, if grouping is used then the columns specified in the 'grouping_cols' parameter must be present. </dd> @@ -82,12 +82,12 @@ in the 'grouping_cols' parameter must be present. </dd> <dd>TEXT. A comma-delimited string containing multiple named arguments of the form "name=value". The following parameters are supported for this string argument: - - src (INTEGER): Name of the column containing the source vertex ids in the edge table. + - src (INTEGER): Name of the column containing the source vertex ids in the edge table. Default column name is 'src'. (This is not to be confused with the 'source_vertex' argument passed to the BFS function.) - - dest (INTEGER): Name of the column containing the destination vertex ids in + - dest (INTEGER): Name of the column containing the destination vertex ids in the edge table. Default column name is 'dest'. - + <dt>source_vertex</dt> <dd>INTEGER. The source vertex id for the algorithm to start. This vertex id must exist in the 'vertex_id' column of 'vertex_table'.</dd> @@ -95,37 +95,37 @@ exist in the 'vertex_id' column of 'vertex_table'.</dd> <dt>out_table</dt> <dd>TEXT. Name of the table to store the result of BFS. It contains a row for every vertex that is reachable from the source_vertex. -In the presence of grouping columns, only those edges are used for which there are no NULL values +In the presence of grouping columns, only those edges are used for which there are no NULL values in any grouping column. The output table will have the following columns (in addition to the grouping columns): - - vertex_id : The id for any node reachable from source_vertex in addition to - the source_vertex. Will use the input parameter 'vertex_id' for + - vertex_id : The id for any node reachable from source_vertex in addition to + the source_vertex. 
Will use the input parameter 'vertex_id' for column naming. - - dist : The distance in number of edges (or hops) from the source_vertex - to where this vertex is located. - - parent : The parent of this vertex in BFS traversal of the graph from source_vertex. - Will use 'parent' for column naming. For the + - dist : The distance in number of edges (or hops) from the source_vertex + to where this vertex is located. + - parent : The parent of this vertex in BFS traversal of the graph from source_vertex. + Will use 'parent' for column naming. For the case where vertex_id = source_vertex, the value for parent is NULL. A summary table named <out_table>_summary is also created. This is an internal table that keeps a record of the input parameters. </dd> <dt>max_distance (optional)</dt> -<dd>INT, default = NULL. Maximum distance to traverse -from the source vertex. When this value is null, -traverses until reaches leaf node. E.g., if set -to 1 will return only adjacent vertices, if set -to 7 will return vertices up to a maximum distance +<dd>INT, default = NULL. Maximum distance to traverse +from the source vertex. When this value is null, +traverses until reaches leaf node. E.g., if set +to 1 will return only adjacent vertices, if set +to 7 will return vertices up to a maximum distance of 7 vertices away. <dt>directed (optional)</dt> <dd>BOOLEAN, default = FALSE. If TRUE the graph will be treated as directed, else it will be treated as an undirected graph.</dd> <dt>grouping_cols (optional)</dt> -<dd>TEXT, default = NULL. A comma-separated list of columns used to group the -input into discrete subgraphs. -These columns must exist in the edge table. When this value is NULL, no grouping is used -and a single BFS result is generated. +<dd>TEXT, default = NULL. A comma-separated list of columns used to group the +input into discrete subgraphs. +These columns must exist in the edge table. When this value is NULL, no grouping is used +and a single BFS result is generated. 
@note Expressions are not currently supported for 'grouping_cols'.</dd> </dl> @@ -133,11 +133,11 @@ and a single BFS result is generated. @anchor notes @par Notes -The graph_bfs function is a SQL implementation of the well-known breadth-first -search algorithm [1] modified appropriately for a relational database. It will -find any node in the graph reachable from the source_vertex only once. If a node -is reachable by many different paths from the source_vertex (i.e. has more than -one parent), then only one of those parents is present in the output table. +The graph_bfs function is a SQL implementation of the well-known breadth-first +search algorithm [1] modified appropriately for a relational database. It will +find any node in the graph reachable from the source_vertex only once. If a node +is reachable by many different paths from the source_vertex (i.e. has more than +one parent), then only one of those parents is present in the output table. The BFS result will, in general, be different for different choices of source_vertex. 
@anchor examples @@ -195,9 +195,9 @@ SELECT madlib.graph_bfs( SELECT * FROM out ORDER BY dist,id; </pre> <pre class="result"> - id | dist | parent + id | dist | parent ----+------+-------- - 3 | 0 | + 3 | 0 | 1 | 1 | 3 4 | 1 | 3 5 | 1 | 3 @@ -210,7 +210,7 @@ SELECT * FROM out ORDER BY dist,id; SELECT * FROM out_summary; </pre> <pre class="result"> - vertex_table | vertex_id | edge_table | edge_args | source_vertex | out_table | max_distance | directed | grouping_cols + vertex_table | vertex_id | edge_table | edge_args | source_vertex | out_table | max_distance | directed | grouping_cols --------------+-----------+------------+-----------+---------------+-----------+--------------+----------+--------------- vertex | NULL | edge | NULL | 3 | out | | | NULL (1 row) @@ -226,14 +226,14 @@ SELECT madlib.graph_bfs( NULL, -- Edge arguments (NULL means use default naming) 3, -- Source vertex for BFS 'out_max', -- Output table of nodes reachable from source_vertex - 2); -- Maximum distance to traverse from source_vertex + 2); -- Maximum distance to traverse from source_vertex -- Default values used for the other arguments SELECT * FROM out_max ORDER BY dist,id; </pre> <pre class="result"> - id | dist | parent + id | dist | parent ----+------+-------- - 3 | 0 | + 3 | 0 | 1 | 1 | 3 4 | 1 | 3 5 | 1 | 3 @@ -264,9 +264,9 @@ SELECT madlib.graph_bfs( SELECT * FROM out_alt ORDER BY v_id; </pre> <pre class="result"> - v_id | dist | parent + v_id | dist | parent ------+------+-------- - 8 | 0 | + 8 | 0 | 9 | 1 | 8 10 | 1 | 8 11 | 2 | 9 @@ -287,15 +287,15 @@ SELECT madlib.graph_bfs( SELECT * FROM out_alt_dir ORDER BY v_id; </pre> <pre class="result"> - v_id | dist | parent + v_id | dist | parent ------+------+-------- - 8 | 0 | + 8 | 0 | 9 | 1 | 8 10 | 2 | 9 11 | 2 | 9 (4 rows) </pre> -Notice that, with the graph being treated as directed, the parent of v_id=10 +Notice that, with the graph being treated as directed, the parent of v_id=10 is now vertex 9 and not 8 as in the undirected 
case. -# Create a graph with 2 groups: @@ -343,13 +343,13 @@ SELECT madlib.graph_bfs( SELECT * FROM out_gr ORDER BY g1,g2,dist,id; </pre> <pre class="result"> - g1 | g2 | id | dist | parent + g1 | g2 | id | dist | parent -----+----+----+------+-------- - 100 | a | 8 | 0 | + 100 | a | 8 | 0 | 100 | a | 9 | 1 | 8 100 | a | 10 | 1 | 8 100 | a | 11 | 2 | 9 - 202 | c | 8 | 0 | + 202 | c | 8 | 0 | 202 | c | 9 | 1 | 8 202 | c | 10 | 1 | 8 202 | c | 11 | 2 | 9 @@ -373,9 +373,9 @@ SELECT madlib.graph_bfs( SELECT * FROM out_gr ORDER BY g1,g2,dist,id; </pre> <pre class="result"> - g1 | g2 | id | dist | parent + g1 | g2 | id | dist | parent -----+----+----+------+-------- - 100 | a | 3 | 0 | + 100 | a | 3 | 0 | 100 | a | 1 | 1 | 3 100 | a | 4 | 1 | 3 100 | a | 5 | 1 | 3 http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/graph/hits.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/graph/hits.sql_in b/src/ports/postgres/modules/graph/hits.sql_in index bf4b414..96a507c 100644 --- a/src/ports/postgres/modules/graph/hits.sql_in +++ b/src/ports/postgres/modules/graph/hits.sql_in @@ -43,7 +43,7 @@ m4_include(`SQLCommon.m4') @brief Find the HITS scores(authority and hub) of all vertices in a directed graph. -Given a graph, the HITS (Hyperlink-Induced Topic Search) algorithm outputs the +Given a graph, the HITS (Hyperlink-Induced Topic Search) algorithm outputs the authority score and hub score of every vertex, where authority estimates the value of the content of the page and hub estimates the value of its links to other pages. This algorithm was originally developed to rate web pages [1]. @@ -65,61 +65,61 @@ hits( vertex_table, \b Arguments <dl class="arglist"> <dt>vertex_table</dt> -<dd>TEXT. Name of the table containing the vertex data for the graph. Must +<dd>TEXT. Name of the table containing the vertex data for the graph. 
Must contain the column specified in the 'vertex_id' parameter below.</dd> <dt>vertex_id</dt> -<dd>TEXT, default = 'id'. Name of the column in 'vertex_table' containing - vertex ids. The vertex ids are of type INTEGER with no duplicates. They +<dd>TEXT, default = 'id'. Name of the column in 'vertex_table' containing + vertex ids. The vertex ids are of type INTEGER with no duplicates. They do not need to be contiguous.</dd> <dt>edge_table</dt> -<dd>TEXT. Name of the table containing the edge data. The edge table must +<dd>TEXT. Name of the table containing the edge data. The edge table must contain columns for source vertex and destination vertex.</dd> <dt>edge_args</dt> <dd>TEXT. A comma-delimited string containing multiple named arguments of the form "name=value". The following parameters are supported for this string argument: - - src (INTEGER): Name of the column containing the source vertex ids in + - src (INTEGER): Name of the column containing the source vertex ids in the edge table. Default column name is 'src'. - - dest (INTEGER): Name of the column containing the destination vertex + - dest (INTEGER): Name of the column containing the destination vertex ids in the edge table. Default column name is 'dest'.</dd> <dt>out_table</dt> -<dd>TEXT. Name of the table to store the result of HITS. It will contain +<dd>TEXT. Name of the table to store the result of HITS. It will contain a row for every vertex from 'vertex_table' with the following columns: - - vertex_id : The id of a vertex. Will use the input parameter 'vertex_id' + - vertex_id : The id of a vertex. Will use the input parameter 'vertex_id' for column naming. - authority : The vertex authority score. - hub : The vertex hub score. - grouping_cols : Grouping column values (if any) associated with the vertex_id. </dd> -A summary table is also created that contains information +A summary table is also created that contains information regarding the number of iterations required for convergence. 
-It is named by adding the suffix '_summary' to the 'out_table' +It is named by adding the suffix '_summary' to the 'out_table' parameter. <dt>max_iter (optional) </dt> -<dd>INTEGER, default: 100. The maximum number of iterations allowed. Each +<dd>INTEGER, default: 100. The maximum number of iterations allowed. Each iteration consists of both authority and hub phases.</dd> <dt>threshold (optional) </dt> <dd>FLOAT8, default: (1/number of vertices * 1000). Threshold must be set to a value between 0 and 1, inclusive - of end points. + of end points. If the difference between two consecutive iterations of authority AND two consecutive iterations of hub is smaller than 'threshold', then the - computation stops. That is, both authority and hub value differences + computation stops. That is, both authority and hub value differences must be below the specified threshold for the algorithm to stop. If you set the threshold to 0, then you will force the algorithm to run for the full number of iterations specified in 'max_iter'. </dd> <dt>grouping_cols (optional)</dt> -<dd>TEXT, default: NULL. A single column or a list of comma-separated columns - that divides the input data into discrete groups, resulting in one +<dd>TEXT, default: NULL. A single column or a list of comma-separated columns + that divides the input data into discrete groups, resulting in one distribution per group. When this value is NULL, no grouping is used and a single model is generated for all data. @note Expressions are not currently supported for 'grouping_cols'. @@ -369,8 +369,8 @@ SELECT * FROM hits_out_summary order by user_id; @anchor literature @par Literature -[1] Kleinerg, Jon M., "Authoritative Sources in a Hyperlinked -Environment", Journal of the ACM, Sept. 1999. +[1] Kleinerg, Jon M., "Authoritative Sources in a Hyperlinked +Environment", Journal of the ACM, Sept. 1999. 
https://www.cs.cornell.edu/home/kleinber/auth.pdf */ http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/graph/pagerank.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/graph/pagerank.sql_in b/src/ports/postgres/modules/graph/pagerank.sql_in index 30a0b50..b81b58e 100644 --- a/src/ports/postgres/modules/graph/pagerank.sql_in +++ b/src/ports/postgres/modules/graph/pagerank.sql_in @@ -48,8 +48,8 @@ This algorithm was originally used by Google to rank websites where the World Wi modeled as a directed graph with the vertices representing the websites. The PageRank algorithm initially proposed by Larry Page and Sergey Brin is implemented here [1]. -We also implement personalized PageRank, in which a notion of importance -provides personalization to a query. +We also implement personalized PageRank, in which a notion of importance +provides personalization to a query. For example, importance scores can be biased according to a specified set of vertices in the graph that are of interest or special in some way [2]. @@ -326,13 +326,13 @@ SELECT * FROM pagerank_out_summary; @anchor literature @par Literature -[1] Brin, S. and Page, L. (1998), "The anatomy of a large-scale hypertextual Web search engine", -Computer Networks and ISDN Systems. 30: 107–117, +[1] Brin, S. and Page, L. (1998), "The anatomy of a large-scale hypertextual Web search engine", +Computer Networks and ISDN Systems. 30: 107–117, http://infolab.stanford.edu/pub/papers/google.pdf [2] Jeh, Glen and Widom, Jennifer. 
"Scaling Personalized Web Search", -Proceedings of the 12th international conference on World Wide Web, Pages 271-279 -Budapest, Hungary, May 20-24, 2003, +Proceedings of the 12th international conference on World Wide Web, Pages 271-279 +Budapest, Hungary, May 20-24, 2003, http://ilpubs.stanford.edu:8090/530/1/2002-12.pdf */ http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/graph/wcc.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/graph/wcc.sql_in b/src/ports/postgres/modules/graph/wcc.sql_in index d47f06c..f5879b9 100644 --- a/src/ports/postgres/modules/graph/wcc.sql_in +++ b/src/ports/postgres/modules/graph/wcc.sql_in @@ -140,8 +140,8 @@ information. It contains one or more rows for every group and has the following columns: - grouping_cols: The grouping columns given in the creation of wcc_table. If there are no grouping columns, this column is not created. - - component_id: The ID of the largest component. Recall that we use the - convention where 'component_id' is the id of the first vertex in a + - component_id: The ID of the largest component. Recall that we use the + convention where 'component_id' is the id of the first vertex in a particular group. It means that component ids are generally not contiguous. If there are multiple components of the same size, a row is created for each component. If grouping_cols is specified, the largest @@ -153,7 +153,7 @@ columns: @anchor hist @par Retrieve Histogram of Vertices Per Connected Component -This function creates a histogram of the number of vertices +This function creates a histogram of the number of vertices per connected component. <pre class="syntax"> @@ -394,7 +394,7 @@ SELECT * FROM wcc_out ORDER BY user_id, component_id, id; (13 rows) </pre> Note that vertex 4 is not identified as a separate component -above. This is because there is no entry in the +above. 
This is because there is no entry in the edge table for vertex 4 indicating which group it belongs to (though you could do that if you wanted to). @@ -414,7 +414,7 @@ SELECT * FROM largest_cpt_table ORDER BY component_id; (2 rows) </pre> --# Retrieve histogram of the number of vertices per +-# Retrieve histogram of the number of vertices per connected component: <pre class="syntax"> DROP TABLE IF EXISTS histogram_table; http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/knn/knn.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/knn/knn.sql_in b/src/ports/postgres/modules/knn/knn.sql_in index 1a90652..24f19c7 100644 --- a/src/ports/postgres/modules/knn/knn.sql_in +++ b/src/ports/postgres/modules/knn/knn.sql_in @@ -148,8 +148,8 @@ The following distance functions can be used: <dt>weighted_avg (optional)</dt> <dd>BOOLEAN, default: FALSE. Calculates classification or -regression values using a weighted average. The idea is to -weigh the contribution of each of the k neighbors according +regression values using a weighted average. The idea is to +weigh the contribution of each of the k neighbors according to their distance to the test point, giving greater influence to closer neighbors. The distance function 'fn_dist' specified above is used. @@ -341,7 +341,7 @@ Result, with neighbors sorted from closest to furthest: </pre> --# Run KNN for classification using the +-# Run KNN for classification using the weighted average: <pre class="example"> DROP TABLE IF EXISTS knn_result_classification; @@ -362,7 +362,7 @@ SELECT * FROM madlib.knn( SELECT * FROM knn_result_classification ORDER BY id; </pre> <pre class="result"> - id | data | prediction | k_nearest_neighbours + id | data | prediction | k_nearest_neighbours ----+---------+---------------------+---------------------- 1 | {2,1} | 1 | {2,1,3} 2 | {2,6} | 1 | {5,4,3} @@ -402,8 +402,8 @@ is assigned to the test point. 
https://ai2-s2-pdfs.s3.amazonaws.com/a7e2/814ec5db800d2f8c4313fd436e9cf8273821.pdf @anchor knn-lit-4 -[4] Shepard, Donald (1968). "A two-dimensional interpolation function for -irregularly-spaced data". Proceedings of the 1968 ACM National Conference. pp. 517–524. +[4] Shepard, Donald (1968). "A two-dimensional interpolation function for +irregularly-spaced data". Proceedings of the 1968 ACM National Conference. pp. 517–524. @internal @sa namespace knn (documenting the implementation in Python) @@ -448,7 +448,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( output_table VARCHAR, k INTEGER, output_neighbors BOOLEAN, - fn_dist TEXT, + fn_dist TEXT, weighted_avg BOOLEAN ) RETURNS VARCHAR AS $$ PythonFunctionBodyOnly(`knn', `knn') http://git-wip-us.apache.org/repos/asf/madlib/blob/35818fa3/src/ports/postgres/modules/lda/lda.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/lda/lda.sql_in b/src/ports/postgres/modules/lda/lda.sql_in index efdb957..2355850 100644 --- a/src/ports/postgres/modules/lda/lda.sql_in +++ b/src/ports/postgres/modules/lda/lda.sql_in @@ -35,16 +35,16 @@ Latent Dirichlet Allocation (LDA) is a generative probabilistic model for natural texts. It is used in problems such as automated topic discovery, collaborative filtering, and document classification. -In addition to an implementation of LDA, this MADlib module also provides a +In addition to an implementation of LDA, this MADlib module also provides a number of additional helper functions to interpret results of the LDA output. @note -Topic modeling is often used as part of a larger text processing +Topic modeling is often used as part of a larger text processing pipeline, which may include operations such as term frequency, stemming and -stop word removal. You can use the +stop word removal. 
You can use the function <a href="group__grp__text__utilities.html">Term Frequency</a> -to generate the required vocabulary format from raw documents for the -LDA training function. See +to generate the required vocabulary format from raw documents for the +LDA training function. See the examples later on this page for more details. @anchor background @@ -107,29 +107,29 @@ lda_train( data_table, <dt>data_table</dt> <dd>TEXT. Name of the table storing the training dataset. Each row is in the form <tt><docid, wordid, count></tt> where \c docid, \c wordid, and \c count - are non-negative integers. + are non-negative integers. The \c docid column refers to the document ID, the \c wordid column is the word ID (the index of a word in the vocabulary), and \c count is the number of occurrences of the word in the document. Please note: - - - \c wordid must be + + - \c wordid must be contiguous integers going from from 0 to \c voc_size − \c 1. - - column names for \c docid, \c wordid, and \c count are currently fixed, - so you must use these exact names in the data_table. - + - column names for \c docid, \c wordid, and \c count are currently fixed, + so you must use these exact names in the data_table. + The function <a href="group__grp__text__utilities.html">Term Frequency</a> can be used to generate vocabulary in the required format from raw documents. </dd> <dt>model_table</dt> - <dd>TEXT. This is an output table generated by LDA which contains the learned model. + <dd>TEXT. This is an output table generated by LDA which contains the learned model. It has one row with the following columns: <table class="output"> <tr> <th>voc_size</th> - <td>INTEGER. Size of the vocabulary. As mentioned above for the input - table, \c wordid consists of contiguous integers going - from 0 to \c voc_size − \c 1. + <td>INTEGER. Size of the vocabulary. As mentioned above for the input + table, \c wordid consists of contiguous integers going + from 0 to \c voc_size − \c 1. 
</td> </tr> <tr> @@ -138,12 +138,12 @@ lda_train( data_table, </tr> <tr> <th>alpha</th> - <td>DOUBLE PRECISION. Dirichlet prior for the per-document + <td>DOUBLE PRECISION. Dirichlet prior for the per-document topic multinomial.</td> </tr> <tr> <th>beta</th> - <td>DOUBLE PRECISION. Dirichlet prior for the per-topic + <td>DOUBLE PRECISION. Dirichlet prior for the per-topic word multinomial.</td> </tr> <tr> @@ -153,7 +153,7 @@ lda_train( data_table, </table> </dd> <dt>output_data_table</dt> - <dd>TEXT. The name of the table generated by LDA that stores + <dd>TEXT. The name of the table generated by LDA that stores the output data. It has the following columns: <table class="output"> <tr> @@ -162,14 +162,14 @@ lda_train( data_table, </tr> <tr> <th>wordcount</th> - <td>INTEGER. Count of number of words in the document, - including repeats. For example, if a word appears 3 times + <td>INTEGER. Count of number of words in the document, + including repeats. For example, if a word appears 3 times in the document, it is counted 3 times.</td> </tr> <tr> <th>words</th> <td>INTEGER[]. Array of \c wordid in the document, not - including repeats. For example, if a word appears 3 times + including repeats. For example, if a word appears 3 times in the document, it appears only once in the \c words array.</td> </tr> <tr> @@ -183,34 +183,34 @@ lda_train( data_table, <tr> <th>topic_count</th> <td>INTEGER[]. Array of the count of words in the document - that correspond to each topic. This array is of - length \c topic_num. Topic ids are continuous integers going + that correspond to each topic. This array is of + length \c topic_num. Topic ids are continuous integers going from 0 to \c topic_num − \c 1.</td> </tr> <tr> <th>topic_assignment</th> - <td>INTEGER[]. Array indicating which topic each word in the + <td>INTEGER[]. Array indicating which topic each word in the document corresponds to. This array is of length \c wordcount. 
- Words that are repeated \c n times in the document + Words that are repeated \c n times in the document will show up consecutively \c n times in this array.</td> </tr> </table> </dd> <dt>voc_size</dt> - <dd>INTEGER. Size of the vocabulary. As mentioned above for the - input 'data_table', \c wordid consists of continuous integers going - from 0 to \c voc_size − \c 1. + <dd>INTEGER. Size of the vocabulary. As mentioned above for the + input 'data_table', \c wordid consists of continuous integers going + from 0 to \c voc_size − \c 1. </dd> <dt>topic_num</dt> <dd>INTEGER. Desired number of topics.</dd> <dt>iter_num</dt> <dd>INTEGER. Desired number of iterations.</dd> <dt>alpha</dt> - <dd>DOUBLE PRECISION. Dirichlet prior for the per-document topic + <dd>DOUBLE PRECISION. Dirichlet prior for the per-document topic multinomial (e.g., 50/topic_num is a reasonable value to start with as per Griffiths and Steyvers [2] ).</dd> <dt>beta</dt> - <dd>DOUBLE PRECISION. Dirichlet prior for the per-topic + <dd>DOUBLE PRECISION. Dirichlet prior for the per-topic word multinomial (e.g., 0.01 is a reasonable value to start with).</dd> </dl> @@ -227,27 +227,27 @@ lda_predict( data_table, \b Arguments <dl class="arglist"> <dt>data_table</dt> - <dd>TEXT. Name of the table storing the test dataset + <dd>TEXT. Name of the table storing the test dataset (new document to be labeled). </dd> <dt>model_table</dt> <dd>TEXT. The model table generated by the training process. </dd> <dt>output_predict_table</dt> - <dd>TEXT. The prediction output table. - Each row in the table stores the topic - distribution and the topic assignments for a - document in the dataset. This table has the exact - same columns and interpretation as - the 'output_data_table' from the training function above. + <dd>TEXT. The prediction output table. + Each row in the table stores the topic + distribution and the topic assignments for a + document in the dataset. 
This table has the exact + same columns and interpretation as + the 'output_data_table' from the training function above. </dd> </dl> @anchor perplexity @par Perplexity -Perplexity describes how well the model fits the data by +Perplexity describes how well the model fits the data by computing word likelihoods averaged over the test documents. -This function returns a single perplexity value. +This function returns a single perplexity value. <pre class="syntax"> lda_get_perplexity( model_table, output_predict_table @@ -259,7 +259,7 @@ lda_get_perplexity( model_table, <dd>TEXT. The model table generated by the training process. </dd> <dt>output_predict_table</dt> - <dd>TEXT. The prediction output table generated by the + <dd>TEXT. The prediction output table generated by the predict function above. </dd> </dl> @@ -267,7 +267,7 @@ lda_get_perplexity( model_table, @anchor helper @par Helper Functions -The helper functions can help to interpret the output +The helper functions can help to interpret the output from LDA training and LDA prediction. <b>Topic description by top-k words with highest probability</b> @@ -288,13 +288,13 @@ lda_get_topic_desc( model_table, </dd> <dt>vocab_table</dt> <dd>TEXT. The vocabulary table in the form <wordid, word>. - Reminder that this table can be created using the \c term_frequency - function (\ref grp_text_utilities) with the - parameter \c compute_vocab set to TRUE. + Reminder that this table can be created using the \c term_frequency + function (\ref grp_text_utilities) with the + parameter \c compute_vocab set to TRUE. </dd> <dt>output_table</dt> <dd>TEXT. The output table with per-topic description - generated by this helper function. + generated by this helper function. It has the following columns: <table class="output"> <tr> @@ -337,7 +337,7 @@ lda_get_word_topic_count( model_table, </dd> <dt>output_table</dt> <dd>TEXT. The output table with per-word topic counts - generated by this helper function. 
+ generated by this helper function. It has the following columns: <table class="output"> <tr> @@ -347,7 +347,7 @@ lda_get_word_topic_count( model_table, <tr> <th>topic_count</th> <td>INTEGER[]. Count of word association with each topic, i.e., - shows how many times a given word is + shows how many times a given word is assigned to a topic. Array is of length number of topics.</td> </tr> </table> @@ -370,7 +370,7 @@ lda_get_topic_word_count( model_table, </dd> <dt>output_table</dt> <dd>TEXT. The output table with per-topic word counts - generated by this helper function. + generated by this helper function. It has the following columns: <table class="output"> <tr> @@ -403,7 +403,7 @@ lda_get_word_topic_mapping( output_data_table, -- From training or prediction </dd> <dt>output_table</dt> <dd>TEXT. The output table with word to topic mappings - generated by this helper function. + generated by this helper function. It has the following columns: <table class="output"> <tr> @@ -435,10 +435,10 @@ INSERT INTO documents VALUES (2, 'Machine learning is closely related to and often overlaps with computational statistics; a discipline that also specializes in prediction-making. It has strong ties to mathematical optimization, which deliver methods, theory and application domains to the field.'), (3, 'California''s diverse geography ranges from the Sierra Nevada in the east to the Pacific Coast in the west, from the RedwoodâDouglas fir forests of the northwest, to the Mojave Desert areas in the southeast. The center of the state is dominated by the Central Valley, a major agricultural area.'); </pre> -You can apply stemming, stop word removal and tokenization -at this point in order to prepare the documents for text -processing. Depending upon your database version, various -tools are available. Databases based on more recent +You can apply stemming, stop word removal and tokenization +at this point in order to prepare the documents for text +processing. 
Depending upon your database version, various +tools are available. Databases based on more recent versions of PostgreSQL may do something like: <pre class="example"> SELECT tsvector_to_array(to_tsvector('english',contents)) from documents; @@ -452,13 +452,13 @@ SELECT tsvector_to_array(to_tsvector('english',contents)) from documents; {agricultur,area,california,center,central,coast,desert,divers,...} (4 rows) </pre> -In this example, we assume a database based on an older -version of PostgreSQL and just perform basic punctuation removal -and tokenization. The array of words is added as a new column +In this example, we assume a database based on an older +version of PostgreSQL and just perform basic punctuation removal +and tokenization. The array of words is added as a new column to the documents table: <pre class="example"> ALTER TABLE documents ADD COLUMN words TEXT[]; -UPDATE documents SET words = +UPDATE documents SET words = regexp_split_to_array(lower( regexp_replace(contents, E'[,.;\']','', 'g') ), E'[\\\\s+]'); @@ -494,7 +494,7 @@ SELECT madlib.term_frequency('documents', -- input table SELECT * FROM documents_tf ORDER BY docid LIMIT 20; </pre> <pre class="result"> - docid | wordid | count + docid | wordid | count -------+--------+------- 0 | 71 | 1 0 | 90 | 1 @@ -523,7 +523,7 @@ Here is the associated vocabulary table. Note that wordid starts at 0: SELECT * FROM documents_tf_vocabulary ORDER BY wordid LIMIT 20; </pre> <pre class="result"> - wordid | word + wordid | word --------+-------------- 0 | 1960s 1 | 1968 @@ -552,25 +552,25 @@ The total number of words in the vocabulary across all documents is: SELECT COUNT(*) FROM documents_tf_vocabulary; </pre> <pre class="result"> - count + count +------ 103 (1 row) </pre> --# Train LDA model. For Dirichlet priors we use initial -rule-of-thumb values of 50/(number of topics) for alpha +-# Train LDA model. 
For Dirichlet priors we use initial +rule-of-thumb values of 50/(number of topics) for alpha and 0.01 for beta. -Reminder that column names for docid, wordid, and count -are currently fixed, so you must use these exact names -in the input table. After a successful run of the LDA -training function two tables are generated, one for -storing the learned model and the other for storing +Reminder that column names for docid, wordid, and count +are currently fixed, so you must use these exact names +in the input table. After a successful run of the LDA +training function two tables are generated, one for +storing the learned model and the other for storing the output data table. <pre class="example"> DROP TABLE IF EXISTS lda_model, lda_output_data; SELECT madlib.lda_train( 'documents_tf', -- documents table in the form of term frequency 'lda_model', -- model table created by LDA training (not human readable) - 'lda_output_data', -- readable output data table + 'lda_output_data', -- readable output data table 103, -- vocabulary size 5, -- number of topics 10, -- number of iterations @@ -611,10 +611,10 @@ topic_assignment | {4,4,4,0,2,0,0,2,4,4,2,2,2,1,2,4,1,0,2,2,2,2,2,2,2,2,2,2,2,1, </pre> -# Review learned model using helper functions. -First, we get topic description by top-k words. These are +First, we get topic description by top-k words. These are the k words with the highest probability for the topic. -Note that if there are ties in probability, more than k -words may actually be reported for each topic. Also note +Note that if there are ties in probability, more than k +words may actually be reported for each topic. 
Also note that topicid starts at 0: <pre class="example"> DROP TABLE IF EXISTS helper_output_table; @@ -625,7 +625,7 @@ SELECT madlib.lda_get_topic_desc( 'lda_model', -- LDA model gener SELECT * FROM helper_output_table ORDER BY topicid, prob DESC LIMIT 40; </pre> <pre class="result"> - topicid | wordid | prob | word + topicid | wordid | prob | word ---------+--------+--------------------+------------------- 0 | 3 | 0.111357750647429 | a 0 | 51 | 0.074361820199778 | is @@ -669,9 +669,9 @@ SELECT * FROM helper_output_table ORDER BY topicid, prob DESC LIMIT 40; 1 | 33 | 0.0438558402084238 | discipline (40 rows) </pre> -Get the per-word topic counts. This mapping shows how -many times a given word is assigned to a topic. E.g., -wordid 3 is assigned to topicid 0 three times: +Get the per-word topic counts. This mapping shows how +many times a given word is assigned to a topic. E.g., +wordid 3 is assigned to topicid 0 three times: <pre class="example"> DROP TABLE IF EXISTS helper_output_table; SELECT madlib.lda_get_word_topic_count( 'lda_model', -- LDA model generated in training @@ -679,7 +679,7 @@ SELECT madlib.lda_get_word_topic_count( 'lda_model', -- LDA model gen SELECT * FROM helper_output_table ORDER BY wordid LIMIT 20; </pre> <pre class="result"> - wordid | topic_count + wordid | topic_count --------+------------- 0 | {0,1,0,0,0} 1 | {1,0,0,0,0} @@ -703,7 +703,7 @@ SELECT * FROM helper_output_table ORDER BY wordid LIMIT 20; 19 | {2,0,0,0,0} (20 rows) </pre> -Get the per-topic word counts. This mapping shows +Get the per-topic word counts. 
This mapping shows which words are associated with each topic by frequency: <pre class="example"> DROP TABLE IF EXISTS topic_word_count; @@ -736,7 +736,7 @@ SELECT madlib.lda_get_word_topic_mapping('lda_output_data', -- Output table fro SELECT * FROM helper_output_table ORDER BY docid LIMIT 40; </pre> <pre class="result"> - docid | wordid | topicid + docid | wordid | topicid -------+--------+--------- 0 | 56 | 1 0 | 54 | 1 @@ -781,15 +781,15 @@ SELECT * FROM helper_output_table ORDER BY docid LIMIT 40; (40 rows) </pre> --# Use a learned LDA model for prediction (that is, to label new documents). -In this example, we use the same input table as we used to train, just for -demonstration purpose. Normally, the test document is a new one that +-# Use a learned LDA model for prediction (that is, to label new documents). +In this example, we use the same input table as we used to train, just for +demonstration purpose. Normally, the test document is a new one that we want to predict on. <pre class="example"> DROP TABLE IF EXISTS outdata_predict; SELECT madlib.lda_predict( 'documents_tf', -- Document to predict 'lda_model', -- LDA model from training - 'outdata_predict' -- Output table for predict results + 'outdata_predict' -- Output table for predict results ); SELECT * FROM outdata_predict; </pre> @@ -837,7 +837,7 @@ SELECT madlib.lda_get_word_topic_mapping('outdata_predict', -- Output table fro SELECT * FROM helper_output_table ORDER BY docid LIMIT 40; </pre> <pre class="result"> - docid | wordid | topicid + docid | wordid | topicid -------+--------+--------- 0 | 54 | 4 0 | 42 | 1 @@ -882,8 +882,8 @@ SELECT * FROM helper_output_table ORDER BY docid LIMIT 40; (40 rows) </pre> --# Call the perplexity function to see how well the model fits -the data. Perplexity computes word likelihoods averaged +-# Call the perplexity function to see how well the model fits +the data. Perplexity computes word likelihoods averaged over the test documents. 
<pre class="example"> SELECT madlib.lda_get_perplexity( 'lda_model', -- LDA model from training @@ -891,7 +891,7 @@ SELECT madlib.lda_get_perplexity( 'lda_model', -- LDA model from training ); </pre> <pre class="result"> - lda_get_perplexity + lda_get_perplexity +-------------------- 79.481894411824 (1 row)
