Repository: madlib Updated Branches: refs/heads/master cf5ace944 -> 24a11c1e5
http://git-wip-us.apache.org/repos/asf/madlib/blob/24a11c1e/src/ports/postgres/modules/stats/correlation.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/stats/correlation.py_in b/src/ports/postgres/modules/stats/correlation.py_in index 45ff05e..005de75 100644 --- a/src/ports/postgres/modules/stats/correlation.py_in +++ b/src/ports/postgres/modules/stats/correlation.py_in @@ -414,57 +414,6 @@ triangle set to NULL. To obtain the result from the output_table in this matrix format ensure to order the elements using the 'column_position' column. """.format(schema_madlib=schema_madlib, func=func) - elif message is not None and message.lower() in ('example', 'examples'): - return """ -DROP TABLE IF EXISTS example_data; -CREATE TABLE example_data( - id SERIAL, - outlook text, - temperature float8, - humidity float8, - windy text, - class text) ; - -INSERT INTO example_data(outlook, temperature, humidity, windy, class) -VALUES('sunny', 85, 85, 'false', E'Dont Play'); -INSERT INTO example_data(outlook, temperature, humidity, windy, class) -VALUES('sunny', 80, 90, 'true', E'Dont Play'); -INSERT INTO example_data(outlook, temperature, humidity, windy, class) -VALUES('overcast', 83, 78, 'false', 'Play'); -INSERT INTO example_data(outlook, temperature, humidity, windy, class) -VALUES('rain', 70, 96, 'false', 'Play'); -INSERT INTO example_data(outlook, temperature, humidity, windy, class) -VALUES('rain', 68, 80, 'false', 'Play'); -INSERT INTO example_data(outlook, temperature, humidity, windy, class) -VALUES('rain', 65, 70, 'true', E'Dont Play'); -INSERT INTO example_data(outlook, temperature, humidity, windy, class) -VALUES('overcast', 64, 65, 'true', 'Play'); -INSERT INTO example_data(outlook, temperature, humidity, windy, class) -VALUES('sunny', 72, 95, 'false', E'Dont Play'); -INSERT INTO example_data(outlook, temperature, humidity, windy, class) -VALUES('sunny', 69, 70, 'false', 'Play'); -INSERT INTO example_data(outlook, temperature, humidity, windy, class) -VALUES('rain', 75, 80, 'false', 'Play'); -INSERT INTO example_data(outlook, temperature, humidity, windy, class) -VALUES('sunny', 75, 70, 'true', 'Play'); -INSERT INTO example_data(outlook, temperature, humidity, windy, class) -VALUES('overcast', 72, 90, 'true', 'Play'); -INSERT INTO example_data(outlook, temperature, humidity, windy, class) -VALUES('overcast', 81, 75, 'false', 'Play'); -INSERT INTO example_data(outlook, temperature, humidity, windy, class) -VALUES('rain', 71, 80, 'true', E'Dont Play'); -INSERT INTO example_data(outlook, temperature, humidity, windy, class) -VALUES(NULL, 100, 100, 'true', NULL); -INSERT INTO example_data(outlook, temperature, humidity, windy, class) -VALUES(NULL, 110, 100, 'true', NULL); - -SELECT madlib.{func}('example_data', 'example_data_output'); -SELECT madlib.{func}('example_data', 'example_data_output', '*'); -SELECT madlib.{func}('example_data', 'example_data_output', 'temperature, humidity'); - --- To get the {func} matrix from output table: -SELECT * from example_data_output order by column_position; - """.format(func=func) else: if cov: return """ @@ -478,9 +427,6 @@ covariance is negative. The sign of the covariance therefore shows the tendency ------- For an overview on usage, run: SELECT {schema_madlib}.covariance('usage'); -------- -For examples: - SELECT {schema_madlib}.covariance('example'); """.format(schema_madlib=schema_madlib) else: return """ @@ -492,8 +438,5 @@ perfectly anti-correlated. ------- For an overview on usage, run: SELECT {schema_madlib}.correlation('usage'); -------- -For examples: - SELECT {schema_madlib}.correlation('example'); """.format(schema_madlib=schema_madlib) # ------------------------------------------------------------------------------ http://git-wip-us.apache.org/repos/asf/madlib/blob/24a11c1e/src/ports/postgres/modules/stats/cox_prop_hazards.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/stats/cox_prop_hazards.py_in b/src/ports/postgres/modules/stats/cox_prop_hazards.py_in index 706503a..f73c14f 100644 --- a/src/ports/postgres/modules/stats/cox_prop_hazards.py_in +++ b/src/ports/postgres/modules/stats/cox_prop_hazards.py_in @@ -64,8 +64,6 @@ the probability that death has happened before time t. For more details on function usage: SELECT {schema_madlib}.coxph_train('usage') -For an example on using the function: - SELECT {schema_madlib}.coxph_train('example') """ elif message in ['usage', 'help', '?']: @@ -110,55 +108,6 @@ The output summary table is named as <output_table>_summary has the following co due to missing values """ - - elif message in ['example', 'examples']: - help_string = """ -DROP TABLE IF EXISTS sample_data; -CREATE TABLE sample_data ( - id INTEGER NOT NULL, - grp DOUBLE PRECISION, - wbc DOUBLE PRECISION, - timedeath INTEGER, - status BOOLEAN -); - -COPY sample_data FROM STDIN DELIMITER '|'; - 0 | 0 | 1.45 | 35 | t - 1 | 0 | 1.47 | 34 | t - 3 | 0 | 2.2 | 32 | t - 4 | 0 | 1.78 | 25 | t - 5 | 0 | 2.57 | 23 | t - 6 | 0 | 2.32 | 22 | t - 7 | 0 | 2.01 | 20 | t - 8 | 0 | 2.05 | 19 | t - 9 | 0 | 2.16 | 17 | t - 10 | 0 | 3.6 | 16 | t - 11 | 1 | 2.3 | 15 | t - 12 | 0 | 2.88 | 13 | t - 13 | 1 | 1.5 | 12 | t - 14 | 0 | 2.6 | 11 | t - 15 | 0 | 2.7 | 10 | t - 16 | 0 | 2.8 | 9 | t - 17 | 1 | 2.32 | 8 | t - 18 | 0 | 4.43 | 7 | t - 19 | 0 | 2.31 | 6 | t - 20 | 1 | 3.49 | 5 | t - 21 | 1 | 2.42 | 4 | t - 22 | 1 | 4.01 | 3 | t - 23 | 1 | 4.91 | 2 | t - 24 | 1 | 5 | 1 | t -\. - -SELECT {schema_madlib}.coxph_train( - 'sample_data', - 'sample_cox', - 'timedeath', - 'ARRAY[grp,wbc]', - 'status'); - -SELECT * FROM sample_cox; - """ - else: help_string = "No such option. Use {schema_madlib}.coxph_train()" http://git-wip-us.apache.org/repos/asf/madlib/blob/24a11c1e/src/ports/postgres/modules/summary/summary.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/summary/summary.py_in b/src/ports/postgres/modules/summary/summary.py_in index 1dd6c61..ecd8726 100644 --- a/src/ports/postgres/modules/summary/summary.py_in +++ b/src/ports/postgres/modules/summary/summary.py_in @@ -159,43 +159,6 @@ def summary_help_message(schema_madlib, message, **kwargs): - most_frequent_values : Most frequent values - mfv_frequencies : Frequency of the most frequent values """.format(madlib=schema_madlib) - elif message is not None and message.lower() in ('example', 'examples'): - return """ - DROP TABLE IF EXISTS example_data; - CREATE TABLE example_data( - id SERIAL, - outlook text, - temperature float8, - humidity float8, - windy text, - class text) ; - - INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 85, 85, 'false', E'Don\\'t Play'); - INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 80, 90, 'true', E'Don\\'t Play'); - INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 83, 78, 'false', 'Play'); - INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 70, 96, 'false', 'Play'); - INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 68, 80, 'false', 'Play'); - INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 65, 70, 'true', E'Don\\'t Play'); - INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 64, 65, 'true', 'Play'); - INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 72, 95, 'false', E'Don\\'t Play'); - INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 69, 70, 'false', 'Play'); - INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 75, 80, 'false', 'Play'); - INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 75, 70, 'true', 'Play'); - INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 72, 90, 'true', 'Play'); - INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 81, 75, 'false', 'Play'); - INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 71, 80, 'true', E'Don\\'t Play'); - INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES(' ', 100, 100, 'true', ' '); - INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('', 110, 100, 'true', ''); - - SELECT madlib.summary('example_data', 'example_data_output'); - SELECT madlib.summary('example_data', 'example_data_output', 'windy'); - SELECT madlib.summary('example_data', 'example_data_output', 'windy,humidity'); - SELECT madlib.summary('example_data', 'example_data_output', 'id', 'windy'); - SELECT madlib.summary('example_data', 'example_data_output', NULL, NULL, True, True, array[0.1, 0.2, 0.3]); - SELECT madlib.summary('example_data', 'example_data_output', NULL, NULL, True, True, array[0.1, 0.2, 0.3], 2); - SELECT madlib.summary('example_data', 'example_data_output', NULL, NULL, True, True, array[0.1, 0.2, 0.3], 2, False); - SELECT madlib.summary('example_data', 'example_data_output', NULL, NULL, True, True, array[0.1, 0.2, 0.3], 2, False, 2); - """ else: return """ 'summary' is a generic function used to produce summary statistics @@ -204,7 +167,4 @@ def summary_help_message(schema_madlib, message, **kwargs): ------- For an overview on usage, run: SELECT {madlib}.summary('usage'); - ------- - For an example, run: - SELECT {madlib}.summary('example') """.format(madlib=schema_madlib) http://git-wip-us.apache.org/repos/asf/madlib/blob/24a11c1e/src/ports/postgres/modules/svm/svm.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/svm/svm.py_in b/src/ports/postgres/modules/svm/svm.py_in index b8780ab..d2d22c4 100644 --- a/src/ports/postgres/modules/svm/svm.py_in +++ b/src/ports/postgres/modules/svm/svm.py_in @@ -516,9 +516,6 @@ def svm_one_class_help(schema_madlib, message, is_svc, **kwargs): For more details on function usage: SELECT {schema_madlib}.{method}('usage') - - For a small example on using the function: - SELECT {schema_madlib}.{method}('example') """.format(**args) usage = """ @@ -627,89 +624,11 @@ def svm_one_class_help(schema_madlib, message, is_svc, **kwargs): gaussian_usage = get_svc_gaussian_usage_string() poly_usage = get_svc_poly_usage_string() - example_usage = """ - --------------------------------------------------------------------------- - EXAMPLES - --------------------------------------------------------------------------- - - Create an input data set. - - CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT, - size INT, lot INT); - COPY houses FROM STDIN WITH DELIMITER '|'; - 1 | 590 | 2 | 1 | 50000 | 770 | 22100 - 2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 - 3 | 20 | 3 | 1 | 22500 | 1060 | 3500 - 4 | 870 | 2 | 2 | 90000 | 1300 | 17500 - 5 | 1320 | 3 | 2 | 133000 | 1500 | 30000 - 6 | 1350 | 2 | 1 | 90500 | 820 | 25700 - 7 | 2790 | 3 | 2.5 | 260000 | 2130 | 25000 - 8 | 680 | 2 | 1 | 142500 | 1170 | 22000 - 9 | 1840 | 3 | 2 | 160000 | 1500 | 19000 - 10 | 3680 | 4 | 2 | 240000 | 2790 | 20000 - 11 | 1660 | 3 | 1 | 87000 | 1030 | 17500 - 12 | 1620 | 3 | 2 | 118600 | 1250 | 20000 - 13 | 3100 | 3 | 2 | 140000 | 1760 | 38000 - 14 | 2070 | 2 | 3 | 148000 | 1550 | 14000 - 15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000 - \. - - - Generate a non-linear one-class SVM using a Gaussian kernel. We - specify the initial step size and maximum number of iterations to run. - As part of the kernel parameter, we choose 10 as the dimension of the - space where we train SVM. A larger number will lead to a more powerful - model but run the risk of overfitting. As a result, the model will be a - 10 dimensional vector. - - select {schema_madlib}.svm_one_class('houses', - 'houses_one_class_gaussian', - 'ARRAY[1,tax,bedroom,bath,size,lot,price]', - 'gaussian', - 'gamma=0.01,n_components=10', - NULL, - 'max_iter=250, init_stepsize=100,lambda=0.9' - ); - - - Create a test data set. - DROP TABLE IF EXISTS houses_novelty_test; - CREATE TABLE houses_novelty_test (id INT, tax INT, bedroom INT, bath FLOAT, price INT, - size INT, lot INT); - COPY houses_novelty_test FROM STDIN WITH DELIMITER '|'; - 1 | 33590 | 12 | 11 | 5000000 | 12770 | 221100 - 2 | 1050 | 31 | 21 | 85000000 | 141210 | 120010 - 3 | 233330 | 13 | 11 | 22500000 | 112060 | 351100 - 4 | 833370 | 12 | 12 | 9000000 | 130120 | 1751100 - 5 | 132330 | 31 | 12 | 133000000 | 150120 | 30011100 - 6 | 135330 | 21 | 11 | 90500000 | 8212120 | 25711100 - 7 | 279330 | 31 | 21.5 | 260000000 | 213012 | 25011100 - 8 | 6803333 | 12 | 11 | 142500000 | 117012 | 22111000 - 9 | 33331840 | 31 | 12 | 160000000 | 150120 | 19011100 - 10 | 3780 | 4 | 2 | 220000 | 2790 | 21000 - 11 | 1760 | 3 | 1 | 77000 | 1030 | 18500 - 12 | 1520 | 3 | 2 | 128600 | 1250 | 21000 - 13 | 3000 | 3 | 2 | 130000 | 1760 | 37000 - 14 | 2170 | 2 | 3 | 138000 | 1550 | 13000 - 15 | 750 | 3 | 1.5 | 75000 | 1450 | 13000 - \. - - - Use the prediction function to evaluate the models. The predicted - results are in the prediction column and the actual data is in the - target column. - -- For the Gaussian model: - SELECT {schema_madlib}.svm_predict('houses_one_class_gaussian', - 'houses_test', - 'id', - 'houses_pred_gaussian'); - -- View the results of the prediction function: - SELECT * FROM houses_novelty_test JOIN houses_pred_gaussian USING (id) ORDER BY id; - - """.format(**args) if not message: return summary elif message.lower() in ('usage', 'help', '?'): return usage - elif message.lower() == 'example': - return example_usage elif message.lower() == 'params': return params_usage elif message.lower() == 'gaussian': @@ -849,84 +768,10 @@ def svm_help(schema_madlib, message, is_svc, **kwargs): gaussian_usage = get_svc_gaussian_usage_string() poly_usage = get_svc_poly_usage_string() - example_usage = """ - --------------------------------------------------------------------------- - EXAMPLES - --------------------------------------------------------------------------- - - Create an input data set. - - CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT, - size INT, lot INT); - COPY houses FROM STDIN WITH DELIMITER '|'; - 1 | 590 | 2 | 1 | 50000 | 770 | 22100 - 2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 - 3 | 20 | 3 | 1 | 22500 | 1060 | 3500 - 4 | 870 | 2 | 2 | 90000 | 1300 | 17500 - 5 | 1320 | 3 | 2 | 133000 | 1500 | 30000 - 6 | 1350 | 2 | 1 | 90500 | 820 | 25700 - 7 | 2790 | 3 | 2.5 | 260000 | 2130 | 25000 - 8 | 680 | 2 | 1 | 142500 | 1170 | 22000 - 9 | 1840 | 3 | 2 | 160000 | 1500 | 19000 - 10 | 3680 | 4 | 2 | 240000 | 2790 | 20000 - 11 | 1660 | 3 | 1 | 87000 | 1030 | 17500 - 12 | 1620 | 3 | 2 | 118600 | 1250 | 20000 - 13 | 3100 | 3 | 2 | 140000 | 1760 | 38000 - 14 | 2070 | 2 | 3 | 148000 | 1550 | 14000 - 15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000 - \. - - - Train a classification model, using a linear model. - - SELECT {schema_madlib}.svm_classification('houses', - 'houses_svm', - 'price < 100000', - 'ARRAY[1, tax, bath, size]'); - - - Generate a nonlinear model using a Gaussian kernel. This time we - specify the initial step size and maximum number of iterations to run. - As part of the kernel parameter, we choose 10 as the dimension of the - space where we train SVM. A larger number will lead to a more powerful - model but run the risk of overfitting. As a result, the model will be a - 10 dimensional vector, instead of 4 as in the case of linear model. - - SELECT {schema_madlib}.svm_classification( 'houses', - 'houses_svm_gaussian', - 'price < 100000', - 'ARRAY[1, tax, bath, size]', - 'gaussian', - 'n_components=10', - '', - 'init_stepsize=1, max_iter=200'); - - - Use the prediction function to evaluate the models. The predicted - results are in the prediction column and the actual data is in the - target column. - - -- For the linear model: - SELECT {schema_madlib}.svm_predict('houses_svm', - 'houses', - 'id', - 'houses_pred'); - SELECT *, price < 100000 AS target - FROM houses JOIN houses_pred - USING (id) ORDER BY id; - - -- For the Gaussian model: - SELECT {schema_madlib}.svm_predict('houses_svm_gaussian', - 'houses', - 'id', - 'houses_pred_gaussian'); - SELECT *, price < 100000 AS target - FROM houses JOIN houses_pred_gaussian - USING (id) ORDER BY id; - """.format(**args) - if not message: return summary elif message.lower() in ('usage', 'help', '?'): return usage - elif message.lower() in ('example', 'examples'): - return example_usage elif message.lower() == 'params': return params_usage elif message.lower() == 'gaussian': http://git-wip-us.apache.org/repos/asf/madlib/blob/24a11c1e/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in index cbcd9b7..1238104 100644 --- a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in +++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in @@ -490,9 +490,6 @@ class MiniBatchDocumentation: For more details on function usage: SELECT {schema_madlib}.{method}('usage') - - For a small example on using the function: - SELECT {schema_madlib}.{method}('example') """.format(**locals()) usage = """ @@ -557,58 +554,11 @@ class MiniBatchDocumentation: for normalization). """.format(**locals()) - example = """ - -- Create input table - CREATE TABLE iris_data( - id INTEGER, - attributes NUMERIC[], - class_text text, - class INTEGER, - state VARCHAR - ); - - COPY iris_data (attributes, class_text, class, state) FROM STDIN NULL '?' DELIMITER '|'; - {4.4,3.2,1.3,0.2}|Iris_setosa|1|Alaska - {5.0,3.5,1.6,0.6}|Iris_setosa|1|Alaska - {5.1,3.8,1.9,0.4}|Iris_setosa|1|Alaska - {4.8,3.0,1.4,0.3}|Iris_setosa|1|Alaska - {5.1,3.8,1.6,0.2}|Iris_setosa|1|Alaska - {5.7,2.8,4.5,1.3}|Iris_versicolor|2|Alaska - {6.3,3.3,4.7,1.6}|Iris_versicolor|2|Alaska - {4.9,2.4,3.3,1.0}|Iris_versicolor|2|Alaska - {6.6,2.9,4.6,1.3}|Iris_versicolor|2|Alaska - {5.2,2.7,3.9,1.4}|Iris_versicolor|2|Alaska - {5.0,2.0,3.5,1.0}|Iris_versicolor|2|Alaska - {4.8,3.0,1.4,0.1}|Iris_setosa|1|Tennessee - {4.3,3.0,1.1,0.1}|Iris_setosa|1|Tennessee - {5.8,4.0,1.2,0.2}|Iris_setosa|1|Tennessee - {5.7,4.4,1.5,0.4}|Iris_setosa|1|Tennessee - {5.4,3.9,1.3,0.4}|Iris_setosa|1|Tennessee - {6.0,2.9,4.5,1.5}|Iris_versicolor|2|Tennessee - {5.7,2.6,3.5,1.0}|Iris_versicolor|2|Tennessee - {5.5,2.4,3.8,1.1}|Iris_versicolor|2|Tennessee - {5.5,2.4,3.7,1.0}|Iris_versicolor|2|Tennessee - {5.8,2.7,3.9,1.2}|Iris_versicolor|2|Tennessee - {6.0,2.7,5.1,1.6}|Iris_versicolor|2|Tennessee - \. - - -- #TODO add description here - DROP TABLE IF EXISTS iris_data_batch, iris_data_batch_standardization, iris_data_batch_summary; - SELECT madlib.minibatch_preprocessor('iris_data', 'iris_data_batch', 'class_text', 'attributes', 3); - - - -- #TODO add description here NULL buffer size - DROP TABLE IF EXISTS iris_data_batch, iris_data_batch_standardization, iris_data_batch_summary; - SELECT madlib.minibatch_preprocessor('iris_data', 'iris_data_batch', 'class_text', 'attributes'); - - """ if not message: return summary elif message.lower() in ('usage', 'help', '?'): return usage - elif message.lower() == 'example': - return example return """ No such option. Use "SELECT {schema_madlib}.minibatch_preprocessor()" for help. http://git-wip-us.apache.org/repos/asf/madlib/blob/24a11c1e/src/ports/postgres/modules/utilities/path.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/utilities/path.py_in b/src/ports/postgres/modules/utilities/path.py_in index 37457ff..acbaf8d 100644 --- a/src/ports/postgres/modules/utilities/path.py_in +++ b/src/ports/postgres/modules/utilities/path.py_in @@ -321,9 +321,6 @@ involved like aggregation. For more details on function usage: SELECT {schema_madlib}.path('usage'); - -For a small example on using the function: - SELECT {schema_madlib}.path('example'); """.format(schema_madlib=schema_madlib) usage_string = """ @@ -345,76 +342,10 @@ SELECT {schema_madlib}.path( ); """.format(schema_madlib=schema_madlib) - example_string = """ ---------------------------------------------------------------------------- - EXAMPLE ---------------------------------------------------------------------------- -- Create an input data set. - -DROP TABLE IF EXISTS eventlog, path_output, path_output_tuples; -CREATE TABLE eventlog (event_timestamp TIMESTAMP, - user_id INT, - session_id INT, - page TEXT, - revenue FLOAT); -INSERT INTO eventlog VALUES -('04/15/2015 01:03:00', 100821, 100, 'LANDING', 0), -('04/15/2015 01:04:00', 100821, 100, 'WINE', 0), -('04/15/2015 01:05:00', 100821, 100, 'CHECKOUT', 39), -('04/15/2015 02:06:00', 100821, 101, 'WINE', 0), -('04/15/2015 02:09:00', 100821, 101, 'WINE', 0), -('04/15/2015 01:15:00', 101121, 102, 'LANDING', 0), -('04/15/2015 01:16:00', 101121, 102, 'WINE', 0), -('04/15/2015 01:17:00', 101121, 102, 'CHECKOUT', 15), -('04/15/2015 01:18:00', 101121, 102, 'LANDING', 0), -('04/15/2015 01:19:00', 101121, 102, 'HELP', 0), -('04/15/2015 01:21:00', 101121, 102, 'WINE', 0), -('04/15/2015 01:22:00', 101121, 102, 'CHECKOUT', 23), -('04/15/2015 02:15:00', 101331, 103, 'LANDING', 0), -('04/15/2015 02:16:00', 101331, 103, 'WINE', 0), -('04/15/2015 02:17:00', 101331, 103, 'HELP', 0), -('04/15/2015 02:18:00', 101331, 103, 'WINE', 0), -('04/15/2015 02:19:00', 101331, 103, 'CHECKOUT', 16), -('04/15/2015 02:22:00', 101443, 104, 'BEER', 0), -('04/15/2015 02:25:00', 101443, 104, 'CHECKOUT', 12), -('04/15/2015 02:29:00', 101881, 105, 'LANDING', 0), -('04/15/2015 02:30:00', 101881, 105, 'BEER', 0), -('04/15/2015 01:05:00', 102201, 106, 'LANDING', 0), -('04/15/2015 01:06:00', 102201, 106, 'HELP', 0), -('04/15/2015 01:09:00', 102201, 106, 'LANDING', 0), -('04/15/2015 02:15:00', 102201, 107, 'WINE', 0), -('04/15/2015 02:16:00', 102201, 107, 'BEER', 0), -('04/15/2015 02:17:00', 102201, 107, 'WINE', 0), -('04/15/2015 02:18:00', 102871, 108, 'BEER', 0), -('04/15/2015 02:19:00', 102871, 108, 'WINE', 0), -('04/15/2015 02:22:00', 102871, 108, 'CHECKOUT', 21), -('04/15/2015 02:25:00', 102871, 108, 'LANDING', 0), -('04/15/2015 02:17:00', 103711, 109, 'BEER', 0), -('04/15/2015 02:18:00', 103711, 109, 'LANDING', 0), -('04/15/2015 02:19:00', 103711, 109, 'WINE', 0); - -- Calculate the revenue by checkout: - -SELECT {schema_madlib}.path( - 'eventlog', -- Name of input table - 'path_output', -- Table name to store path results - 'session_id', -- Partition input table by session - 'event_timestamp ASC', -- Order partitions in input table by time - 'buy:=page=''CHECKOUT''', -- Define a symbol for checkout events - '(buy)', -- Pattern search: purchase - 'sum(revenue) as checkout_rev', -- Aggregate: sum revenue by checkout - TRUE -- Persist matches - ); - -SELECT * FROM path_output ORDER BY session_id, match_id; - """.format(schema_madlib=schema_madlib) - if not message: return summary_string elif message.lower() in ('usage', 'help', '?'): return usage_string - elif message.lower() in ('example', 'examples'): - return example_string else: return """ No such option. Use "SELECT {schema_madlib}.path()" for help. http://git-wip-us.apache.org/repos/asf/madlib/blob/24a11c1e/src/ports/postgres/modules/utilities/sessionize.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/utilities/sessionize.py_in b/src/ports/postgres/modules/utilities/sessionize.py_in index ccd0a3e..278e1f8 100644 --- a/src/ports/postgres/modules/utilities/sessionize.py_in +++ b/src/ports/postgres/modules/utilities/sessionize.py_in @@ -131,15 +131,12 @@ def sessionize_help_message(schema_madlib, message, **kwargs): ----------------------------------------------------------------------------------- Functionality: Sessionize -The MADlib sessionize function performs time-oriented session reconstruction on a -data set comprising a sequence of events. A defined period of inactivity indicates +The MADlib sessionize function performs time-oriented session reconstruction on a +data set comprising a sequence of events. A defined period of inactivity indicates the end of one session and beginning of the next session. For more details on function usage: SELECT {schema_madlib}.sessionize('usage'); - -For a small example on using the function: - SELECT {schema_madlib}.sessionize('example'); """.format(schema_madlib=schema_madlib) usage_string = """ @@ -157,92 +154,17 @@ SELECT {schema_madlib}.sessionize( -- a session 'output_cols' -- str, An optional valid postgres SELECT expression for the -- output table/view (default *) - 'create_view' -- boolean, Optional parameter to specify if output is a + 'create_view' -- boolean, Optional parameter to specify if output is a -- view or materilized to a table (default True) ); """.format(schema_madlib=schema_madlib) - example_string = """ ------------------------------------------------------------------------------------ - EXAMPLE ------------------------------------------------------------------------------------ -- Create an input data set: - -DROP TABLE IF EXISTS eventlog; -CREATE TABLE eventlog (event_timestamp TIMESTAMP, - user_id INT, - page TEXT, - revenue FLOAT); -INSERT INTO eventlog VALUES -('04/15/2015 02:19:00', 101331, 'CHECKOUT', 16), -('04/15/2015 02:17:00', 202201, 'WINE', 0), -('04/15/2015 03:18:00', 202201, 'BEER', 0), -('04/15/2015 01:03:00', 100821, 'LANDING', 0), -('04/15/2015 01:04:00', 100821, 'WINE', 0), -('04/15/2015 01:05:00', 100821, 'CHECKOUT', 39), -('04/15/2015 02:06:00', 100821, 'WINE', 0), -('04/15/2015 02:09:00', 100821, 'WINE', 0), -('04/15/2015 02:15:00', 101331, 'LANDING', 0), -('04/15/2015 02:16:00', 101331, 'WINE', 0), -('04/15/2015 02:17:00', 101331, 'HELP', 0), -('04/15/2015 02:18:00', 101331, 'WINE', 0), -('04/15/2015 02:29:00', 201881, 'LANDING', 0), -('04/15/2015 02:30:00', 201881, 'BEER', 0), -('04/15/2015 01:05:00', 202201, 'LANDING', 0), -('04/15/2015 01:06:00', 202201, 'HELP', 0), -('04/15/2015 01:09:00', 202201, 'LANDING', 0), -('04/15/2015 02:15:00', 202201, 'WINE', 0), -('04/15/2015 02:16:00', 202201, 'BEER', 0), -('04/15/2015 03:19:00', 202201, 'WINE', 0), -('04/15/2015 03:22:00', 202201, 'CHECKOUT', 21); - -- Sessionize the table for each user_id, and obtain only the user_id, with partition -expression, event_timestamp and session_id: - -SELECT {schema_madlib}.sessionize( - 'eventlog', -- Name of input table - 'sessionize_output', -- Table name to store sessionized results - 'user_id', -- Partition input table by session - 'event_timestamp', -- Order partitions in input table by time - '0:30:0' -- Use 30 minute time out to define sessions - ); - -- View the output table containing the session IDs: - -SELECT * FROM sessionize_output; - -DROP VIEW sessionize_output; - -- Sessionize the table for each user_id, and materialize all columns from -source table into an output table: - -SELECT {schema_madlib}.sessionize( - 'eventlog', -- Name of input table - 'sessionize_output', -- Table name to store sessionized results - 'user_id < 200000', -- Partition input table by session - 'event_timestamp', -- Order partitions in input table by time - '180', -- Use 3 minutes (180 seconds) to define sessions - 'event_timestamp, user_id, user_id < 200000 AS "Department-A1"', - -- Select only the required columns, along with the - -- session id column that is selected by default - 'false' -- Materialize results into a table, and not a view - ); - -- View the output table containing the session IDs: - -SELECT * FROM sessionize_output WHERE "Department-A1"='TRUE'; - -DROP TABLE sessionize_output; - """.format(schema_madlib=schema_madlib) - help_string = summary_string if not message: return summary_string elif message.lower() in ('usage', 'help', '?'): return usage_string - elif message.lower() == 'example': - return example_string else: return """ No such option. Use "SELECT {schema_madlib}.sessionize()" for help.
