Repository: incubator-madlib Updated Branches: refs/heads/master fdf7b6d54 -> cc4ccd682
Sample: Minor changes to stratified sampling documentation. Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/cc4ccd68 Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/cc4ccd68 Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/cc4ccd68 Branch: refs/heads/master Commit: cc4ccd6828190237c9a1c9ea5094ae144c07b2e6 Parents: fdf7b6d Author: Orhan Kislal <okis...@pivotal.io> Authored: Wed Jun 28 16:23:44 2017 -0700 Committer: Orhan Kislal <okis...@pivotal.io> Committed: Wed Jun 28 16:23:44 2017 -0700 ---------------------------------------------------------------------- doc/mainpage.dox.in | 5 +- .../modules/sample/stratified_sample.sql_in | 140 +++++++++++-------- 2 files changed, 85 insertions(+), 60 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/cc4ccd68/doc/mainpage.dox.in ---------------------------------------------------------------------- diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in index 274426a..9922ed2 100644 --- a/doc/mainpage.dox.in +++ b/doc/mainpage.dox.in @@ -248,8 +248,11 @@ complete matrix stored as a distributed table. @defgroup grp_pmml PMML Export @ingroup grp_utility_functions - @defgroup grp_strs Stratified Sampling + @defgroup grp_sampling Sampling @ingroup grp_utility_functions + @{A collection of methods for sampling from a population. @} + @defgroup grp_strs Stratified Sampling + @ingroup grp_sampling @defgroup grp_sessionize Sessionize @ingroup grp_utility_functions http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/cc4ccd68/src/ports/postgres/modules/sample/stratified_sample.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/sample/stratified_sample.sql_in b/src/ports/postgres/modules/sample/stratified_sample.sql_in index 7327c26..c762115 100644 --- a/src/ports/postgres/modules/sample/stratified_sample.sql_in +++ b/src/ports/postgres/modules/sample/stratified_sample.sql_in @@ -37,67 +37,77 @@ m4_include(`SQLCommon.m4') <div class="toc"><b>Contents</b> <ul> <li><a href="#strs">Stratified Sampling</a></li> -<li><a href="#notes">Notes</a></li> <li><a href="#examples">Examples</a></li> </ul> </div> -@brief Provides stratified sampling function. +@brief A method for independently sampling subpopulations (strata). -Given a table, stratified sampling returns a proportion of records for -each group (strata). It is possible to use with or without replacement -sampling methods, specify a set of target columns, and assume the -whole table is a single strata. +Stratified sampling is a method for independently sampling +subpopulations (strata). It is commonly used to reduce +sampling error by ensuring that subgroups are adequately +represented in the sample. @anchor strs @par Stratified Sampling <pre class="syntax"> -stratified_sample( source_table, - output_table, - proportion, - grouping_cols, - target_cols, - with_replacement - ) +stratified_sample( source_table, + output_table, + proportion, + grouping_cols, + target_cols, + with_replacement + ) </pre> \b Arguments <dl class="arglist"> <dt>source_table</dt> -<dd>TEXT. Name of the table containing the input data. Must contain the -columns specified in the 'groupinhg_cols' and target_cols' parameters -below.</dd> +<dd>TEXT. Name of the table containing the input data.</dd> + <dt>output_table</dt> -<dd>TEXT. Name of the table to store the sampled records.</dd> +<dd>TEXT. Name of output table that contains the sampled data. +The output table contains all columns present in the source +table unless otherwise specified in the 'target_cols' +parameter below.</dd> + <dt>proportion</dt> -<dd>FLOAT8. The ratio of sample size to the number of records.</dd> -<dt>grouping_cols</dt> -<dd>TEXT. (Default: NULL) The columns to distinguish each strata.</dd> -<dt>target_cols</dt> -<dd>TEXT. (Default: NULL) The columns to include in the output.</dd> -<dt>with_replacement</dt> -<dd>BOOLEAN. (Default: FALSE) The sampling method.</dd> -</dl> -@anchor notes -@par Notes +<dd>FLOAT8 in the range (0,1). Each stratum is sampled +independently.</dd> -If grouping_cols is NULL, the whole table is treated as a single group and -sampled accordingly. +<dt>grouping_cols (optional)</dt> +<dd>TEXT, default: NULL. A single column or a list of +comma-separated columns that defines the strata. When this +parameter is NULL, no grouping is used so the sampling is +non-stratified, that is, the whole table is treated as a +single group.</dd> -If target_cols is NULL or '*', all of the columns will be included in the -output table. +<dt>target_cols (optional)</dt> +<dd>TEXT, default NULL. A comma-separated list of columns +to appear in the 'output_table'. If NULL or '*', all +columns from the 'source_table' will appear in +the 'output_table'.</dd> -If with_replacement is TRUE, each sample is independent (the same row may -be selected in the sample set more than once). Else (if with_replacement -is FALSE), a row can be selected at most once. -); +@anchor note +@note + Do not include 'grouping_cols' in the parameter 'target_cols', + because they are always included in the 'output_table'. + +<dt>with_replacement (optional)</dt> +<dd>BOOLEAN, default FALSE. Determines whether to sample +with replacement or without replacement (default). +With replacement means that it is possible that the +same row may appear in the sample set more than once. +Without replacement means a given row can be selected +only once.</dd> +</dl> @anchor examples @par Examples -Please note that, due to the random nature of sampling, the output records -might differ. +Please note that due to the random nature of sampling, your +results may look different from those below. -# Create an input table: <pre class="syntax"> @@ -136,10 +146,16 @@ INSERT INTO test VALUES (70,70,2,2); </pre> --# Sample without replacement +-# Sample without replacement: <pre class="syntax"> DROP TABLE IF EXISTS out; -SELECT madlib.stratified_sample('test', 'out', 0.5, 'gr1,gr2', 'id1,id2', FALSE); +SELECT madlib.stratified_sample( + 'test', -- Source table + 'out', -- Output table + 0.5, -- Sample proportion + 'gr1,gr2', -- Strata definition + 'id1,id2', -- Columns to output + FALSE); -- Sample without replacement SELECT * FROM out ORDER BY gr1,gr2,id1,id2; </pre> <pre class="result"> @@ -161,10 +177,16 @@ SELECT * FROM out ORDER BY gr1,gr2,id1,id2; (13 rows) </pre> --# Sample with replacement +-# Sample with replacement: <pre class="syntax"> DROP TABLE IF EXISTS out; -SELECT madlib.stratified_sample('test', 'out', 0.5, 'gr1,gr2', 'id1,id2', TRUE); +SELECT madlib.stratified_sample( + 'test', -- Source table + 'out', -- Output table + 0.5, -- Sample proportion + 'gr1,gr2', -- Strata definition + 'id1,id2', -- Columns to output + TRUE); -- Sample with replacement SELECT * FROM out ORDER BY gr1,gr2,id1,id2; </pre> <pre class="result"> @@ -187,12 +209,12 @@ SELECT * FROM out ORDER BY gr1,gr2,id1,id2; */ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample( - source_table TEXT, - output_table TEXT, - proportion FLOAT8, - grouping_cols TEXT, - target_cols TEXT, - with_replacement BOOLEAN + source_table TEXT, + output_table TEXT, + proportion FLOAT8, + grouping_cols TEXT, + target_cols TEXT, + with_replacement BOOLEAN ) RETURNS VOID AS $$ PythonFunction(sample, stratified_sample, stratified_sample) $$ LANGUAGE plpythonu VOLATILE @@ -201,28 +223,28 @@ m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); ------------------------------------------------------------------------------- CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample( - source_table TEXT, - output_table TEXT, - proportion FLOAT8, - grouping_cols TEXT, - target_cols TEXT + source_table TEXT, + output_table TEXT, + proportion FLOAT8, + grouping_cols TEXT, + target_cols TEXT ) RETURNS VOID AS $$ SELECT MADLIB_SCHEMA.stratified_sample($1, $2, $3, $4, $5, FALSE); $$ LANGUAGE sql VOLATILE m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample( - source_table TEXT, - output_table TEXT, - proportion FLOAT8, - grouping_cols TEXT + source_table TEXT, + output_table TEXT, + proportion FLOAT8, + grouping_cols TEXT ) RETURNS VOID AS $$ SELECT MADLIB_SCHEMA.stratified_sample($1, $2, $3, $4, NULL, FALSE); $$ LANGUAGE sql VOLATILE m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample( - source_table TEXT, - output_table TEXT, - proportion FLOAT8 + source_table TEXT, + output_table TEXT, + proportion FLOAT8 ) RETURNS VOID AS $$ SELECT MADLIB_SCHEMA.stratified_sample($1, $2, $3, NULL, NULL, FALSE); $$ LANGUAGE sql VOLATILE