Repository: incubator-madlib
Updated Branches:
  refs/heads/master e6a21d94c -> d5d5a26b4


Change test_train_split to train_test_split

Closes #174


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/2308bef5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/2308bef5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/2308bef5

Branch: refs/heads/master
Commit: 2308bef59d08a9b492e59db202231e2d1e032c9b
Parents: e6a21d9
Author: Orhan Kislal <okis...@pivotal.io>
Authored: Fri Aug 18 16:22:05 2017 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Fri Aug 18 20:51:30 2017 -0700

----------------------------------------------------------------------
 doc/mainpage.dox.in                             |   2 +-
 .../modules/sample/test/test_train_split.sql_in |  85 -----
 .../modules/sample/test/train_test_split.sql_in |  89 +++++
 .../modules/sample/test_train_split.py_in       | 319 -----------------
 .../modules/sample/test_train_split.sql_in      | 319 -----------------
 .../modules/sample/train_test_split.py_in       | 319 +++++++++++++++++
 .../modules/sample/train_test_split.sql_in      | 346 +++++++++++++++++++
 7 files changed, 755 insertions(+), 724 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/2308bef5/doc/mainpage.dox.in
----------------------------------------------------------------------
diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index 10c1f76..3706305 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -149,7 +149,7 @@ Contains graph algorithms.
     @ingroup grp_mdl
     @defgroup grp_pred Prediction Metrics
     @ingroup grp_mdl
-    @defgroup grp_test_train_split Test Train Split
+    @defgroup grp_train_test_split Train-Test Split
     @ingroup grp_mdl
 
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/2308bef5/src/ports/postgres/modules/sample/test/test_train_split.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/sample/test/test_train_split.sql_in 
b/src/ports/postgres/modules/sample/test/test_train_split.sql_in
deleted file mode 100644
index 5ae0ade..0000000
--- a/src/ports/postgres/modules/sample/test/test_train_split.sql_in
+++ /dev/null
@@ -1,85 +0,0 @@
-/* ----------------------------------------------------------------------- 
*//**
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- *//* ----------------------------------------------------------------------- 
*/
-
-DROP TABLE IF EXISTS test;
-
-CREATE TABLE test(
-    id1 INTEGER,
-    id2 INTEGER,
-    gr1 INTEGER,
-    gr2 INTEGER
-);
-
-INSERT INTO test VALUES
-(1,0,1,1),
-(2,0,1,1),
-(3,0,1,1),
-(4,0,1,1),
-(5,0,1,1),
-(6,0,1,1),
-(7,0,1,1),
-(8,0,1,1),
-(9,0,1,1),
-(9,0,1,1),
-(9,0,1,1),
-(9,0,1,1),
-(0,1,1,2),
-(0,2,1,2),
-(0,3,1,2),
-(0,4,1,2),
-(0,5,1,2),
-(0,6,1,2),
-(10,10,2,2),
-(20,20,2,2)
-;
-
-SELECT setseed(0);
-
-DROP TABLE IF EXISTS out_train,out_test,out;
-SELECT test_train_split('test', 'out', 0.1, 0.2, NULL, 'id1,id2,gr1,gr2', 
FALSE, TRUE);
-SELECT assert(count(*) = 2, 'Wrong number of samples') FROM out_train;
-SELECT assert(count(*) = 4, 'Wrong number of samples') FROM out_test;
-
-DROP TABLE IF EXISTS out_train,out_test,out;
-SELECT test_train_split('test', 'out', 0.1, 0.2, NULL, 'id1,id2,gr1,gr2', 
FALSE, FALSE);
-SELECT assert(count(*) = 2, 'Wrong number of samples') FROM out WHERE split=1;
-SELECT assert(count(*) = 4, 'Wrong number of samples') FROM out WHERE split=0;
-
-
-DROP TABLE IF EXISTS out_train,out_test,out;
-SELECT test_train_split('test', 'out', 0.5, 0.5, NULL, 'id1,id2,gr1,gr2', 
TRUE, FALSE);
-SELECT assert(count(*) = 20, 'Wrong number of samples') FROM out;
-
-DROP TABLE IF EXISTS out;
-SELECT test_train_split('test', 'out', 0.5, 0.5, 'gr1,gr2', 'id1,id2', TRUE, 
FALSE);
-select * from out;
-SELECT assert(count(*) = 6, 'Wrong number of samples')
-FROM out WHERE gr1 = 1 AND gr2 = 1 AND split = 0;
-SELECT assert(count(*) = 6, 'Wrong number of samples')
-FROM out WHERE gr1 = 1 AND gr2 = 1 AND split = 1;
-SELECT assert(count(*) = 3, 'Wrong number of samples')
-FROM out WHERE gr1 = 1 AND gr2 = 2 AND split = 0;
-SELECT assert(count(*) = 3, 'Wrong number of samples')
-FROM out WHERE gr1 = 1 AND gr2 = 2 AND split = 1;
-SELECT assert(count(*) = 1, 'Wrong number of samples')
-FROM out WHERE gr1 = 2 AND gr2 = 2 AND split = 0;
-SELECT assert(count(*) = 1, 'Wrong number of samples')
-FROM out WHERE gr1 = 2 AND gr2 = 2 AND split = 1;

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/2308bef5/src/ports/postgres/modules/sample/test/train_test_split.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/sample/test/train_test_split.sql_in 
b/src/ports/postgres/modules/sample/test/train_test_split.sql_in
new file mode 100644
index 0000000..9c15490
--- /dev/null
+++ b/src/ports/postgres/modules/sample/test/train_test_split.sql_in
@@ -0,0 +1,89 @@
+/* ----------------------------------------------------------------------- 
*//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *//* ----------------------------------------------------------------------- 
*/
+
+DROP TABLE IF EXISTS test;
+
+CREATE TABLE test(
+    id1 INTEGER,
+    id2 INTEGER,
+    gr1 INTEGER,
+    gr2 INTEGER
+);
+
+INSERT INTO test VALUES
+(1,0,1,1),
+(2,0,1,1),
+(3,0,1,1),
+(4,0,1,1),
+(5,0,1,1),
+(6,0,1,1),
+(7,0,1,1),
+(8,0,1,1),
+(9,0,1,1),
+(9,0,1,1),
+(9,0,1,1),
+(9,0,1,1),
+(0,1,1,2),
+(0,2,1,2),
+(0,3,1,2),
+(0,4,1,2),
+(0,5,1,2),
+(0,6,1,2),
+(10,10,2,2),
+(20,20,2,2)
+;
+
+SELECT setseed(0);
+
+DROP TABLE IF EXISTS out_train,out_test,out;
+SELECT train_test_split('test', 'out', 0.1, 0.2, NULL, 'id1,id2,gr1,gr2', 
FALSE, TRUE);
+SELECT assert(count(*) = 2, 'Wrong number of samples') FROM out_train;
+SELECT assert(count(*) = 4, 'Wrong number of samples') FROM out_test;
+
+DROP TABLE IF EXISTS out_train,out_test,out;
+SELECT train_test_split('test', 'out', 0.1, 0.2, NULL, 'id1,id2,gr1,gr2', 
FALSE, FALSE);
+SELECT assert(count(*) = 2, 'Wrong number of samples') FROM out WHERE split=1;
+SELECT assert(count(*) = 4, 'Wrong number of samples') FROM out WHERE split=0;
+
+
+DROP TABLE IF EXISTS out_train,out_test,out;
+SELECT train_test_split('test', 'out', 0.5, 0.5, NULL, 'id1,id2,gr1,gr2', 
TRUE, FALSE);
+SELECT assert(count(*) = 20, 'Wrong number of samples') FROM out;
+
+DROP TABLE IF EXISTS out_train,out_test,out;
+SELECT train_test_split('test', 'out', 0.5);
+SELECT assert(count(*) = 20, 'Wrong number of samples') FROM out;
+
+DROP TABLE IF EXISTS out;
+SELECT train_test_split('test', 'out', 0.5, 0.5, 'gr1,gr2', 'id1,id2', TRUE, 
FALSE);
+select * from out;
+SELECT assert(count(*) = 6, 'Wrong number of samples')
+FROM out WHERE gr1 = 1 AND gr2 = 1 AND split = 0;
+SELECT assert(count(*) = 6, 'Wrong number of samples')
+FROM out WHERE gr1 = 1 AND gr2 = 1 AND split = 1;
+SELECT assert(count(*) = 3, 'Wrong number of samples')
+FROM out WHERE gr1 = 1 AND gr2 = 2 AND split = 0;
+SELECT assert(count(*) = 3, 'Wrong number of samples')
+FROM out WHERE gr1 = 1 AND gr2 = 2 AND split = 1;
+SELECT assert(count(*) = 1, 'Wrong number of samples')
+FROM out WHERE gr1 = 2 AND gr2 = 2 AND split = 0;
+SELECT assert(count(*) = 1, 'Wrong number of samples')
+FROM out WHERE gr1 = 2 AND gr2 = 2 AND split = 1;

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/2308bef5/src/ports/postgres/modules/sample/test_train_split.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/sample/test_train_split.py_in 
b/src/ports/postgres/modules/sample/test_train_split.py_in
deleted file mode 100644
index 6056b2a..0000000
--- a/src/ports/postgres/modules/sample/test_train_split.py_in
+++ /dev/null
@@ -1,319 +0,0 @@
-# coding=utf-8
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import plpy
-from utilities.control import MinWarning
-from utilities.utilities import _assert
-from utilities.utilities import extract_keyvalue_params
-from utilities.utilities import add_postfix
-from utilities.utilities import unique_string
-from utilities.utilities import split_quoted_delimited_str
-from utilities.validate_args import table_exists
-from utilities.validate_args import columns_exist_in_table
-from utilities.validate_args import table_is_empty
-from utilities.validate_args import get_expr_type
-from utilities.validate_args import get_cols
-from graph.graph_utils import _check_groups
-from graph.graph_utils import _grp_from_table
-
-m4_changequote(` <!', `!>')
-
-
-def _get_sql_string(str):
-    if str:
-        return "'" + str + "'"
-    return "NULL"
-
-
-def test_train_split(schema_madlib, source_table, output_table, 
train_proportion,
-                     test_proportion, grouping_cols, target_cols, 
with_replacement,
-                     separate_output_tables, **kwargs):
-    """
-    test train split function
-    Args:
-        @param source_table           Input table name.
-        @param output_table           Output table name.
-        @param train_proportion       The ratio of training data to the entire
-                                      input table
-        @param test_proportion        The ratio of test data to the entire
-                                      input table
-        @param grouping_cols          (Default: NULL) The columns to 
distinguish
-                                      each strata.
-        @param target_cols            (Default: NULL) The columns to include in
-                                      the output.
-        @param with_replacement       (Default: FALSE) The sampling method.
-        @param separate_output_tables (Default: FALSE) Create two output 
tables,
-                                      <output_table>_train and 
<output_table>_test.
-                                      Otherwise one output table is created 
with
-                                      and additional column 'split' which 
takes the
-                                      value 0 for test and 1 for training.
-
-    """
-    with MinWarning("warning"):
-        if test_proportion is None:
-            test_proportion = 1 - train_proportion
-        validate_strs(source_table, output_table, train_proportion, 
test_proportion,
-                      split_quoted_delimited_str(grouping_cols), target_cols,
-                      with_replacement)
-        grouping_cols = _get_sql_string(grouping_cols)
-        target_cols = _get_sql_string(target_cols)
-        with_replacement = with_replacement or "False"
-        strat_query = """
-        SELECT {schema_madlib}.stratified_sample(
-            '{strat_source_table}',
-            '{strat_out_table}',
-            '{strat_proportion}',
-            {strat_grouping_cols},
-            {strat_target_cols},
-            {strat_with_replacement}
-        )
-        """
-        strat_out_table = unique_string()
-        q = strat_query.format(
-            schema_madlib=schema_madlib,
-            strat_source_table=source_table,
-            strat_out_table=strat_out_table,
-            strat_proportion=train_proportion + test_proportion,
-            strat_grouping_cols=grouping_cols,
-            strat_with_replacement=with_replacement,
-            strat_target_cols=target_cols
-        )
-        plpy.execute(q)
-        test_table = add_postfix(output_table, "_test")
-        train_table = add_postfix(output_table, "_train")
-        if not separate_output_tables:
-            test_table = unique_string()
-            train_table = unique_string()
-        test_query = strat_query.format(
-            schema_madlib=schema_madlib,
-            strat_source_table=strat_out_table,
-            strat_out_table=test_table,
-            strat_proportion=(test_proportion /
-                              (train_proportion + test_proportion)),
-            strat_grouping_cols=grouping_cols,
-            strat_with_replacement=False,
-            strat_target_cols=target_cols
-        )
-        plpy.execute(test_query)
-        train_query = """
-        CREATE TABLE {train_table} AS
-        SELECT * FROM {strat_out_table}
-        EXCEPT ALL
-        SELECT * FROM {test_table}
-        """.format(train_table=train_table,
-                   strat_out_table=strat_out_table,
-                   test_table=test_table)
-        plpy.execute(train_query)
-        clean_up_tables = [strat_out_table]
-        if not separate_output_tables:
-            union_query = """
-            CREATE TABLE {output_table} AS
-            SELECT *,0 AS split FROM {test_table}
-            UNION ALL
-            SELECT *,1 AS split FROM {train_table}
-            """.format(output_table=output_table,
-                       test_table=test_table,
-                       train_table=train_table)
-            plpy.execute(union_query)
-            clean_up_tables += [train_table, test_table]
-        clean_up_query = """
-        DROP TABLE IF EXISTS {clean_up_tables}
-        """.format(clean_up_tables=",".join(clean_up_tables))
-        plpy.execute(clean_up_query)
-    return
-
-
-def validate_strs(source_table, output_table, train_proportion, 
test_proportion, glist, target_cols, with_replacement):
-
-    _assert(output_table and output_table.strip().lower() not in ('null', ''),
-            "Sample: Invalid output table name 
{output_table}!".format(**locals()))
-    _assert(not table_exists(output_table),
-            "Sample: Output table already exists!".format(**locals()))
-
-    _assert(source_table and source_table.strip().lower() not in ('null', ''),
-            "Sample: Invalid Source table name!".format(**locals()))
-    _assert(table_exists(source_table),
-            "Sample: Source table ({source_table}) is 
missing!".format(**locals()))
-    _assert(not table_is_empty(source_table),
-            "Sample: Source table ({source_table}) is 
empty!".format(**locals()))
-
-    for proportion in [train_proportion, test_proportion]:
-        _assert(proportion > 0 and proportion < 1,
-                "Sample: Proportions aren't in the range (0,1)!")
-    if not with_replacement:
-        _assert(train_proportion + test_proportion <= 1,
-                "Sample: Proportions add up to greater than 1!")
-
-    if glist is not None:
-        _assert(columns_exist_in_table(source_table, glist),
-                ("""Sample: Not all columns from {glist} are present in 
source""" +
-                 """ table ({source_table}).""").format(**locals()))
-
-    if not (target_cols is None or target_cols is '*'):
-        tlist = split_quoted_delimited_str(target_cols)
-        _assert(columns_exist_in_table(source_table, tlist),
-                ("""Sample: Not all columns from {target_cols} are present 
in""" +
-                 """ edge table ({source_table})""").format(**locals()))
-    return
-
-
-def test_train_split_help(schema_madlib, message, **kwargs):
-    """
-    Help function for test_train_split
-
-    Args:
-        @param schema_madlib
-        @param message: string, Help message string
-        @param kwargs
-
-    Returns:
-        String. Help/usage information
-    """
-    if not message:
-        help_string = """
------------------------------------------------------------------------
-                            SUMMARY
------------------------------------------------------------------------
-
-Given a table, test_train_split returns a random sample of the
-table for testing and training. It is possible to use with or without
-replacement sampling methods, specify a set of target columns, and a
-set of grouping columns, in which case, stratified sampling will be
-performed.
-
-For more details on function usage:
-    SELECT {schema_madlib}.test_train_split('usage');
-    SELECT {schema_madlib}.test_train_split('example');
-            """
-    elif message.lower() in ['usage', 'help', '?']:
-        help_string = """
-
-Given a table, test train split returns a proportion of records for
-each group (strata). It is possible to use with or without replacement
-sampling methods, specify a set of target columns, and assume the
-whole table is a single strata.
-
-----------------------------------------------------------------------------
-                            USAGE
-----------------------------------------------------------------------------
-
- SELECT {schema_madlib}.test_train_split(
-    source_table     TEXT,    -- Name of the table containing the input data.
-    output_table     TEXT,    -- Output table name.
-    train_proportion FLOAT8,  -- The ratio of train sample size to the
-                              -- number of records.
-    test_proportion  FLOAT8,  -- The ratio of test sample size to the
-                              -- number of records.
-    grouping_cols    TEXT     -- (Default: NULL) The columns to distinguish
-                              -- each strata.
-    target_cols      TEXT,    -- (Default: NULL) The columns to include in
-                              -- the output.
-    with_replacement BOOLEAN  -- (Default: FALSE) The sampling method.
-    separate_output_tables
-                     BOOLEAN  -- (Default: FALSE) Separate the output table
-                              -- into $output_table$_train and
-                              -- $output_table$_test, otherwise, the split
-                              -- column in output_table will identify 1 for
-                              -- train set and 0 for test set.
-
-If grouping_cols is NULL, the whole table is treated as a single group and
-sampled accordingly.
-
-If target_cols is NULL or '*', all of the columns will be included in the
-output table.
-
-If with_replacement is TRUE, each sample is independent (the same row may
-be selected in the sample set more than once). Else (if with_replacement
-is FALSE), a row can be selected at most once.
-);
-"""
-    elif message.lower() in ("example", "examples"):
-        help_string = """
-----------------------------------------------------------------------------
-                                EXAMPLES
-----------------------------------------------------------------------------
-
--- Create an input table
-DROP TABLE IF EXISTS test;
-
-CREATE TABLE test(
-    id1 INTEGER,
-    id2 INTEGER,
-    gr1 INTEGER,
-    gr2 INTEGER
-);
-
-INSERT INTO test VALUES
-(1,0,1,1),
-(2,0,1,1),
-(3,0,1,1),
-(4,0,1,1),
-(5,0,1,1),
-(6,0,1,1),
-(7,0,1,1),
-(8,0,1,1),
-(9,0,1,1),
-(9,0,1,1),
-(9,0,1,1),
-(9,0,1,1),
-(0,1,1,2),
-(0,2,1,2),
-(0,3,1,2),
-(0,4,1,2),
-(0,5,1,2),
-(0,6,1,2),
-(10,10,2,2),
-(20,20,2,2),
-(30,30,2,2),
-(40,40,2,2),
-(50,50,2,2),
-(60,60,2,2),
-(70,70,2,2)
-;
-
--- Sample without replacement
-DROP TABLE IF EXISTS out;
-SELECT madlib.test_train_split(
-                                'test',    -- Source table
-                                'out',     -- Output table
-                                0.5,       -- Sample proportion
-                                0.5,       -- Sample proportion
-                                'gr1,gr2', -- Strata definition
-                                'id1,id2', -- Columns to output
-                                FALSE,     -- Sample without replacement
-                                FALSE);    -- Do not separate output tables
-SELECT * FROM out ORDER BY split,gr1,gr2,id1,id2;
-
--- Sample with replacement
-DROP TABLE IF EXISTS out_train, out_test;
-SELECT madlib.test_train_split(
-                                'test',    -- Source table
-                                'out',     -- Output table
-                                0.5,       -- train_proportion
-                                NULL,      -- Default = 1 - train_proportion = 
0.5
-                                'gr1,gr2', -- Strata definition
-                                'id1,id2', -- Columns to output
-                                TRUE,      -- Sample with replacement
-                                TRUE);     -- Separate output tables
-SELECT * FROM out_train ORDER BY gr1,gr2,id1,id2;
-"""
-    else:
-        help_string = "No such option. Use {schema_madlib}.graph_sssp()"
-
-    return help_string.format(schema_madlib=schema_madlib)

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/2308bef5/src/ports/postgres/modules/sample/test_train_split.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/sample/test_train_split.sql_in 
b/src/ports/postgres/modules/sample/test_train_split.sql_in
deleted file mode 100644
index ba1adb3..0000000
--- a/src/ports/postgres/modules/sample/test_train_split.sql_in
+++ /dev/null
@@ -1,319 +0,0 @@
-/* ----------------------------------------------------------------------- 
*//**
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- *
- * @file test_train_split.sql_in
- *
- * @brief SQL functions for test train split.
- * @date 07/19/2017
- *
- * @sa Given a table, test train split returns a proportion of records
- * for each group (strata).
- *
- *//* ----------------------------------------------------------------------- 
*/
-
-m4_include(`SQLCommon.m4')
-
-
-/**
-@addtogroup grp_test_train_split
-
-<div class="toc"><b>Contents</b>
-<ul>
-<li><a href="#strs">test train split</a></li>
-<li><a href="#examples">Examples</a></li>
-</ul>
-</div>
-
-@brief A method for independently sampling subpopulations (strata).
-
-test_train_split is a utility to create test and
-training data set as subsets of a single table.
-
-@anchor strs
-@par test train split
-
-<pre class="syntax">
-test_train_split(   source_table,
-                    output_table,
-                    train_proportion,
-                    test_proportion,
-                    grouping_cols,
-                    target_cols,
-                    with_replacement
-                )
-</pre>
-
-\b Arguments
-<dl class="arglist">
-<dt>source_table</dt>
-<dd>TEXT. Name of the table containing the input data.</dd>
-
-<dt>output_table</dt>
-<dd>Name of output table.   A new INTEGER column on the right
-called 'split' will identify 1 for train set and 0 for test set,
-unless the 'separate_output_tables' parameter below is TRUE,
-in which case two output tables will be created using
-the 'output_table' name with the suffixes '_train' and '_test'.
-The output table contains all the  columns present in the source
-table unless otherwise specified  in the 'target_cols' parameter below. </dd>
-
-<dt>train_proportion</dt>
-<dd>FLOAT8 in the range (0,1).  Proportion of the dataset to include
-in the train split.  If the 'grouping_col' parameter is specified below,
-each group will be sampled independently using the
-train proportion, i.e., in a stratified fashion.</dd>
-
-<dt>test_proportion</dt>
-<dd>FLOAT8 in the range (0,1).  Proportion of the dataset to include
-in the test split.  Default is the complement to the train
-proportion (1-'train_proportion').  If the 'grouping_col'
-parameter is specified below,  each group will be sampled
-independently using the  train proportion,
-i.e., in a stratified fashion.</dd>
-
-<dt>grouping_cols (optional)</dt>
-<dd>TEXT, default: NULL. A single column or a list of comma-separated columns
- that defines how to stratify.  When this parameter is NULL,
-the train-test split is not stratified.</dd>
-
-<dt>target_cols (optional)</dt>
-<dd>TEXT, default NULL. A comma-separated list of columns
-to appear in the 'output_table'.  If NULL or '*', all
-columns from the 'source_table'  will appear in
-the 'output_table'.</dd>
-
-@anchor note
-@note
-        Do not include 'grouping_cols' in the parameter 'target_cols',
-        because they are always included in the 'output_table'.
-
-<dt>with_replacement (optional)</dt>
-<dd>BOOLEAN, default FALSE.  Determines whether to sample
-with replacement or without replacement (default).
-With replacement means that it is possible that the
-same row may appear in the sample set more than once.
-Without replacement means a given row can be selected
-only once.</dd>
-</dl>
-
-<dt>separate_output_tables (optional)</dt>
-<dd>BOOLEAN, default FALSE.  If TRUE, two output tables will be created using
-the 'output_table' name with the suffixes '_train' and '_test'.</dd>
-</dl>
-
-
-@anchor examples
-@par Examples
-
-Please note that due to the random nature of sampling, your
-results may look different from those below.
-
--# Create an input table:
-<pre class="syntax">
-DROP TABLE IF EXISTS test;
-CREATE TABLE test(
-    id1 INTEGER,
-    id2 INTEGER,
-    gr1 INTEGER,
-    gr2 INTEGER
-);
-INSERT INTO test VALUES
-(1,0,1,1),
-(2,0,1,1),
-(3,0,1,1),
-(4,0,1,1),
-(5,0,1,1),
-(6,0,1,1),
-(7,0,1,1),
-(8,0,1,1),
-(9,0,1,1),
-(9,0,1,1),
-(9,0,1,1),
-(9,0,1,1),
-(0,1,1,2),
-(0,2,1,2),
-(0,3,1,2),
-(0,4,1,2),
-(0,5,1,2),
-(0,6,1,2),
-(10,10,2,2),
-(20,20,2,2),
-(30,30,2,2),
-(40,40,2,2),
-(50,50,2,2),
-(60,60,2,2),
-(70,70,2,2);
-</pre>
-
--# Sample without replacement:
-<pre class="syntax">
-DROP TABLE IF EXISTS out;
-SELECT madlib.test_train_split(
-                                'test',    -- Source table
-                                'out',     -- Output table
-                                0.5,       -- Sample proportion
-                                0.5,       -- Sample proportion
-                                'gr1,gr2', -- Strata definition
-                                'id1,id2', -- Columns to output
-                                FALSE,     -- Sample without replacement
-                                FALSE);    -- Do not separate output tables
-SELECT * FROM out ORDER BY split,gr1,gr2,id1,id2;
-</pre>
-<pre class="result">
- gr1 | gr2 | id1 | id2 | split
------+-----+-----+-----+-------
-   1 |   1 |   1 |   0 |     0
-   1 |   1 |   4 |   0 |     0
-   1 |   1 |   6 |   0 |     0
-   1 |   1 |   9 |   0 |     0
-   1 |   1 |   9 |   0 |     0
-   1 |   1 |   9 |   0 |     0
-   1 |   2 |   0 |   3 |     0
-   1 |   2 |   0 |   4 |     0
-   1 |   2 |   0 |   5 |     0
-   2 |   2 |  10 |  10 |     0
-   2 |   2 |  30 |  30 |     0
-   2 |   2 |  40 |  40 |     0
-   2 |   2 |  60 |  60 |     0
-   1 |   1 |   2 |   0 |     1
-   1 |   1 |   3 |   0 |     1
-   1 |   1 |   5 |   0 |     1
-   1 |   1 |   7 |   0 |     1
-   1 |   1 |   8 |   0 |     1
-   1 |   1 |   9 |   0 |     1
-   1 |   2 |   0 |   1 |     1
-   1 |   2 |   0 |   2 |     1
-   1 |   2 |   0 |   6 |     1
-   2 |   2 |  20 |  20 |     1
-   2 |   2 |  50 |  50 |     1
-   2 |   2 |  70 |  70 |     1
-(25 rows)
-</pre>
-
--# Sample with replacement:
-<pre class="syntax">
-DROP TABLE IF EXISTS out_train, out_test;
-SELECT madlib.test_train_split(
-                                'test',    -- Source table
-                                'out',     -- Output table
-                                0.5,       -- train_proportion
-                                NULL,      -- Default = 1 - train_proportion = 
0.5
-                                'gr1,gr2', -- Strata definition
-                                'id1,id2', -- Columns to output
-                                TRUE,      -- Sample with replacement
-                                TRUE);     -- Separate output tables
-SELECT * FROM out_train ORDER BY gr1,gr2,id1,id2;
-</pre>
-<pre class="result">
- gr1 | gr2 | id1 | id2
------+-----+-----+-----
-   1 |   1 |   1 |   0
-   1 |   1 |   2 |   0
-   1 |   1 |   4 |   0
-   1 |   1 |   7 |   0
-   1 |   1 |   8 |   0
-   1 |   1 |   9 |   0
-   1 |   2 |   0 |   4
-   1 |   2 |   0 |   5
-   1 |   2 |   0 |   6
-   2 |   2 |  40 |  40
-   2 |   2 |  50 |  50
-   2 |   2 |  50 |  50
-(12 rows)
-</pre>
-*/
-
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.test_train_split(
-  source_table           TEXT,
-  output_table           TEXT,
-  train_proportion       FLOAT8,
-  test_proportion        FLOAT8,
-  grouping_cols          TEXT,
-  target_cols            TEXT,
-  with_replacement       BOOLEAN,
-  separate_output_tables BOOLEAN
-) RETURNS VOID AS $$
-    PythonFunction(sample, test_train_split, test_train_split)
-$$ LANGUAGE plpythonu VOLATILE
-m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
-
--------------------------------------------------------------------------------
-
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.test_train_split(
-  source_table       TEXT,
-  output_table       TEXT,
-  train_proportion   FLOAT8,
-  test_proportion    FLOAT8,
-  grouping_cols      TEXT,
-  target_cols        TEXT,
-  with_replacement   BOOLEAN
-) RETURNS VOID AS $$
-     SELECT MADLIB_SCHEMA.test_train_split($1, $2, $3, $4, $5, $6, $7, FALSE);
-$$ LANGUAGE sql VOLATILE
-m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.test_train_split(
-  source_table       TEXT,
-  output_table       TEXT,
-  train_proportion   FLOAT8,
-  test_proportion    FLOAT8,
-  grouping_cols      TEXT,
-  target_cols        TEXT
-) RETURNS VOID AS $$
-     SELECT MADLIB_SCHEMA.test_train_split($1, $2, $3, $4, $5, $6, FALSE);
-$$ LANGUAGE sql VOLATILE
-m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.test_train_split(
-  source_table       TEXT,
-  output_table       TEXT,
-  train_proportion   FLOAT8,
-  test_proportion    FLOAT8,
-  grouping_cols      TEXT
-) RETURNS VOID AS $$
-     SELECT MADLIB_SCHEMA.test_train_split($1, $2, $3, $4, $5, NULL);
-$$ LANGUAGE sql VOLATILE
-m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.test_train_split(
-  source_table       TEXT,
-  output_table       TEXT,
-  train_proportion   FLOAT8,
-  test_proportion    FLOAT8
-) RETURNS VOID AS $$
-     SELECT MADLIB_SCHEMA.test_train_split($1, $2, $3, $4, NULL);
-$$ LANGUAGE sql VOLATILE
-m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
-
--------------------------------------------------------------------------------
-
--- Online help
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.test_train_split(
-    message VARCHAR
-) RETURNS VARCHAR AS $$
-    PythonFunction(sample, test_train_split, test_train_split_help)
-$$ LANGUAGE plpythonu IMMUTABLE
-m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
-
--------------------------------------------------------------------------------
-
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.test_train_split()
-RETURNS VARCHAR AS $$
-    SELECT MADLIB_SCHEMA.test_train_split('');
-$$ LANGUAGE sql IMMUTABLE
-m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
--------------------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/2308bef5/src/ports/postgres/modules/sample/train_test_split.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/sample/train_test_split.py_in 
b/src/ports/postgres/modules/sample/train_test_split.py_in
new file mode 100644
index 0000000..87cb2a7
--- /dev/null
+++ b/src/ports/postgres/modules/sample/train_test_split.py_in
@@ -0,0 +1,319 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import plpy
+from utilities.control import MinWarning
+from utilities.utilities import _assert
+from utilities.utilities import extract_keyvalue_params
+from utilities.utilities import add_postfix
+from utilities.utilities import unique_string
+from utilities.utilities import split_quoted_delimited_str
+from utilities.validate_args import table_exists
+from utilities.validate_args import columns_exist_in_table
+from utilities.validate_args import table_is_empty
+from utilities.validate_args import get_expr_type
+from utilities.validate_args import get_cols
+from graph.graph_utils import _check_groups
+from graph.graph_utils import _grp_from_table
+
+m4_changequote(` <!', `!>')
+
+
+def _get_sql_string(str):
+    if str:
+        return "'" + str + "'"
+    return "NULL"
+
+
+def train_test_split(schema_madlib, source_table, output_table, 
train_proportion,
+                     test_proportion, grouping_cols, target_cols, 
with_replacement,
+                     separate_output_tables, **kwargs):
+    """
+    test train split function
+    Args:
+        @param source_table           Input table name.
+        @param output_table           Output table name.
+        @param train_proportion       The ratio of training data to the entire
+                                      input table
+        @param test_proportion        The ratio of test data to the entire
+                                      input table
+        @param grouping_cols          (Default: NULL) The columns to 
distinguish
+                                      each strata.
+        @param target_cols            (Default: NULL) The columns to include in
+                                      the output.
+        @param with_replacement       (Default: FALSE) The sampling method.
+        @param separate_output_tables (Default: FALSE) Create two output 
tables,
+                                      <output_table>_train and 
<output_table>_test.
+                                      Otherwise one output table is created 
with
+                                      and additional column 'split' which 
takes the
+                                      value 0 for test and 1 for training.
+
+    """
+    with MinWarning("warning"):
+        if test_proportion is None:
+            test_proportion = 1 - train_proportion
+        validate_strs(source_table, output_table, train_proportion, 
test_proportion,
+                      split_quoted_delimited_str(grouping_cols), target_cols,
+                      with_replacement)
+        grouping_cols = _get_sql_string(grouping_cols)
+        target_cols = _get_sql_string(target_cols)
+        with_replacement = with_replacement or "False"
+        strat_query = """
+        SELECT {schema_madlib}.stratified_sample(
+            '{strat_source_table}',
+            '{strat_out_table}',
+            '{strat_proportion}',
+            {strat_grouping_cols},
+            {strat_target_cols},
+            {strat_with_replacement}
+        )
+        """
+        strat_out_table = unique_string()
+        q = strat_query.format(
+            schema_madlib=schema_madlib,
+            strat_source_table=source_table,
+            strat_out_table=strat_out_table,
+            strat_proportion=train_proportion + test_proportion,
+            strat_grouping_cols=grouping_cols,
+            strat_with_replacement=with_replacement,
+            strat_target_cols=target_cols
+        )
+        plpy.execute(q)
+        test_table = add_postfix(output_table, "_test")
+        train_table = add_postfix(output_table, "_train")
+        if not separate_output_tables:
+            test_table = unique_string()
+            train_table = unique_string()
+        test_query = strat_query.format(
+            schema_madlib=schema_madlib,
+            strat_source_table=strat_out_table,
+            strat_out_table=test_table,
+            strat_proportion=(test_proportion /
+                              (train_proportion + test_proportion)),
+            strat_grouping_cols=grouping_cols,
+            strat_with_replacement=False,
+            strat_target_cols=target_cols
+        )
+        plpy.execute(test_query)
+        train_query = """
+        CREATE TABLE {train_table} AS
+        SELECT * FROM {strat_out_table}
+        EXCEPT ALL
+        SELECT * FROM {test_table}
+        """.format(train_table=train_table,
+                   strat_out_table=strat_out_table,
+                   test_table=test_table)
+        plpy.execute(train_query)
+        clean_up_tables = [strat_out_table]
+        if not separate_output_tables:
+            union_query = """
+            CREATE TABLE {output_table} AS
+            SELECT *,0 AS split FROM {test_table}
+            UNION ALL
+            SELECT *,1 AS split FROM {train_table}
+            """.format(output_table=output_table,
+                       test_table=test_table,
+                       train_table=train_table)
+            plpy.execute(union_query)
+            clean_up_tables += [train_table, test_table]
+        clean_up_query = """
+        DROP TABLE IF EXISTS {clean_up_tables}
+        """.format(clean_up_tables=",".join(clean_up_tables))
+        plpy.execute(clean_up_query)
+    return
+
+
+def validate_strs(source_table, output_table, train_proportion, 
test_proportion, glist, target_cols, with_replacement):
+
+    _assert(output_table and output_table.strip().lower() not in ('null', ''),
+            "Sample: Invalid output table name 
{output_table}!".format(**locals()))
+    _assert(not table_exists(output_table),
+            "Sample: Output table already exists!".format(**locals()))
+
+    _assert(source_table and source_table.strip().lower() not in ('null', ''),
+            "Sample: Invalid Source table name!".format(**locals()))
+    _assert(table_exists(source_table),
+            "Sample: Source table ({source_table}) is 
missing!".format(**locals()))
+    _assert(not table_is_empty(source_table),
+            "Sample: Source table ({source_table}) is 
empty!".format(**locals()))
+
+    for proportion in [train_proportion, test_proportion]:
+        _assert(proportion > 0 and proportion < 1,
+                "Sample: Proportions aren't in the range (0,1)!")
+    if not with_replacement:
+        _assert(train_proportion + test_proportion <= 1,
+                "Sample: Proportions add up to greater than 1!")
+
+    if glist is not None:
+        _assert(columns_exist_in_table(source_table, glist),
+                ("""Sample: Not all columns from {glist} are present in 
source""" +
+                 """ table ({source_table}).""").format(**locals()))
+
+    if not (target_cols is None or target_cols is '*'):
+        tlist = split_quoted_delimited_str(target_cols)
+        _assert(columns_exist_in_table(source_table, tlist),
+                ("""Sample: Not all columns from {target_cols} are present 
in""" +
+                 """ edge table ({source_table})""").format(**locals()))
+    return
+
+
+def train_test_split_help(schema_madlib, message, **kwargs):
+    """
+    Help function for train_test_split
+
+    Args:
+        @param schema_madlib
+        @param message: string, Help message string
+        @param kwargs
+
+    Returns:
+        String. Help/usage information
+    """
+    if not message:
+        help_string = """
+-----------------------------------------------------------------------
+                            SUMMARY
+-----------------------------------------------------------------------
+
+Given a table, train_test_split returns a random sample of the
+table for testing and training. It is possible to use with or without
+replacement sampling methods, specify a set of target columns, and a
+set of grouping columns, in which case, stratified sampling will be
+performed.
+
+For more details on function usage:
+    SELECT {schema_madlib}.train_test_split('usage');
+    SELECT {schema_madlib}.train_test_split('example');
+            """
+    elif message.lower() in ['usage', 'help', '?']:
+        help_string = """
+
+Given a table, test train split returns a proportion of records for
+each group (strata). It is possible to use with or without replacement
+sampling methods, specify a set of target columns, and assume the
+whole table is a single strata.
+
+----------------------------------------------------------------------------
+                            USAGE
+----------------------------------------------------------------------------
+
+ SELECT {schema_madlib}.train_test_split(
+    source_table     TEXT,    -- Name of the table containing the input data.
+    output_table     TEXT,    -- Output table name.
+    train_proportion FLOAT8,  -- The ratio of train sample size to the
+                              -- number of records.
+    test_proportion  FLOAT8,  -- The ratio of test sample size to the
+                              -- number of records.
+    grouping_cols    TEXT     -- (Default: NULL) The columns to distinguish
+                              -- each strata.
+    target_cols      TEXT,    -- (Default: NULL) The columns to include in
+                              -- the output.
+    with_replacement BOOLEAN  -- (Default: FALSE) The sampling method.
+    separate_output_tables
+                     BOOLEAN  -- (Default: FALSE) Separate the output table
+                              -- into $output_table$_train and
+                              -- $output_table$_test, otherwise, the split
+                              -- column in output_table will identify 1 for
+                              -- train set and 0 for test set.
+
+If grouping_cols is NULL, the whole table is treated as a single group and
+sampled accordingly.
+
+If target_cols is NULL or '*', all of the columns will be included in the
+output table.
+
+If with_replacement is TRUE, each sample is independent (the same row may
+be selected in the sample set more than once). Else (if with_replacement
+is FALSE), a row can be selected at most once.
+);
+"""
+    elif message.lower() in ("example", "examples"):
+        help_string = """
+----------------------------------------------------------------------------
+                                EXAMPLES
+----------------------------------------------------------------------------
+
+-- Create an input table
+DROP TABLE IF EXISTS test;
+
+CREATE TABLE test(
+    id1 INTEGER,
+    id2 INTEGER,
+    gr1 INTEGER,
+    gr2 INTEGER
+);
+
+INSERT INTO test VALUES
+(1,0,1,1),
+(2,0,1,1),
+(3,0,1,1),
+(4,0,1,1),
+(5,0,1,1),
+(6,0,1,1),
+(7,0,1,1),
+(8,0,1,1),
+(9,0,1,1),
+(9,0,1,1),
+(9,0,1,1),
+(9,0,1,1),
+(0,1,1,2),
+(0,2,1,2),
+(0,3,1,2),
+(0,4,1,2),
+(0,5,1,2),
+(0,6,1,2),
+(10,10,2,2),
+(20,20,2,2),
+(30,30,2,2),
+(40,40,2,2),
+(50,50,2,2),
+(60,60,2,2),
+(70,70,2,2)
+;
+
+-- Sample without replacement
+DROP TABLE IF EXISTS out;
+SELECT madlib.train_test_split(
+                                'test',    -- Source table
+                                'out',     -- Output table
+                                0.5,       -- Sample proportion
+                                0.5,       -- Sample proportion
+                                'gr1,gr2', -- Strata definition
+                                'id1,id2', -- Columns to output
+                                FALSE,     -- Sample without replacement
+                                FALSE);    -- Do not separate output tables
+SELECT * FROM out ORDER BY split,gr1,gr2,id1,id2;
+
+-- Sample with replacement
+DROP TABLE IF EXISTS out_train, out_test;
+SELECT madlib.train_test_split(
+                                'test',    -- Source table
+                                'out',     -- Output table
+                                0.5,       -- train_proportion
+                                NULL,      -- Default = 1 - train_proportion = 
0.5
+                                'gr1,gr2', -- Strata definition
+                                'id1,id2', -- Columns to output
+                                TRUE,      -- Sample with replacement
+                                TRUE);     -- Separate output tables
+SELECT * FROM out_train ORDER BY gr1,gr2,id1,id2;
+"""
+    else:
+        help_string = "No such option. Use {schema_madlib}.graph_sssp()"
+
+    return help_string.format(schema_madlib=schema_madlib)

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/2308bef5/src/ports/postgres/modules/sample/train_test_split.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/sample/train_test_split.sql_in 
b/src/ports/postgres/modules/sample/train_test_split.sql_in
new file mode 100644
index 0000000..d5d0913
--- /dev/null
+++ b/src/ports/postgres/modules/sample/train_test_split.sql_in
@@ -0,0 +1,346 @@
+/* ----------------------------------------------------------------------- 
*//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *
+ * @file train_test_split.sql_in
+ *
+ * @brief SQL functions for test train split.
+ * @date 07/19/2017
+ *
+ * @sa Given a table, test train split returns a proportion of records
+ * for each group (strata).
+ *
+ *//* ----------------------------------------------------------------------- 
*/
+
+m4_include(`SQLCommon.m4')
+
+
+/**
+@addtogroup grp_train_test_split
+
+<div class="toc"><b>Contents</b>
+<ul>
+<li><a href="#strs">Train-Test Split</a></li>
+<li><a href="#examples">Examples</a></li>
+</ul>
+</div>
+
+@brief A method for splitting a data set into separate training and testing 
sets.
+
+Train-test split is a utility to create training and
+testing sets from a single data set.
+
+@anchor strs
+@par Train-Test Split
+
+<pre class="syntax">
+train_test_split(   source_table,
+                    output_table,
+                    train_proportion,
+                    test_proportion,
+                    grouping_cols,
+                    target_cols,
+                    with_replacement
+                )
+</pre>
+
+\b Arguments
+<dl class="arglist">
+<dt>source_table</dt>
+<dd>TEXT. Name of the table containing the input data.</dd>
+
+<dt>output_table</dt>
+<dd>Name of output table.   A new INTEGER column on the right
+called 'split' will identify 1 for train set and 0 for test set,
+unless the 'separate_output_tables' parameter below is TRUE,
+in which case two output tables will be created using
+the 'output_table' name with the suffixes '_train' and '_test'.
+The output table contains all the  columns present in the source
+table unless otherwise specified  in the 'target_cols' parameter below. </dd>
+
+<dt>train_proportion</dt>
+<dd>FLOAT8 in the range (0,1).  Proportion of the dataset to include
+in the train split.  If the 'grouping_col' parameter is specified below,
+each group will be sampled independently using the
+train proportion, i.e., in a stratified fashion.</dd>
+
+<dt>test_proportion (optional)</dt>
+<dd>FLOAT8 in the range (0,1).  Proportion of the dataset to include
+in the test split.  Default is the complement to the train
+proportion (1-'train_proportion').  If the 'grouping_col'
+parameter is specified below,  each group will be sampled
+independently using the  train proportion,
+i.e., in a stratified fashion.</dd>
+
+<dt>grouping_cols (optional)</dt>
+<dd>TEXT, default: NULL. A single column or a list of comma-separated columns
+ that defines how to stratify.  When this parameter is NULL,
+the train-test split is not stratified.</dd>
+
+<dt>target_cols (optional)</dt>
+<dd>TEXT, default NULL. A comma-separated list of columns
+to appear in the 'output_table'.  If NULL or '*', all
+columns from the 'source_table'  will appear in
+the 'output_table'.</dd>
+
+@anchor note
+@note
+        Do not include 'grouping_cols' in the parameter 'target_cols',
+        because they are always included in the 'output_table'.
+
+<dt>with_replacement (optional)</dt>
+<dd>BOOLEAN, default FALSE.  Determines whether to sample
+with replacement or without replacement (default).
+With replacement means that it is possible that the
+same row may appear in the sample set more than once.
+Without replacement means a given row can be selected
+only once.</dd>
+
+<dt>separate_output_tables (optional)</dt>
+<dd>BOOLEAN, default FALSE.  If TRUE, two output tables will be created using
+the 'output_table' name with the suffixes '_train' and '_test'.</dd>
+</dl>
+
+
+@anchor examples
+@par Examples
+
+Please note that due to the random nature of sampling, your
+results may look different from those below.
+
+-# Create an input table:
+<pre class="syntax">
+DROP TABLE IF EXISTS test;
+CREATE TABLE test(
+    id1 INTEGER,
+    id2 INTEGER,
+    gr1 INTEGER,
+    gr2 INTEGER
+);
+INSERT INTO test VALUES
+(1,0,1,1),
+(2,0,1,1),
+(3,0,1,1),
+(4,0,1,1),
+(5,0,1,1),
+(6,0,1,1),
+(7,0,1,1),
+(8,0,1,1),
+(9,0,1,1),
+(9,0,1,1),
+(9,0,1,1),
+(9,0,1,1),
+(0,1,1,2),
+(0,2,1,2),
+(0,3,1,2),
+(0,4,1,2),
+(0,5,1,2),
+(0,6,1,2),
+(10,10,2,2),
+(20,20,2,2),
+(30,30,2,2),
+(40,40,2,2),
+(50,50,2,2),
+(60,60,2,2),
+(70,70,2,2);
+</pre>
+
+-# Sample without replacement:
+<pre class="syntax">
+DROP TABLE IF EXISTS out;
+SELECT madlib.train_test_split(
+                                'test',    -- Source table
+                                'out',     -- Output table
+                                0.5,       -- Sample proportion
+                                0.5,       -- Sample proportion
+                                'gr1,gr2', -- Strata definition
+                                'id1,id2', -- Columns to output
+                                FALSE,     -- Sample without replacement
+                                FALSE);    -- Do not separate output tables
+SELECT * FROM out ORDER BY split,gr1,gr2,id1,id2;
+</pre>
+<pre class="result">
+ gr1 | gr2 | id1 | id2 | split
+-----+-----+-----+-----+-------
+   1 |   1 |   1 |   0 |     0
+   1 |   1 |   4 |   0 |     0
+   1 |   1 |   6 |   0 |     0
+   1 |   1 |   9 |   0 |     0
+   1 |   1 |   9 |   0 |     0
+   1 |   1 |   9 |   0 |     0
+   1 |   2 |   0 |   3 |     0
+   1 |   2 |   0 |   4 |     0
+   1 |   2 |   0 |   5 |     0
+   2 |   2 |  10 |  10 |     0
+   2 |   2 |  30 |  30 |     0
+   2 |   2 |  40 |  40 |     0
+   2 |   2 |  60 |  60 |     0
+   1 |   1 |   2 |   0 |     1
+   1 |   1 |   3 |   0 |     1
+   1 |   1 |   5 |   0 |     1
+   1 |   1 |   7 |   0 |     1
+   1 |   1 |   8 |   0 |     1
+   1 |   1 |   9 |   0 |     1
+   1 |   2 |   0 |   1 |     1
+   1 |   2 |   0 |   2 |     1
+   1 |   2 |   0 |   6 |     1
+   2 |   2 |  20 |  20 |     1
+   2 |   2 |  50 |  50 |     1
+   2 |   2 |  70 |  70 |     1
+(25 rows)
+</pre>
+
+-# Sample with replacement and create separate train and test tables:
+<pre class="syntax">
+DROP TABLE IF EXISTS out_train, out_test;
+SELECT madlib.train_test_split(
+                                'test',    -- Source table
+                                'out',     -- Output table
+                                0.5,       -- train_proportion
+                                NULL,      -- Default = 1 - train_proportion = 
0.5
+                                'gr1,gr2', -- Strata definition
+                                'id1,id2', -- Columns to output
+                                TRUE,      -- Sample with replacement
+                                TRUE);     -- Separate output tables
+SELECT * FROM out_train ORDER BY gr1,gr2,id1,id2;
+</pre>
+<pre class="result">
+ gr1 | gr2 | id1 | id2
+-----+-----+-----+-----
+   1 |   1 |   1 |   0
+   1 |   1 |   2 |   0
+   1 |   1 |   4 |   0
+   1 |   1 |   7 |   0
+   1 |   1 |   8 |   0
+   1 |   1 |   9 |   0
+   1 |   2 |   0 |   4
+   1 |   2 |   0 |   5
+   1 |   2 |   0 |   6
+   2 |   2 |  40 |  40
+   2 |   2 |  50 |  50
+   2 |   2 |  50 |  50
+(12 rows)
+</pre>
+<pre class="syntax">
+SELECT * FROM out_test ORDER BY gr1,gr2,id1,id2;
+</pre>
+<pre class="result">
+ gr1 | gr2 | id1 | id2
+-----+-----+-----+-----
+   1 |   1 |   1 |   0
+   1 |   1 |   1 |   0
+   1 |   1 |   3 |   0
+   1 |   1 |   4 |   0
+   1 |   1 |   5 |   0
+   1 |   1 |   9 |   0
+   1 |   2 |   0 |   1
+   1 |   2 |   0 |   5
+   1 |   2 |   0 |   6
+   2 |   2 |  20 |  20
+   2 |   2 |  20 |  20
+   2 |   2 |  20 |  20
+   2 |   2 |  70 |  70
+(13 rows)
+</pre>
+*/
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
+  source_table           TEXT,
+  output_table           TEXT,
+  train_proportion       FLOAT8,
+  test_proportion        FLOAT8,
+  grouping_cols          TEXT,
+  target_cols            TEXT,
+  with_replacement       BOOLEAN,
+  separate_output_tables BOOLEAN
+) RETURNS VOID AS $$
+    PythonFunction(sample, train_test_split, train_test_split)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
+
+-------------------------------------------------------------------------------
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
+  source_table       TEXT,
+  output_table       TEXT,
+  train_proportion   FLOAT8,
+  test_proportion    FLOAT8,
+  grouping_cols      TEXT,
+  target_cols        TEXT,
+  with_replacement   BOOLEAN
+) RETURNS VOID AS $$
+     SELECT MADLIB_SCHEMA.train_test_split($1, $2, $3, $4, $5, $6, $7, FALSE);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
+  source_table       TEXT,
+  output_table       TEXT,
+  train_proportion   FLOAT8,
+  test_proportion    FLOAT8,
+  grouping_cols      TEXT,
+  target_cols        TEXT
+) RETURNS VOID AS $$
+     SELECT MADLIB_SCHEMA.train_test_split($1, $2, $3, $4, $5, $6, FALSE, 
FALSE);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
+  source_table       TEXT,
+  output_table       TEXT,
+  train_proportion   FLOAT8,
+  test_proportion    FLOAT8,
+  grouping_cols      TEXT
+) RETURNS VOID AS $$
+     SELECT MADLIB_SCHEMA.train_test_split($1, $2, $3, $4, $5, NULL, FALSE, 
FALSE);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
+  source_table       TEXT,
+  output_table       TEXT,
+  train_proportion   FLOAT8,
+  test_proportion    FLOAT8
+) RETURNS VOID AS $$
+     SELECT MADLIB_SCHEMA.train_test_split($1, $2, $3, $4, NULL, NULL, FALSE, 
FALSE);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
+  source_table       TEXT,
+  output_table       TEXT,
+  train_proportion   FLOAT8
+) RETURNS VOID AS $$
+     SELECT MADLIB_SCHEMA.train_test_split($1, $2, $3, NULL, NULL, NULL, 
FALSE, FALSE);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
+-------------------------------------------------------------------------------
+
+-- Online help
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
+    message VARCHAR
+) RETURNS VARCHAR AS $$
+    PythonFunction(sample, train_test_split, train_test_split_help)
+$$ LANGUAGE plpythonu IMMUTABLE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
+
+-------------------------------------------------------------------------------
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split()
+RETURNS VARCHAR AS $$
+    SELECT MADLIB_SCHEMA.train_test_split('');
+$$ LANGUAGE sql IMMUTABLE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
+-------------------------------------------------------------------------------

Reply via email to