http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/7fa3b796/community-artifacts/Train-test-split-v1.ipynb
----------------------------------------------------------------------
diff --git a/community-artifacts/Train-test-split-v1.ipynb
b/community-artifacts/Train-test-split-v1.ipynb
new file mode 100644
index 0000000..1441891
--- /dev/null
+++ b/community-artifacts/Train-test-split-v1.ipynb
@@ -0,0 +1,1154 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Train-test split\n",
+ "Train-test split is a utility to create training and testing set as
subsets from a single data set.\n",
+ "\n",
+ "Train-test split was added in MADlib 1.12."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The sql extension is already loaded. To reload it, use:\n",
+ " %reload_ext sql\n"
+ ]
+ }
+ ],
+ "source": [
+ "%load_ext sql"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "u'Connected: gpdbchina@madlib'"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Greenplum 4.3.10.0\n",
+ "%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n",
+ " \n",
+ "# PostgreSQL local\n",
+ "#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+ "\n",
+ "# Greenplum 4.2.3.0\n",
+ "#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>version</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>MADlib version: 1.12, git revision: rel/v1.11-57-gd5d5a26,
cmake configuration time: Sun Aug 20 19:18:30 UTC 2017, build type: Release,
build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0,
C++ compiler: g++ 4.4.0</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(u'MADlib version: 1.12, git revision: rel/v1.11-57-gd5d5a26, cmake
configuration time: Sun Aug 20 19:18:30 UTC 2017, build type: Release, build
system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++
compiler: g++ 4.4.0',)]"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%sql select madlib.version();\n",
+ "#%sql select version();"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 1. Create input table"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "Done.\n",
+ "25 rows affected.\n",
+ "25 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>id1</th>\n",
+ " <th>id2</th>\n",
+ " <th>gr1</th>\n",
+ " <th>gr2</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>0</td>\n",
+ " <td>2</td>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>0</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>0</td>\n",
+ " <td>4</td>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>0</td>\n",
+ " <td>5</td>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>0</td>\n",
+ " <td>6</td>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>3</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>4</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>5</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>6</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>7</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>8</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>9</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>9</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>9</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>9</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>10</td>\n",
+ " <td>10</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>20</td>\n",
+ " <td>20</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>30</td>\n",
+ " <td>30</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>40</td>\n",
+ " <td>40</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>50</td>\n",
+ " <td>50</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>60</td>\n",
+ " <td>60</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>70</td>\n",
+ " <td>70</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(0, 1, 1, 2),\n",
+ " (0, 2, 1, 2),\n",
+ " (0, 3, 1, 2),\n",
+ " (0, 4, 1, 2),\n",
+ " (0, 5, 1, 2),\n",
+ " (0, 6, 1, 2),\n",
+ " (1, 0, 1, 1),\n",
+ " (2, 0, 1, 1),\n",
+ " (3, 0, 1, 1),\n",
+ " (4, 0, 1, 1),\n",
+ " (5, 0, 1, 1),\n",
+ " (6, 0, 1, 1),\n",
+ " (7, 0, 1, 1),\n",
+ " (8, 0, 1, 1),\n",
+ " (9, 0, 1, 1),\n",
+ " (9, 0, 1, 1),\n",
+ " (9, 0, 1, 1),\n",
+ " (9, 0, 1, 1),\n",
+ " (10, 10, 2, 2),\n",
+ " (20, 20, 2, 2),\n",
+ " (30, 30, 2, 2),\n",
+ " (40, 40, 2, 2),\n",
+ " (50, 50, 2, 2),\n",
+ " (60, 60, 2, 2),\n",
+ " (70, 70, 2, 2)]"
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql \n",
+ "DROP TABLE IF EXISTS test;\n",
+ "\n",
+ "CREATE TABLE test(\n",
+ " id1 INTEGER,\n",
+ " id2 INTEGER,\n",
+ " gr1 INTEGER,\n",
+ " gr2 INTEGER\n",
+ ");\n",
+ "\n",
+ "INSERT INTO test VALUES\n",
+ "(1,0,1,1),\n",
+ "(2,0,1,1),\n",
+ "(3,0,1,1),\n",
+ "(4,0,1,1),\n",
+ "(5,0,1,1),\n",
+ "(6,0,1,1),\n",
+ "(7,0,1,1),\n",
+ "(8,0,1,1),\n",
+ "(9,0,1,1),\n",
+ "(9,0,1,1),\n",
+ "(9,0,1,1),\n",
+ "(9,0,1,1),\n",
+ "(0,1,1,2),\n",
+ "(0,2,1,2),\n",
+ "(0,3,1,2),\n",
+ "(0,4,1,2),\n",
+ "(0,5,1,2),\n",
+ "(0,6,1,2),\n",
+ "(10,10,2,2),\n",
+ "(20,20,2,2),\n",
+ "(30,30,2,2),\n",
+ "(40,40,2,2),\n",
+ "(50,50,2,2),\n",
+ "(60,60,2,2),\n",
+ "(70,70,2,2);\n",
+ "\n",
+ "SELECT * FROM test ORDER BY id1, id2, gr1, gr2;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 2. Sample without replacement"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "25 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>gr1</th>\n",
+ " <th>gr2</th>\n",
+ " <th>id1</th>\n",
+ " <th>id2</th>\n",
+ " <th>split</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>5</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>7</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>8</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>9</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " <td>3</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " <td>5</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " <td>6</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>10</td>\n",
+ " <td>10</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>20</td>\n",
+ " <td>20</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>30</td>\n",
+ " <td>30</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>50</td>\n",
+ " <td>50</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>3</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>6</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>9</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>9</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>9</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " <td>2</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " <td>4</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>40</td>\n",
+ " <td>40</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>60</td>\n",
+ " <td>60</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>70</td>\n",
+ " <td>70</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(1, 1, 2, 0, 0),\n",
+ " (1, 1, 4, 0, 0),\n",
+ " (1, 1, 5, 0, 0),\n",
+ " (1, 1, 7, 0, 0),\n",
+ " (1, 1, 8, 0, 0),\n",
+ " (1, 1, 9, 0, 0),\n",
+ " (1, 2, 0, 3, 0),\n",
+ " (1, 2, 0, 5, 0),\n",
+ " (1, 2, 0, 6, 0),\n",
+ " (2, 2, 10, 10, 0),\n",
+ " (2, 2, 20, 20, 0),\n",
+ " (2, 2, 30, 30, 0),\n",
+ " (2, 2, 50, 50, 0),\n",
+ " (1, 1, 1, 0, 1),\n",
+ " (1, 1, 3, 0, 1),\n",
+ " (1, 1, 6, 0, 1),\n",
+ " (1, 1, 9, 0, 1),\n",
+ " (1, 1, 9, 0, 1),\n",
+ " (1, 1, 9, 0, 1),\n",
+ " (1, 2, 0, 1, 1),\n",
+ " (1, 2, 0, 2, 1),\n",
+ " (1, 2, 0, 4, 1),\n",
+ " (2, 2, 40, 40, 1),\n",
+ " (2, 2, 60, 60, 1),\n",
+ " (2, 2, 70, 70, 1)]"
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS out;\n",
+ "\n",
+ "SELECT madlib.train_test_split(\n",
+ " 'test', -- Source table\n",
+ " 'out', -- Output table\n",
+ " 0.5, -- Sample proportion\n",
+ " 0.5, -- Sample proportion\n",
+ " 'gr1,gr2', -- Strata definition\n",
+ " 'id1,id2', -- Columns to output\n",
+ " FALSE, -- Sample without
replacement\n",
+ " FALSE); -- Do not separate output
tables\n",
+ "\n",
+ "SELECT * FROM out ORDER BY split,gr1,gr2,id1,id2;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 3. Sample with replacement"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "12 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>gr1</th>\n",
+ " <th>gr2</th>\n",
+ " <th>id1</th>\n",
+ " <th>id2</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>6</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>9</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>9</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>9</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>9</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " <td>3</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " <td>5</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>20</td>\n",
+ " <td>20</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>30</td>\n",
+ " <td>30</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>70</td>\n",
+ " <td>70</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(1, 1, 1, 0),\n",
+ " (1, 1, 6, 0),\n",
+ " (1, 1, 9, 0),\n",
+ " (1, 1, 9, 0),\n",
+ " (1, 1, 9, 0),\n",
+ " (1, 1, 9, 0),\n",
+ " (1, 2, 0, 2),\n",
+ " (1, 2, 0, 3),\n",
+ " (1, 2, 0, 5),\n",
+ " (2, 2, 20, 20),\n",
+ " (2, 2, 30, 30),\n",
+ " (2, 2, 70, 70)]"
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS out, out_train, out_test;\n",
+ "\n",
+ "SELECT madlib.train_test_split(\n",
+ " 'test', -- Source table\n",
+ " 'out', -- Output table\n",
+ " 0.5, -- train_proportion\n",
+ " NULL, -- Default = 1 -
train_proportion = 0.5\n",
+ " 'gr1,gr2', -- Strata definition\n",
+ " 'id1,id2', -- Columns to output\n",
+ " TRUE, -- Sample with replacement\n",
+ " TRUE); -- Separate output tables\n",
+ "\n",
+ "SELECT * FROM out_train ORDER BY gr1,gr2,id1,id2;"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "13 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>gr1</th>\n",
+ " <th>gr2</th>\n",
+ " <th>id1</th>\n",
+ " <th>id2</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>5</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>7</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>8</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>9</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " <td>4</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " <td>4</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>30</td>\n",
+ " <td>30</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>50</td>\n",
+ " <td>50</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>60</td>\n",
+ " <td>60</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>60</td>\n",
+ " <td>60</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(1, 1, 1, 0),\n",
+ " (1, 1, 4, 0),\n",
+ " (1, 1, 5, 0),\n",
+ " (1, 1, 7, 0),\n",
+ " (1, 1, 8, 0),\n",
+ " (1, 1, 9, 0),\n",
+ " (1, 2, 0, 1),\n",
+ " (1, 2, 0, 4),\n",
+ " (1, 2, 0, 4),\n",
+ " (2, 2, 30, 30),\n",
+ " (2, 2, 50, 50),\n",
+ " (2, 2, 60, 60),\n",
+ " (2, 2, 60, 60)]"
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "SELECT * FROM out_test ORDER BY gr1,gr2,id1,id2;"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "25 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>id1</th>\n",
+ " <th>id2</th>\n",
+ " <th>gr1</th>\n",
+ " <th>gr2</th>\n",
+ " <th>split</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>3</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>5</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>7</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>9</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>9</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>9</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>0</td>\n",
+ " <td>4</td>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>0</td>\n",
+ " <td>6</td>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>10</td>\n",
+ " <td>10</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>20</td>\n",
+ " <td>20</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>30</td>\n",
+ " <td>30</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>60</td>\n",
+ " <td>60</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>4</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>6</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>8</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>9</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>0</td>\n",
+ " <td>2</td>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>0</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>0</td>\n",
+ " <td>5</td>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>40</td>\n",
+ " <td>40</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>50</td>\n",
+ " <td>50</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>70</td>\n",
+ " <td>70</td>\n",
+ " <td>2</td>\n",
+ " <td>2</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(2, 0, 1, 1, 0),\n",
+ " (3, 0, 1, 1, 0),\n",
+ " (5, 0, 1, 1, 0),\n",
+ " (7, 0, 1, 1, 0),\n",
+ " (9, 0, 1, 1, 0),\n",
+ " (9, 0, 1, 1, 0),\n",
+ " (9, 0, 1, 1, 0),\n",
+ " (0, 4, 1, 2, 0),\n",
+ " (0, 6, 1, 2, 0),\n",
+ " (10, 10, 2, 2, 0),\n",
+ " (20, 20, 2, 2, 0),\n",
+ " (30, 30, 2, 2, 0),\n",
+ " (60, 60, 2, 2, 0),\n",
+ " (1, 0, 1, 1, 1),\n",
+ " (4, 0, 1, 1, 1),\n",
+ " (6, 0, 1, 1, 1),\n",
+ " (8, 0, 1, 1, 1),\n",
+ " (9, 0, 1, 1, 1),\n",
+ " (0, 1, 1, 2, 1),\n",
+ " (0, 2, 1, 2, 1),\n",
+ " (0, 3, 1, 2, 1),\n",
+ " (0, 5, 1, 2, 1),\n",
+ " (40, 40, 2, 2, 1),\n",
+ " (50, 50, 2, 2, 1),\n",
+ " (70, 70, 2, 2, 1)]"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS out;\n",
+ "\n",
+ "SELECT madlib.train_test_split(\n",
+ " 'test', -- Source table\n",
+ " 'out', -- Output table\n",
+ " 0.5 -- Sample proportion\n",
+ ");\n",
+ "SELECT * FROM out ORDER BY split,gr1,gr2,id1,id2;"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 2",
+ "language": "python",
+ "name": "python2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}