Repository: incubator-madlib-site Updated Branches: refs/heads/asf-site a72096891 -> e1b973ae4
added statified sampling workbook Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/commit/e1b973ae Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/tree/e1b973ae Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/diff/e1b973ae Branch: refs/heads/asf-site Commit: e1b973ae473c8deec08b70f191243c30ffb56f00 Parents: a720968 Author: Frank McQuillan <fmcquil...@pivotal.io> Authored: Wed Jun 28 15:46:03 2017 -0700 Committer: Frank McQuillan <fmcquil...@pivotal.io> Committed: Wed Jun 28 15:46:03 2017 -0700 ---------------------------------------------------------------------- .../stratified-sampling-v1.ipynb | 672 +++++++++++++++++++ 1 file changed, 672 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/e1b973ae/community-artifacts/stratified-sampling-v1.ipynb ---------------------------------------------------------------------- diff --git a/community-artifacts/stratified-sampling-v1.ipynb b/community-artifacts/stratified-sampling-v1.ipynb new file mode 100644 index 0000000..75e02fd --- /dev/null +++ b/community-artifacts/stratified-sampling-v1.ipynb @@ -0,0 +1,672 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Stratified sampling\n", + "Stratified sampling is a method for sampling subpopulations (strata) independently. It is commonly used to reduce sampling error by ensuring that subgroups are adequately represented in the sample.\n", + "\n", + "Stratified sampling was added in MADlib 1.12." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The sql extension is already loaded. To reload it, use:\n", + " %reload_ext sql\n" + ] + } + ], + "source": [ + "%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "u'Connected: gpdbchina@madlib'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Greenplum 4.3.10.0\n", + "%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n", + " \n", + "# PostgreSQL local\n", + "#%sql postgresql://fmcquillan@localhost:5432/madlib\n", + "\n", + "# Greenplum 4.2.3.0\n", + "#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>version</th>\n", + " </tr>\n", + " <tr>\n", + " <td>MADlib version: 1.12-dev, git revision: rel/v1.11-23-gfdf7b6d, cmake configuration time: Wed Jun 28 18:06:35 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(u'MADlib version: 1.12-dev, git revision: rel/v1.11-23-gfdf7b6d, cmake configuration time: Wed Jun 28 18:06:35 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0',)]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%sql select madlib.version();\n", + "#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Create input table" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "25 rows affected.\n", + "25 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id1</th>\n", + " <th>id2</th>\n", + " <th>gr1</th>\n", + " <th>gr2</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>0</td>\n", + " <td>2</td>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>0</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>0</td>\n", + " <td>4</td>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>0</td>\n", + " <td>5</td>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>0</td>\n", + " <td>6</td>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>10</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>20</td>\n", + " <td>20</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>30</td>\n", + " <td>30</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>40</td>\n", + " <td>40</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>50</td>\n", + " <td>50</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>60</td>\n", + " <td>60</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>70</td>\n", + " <td>70</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, 0, 1, 1),\n", + " (2, 0, 1, 1),\n", + " (3, 0, 1, 1),\n", + " (4, 0, 1, 1),\n", + " (5, 0, 1, 1),\n", + " (6, 0, 1, 1),\n", + " (7, 0, 1, 1),\n", + " (8, 0, 1, 1),\n", + " (9, 0, 1, 1),\n", + " (9, 0, 1, 1),\n", + " (9, 0, 1, 1),\n", + " (9, 0, 1, 1),\n", + " (0, 1, 1, 2),\n", + " (0, 2, 1, 2),\n", + " (0, 3, 1, 2),\n", + " (0, 4, 1, 2),\n", + " (0, 5, 1, 2),\n", + " (0, 6, 1, 2),\n", + " (10, 10, 2, 2),\n", + " (20, 20, 2, 2),\n", + " (30, 30, 2, 2),\n", + " (40, 40, 2, 2),\n", + " (50, 50, 2, 2),\n", + " (60, 60, 2, 2),\n", + " (70, 70, 2, 2)]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql \n", + "DROP TABLE IF EXISTS test;\n", + "\n", + "CREATE TABLE test(\n", + " id1 INTEGER,\n", + " id2 INTEGER,\n", + " gr1 INTEGER,\n", + " gr2 INTEGER\n", + ");\n", + "\n", + "INSERT INTO test VALUES\n", + "(1,0,1,1),\n", + "(2,0,1,1),\n", + "(3,0,1,1),\n", + "(4,0,1,1),\n", + "(5,0,1,1),\n", + "(6,0,1,1),\n", + "(7,0,1,1),\n", + "(8,0,1,1),\n", + "(9,0,1,1),\n", + "(9,0,1,1),\n", + "(9,0,1,1),\n", + "(9,0,1,1),\n", + "(0,1,1,2),\n", + "(0,2,1,2),\n", + "(0,3,1,2),\n", + "(0,4,1,2),\n", + "(0,5,1,2),\n", + "(0,6,1,2),\n", + "(10,10,2,2),\n", + "(20,20,2,2),\n", + "(30,30,2,2),\n", + "(40,40,2,2),\n", + "(50,50,2,2),\n", + "(60,60,2,2),\n", + "(70,70,2,2);\n", + "\n", + "SELECT * FROM test ORDER BY gr1, gr2, id1, id2;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Sample without replacement" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "13 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>gr1</th>\n", + " <th>gr2</th>\n", + " <th>id1</th>\n", + " <th>id2</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>6</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>9</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>9</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>0</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>0</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>30</td>\n", + " <td>30</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>40</td>\n", + " <td>40</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>50</td>\n", + " <td>50</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>70</td>\n", + " <td>70</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, 1, 1, 0),\n", + " (1, 1, 2, 0),\n", + " (1, 1, 3, 0),\n", + " (1, 1, 6, 0),\n", + " (1, 1, 9, 0),\n", + " (1, 1, 9, 0),\n", + " (1, 2, 0, 1),\n", + " (1, 2, 0, 2),\n", + " (1, 2, 0, 5),\n", + " (2, 2, 30, 30),\n", + " (2, 2, 40, 40),\n", + " (2, 2, 50, 50),\n", + " (2, 2, 70, 70)]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS out;\n", + "\n", + "SELECT madlib.stratified_sample(\n", + " 'test', -- Source table\n", + " 'out', -- Output table\n", + " 0.5, -- Sample proportion\n", + " 'gr1,gr2', -- Strata definition\n", + " 'id1,id2', -- Columns to output\n", + " FALSE); -- Sample without replacement\n", + "\n", + "SELECT * FROM out ORDER BY gr1,gr2,id1,id2;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. Sample with replacement" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "13 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>gr1</th>\n", + " <th>gr2</th>\n", + " <th>id1</th>\n", + " <th>id2</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>8</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>9</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>0</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>0</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>0</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>20</td>\n", + " <td>20</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>20</td>\n", + " <td>20</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>40</td>\n", + " <td>40</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>40</td>\n", + " <td>40</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, 1, 1, 0),\n", + " (1, 1, 2, 0),\n", + " (1, 1, 3, 0),\n", + " (1, 1, 4, 0),\n", + " (1, 1, 8, 0),\n", + " (1, 1, 9, 0),\n", + " (1, 2, 0, 5),\n", + " (1, 2, 0, 5),\n", + " (1, 2, 0, 6),\n", + " (2, 2, 20, 20),\n", + " (2, 2, 20, 20),\n", + " (2, 2, 40, 40),\n", + " (2, 2, 40, 40)]" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS out;\n", + "\n", + "SELECT madlib.stratified_sample(\n", + " 'test', -- Source table\n", + " 'out', -- Output table\n", + " 0.5, -- Sample proportion\n", + " 'gr1,gr2', -- Strata definition\n", + " 'id1,id2', -- Columns to output\n", + " TRUE); -- Sample with replacement\n", + "\n", + "SELECT * FROM out ORDER BY gr1,gr2,id1,id2;" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}