Repository: madlib-site Updated Branches: refs/heads/asf-site 5fa1ac070 -> acd339f65
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/stratified-sampling-v1.ipynb ---------------------------------------------------------------------- diff --git a/community-artifacts/stratified-sampling-v1.ipynb b/community-artifacts/stratified-sampling-v1.ipynb deleted file mode 100644 index 75e02fd..0000000 --- a/community-artifacts/stratified-sampling-v1.ipynb +++ /dev/null @@ -1,672 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Stratified sampling\n", - "Stratified sampling is a method for sampling subpopulations (strata) independently. It is commonly used to reduce sampling error by ensuring that subgroups are adequately represented in the sample.\n", - "\n", - "Stratified sampling was added in MADlib 1.12." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The sql extension is already loaded. To reload it, use:\n", - " %reload_ext sql\n" - ] - } - ], - "source": [ - "%load_ext sql" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "u'Connected: gpdbchina@madlib'" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Greenplum 4.3.10.0\n", - "%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n", - " \n", - "# PostgreSQL local\n", - "#%sql postgresql://fmcquillan@localhost:5432/madlib\n", - "\n", - "# Greenplum 4.2.3.0\n", - "#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1 rows affected.\n" - ] - }, - { - "data": { - "text/html": [ - "<table>\n", - " <tr>\n", - " <th>version</th>\n", - " </tr>\n", - " <tr>\n", - " <td>MADlib version: 1.12-dev, git revision: rel/v1.11-23-gfdf7b6d, cmake configuration time: Wed Jun 28 18:06:35 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0</td>\n", - " </tr>\n", - "</table>" - ], - "text/plain": [ - "[(u'MADlib version: 1.12-dev, git revision: rel/v1.11-23-gfdf7b6d, cmake configuration time: Wed Jun 28 18:06:35 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0',)]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%sql select madlib.version();\n", - "#%sql select version();" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 1. Create input table" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n", - "Done.\n", - "25 rows affected.\n", - "25 rows affected.\n" - ] - }, - { - "data": { - "text/html": [ - "<table>\n", - " <tr>\n", - " <th>id1</th>\n", - " <th>id2</th>\n", - " <th>gr1</th>\n", - " <th>gr2</th>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>3</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>4</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>5</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>6</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>7</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>8</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>9</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>9</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>9</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>9</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>0</td>\n", - " <td>2</td>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>0</td>\n", - " <td>3</td>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>0</td>\n", - " <td>4</td>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>0</td>\n", - " <td>5</td>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>0</td>\n", - " <td>6</td>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>10</td>\n", - " <td>10</td>\n", - " <td>2</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>20</td>\n", - " <td>20</td>\n", - " <td>2</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>30</td>\n", - " <td>30</td>\n", - " <td>2</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>40</td>\n", - " <td>40</td>\n", - " <td>2</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>50</td>\n", - " <td>50</td>\n", - " <td>2</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>60</td>\n", - " <td>60</td>\n", - " <td>2</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>70</td>\n", - " <td>70</td>\n", - " <td>2</td>\n", - " <td>2</td>\n", - " </tr>\n", - "</table>" - ], - "text/plain": [ - "[(1, 0, 1, 1),\n", - " (2, 0, 1, 1),\n", - " (3, 0, 1, 1),\n", - " (4, 0, 1, 1),\n", - " (5, 0, 1, 1),\n", - " (6, 0, 1, 1),\n", - " (7, 0, 1, 1),\n", - " (8, 0, 1, 1),\n", - " (9, 0, 1, 1),\n", - " (9, 0, 1, 1),\n", - " (9, 0, 1, 1),\n", - " (9, 0, 1, 1),\n", - " (0, 1, 1, 2),\n", - " (0, 2, 1, 2),\n", - " (0, 3, 1, 2),\n", - " (0, 4, 1, 2),\n", - " (0, 5, 1, 2),\n", - " (0, 6, 1, 2),\n", - " (10, 10, 2, 2),\n", - " (20, 20, 2, 2),\n", - " (30, 30, 2, 2),\n", - " (40, 40, 2, 2),\n", - " (50, 50, 2, 2),\n", - " (60, 60, 2, 2),\n", - " (70, 70, 2, 2)]" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%%sql \n", - "DROP TABLE IF EXISTS test;\n", - "\n", - "CREATE TABLE test(\n", - " id1 INTEGER,\n", - " id2 INTEGER,\n", - " gr1 INTEGER,\n", - " gr2 INTEGER\n", - ");\n", - "\n", - "INSERT INTO test VALUES\n", - "(1,0,1,1),\n", - "(2,0,1,1),\n", - "(3,0,1,1),\n", - "(4,0,1,1),\n", - "(5,0,1,1),\n", - "(6,0,1,1),\n", - "(7,0,1,1),\n", - "(8,0,1,1),\n", - "(9,0,1,1),\n", - "(9,0,1,1),\n", - "(9,0,1,1),\n", - "(9,0,1,1),\n", - "(0,1,1,2),\n", - "(0,2,1,2),\n", - "(0,3,1,2),\n", - "(0,4,1,2),\n", - "(0,5,1,2),\n", - "(0,6,1,2),\n", - "(10,10,2,2),\n", - "(20,20,2,2),\n", - "(30,30,2,2),\n", - "(40,40,2,2),\n", - "(50,50,2,2),\n", - "(60,60,2,2),\n", - "(70,70,2,2);\n", - "\n", - "SELECT * FROM test ORDER BY gr1, gr2, id1, id2;" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 2. Sample without replacement" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n", - "1 rows affected.\n", - "13 rows affected.\n" - ] - }, - { - "data": { - "text/html": [ - "<table>\n", - " <tr>\n", - " <th>gr1</th>\n", - " <th>gr2</th>\n", - " <th>id1</th>\n", - " <th>id2</th>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>3</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>6</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>9</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>9</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " <td>0</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " <td>0</td>\n", - " <td>5</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>2</td>\n", - " <td>30</td>\n", - " <td>30</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>2</td>\n", - " <td>40</td>\n", - " <td>40</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>2</td>\n", - " <td>50</td>\n", - " <td>50</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>2</td>\n", - " <td>70</td>\n", - " <td>70</td>\n", - " </tr>\n", - "</table>" - ], - "text/plain": [ - "[(1, 1, 1, 0),\n", - " (1, 1, 2, 0),\n", - " (1, 1, 3, 0),\n", - " (1, 1, 6, 0),\n", - " (1, 1, 9, 0),\n", - " (1, 1, 9, 0),\n", - " (1, 2, 0, 1),\n", - " (1, 2, 0, 2),\n", - " (1, 2, 0, 5),\n", - " (2, 2, 30, 30),\n", - " (2, 2, 40, 40),\n", - " (2, 2, 50, 50),\n", - " (2, 2, 70, 70)]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%%sql\n", - "DROP TABLE IF EXISTS out;\n", - "\n", - "SELECT madlib.stratified_sample(\n", - " 'test', -- Source table\n", - " 'out', -- Output table\n", - " 0.5, -- Sample proportion\n", - " 'gr1,gr2', -- Strata definition\n", - " 'id1,id2', -- Columns to output\n", - " FALSE); -- Sample without replacement\n", - "\n", - "SELECT * FROM out ORDER BY gr1,gr2,id1,id2;" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 3. Sample with replacement" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n", - "1 rows affected.\n", - "13 rows affected.\n" - ] - }, - { - "data": { - "text/html": [ - "<table>\n", - " <tr>\n", - " <th>gr1</th>\n", - " <th>gr2</th>\n", - " <th>id1</th>\n", - " <th>id2</th>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>3</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>4</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>8</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>9</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " <td>0</td>\n", - " <td>5</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " <td>0</td>\n", - " <td>5</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " <td>0</td>\n", - " <td>6</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>2</td>\n", - " <td>20</td>\n", - " <td>20</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>2</td>\n", - " <td>20</td>\n", - " <td>20</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>2</td>\n", - " <td>40</td>\n", - " <td>40</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>2</td>\n", - " <td>40</td>\n", - " <td>40</td>\n", - " </tr>\n", - "</table>" - ], - "text/plain": [ - "[(1, 1, 1, 0),\n", - " (1, 1, 2, 0),\n", - " (1, 1, 3, 0),\n", - " (1, 1, 4, 0),\n", - " (1, 1, 8, 0),\n", - " (1, 1, 9, 0),\n", - " (1, 2, 0, 5),\n", - " (1, 2, 0, 5),\n", - " (1, 2, 0, 6),\n", - " (2, 2, 20, 20),\n", - " (2, 2, 20, 20),\n", - " (2, 2, 40, 40),\n", - " (2, 2, 40, 40)]" - ] - }, - "execution_count": 71, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%%sql\n", - "DROP TABLE IF EXISTS out;\n", - "\n", - "SELECT madlib.stratified_sample(\n", - " 'test', -- Source table\n", - " 'out', -- Output table\n", - " 0.5, -- Sample proportion\n", - " 'gr1,gr2', -- Strata definition\n", - " 'id1,id2', -- Columns to output\n", - " TRUE); -- Sample with replacement\n", - "\n", - "SELECT * FROM out ORDER BY gr1,gr2,id1,id2;" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -}