http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/Balanced-sampling-v1.ipynb ---------------------------------------------------------------------- diff --git a/community-artifacts/Balanced-sampling-v1.ipynb b/community-artifacts/Balanced-sampling-v1.ipynb new file mode 100644 index 0000000..5f6ec23 --- /dev/null +++ b/community-artifacts/Balanced-sampling-v1.ipynb @@ -0,0 +1,3706 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Balanced sampling\n", + "\n", + "This module offers a number of re-sampling techniques including under-sampling majority classes, over-sampling minority classes, and combinations of the two.\n", + "\n", + "Balanced sampling was added in MADlib 1.14." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The sql extension is already loaded. To reload it, use:\n", + " %reload_ext sql\n" + ] + } + ], + "source": [ + "%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "u'Connected: gpadmin@madlib'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Greenplum Database 5.4.0 on GCP (demo machine)\n", + "%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", + " \n", + "# PostgreSQL local\n", + "#%sql postgresql://fmcquillan@localhost:5432/madlib\n", + "\n", + "# Greenplum Database 4.3.10.0\n", + "#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>version</th>\n", + " </tr>\n", + " <tr>\n", + " <td>MADlib version: 1.14-dev, git revision: rc/1.13-rc1-22-g0bfcaf5, cmake configuration time: Wed Mar 14 21:35:16 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-22-g0bfcaf5, cmake configuration time: Wed Mar 14 21:35:16 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%sql select madlib.version();\n", + "#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Load data\n", + "Based in part on the flags data set from https://archive.ics.uci.edu/ml/datasets/Flags" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "22 rows affected.\n", + "22 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>name</th>\n", + " <th>landmass</th>\n", + " <th>zone</th>\n", + " <th>area</th>\n", + " <th>population</th>\n", + " <th>language</th>\n", + " <th>colours</th>\n", + " <th>mainhue</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>Argentina</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>2777</td>\n", + " <td>28</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>Australia</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>7690</td>\n", + " <td>15</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>Greece</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>132</td>\n", + " <td>10</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>Guatemala</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>109</td>\n", + " <td>8</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>17</td>\n", + " <td>Sweden</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>450</td>\n", + " <td>8</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>Brazil</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>8512</td>\n", + " <td>119</td>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>Jamaica</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>11</td>\n", + " <td>2</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>Mexico</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>1973</td>\n", + " <td>77</td>\n", + " <td>2</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>Austria</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>84</td>\n", + " <td>8</td>\n", + " <td>4</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>Canada</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>9976</td>\n", + " <td>24</td>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>China</td>\n", + " <td>5</td>\n", + " <td>1</td>\n", + " <td>9561</td>\n", + " <td>1008</td>\n", + " <td>7</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>Denmark</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>43</td>\n", + " <td>5</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>Luxembourg</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>0</td>\n", + " <td>4</td>\n", + " <td>3</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>Norway</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>324</td>\n", + " <td>4</td>\n", + " <td>6</td>\n", + " <td>3</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>15</td>\n", + " <td>Portugal</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>92</td>\n", + " <td>10</td>\n", + " <td>6</td>\n", + " <td>5</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>16</td>\n", + " <td>Spain</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>505</td>\n", + " <td>38</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>18</td>\n", + " <td>Switzerland</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>41</td>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>19</td>\n", + " <td>UK</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>245</td>\n", + " <td>56</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>Ireland</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>70</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + " <tr>\n", + " <td>20</td>\n", + " <td>USA</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>9363</td>\n", + " <td>231</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + " <tr>\n", + " <td>21</td>\n", + " <td>xElba</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>6</td>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <td>22</td>\n", + " <td>xPrussia</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>249</td>\n", + " <td>61</td>\n", + " <td>4</td>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n", + " (2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n", + " (8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n", + " (9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", + " (17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", + " (4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", + " (11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n", + " (13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n", + " (3, u'Austria', 3, 1, 84, 8, 4, 2, u'red'),\n", + " (5, u'Canada', 1, 4, 9976, 24, 1, 2, u'red'),\n", + " (6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n", + " (7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n", + " (12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n", + " (14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n", + " (15, u'Portugal', 3, 4, 92, 10, 6, 5, u'red'),\n", + " (16, u'Spain', 3, 4, 505, 38, 2, 2, u'red'),\n", + " (18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n", + " (19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n", + " (10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", + " (20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n", + " (21, u'xElba', 3, 1, 1, 1, 6, None, None),\n", + " (22, u'xPrussia', 3, 1, 249, 61, 4, None, None)]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS flags;\n", + "\n", + "CREATE TABLE flags (\n", + " id INTEGER,\n", + " name TEXT,\n", + " landmass INTEGER,\n", + " zone INTEGER,\n", + " area INTEGER,\n", + " population INTEGER,\n", + " language INTEGER,\n", + " colours INTEGER,\n", + " mainhue TEXT\n", + ");\n", + "\n", + "INSERT INTO flags VALUES\n", + "(1, 'Argentina', 2, 3, 2777, 28, 2, 2, 'blue'),\n", + "(2, 'Australia', 6, 2, 7690, 15, 1, 3, 'blue'),\n", + "(3, 'Austria', 3, 1, 84, 8, 4, 2, 'red'),\n", + "(4, 'Brazil', 2, 3, 8512, 119, 6, 4, 'green'),\n", + "(5, 'Canada', 1, 4, 9976, 24, 1, 2, 'red'),\n", + "(6, 'China', 5, 1, 9561, 1008, 7, 2, 'red'),\n", + "(7, 'Denmark', 3, 1, 43, 5, 6, 2, 'red'),\n", + "(8, 'Greece', 3, 1, 132, 10, 6, 2, 'blue'),\n", + "(9, 'Guatemala', 1, 4, 109, 8, 2, 2, 'blue'),\n", + "(10, 'Ireland', 3, 4, 70, 3, 1, 3, 'white'),\n", + "(11, 'Jamaica', 1, 4, 11, 2, 1, 3, 'green'),\n", + "(12, 'Luxembourg', 3, 1, 3, 0, 4, 3, 'red'),\n", + "(13, 'Mexico', 1, 4, 1973, 77, 2, 4, 'green'),\n", + "(14, 'Norway', 3, 1, 324, 4, 6, 3, 'red'),\n", + "(15, 'Portugal', 3, 4, 92, 10, 6, 5, 'red'),\n", + "(16, 'Spain', 3, 4, 505, 38, 2, 2, 'red'),\n", + "(17, 'Sweden', 3, 1, 450, 8, 6, 2, 'blue'),\n", + "(18, 'Switzerland', 3, 1, 41, 6, 4, 2, 'red'),\n", + "(19, 'UK', 3, 4, 245, 56, 1, 3, 'red'),\n", + "(20, 'USA', 1, 4, 9363, 231, 1, 3, 'white'),\n", + "(21, 'xElba', 3, 1, 1, 1, 6, NULL, NULL),\n", + "(22, 'xPrussia', 3, 1, 249, 61, 4, NULL, NULL);\n", + "\n", + "SELECT * FROM flags ORDER BY mainhue, name;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Uniform sampling \n", + "\n", + "All class values will be resampled so that they have the same number of rows. The output data size will be the same as the input data size, ignoring NULL values. Uniform sampling is the default for the 'class_size' parameter so we do not need to explicitly set it: " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "20 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>__madlib_id__</th>\n", + " <th>id</th>\n", + " <th>name</th>\n", + " <th>landmass</th>\n", + " <th>zone</th>\n", + " <th>area</th>\n", + " <th>population</th>\n", + " <th>language</th>\n", + " <th>colours</th>\n", + " <th>mainhue</th>\n", + " </tr>\n", + " <tr>\n", + " <td>19</td>\n", + " <td>1</td>\n", + " <td>Argentina</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>2777</td>\n", + " <td>28</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>18</td>\n", + " <td>2</td>\n", + " <td>Australia</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>7690</td>\n", + " <td>15</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>20</td>\n", + " <td>8</td>\n", + " <td>Greece</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>132</td>\n", + " <td>10</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>17</td>\n", + " <td>9</td>\n", + " <td>Guatemala</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>109</td>\n", + " <td>8</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>16</td>\n", + " <td>17</td>\n", + " <td>Sweden</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>450</td>\n", + " <td>8</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>Brazil</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>8512</td>\n", + " <td>119</td>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>4</td>\n", + " <td>Brazil</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>8512</td>\n", + " <td>119</td>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>13</td>\n", + " <td>Mexico</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>1973</td>\n", + " <td>77</td>\n", + " <td>2</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>13</td>\n", + " <td>Mexico</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>1973</td>\n", + " <td>77</td>\n", + " <td>2</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>13</td>\n", + " <td>Mexico</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>1973</td>\n", + " <td>77</td>\n", + " <td>2</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>6</td>\n", + " <td>China</td>\n", + " <td>5</td>\n", + " <td>1</td>\n", + " <td>9561</td>\n", + " <td>1008</td>\n", + " <td>7</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>7</td>\n", + " <td>Denmark</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>43</td>\n", + " <td>5</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>12</td>\n", + " <td>Luxembourg</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>0</td>\n", + " <td>4</td>\n", + " <td>3</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>18</td>\n", + " <td>Switzerland</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>41</td>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>19</td>\n", + " <td>UK</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>245</td>\n", + " <td>56</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>15</td>\n", + " <td>10</td>\n", + " <td>Ireland</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>70</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>10</td>\n", + " <td>Ireland</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>70</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>20</td>\n", + " <td>USA</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>9363</td>\n", + " <td>231</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>20</td>\n", + " <td>USA</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>9363</td>\n", + " <td>231</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>20</td>\n", + " <td>USA</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>9363</td>\n", + " <td>231</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(19L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n", + " (18L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n", + " (20L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n", + " (17L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", + " (16L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", + " (6L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", + " (7L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", + " (9L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n", + " (10L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n", + " (8L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n", + " (1L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n", + " (4L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n", + " (2L, 12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n", + " (3L, 18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n", + " (5L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n", + " (15L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", + " (14L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", + " (12L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n", + " (13L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n", + " (11L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS output_table;\n", + "\n", + "SELECT madlib.balance_sample(\n", + " 'flags', -- Source table\n", + " 'output_table', -- Output table\n", + " 'mainhue'); -- Class column\n", + " \n", + "SELECT * FROM output_table ORDER BY mainhue, name;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next we do uniform sampling again, but this time we specify a size for the output table:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "12 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>__madlib_id__</th>\n", + " <th>id</th>\n", + " <th>name</th>\n", + " <th>landmass</th>\n", + " <th>zone</th>\n", + " <th>area</th>\n", + " <th>population</th>\n", + " <th>language</th>\n", + " <th>colours</th>\n", + " <th>mainhue</th>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>8</td>\n", + " <td>Greece</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>132</td>\n", + " <td>10</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>9</td>\n", + " <td>Guatemala</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>109</td>\n", + " <td>8</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>17</td>\n", + " <td>Sweden</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>450</td>\n", + " <td>8</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>4</td>\n", + " <td>Brazil</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>8512</td>\n", + " <td>119</td>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>11</td>\n", + " <td>Jamaica</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>11</td>\n", + " <td>2</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>13</td>\n", + " <td>Mexico</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>1973</td>\n", + " <td>77</td>\n", + " <td>2</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>5</td>\n", + " <td>Canada</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>9976</td>\n", + " <td>24</td>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>12</td>\n", + " <td>Luxembourg</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>0</td>\n", + " <td>4</td>\n", + " <td>3</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>18</td>\n", + " <td>Switzerland</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>41</td>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>10</td>\n", + " <td>Ireland</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>70</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>20</td>\n", + " <td>USA</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>9363</td>\n", + " <td>231</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>20</td>\n", + " <td>USA</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>9363</td>\n", + " <td>231</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(3L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n", + " (2L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", + " (1L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", + " (4L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", + " (6L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n", + " (5L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n", + " (9L, 5, u'Canada', 1, 4, 9976, 24, 1, 2, u'red'),\n", + " (8L, 12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n", + " (7L, 18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n", + " (12L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", + " (10L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n", + " (11L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS output_table;\n", + "\n", + "SELECT madlib.balance_sample(\n", + " 'flags', -- Source table\n", + " 'output_table', -- Output table\n", + " 'mainhue', -- Class column\n", + " 'uniform', -- Uniform sample\n", + " 12); -- Desired output table size\n", + "\n", + "SELECT * FROM output_table ORDER BY mainhue, name;" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "scrolled": true + }, + "source": [ + "# 3. Oversampling\n", + "Oversample with replacement such that all class values except NULLs end up with the same number of observations as the majority class. Countries with red flags is the majority class with 10 observations, so other class values will be oversampled to 10 observations:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "40 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>__madlib_id__</th>\n", + " <th>id</th>\n", + " <th>name</th>\n", + " <th>landmass</th>\n", + " <th>zone</th>\n", + " <th>area</th>\n", + " <th>population</th>\n", + " <th>language</th>\n", + " <th>colours</th>\n", + " <th>mainhue</th>\n", + " </tr>\n", + " <tr>\n", + " <td>37</td>\n", + " <td>2</td>\n", + " <td>Australia</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>7690</td>\n", + " <td>15</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>39</td>\n", + " <td>2</td>\n", + " <td>Australia</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>7690</td>\n", + " <td>15</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>38</td>\n", + " <td>2</td>\n", + " <td>Australia</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>7690</td>\n", + " <td>15</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>40</td>\n", + " <td>8</td>\n", + " <td>Greece</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>132</td>\n", + " <td>10</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>32</td>\n", + " <td>9</td>\n", + " <td>Guatemala</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>109</td>\n", + " <td>8</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>31</td>\n", + " <td>9</td>\n", + " <td>Guatemala</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>109</td>\n", + " <td>8</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>35</td>\n", + " <td>17</td>\n", + " <td>Sweden</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>450</td>\n", + " <td>8</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>33</td>\n", + " <td>17</td>\n", + " <td>Sweden</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>450</td>\n", + " <td>8</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>36</td>\n", + " <td>17</td>\n", + " <td>Sweden</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>450</td>\n", + " <td>8</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>34</td>\n", + " <td>17</td>\n", + " <td>Sweden</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>450</td>\n", + " <td>8</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>18</td>\n", + " <td>4</td>\n", + " <td>Brazil</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>8512</td>\n", + " <td>119</td>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>16</td>\n", + " <td>4</td>\n", + " <td>Brazil</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>8512</td>\n", + " <td>119</td>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>17</td>\n", + " <td>4</td>\n", + " <td>Brazil</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>8512</td>\n", + " <td>119</td>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>15</td>\n", + " <td>4</td>\n", + " <td>Brazil</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>8512</td>\n", + " <td>119</td>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>19</td>\n", + " <td>4</td>\n", + " <td>Brazil</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>8512</td>\n", + " <td>119</td>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>11</td>\n", + " <td>Jamaica</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>11</td>\n", + " <td>2</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>11</td>\n", + " <td>Jamaica</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>11</td>\n", + " <td>2</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>11</td>\n", + " <td>Jamaica</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>11</td>\n", + " <td>2</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>11</td>\n", + " <td>Jamaica</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>11</td>\n", + " <td>2</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>20</td>\n", + " <td>13</td>\n", + " <td>Mexico</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>1973</td>\n", + " <td>77</td>\n", + " <td>2</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>3</td>\n", + " <td>Austria</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>84</td>\n", + " <td>8</td>\n", + " <td>4</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>5</td>\n", + " <td>Canada</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>9976</td>\n", + " <td>24</td>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>6</td>\n", + " <td>China</td>\n", + " <td>5</td>\n", + " <td>1</td>\n", + " <td>9561</td>\n", + " <td>1008</td>\n", + " <td>7</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>7</td>\n", + " <td>Denmark</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>43</td>\n", + " <td>5</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>12</td>\n", + " <td>Luxembourg</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>0</td>\n", + " <td>4</td>\n", + " <td>3</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>14</td>\n", + " <td>Norway</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>324</td>\n", + " <td>4</td>\n", + " <td>6</td>\n", + " <td>3</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>15</td>\n", + " <td>Portugal</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>92</td>\n", + " <td>10</td>\n", + " <td>6</td>\n", + " <td>5</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>16</td>\n", + " <td>Spain</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>505</td>\n", + " <td>38</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>18</td>\n", + " <td>Switzerland</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>41</td>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>19</td>\n", + " <td>UK</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>245</td>\n", + " <td>56</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>27</td>\n", + " <td>10</td>\n", + " <td>Ireland</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>70</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + " <tr>\n", + " <td>29</td>\n", + " <td>10</td>\n", + " <td>Ireland</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>70</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + " <tr>\n", + " <td>28</td>\n", + " <td>10</td>\n", + " <td>Ireland</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>70</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + " <tr>\n", + " <td>26</td>\n", + " <td>10</td>\n", + " <td>Ireland</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>70</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + " <tr>\n", + " <td>30</td>\n", + " <td>10</td>\n", + " <td>Ireland</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>70</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + " <tr>\n", + " <td>22</td>\n", + " <td>20</td>\n", + " <td>USA</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>9363</td>\n", + " <td>231</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + " <tr>\n", + " <td>24</td>\n", + " <td>20</td>\n", + " <td>USA</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>9363</td>\n", + " <td>231</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + " <tr>\n", + " <td>23</td>\n", + " <td>20</td>\n", + " <td>USA</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>9363</td>\n", + " <td>231</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + " <tr>\n", + " <td>25</td>\n", + " <td>20</td>\n", + " <td>USA</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>9363</td>\n", + " <td>231</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + " <tr>\n", + " <td>21</td>\n", + " <td>20</td>\n", + " <td>USA</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>9363</td>\n", + " <td>231</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(37L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n", + " (39L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n", + " (38L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n", + " (40L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n", + " (32L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", + " (31L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", + " (35L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", + " (33L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", + " (36L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", + " (34L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", + " (18L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", + " (16L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", + " (17L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", + " (15L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", + " (19L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", + " (11L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n", + " (13L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n", + " (14L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n", + " (12L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n", + " (20L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n", + " (8L, 3, u'Austria', 3, 1, 84, 8, 4, 2, u'red'),\n", + " (6L, 5, u'Canada', 1, 4, 9976, 24, 1, 2, u'red'),\n", + " (2L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n", + " (5L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n", + " (1L, 12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n", + " (9L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n", + " (4L, 15, u'Portugal', 3, 4, 92, 10, 6, 5, u'red'),\n", + " (10L, 16, u'Spain', 3, 4, 505, 38, 2, 2, u'red'),\n", + " (3L, 18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n", + " (7L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n", + " (27L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", + " (29L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", + " (28L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", + " (26L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", + " (30L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", + " (22L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n", + " (24L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n", + " (23L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n", + " (25L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white'),\n", + " (21L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS output_table;\n", + "\n", + "SELECT madlib.balance_sample(\n", + " 'flags', -- Source table\n", + " 'output_table', -- Output table\n", + " 'mainhue', -- Class column\n", + " 'oversample'); -- Oversample\n", + "\n", + "SELECT * FROM output_table ORDER BY mainhue, name;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 4. Undersampling\n", + "Undersample such that all class values except NULLs end up with the same number of observations as the minority class. Countries with white flags is the minority class with 2 observations, so other class values will be undersampled to 2 observations:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "8 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>__madlib_id__</th>\n", + " <th>id</th>\n", + " <th>name</th>\n", + " <th>landmass</th>\n", + " <th>zone</th>\n", + " <th>area</th>\n", + " <th>population</th>\n", + " <th>language</th>\n", + " <th>colours</th>\n", + " <th>mainhue</th>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>1</td>\n", + " <td>Argentina</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>2777</td>\n", + " <td>28</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>9</td>\n", + " <td>Guatemala</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>109</td>\n", + " <td>8</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>4</td>\n", + " <td>Brazil</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>8512</td>\n", + " <td>119</td>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>11</td>\n", + " <td>Jamaica</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>11</td>\n", + " <td>2</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>14</td>\n", + " <td>Norway</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>324</td>\n", + " <td>4</td>\n", + " <td>6</td>\n", + " <td>3</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>19</td>\n", + " <td>UK</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>245</td>\n", + " <td>56</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>10</td>\n", + " <td>Ireland</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>70</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>20</td>\n", + " <td>USA</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>9363</td>\n", + " <td>231</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(2L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n", + " (1L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", + " (4L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", + " (3L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n", + " (6L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n", + " (5L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n", + " (7L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", + " (8L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS output_table;\n", + "\n", + "SELECT madlib.balance_sample(\n", + " 'flags', -- Source table\n", + " 'output_table', -- Output table\n", + " 'mainhue', -- Class column\n", + " 'undersample'); -- Undersample\n", + "\n", + "SELECT * FROM output_table ORDER BY mainhue, name;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the case of bootstrapping, we may want to undersample with replacement, so we set the 'with_replacement' parameter to TRUE:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "8 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>__madlib_id__</th>\n", + " <th>id</th>\n", + " <th>name</th>\n", + " <th>landmass</th>\n", + " <th>zone</th>\n", + " <th>area</th>\n", + " <th>population</th>\n", + " <th>language</th>\n", + " <th>colours</th>\n", + " <th>mainhue</th>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>9</td>\n", + " <td>Guatemala</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>109</td>\n", + " <td>8</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>17</td>\n", + " <td>Sweden</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>450</td>\n", + " <td>8</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>4</td>\n", + " <td>Brazil</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>8512</td>\n", + " <td>119</td>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>Brazil</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>8512</td>\n", + " <td>119</td>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>3</td>\n", + " <td>Austria</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>84</td>\n", + " <td>8</td>\n", + " <td>4</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>7</td>\n", + " <td>Denmark</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>43</td>\n", + " <td>5</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>10</td>\n", + " <td>Ireland</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>70</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>20</td>\n", + " <td>USA</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>9363</td>\n", + " <td>231</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(7L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", + " (8L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", + " (2L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", + " (1L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", + " (4L, 3, u'Austria', 3, 1, 84, 8, 4, 2, u'red'),\n", + " (3L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n", + " (6L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", + " (5L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS output_table;\n", + "\n", + "SELECT madlib.balance_sample(\n", + " 'flags', -- Source table\n", + " 'output_table', -- Output table\n", + " 'mainhue', -- Class column\n", + " 'undersample', -- Undersample\n", + " NULL, -- Output table size will be calculated\n", + " NULL, -- No grouping\n", + " 'TRUE'); -- Sample with replacement\n", + "\n", + "SELECT * FROM output_table ORDER BY mainhue, name;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note above that some rows may appear multiple times above since we sampled with replacement." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 5. Setting class size by count\n", + "\n", + "Here we set the number of rows for red and blue flags, and leave green and white flags unchanged:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "19 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>__madlib_id__</th>\n", + " <th>id</th>\n", + " <th>name</th>\n", + " <th>landmass</th>\n", + " <th>zone</th>\n", + " <th>area</th>\n", + " <th>population</th>\n", + " <th>language</th>\n", + " <th>colours</th>\n", + " <th>mainhue</th>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>1</td>\n", + " <td>Argentina</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>2777</td>\n", + " <td>28</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>1</td>\n", + " <td>Argentina</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>2777</td>\n", + " <td>28</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>1</td>\n", + " <td>Argentina</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>2777</td>\n", + " <td>28</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>16</td>\n", + " <td>8</td>\n", + " <td>Greece</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>132</td>\n", + " <td>10</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>9</td>\n", + " <td>Guatemala</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>109</td>\n", + " <td>8</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>9</td>\n", + " <td>Guatemala</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>109</td>\n", + " <td>8</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>15</td>\n", + " <td>17</td>\n", + " <td>Sweden</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>450</td>\n", + " <td>8</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>17</td>\n", + " <td>4</td>\n", + " <td>Brazil</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>8512</td>\n", + " <td>119</td>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>11</td>\n", + " <td>Jamaica</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>11</td>\n", + " <td>2</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>13</td>\n", + " <td>Mexico</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>1973</td>\n", + " <td>77</td>\n", + " <td>2</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>Austria</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>84</td>\n", + " <td>8</td>\n", + " <td>4</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>6</td>\n", + " <td>China</td>\n", + " <td>5</td>\n", + " <td>1</td>\n", + " <td>9561</td>\n", + " <td>1008</td>\n", + " <td>7</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>7</td>\n", + " <td>Denmark</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>43</td>\n", + " <td>5</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>12</td>\n", + " <td>Luxembourg</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>0</td>\n", + " <td>4</td>\n", + " <td>3</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>14</td>\n", + " <td>Norway</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>324</td>\n", + " <td>4</td>\n", + " <td>6</td>\n", + " <td>3</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>16</td>\n", + " <td>Spain</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>505</td>\n", + " <td>38</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>19</td>\n", + " <td>UK</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>245</td>\n", + " <td>56</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>18</td>\n", + " <td>10</td>\n", + " <td>Ireland</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>70</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + " <tr>\n", + " <td>19</td>\n", + " <td>20</td>\n", + " <td>USA</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>9363</td>\n", + " <td>231</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(11L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n", + " (12L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n", + " (10L, 1, u'Argentina', 2, 3, 2777, 28, 2, 2, u'blue'),\n", + " (16L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n", + " (14L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", + " (13L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", + " (15L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", + " (17L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", + " (8L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n", + " (9L, 13, u'Mexico', 1, 4, 1973, 77, 2, 4, u'green'),\n", + " (2L, 3, u'Austria', 3, 1, 84, 8, 4, 2, u'red'),\n", + " (3L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n", + " (1L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n", + " (6L, 12, u'Luxembourg', 3, 1, 3, 0, 4, 3, u'red'),\n", + " (7L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n", + " (4L, 16, u'Spain', 3, 4, 505, 38, 2, 2, u'red'),\n", + " (5L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n", + " (18L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", + " (19L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS output_table;\n", + "\n", + "SELECT madlib.balance_sample(\n", + " 'flags', -- Source table\n", + " 'output_table', -- Output table\n", + " 'mainhue', -- Class column\n", + " 'red=7, blue=7'); -- Want 7 reds and 7 blues\n", + "\n", + "SELECT * FROM output_table ORDER BY mainhue, name;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next we set the number of rows for red and blue flags, and also set an output table size. This means that green and white flags will be uniformly sampled to get to the desired output table size:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "22 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>__madlib_id__</th>\n", + " <th>id</th>\n", + " <th>name</th>\n", + " <th>landmass</th>\n", + " <th>zone</th>\n", + " <th>area</th>\n", + " <th>population</th>\n", + " <th>language</th>\n", + " <th>colours</th>\n", + " <th>mainhue</th>\n", + " </tr>\n", + " <tr>\n", + " <td>21</td>\n", + " <td>2</td>\n", + " <td>Australia</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>7690</td>\n", + " <td>15</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>22</td>\n", + " <td>8</td>\n", + " <td>Greece</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>132</td>\n", + " <td>10</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>16</td>\n", + " <td>9</td>\n", + " <td>Guatemala</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>109</td>\n", + " <td>8</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>17</td>\n", + " <td>9</td>\n", + " <td>Guatemala</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>109</td>\n", + " <td>8</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>19</td>\n", + " <td>17</td>\n", + " <td>Sweden</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>450</td>\n", + " <td>8</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>20</td>\n", + " <td>17</td>\n", + " <td>Sweden</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>450</td>\n", + " <td>8</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>18</td>\n", + " <td>17</td>\n", + " <td>Sweden</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>450</td>\n", + " <td>8</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>4</td>\n", + " <td>Brazil</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>8512</td>\n", + " <td>119</td>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>4</td>\n", + " <td>Brazil</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>8512</td>\n", + " <td>119</td>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>4</td>\n", + " <td>Brazil</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>8512</td>\n", + " <td>119</td>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>11</td>\n", + " <td>Jamaica</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>11</td>\n", + " <td>2</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>6</td>\n", + " <td>China</td>\n", + " <td>5</td>\n", + " <td>1</td>\n", + " <td>9561</td>\n", + " <td>1008</td>\n", + " <td>7</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>7</td>\n", + " <td>Denmark</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>43</td>\n", + " <td>5</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>14</td>\n", + " <td>Norway</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>324</td>\n", + " <td>4</td>\n", + " <td>6</td>\n", + " <td>3</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>15</td>\n", + " <td>Portugal</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>92</td>\n", + " <td>10</td>\n", + " <td>6</td>\n", + " <td>5</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>16</td>\n", + " <td>Spain</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>505</td>\n", + " <td>38</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>18</td>\n", + " <td>Switzerland</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>41</td>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>2</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>19</td>\n", + " <td>UK</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>245</td>\n", + " <td>56</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>red</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>10</td>\n", + " <td>Ireland</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>70</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + " <tr>\n", + " <td>15</td>\n", + " <td>10</td>\n", + " <td>Ireland</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>70</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>10</td>\n", + " <td>Ireland</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>70</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>20</td>\n", + " <td>USA</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>9363</td>\n", + " <td>231</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>white</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(21L, 2, u'Australia', 6, 2, 7690, 15, 1, 3, u'blue'),\n", + " (22L, 8, u'Greece', 3, 1, 132, 10, 6, 2, u'blue'),\n", + " (16L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", + " (17L, 9, u'Guatemala', 1, 4, 109, 8, 2, 2, u'blue'),\n", + " (19L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", + " (20L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", + " (18L, 17, u'Sweden', 3, 1, 450, 8, 6, 2, u'blue'),\n", + " (10L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", + " (9L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", + " (11L, 4, u'Brazil', 2, 3, 8512, 119, 6, 4, u'green'),\n", + " (8L, 11, u'Jamaica', 1, 4, 11, 2, 1, 3, u'green'),\n", + " (7L, 6, u'China', 5, 1, 9561, 1008, 7, 2, u'red'),\n", + " (2L, 7, u'Denmark', 3, 1, 43, 5, 6, 2, u'red'),\n", + " (4L, 14, u'Norway', 3, 1, 324, 4, 6, 3, u'red'),\n", + " (6L, 15, u'Portugal', 3, 4, 92, 10, 6, 5, u'red'),\n", + " (3L, 16, u'Spain', 3, 4, 505, 38, 2, 2, u'red'),\n", + " (1L, 18, u'Switzerland', 3, 1, 41, 6, 4, 2, u'red'),\n", + " (5L, 19, u'UK', 3, 4, 245, 56, 1, 3, u'red'),\n", + " (13L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", + " (15L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", + " (14L, 10, u'Ireland', 3, 4, 70, 3, 1, 3, u'white'),\n", + " (12L, 20, u'USA', 1, 4, 9363, 231, 1, 3, u'white')]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS output_table;\n", + "\n", + "SELECT madlib.balance_sample(\n", + " 'flags', -- Source table\n", + " 'output_table', -- Output table\n", + " 'mainhue', -- Class column\n", + " 'red=7, blue=7', -- Want 7 reds and 7 blues\n", + " 22); -- Desired output table size\n", + "\n", + "SELECT * FROM output_table ORDER BY mainhue, name;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 6. NULL handling\n", + "\n", + "To make NULL a valid class value, set the parameter to keep NULLs:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "25 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>__madlib_id__</th>\n", + " <th>id</th>\n", + " <th>name</th>\n", + " <th>landmass</th>\n", + " <th>zone</th>\n", + " <th>area</th>\n", + " <th>population</th>\n", + " <th>language</th>\n", + " <th>colours</th>\n", + " <th>mainhue</th>\n", + " </tr>\n", + " <tr>\n", + " <td>24</td>\n", + " <td>1</td>\n", + " <td>Argentina</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>2777</td>\n", + " <td>28</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>25</td>\n", + " <td>2</td>\n", + " <td>Australia</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>7690</td>\n", + " <td>15</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>22</td>\n", + " <td>8</td>\n", + " <td>Greece</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>132</td>\n", + " <td>10</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>21</td>\n", + " <td>9</td>\n", + " <td>Guatemala</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>109</td>\n", + " <td>8</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>23</td>\n", + " <td>17</td>\n", + " <td>Sweden</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>450</td>\n", + " <td>8</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " <td>blue</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>4</td>\n", + " <td>Brazil</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>8512</td>\n", + " <td>119</td>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>4</td>\n", + " <td>Brazil</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>8512</td>\n", + " <td>119</td>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>Brazil</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>8512</td>\n", + " <td>119</td>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>11</td>\n", + " <td>Jamaica</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>11</td>\n", + " <td>2</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>13</td>\n", + " <td>Mexico</td>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>1973</td>\n", + " <td>77</td>\n", + " <td>2</td>\n", + " <td>4</td>\n", + " <td>green</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>5</td>\n", + " <
<TRUNCATED>