http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/Encoding-categorical-variables-v2.ipynb ---------------------------------------------------------------------- diff --git a/community-artifacts/Encoding-categorical-variables-v2.ipynb b/community-artifacts/Encoding-categorical-variables-v2.ipynb new file mode 100644 index 0000000..5e4cb6f --- /dev/null +++ b/community-artifacts/Encoding-categorical-variables-v2.ipynb @@ -0,0 +1,4026 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Encoding categorical variables\n", + "This is the new module that replaces create_indicator_variables() which was deprecated as of MADlib v1.10" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", + " \"You should import from traitlets.config instead.\", ShimWarning)\n", + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", + " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" + ] + } + ], + "source": [ + "%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "u'Connected: gpadmin@madlib'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Greenplum Database 5.4.0 on GCP (demo machine)\n", + "%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", + " \n", + "# PostgreSQL local\n", + "#%sql postgresql://fmcquillan@localhost:5432/madlib\n", + "\n", + "# Greenplum Database 4.3.10.0\n", + "#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>version</th>\n", + " </tr>\n", + " <tr>\n", + " <td>MADlib version: 1.14-dev, git revision: rc/1.13-rc1-21-g3af2d70, cmake configuration time: Mon Feb 26 18:00:54 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-21-g3af2d70, cmake configuration time: Mon Feb 26 18:00:54 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%sql select madlib.version();\n", + "#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Load data set\n", + "Use a subset of the abalone dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "20 rows affected.\n", + "20 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>sex</th>\n", + " <th>length</th>\n", + " <th>diameter</th>\n", + " <th>height</th>\n", + " <th>rings</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>M</td>\n", + " <td>0.455</td>\n", + " <td>0.365</td>\n", + " <td>0.095</td>\n", + " <td>15</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>M</td>\n", + " <td>0.35</td>\n", + " <td>0.265</td>\n", + " <td>0.09</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>F</td>\n", + " <td>0.53</td>\n", + " <td>0.42</td>\n", + " <td>0.135</td>\n", + " <td>9</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>M</td>\n", + " <td>0.44</td>\n", + " <td>0.365</td>\n", + " <td>0.125</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>I</td>\n", + " <td>0.33</td>\n", + " <td>0.255</td>\n", + " <td>0.08</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>I</td>\n", + " <td>0.425</td>\n", + " <td>0.3</td>\n", + " <td>0.095</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>F</td>\n", + " <td>0.53</td>\n", + " <td>0.415</td>\n", + " <td>0.15</td>\n", + " <td>20</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>F</td>\n", + " <td>0.545</td>\n", + " <td>0.425</td>\n", + " <td>0.125</td>\n", + " <td>16</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>M</td>\n", + " <td>0.475</td>\n", + " <td>0.37</td>\n", + " <td>0.125</td>\n", + " <td>9</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>None</td>\n", + " <td>0.55</td>\n", + " <td>0.44</td>\n", + " <td>0.15</td>\n", + " <td>19</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>F</td>\n", + " <td>0.525</td>\n", + " <td>0.38</td>\n", + " <td>0.14</td>\n", + " <td>14</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>M</td>\n", + " <td>0.43</td>\n", + " <td>0.35</td>\n", + " <td>0.11</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>M</td>\n", + " <td>0.49</td>\n", + " <td>0.38</td>\n", + " <td>0.135</td>\n", + " <td>11</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>F</td>\n", + " <td>0.535</td>\n", + " <td>0.405</td>\n", + " <td>0.145</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <td>15</td>\n", + " <td>F</td>\n", + " <td>0.47</td>\n", + " <td>0.355</td>\n", + " <td>0.1</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <td>16</td>\n", + " <td>M</td>\n", + " <td>0.5</td>\n", + " <td>0.4</td>\n", + " <td>0.13</td>\n", + " <td>12</td>\n", + " </tr>\n", + " <tr>\n", + " <td>17</td>\n", + " <td>I</td>\n", + " <td>0.355</td>\n", + " <td>0.28</td>\n", + " <td>0.085</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <td>18</td>\n", + " <td>F</td>\n", + " <td>0.44</td>\n", + " <td>0.34</td>\n", + " <td>0.1</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <td>19</td>\n", + " <td>M</td>\n", + " <td>0.365</td>\n", + " <td>0.295</td>\n", + " <td>0.08</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <td>20</td>\n", + " <td>None</td>\n", + " <td>0.45</td>\n", + " <td>0.32</td>\n", + " <td>0.1</td>\n", + " <td>9</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, u'M', 0.455, 0.365, 0.095, 15),\n", + " (2, u'M', 0.35, 0.265, 0.09, 7),\n", + " (3, u'F', 0.53, 0.42, 0.135, 9),\n", + " (4, u'M', 0.44, 0.365, 0.125, 10),\n", + " (5, u'I', 0.33, 0.255, 0.08, 7),\n", + " (6, u'I', 0.425, 0.3, 0.095, 8),\n", + " (7, u'F', 0.53, 0.415, 0.15, 20),\n", + " (8, u'F', 0.545, 0.425, 0.125, 16),\n", + " (9, u'M', 0.475, 0.37, 0.125, 9),\n", + " (10, None, 0.55, 0.44, 0.15, 19),\n", + " (11, u'F', 0.525, 0.38, 0.14, 14),\n", + " (12, u'M', 0.43, 0.35, 0.11, 10),\n", + " (13, u'M', 0.49, 0.38, 0.135, 11),\n", + " (14, u'F', 0.535, 0.405, 0.145, 10),\n", + " (15, u'F', 0.47, 0.355, 0.1, 10),\n", + " (16, u'M', 0.5, 0.4, 0.13, 12),\n", + " (17, u'I', 0.355, 0.28, 0.085, 7),\n", + " (18, u'F', 0.44, 0.34, 0.1, 10),\n", + " (19, u'M', 0.365, 0.295, 0.08, 7),\n", + " (20, None, 0.45, 0.32, 0.1, 9)]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql \n", + "DROP TABLE IF EXISTS abalone;\n", + "\n", + "CREATE TABLE abalone (\n", + " id serial,\n", + " sex character varying,\n", + " length double precision,\n", + " diameter double precision,\n", + " height double precision,\n", + " rings int\n", + ");\n", + "\n", + "INSERT INTO abalone (sex, length, diameter, height, rings) VALUES\n", + "('M', 0.455, 0.365, 0.095, 15),\n", + "('M', 0.35, 0.265, 0.09, 7),\n", + "('F', 0.53, 0.42, 0.135, 9),\n", + "('M', 0.44, 0.365, 0.125, 10),\n", + "('I', 0.33, 0.255, 0.08, 7),\n", + "('I', 0.425, 0.3, 0.095, 8),\n", + "('F', 0.53, 0.415, 0.15, 20),\n", + "('F', 0.545, 0.425, 0.125, 16),\n", + "('M', 0.475, 0.37, 0.125, 9),\n", + "(null, 0.55, 0.44, 0.15, 19),\n", + "('F', 0.525, 0.38, 0.14, 14),\n", + "('M', 0.43, 0.35, 0.11, 10),\n", + "('M', 0.49, 0.38, 0.135, 11),\n", + "('F', 0.535, 0.405, 0.145, 10),\n", + "('F', 0.47, 0.355, 0.1, 10),\n", + "('M', 0.5, 0.4, 0.13, 12),\n", + "('I', 0.355, 0.28, 0.085, 7),\n", + "('F', 0.44, 0.34, 0.1, 10),\n", + "('M', 0.365, 0.295, 0.08, 7),\n", + "(null, 0.45, 0.32, 0.1, 9);\n", + "\n", + "SELECT * FROM abalone ORDER BY id;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Create new table with one-hot encoding.\n", + "The column 'sex' is replaced by three columns encoding the values 'F', 'M' and 'I'. Null values are not encoded by default:" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "20 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>length</th>\n", + " <th>diameter</th>\n", + " <th>height</th>\n", + " <th>rings</th>\n", + " <th>sex_F</th>\n", + " <th>sex_I</th>\n", + " <th>sex_M</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>0.455</td>\n", + " <td>0.365</td>\n", + " <td>0.095</td>\n", + " <td>15</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>0.35</td>\n", + " <td>0.265</td>\n", + " <td>0.09</td>\n", + " <td>7</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>0.53</td>\n", + " <td>0.42</td>\n", + " <td>0.135</td>\n", + " <td>9</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>0.44</td>\n", + " <td>0.365</td>\n", + " <td>0.125</td>\n", + " <td>10</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>0.33</td>\n", + " <td>0.255</td>\n", + " <td>0.08</td>\n", + " <td>7</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>0.425</td>\n", + " <td>0.3</td>\n", + " <td>0.095</td>\n", + " <td>8</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>0.53</td>\n", + " <td>0.415</td>\n", + " <td>0.15</td>\n", + " <td>20</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>0.545</td>\n", + " <td>0.425</td>\n", + " <td>0.125</td>\n", + " <td>16</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>0.475</td>\n", + " <td>0.37</td>\n", + " <td>0.125</td>\n", + " <td>9</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>0.55</td>\n", + " <td>0.44</td>\n", + " <td>0.15</td>\n", + " <td>19</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>0.525</td>\n", + " <td>0.38</td>\n", + " <td>0.14</td>\n", + " <td>14</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>0.43</td>\n", + " <td>0.35</td>\n", + " <td>0.11</td>\n", + " <td>10</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>0.49</td>\n", + " <td>0.38</td>\n", + " <td>0.135</td>\n", + " <td>11</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>0.535</td>\n", + " <td>0.405</td>\n", + " <td>0.145</td>\n", + " <td>10</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>15</td>\n", + " <td>0.47</td>\n", + " <td>0.355</td>\n", + " <td>0.1</td>\n", + " <td>10</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>16</td>\n", + " <td>0.5</td>\n", + " <td>0.4</td>\n", + " <td>0.13</td>\n", + " <td>12</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>17</td>\n", + " <td>0.355</td>\n", + " <td>0.28</td>\n", + " <td>0.085</td>\n", + " <td>7</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>18</td>\n", + " <td>0.44</td>\n", + " <td>0.34</td>\n", + " <td>0.1</td>\n", + " <td>10</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>19</td>\n", + " <td>0.365</td>\n", + " <td>0.295</td>\n", + " <td>0.08</td>\n", + " <td>7</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>20</td>\n", + " <td>0.45</td>\n", + " <td>0.32</td>\n", + " <td>0.1</td>\n", + " <td>9</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, 0.455, 0.365, 0.095, 15, 0, 0, 1),\n", + " (2, 0.35, 0.265, 0.09, 7, 0, 0, 1),\n", + " (3, 0.53, 0.42, 0.135, 9, 1, 0, 0),\n", + " (4, 0.44, 0.365, 0.125, 10, 0, 0, 1),\n", + " (5, 0.33, 0.255, 0.08, 7, 0, 1, 0),\n", + " (6, 0.425, 0.3, 0.095, 8, 0, 1, 0),\n", + " (7, 0.53, 0.415, 0.15, 20, 1, 0, 0),\n", + " (8, 0.545, 0.425, 0.125, 16, 1, 0, 0),\n", + " (9, 0.475, 0.37, 0.125, 9, 0, 0, 1),\n", + " (10, 0.55, 0.44, 0.15, 19, 0, 0, 0),\n", + " (11, 0.525, 0.38, 0.14, 14, 1, 0, 0),\n", + " (12, 0.43, 0.35, 0.11, 10, 0, 0, 1),\n", + " (13, 0.49, 0.38, 0.135, 11, 0, 0, 1),\n", + " (14, 0.535, 0.405, 0.145, 10, 1, 0, 0),\n", + " (15, 0.47, 0.355, 0.1, 10, 1, 0, 0),\n", + " (16, 0.5, 0.4, 0.13, 12, 0, 0, 1),\n", + " (17, 0.355, 0.28, 0.085, 7, 0, 1, 0),\n", + " (18, 0.44, 0.34, 0.1, 10, 1, 0, 0),\n", + " (19, 0.365, 0.295, 0.08, 7, 0, 0, 1),\n", + " (20, 0.45, 0.32, 0.1, 9, 0, 0, 0)]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;\n", + "\n", + "SELECT madlib.encode_categorical_variables (\n", + " 'abalone', -- Source table\n", + " 'abalone_out', -- Output table\n", + " 'sex' -- Categorical columns\n", + " );\n", + "\n", + "SELECT * FROM abalone_out ORDER BY id;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Encode null values\n", + "Now include NULL values in encoding (note the additional column 'sex_null'):" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "20 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>length</th>\n", + " <th>diameter</th>\n", + " <th>height</th>\n", + " <th>rings</th>\n", + " <th>sex_F</th>\n", + " <th>sex_I</th>\n", + " <th>sex_M</th>\n", + " <th>sex_null</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>0.455</td>\n", + " <td>0.365</td>\n", + " <td>0.095</td>\n", + " <td>15</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>0.35</td>\n", + " <td>0.265</td>\n", + " <td>0.09</td>\n", + " <td>7</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>0.53</td>\n", + " <td>0.42</td>\n", + " <td>0.135</td>\n", + " <td>9</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>0.44</td>\n", + " <td>0.365</td>\n", + " <td>0.125</td>\n", + " <td>10</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>0.33</td>\n", + " <td>0.255</td>\n", + " <td>0.08</td>\n", + " <td>7</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>0.425</td>\n", + " <td>0.3</td>\n", + " <td>0.095</td>\n", + " <td>8</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>0.53</td>\n", + " <td>0.415</td>\n", + " <td>0.15</td>\n", + " <td>20</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>0.545</td>\n", + " <td>0.425</td>\n", + " <td>0.125</td>\n", + " <td>16</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>0.475</td>\n", + " <td>0.37</td>\n", + " <td>0.125</td>\n", + " <td>9</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>0.55</td>\n", + " <td>0.44</td>\n", + " <td>0.15</td>\n", + " <td>19</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>0.525</td>\n", + " <td>0.38</td>\n", + " <td>0.14</td>\n", + " <td>14</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>0.43</td>\n", + " <td>0.35</td>\n", + " <td>0.11</td>\n", + " <td>10</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>0.49</td>\n", + " <td>0.38</td>\n", + " <td>0.135</td>\n", + " <td>11</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>0.535</td>\n", + " <td>0.405</td>\n", + " <td>0.145</td>\n", + " <td>10</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>15</td>\n", + " <td>0.47</td>\n", + " <td>0.355</td>\n", + " <td>0.1</td>\n", + " <td>10</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>16</td>\n", + " <td>0.5</td>\n", + " <td>0.4</td>\n", + " <td>0.13</td>\n", + " <td>12</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>17</td>\n", + " <td>0.355</td>\n", + " <td>0.28</td>\n", + " <td>0.085</td>\n", + " <td>7</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>18</td>\n", + " <td>0.44</td>\n", + " <td>0.34</td>\n", + " <td>0.1</td>\n", + " <td>10</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>19</td>\n", + " <td>0.365</td>\n", + " <td>0.295</td>\n", + " <td>0.08</td>\n", + " <td>7</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>20</td>\n", + " <td>0.45</td>\n", + " <td>0.32</td>\n", + " <td>0.1</td>\n", + " <td>9</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, 0.455, 0.365, 0.095, 15, 0, 0, 1, 0),\n", + " (2, 0.35, 0.265, 0.09, 7, 0, 0, 1, 0),\n", + " (3, 0.53, 0.42, 0.135, 9, 1, 0, 0, 0),\n", + " (4, 0.44, 0.365, 0.125, 10, 0, 0, 1, 0),\n", + " (5, 0.33, 0.255, 0.08, 7, 0, 1, 0, 0),\n", + " (6, 0.425, 0.3, 0.095, 8, 0, 1, 0, 0),\n", + " (7, 0.53, 0.415, 0.15, 20, 1, 0, 0, 0),\n", + " (8, 0.545, 0.425, 0.125, 16, 1, 0, 0, 0),\n", + " (9, 0.475, 0.37, 0.125, 9, 0, 0, 1, 0),\n", + " (10, 0.55, 0.44, 0.15, 19, 0, 0, 0, 1),\n", + " (11, 0.525, 0.38, 0.14, 14, 1, 0, 0, 0),\n", + " (12, 0.43, 0.35, 0.11, 10, 0, 0, 1, 0),\n", + " (13, 0.49, 0.38, 0.135, 11, 0, 0, 1, 0),\n", + " (14, 0.535, 0.405, 0.145, 10, 1, 0, 0, 0),\n", + " (15, 0.47, 0.355, 0.1, 10, 1, 0, 0, 0),\n", + " (16, 0.5, 0.4, 0.13, 12, 0, 0, 1, 0),\n", + " (17, 0.355, 0.28, 0.085, 7, 0, 1, 0, 0),\n", + " (18, 0.44, 0.34, 0.1, 10, 1, 0, 0, 0),\n", + " (19, 0.365, 0.295, 0.08, 7, 0, 0, 1, 0),\n", + " (20, 0.45, 0.32, 0.1, 9, 0, 0, 0, 1)]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;\n", + "\n", + "SELECT madlib.encode_categorical_variables (\n", + " 'abalone', -- Source table\n", + " 'abalone_out', -- Output table\n", + " 'sex', -- Categorical columns\n", + " NULL, -- Categorical columns to exclude\n", + " NULL, -- Index columns\n", + " NULL, -- Top values\n", + " NULL, -- Value to drop for dummy encoding\n", + " TRUE -- Encode nulls\n", + " );\n", + "\n", + "SELECT * FROM abalone_out ORDER BY id;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Encode all categorical variables and specify an index\n", + "Encode all categorical variables in the source table. Also, specify the column 'id' as the index (primary key) - this changes the output table to only include the index and the encoded variables:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "20 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>sex_F</th>\n", + " <th>sex_I</th>\n", + " <th>sex_M</th>\n", + " <th>rings_10</th>\n", + " <th>rings_11</th>\n", + " <th>rings_12</th>\n", + " <th>rings_14</th>\n", + " <th>rings_15</th>\n", + " <th>rings_16</th>\n", + " <th>rings_19</th>\n", + " <th>rings_20</th>\n", + " <th>rings_7</th>\n", + " <th>rings_8</th>\n", + " <th>rings_9</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>15</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>16</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>17</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>18</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>19</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>20</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0),\n", + " (2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0),\n", + " (3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1),\n", + " (4, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),\n", + " (5, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0),\n", + " (6, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0),\n", + " (7, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0),\n", + " (8, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0),\n", + " (9, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1),\n", + " (10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0),\n", + " (11, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0),\n", + " (12, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),\n", + " (13, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0),\n", + " (14, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),\n", + " (15, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),\n", + " (16, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0),\n", + " (17, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0),\n", + " (18, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),\n", + " (19, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0),\n", + " (20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1)]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;\n", + "\n", + "SELECT madlib.encode_categorical_variables (\n", + " 'abalone', -- Source table\n", + " 'abalone_out', -- Output table\n", + " '*', -- Categorical columns\n", + " NULL, -- Categorical columns to exclude\n", + " 'id' -- Index columns\n", + " );\n", + "\n", + "SELECT * FROM abalone_out ORDER BY id;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Encode top values\n", + "Now let's encode only the top values and group others into a miscellaneous bucket column. Top values can be global across all columns or specified by column. As an example of the latter, here are the top 2 'sex' values and the top 50% of 'rings' values:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "20 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>sex_M</th>\n", + " <th>sex_F</th>\n", + " <th>sex__misc__</th>\n", + " <th>rings_10</th>\n", + " <th>rings_7</th>\n", + " <th>rings_9</th>\n", + " <th>rings__misc__</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>15</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>16</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>17</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>18</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>19</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>20</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, 1, 0, 0, 0, 0, 0, 1),\n", + " (2, 1, 0, 0, 0, 1, 0, 0),\n", + " (3, 0, 1, 0, 0, 0, 1, 0),\n", + " (4, 1, 0, 0, 1, 0, 0, 0),\n", + " (5, 0, 0, 1, 0, 1, 0, 0),\n", + " (6, 0, 0, 1, 0, 0, 0, 1),\n", + " (7, 0, 1, 0, 0, 0, 0, 1),\n", + " (8, 0, 1, 0, 0, 0, 0, 1),\n", + " (9, 1, 0, 0, 0, 0, 1, 0),\n", + " (10, 0, 0, 0, 0, 0, 0, 1),\n", + " (11, 0, 1, 0, 0, 0, 0, 1),\n", + " (12, 1, 0, 0, 1, 0, 0, 0),\n", + " (13, 1, 0, 0, 0, 0, 0, 1),\n", + " (14, 0, 1, 0, 1, 0, 0, 0),\n", + " (15, 0, 1, 0, 1, 0, 0, 0),\n", + " (16, 1, 0, 0, 0, 0, 0, 1),\n", + " (17, 0, 0, 1, 0, 1, 0, 0),\n", + " (18, 0, 1, 0, 1, 0, 0, 0),\n", + " (19, 1, 0, 0, 0, 1, 0, 0),\n", + " (20, 0, 0, 0, 0, 0, 1, 0)]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;\n", + "\n", + "SELECT madlib.encode_categorical_variables (\n", + " 'abalone', -- Source table\n", + " 'abalone_out', -- Output table\n", + " '*', -- Categorical columns\n", + " NULL, -- Categorical columns to exclude\n", + " 'id', -- Index columns\n", + " 'sex=2, rings=0.5' -- Top values\n", + " );\n", + "\n", + "SELECT * FROM abalone_out ORDER BY id;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Show raw variables and encoded variables together\n", + "If you want to see both the raw categorical variable and its encoded form in the output_table, then include the categorical variables 'sex' and 'rings' in the index parameter. (Remember that this will not work if you specify '*' for the parameter 'categorical_cols', because in this case 'row_id' columns will not be encoded at all.)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "20 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>sex</th>\n", + " <th>rings</th>\n", + " <th>sex_F</th>\n", + " <th>sex_I</th>\n", + " <th>sex_M</th>\n", + " <th>rings_10</th>\n", + " <th>rings_11</th>\n", + " <th>rings_12</th>\n", + " <th>rings_14</th>\n", + " <th>rings_15</th>\n", + " <th>rings_16</th>\n", + " <th>rings_19</th>\n", + " <th>rings_20</th>\n", + " <th>rings_7</th>\n", + " <th>rings_8</th>\n", + " <th>rings_9</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>M</td>\n", + " <td>15</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>M</td>\n", + " <td>7</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>F</td>\n", + " <td>9</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>M</td>\n", + " <td>10</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>I</td>\n", + " <td>7</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>I</td>\n", + " <td>8</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>F</td>\n", + " <td>20</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>F</td>\n", + " <td>16</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>M</td>\n", + " <td>9</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>None</td>\n", + " <td>19</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>F</td>\n", + " <td>14</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>M</td>\n", + " <td>10</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>M</td>\n", + " <td>11</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>F</td>\n", + " <td>10</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>15</td>\n", + " <td>F</td>\n", + " <td>10</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>16</td>\n", + " <td>M</td>\n", + " <td>12</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>17</td>\n", + " <td>I</td>\n", + " <td>7</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>18</td>\n", + " <td>F</td>\n", + " <td>10</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>19</td>\n", + " <td>M</td>\n", + " <td>7</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>20</td>\n", + " <td>None</td>\n", + " <td>9</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, u'M', 15, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0),\n", + " (2, u'M', 7, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0),\n", + " (3, u'F', 9, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1),\n", + " (4, u'M', 10, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),\n", + " (5, u'I', 7, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0),\n", + " (6, u'I', 8, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0),\n", + " (7, u'F', 20, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0),\n", + " (8, u'F', 16, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0),\n", + " (9, u'M', 9, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1),\n", + " (10, None, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0),\n", + " (11, u'F', 14, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0),\n", + " (12, u'M', 10, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),\n", + " (13, u'M', 11, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0),\n", + " (14, u'F', 10, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),\n", + " (15, u'F', 10, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),\n", + " (16, u'M', 12, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0),\n", + " (17, u'I', 7, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0),\n", + " (18, u'F', 10, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),\n", + " (19, u'M', 7, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0),\n", + " (20, None, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1)]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;\n", + "\n", + "SELECT madlib.encode_categorical_variables (\n", + " 'abalone', -- Source table\n", + " 'abalone_out', -- Output table\n", + " 'sex, rings', -- Categorical columns\n", + " NULL, -- Categorical columns to exclude\n", + " 'id, sex, rings' -- Index columns\n", + " );\n", + "\n", + "SELECT * FROM abalone_out ORDER BY id;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Dummy encoding\n", + "For dummy encoding, let's make the 'I' value from the 'sex' variable as the reference.\n", + "Here we use the 'value_to_drop' parameter:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "20 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>sex_F</th>\n", + " <th>sex_M</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>15</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>16</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>17</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>18</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>19</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>20</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, 0, 1),\n", + " (2, 0, 1),\n", + " (3, 1, 0),\n", + " (4, 0, 1),\n", + " (5, 0, 0),\n", + " (6, 0, 0),\n", + " (7, 1, 0),\n", + " (8, 1, 0),\n", + " (9, 0, 1),\n", + " (10, 0, 0),\n", + " (11, 1, 0),\n", + " (12, 0, 1),\n", + " (13, 0, 1),\n", + " (14, 1, 0),\n", + " (15, 1, 0),\n", + " (16, 0, 1),\n", + " (17, 0, 0),\n", + " (18, 1, 0),\n", + " (19, 0, 1),\n", + " (20, 0, 0)]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;\n", + "\n", + "SELECT madlib.encode_categorical_variables (\n", + " 'abalone', -- Source table\n", + " 'abalone_out', -- Output table\n", + " '*', -- Categorical columns\n", + " 'rings', -- Categorical columns to exclude\n", + " 'id', -- Index columns\n", + " NULL, -- Top value\n", + " 'sex=I' -- Value to drop for dummy encoding \n", + " );\n", + "\n", + "SELECT * FROM abalone_out ORDER BY id;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. Array output\n", + "Create an array output for the two categorical variables in the source table:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "20 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>__encoded_variables__</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>15</td>\n", + " <td>[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>16</td>\n", + " <td>[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>17</td>\n", + " <td>[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>18</td>\n", + " <td>[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>19</td>\n", + " <td>[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]</td>\n", + " </tr>\n", + " <tr>\n", + " <td>20</td>\n", + " <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]),\n", + " (2, [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]),\n", + " (3, [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]),\n", + " (4, [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),\n", + " (5, [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]),\n", + " (6, [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]),\n", + " (7, [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),\n", + " (8, [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]),\n", + " (9, [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]),\n", + " (10, [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]),\n", + " (11, [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]),\n", + " (12, [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),\n", + " (13, [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]),\n", + " (14, [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),\n", + " (15, [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),\n", + " (16, [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]),\n", + " (17, [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]),\n", + " (18, [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),\n", + " (19, [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]),\n", + " (20, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;\n", + "\n", + "SELECT madlib.encode_categorical_variables (\n", + " 'abalone', -- Source table\n", + " 'abalone_out', -- Output table\n", + " '*', -- Categorical columns\n", + " NULL, -- Categorical columns to exclude\n", + " 'id', -- Index columns\n", + " NULL, -- Top values\n", + " NULL, -- Value to drop for dummy encoding\n", + " NULL, -- Encode nulls\n", + " 'array' -- Output type\n", + " );\n", + "\n", + "SELECT * FROM abalone_out ORDER BY id;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And here is the dictionary table that specifies the index into the array:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "14 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>encoded_column_name</th>\n", + " <th>index</th>\n", + " <th>variable</th>\n", + " <th>value</th>\n", + " </tr>\n", + " <tr>\n", + " <td>__encoded_variables__</td>\n", + " <td>1</td>\n", + " <td>sex</td>\n", + " <td>F</td>\n", + " </tr>\n", + " <tr>\n", + " <td>__encoded_variables__</td>\n", + " <td>2</td>\n", + " <td>sex</td>\n", + " <td>I</td>\n", + " </tr>\n", + " <tr>\n", + " <td>__encoded_variables__</td>\n", + " <td>3</td>\n", + " <td>sex</td>\n", + " <td>M</td>\n", + " </tr>\n", + " <tr>\n", + " <td>__encoded_variables__</td>\n", + " <td>4</td>\n", + " <td>rings</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <td>__encoded_variables__</td>\n", + " <td>5</td>\n", + " <td>rings</td>\n", + " <td>11</td>\n", + " </tr>\n", + " <tr>\n", + " <td>__encoded_variables__</td>\n", + " <td>6</td>\n", + " <td>rings</td>\n", + " <td>12</td>\n", + " </tr>\n", + " <tr>\n", + " <td>__encoded_variables__</td>\n", + " <td>7</td>\n", + " <td>rings</td>\n", + " <td>14</td>\n", + " </tr>\n", + " <tr>\n", + " <td>__encoded_variables__</td>\n", + " <td>8</td>\n", + " <td>rings</td>\n", + " <td>15</td>\n", + " </tr>\n", + " <tr>\n", + " <td>__encoded_variables__</td>\n", + " <td>9</td>\n", + " <td>rings</td>\n", + " <td>16</td>\n", + " </tr>\n", + " <tr>\n", + " <td>__encoded_variables__</td>\n", + " <td>10</td>\n", + " <td>rings</td>\n", + " <td>19</td>\n", + " </tr>\n", + " <tr>\n", + " <td>__encoded_variables__</td>\n", + " <td>11</td>\n", + " <td>rings</td>\n", + " <td>20</td>\n", + " </tr>\n", + " <tr>\n", + " <td>__encoded_variables__</td>\n", + " <td>12</td>\n", + " <td>rings</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <td>__encoded_variables__</td>\n", + " <td>13</td>\n", + " <td>rings</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <td>__encoded_variables__</td>\n", + " <td>14</td>\n", + " <td>rings</td>\n", + " <td>9</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(u'__encoded_variables__', 1, u'sex', u'F'),\n", + " (u'__encoded_variables__', 2, u'sex', u'I'),\n", + " (u'__encoded_variables__', 3, u'sex', u'M'),\n", + " (u'__encoded_variables__', 4, u'rings', u'10'),\n", + " (u'__encoded_variables__', 5, u'rings', u'11'),\n", + " (u'__encoded_variables__', 6, u'rings', u'12'),\n", + " (u'__encoded_variables__', 7, u'rings', u'14'),\n", + " (u'__encoded_variables__', 8, u'rings', u'15'),\n", + " (u'__encoded_variables__', 9, u'rings', u'16'),\n", + " (u'__encoded_variables__', 10, u'rings', u'19'),\n", + " (u'__encoded_variables__', 11, u'rings', u'20'),\n", + " (u'__encoded_variables__', 12, u'rings', u'7'),\n", + " (u'__encoded_variables__', 13, u'rings', u'8'),\n", + " (u'__encoded_variables__', 14, u'rings', u'9')]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%sql SELECT * FROM abalone_out_dictionary;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 9. Dictionary output\n", + "Create a dictionary:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "20 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>sex_1</th>\n", + " <th>sex_2</th>\n", + " <th>sex_3</th>\n", + " <th>rings_1</th>\n", + " <th>rings_2</th>\n", + " <th>rings_3</th>\n", + " <th>rings_4</th>\n", + " <th>rings_5</th>\n", + " <th>rings_6</th>\n", + " <th>rings_7</th>\n", + " <th>rings_8</th>\n", + " <th>rings_9</th>\n", + " <th>rings_10</th>\n", + " <th>rings_11</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0<
<TRUNCATED>