http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/Minibatch-preprocessor-v1.ipynb
----------------------------------------------------------------------
diff --git a/community-artifacts/Minibatch-preprocessor-v1.ipynb
b/community-artifacts/Minibatch-preprocessor-v1.ipynb
new file mode 100644
index 0000000..fe03a27
--- /dev/null
+++ b/community-artifacts/Minibatch-preprocessor-v1.ipynb
@@ -0,0 +1,1330 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Mini-batch preprocessor\n",
+ "\n",
+ "The mini-batch preprocessor is a utility that prepares input data for use
by models that support mini-batch as an optimization option. (This is currently
only the case for Neural Networks.) It is effectively a packing operation that
builds arrays of dependent and independent variables from the source data
table.\n",
+ "\n",
+ "The mini-batch preprocessor was added in MADlib 1.14."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13:
ShimWarning: The `IPython.config` package has been deprecated. You should
import from traitlets.config instead.\n",
+ " \"You should import from traitlets.config instead.\", ShimWarning)\n",
+
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
UserWarning: IPython.utils.traitlets has moved to a top-level traitlets
package.\n",
+ " warn(\"IPython.utils.traitlets has moved to a top-level traitlets
package.\")\n"
+ ]
+ }
+ ],
+ "source": [
+ "%load_ext sql"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "u'Connected: gpadmin@madlib'"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Greenplum Database 5.4.0 on GCP (demo machine)\n",
+ "%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+ " \n",
+ "# PostgreSQL local\n",
+ "#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+ "\n",
+ "# Greenplum Database 4.3.10.0\n",
+ "#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>version</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>MADlib version: 1.14-dev, git revision:
rc/1.13-rc1-66-g4cced1b, cmake configuration time: Mon Apr 23 16:26:17 UTC
2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C
compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-66-g4cced1b,
cmake configuration time: Mon Apr 23 16:26:17 UTC 2018, build type: release,
build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++
compiler: g++ 4.4.7',)]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%sql select madlib.version();\n",
+ "#%sql select version();"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 1. Load data\n",
+ "Based on the well-known Iris dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "Done.\n",
+ "52 rows affected.\n",
+ "52 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>id</th>\n",
+ " <th>attributes</th>\n",
+ " <th>class_text</th>\n",
+ " <th>class</th>\n",
+ " <th>state</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>[Decimal('5.0'), Decimal('3.2'), Decimal('1.2'),
Decimal('0.2')]</td>\n",
+ " <td>Iris_setosa</td>\n",
+ " <td>1</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>[Decimal('5.5'), Decimal('3.5'), Decimal('1.3'),
Decimal('0.2')]</td>\n",
+ " <td>Iris_setosa</td>\n",
+ " <td>1</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>3</td>\n",
+ " <td>[Decimal('4.9'), Decimal('3.1'), Decimal('1.5'),
Decimal('0.1')]</td>\n",
+ " <td>Iris_setosa</td>\n",
+ " <td>1</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>4</td>\n",
+ " <td>[Decimal('4.4'), Decimal('3.0'), Decimal('1.3'),
Decimal('0.2')]</td>\n",
+ " <td>Iris_setosa</td>\n",
+ " <td>1</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>5</td>\n",
+ " <td>[Decimal('5.1'), Decimal('3.4'), Decimal('1.5'),
Decimal('0.2')]</td>\n",
+ " <td>Iris_setosa</td>\n",
+ " <td>1</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>6</td>\n",
+ " <td>[Decimal('5.0'), Decimal('3.5'), Decimal('1.3'),
Decimal('0.3')]</td>\n",
+ " <td>Iris_setosa</td>\n",
+ " <td>1</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>7</td>\n",
+ " <td>[Decimal('4.5'), Decimal('2.3'), Decimal('1.3'),
Decimal('0.3')]</td>\n",
+ " <td>Iris_setosa</td>\n",
+ " <td>1</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>8</td>\n",
+ " <td>[Decimal('4.4'), Decimal('3.2'), Decimal('1.3'),
Decimal('0.2')]</td>\n",
+ " <td>Iris_setosa</td>\n",
+ " <td>1</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>9</td>\n",
+ " <td>[Decimal('5.0'), Decimal('3.5'), Decimal('1.6'),
Decimal('0.6')]</td>\n",
+ " <td>Iris_setosa</td>\n",
+ " <td>1</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>10</td>\n",
+ " <td>[Decimal('5.1'), Decimal('3.8'), Decimal('1.9'),
Decimal('0.4')]</td>\n",
+ " <td>Iris_setosa</td>\n",
+ " <td>1</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>11</td>\n",
+ " <td>[Decimal('4.8'), Decimal('3.0'), Decimal('1.4'),
Decimal('0.3')]</td>\n",
+ " <td>Iris_setosa</td>\n",
+ " <td>1</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>12</td>\n",
+ " <td>[Decimal('5.1'), Decimal('3.8'), Decimal('1.6'),
Decimal('0.2')]</td>\n",
+ " <td>Iris_setosa</td>\n",
+ " <td>1</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>13</td>\n",
+ " <td>[Decimal('5.7'), Decimal('2.8'), Decimal('4.5'),
Decimal('1.3')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>14</td>\n",
+ " <td>[Decimal('6.3'), Decimal('3.3'), Decimal('4.7'),
Decimal('1.6')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>15</td>\n",
+ " <td>[Decimal('4.9'), Decimal('2.4'), Decimal('3.3'),
Decimal('1.0')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>16</td>\n",
+ " <td>[Decimal('6.6'), Decimal('2.9'), Decimal('4.6'),
Decimal('1.3')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>17</td>\n",
+ " <td>[Decimal('5.2'), Decimal('2.7'), Decimal('3.9'),
Decimal('1.4')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>18</td>\n",
+ " <td>[Decimal('5.0'), Decimal('2.0'), Decimal('3.5'),
Decimal('1.0')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>19</td>\n",
+ " <td>[Decimal('5.9'), Decimal('3.0'), Decimal('4.2'),
Decimal('1.5')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>20</td>\n",
+ " <td>[Decimal('6.0'), Decimal('2.2'), Decimal('4.0'),
Decimal('1.0')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>21</td>\n",
+ " <td>[Decimal('6.1'), Decimal('2.9'), Decimal('4.7'),
Decimal('1.4')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>22</td>\n",
+ " <td>[Decimal('5.6'), Decimal('2.9'), Decimal('3.6'),
Decimal('1.3')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>23</td>\n",
+ " <td>[Decimal('6.7'), Decimal('3.1'), Decimal('4.4'),
Decimal('1.4')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>24</td>\n",
+ " <td>[Decimal('5.6'), Decimal('3.0'), Decimal('4.5'),
Decimal('1.5')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>25</td>\n",
+ " <td>[Decimal('5.8'), Decimal('2.7'), Decimal('4.1'),
Decimal('1.0')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>26</td>\n",
+ " <td>[Decimal('6.2'), Decimal('2.2'), Decimal('4.5'),
Decimal('1.5')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>27</td>\n",
+ " <td>[Decimal('5.6'), Decimal('2.5'), Decimal('3.9'),
Decimal('1.1')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Alaska</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>28</td>\n",
+ " <td>[Decimal('5.0'), Decimal('3.4'), Decimal('1.5'),
Decimal('0.2')]</td>\n",
+ " <td>Iris_setosa</td>\n",
+ " <td>1</td>\n",
+ " <td>Tennessee</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>29</td>\n",
+ " <td>[Decimal('4.4'), Decimal('2.9'), Decimal('1.4'),
Decimal('0.2')]</td>\n",
+ " <td>Iris_setosa</td>\n",
+ " <td>1</td>\n",
+ " <td>Tennessee</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>30</td>\n",
+ " <td>[Decimal('4.9'), Decimal('3.1'), Decimal('1.5'),
Decimal('0.1')]</td>\n",
+ " <td>Iris_setosa</td>\n",
+ " <td>1</td>\n",
+ " <td>Tennessee</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>31</td>\n",
+ " <td>[Decimal('5.4'), Decimal('3.7'), Decimal('1.5'),
Decimal('0.2')]</td>\n",
+ " <td>Iris_setosa</td>\n",
+ " <td>1</td>\n",
+ " <td>Tennessee</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>32</td>\n",
+ " <td>[Decimal('4.8'), Decimal('3.4'), Decimal('1.6'),
Decimal('0.2')]</td>\n",
+ " <td>Iris_setosa</td>\n",
+ " <td>1</td>\n",
+ " <td>Tennessee</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>33</td>\n",
+ " <td>[Decimal('4.8'), Decimal('3.0'), Decimal('1.4'),
Decimal('0.1')]</td>\n",
+ " <td>Iris_setosa</td>\n",
+ " <td>1</td>\n",
+ " <td>Tennessee</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>34</td>\n",
+ " <td>[Decimal('4.3'), Decimal('3.0'), Decimal('1.1'),
Decimal('0.1')]</td>\n",
+ " <td>Iris_setosa</td>\n",
+ " <td>1</td>\n",
+ " <td>Tennessee</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>35</td>\n",
+ " <td>[Decimal('5.8'), Decimal('4.0'), Decimal('1.2'),
Decimal('0.2')]</td>\n",
+ " <td>Iris_setosa</td>\n",
+ " <td>1</td>\n",
+ " <td>Tennessee</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>36</td>\n",
+ " <td>[Decimal('5.7'), Decimal('4.4'), Decimal('1.5'),
Decimal('0.4')]</td>\n",
+ " <td>Iris_setosa</td>\n",
+ " <td>1</td>\n",
+ " <td>Tennessee</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>37</td>\n",
+ " <td>[Decimal('5.4'), Decimal('3.9'), Decimal('1.3'),
Decimal('0.4')]</td>\n",
+ " <td>Iris_setosa</td>\n",
+ " <td>1</td>\n",
+ " <td>Tennessee</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>38</td>\n",
+ " <td>[Decimal('6.0'), Decimal('2.9'), Decimal('4.5'),
Decimal('1.5')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Tennessee</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>39</td>\n",
+ " <td>[Decimal('5.7'), Decimal('2.6'), Decimal('3.5'),
Decimal('1.0')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Tennessee</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>40</td>\n",
+ " <td>[Decimal('5.5'), Decimal('2.4'), Decimal('3.8'),
Decimal('1.1')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Tennessee</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>41</td>\n",
+ " <td>[Decimal('5.5'), Decimal('2.4'), Decimal('3.7'),
Decimal('1.0')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Tennessee</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>42</td>\n",
+ " <td>[Decimal('5.8'), Decimal('2.7'), Decimal('3.9'),
Decimal('1.2')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Tennessee</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>43</td>\n",
+ " <td>[Decimal('6.0'), Decimal('2.7'), Decimal('5.1'),
Decimal('1.6')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Tennessee</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>44</td>\n",
+ " <td>[Decimal('5.4'), Decimal('3.0'), Decimal('4.5'),
Decimal('1.5')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Tennessee</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>45</td>\n",
+ " <td>[Decimal('6.0'), Decimal('3.4'), Decimal('4.5'),
Decimal('1.6')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Tennessee</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>46</td>\n",
+ " <td>[Decimal('6.7'), Decimal('3.1'), Decimal('4.7'),
Decimal('1.5')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Tennessee</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>47</td>\n",
+ " <td>[Decimal('6.3'), Decimal('2.3'), Decimal('4.4'),
Decimal('1.3')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Tennessee</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>48</td>\n",
+ " <td>[Decimal('5.6'), Decimal('3.0'), Decimal('4.1'),
Decimal('1.3')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Tennessee</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>49</td>\n",
+ " <td>[Decimal('5.5'), Decimal('2.5'), Decimal('4.0'),
Decimal('1.3')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Tennessee</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>50</td>\n",
+ " <td>[Decimal('5.5'), Decimal('2.6'), Decimal('4.4'),
Decimal('1.2')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Tennessee</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>51</td>\n",
+ " <td>[Decimal('6.1'), Decimal('3.0'), Decimal('4.6'),
Decimal('1.4')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Tennessee</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>52</td>\n",
+ " <td>[Decimal('5.8'), Decimal('2.6'), Decimal('4.0'),
Decimal('1.2')]</td>\n",
+ " <td>Iris_versicolor</td>\n",
+ " <td>2</td>\n",
+ " <td>Tennessee</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(1, [Decimal('5.0'), Decimal('3.2'), Decimal('1.2'), Decimal('0.2')],
u'Iris_setosa', 1, u'Alaska'),\n",
+ " (2, [Decimal('5.5'), Decimal('3.5'), Decimal('1.3'), Decimal('0.2')],
u'Iris_setosa', 1, u'Alaska'),\n",
+ " (3, [Decimal('4.9'), Decimal('3.1'), Decimal('1.5'), Decimal('0.1')],
u'Iris_setosa', 1, u'Alaska'),\n",
+ " (4, [Decimal('4.4'), Decimal('3.0'), Decimal('1.3'), Decimal('0.2')],
u'Iris_setosa', 1, u'Alaska'),\n",
+ " (5, [Decimal('5.1'), Decimal('3.4'), Decimal('1.5'), Decimal('0.2')],
u'Iris_setosa', 1, u'Alaska'),\n",
+ " (6, [Decimal('5.0'), Decimal('3.5'), Decimal('1.3'), Decimal('0.3')],
u'Iris_setosa', 1, u'Alaska'),\n",
+ " (7, [Decimal('4.5'), Decimal('2.3'), Decimal('1.3'), Decimal('0.3')],
u'Iris_setosa', 1, u'Alaska'),\n",
+ " (8, [Decimal('4.4'), Decimal('3.2'), Decimal('1.3'), Decimal('0.2')],
u'Iris_setosa', 1, u'Alaska'),\n",
+ " (9, [Decimal('5.0'), Decimal('3.5'), Decimal('1.6'), Decimal('0.6')],
u'Iris_setosa', 1, u'Alaska'),\n",
+ " (10, [Decimal('5.1'), Decimal('3.8'), Decimal('1.9'),
Decimal('0.4')], u'Iris_setosa', 1, u'Alaska'),\n",
+ " (11, [Decimal('4.8'), Decimal('3.0'), Decimal('1.4'),
Decimal('0.3')], u'Iris_setosa', 1, u'Alaska'),\n",
+ " (12, [Decimal('5.1'), Decimal('3.8'), Decimal('1.6'),
Decimal('0.2')], u'Iris_setosa', 1, u'Alaska'),\n",
+ " (13, [Decimal('5.7'), Decimal('2.8'), Decimal('4.5'),
Decimal('1.3')], u'Iris_versicolor', 2, u'Alaska'),\n",
+ " (14, [Decimal('6.3'), Decimal('3.3'), Decimal('4.7'),
Decimal('1.6')], u'Iris_versicolor', 2, u'Alaska'),\n",
+ " (15, [Decimal('4.9'), Decimal('2.4'), Decimal('3.3'),
Decimal('1.0')], u'Iris_versicolor', 2, u'Alaska'),\n",
+ " (16, [Decimal('6.6'), Decimal('2.9'), Decimal('4.6'),
Decimal('1.3')], u'Iris_versicolor', 2, u'Alaska'),\n",
+ " (17, [Decimal('5.2'), Decimal('2.7'), Decimal('3.9'),
Decimal('1.4')], u'Iris_versicolor', 2, u'Alaska'),\n",
+ " (18, [Decimal('5.0'), Decimal('2.0'), Decimal('3.5'),
Decimal('1.0')], u'Iris_versicolor', 2, u'Alaska'),\n",
+ " (19, [Decimal('5.9'), Decimal('3.0'), Decimal('4.2'),
Decimal('1.5')], u'Iris_versicolor', 2, u'Alaska'),\n",
+ " (20, [Decimal('6.0'), Decimal('2.2'), Decimal('4.0'),
Decimal('1.0')], u'Iris_versicolor', 2, u'Alaska'),\n",
+ " (21, [Decimal('6.1'), Decimal('2.9'), Decimal('4.7'),
Decimal('1.4')], u'Iris_versicolor', 2, u'Alaska'),\n",
+ " (22, [Decimal('5.6'), Decimal('2.9'), Decimal('3.6'),
Decimal('1.3')], u'Iris_versicolor', 2, u'Alaska'),\n",
+ " (23, [Decimal('6.7'), Decimal('3.1'), Decimal('4.4'),
Decimal('1.4')], u'Iris_versicolor', 2, u'Alaska'),\n",
+ " (24, [Decimal('5.6'), Decimal('3.0'), Decimal('4.5'),
Decimal('1.5')], u'Iris_versicolor', 2, u'Alaska'),\n",
+ " (25, [Decimal('5.8'), Decimal('2.7'), Decimal('4.1'),
Decimal('1.0')], u'Iris_versicolor', 2, u'Alaska'),\n",
+ " (26, [Decimal('6.2'), Decimal('2.2'), Decimal('4.5'),
Decimal('1.5')], u'Iris_versicolor', 2, u'Alaska'),\n",
+ " (27, [Decimal('5.6'), Decimal('2.5'), Decimal('3.9'),
Decimal('1.1')], u'Iris_versicolor', 2, u'Alaska'),\n",
+ " (28, [Decimal('5.0'), Decimal('3.4'), Decimal('1.5'),
Decimal('0.2')], u'Iris_setosa', 1, u'Tennessee'),\n",
+ " (29, [Decimal('4.4'), Decimal('2.9'), Decimal('1.4'),
Decimal('0.2')], u'Iris_setosa', 1, u'Tennessee'),\n",
+ " (30, [Decimal('4.9'), Decimal('3.1'), Decimal('1.5'),
Decimal('0.1')], u'Iris_setosa', 1, u'Tennessee'),\n",
+ " (31, [Decimal('5.4'), Decimal('3.7'), Decimal('1.5'),
Decimal('0.2')], u'Iris_setosa', 1, u'Tennessee'),\n",
+ " (32, [Decimal('4.8'), Decimal('3.4'), Decimal('1.6'),
Decimal('0.2')], u'Iris_setosa', 1, u'Tennessee'),\n",
+ " (33, [Decimal('4.8'), Decimal('3.0'), Decimal('1.4'),
Decimal('0.1')], u'Iris_setosa', 1, u'Tennessee'),\n",
+ " (34, [Decimal('4.3'), Decimal('3.0'), Decimal('1.1'),
Decimal('0.1')], u'Iris_setosa', 1, u'Tennessee'),\n",
+ " (35, [Decimal('5.8'), Decimal('4.0'), Decimal('1.2'),
Decimal('0.2')], u'Iris_setosa', 1, u'Tennessee'),\n",
+ " (36, [Decimal('5.7'), Decimal('4.4'), Decimal('1.5'),
Decimal('0.4')], u'Iris_setosa', 1, u'Tennessee'),\n",
+ " (37, [Decimal('5.4'), Decimal('3.9'), Decimal('1.3'),
Decimal('0.4')], u'Iris_setosa', 1, u'Tennessee'),\n",
+ " (38, [Decimal('6.0'), Decimal('2.9'), Decimal('4.5'),
Decimal('1.5')], u'Iris_versicolor', 2, u'Tennessee'),\n",
+ " (39, [Decimal('5.7'), Decimal('2.6'), Decimal('3.5'),
Decimal('1.0')], u'Iris_versicolor', 2, u'Tennessee'),\n",
+ " (40, [Decimal('5.5'), Decimal('2.4'), Decimal('3.8'),
Decimal('1.1')], u'Iris_versicolor', 2, u'Tennessee'),\n",
+ " (41, [Decimal('5.5'), Decimal('2.4'), Decimal('3.7'),
Decimal('1.0')], u'Iris_versicolor', 2, u'Tennessee'),\n",
+ " (42, [Decimal('5.8'), Decimal('2.7'), Decimal('3.9'),
Decimal('1.2')], u'Iris_versicolor', 2, u'Tennessee'),\n",
+ " (43, [Decimal('6.0'), Decimal('2.7'), Decimal('5.1'),
Decimal('1.6')], u'Iris_versicolor', 2, u'Tennessee'),\n",
+ " (44, [Decimal('5.4'), Decimal('3.0'), Decimal('4.5'),
Decimal('1.5')], u'Iris_versicolor', 2, u'Tennessee'),\n",
+ " (45, [Decimal('6.0'), Decimal('3.4'), Decimal('4.5'),
Decimal('1.6')], u'Iris_versicolor', 2, u'Tennessee'),\n",
+ " (46, [Decimal('6.7'), Decimal('3.1'), Decimal('4.7'),
Decimal('1.5')], u'Iris_versicolor', 2, u'Tennessee'),\n",
+ " (47, [Decimal('6.3'), Decimal('2.3'), Decimal('4.4'),
Decimal('1.3')], u'Iris_versicolor', 2, u'Tennessee'),\n",
+ " (48, [Decimal('5.6'), Decimal('3.0'), Decimal('4.1'),
Decimal('1.3')], u'Iris_versicolor', 2, u'Tennessee'),\n",
+ " (49, [Decimal('5.5'), Decimal('2.5'), Decimal('4.0'),
Decimal('1.3')], u'Iris_versicolor', 2, u'Tennessee'),\n",
+ " (50, [Decimal('5.5'), Decimal('2.6'), Decimal('4.4'),
Decimal('1.2')], u'Iris_versicolor', 2, u'Tennessee'),\n",
+ " (51, [Decimal('6.1'), Decimal('3.0'), Decimal('4.6'),
Decimal('1.4')], u'Iris_versicolor', 2, u'Tennessee'),\n",
+ " (52, [Decimal('5.8'), Decimal('2.6'), Decimal('4.0'),
Decimal('1.2')], u'Iris_versicolor', 2, u'Tennessee')]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS iris_data;\n",
+ "\n",
+ "CREATE TABLE iris_data(\n",
+ " id serial,\n",
+ " attributes numeric[],\n",
+ " class_text varchar,\n",
+ " class integer,\n",
+ " state varchar\n",
+ ");\n",
+ "\n",
+ "INSERT INTO iris_data(id, attributes, class_text, class, state) VALUES\n",
+ "(1,ARRAY[5.0,3.2,1.2,0.2],'Iris_setosa',1,'Alaska'),\n",
+ "(2,ARRAY[5.5,3.5,1.3,0.2],'Iris_setosa',1,'Alaska'),\n",
+ "(3,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Alaska'),\n",
+ "(4,ARRAY[4.4,3.0,1.3,0.2],'Iris_setosa',1,'Alaska'),\n",
+ "(5,ARRAY[5.1,3.4,1.5,0.2],'Iris_setosa',1,'Alaska'),\n",
+ "(6,ARRAY[5.0,3.5,1.3,0.3],'Iris_setosa',1,'Alaska'),\n",
+ "(7,ARRAY[4.5,2.3,1.3,0.3],'Iris_setosa',1,'Alaska'),\n",
+ "(8,ARRAY[4.4,3.2,1.3,0.2],'Iris_setosa',1,'Alaska'),\n",
+ "(9,ARRAY[5.0,3.5,1.6,0.6],'Iris_setosa',1,'Alaska'),\n",
+ "(10,ARRAY[5.1,3.8,1.9,0.4],'Iris_setosa',1,'Alaska'),\n",
+ "(11,ARRAY[4.8,3.0,1.4,0.3],'Iris_setosa',1,'Alaska'),\n",
+ "(12,ARRAY[5.1,3.8,1.6,0.2],'Iris_setosa',1,'Alaska'),\n",
+ "(13,ARRAY[5.7,2.8,4.5,1.3],'Iris_versicolor',2,'Alaska'),\n",
+ "(14,ARRAY[6.3,3.3,4.7,1.6],'Iris_versicolor',2,'Alaska'),\n",
+ "(15,ARRAY[4.9,2.4,3.3,1.0],'Iris_versicolor',2,'Alaska'),\n",
+ "(16,ARRAY[6.6,2.9,4.6,1.3],'Iris_versicolor',2,'Alaska'),\n",
+ "(17,ARRAY[5.2,2.7,3.9,1.4],'Iris_versicolor',2,'Alaska'),\n",
+ "(18,ARRAY[5.0,2.0,3.5,1.0],'Iris_versicolor',2,'Alaska'),\n",
+ "(19,ARRAY[5.9,3.0,4.2,1.5],'Iris_versicolor',2,'Alaska'),\n",
+ "(20,ARRAY[6.0,2.2,4.0,1.0],'Iris_versicolor',2,'Alaska'),\n",
+ "(21,ARRAY[6.1,2.9,4.7,1.4],'Iris_versicolor',2,'Alaska'),\n",
+ "(22,ARRAY[5.6,2.9,3.6,1.3],'Iris_versicolor',2,'Alaska'),\n",
+ "(23,ARRAY[6.7,3.1,4.4,1.4],'Iris_versicolor',2,'Alaska'),\n",
+ "(24,ARRAY[5.6,3.0,4.5,1.5],'Iris_versicolor',2,'Alaska'),\n",
+ "(25,ARRAY[5.8,2.7,4.1,1.0],'Iris_versicolor',2,'Alaska'),\n",
+ "(26,ARRAY[6.2,2.2,4.5,1.5],'Iris_versicolor',2,'Alaska'),\n",
+ "(27,ARRAY[5.6,2.5,3.9,1.1],'Iris_versicolor',2,'Alaska'),\n",
+ "(28,ARRAY[5.0,3.4,1.5,0.2],'Iris_setosa',1,'Tennessee'),\n",
+ "(29,ARRAY[4.4,2.9,1.4,0.2],'Iris_setosa',1,'Tennessee'),\n",
+ "(30,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Tennessee'),\n",
+ "(31,ARRAY[5.4,3.7,1.5,0.2],'Iris_setosa',1,'Tennessee'),\n",
+ "(32,ARRAY[4.8,3.4,1.6,0.2],'Iris_setosa',1,'Tennessee'),\n",
+ "(33,ARRAY[4.8,3.0,1.4,0.1],'Iris_setosa',1,'Tennessee'),\n",
+ "(34,ARRAY[4.3,3.0,1.1,0.1],'Iris_setosa',1,'Tennessee'),\n",
+ "(35,ARRAY[5.8,4.0,1.2,0.2],'Iris_setosa',1,'Tennessee'),\n",
+ "(36,ARRAY[5.7,4.4,1.5,0.4],'Iris_setosa',1,'Tennessee'),\n",
+ "(37,ARRAY[5.4,3.9,1.3,0.4],'Iris_setosa',1,'Tennessee'),\n",
+ "(38,ARRAY[6.0,2.9,4.5,1.5],'Iris_versicolor',2,'Tennessee'),\n",
+ "(39,ARRAY[5.7,2.6,3.5,1.0],'Iris_versicolor',2,'Tennessee'),\n",
+ "(40,ARRAY[5.5,2.4,3.8,1.1],'Iris_versicolor',2,'Tennessee'),\n",
+ "(41,ARRAY[5.5,2.4,3.7,1.0],'Iris_versicolor',2,'Tennessee'),\n",
+ "(42,ARRAY[5.8,2.7,3.9,1.2],'Iris_versicolor',2,'Tennessee'),\n",
+ "(43,ARRAY[6.0,2.7,5.1,1.6],'Iris_versicolor',2,'Tennessee'),\n",
+ "(44,ARRAY[5.4,3.0,4.5,1.5],'Iris_versicolor',2,'Tennessee'),\n",
+ "(45,ARRAY[6.0,3.4,4.5,1.6],'Iris_versicolor',2,'Tennessee'),\n",
+ "(46,ARRAY[6.7,3.1,4.7,1.5],'Iris_versicolor',2,'Tennessee'),\n",
+ "(47,ARRAY[6.3,2.3,4.4,1.3],'Iris_versicolor',2,'Tennessee'),\n",
+ "(48,ARRAY[5.6,3.0,4.1,1.3],'Iris_versicolor',2,'Tennessee'),\n",
+ "(49,ARRAY[5.5,2.5,4.0,1.3],'Iris_versicolor',2,'Tennessee'),\n",
+ "(50,ARRAY[5.5,2.6,4.4,1.2],'Iris_versicolor',2,'Tennessee'),\n",
+ "(51,ARRAY[6.1,3.0,4.6,1.4],'Iris_versicolor',2,'Tennessee'),\n",
+ "(52,ARRAY[5.8,2.6,4.0,1.2],'Iris_versicolor',2,'Tennessee');\n",
+ "\n",
+ "SELECT * FROM iris_data ORDER BY id;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 2. Run preprocessor \n",
+ "\n",
+ "Run the preprocessor to generate the packed output table:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "2 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>__id__</th>\n",
+ " <th>dependent_varname</th>\n",
+ " <th>independent_varname</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>0</td>\n",
+ " <td>[[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0,
0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0],
[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0,
1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0],
[0.0, 1.0], [0.0, 1.0]]</td>\n",
+ " <td>[[-1.10156217454914, 0.00385956572525086,
-1.14565239753098, -1.00286528298202], [-0.767560815504508, 0.806649237861967,
-1.07515071152907, -1.18456909732025], [-0.0995580974152422,
0.00385956572525086, 1.03989986852812, 1.17758048907675], [-0.433559456459875,
-0.598232688377286, 0.616889752516682, 0.995876674738521], [-0.934561495026824,
0.20455698375943, -1.07515071152907, -1.36627291165848], [1.23644733876329,
-1.60171977854818, 1.03989986852812, 1.17758048907675], [1.06944665924097,
-0.196837852308928, 1.18090324053193, 0.995876674738521], [0.0674425821070736,
-0.798930106411465, 0.969398182526215, 0.632469046062059], [0.568444620674023,
-0.598232688377286, 0.616889752516682, 0.632469046062059], [-0.600560135982193,
1.60943890999868, -0.793143967521448, -0.821161468643789], [-1.60256421311609,
-1.401022360514, -1.21615408353289, -1.00286528298202], [-0.600560135982193,
0.806649237861967, -1.07515071152907, -1.18456909732025], [-0.0995580974152422,
1.81013632803286, -1.21615408353289, -0.821161468643789], [0.401443941151707,
-0.397535270343108, 1.03989986852812, 0.81417286040029], [-0.767560815504508,
-2.00311461461654, 0.334883008509056, 0.269061417385597], [0.234443261629389,
-0.196837852308928, 0.405384694510963, 0.81417286040029], [1.06944665924097,
0.00385956572525086, 1.11040155453003, 0.995876674738521], [1.4034480182856,
-1.401022360514, 0.969398182526215, 0.81417286040029], [-1.93656557216072,
0.00385956572525086, -1.3571574555367, -1.36627291165848], [0.0674425821070736,
-1.20032494247982, 0.546388066514775, 0.450765231723828], [0.0674425821070736,
-1.20032494247982, 0.475886380512869, 0.269061417385597], [2.07145073637487,
0.20455698375943, 0.969398182526215, 0.995876674738521], [0.73544530019634,
0.00385956572525086, 0.828394810522402, 1.17758048907675], [1.4034480182856,
0.605951819827788, 1.18090324053193, 1.35928430341498], [0.902445979718656,
-0.196837852308928, 1.03989986852812, 1.17758048907675], [-0.934561495026824,
-1.20032494247982, 0.193879636505243, 0.269061417385597]]</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>[[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0,
0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0],
[1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0,
1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0],
[0.0, 1.0], [1.0, 0.0]]</td>\n",
+ " <td>[[-0.767560815504508, 1.00734665589615, -1.21615408353289,
-1.00286528298202], [-0.934561495026824, 0.20455698375943, -1.07515071152907,
-1.36627291165848], [0.568444620674023, -0.798930106411465, 0.687391438518589,
0.632469046062059], [0.568444620674023, -0.598232688377286, 0.757893124520495,
0.269061417385597], [0.401443941151707, 2.81362341820376, -1.07515071152907,
-0.821161468643789], [0.902445979718656, -1.60171977854818, 0.687391438518589,
0.269061417385597], [0.234443261629389, 0.00385956572525086, 0.757893124520495,
0.81417286040029], [-1.10156217454914, 0.806649237861967, -1.00464902552717,
-1.18456909732025], [-1.76956489263841, 0.00385956572525086, -1.21615408353289,
-1.18456909732025], [0.234443261629389, -0.999627524445644, 0.616889752516682,
0.450765231723828], [-0.767560815504508, 0.405254401793609, -1.28665576953479,
-1.18456909732025], [-0.600560135982193, 1.60943890999868, -1.00464902552717,
-1.18456909732025], [1.90445005685255, -0.196837852308928,
1.11040155453003, 0.81417286040029], [-0.767560815504508,
1.00734665589615, -1.00464902552717, -0.457753839967327], [0.234443261629389,
0.00385956572525086, 1.03989986852812, 1.17758048907675], [0.0674425821070736,
1.00734665589615, -1.21615408353289, -1.18456909732025], [-1.76956489263841,
0.405254401793609, -1.21615408353289, -1.18456909732025], [0.902445979718656,
-0.598232688377286, 1.46290998453956, 1.35928430341498], [0.401443941151707,
-0.798930106411465, 0.334883008509056, 0.269061417385597], [-1.10156217454914,
0.00385956572525086, -1.14565239753098, -1.36627291165848], [2.07145073637487,
0.20455698375943, 1.18090324053193, 1.17758048907675], [-1.76956489263841,
-0.196837852308928, -1.14565239753098, -1.18456909732025], [0.568444620674023,
2.01083374606704, -1.28665576953479, -1.18456909732025], [0.0674425821070736,
-0.999627524445644, 0.687391438518589, 0.81417286040029], [0.902445979718656,
0.806649237861967, 1.03989986852812, 1.35928430341498], [-0.0995580974152422,
1.4087414919645, -1.07515071152907, -1.18456909732025]]</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(0L, [[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0],
[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0,
0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0],
[1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0,
1.0], [0.0, 1.0]], [[-1.10156217454914, 0.00385956572525086, -1.14565239753098,
-1.00286528298202], [-0.767560815504508, 0.806649237861967, -1.07515071152907,
-1.18456909732025], [-0.0995580974152422, 0.00385956572525086,
1.03989986852812, 1.17758048907675], [-0.433559456459875, -0.598232688377286,
0.616889752516682, 0.995876674738521], [-0.934561495026824, 0.20455698375943,
-1.07515071152907, -1.36627291165848], [1.23644733876329, -1.60171977854818,
1.03989986852812, 1.17758048907675], [1.06944665924097, -0.196837852308928,
1.18090324053193, 0.995876674738521], [0.0674425821070736, -0.798930106411465,
0.969398182526215, 0.632469046062059], [0.568444620674023, -0.598232688377286,
0.616889752516682, 0.632469046062059], [-0.600560135982193, 1.60943890999868,
-0.793143967521448, -0.821161468643789], [-1.60256421311609, -1.401022360514,
-1.21615408353289, -1.00286528298202], [-0.600560135982193, 0.806649237861967,
-1.07515071152907, -1.18456909732025], [-0.0995580974152422, 1.81013632803286,
-1.21615408353289, -0.821161468643789], [0.401443941151707, -0.397535270343108,
1.03989986852812, 0.81417286040029], [-0.767560815504508, -2.00311461461654,
0.334883008509056, 0.269061417385597], [0.234443261629389, -0.196837852308928,
0.405384694510963, 0.81417286040029], [1.06944665924097, 0.00385956572525086,
1.11040155453003, 0.995876674738521], [1.4034480182856, -1.401022360514,
0.969398182526215, 0.81417286040029], [-1.93656557216072, 0.00385956572525086,
-1.3571574555367, -1.36627291165848], [0.0674425821070736, -1.20032494247982,
0.546388066514775, 0.450765231723828], [0.0674425821070736, -1.20032494247982,
0.475886380512869, 0.269061417385597], [2.07145073637487,
0.20455698375943, 0.969398182526215, 0.995876674738521], [0.73544530019634,
0.00385956572525086, 0.828394810522402, 1.17758048907675], [1.4034480182856,
0.605951819827788, 1.18090324053193, 1.35928430341498], [0.902445979718656,
-0.196837852308928, 1.03989986852812, 1.17758048907675], [-0.934561495026824,
-1.20032494247982, 0.193879636505243, 0.269061417385597]]),\n",
+ " (1L, [[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0],
[0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0,
0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0],
[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0,
1.0], [1.0, 0.0]], [[-0.767560815504508, 1.00734665589615, -1.21615408353289,
-1.00286528298202], [-0.934561495026824, 0.20455698375943, -1.07515071152907,
-1.36627291165848], [0.568444620674023, -0.798930106411465, 0.687391438518589,
0.632469046062059], [0.568444620674023, -0.598232688377286, 0.757893124520495,
0.269061417385597], [0.401443941151707, 2.81362341820376, -1.07515071152907,
-0.821161468643789], [0.902445979718656, -1.60171977854818, 0.687391438518589,
0.269061417385597], [0.234443261629389, 0.00385956572525086, 0.757893124520495,
0.81417286040029], [-1.10156217454914, 0.806649237861967, -1.00464902552717,
-1.18456909732025], [-1.76956489263841, 0.00385956572525086,
-1.21615408353289, -1.18456909732025], [0.234443261629389, -0.999627524445644,
0.616889752516682, 0.450765231723828], [-0.767560815504508, 0.405254401793609,
-1.28665576953479, -1.18456909732025], [-0.600560135982193, 1.60943890999868,
-1.00464902552717, -1.18456909732025], [1.90445005685255, -0.196837852308928,
1.11040155453003, 0.81417286040029], [-0.767560815504508, 1.00734665589615,
-1.00464902552717, -0.457753839967327], [0.234443261629389,
0.00385956572525086, 1.03989986852812, 1.17758048907675], [0.0674425821070736,
1.00734665589615, -1.21615408353289, -1.18456909732025], [-1.76956489263841,
0.405254401793609, -1.21615408353289, -1.18456909732025], [0.902445979718656,
-0.598232688377286, 1.46290998453956, 1.35928430341498], [0.401443941151707,
-0.798930106411465, 0.334883008509056, 0.269061417385597], [-1.10156217454914,
0.00385956572525086, -1.14565239753098, -1.36627291165848], [2.07145073637487,
0.20455698375943, 1.18090324053193, 1.17758048907675], [-1.76956489263841,
-0.196837852308928, -1.14565239753098, -1.18456909732025], [0.568444620674023,
2.01083374606704, -1.28665576953479, -1.18456909732025], [0.0674425821070736,
-0.999627524445644, 0.687391438518589, 0.81417286040029], [0.902445979718656,
0.806649237861967, 1.03989986852812, 1.35928430341498], [-0.0995580974152422,
1.4087414919645, -1.07515071152907, -1.18456909732025]])]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary,
iris_data_packed_standardization;\n",
+ "\n",
+ "SELECT madlib.minibatch_preprocessor('iris_data', -- Source
table\n",
+ " 'iris_data_packed', -- Output
table\n",
+ " 'class_text', -- Dependent
variable\n",
+ " 'attributes' -- Independent
variables\n",
+ " );\n",
+ "\n",
+ "SELECT * FROM iris_data_packed ORDER BY __id__;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For small datasets like in this example, buffer size is mainly determined
by the number of segments in the database. For a Greenplum database with 2
segments, there will be 2 rows with a buffer size of 26. For PostgreSQL, there
would be only one row with a buffer size of 52 since it is a single-node
database. For larger datasets, other factors go into computing buffer size
besides number of segments. \n",
+ "\n",
+ "Also, note in the packed output table above that the dependent variable has
been one-hot encoded since it is categorical.\n",
+ "\n",
+ "Review the output summary table:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>source_table</th>\n",
+ " <th>output_table</th>\n",
+ " <th>dependent_varname</th>\n",
+ " <th>independent_varname</th>\n",
+ " <th>dependent_vartype</th>\n",
+ " <th>buffer_size</th>\n",
+ " <th>class_values</th>\n",
+ " <th>num_rows_processed</th>\n",
+ " <th>num_missing_rows_skipped</th>\n",
+ " <th>grouping_cols</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>iris_data</td>\n",
+ " <td>iris_data_packed</td>\n",
+ " <td>class_text</td>\n",
+ " <td>attributes</td>\n",
+ " <td>character varying</td>\n",
+ " <td>26</td>\n",
+ " <td>[u'Iris_setosa', u'Iris_versicolor']</td>\n",
+ " <td>52</td>\n",
+ " <td>0</td>\n",
+ " <td>None</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(u'iris_data', u'iris_data_packed', u'class_text', u'attributes',
u'character varying', 26, [u'Iris_setosa', u'Iris_versicolor'], 52, 0, None)]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "SELECT * FROM iris_data_packed_summary;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Review the output standardization table:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>mean</th>\n",
+ " <th>std</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>[5.45961538462, 2.99807692308, 3.025,
0.851923076923]</td>\n",
+ " <td>[0.598799958695, 0.498262513686, 1.41840579525,
0.550346179381]</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[([5.45961538462, 2.99807692308, 3.025, 0.851923076923],
[0.598799958695, 0.498262513686, 1.41840579525, 0.550346179381])]"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "SELECT * FROM iris_data_packed_standardization;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 3. Change buffer size \n",
+ "\n",
+ "Generally the default buffer size will work well, but if you have
occasion to change it:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "6 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>__id__</th>\n",
+ " <th>dependent_varname</th>\n",
+ " <th>independent_varname</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>0</td>\n",
+ " <td>[[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0,
0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0]]</td>\n",
+ " <td>[[-0.0995580974152422, 0.00385956572525086,
1.03989986852812, 1.17758048907675], [-0.767560815504508, 1.00734665589615,
-1.00464902552717, -0.457753839967327], [-0.934561495026824, 0.20455698375943,
-1.07515071152907, -1.36627291165848], [0.568444620674023, -0.798930106411465,
0.687391438518589, 0.632469046062059], [-0.767560815504508, 0.405254401793609,
-1.28665576953479, -1.18456909732025], [-0.767560815504508, -2.00311461461654,
0.334883008509056, 0.269061417385597], [2.07145073637487, 0.20455698375943,
0.969398182526215, 0.995876674738521], [0.401443941151707, 2.81362341820376,
-1.07515071152907, -0.821161468643789], [-0.934561495026824, 0.20455698375943,
-1.07515071152907, -1.36627291165848], [0.0674425821070736, -1.20032494247982,
0.475886380512869, 0.269061417385597]]</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>[[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0,
1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]</td>\n",
+ " <td>[[0.0674425821070736, -0.999627524445644,
0.687391438518589, 0.81417286040029], [-0.0995580974152422, 1.4087414919645,
-1.07515071152907, -1.18456909732025], [-0.0995580974152422, 1.81013632803286,
-1.21615408353289, -0.821161468643789], [1.06944665924097, 0.00385956572525086,
1.11040155453003, 0.995876674738521], [0.0674425821070736, -0.798930106411465,
0.969398182526215, 0.632469046062059], [-1.10156217454914, 0.806649237861967,
-1.00464902552717, -1.18456909732025], [-1.10156217454914, 0.00385956572525086,
-1.14565239753098, -1.00286528298202], [-0.600560135982193, 1.60943890999868,
-1.00464902552717, -1.18456909732025], [0.902445979718656, -0.598232688377286,
1.46290998453956, 1.35928430341498], [0.401443941151707, -0.798930106411465,
0.334883008509056, 0.269061417385597]]</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>[[1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0,
1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0]]</td>\n",
+ " <td>[[-1.10156217454914, 0.00385956572525086,
-1.14565239753098, -1.36627291165848], [0.234443261629389, 0.00385956572525086,
1.03989986852812, 1.17758048907675], [0.902445979718656, -0.196837852308928,
1.03989986852812, 1.17758048907675], [2.07145073637487, 0.20455698375943,
1.18090324053193, 1.17758048907675], [1.90445005685255, -0.196837852308928,
1.11040155453003, 0.81417286040029], [-0.600560135982193, 0.806649237861967,
-1.07515071152907, -1.18456909732025], [-0.433559456459875, -0.598232688377286,
0.616889752516682, 0.995876674738521], [0.73544530019634, 0.00385956572525086,
0.828394810522402, 1.17758048907675], [1.06944665924097, -0.196837852308928,
1.18090324053193, 0.995876674738521], [-0.767560815504508, 1.00734665589615,
-1.21615408353289, -1.00286528298202]]</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>3</td>\n",
+ " <td>[[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0,
0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0]]</td>\n",
+ " <td>[[0.902445979718656, 0.806649237861967, 1.03989986852812,
1.35928430341498], [1.4034480182856, -1.401022360514, 0.969398182526215,
0.81417286040029], [0.568444620674023, -0.598232688377286, 0.616889752516682,
0.632469046062059], [1.4034480182856, 0.605951819827788, 1.18090324053193,
1.35928430341498], [-1.60256421311609, -1.401022360514, -1.21615408353289,
-1.00286528298202], [-1.76956489263841, -0.196837852308928, -1.14565239753098,
-1.18456909732025], [0.0674425821070736, -1.20032494247982, 0.546388066514775,
0.450765231723828], [0.0674425821070736, 1.00734665589615, -1.21615408353289,
-1.18456909732025], [-1.76956489263841, 0.405254401793609, -1.21615408353289,
-1.18456909732025], [0.234443261629389, -0.999627524445644, 0.616889752516682,
0.450765231723828]]</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>4</td>\n",
+ " <td>[[1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0,
1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0]]</td>\n",
+ " <td>[[0.568444620674023, 2.01083374606704, -1.28665576953479,
-1.18456909732025], [0.234443261629389, -0.196837852308928, 0.405384694510963,
0.81417286040029], [-0.934561495026824, -1.20032494247982, 0.193879636505243,
0.269061417385597], [0.568444620674023, -0.598232688377286, 0.757893124520495,
0.269061417385597], [0.902445979718656, -1.60171977854818, 0.687391438518589,
0.269061417385597], [1.23644733876329, -1.60171977854818, 1.03989986852812,
1.17758048907675], [-1.76956489263841, 0.00385956572525086, -1.21615408353289,
-1.18456909732025], [0.401443941151707, -0.397535270343108, 1.03989986852812,
0.81417286040029], [0.234443261629389, 0.00385956572525086, 0.757893124520495,
0.81417286040029], [-0.767560815504508, 0.806649237861967, -1.07515071152907,
-1.18456909732025]]</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>5</td>\n",
+ " <td>[[1.0, 0.0], [1.0, 0.0]]</td>\n",
+ " <td>[[-0.600560135982193, 1.60943890999868,
-0.793143967521448, -0.821161468643789], [-1.93656557216072,
0.00385956572525086, -1.3571574555367, -1.36627291165848]]</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(0L, [[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0],
[0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0]],
[[-0.0995580974152422, 0.00385956572525086, 1.03989986852812,
1.17758048907675], [-0.767560815504508, 1.00734665589615, -1.00464902552717,
-0.457753839967327], [-0.934561495026824, 0.20455698375943, -1.07515071152907,
-1.36627291165848], [0.568444620674023, -0.798930106411465, 0.687391438518589,
0.632469046062059], [-0.767560815504508, 0.405254401793609, -1.28665576953479,
-1.18456909732025], [-0.767560815504508, -2.00311461461654, 0.334883008509056,
0.269061417385597], [2.07145073637487, 0.20455698375943, 0.969398182526215,
0.995876674738521], [0.401443941151707, 2.81362341820376, -1.07515071152907,
-0.821161468643789], [-0.934561495026824, 0.20455698375943, -1.07515071152907,
-1.36627291165848], [0.0674425821070736, -1.20032494247982, 0.475886380512869,
0.269061417385597]]),\n",
+ " (1L, [[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0],
[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]],
[[0.0674425821070736, -0.999627524445644, 0.687391438518589, 0.81417286040029],
[-0.0995580974152422, 1.4087414919645, -1.07515071152907, -1.18456909732025],
[-0.0995580974152422, 1.81013632803286, -1.21615408353289, -0.821161468643789],
[1.06944665924097, 0.00385956572525086, 1.11040155453003, 0.995876674738521],
[0.0674425821070736, -0.798930106411465, 0.969398182526215, 0.632469046062059],
[-1.10156217454914, 0.806649237861967, -1.00464902552717, -1.18456909732025],
[-1.10156217454914, 0.00385956572525086, -1.14565239753098, -1.00286528298202],
[-0.600560135982193, 1.60943890999868, -1.00464902552717, -1.18456909732025],
[0.902445979718656, -0.598232688377286, 1.46290998453956, 1.35928430341498],
[0.401443941151707, -0.798930106411465, 0.334883008509056,
0.269061417385597]]),\n",
+ " (2L, [[1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0],
[1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0]],
[[-1.10156217454914, 0.00385956572525086, -1.14565239753098,
-1.36627291165848], [0.234443261629389, 0.00385956572525086, 1.03989986852812,
1.17758048907675], [0.902445979718656, -0.196837852308928, 1.03989986852812,
1.17758048907675], [2.07145073637487, 0.20455698375943, 1.18090324053193,
1.17758048907675], [1.90445005685255, -0.196837852308928, 1.11040155453003,
0.81417286040029], [-0.600560135982193, 0.806649237861967, -1.07515071152907,
-1.18456909732025], [-0.433559456459875, -0.598232688377286, 0.616889752516682,
0.995876674738521], [0.73544530019634, 0.00385956572525086, 0.828394810522402,
1.17758048907675], [1.06944665924097, -0.196837852308928, 1.18090324053193,
0.995876674738521], [-0.767560815504508, 1.00734665589615, -1.21615408353289,
-1.00286528298202]]),\n",
+ " (3L, [[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0],
[1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0]],
[[0.902445979718656, 0.806649237861967, 1.03989986852812, 1.35928430341498],
[1.4034480182856, -1.401022360514, 0.969398182526215, 0.81417286040029],
[0.568444620674023, -0.598232688377286, 0.616889752516682, 0.632469046062059],
[1.4034480182856, 0.605951819827788, 1.18090324053193, 1.35928430341498],
[-1.60256421311609, -1.401022360514, -1.21615408353289, -1.00286528298202],
[-1.76956489263841, -0.196837852308928, -1.14565239753098, -1.18456909732025],
[0.0674425821070736, -1.20032494247982, 0.546388066514775, 0.450765231723828],
[0.0674425821070736, 1.00734665589615, -1.21615408353289, -1.18456909732025],
[-1.76956489263841, 0.405254401793609, -1.21615408353289, -1.18456909732025],
[0.234443261629389, -0.999627524445644, 0.616889752516682,
0.450765231723828]]),\n",
+ " (4L, [[1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0],
[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0]],
[[0.568444620674023, 2.01083374606704, -1.28665576953479, -1.18456909732025],
[0.234443261629389, -0.196837852308928, 0.405384694510963, 0.81417286040029],
[-0.934561495026824, -1.20032494247982, 0.193879636505243, 0.269061417385597],
[0.568444620674023, -0.598232688377286, 0.757893124520495, 0.269061417385597],
[0.902445979718656, -1.60171977854818, 0.687391438518589, 0.269061417385597],
[1.23644733876329, -1.60171977854818, 1.03989986852812, 1.17758048907675],
[-1.76956489263841, 0.00385956572525086, -1.21615408353289, -1.18456909732025],
[0.401443941151707, -0.397535270343108, 1.03989986852812, 0.81417286040029],
[0.234443261629389, 0.00385956572525086, 0.757893124520495, 0.81417286040029],
[-0.767560815504508, 0.806649237861967, -1.07515071152907,
-1.18456909732025]]),\n",
+ " (5L, [[1.0, 0.0], [1.0, 0.0]], [[-0.600560135982193,
1.60943890999868, -0.793143967521448, -0.821161468643789], [-1.93656557216072,
0.00385956572525086, -1.3571574555367, -1.36627291165848]])]"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary,
iris_data_packed_standardization;\n",
+ "\n",
+ "SELECT madlib.minibatch_preprocessor('iris_data', -- Source
table\n",
+ " 'iris_data_packed', -- Output
table\n",
+ " 'class_text', -- Dependent
variable\n",
+ " 'attributes', -- Independent
variables\n",
+ " NULL, -- Grouping\n",
+ " 10 -- Buffer
size\n",
+ " );\n",
+ "\n",
+ "SELECT * FROM iris_data_packed ORDER BY __id__;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Review the output summary table:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>source_table</th>\n",
+ " <th>output_table</th>\n",
+ " <th>dependent_varname</th>\n",
+ " <th>independent_varname</th>\n",
+ " <th>dependent_vartype</th>\n",
+ " <th>buffer_size</th>\n",
+ " <th>class_values</th>\n",
+ " <th>num_rows_processed</th>\n",
+ " <th>num_missing_rows_skipped</th>\n",
+ " <th>grouping_cols</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>iris_data</td>\n",
+ " <td>iris_data_packed</td>\n",
+ " <td>class_text</td>\n",
+ " <td>attributes</td>\n",
+ " <td>character varying</td>\n",
+ " <td>10</td>\n",
+ " <td>[u'Iris_setosa', u'Iris_versicolor']</td>\n",
+ " <td>52</td>\n",
+ " <td>0</td>\n",
+ " <td>None</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(u'iris_data', u'iris_data_packed', u'class_text', u'attributes',
u'character varying', 10, [u'Iris_setosa', u'Iris_versicolor'], 52, 0, None)]"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "SELECT * FROM iris_data_packed_summary;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 4. Grouping\n",
+ "\n",
+ "Run the preprocessor with grouping by state:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "5 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>__id__</th>\n",
+ " <th>state</th>\n",
+ " <th>dependent_varname</th>\n",
+ " <th>independent_varname</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>0</td>\n",
+ " <td>Alaska</td>\n",
+ " <td>[[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0,
1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0],
[0.0, 1.0], [1.0, 0.0]]</td>\n",
+ " <td>[[1.26030711687938, -1.615325368523, 1.10943660794792,
1.24354000843452], [1.10129640587123, -0.126074175104234, 1.2524188915498,
1.05700900716934], [0.306242850830503, -0.977074857057813, 0.680489757142278,
0.497416003373807], [0.942285694863087, -1.615325368523, 0.751980898943218,
0.310885002108629], [0.783274983854942, 0.0866759953841608, 0.894963182545097,
1.24354000843452], [-0.806832126226518, 0.299426165872556, -1.03529764608027,
-1.36789400927797], [-0.488810704210227, 1.78867735929132, -0.963806504279335,
-1.18136300801279], [-1.60188568126725, 0.512176336360951, -1.17827992968215,
-1.18136300801279], [-0.965842837234665, 0.0866759953841608, -1.10678878788121,
-0.994832006747614], [-0.647821415218373, 1.15042684782613, -1.17827992968215,
-0.994832006747614], [-0.647821415218373, -2.04082570949979, 0.394525189938519,
0.310885002108629], [2.05536067192011, 0.299426165872556, 1.03794546614698,
1.05700900716934], [-0.647821415218373, 0.512176336360951,
-1.24977107148309, -1.18136300801279]]</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>Alaska</td>\n",
+ " <td>[[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0,
1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0],
[1.0, 0.0], [1.0, 0.0]]</td>\n",
+ " <td>[[1.41931782788752, 0.724926506849345, 1.2524188915498,
1.4300710096997], [-0.647821415218373, 1.15042684782613, -0.963806504279335,
-0.435239002952081], [0.624264272846795, -0.551574516081023, 0.823472040744157,
0.310885002108629], [-1.4428749702591, -1.4025751980346, -1.17827992968215,
-0.994832006747614], [0.306242850830503, -0.126074175104234, 0.466016331739459,
0.870478005904162], [1.89634996091196, -0.126074175104234, 1.18092774974886,
0.870478005904162], [-0.32979999320208, -0.551574516081023, 0.680489757142278,
1.05700900716934], [0.46525356183865, -0.338824345592629, 1.10943660794792,
0.870478005904162], [0.306242850830503, 0.0866759953841608, 1.10943660794792,
1.24354000843452], [-0.488810704210227, 0.93767667733774, -1.03529764608027,
-1.18136300801279], [-0.488810704210227, 1.78867735929132, -0.749333078876516,
-0.808301005482437], [0.147232139822357, 1.15042684782613, -1.17827992968215,
-1.18136300801279], [-1.60188568126725, 0.0866759953841608,
-1.17827992968215, -1.18136300801279]]</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>2</td>\n",
+ " <td>Alaska</td>\n",
+ " <td>[[0.0, 1.0]]</td>\n",
+ " <td>[[-0.806832126226518, -1.18982502754621, 0.25154290633664,
0.310885002108629]]</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>0</td>\n",
+ " <td>Tennessee</td>\n",
+ " <td>[[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0,
1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0],
[1.0, 0.0], [0.0, 1.0]]</td>\n",
+ " <td>[[-0.0286196553591748, -1.22176567731394,
0.412632633639227, 0.22669394242252], [-0.207492501354026, 1.25994585473,
-1.12079945083087, -1.19014319771823], [0.507998882625381, -0.839963903153331,
0.621737008794241, 0.580903227457708], [-0.922983885333435, 0.687243193489089,
-1.12079945083087, -1.19014319771823], [1.04461742060994, -0.0763603548321211,
1.03994575910427, 0.935112512492896], [2.11785449657905, 0.114540532248182,
1.10964721748927, 1.11221715501049], [0.507998882625381, -0.649063016073029,
0.552035550409236, 0.580903227457708], [-1.99622096130255, -0.267261241912424,
-1.19050090921588, -1.19014319771823], [1.40236311259964, -1.41266656439424,
0.90054284233426, 0.758007869975302], [0.32912603663053, 2.59625206429212,
-1.12079945083087, -0.835933912683043], [-0.207492501354026, 1.6417476288906,
-1.26020236760088, -0.835933912683043], [-2.1750938072974, -0.0763603548321211,
-1.39960528437089, -1.36724784023582], [-0.0286196553591748, -1.22176567731394,
0.482334092024232, 0.403798584940115]]</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>Tennessee</td>\n",
+ " <td>[[0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0,
1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0],
[0.0, 1.0]]</td>\n",
+ " <td>[[0.865744574615085, 0.687243193489089, 0.970244300719264,
1.28932179752808], [-0.0286196553591748, -0.839963903153331, 0.90054284233426,
0.580903227457708], [-1.28072957732314, 0.687243193489089, -1.05109799244587,
-1.19014319771823], [-1.10185673132829, 0.114540532248182, -1.12079945083087,
-1.36724784023582], [-0.0286196553591748, -1.03086479023363, 0.621737008794241,
0.758007869975302], [-0.207492501354026, -0.0763603548321211,
0.970244300719264, 1.11221715501049], [0.865744574615085, -0.649063016073029,
1.38845305102929, 1.28932179752808], [0.150253190635677, -0.0763603548321211,
0.691438467179245, 0.758007869975302], [0.32912603663053, -0.839963903153331,
0.273229716869218, 0.22669394242252], [-1.28072957732314, -0.0763603548321211,
-1.19050090921588, -1.36724784023582], [0.507998882625381, 1.8326485159709,
-1.32990382598589, -1.19014319771823], [0.865744574615085, -0.267261241912424,
0.970244300719264, 1.11221715501049]]</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(0L, u'Alaska', [[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0],
[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0,
1.0], [0.0, 1.0], [1.0, 0.0]], [[1.26030711687938, -1.615325368523,
1.10943660794792, 1.24354000843452], [1.10129640587123, -0.126074175104234,
1.2524188915498, 1.05700900716934], [0.306242850830503, -0.977074857057813,
0.680489757142278, 0.497416003373807], [0.942285694863087, -1.615325368523,
0.751980898943218, 0.310885002108629], [0.783274983854942, 0.0866759953841608,
0.894963182545097, 1.24354000843452], [-0.806832126226518, 0.299426165872556,
-1.03529764608027, -1.36789400927797], [-0.488810704210227, 1.78867735929132,
-0.963806504279335, -1.18136300801279], [-1.60188568126725, 0.512176336360951,
-1.17827992968215, -1.18136300801279], [-0.965842837234665, 0.0866759953841608,
-1.10678878788121, -0.994832006747614], [-0.647821415218373, 1.15042684782613,
-1.17827992968215, -0.994832006747614], [-0.647821415218373,
-2.04082570949979, 0.394525189938519, 0.310885002108629], [2.05536067192011,
0.299426165872556, 1.03794546614698, 1.05700900716934], [-0.647821415218373,
0.512176336360951, -1.24977107148309, -1.18136300801279]]),\n",
+ " (1L, u'Alaska', [[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0],
[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0,
0.0], [1.0, 0.0], [1.0, 0.0]], [[1.41931782788752, 0.724926506849345,
1.2524188915498, 1.4300710096997], [-0.647821415218373, 1.15042684782613,
-0.963806504279335, -0.435239002952081], [0.624264272846795,
-0.551574516081023, 0.823472040744157, 0.310885002108629], [-1.4428749702591,
-1.4025751980346, -1.17827992968215, -0.994832006747614], [0.306242850830503,
-0.126074175104234, 0.466016331739459, 0.870478005904162], [1.89634996091196,
-0.126074175104234, 1.18092774974886, 0.870478005904162], [-0.32979999320208,
-0.551574516081023, 0.680489757142278, 1.05700900716934], [0.46525356183865,
-0.338824345592629, 1.10943660794792, 0.870478005904162], [0.306242850830503,
0.0866759953841608, 1.10943660794792, 1.24354000843452], [-0.488810704210227,
0.93767667733774, -1.03529764608027, -1.18136300801279], [-0.488810704210227,
1.78867735929132,
-0.749333078876516, -0.808301005482437], [0.147232139822357,
1.15042684782613, -1.17827992968215, -1.18136300801279], [-1.60188568126725,
0.0866759953841608, -1.17827992968215, -1.18136300801279]]),\n",
+ " (2L, u'Alaska', [[0.0, 1.0]], [[-0.806832126226518,
-1.18982502754621, 0.25154290633664, 0.310885002108629]]),\n",
+ " (0L, u'Tennessee', [[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0],
[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0,
0.0], [1.0, 0.0], [0.0, 1.0]], [[-0.0286196553591748, -1.22176567731394,
0.412632633639227, 0.22669394242252], [-0.207492501354026, 1.25994585473,
-1.12079945083087, -1.19014319771823], [0.507998882625381, -0.839963903153331,
0.621737008794241, 0.580903227457708], [-0.922983885333435, 0.687243193489089,
-1.12079945083087, -1.19014319771823], [1.04461742060994, -0.0763603548321211,
1.03994575910427, 0.935112512492896], [2.11785449657905, 0.114540532248182,
1.10964721748927, 1.11221715501049], [0.507998882625381, -0.649063016073029,
0.552035550409236, 0.580903227457708], [-1.99622096130255, -0.267261241912424,
-1.19050090921588, -1.19014319771823], [1.40236311259964, -1.41266656439424,
0.90054284233426, 0.758007869975302], [0.32912603663053, 2.59625206429212,
-1.12079945083087, -0.835933912683043], [-0.207492501354026,
1.6417476288906, -1.26020236760088, -0.835933912683043], [-2.1750938072974,
-0.0763603548321211, -1.39960528437089, -1.36724784023582],
[-0.0286196553591748, -1.22176567731394, 0.482334092024232,
0.403798584940115]]),\n",
+ " (1L, u'Tennessee', [[0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0],
[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0,
0.0], [0.0, 1.0]], [[0.865744574615085, 0.687243193489089, 0.970244300719264,
1.28932179752808], [-0.0286196553591748, -0.839963903153331, 0.90054284233426,
0.580903227457708], [-1.28072957732314, 0.687243193489089, -1.05109799244587,
-1.19014319771823], [-1.10185673132829, 0.114540532248182, -1.12079945083087,
-1.36724784023582], [-0.0286196553591748, -1.03086479023363, 0.621737008794241,
0.758007869975302], [-0.207492501354026, -0.0763603548321211,
0.970244300719264, 1.11221715501049], [0.865744574615085, -0.649063016073029,
1.38845305102929, 1.28932179752808], [0.150253190635677, -0.0763603548321211,
0.691438467179245, 0.758007869975302], [0.32912603663053, -0.839963903153331,
0.273229716869218, 0.22669394242252], [-1.28072957732314, -0.0763603548321211,
-1.19050090921588, -1.36724784023582], [0.507998882625381, 1.8326485159709,
-1.32990382598589, -1.19014319771823], [0.865744574615085,
-0.267261241912424, 0.970244300719264, 1.11221715501049]])]"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary,
iris_data_packed_standardization;\n",
+ "\n",
+ "SELECT madlib.minibatch_preprocessor('iris_data', -- Source
table\n",
+ " 'iris_data_packed', -- Output
table\n",
+ " 'class_text', -- Dependent
variable\n",
+ " 'attributes', -- Independent
variables\n",
+ " 'state' -- Grouping\n",
+ " );\n",
+ "\n",
+ "SELECT * FROM iris_data_packed ORDER BY state, __id__;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Review the output summary table:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>source_table</th>\n",
+ " <th>output_table</th>\n",
+ " <th>dependent_varname</th>\n",
+ " <th>independent_varname</th>\n",
+ " <th>dependent_vartype</th>\n",
+ " <th>buffer_size</th>\n",
+ " <th>class_values</th>\n",
+ " <th>num_rows_processed</th>\n",
+ " <th>num_missing_rows_skipped</th>\n",
+ " <th>grouping_cols</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>iris_data</td>\n",
+ " <td>iris_data_packed</td>\n",
+ " <td>class_text</td>\n",
+ " <td>attributes</td>\n",
+ " <td>character varying</td>\n",
+ " <td>13</td>\n",
+ " <td>[u'Iris_setosa', u'Iris_versicolor']</td>\n",
+ " <td>52</td>\n",
+ " <td>0</td>\n",
+ " <td>state</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(u'iris_data', u'iris_data_packed', u'class_text', u'attributes',
u'character varying', 13, [u'Iris_setosa', u'Iris_versicolor'], 52, 0,
u'state')]"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "SELECT * FROM iris_data_packed_summary;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Review the output standardization table:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>state</th>\n",
+ " <th>mean</th>\n",
+ " <th>std</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>Alaska</td>\n",
+ " <td>[5.40740740740741, 2.95925925925926, 2.94814814814815,
0.833333333333333]</td>\n",
+ " <td>[0.628888452645665, 0.470034875978888, 1.39877469405147,
0.536103914747325]</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>Tennessee</td>\n",
+ " <td>[5.516, 3.04, 3.108, 0.872]</td>\n",
+ " <td>[0.55905634778617, 0.523832034148353, 1.43469021046357,
0.564637937088893]</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(u'Alaska', [5.40740740740741, 2.95925925925926, 2.94814814814815,
0.833333333333333], [0.628888452645665, 0.470034875978888, 1.39877469405147,
0.536103914747325]),\n",
+ " (u'Tennessee', [5.516, 3.04, 3.108, 0.872], [0.55905634778617,
0.523832034148353, 1.43469021046357, 0.564637937088893])]"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "SELECT * FROM iris_data_packed_standardization ORDER BY state;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 5. Integer dependent variable for classification\n",
+ "\n",
+ "If the dependent variable is a scalar integer, and you have not already
encoded it, you can ask the preprocessor to encode it for you:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done.\n",
+ "1 rows affected.\n",
+ "2 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>__id__</th>\n",
+ " <th>dependent_varname</th>\n",
+ " <th>independent_varname</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>0</td>\n",
+ " <td>[[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0,
1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0],
[1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0,
0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0],
[0.0, 1.0], [0.0, 1.0]]</td>\n",
+ " <td>[[0.902445979718656, -1.60171977854818, 0.687391438518589,
0.269061417385597], [0.401443941151707, -0.798930106411465, 0.334883008509056,
0.269061417385597], [0.568444620674023, -0.598232688377286, 0.757893124520495,
0.269061417385597], [0.0674425821070736, -1.20032494247982, 0.475886380512869,
0.269061417385597], [-0.934561495026824, -1.20032494247982, 0.193879636505243,
0.269061417385597], [-1.76956489263841, 0.405254401793609, -1.21615408353289,
-1.18456909732025], [0.568444620674023, -0.598232688377286, 0.616889752516682,
0.632469046062059], [-0.767560815504508, 1.00734665589615, -1.21615408353289,
-1.00286528298202], [-0.0995580974152422, 1.4087414919645, -1.07515071152907,
-1.18456909732025], [-0.767560815504508, 1.00734665589615, -1.00464902552717,
-0.457753839967327], [-0.600560135982193, 1.60943890999868, -1.00464902552717,
-1.18456909732025], [-0.934561495026824, 0.20455698375943, -1.07515071152907,
-1.36627291165848], [0.234443261629389, 0.003859565725
25086, 1.03989986852812, 1.17758048907675], [-0.0995580974152422,
1.81013632803286, -1.21615408353289, -0.821161468643789], [-1.76956489263841,
0.00385956572525086, -1.21615408353289, -1.18456909732025], [-1.60256421311609,
-1.401022360514, -1.21615408353289, -1.00286528298202], [-1.10156217454914,
0.806649237861967, -1.00464902552717, -1.18456909732025], [-0.767560815504508,
0.405254401793609, -1.28665576953479, -1.18456909732025], [-1.76956489263841,
-0.196837852308928, -1.14565239753098, -1.18456909732025], [-1.93656557216072,
0.00385956572525086, -1.3571574555367, -1.36627291165848], [1.06944665924097,
-0.196837852308928, 1.18090324053193, 0.995876674738521], [0.568444620674023,
2.01083374606704, -1.28665576953479, -1.18456909732025], [0.401443941151707,
2.81362341820376, -1.07515071152907, -0.821161468643789], [0.0674425821070736,
-0.999627524445644, 0.687391438518589, 0.81417286040029], [0.902445979718656,
-0.196837852308928, 1.03989986852812, 1.17758048907675], [0.56844462067
4023, -0.798930106411465, 0.687391438518589, 0.632469046062059]]</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>1</td>\n",
+ " <td>[[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0,
1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0],
[1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0,
1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0],
[1.0, 0.0], [1.0, 0.0]]</td>\n",
+ " <td>[[0.902445979718656, 0.806649237861967, 1.03989986852812,
1.35928430341498], [0.902445979718656, -0.598232688377286, 1.46290998453956,
1.35928430341498], [-0.0995580974152422, 0.00385956572525086, 1.03989986852812,
1.17758048907675], [0.234443261629389, 0.00385956572525086, 0.757893124520495,
0.81417286040029], [0.0674425821070736, -1.20032494247982, 0.546388066514775,
0.450765231723828], [1.23644733876329, -1.60171977854818, 1.03989986852812,
1.17758048907675], [-1.10156217454914, 0.00385956572525086, -1.14565239753098,
-1.00286528298202], [1.4034480182856, -1.401022360514, 0.969398182526215,
0.81417286040029], [1.4034480182856, 0.605951819827788, 1.18090324053193,
1.35928430341498], [-0.600560135982193, 0.806649237861967, -1.07515071152907,
-1.18456909732025], [0.401443941151707, -0.397535270343108, 1.03989986852812,
0.81417286040029], [-0.767560815504508, 0.806649237861967, -1.07515071152907,
-1.18456909732025], [1.06944665924097, 0.00385956572525086, 1.110401
55453003, 0.995876674738521], [0.234443261629389, -0.999627524445644,
0.616889752516682, 0.450765231723828], [0.0674425821070736, 1.00734665589615,
-1.21615408353289, -1.18456909732025], [2.07145073637487, 0.20455698375943,
0.969398182526215, 0.995876674738521], [0.73544530019634, 0.00385956572525086,
0.828394810522402, 1.17758048907675], [0.234443261629389, -0.196837852308928,
0.405384694510963, 0.81417286040029], [-0.767560815504508, -2.00311461461654,
0.334883008509056, 0.269061417385597], [1.90445005685255, -0.196837852308928,
1.11040155453003, 0.81417286040029], [-0.934561495026824, 0.20455698375943,
-1.07515071152907, -1.36627291165848], [2.07145073637487, 0.20455698375943,
1.18090324053193, 1.17758048907675], [0.0674425821070736, -0.798930106411465,
0.969398182526215, 0.632469046062059], [-0.433559456459875, -0.598232688377286,
0.616889752516682, 0.995876674738521], [-1.10156217454914, 0.00385956572525086,
-1.14565239753098, -1.36627291165848], [-0.600560135982193, 1.60943890
999868, -0.793143967521448, -0.821161468643789]]</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(0L, [[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0],
[1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0,
0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0],
[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0,
1.0], [0.0, 1.0]], [[0.902445979718656, -1.60171977854818, 0.687391438518589,
0.269061417385597], [0.401443941151707, -0.798930106411465, 0.334883008509056,
0.269061417385597], [0.568444620674023, -0.598232688377286, 0.757893124520495,
0.269061417385597], [0.0674425821070736, -1.20032494247982, 0.475886380512869,
0.269061417385597], [-0.934561495026824, -1.20032494247982, 0.193879636505243,
0.269061417385597], [-1.76956489263841, 0.405254401793609, -1.21615408353289,
-1.18456909732025], [0.568444620674023, -0.598232688377286, 0.616889752516682,
0.632469046062059], [-0.767560815504508, 1.00734665589615, -1.21615408353289,
-1.00286528298202], [-0.0995580974152422, 1.4087414919645,
-1.07515071152907, -1.18456909732025], [-0.767560815504508, 1.00734665589615,
-1.00464902552717, -0.457753839967327], [-0.600560135982193, 1.60943890999868,
-1.00464902552717, -1.18456909732025], [-0.934561495026824, 0.20455698375943,
-1.07515071152907, -1.36627291165848], [0.234443261629389, 0.00385956572525086,
1.03989986852812, 1.17758048907675], [-0.0995580974152422, 1.81013632803286,
-1.21615408353289, -0.821161468643789], [-1.76956489263841,
0.00385956572525086, -1.21615408353289, -1.18456909732025], [-1.60256421311609,
-1.401022360514, -1.21615408353289, -1.00286528298202], [-1.10156217454914,
0.806649237861967, -1.00464902552717, -1.18456909732025], [-0.767560815504508,
0.405254401793609, -1.28665576953479, -1.18456909732025], [-1.76956489263841,
-0.196837852308928, -1.14565239753098, -1.18456909732025], [-1.93656557216072,
0.00385956572525086, -1.3571574555367, -1.36627291165848], [1.06944665924097,
-0.196837852308928, 1.18090324053193, 0.995876674738521], [0.56844462067402
3, 2.01083374606704, -1.28665576953479, -1.18456909732025],
[0.401443941151707, 2.81362341820376, -1.07515071152907, -0.821161468643789],
[0.0674425821070736, -0.999627524445644, 0.687391438518589, 0.81417286040029],
[0.902445979718656, -0.196837852308928, 1.03989986852812, 1.17758048907675],
[0.568444620674023, -0.798930106411465, 0.687391438518589,
0.632469046062059]]),\n",
+ " (1L, [[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0],
[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0,
0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0],
[0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0,
0.0], [1.0, 0.0]], [[0.902445979718656, 0.806649237861967, 1.03989986852812,
1.35928430341498], [0.902445979718656, -0.598232688377286, 1.46290998453956,
1.35928430341498], [-0.0995580974152422, 0.00385956572525086, 1.03989986852812,
1.17758048907675], [0.234443261629389, 0.00385956572525086, 0.757893124520495,
0.81417286040029], [0.0674425821070736, -1.20032494247982, 0.546388066514775,
0.450765231723828], [1.23644733876329, -1.60171977854818, 1.03989986852812,
1.17758048907675], [-1.10156217454914, 0.00385956572525086, -1.14565239753098,
-1.00286528298202], [1.4034480182856, -1.401022360514, 0.969398182526215,
0.81417286040029], [1.4034480182856, 0.605951819827788, 1.1809032405
3193, 1.35928430341498], [-0.600560135982193, 0.806649237861967,
-1.07515071152907, -1.18456909732025], [0.401443941151707, -0.397535270343108,
1.03989986852812, 0.81417286040029], [-0.767560815504508, 0.806649237861967,
-1.07515071152907, -1.18456909732025], [1.06944665924097, 0.00385956572525086,
1.11040155453003, 0.995876674738521], [0.234443261629389, -0.999627524445644,
0.616889752516682, 0.450765231723828], [0.0674425821070736, 1.00734665589615,
-1.21615408353289, -1.18456909732025], [2.07145073637487, 0.20455698375943,
0.969398182526215, 0.995876674738521], [0.73544530019634, 0.00385956572525086,
0.828394810522402, 1.17758048907675], [0.234443261629389, -0.196837852308928,
0.405384694510963, 0.81417286040029], [-0.767560815504508, -2.00311461461654,
0.334883008509056, 0.269061417385597], [1.90445005685255, -0.196837852308928,
1.11040155453003, 0.81417286040029], [-0.934561495026824, 0.20455698375943,
-1.07515071152907, -1.36627291165848], [2.07145073637487, 0.20455698375943,
1.18090324053193, 1.17758048907675], [0.0674425821070736, -0.798930106411465,
0.969398182526215, 0.632469046062059], [-0.433559456459875, -0.598232688377286,
0.616889752516682, 0.995876674738521], [-1.10156217454914, 0.00385956572525086,
-1.14565239753098, -1.36627291165848], [-0.600560135982193, 1.60943890999868,
-0.793143967521448, -0.821161468643789]])]"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary,
iris_data_packed_standardization;\n",
+ "\n",
+ "SELECT madlib.minibatch_preprocessor('iris_data', -- Source
table\n",
+ " 'iris_data_packed', -- Output
table\n",
+ " 'class', -- Integer
dependent variable\n",
+ " 'attributes', -- Independent
variables\n",
+ " NULL, -- Grouping\n",
+ " NULL, -- Buffer
size\n",
+ " TRUE -- Encode
scalar int dependent variable\n",
+ " );\n",
+ "\n",
+ "SELECT * FROM iris_data_packed ORDER BY __id__;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Review the output summary table:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<table>\n",
+ " <tr>\n",
+ " <th>source_table</th>\n",
+ " <th>output_table</th>\n",
+ " <th>dependent_varname</th>\n",
+ " <th>independent_varname</th>\n",
+ " <th>dependent_vartype</th>\n",
+ " <th>buffer_size</th>\n",
+ " <th>class_values</th>\n",
+ " <th>num_rows_processed</th>\n",
+ " <th>num_missing_rows_skipped</th>\n",
+ " <th>grouping_cols</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <td>iris_data</td>\n",
+ " <td>iris_data_packed</td>\n",
+ " <td>class</td>\n",
+ " <td>attributes</td>\n",
+ " <td>integer</td>\n",
+ " <td>26</td>\n",
+ " <td>[1, 2]</td>\n",
+ " <td>52</td>\n",
+ " <td>0</td>\n",
+ " <td>None</td>\n",
+ " </tr>\n",
+ "</table>"
+ ],
+ "text/plain": [
+ "[(u'iris_data', u'iris_data_packed', u'class', u'attributes',
u'integer', 26, [1, 2], 52, 0, None)]"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "SELECT * FROM iris_data_packed_summary;"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 2",
+ "language": "python",
+ "name": "python2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}