[6/7] madlib-site git commit: add new workbooks for 1dot13

2017-12-08 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/95826612/community-artifacts/Elastic-net-v2.ipynb
--
diff --git a/community-artifacts/Elastic-net-v2.ipynb 
b/community-artifacts/Elastic-net-v2.ipynb
new file mode 100644
index 000..b6082f0
--- /dev/null
+++ b/community-artifacts/Elastic-net-v2.ipynb
@@ -0,0 +1,2078 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Elastic net (MADlib v1.10+)\n",
+"Demonstrates elastic net, including these updates:\n",
+"- in MADlib 1.10: grouping and cross validation which were introduced \n",
+"- in MADlib 1.13: report negative root mean squared error instead of the 
negative mean squared error"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
+  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
+  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: gpdbchina@madlib'"
+  ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum 4.3.10.0\n",
+"%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+"\n",
+"# Greenplum 4.2.3.0\n",
+"#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.13-dev, git revision: 
rel/v1.12-42-gedc93f5, cmake configuration time: Fri Dec  8 18:28:18 UTC 2017, 
build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C 
compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.13-dev, git revision: rel/v1.12-42-gedc93f5, 
cmake configuration time: Fri Dec  8 18:28:18 UTC 2017, build type: Release, 
build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, 
C++ compiler: g++ 4.4.0',)]"
+  ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();\n",
+"#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"## 1.  Create data set\n",
+"House prices and characteristics."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "27 rows affected.\n",
+  "27 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "id\n",
+   "tax\n&

[3/7] madlib-site git commit: add new workbooks for 1dot13

2017-12-08 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/95826612/community-artifacts/kNN-v2.ipynb
--
diff --git a/community-artifacts/kNN-v2.ipynb b/community-artifacts/kNN-v2.ipynb
new file mode 100644
index 000..5b74e48
--- /dev/null
+++ b/community-artifacts/kNN-v2.ipynb
@@ -0,0 +1,751 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# k-Nearest Neighbors\n",
+"Finds k nearest data points to a given data point and outputs majority 
vote value of output classes in case of classification, and average value of 
target values in case of regression. KNN was first added in MADlib 1.10 and the 
interface was updated in 1.13."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
+  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
+  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: gpdbchina@madlib'"
+  ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum 4.3.10.0\n",
+"%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+"\n",
+"# Greenplum 4.2.3.0\n",
+"#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.13-dev, git revision: 
rel/v1.12-41-g4aa0732, cmake configuration time: Tue Dec  5 20:44:49 UTC 2017, 
build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C 
compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.13-dev, git revision: rel/v1.12-41-g4aa0732, 
cmake configuration time: Tue Dec  5 20:44:49 UTC 2017, build type: Release, 
build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, 
C++ compiler: g++ 4.4.0',)]"
+  ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();\n",
+"#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# 1.  Load data for classification"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "9 rows affected.\n",
+  "9 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "id\n",
+   "data\n",
+   "label\n",
+   "\n",
+  

[4/7] madlib-site git commit: add new workbooks for 1dot13

2017-12-08 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/95826612/community-artifacts/Logistic-regression-v1.ipynb
--
diff --git a/community-artifacts/Logistic-regression-v1.ipynb 
b/community-artifacts/Logistic-regression-v1.ipynb
new file mode 100644
index 000..226049d
--- /dev/null
+++ b/community-artifacts/Logistic-regression-v1.ipynb
@@ -0,0 +1,892 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Logistic regression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
+  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
+  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
+ ]
+    }
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: fmcquillan@madlib'"
+  ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum 4.3.10.0\n",
+"# %sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"%sql postgresql://fmcquillan@localhost:5432/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.12, git revision: unknown, cmake 
configuration time: Wed Aug 23 23:07:18 UTC 2017, build type: Release, build 
system: Darwin-16.7.0, C compiler: Clang, C++ compiler: Clang\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.12, git revision: unknown, cmake configuration 
time: Wed Aug 23 23:07:18 UTC 2017, build type: Release, build system: 
Darwin-16.7.0, C compiler: Clang, C++ compiler: Clang',)]"
+  ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();\n",
+"#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# 1. Load data\n",
+"This data set is related to predicting a second heart attack given 
treatment and health factors."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "20 rows affected.\n",
+  "20 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "id\n",
+   "second_attack\n",
+   "treatment\n",
+   "trait_anxiety\n",
+   "\n",
+   "\n",
+   "1\n",
+   "1\n",
+   "1\n",
+   "70\n",
+   "\n",
+   "\n",
+   "2\n",
+   "1\n",
+   "1\n",
+   

madlib-site git commit: website update for 1.14 release

2018-05-02 Thread fmcquillan
Repository: madlib-site
Updated Branches:
  refs/heads/asf-site f732f863c -> 39604a00c


website update for 1.14 release


Project: http://git-wip-us.apache.org/repos/asf/madlib-site/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib-site/commit/39604a00
Tree: http://git-wip-us.apache.org/repos/asf/madlib-site/tree/39604a00
Diff: http://git-wip-us.apache.org/repos/asf/madlib-site/diff/39604a00

Branch: refs/heads/asf-site
Commit: 39604a00c6284e43d211480a5f9054e33fbb0dc1
Parents: f732f86
Author: Frank McQuillan 
Authored: Wed May 2 09:46:48 2018 -0700
Committer: Frank McQuillan 
Committed: Wed May 2 09:46:48 2018 -0700

--
 design.pdf | Bin 1929401 -> 1930975 bytes
 documentation.html |   9 +
 download.html  |  24 +++-
 index.html |  39 ++-
 4 files changed, 62 insertions(+), 10 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/madlib-site/blob/39604a00/design.pdf
--
diff --git a/design.pdf b/design.pdf
index 073fdb1..164ecc1 100644
Binary files a/design.pdf and b/design.pdf differ

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/39604a00/documentation.html
--
diff --git a/documentation.html b/documentation.html
index 41cc7f0..4670603 100644
--- a/documentation.html
+++ b/documentation.html
@@ -55,6 +55,7 @@ jQuery(document).ready(function() {
 The primary documentation reference material providing 
detailed information on the functions and algorithms within MADlib as well as 
background theory and references into the literature.
 
 Older Documentation
+MADlib v1.13
 MADlib v1.12
 MADlib v1.11
 MADlib v1.10
@@ -98,6 +99,14 @@ jQuery(document).ready(function() {
 
 
 
+
+
+https://github.com/apache/madlib-site/tree/asf-site/community-artifacts;>Jupyter
 Notebooks for Getting Started
+Includes many of the most commonly used algorithms by data 
scientists.
+
+
+
+
 
 
 Community 
Portal

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/39604a00/download.html
--
diff --git a/download.html b/download.html
index 997c93f..59ded0d 100644
--- a/download.html
+++ b/download.html
@@ -58,7 +58,7 @@
Current Release


-   v1.13
+   v1.14
Source Code and Convenience 
Binaries
 
MADlib source code 
and convenience binaries are available from the Apache distribution site.
@@ -66,10 +66,10 @@
Latest 
stable release:
 

-   http://apache.org/dyn/closer.cgi?filename=madlib/1.13/apache-madlib-1.13-src.tar.gz=download;>Source
 code tar.gz (https://www.apache.org/dist/madlib/1.13/apache-madlib-1.13-src.tar.gz.asc;>pgp,
 https://www.apache.org/dist/madlib/1.13/apache-madlib-1.13-src.tar.gz.md5;>md5,
 https://www.apache.org/dist/madlib/1.13/apache-madlib-1.13-src.tar.gz.sha512;>sha512)
 
-   http://apache.org/dyn/closer.cgi?filename=madlib/1.13/apache-madlib-1.13-bin-Linux.rpm=download;>Linux
   (https://www.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Linux.rpm.asc;>pgp,
  https://www.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Linux.rpm.md5;>md5,
  https://www.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Linux.rpm.sha512;>sha512)
 — CentOS / Red Hat 5 and higher (64 bit). GPDB 4.3.x, PostgreSQL 9.5 and 
9.6.
-   http://apache.org/dyn/closer.cgi?filename=madlib/1.13/apache-madlib-1.13-bin-Linux-GPDB5.rpm=download;>Linux
   (https://www.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Linux-GPDB5.rpm.asc;>pgp,
  https://www.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Linux-GPDB5.rpm.md5;>md5,
  https://www.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Linux.rpm.sha512;>sha512)
 — CentOS / Red Hat 6 and higher (64 bit). GPDB 5.3.x.
-   http://apache.org/dyn/closer.cgi?filename=madlib/1.13/apache-madlib-1.13-bin-Darwin.dmg=download;>Mac
 OS X   (https://www.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Darwin.dmg.asc;>pgp,
 https://www.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Darwin.dmg.md5;>md5,
 https://www.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Darwin.dmg.sha512;>sha512)
 — OS 10.6 and higher.  For PostgreSQL 9.5 and 9.6.
+   

madlib-site git commit: updated 1.13 links to archive.apache.org

2018-05-03 Thread fmcquillan
Repository: madlib-site
Updated Branches:
  refs/heads/asf-site e76da81ae -> 5fa1ac070


updated 1.13 links to archive.apache.org


Project: http://git-wip-us.apache.org/repos/asf/madlib-site/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib-site/commit/5fa1ac07
Tree: http://git-wip-us.apache.org/repos/asf/madlib-site/tree/5fa1ac07
Diff: http://git-wip-us.apache.org/repos/asf/madlib-site/diff/5fa1ac07

Branch: refs/heads/asf-site
Commit: 5fa1ac07007dce077c18ee36052a500faaad19fd
Parents: e76da81
Author: Frank McQuillan 
Authored: Thu May 3 15:39:39 2018 -0700
Committer: Frank McQuillan 
Committed: Thu May 3 15:39:39 2018 -0700

--
 download.html | 8 
 index.html| 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/madlib-site/blob/5fa1ac07/download.html
--
diff --git a/download.html b/download.html
index 0e03047..8728d10 100644
--- a/download.html
+++ b/download.html
@@ -104,10 +104,10 @@
Release 
artifacts:
 

-   http://apache.org/dyn/closer.cgi?filename=madlib/1.13/apache-madlib-1.13-src.tar.gz=download;>Source
 code tar.gz (https://www.apache.org/dist/madlib/1.13/apache-madlib-1.13-src.tar.gz.asc;>pgp,
 https://www.apache.org/dist/madlib/1.13/apache-madlib-1.13-src.tar.gz.md5;>md5,
 https://www.apache.org/dist/madlib/1.13/apache-madlib-1.13-src.tar.gz.sha512;>sha512)
 
-   http://apache.org/dyn/closer.cgi?filename=madlib/1.13/apache-madlib-1.13-bin-Linux.rpm=download;>Linux
   (https://www.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Linux.rpm.asc;>pgp,
  https://www.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Linux.rpm.md5;>md5,
  https://www.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Linux.rpm.sha512;>sha512)
 — CentOS / Red Hat 5 and higher (64 bit). GPDB 4.3.x, PostgreSQL 9.5 and 
9.6.
-   http://apache.org/dyn/closer.cgi?filename=madlib/1.13/apache-madlib-1.13-bin-Linux-GPDB5.rpm=download;>Linux
   (https://www.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Linux-GPDB5.rpm.asc;>pgp,
  https://www.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Linux-GPDB5.rpm.md5;>md5,
  https://www.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Linux.rpm.sha512;>sha512)
 — CentOS / Red Hat 6 and higher (64 bit). GPDB 5.3.x.
-   http://apache.org/dyn/closer.cgi?filename=madlib/1.13/apache-madlib-1.13-bin-Darwin.dmg=download;>Mac
 OS X   (https://www.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Darwin.dmg.asc;>pgp,
 https://www.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Darwin.dmg.md5;>md5,
 https://www.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Darwin.dmg.sha512;>sha512)
 — OS 10.6 and higher.  For PostgreSQL 9.5 and 9.6.
+   https://archive.apache.org/dist/madlib/1.13/apache-madlib-1.13-src.tar.gz;>Source
 code tar.gz (https://archive.apache.org/dist/madlib/1.13/apache-madlib-1.13-src.tar.gz.asc;>pgp,
 https://archive.apache.org/dist/madlib/1.13/apache-madlib-1.13-src.tar.gz.md5;>md5,
 https://archive.apache.org/dist/madlib/1.13/apache-madlib-1.13-src.tar.gz.sha512;>sha512)
 
+   https://archive.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Linux.rpm;>Linux
   (https://archive.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Linux.rpm.asc;>pgp,
  https://archive.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Linux.rpm.md5;>md5,
  https://archive.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Linux.rpm.sha512;>sha512)
 — CentOS / Red Hat 5 and higher (64 bit). GPDB 4.3.x, PostgreSQL 9.5 and 
9.6.
+   https://archive.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Linux-GPDB5.rpm;>Linux
   (https://archive.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Linux-GPDB5.rpm.asc;>pgp,
  https://archive.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Linux-GPDB5.rpm.md5;>md5,
  https://archive.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Linux.rpm.sha512;>sha512)
 — CentOS / Red Hat 6 and higher (64 bit). GPDB 5.3.x.
+   https://archive.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Darwin.dmg;>Mac
 OS X   (https://archive.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Darwin.dmg.asc;>pgp,
 https://archive.apache.org/dist/madlib/1.13/apache-madlib-1.13-bin-Darwin.dmg.md5;>md5,
 

madlib git commit: minor edit to minibatch preproc user doc

2018-08-02 Thread fmcquillan
Repository: madlib
Updated Branches:
  refs/heads/master 186390f7c -> 298fed799


minor edit to minibatch preproc user doc


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/298fed79
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/298fed79
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/298fed79

Branch: refs/heads/master
Commit: 298fed799f3e8c728195882bea01479b644ee248
Parents: 186390f
Author: Frank McQuillan 
Authored: Thu Aug 2 10:34:17 2018 -0700
Committer: Frank McQuillan 
Committed: Thu Aug 2 10:34:17 2018 -0700

--
 .../utilities/minibatch_preprocessing.sql_in| 16 
 1 file changed, 16 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/madlib/blob/298fed79/src/ports/postgres/modules/utilities/minibatch_preprocessing.sql_in
--
diff --git 
a/src/ports/postgres/modules/utilities/minibatch_preprocessing.sql_in 
b/src/ports/postgres/modules/utilities/minibatch_preprocessing.sql_in
index 75adcc9..ead43d9 100644
--- a/src/ports/postgres/modules/utilities/minibatch_preprocessing.sql_in
+++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing.sql_in
@@ -144,22 +144,6 @@ already encoded the dependent variable yourself,  you can 
ignore this parameter.
 Also, if you want to encode float values for some reason, cast them to text
 first.
   
-
-  one_hot_encode_int_dep_var (optional)
-   BOOLEAN. default: FALSE.
-  A flag to decide whether to one-hot encode dependent variables that are
-scalar integers. This parameter is ignored if the dependent variable is not a
-scalar integer.
-
-@note The mini-batch preprocessor automatically encodes
-dependent variables that are boolean and character types such as text, char and
-varchar.  However, scalar integers are a special case because they can be used
-in both classification and regression problems, so you must tell the mini-batch
-preprocessor whether you want to encode them or not. In the case that you have
-already encoded the dependent variable yourself,  you can ignore this 
parameter.
-Also, if you want to encode float values for some reason, cast them to text
-first.
-  
 
 
 Output tables



[02/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/mlp-v3.ipynb
--
diff --git a/community-artifacts/mlp-v3.ipynb b/community-artifacts/mlp-v3.ipynb
deleted file mode 100644
index 8c585a6..000
--- a/community-artifacts/mlp-v3.ipynb
+++ /dev/null
@@ -1,4584 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# Multilayer Perceptron\n",
-"\n",
-"Multilayer Perceptron (MLP) is a type of neural network that can be used 
for regression and classification.\n",
-"\n",
-"This version of the workbook includes mini-batching which was added in 
the 1.14 release."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {
-"scrolled": true
-   },
-   "outputs": [
-{
- "name": "stderr",
- "output_type": "stream",
- "text": [
-  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
-  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
-  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
-  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
- ]
-}
-   ],
-   "source": [
-"%load_ext sql"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-{
- "data": {
-  "text/plain": [
-   "u'Connected: gpadmin@madlib'"
-  ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
-"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
-"\n",
-"# PostgreSQL local\n",
-"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
-"\n",
-"# Greenplum Database 4.3.10.0\n",
-"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "1 rows affected.\n"
- ]
-},
-{
- "data": {
-  "text/html": [
-   "\n",
-   "\n",
-   "version\n",
-   "\n",
-   "\n",
-   "MADlib version: 1.14-dev, git revision: 
rc/1.13-rc1-66-g4cced1b, cmake configuration time: Mon Apr 23 16:26:17 UTC 
2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C 
compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n",
-   "\n",
-   ""
-  ],
-  "text/plain": [
-   "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-66-g4cced1b, 
cmake configuration time: Mon Apr 23 16:26:17 UTC 2018, build type: release, 
build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ 
compiler: g++ 4.4.7',)]"
-  ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"%sql select madlib.version();\n",
-"#%sql select version();"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# Classification without Mini-Batching\n",
-"\n",
-"# 1.  Create input table for classification"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "Done.\n",
-  "Done.\n",
-  "52 rows affected.\n",
-  "52 rows affected.\n"
- ]
-},
-{
- "data": {
-  "text/html": [
-   "\n",
-   &

[16/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/Decision-trees-v1.ipynb
--
diff --git a/community-artifacts/Decision-trees-v1.ipynb 
b/community-artifacts/Decision-trees-v1.ipynb
deleted file mode 100644
index 02a60ef..000
--- a/community-artifacts/Decision-trees-v1.ipynb
+++ /dev/null
@@ -1,3051 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# Decision trees\n",
-"\n",
-"A decision tree is a supervised learning method that can be used for 
classification and regression. It consists of a structure in which internal 
nodes represent tests on attributes, and the branches from nodes represent the 
result of those tests. Each leaf node is a class label and the paths from root 
to leaf nodes define the set of classification or regression rules."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-{
- "name": "stderr",
- "output_type": "stream",
- "text": [
-  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
-  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
-  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
-  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
- ]
-}
-   ],
-   "source": [
-"%load_ext sql"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-{
- "data": {
-  "text/plain": [
-   "u'Connected: gpadmin@madlib'"
-  ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
-"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
-"\n",
-"# PostgreSQL local\n",
-"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
-"\n",
-"# Greenplum Database 4.3.10.0\n",
-"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "1 rows affected.\n"
- ]
-},
-{
- "data": {
-  "text/html": [
-   "\n",
-   "\n",
-   "version\n",
-   "\n",
-   "\n",
-   "MADlib version: 1.14, git revision: 
rc/1.13-rc1-68-g1c81cb1, cmake configuration time: Tue Apr 24 15:54:15 UTC 
2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C 
compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n",
-   "\n",
-   ""
-  ],
-  "text/plain": [
-   "[(u'MADlib version: 1.14, git revision: rc/1.13-rc1-68-g1c81cb1, cmake 
configuration time: Tue Apr 24 15:54:15 UTC 2018, build type: release, build 
system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: 
g++ 4.4.7',)]"
-  ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"%sql select madlib.version();\n",
-"#%sql select version();"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# Decision tree classification examples"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# 1. Load data\n",
-"Data set related to whether to play golf or not."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- 

[15/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/Decision-trees-v2.ipynb
--
diff --git a/community-artifacts/Decision-trees-v2.ipynb 
b/community-artifacts/Decision-trees-v2.ipynb
new file mode 100644
index 000..5b55b03
--- /dev/null
+++ b/community-artifacts/Decision-trees-v2.ipynb
@@ -0,0 +1,3208 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Decision trees\n",
+"\n",
+"A decision tree is a supervised learning method that can be used for 
classification and regression. It consists of a structure in which internal 
nodes represent tests on attributes, and the branches from nodes represent the 
result of those tests. Each leaf node is a class label and the paths from root 
to leaf nodes define the set of classification or regression rules.\n",
+"\n",
+"This notebook includes impurity importance which was added in 1.15."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
+  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
+  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: gpadmin@madlib'"
+  ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
+"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"#%sql postgresql://fmcquillan@localhost:5432/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.15-dev, git revision: 
rc/1.14-rc1-45-g3ab7554, cmake configuration time: Wed Aug  1 18:34:10 UTC 
2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C 
compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.15-dev, git revision: rc/1.14-rc1-45-g3ab7554, 
cmake configuration time: Wed Aug  1 18:34:10 UTC 2018, build type: release, 
build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ 
compiler: g++ 4.4.7',)]"
+  ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();\n",
+"#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Decision tree classification examples"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# 1. Load data\n",
+"Data set related to whether to play golf or not."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Don

[01/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
Repository: madlib-site
Updated Branches:
  refs/heads/asf-site 5fa1ac070 -> acd339f65


http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/stratified-sampling-v1.ipynb
--
diff --git a/community-artifacts/stratified-sampling-v1.ipynb 
b/community-artifacts/stratified-sampling-v1.ipynb
deleted file mode 100644
index 75e02fd..000
--- a/community-artifacts/stratified-sampling-v1.ipynb
+++ /dev/null
@@ -1,672 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# Stratified sampling\n",
-"Stratified sampling is a method for sampling subpopulations (strata) 
independently. It is commonly used to reduce sampling error by ensuring that 
subgroups are adequately represented in the sample.\n",
-"\n",
-"Stratified sampling was added in MADlib 1.12."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {
-"scrolled": true
-   },
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "The sql extension is already loaded. To reload it, use:\n",
-  "  %reload_ext sql\n"
- ]
-}
-   ],
-   "source": [
-"%load_ext sql"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [
-{
- "data": {
-  "text/plain": [
-   "u'Connected: gpdbchina@madlib'"
-  ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"# Greenplum 4.3.10.0\n",
-"%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n",
-"\n",
-"# PostgreSQL local\n",
-"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
-"\n",
-"# Greenplum 4.2.3.0\n",
-"#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "1 rows affected.\n"
- ]
-},
-{
- "data": {
-  "text/html": [
-   "\n",
-   "\n",
-   "version\n",
-   "\n",
-   "\n",
-   "MADlib version: 1.12-dev, git revision: 
rel/v1.11-23-gfdf7b6d, cmake configuration time: Wed Jun 28 18:06:35 UTC 2017, 
build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C 
compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0\n",
-   "\n",
-   ""
-  ],
-  "text/plain": [
-   "[(u'MADlib version: 1.12-dev, git revision: rel/v1.11-23-gfdf7b6d, 
cmake configuration time: Wed Jun 28 18:06:35 UTC 2017, build type: Release, 
build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, 
C++ compiler: g++ 4.4.0',)]"
-  ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"%sql select madlib.version();\n",
-"#%sql select version();"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# 1.  Create input table"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {},
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "Done.\n",
-  "Done.\n",
-  "25 rows affected.\n",
-  "25 rows affected.\n"
- ]
-},
-{
- "data": {
-  "text/html": [
-   "\n",
-   "\n",
-   "id1\n",
-   "id2\n",
-   "gr1\n",
-   "gr2\n",
-   "\n",
-   "\n",
-   "1\n",
-   "0\n",
-   "1\n",
-   "1\n",
-   "\n",
-   "\n",
-   "2\n",
-   "

[06/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/SVM-novelty-detection-v2.ipynb
--
diff --git a/community-artifacts/SVM-novelty-detection-v2.ipynb 
b/community-artifacts/SVM-novelty-detection-v2.ipynb
new file mode 100755
index 000..678d7c9
--- /dev/null
+++ b/community-artifacts/SVM-novelty-detection-v2.ipynb
@@ -0,0 +1,511 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Novelty detection using 1-class SVM\n",
+"\n",
+"Classifies new data as similar or different to the training set.  This 
method is an unsupervised method that builds a decision boundary between the 
data and the origin in kernel space and can be used as a novelty detector."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
+  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
+  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: gpadmin@madlib'"
+  ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
+"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+"\n",
+"# Greenplum Database 4.3.10.0\n",
+"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+"collapsed": true
+   },
+   "outputs": [],
+   "source": [
+"# Setup\n",
+"%matplotlib inline\n",
+"\n",
+"import pandas as pd\n",
+"import numpy as np\n",
+"import matplotlib.pyplot as plt\n",
+"import matplotlib.font_manager"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "image/png": 
"iVBORw0KGgoNSUhEUgAAAW8AAAD7CAYAAAClvBX1BHNCSVQICAgIfAhkiAlwSFlz\nAAALEgAACxIB0t1+/AAAHfZJREFUeJzt3X9wXOV97/H3IwulS2yMZTmG4OCACOMCHiOby3DHnWsN\nYXcZOlUq6x9CSJUfjSZznZofx0RQU6IEcYkTtvnVtB6RTKzAMPQmvmrVznSPlXbEHTE3vQk2lDhQ\niIcyIQZSYXJBwyay0XP/eM6uVqtdayWt9uzZ/bxmdtgfZ88+LObjZ7/Pj2OstYiISLQ0hd0AERFZ\nPIW3iEgEKbxFRCJI4S0iEkEKbxGRCFJ4i4hEUHO1PsgYozmJIiJLYK01hc9VtedtrQ319oUvfCH0\nNtTKTd+Fvgt9F9H4LkpR2UREJIIU3iIiEdRQ4d3Z2Rl2E2qGvotZ+i5m6buYVevfhTlbTaWiH2SM\nrdZniYjUC2MMNuwBSxERqQyFt4hIBCm8RUQiSOEtIhJBCm+RMvm+T08iQU8ige/7YTdHGpxmm4iU\nwfd9eru7OZDJANAfizE8MkIymQy5ZVLvNNtEZBmGUikOZDL0Ar3AgUyGoVQq7GZFin65VFbVNqYS\nkcZV+Muld2JCv1yWSeEtUoY+z6N3YgLyyyaeF3KroiP/lwsAwS8XhffSKbxFypBMJhkeGcmVSoY9\nT8EjodKApTQ83/dzodynUF4RGvBdulIDlgpvaWgKlerRX5JLs+LhbYxpAn4KvGKt7SryusJbak5P\nIkHX2FiuFjsMjMbjHD5yJMxmieRUY6rgbcDPK3g+EREpoSLhbYzZBNwEfKcS5xOplj7Pc6USXK+7\nPxajT7NIJAIqUjYxxvwAeABYC3gqm0iUqBYrtaxU2WTZUwWNMX8IvG6tfdoY0wnM+5CsgYGB3P3O\nzs6av1KFNIZkMqnAlpoxPj7O+Pj4gsctu+dtjPkfw
 
K3AGSAGrAH+l7X2TwqOU89bRGSRqjJV0Biz\nC5VNREQqRhtTiYjUES3SERGpYep5i4jUEYV3hWnPYhGpBoV3BWX3yegaG3NLrru7FeAiDWqlO3IK\n7wrS1VYam351SVY1OnLaz1ukAnSlGMlXjYtPKLwrSFdbaVy6UoxUm8K7gnS1FRGB6nTkNM9bpAJ0\nUQcpVKkNz3QlHZEVpt0JZSUovEVEIkgrLEVE6ojCW0QkghTeIiIRpPAWEYkghbeISAQpvEVEIkjh\nLSISQQpvkQainQ/rhxbpiDQILeGPJq2wFGlwPYmE21s6eDwMjMbjHD5yJMxmyQK0wlJEpI4ovEUa\nRJ/nuVIJrtfdH4vRt8htSn3fJ5HoIZHoUc08ZCqbiDSQ5ex86Ps+3d29ZDIHAIjF+hkZGVbNfIWp\n5i0i8/i+Tyo1BIDn9Z01iBOJHsbGuiCvah6Pj3LkyOGVb2gDKxXeupKOSAPJD+tdu7bzwAPfyvWk\nJyZ61ZOOEIW3SIMoLH

[07/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/Random-forest-v2.ipynb
--
diff --git a/community-artifacts/Random-forest-v2.ipynb 
b/community-artifacts/Random-forest-v2.ipynb
new file mode 100644
index 000..87605b7
--- /dev/null
+++ b/community-artifacts/Random-forest-v2.ipynb
@@ -0,0 +1,3082 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Random forest\n",
+"\n",
+"Random forests build an ensemble of classifiers, each of which is a tree 
model constructed using bootstrapped samples from the input data. The results 
of these models are then combined to yield a single prediction, which, at the 
expense of some loss in interpretability, has been found to be highly 
accurate.\n",
+"\n",
+"Please also refer to the decision tree user documentation for information 
relevant to the implementation of random forests in MADlib.\n",
+"\n",
+"This notebook includes impurity importance which was added in 1.15."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
+  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
+  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: gpadmin@madlib'"
+  ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
+"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"#%sql postgresql://fmcquillan@localhost:5432/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.15-dev, git revision: 
rc/1.14-rc1-45-g3ab7554, cmake configuration time: Wed Aug  1 18:34:10 UTC 
2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C 
compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.15-dev, git revision: rc/1.14-rc1-45-g3ab7554, 
cmake configuration time: Wed Aug  1 18:34:10 UTC 2018, build type: release, 
build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ 
compiler: g++ 4.4.7',)]"
+  ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();\n",
+"#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Random forest classification examples"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# 1. Load data\n",
+"Data set related to whether to play golf or not."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+   

[13/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/Elastic-net-v3.ipynb
--
diff --git a/community-artifacts/Elastic-net-v3.ipynb 
b/community-artifacts/Elastic-net-v3.ipynb
new file mode 100644
index 000..7592fe6
--- /dev/null
+++ b/community-artifacts/Elastic-net-v3.ipynb
@@ -0,0 +1,2049 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Elastic net (MADlib v1.10+)\n",
+"Demonstrates elastic net, including these updates:\n",
+"- in MADlib 1.10: grouping and cross validation introduced \n",
+"- in MADlib 1.13: report negative root mean squared error instead of the 
negative mean squared error"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "The sql extension is already loaded. To reload it, use:\n",
+  "  %reload_ext sql\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: gpadmin@madlib'"
+  ]
+ },
+     "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
+"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"#%sql postgresql://fmcquillan@localhost:5432/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.15-dev, git revision: 
rc/1.14-rc1-23-gabafa66, cmake configuration time: Wed Jul 11 00:36:05 UTC 
2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C 
compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.15-dev, git revision: rc/1.14-rc1-23-gabafa66, 
cmake configuration time: Wed Jul 11 00:36:05 UTC 2018, build type: release, 
build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ 
compiler: g++ 4.4.7',)]"
+  ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();\n",
+"#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"## 1.  Create data set\n",
+"House prices and characteristics."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "27 rows affected.\n",
+  "27 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "id\n",
+   "tax\n",
+   "bedroom\n",
+   "bath\n",
+   "price\n",
+   "size\n",
+   "lot\n",
+   "zipcode\n",
+   "\n",
+   "\n",
+   "1\n",
+   "590\n",
+   "2\n",
+   "1.0\n",
+   "5\n",
+   "770\n",
+   "22100\n",
+   "94301\n",
+   "\n",
+   "\n",
+  

[09/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/Novelty-detection-demo-1.ipynb
--
diff --git a/community-artifacts/Novelty-detection-demo-1.ipynb 
b/community-artifacts/Novelty-detection-demo-1.ipynb
deleted file mode 100755
index 563bda4..000
--- a/community-artifacts/Novelty-detection-demo-1.ipynb
+++ /dev/null
@@ -1,478 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# Novelty detection using 1-class SVM\n",
-"\n",
-"Classifies new data as similar or different to the training set.  This 
method is an unsupervised method that builds a decision boundary between the 
data and origin in kernel space and can be used as a novelty detector."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 37,
-   "metadata": {
-"collapsed": false
-   },
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "The sql extension is already loaded. To reload it, use:\n",
-  "  %reload_ext sql\n"
- ]
-}
-   ],
-   "source": [
-"# Setup\n",
-"%load_ext sql\n",
-"# %sql postgresql://gpdbchina@10.194.10.68:55000/madlib\n",
-"%sql postgresql://fmcquillan@localhost:5432/madlib\n",
-"%matplotlib inline\n",
-"\n",
-"import pandas as pd\n",
-"import numpy as np\n",
-"import matplotlib.pyplot as plt\n",
-"import matplotlib.font_manager"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 38,
-   "metadata": {
-"collapsed": false
-   },
-   "outputs": [
-{
- "data": {
-  "image/png": 
"iVBORw0KGgoNSUhEUgAAAW8AAAD7CAYAAAClvBX1BHNCSVQICAgIfAhkiAlwSFlz\nAAALEgAACxIB0t1+/AAAHdNJREFUeJzt3W9wXNWZ5/HvkWWRTrADso2J48SAnRk2hICMijJFaqVN\n0mqGqWhG0hvCwDSwi3Zq+WOsNiiOCHFheRUnESQwM8WYsEhhirCVYTUjZid9LZKSqkSF7LA2lJeB\nAHaGTUIYYpydGOiJsHX2xbndarW69cfq7tu3+/ep6qL76va9R23z+PRznnOOsdYiIiLhUhd0A0RE\nZOkUvEVEQkjBW0QkhBS8RURCSMFbRCSEFLxFREKovlw3MsaoJlFE5DRYa03usbL2vK21gT6++tWv\nBt6GSnnos9Bnoc8iHJ9FIUqbiIiEkIK3iEgI1VTwbm1tDboJFUOfxQx9FjP0Wcyo9M/CzJdTKeqN\njLHlupeISLUwxmCDHrAUEZHiUPAWEQkhBW8RkRBS8BYRCSEFbxGREFLwFhEJIQVvEZEQUvAWEQkh\nBW8RkRAKffD2PI+utja62trwPC/o5oiIlEWop8d7nke8o4N9qRQAvZEIwyMjxGKxot5HRCQohabH\nhzp4d7W10T42Rtx/PQyMRqM8eeBAUe8jIhKUkq9tYoypM8YcNMaMFuuaIiKSXzG3QdsO/BOwuojX\nnFd3IkF8chKy0yaJRLluLyISmKL0vI0xG4Grge8U43qLFYvFGB4ZYTQaZTQaVb5bQk2D77IURcl5\nG2O+D+wFPgwkrLXtec7Ret4iBWjwXQoplPNedtrEGPOHwL9Ya583xrQCc26Stnv37szz1tbWit+p\nQqRc9g8Osi+Vygy+k0qxf3BQwbsGjY+PMz4+vuB5y+55G2P+K3AdcBKIAKuA/2Gt/dOc89TzFilA\nlVNSSFlKBY0xLShtIrJkSptIISVLm4jI8qUH3/cPD
 
gIwnEgocMu8Qj1JR0Sk2mkDYhGRKqLgLSIS\nQgreIiIhpOAtIhJCCt4iIiGk4C0iUkKlWrNGpYIiIiVSjMlXVbkZg4hIJSvGsgeq8xYRqSKaHi8i\nUiKl3DBGaRMRkRLyPC+zZk33aaxZo5y3iEgIKectIlJFFLxFpCy0R2dxKW0iIiWnzSZOn3LeIhIY\nbfN2+pTzFhE8z6OtrYu2ti6lLkJOdd4iNcLzPDo64qRS+wCYnIwzMjJcltRFKeuda5XSJiI1oq2t\ni7GxdshKXkSjoxw48GRZ7r/ceudapQ2IRSQvz/MYHNwPQCLRXbKgGovFFLCLSMFbpEYkEt1MTsbT\nmQsikV5aWm4LLJUiy6O0iUgNye1lDw7uDzSVIgtT2kRE5qQu0oFcwkelgiI1Zu/evaxZs4U1a7aw\nYcMqIpFeXOX1MJFIL4lEd9BNlEVQ2kSkhuzdu5e77/468IB/5Hbi8Q7eeOMEUNoBSzk9mmEpIqxZ\ns4Xjx79Cdo67sXEPb7/9WpDNknlohqWILJpmYlY+BW8JBa1IN7/FBtuenhuB20nnuOF2/9jsa3V0\nxBkba2dsrJ2Ojrg+80pkrS3Lw91KZOmSyaRdH4nYIbBDYNdHIjaZTAbdrIqRTCZtJLLewpCFIRuJ\nrC/4+SSTSbt58ydtff05dtWqj9v+/v4550Sjnf61rP8YstFoZ+b9TU0ttrFxs21qulJ/DmXgx865\nMTXfwVI8FLzldHVGo3ZoJpLYIbCd0WjQzaoY8wXbbPMF+WQyaaPRThuNdtqmppY511u16mM2Ho9b\nYxotbLOQsLDaGnO2bWpqURAvoULBW3XeIjVi164BfyalG6xMpWbqvLNnWTY03MGKFTs4dSr9zp2c\nOJFiePgJ4FbgYuAOoB5r7+PQIfd+zcwsL+W8peJ1JxJuFTpclrY3EqG7BlekK5TXTiS689Zq33DD\nDaxcuZ6VK9cTjUZ54YX/k74S
 
0AU8xNGjr3LttbeQSp0PnAucy9TURk6deh94CBgF/hr4C+BDwLP+\neWcCv+c/d4FfE37KLF93vBQPlDaRZUgmk7YzGrWd0WhNfkVfKK+dnfZIJpM2Ho9bWJ053z2/0sKZ\nFtbmHE/4z8+ysM5/vm1O6sQd+4iF9VnvX28hWTBVI8uHct4i4TVfXru/v982Nm62kci5NhJZZxsb\nN1tjPphzfsIPzp/KE5Q3+wE4O2Anc4L82f41zi0Q1FfnHfyU5SsUvJU2EQmx9IzJ48e/Qir1NVIp\ny/Hjf4y19cBTWWc+A3wLl+rItQ64BvhnXKrEA2K43PhOXH77s8AQkG+i3a+Bm5mYOFicX0oWZdkD\nlsaYjcB3gfXANPCwtfaB+d8lIkuRbznXRGKYa6+9BTfVPZ519qh/bAcuDw7wsv/f7qxzD+MC8hrg\nJHAecCVwLXAO8BbwH4CngQNAJ/ATYHvWvdK59jeBnxXjV5VFKkbP+yTQY629CLgCuMUYc2ERrisi\nvlgsxsiIW641Gh1dVGVHQ0MD9fV3UV9/F5//fLM/qPkmcB2uauQRYBD4MvABXOB+BNgAvAPcCEwC\n/wnYBPwt8CX/9Xagx7/Wm6e1oJVmcS5P0dc2Mcb8LfCgtfaHOcdtse8lUuvmLjS1E9ezfpj+/rsA\nuO++RwH4whc+k1mA6ujRoxw5cgfM2s/9YeCnwDf9Y7244PwM8Aug3z/fA3ZTV/cSH/rQarZsuYCB\ngV1LKhPM3U8zEulVqWEBZVnP2xhzHnAp7ruViJRYX18fAPfdt4dU6l3AUF//fc45ZyOPPvo4R478\nM/BpAIaHn2Dz5n/HBRdcwOrVH8650mHgVVzgzk7BfA2XPpn2z/H8n+9jehpOnNjJ

[03/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/mlp-mnist-v2.ipynb
--
diff --git a/community-artifacts/mlp-mnist-v2.ipynb 
b/community-artifacts/mlp-mnist-v2.ipynb
deleted file mode 100644
index 3c1ad14..000
--- a/community-artifacts/mlp-mnist-v2.ipynb
+++ /dev/null
@@ -1,1154 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# Neural networks\n",
-"\n",
-"Multilayer perceptron (MLP) using the well known MNIST data set.\n",
-"\n",
-"Updated to include mini-batching which was added in the 1.14 release.\n",
-"\n",
-"# Intro"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-{
- "data": {
-  "image/jpeg": 
"/9j/4R5fRXhpZgAATU0AKggABwESAAMBAAEAAAEaAAUBYgEbAAUB\nagEoAAMBAAIAAAExAAIccgEyAAIUjodpAAQBpNAACvyA\nAAAnEAAK/IAAACcQQWRvYmUgUGhvdG9zaG9wIENTNSBXaW5kb3dzADIwMTU6MDc6MjQgMTA6NTk6\nNTEAA6ABAAMBAAEAAKACAAQBAAACoKADAAQBAAABcwAGAQMAAwAA\nAAEABgAAARoABQEAAAEeARsABQEAAAEmASgAAwEAAgAAAgEABAEAAAEuAgIA\nBAEAAB0pAEgBSAH/2P/tAAxBZG9iZV9DTQAB/+4ADkFkb2JlAGSA\nAf/bAIQADAgICAkIDAkJDBELCgsRFQ8MDA8VGBMTFRMTGBEMDAwMDAwRDAwMDAwMDAwMDAwM\nDAwMDAwMDAwMDAwMDAwMDAENCwsNDg0QDg4QFA4ODhQUDg4ODhQRDAwMDAwREQwMDAwMDBEMDAwM\nDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwM/8AAEQgAWACgAwEiAAIRAQMRAf/dAAQACv/EAT8AAAEF\nAQEBAQEBAAMAAQIEBQYHCAkKCwEAAQUBAQEBAQEAAQACAwQFBgcICQoLEAAB\nBAEDAgQCBQcGCAUDDDMBAAIRAwQhEjEFQVFhEyJxgTIGFJGhsUIjJBVSwWIzNHKC0UMHJZJT8OHx\nY3M1FqKygyZEk1RkRcKjdDYX0lXiZfKzhMPTdePzRieUpIW0lcTU5PSltcXV5fVWZnaGlqa2xtbm\n9jdHV2d3h5ent8fX5/cRAAICAQIEBAMEBQYHBwYF
 
NQEAAhEDITESBEFRYXEiEwUygZEUobFCI8FS\n0fAzJGLhcoKSQ1MVY3M08SUGFqKygwcmNcLSRJNUoxdkRVU2dGXi8rOEw9N14/NGlKSFtJXE1OT0\npbXF1eX1VmZ2hpamtsbW5vYnN0dXZ3eHl6e3x//aAAwDAQACEQMRAD8A8+yL677Bjsa4VU+1pr5c\nZ997qHex2/8AkvqVmrEDunZFjr2OqqurbVZqNjnh+/1G7fUZQ7+bf/w3p2s/PR+o9Lz+kVMuusdm\nY1h9mVSG2UknX0vXs3Pqs/kW1/8AFb0OjM9YNu2FhYSygXuL91h/wTXN9KnZs/nv0P8AwX+EVwRH\nERI1OvlP8v0WrxXEGGsL+Yfi6n1b+qNudU578dz6GkObbYfTZPZ9su2V4j/of4Sy/wDwXpLQ+tXS\nOidDbVl0Xs6hmOALNAcdriZda9u57MjZ7fRxv5j/AE3s/QrAOX1nIoi8bq2GHAlpx/3dwZLfsX/W\n/wBH/wAUhWU+ptPUdlTK2D1Qx732j6XpsrbvuZ6j2t9m5VZ8vmMwYnhiP3gYxr+t+jxf3nRx83y8\ncUoyjxEg7cMpX/e+fh/2f/VHP9PJ6hkvfY51j3AudY4+0azue7/BsRsu+uPtNW71AQL9hibOPVtd\n9JzLo3Naz9H/ADnvVhl7La/s2JS19FmjaCSywfvW2tZ77/67LX/9bUR07Ira66k1uraD6raSCA3u\n17bW73s/66rAgRHT1dZS31aEpgy19P7sfBosyxbIbtxbXab2NAYf65hz6/625W+k5WS3Os6dkOc6\nvPY7FtredwBcZoewas3svbW6t6rkV2GWFjmNGrbWw5o8fUq3OsZ/L/z1ZxaDe0Vh4FteuPaTIkcU\nG5m5vu/wPqemm2QQbutfCX9X/CZBAT9OwP8AzfH/AAXO22Y73V3s3ODi19TuxB2O/lMf7UW+trqz\nbS42AfSB+nWB/pGt+kz/AId
 
ns/4pdD9b+gHBycfMqE09Rory2gCNrntHrV/9ublzM2UvFlbi17T7\nSNCFBDNGVgai6/rNjLy04Ue4v+qUYrLQCSId2lELQGyHNDDoDrz3Uox8ru3HyI+Fdh/6nHf/AOAf\n8Qp112Ue8si9mlbXabf+HP8A6K/z/wDBqQD7O7AT337Mr7NlDGma3NeQA0agDbE/y91aLVjV5YL2\nAG0iLGAR/wChDG/+fa0G5jWUVusJBcBHftvc7/PuR6qMmusCtjqrLoDCdH7eXP8A3m7voMTrAJMv\nlA1WxgZUIfMTo3cetwz2VUVB9bQC+x2jWNI29/5KNlYFLb6rWPDgAfc7RrgP39v57VY6b0rqTG2W\nZ9fpYEB92Y7QV/ubv9I6z6Laq/0j0LO6sy+jKq6QQPQaLG3Fv6Z1YivJZW1381Tsd6/s9/6L9Ips\nOXBkxmQPFqTED+r+7+8s5jl8+HJwyiYmgDekfV++hf0NptORfdXTV/g6LPa6I3fzbvd/nIb8zpWP\nXNFbr36tftbtAP8AJts3PZ/Yas3EzH7ybALHO/Odq6f3nOSNFlgL2PLGEk+72Nn+S523d/ZQOSNf\nq4Cz39Ulvtm6nIkDt6Ypm9YbUf1WivHIduFhm2wT/wAJb/5FA+1W+u251jrTIIJ4Gu72fu+5RdTc\n0x6lTncwXt/79tU62vYSL2+m0CZI0I/kOHtUfFM6GwB4cMf+9ZOGI1Gv1uX/AHz/AP/Q5Tp+bfhd\nQNFDt+Na4NdU8B9dlbiN1N1T5rtb+Yl9Yei04edQ2uxuLhWVCzFYSS5u91nrV/n3Xena3+d/0foq\n5iYWFTe1vrnLvYQ5lTKHtqY8wd5teTZbWzd7WVM+mq31nysPL6kxl15NePWzGZ6bdzy6ou9d3q2P\npZ7rbH/yFp5Yj27IF8VCzXp/d/uubCROb02Bwni0/wAU/wB5yrc/ErtF1Hvyg0NdkWNhriNN9dbC\n51Vm3/
 
WtTr6jk10NfXY5jQTuLX7AS76Xs9rfcosxeie5nqZFV4gtN7Q2l2vua5+P9otr9v0H7LP+\ntqX2BmVWa6wxkRse29trASf8I2WWVb/9a1XByEmqvX0w/a2CMdCwfOf7GVGRg5FzXiaskNg2D3F3\n9lv538v1N607KMT0baKrWnMsbNo8h+839/8A8+f8YsNmBkY9pY7HvfYzuGnaD/J27t3tVlvSC8Nf\nYy0thpLnVlsal7/e76Wytjv7akhOfCYmAJOh/R/lJbMR6TqOhH6X8otW3Cdj2OBvre6r6fpl7XVu\n/wBGfVrZ/nfzX/CKz07GnIrscfQfIhzSNjp/lsOyh/8AX/RP/wCCTMz8k+y5hlsltpbuewk7thkO\nd6as/YnmoZEso3fzdm4sZaZ1Yxz9/p2/29n/ABagnhE4nhvby4Wzgz+3IGVb+fE+kk9Mz/q2K7nl\n2fgsdFThHtmYdW4ek7a1eddV+r2RWW5Lsd+NjWFw9dw2tLgRuZTjndfb6e737G+/9J+4rGH1HMbf\nXiZRsqJikWD81p09LLriux1Tf33s/Rfy6lW64d/Ucm6x9hvLx7gXBhaGtb+lDHOv/M2+p9BV+X5I\nwuc5cW0K+T5dpzv9Nu81z8cg9vGOEEyy6+vh498eLh4f1fF6nOpq6I2zZZ9otaP5zIY5lZYP324l\njLPV2/ufaP0v/BqfUS/CvOLfGTjPa1+JazQmp38zbjv9zms/0mO/ez1vU/wqDk5mdG0trfXyWFu+\nQPzv0+65zW/1lp9KnqeIMWzF9O7FJtwDXvAe4/pLMT3+vt9X+ep/m6vUVqIs8EdD/d08v0uL/Ccy\nZI9ctY9fV/zv0eFp5NIOTQ3HLbW4VMbZAO8Tb7mE+/8ASWN/fVn6vY9Z6gcnqdr8XDxh6+Vbtmxw\nnayqhr/5zJybXbGfufpL/wCbpeh19LynMyLL22V22kMHqV7LPcd9n6K1zf8AO9VXX9N
 

[10/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/MLP-v4.ipynb
--
diff --git a/community-artifacts/MLP-v4.ipynb b/community-artifacts/MLP-v4.ipynb
new file mode 100644
index 000..a6b62d6
--- /dev/null
+++ b/community-artifacts/MLP-v4.ipynb
@@ -0,0 +1,4588 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Multilayer Perceptron\n",
+"\n",
+"Multilayer Perceptron (MLP) is a type of neural network that can be used 
for regression and classification.\n",
+"\n",
+"This version of the workbook includes mini-batching added in 1.14 and 
momentum added in 1.15."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+"scrolled": true
+   },
+   "outputs": [
+{
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
+  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
+  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: gpadmin@madlib'"
+  ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
+"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+"\n",
+"# Greenplum Database 4.3.10.0\n",
+"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.15-dev, git revision: 
rc/1.14-rc1-23-g5c4331d, cmake configuration time: Thu Jul  5 17:46:06 UTC 
2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C 
compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.15-dev, git revision: rc/1.14-rc1-23-g5c4331d, 
cmake configuration time: Thu Jul  5 17:46:06 UTC 2018, build type: release, 
build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ 
compiler: g++ 4.4.7',)]"
+  ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();\n",
+"#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Classification without Mini-Batching\n",
+"\n",
+"# 1.  Create input table for classification"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "52 rows affected.\n",
+  "52 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   &

[05/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/SVM-v1.ipynb
--
diff --git a/community-artifacts/SVM-v1.ipynb b/community-artifacts/SVM-v1.ipynb
new file mode 100644
index 000..405710d
--- /dev/null
+++ b/community-artifacts/SVM-v1.ipynb
@@ -0,0 +1,2806 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Support Vector Machines\n",
+"Support Vector Machines (SVMs) are models for regression and 
classification tasks. SVM models have two particularly desirable features: 
robustness in the presence of noisy data and applicability to a variety of data 
configurations. At its core, a linear SVM model is a hyperplane separating two 
distinct classes of data (in the case of classification problems), in such a 
way that the distance between the hyperplane and the nearest training data 
point (called the margin) is maximized. Vectors that lie on this margin are 
called support vectors. With the support vectors fixed, perturbations of 
vectors beyond the margin will not affect the model; this contributes to the 
model’s robustness. By substituting a kernel function for the usual inner 
product, one can approximate a large variety of decision boundaries in addition 
to linear hyperplanes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "The sql extension is already loaded. To reload it, use:\n",
+  "  %reload_ext sql\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: gpadmin@madlib'"
+  ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
+"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+"\n",
+"# Greenplum Database 4.3.10.0\n",
+"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.15-dev, git revision: 
rc/1.14-rc1-25-gda13eb7, cmake configuration time: Tue Jul 10 21:37:52 UTC 
2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C 
compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.15-dev, git revision: rc/1.14-rc1-25-gda13eb7, 
cmake configuration time: Tue Jul 10 21:37:52 UTC 2018, build type: release, 
build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ 
compiler: g++ 4.4.7',)]"
+  ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+"collapsed": true
+   },
+   "source": [
+"# Classification\n",
+"# 1. Create input data set"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "15 rows affected.\n",
+  "15 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "

[14/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/Elastic-net-v2.ipynb
--
diff --git a/community-artifacts/Elastic-net-v2.ipynb 
b/community-artifacts/Elastic-net-v2.ipynb
deleted file mode 100644
index b6082f0..000
--- a/community-artifacts/Elastic-net-v2.ipynb
+++ /dev/null
@@ -1,2078 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# Elastic net (MADlib v1.10+)\n",
-"Demonstrates elastic net, including these updates:\n",
-"- in MADlib 1.10: grouping and cross validation which were introduced \n",
-"- in MADlib 1.13: report negative root mean squared error instead of the 
negative mean squared error"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-{
- "name": "stderr",
- "output_type": "stream",
- "text": [
-  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
-  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
-  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
-  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
- ]
-}
-   ],
-   "source": [
-"%load_ext sql"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-{
- "data": {
-  "text/plain": [
-   "u'Connected: gpdbchina@madlib'"
-  ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"# Greenplum 4.3.10.0\n",
-"%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n",
-"\n",
-"# PostgreSQL local\n",
-"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
-"\n",
-"# Greenplum 4.2.3.0\n",
-"#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "1 rows affected.\n"
- ]
-},
-{
- "data": {
-  "text/html": [
-   "\n",
-   "\n",
-   "version\n",
-   "\n",
-   "\n",
-   "MADlib version: 1.13-dev, git revision: 
rel/v1.12-42-gedc93f5, cmake configuration time: Fri Dec  8 18:28:18 UTC 2017, 
build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C 
compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0\n",
-   "\n",
-   ""
-  ],
-  "text/plain": [
-   "[(u'MADlib version: 1.13-dev, git revision: rel/v1.12-42-gedc93f5, 
cmake configuration time: Fri Dec  8 18:28:18 UTC 2017, build type: Release, 
build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, 
C++ compiler: g++ 4.4.0',)]"
-  ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"%sql select madlib.version();\n",
-"#%sql select version();"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"## 1.  Create data set\n",
-"House prices and characteristics."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "Done.\n",
-  "Done.\n",
-  "27 rows affected.\n",
-  "27 rows affected.\n"
- ]
-},
-{
- "data": {
-  "text/html": [
-   "\n",
-   "\n",
-   "id\n",
-   "tax\n&

[04/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/Stratified-sampling-v2.ipynb
--
diff --git a/community-artifacts/Stratified-sampling-v2.ipynb 
b/community-artifacts/Stratified-sampling-v2.ipynb
new file mode 100644
index 000..daa417b
--- /dev/null
+++ b/community-artifacts/Stratified-sampling-v2.ipynb
@@ -0,0 +1,672 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Stratified sampling\n",
+"Stratified sampling is a method for sampling subpopulations (strata) 
independently. It is commonly used to reduce sampling error by ensuring that 
subgroups are adequately represented in the sample.\n",
+"\n",
+"Stratified sampling was added in MADlib 1.12."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+"scrolled": true
+   },
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "The sql extension is already loaded. To reload it, use:\n",
+  "  %reload_ext sql\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: gpdbchina@madlib'"
+  ]
+     },
+     "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
+"#%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+"\n",
+"# Greenplum Database 4.3.10.0\n",
+"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.12-dev, git revision: 
rel/v1.11-23-gfdf7b6d, cmake configuration time: Wed Jun 28 18:06:35 UTC 2017, 
build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C 
compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.12-dev, git revision: rel/v1.11-23-gfdf7b6d, 
cmake configuration time: Wed Jun 28 18:06:35 UTC 2017, build type: Release, 
build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, 
C++ compiler: g++ 4.4.0',)]"
+  ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();\n",
+"#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# 1.  Create input table"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "25 rows affected.\n",
+  "25 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "id1\n",
+   "id2\n",
+   "gr1\n",
+   "gr2\n",
+   "\n",
+   "\n",
+   "1\n",
+   "0\n",
+   "1\n",
+   "1\n",
+   "\n",
+   "\n",
+   "2\n",
+   "0\n",
+   "1\n&quo

madlib-site git commit: website update for 1dot15

2018-08-11 Thread fmcquillan
Repository: madlib-site
Updated Branches:
  refs/heads/asf-site 573d66d85 -> 127c0b7e7


website update for 1dot15


Project: http://git-wip-us.apache.org/repos/asf/madlib-site/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib-site/commit/127c0b7e
Tree: http://git-wip-us.apache.org/repos/asf/madlib-site/tree/127c0b7e
Diff: http://git-wip-us.apache.org/repos/asf/madlib-site/diff/127c0b7e

Branch: refs/heads/asf-site
Commit: 127c0b7e7dd5d760bdbee5e82928e70418af1447
Parents: 573d66d
Author: Frank McQuillan 
Authored: Sat Aug 11 09:32:51 2018 -0700
Committer: Frank McQuillan 
Committed: Sat Aug 11 09:37:21 2018 -0700

--
 documentation.html |  3 ++-
 download.html  | 42 +++---
 index.html | 42 +++---
 3 files changed, 56 insertions(+), 31 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/madlib-site/blob/127c0b7e/documentation.html
--
diff --git a/documentation.html b/documentation.html
index 4670603..0d01094 100644
--- a/documentation.html
+++ b/documentation.html
@@ -55,6 +55,7 @@ jQuery(document).ready(function() {
 The primary documentation reference material providing 
detailed information on the functions and algorithms within MADlib as well as 
background theory and references into the literature.
 
 Older Documentation
+MADlib v1.14
 MADlib v1.13
 MADlib v1.12
 MADlib v1.11
@@ -102,7 +103,7 @@ jQuery(document).ready(function() {
 
 
 https://github.com/apache/madlib-site/tree/asf-site/community-artifacts;>Jupyter
 Notebooks for Getting Started
-Includes many of the most commonly used algorithms by data 
scientists.
+Includes many commonly used algorithms by data scientists.
 
 
 

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/127c0b7e/download.html
--
diff --git a/download.html b/download.html
index 8728d10..ce790ad 100644
--- a/download.html
+++ b/download.html
@@ -58,7 +58,7 @@
Current Release


-   v1.14
+   v1.15
Source Code and Convenience 
Binaries
 
MADlib source code 
and convenience binaries are available from the Apache distribution site.
@@ -66,10 +66,13 @@
Latest 
stable release:
 

-   http://apache.org/dyn/closer.cgi?filename=madlib/1.14/apache-madlib-1.14-src.tar.gz=download;>Source
 code tar.gz (https://www.apache.org/dist/madlib/1.14/apache-madlib-1.14-src.tar.gz.asc;>pgp,
 https://www.apache.org/dist/madlib/1.14/apache-madlib-1.14-src.tar.gz.sha512;>sha512)
 
-   http://apache.org/dyn/closer.cgi?filename=madlib/1.14/apache-madlib-1.14-bin-Linux-GPDB43.rpm=download;>Linux
   (https://www.apache.org/dist/madlib/1.14/apache-madlib-1.14-bin-Linux-GPDB43.rpm.asc;>pgp,
  https://www.apache.org/dist/madlib/1.14/apache-madlib-1.14-bin-Linux-GPDB43.rpm.sha512;>sha512)
 — CentOS / Red Hat 5 and higher (64 bit). GPDB 4.3.x.
-   http://apache.org/dyn/closer.cgi?filename=madlib/1.14/apache-madlib-1.14-bin-Linux.rpm=download;>Linux
   (https://www.apache.org/dist/madlib/1.14/apache-madlib-1.14-bin-Linux.rpm.asc;>pgp,
  https://www.apache.org/dist/madlib/1.14/apache-madlib-1.14-bin-Linux.rpm.sha512;>sha512)
 — CentOS / Red Hat 6 and higher (64 bit). GPDB 5.x, PostgreSQL 9.6 and 
10.2.
-   http://apache.org/dyn/closer.cgi?filename=madlib/1.14/apache-madlib-1.14-bin-Darwin.dmg=download;>Mac
 OS X   (https://www.apache.org/dist/madlib/1.14/apache-madlib-1.14-bin-Darwin.dmg.asc;>pgp,
 https://www.apache.org/dist/madlib/1.14/apache-madlib-1.14-bin-Darwin.dmg.sha512;>sha512)
 — OS 10.6 and higher.  For PostgreSQL 9.6 and 10.2.
+   http://apache.org/dyn/closer.cgi?filename=madlib/1.15/apache-madlib-1.15-src.tar.gz=download;>Source
 code tar.gz (https://www.apache.org/dist/madlib/1.15/apache-madlib-1.15-src.tar.gz.asc;>pgp,
 https://www.apache.org/dist/madlib/1.15/apache-madlib-1.15-src.tar.gz.sha512;>sha512)
 
+
+   http://apache.org/dyn/closer.cgi?filename=madlib/1.15/apache-madlib-1.15-bin-Linux-GPDB43.rpm=download;>Linux
   (https://www.apache.org/dist/madlib/1.15/apache-madlib-1.15-bin-Linux-GPDB43.rpm.asc;>pgp,
  

madlib git commit: add note to user docs on vec2cols about unequal arrays

2018-08-17 Thread fmcquillan
Repository: madlib
Updated Branches:
  refs/heads/master a3b59356f -> 5e707f745


add note to user docs on vec2cols about unequal arrays


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/5e707f74
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/5e707f74
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/5e707f74

Branch: refs/heads/master
Commit: 5e707f745c50343dd7395a3e8f86c04428210977
Parents: a3b5935
Author: Frank McQuillan 
Authored: Fri Aug 17 13:38:20 2018 -0700
Committer: Frank McQuillan 
Committed: Fri Aug 17 13:38:20 2018 -0700

--
 .../postgres/modules/stats/correlation.sql_in| 10 +-
 .../postgres/modules/utilities/cols2vec.sql_in   |  4 ++--
 .../postgres/modules/utilities/vec2cols.sql_in   | 19 ---
 3 files changed, 19 insertions(+), 14 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/madlib/blob/5e707f74/src/ports/postgres/modules/stats/correlation.sql_in
--
diff --git a/src/ports/postgres/modules/stats/correlation.sql_in 
b/src/ports/postgres/modules/stats/correlation.sql_in
index 64ed27e..3bf3e46 100644
--- a/src/ports/postgres/modules/stats/correlation.sql_in
+++ b/src/ports/postgres/modules/stats/correlation.sql_in
@@ -222,7 +222,7 @@ SELECT * FROM example_data_output ORDER BY column_position;
 
  column_position |  variable   | temperature | humidity 
 -+-+-+--
-   1 | temperature |   1 | 
+   1 | temperature |   1 | 
2 | humidity| 0.00607993890408995 |1
 (2 rows)
 
@@ -259,11 +259,11 @@ SELECT * FROM example_data_output ORDER BY day, 
column_position;
 
  column_position |  variable   | day  |temperature| humidity 
 -+-+--+---+--
-   1 | temperature | Mon  | 1 | 
+   1 | temperature | Mon  | 1 | 
2 | humidity| Mon  | 0.616876934548786 |1
-   1 | temperature | Tues | 1 | 
+   1 | temperature | Tues | 1 | 
2 | humidity| Tues | 0.616876934548786 |1
-   1 | temperature | Wed  | 1 | 
+   1 | temperature | Wed  | 1 | 
2 | humidity| Wed  | -0.28969669368457 |1
 (6 rows)
 
@@ -315,7 +315,7 @@ SELECT * FROM example_data_output ORDER BY column_position;
 
  column_position |  variable   |   temperature| humidity 
 -+-+--+--
-   1 | temperature | 507.926664293343 | 
+   1 | temperature | 507.926664293343 | 
2 | humidity| 2.40227839088644 | 307.359914560342
 (2 rows)
 

http://git-wip-us.apache.org/repos/asf/madlib/blob/5e707f74/src/ports/postgres/modules/utilities/cols2vec.sql_in
--
diff --git a/src/ports/postgres/modules/utilities/cols2vec.sql_in 
b/src/ports/postgres/modules/utilities/cols2vec.sql_in
index 82a1f94..0c54ab5 100644
--- a/src/ports/postgres/modules/utilities/cols2vec.sql_in
+++ b/src/ports/postgres/modules/utilities/cols2vec.sql_in
@@ -82,8 +82,8 @@ values.
 
 list_of_features_to_exclude (optional)
 TEXT. Default NULL.
-Comma-separated string of column names to exclude from the feature array.  
-Typically used when 'list_of_features' is set to '*'.
+Comma-separated string of column names to exclude from the feature array.  
Typically used 
+when 'list_of_features' is set to '*'.
 
 cols_to_output (optional)
 TEXT. Default NULL.

http://git-wip-us.apache.org/repos/asf/madlib/blob/5e707f74/src/ports/postgres/modules/utilities/vec2cols.sql_in
--
diff --git a/src/ports/postgres/modules/utilities/vec2cols.sql_in 
b/src/ports/postgres/modules/utilities/vec2cols.sql_in
index 989074c..115e015 100644
--- a/src/ports/postgres/modules/utilities/vec2cols.sql_in
+++ b/src/ports/postgres/modules/utilities/vec2cols.sql_in
@@ -72,23 +72,28 @@ vec2cols(
 same name already exists, an error will be returned.
 
 vector_col
-TEXT. Name of the column containing the feature array.  
-Must be a one-dimensional array.
+TEXT. Name of the column containing the feature array.  Must be a 
one-dimensional array.
 
 feature_names (optional)
-TEXT[]. Array of names associated with the feature array.  
-Note that this array exists in the
-summary table created by the function 'cols2vec'.  
-If the 'feature_names' array is not specified,

[2/2] madlib-site git commit: fix decision tree jupyter notebook

2018-04-24 Thread fmcquillan
fix decision tree jupyter notebook


Project: http://git-wip-us.apache.org/repos/asf/madlib-site/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib-site/commit/f732f863
Tree: http://git-wip-us.apache.org/repos/asf/madlib-site/tree/f732f863
Diff: http://git-wip-us.apache.org/repos/asf/madlib-site/diff/f732f863

Branch: refs/heads/asf-site
Commit: f732f863cead81f4ecee5fbe3efb9dd362964c57
Parents: 418f361
Author: Frank McQuillan 
Authored: Tue Apr 24 09:10:53 2018 -0700
Committer: Frank McQuillan 
Committed: Tue Apr 24 09:10:53 2018 -0700

--
 community-artifacts/Decision-trees-v1.ipynb | 1785 --
 1 file changed, 1623 insertions(+), 162 deletions(-)
--




[1/2] madlib-site git commit: fix decision tree jupyter notebook

2018-04-24 Thread fmcquillan
Repository: madlib-site
Updated Branches:
  refs/heads/asf-site 418f361cf -> f732f863c


http://git-wip-us.apache.org/repos/asf/madlib-site/blob/f732f863/community-artifacts/Decision-trees-v1.ipynb
--
diff --git a/community-artifacts/Decision-trees-v1.ipynb 
b/community-artifacts/Decision-trees-v1.ipynb
index e97b943..02a60ef 100644
--- a/community-artifacts/Decision-trees-v1.ipynb
+++ b/community-artifacts/Decision-trees-v1.ipynb
@@ -11,15 +11,17 @@
   },
   {
"cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 1,
"metadata": {},
"outputs": [
 {
- "name": "stdout",
+ "name": "stderr",
  "output_type": "stream",
  "text": [
-  "The sql extension is already loaded. To reload it, use:\n",
-  "  %reload_ext sql\n"
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
+  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
+  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
  ]
 }
],
@@ -29,26 +31,26 @@
   },
   {
"cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 2,
"metadata": {},
"outputs": [
 {
  "data": {
   "text/plain": [
-   "u'Connected: fmcquillan@madlib'"
+   "u'Connected: gpadmin@madlib'"
   ]
  },
- "execution_count": 35,
+ "execution_count": 2,
  "metadata": {},
  "output_type": "execute_result"
 }
],
"source": [
 "# Greenplum Database 5.4.0 on GCP (demo machine)\n",
-"#%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
 "\n",
 "# PostgreSQL local\n",
-"%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
 "\n",
 "# Greenplum Database 4.3.10.0\n",
 "#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
@@ -56,9 +58,37 @@
   },
   {
"cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
"metadata": {},
-   "outputs": [],
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.14, git revision: 
rc/1.13-rc1-68-g1c81cb1, cmake configuration time: Tue Apr 24 15:54:15 UTC 
2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C 
compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.14, git revision: rc/1.13-rc1-68-g1c81cb1, cmake 
configuration time: Tue Apr 24 15:54:15 UTC 2018, build type: release, build 
system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: 
g++ 4.4.7',)]"
+  ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
"source": [
 "%sql select madlib.version();\n",
 "#%sql select version();"
@@ -81,7 +111,7 @@
   },
   {
"cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 4,
"metadata": {},
"outputs": [
 {
@@ -282,7 +312,7 @@
" (14, u'rain', 71.0, 80.0, [71.0, 80.0], [u'low', u'unhealthy'], True, 
u\"Don't Play\", 1.0)]"
   ]
  },
- "execution_count": 36,
+ "execution_count": 4,
  "metadata": {},
  "output_type": "execute_result"
 }
@@ -332,7 +362,7 @@
   },
   {
"cell_type": "code",
-   &quo

[07/15] madlib-site git commit: jupyter notebooks for 1.14 release

2018-04-23 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/PageRank-v1.ipynb
--
diff --git a/community-artifacts/PageRank-v1.ipynb 
b/community-artifacts/PageRank-v1.ipynb
deleted file mode 100644
index 32b1caf..000
--- a/community-artifacts/PageRank-v1.ipynb
+++ /dev/null
@@ -1,774 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# PageRank\n",
-"The PageRank algorithm produces a probability distribution representing 
the likelihood that a person randomly traversing a graph will arrive at any 
particular vertex. PageRank was added in MADlib 1.11."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {
-"collapsed": false
-   },
-   "outputs": [
-{
- "name": "stderr",
- "output_type": "stream",
- "text": [
-  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
-  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
-  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
-  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
- ]
-}
-   ],
-   "source": [
-"%load_ext sql"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {
-"collapsed": false
-   },
-   "outputs": [
-{
- "data": {
-  "text/plain": [
-   "u'Connected: fmcquillan@madlib'"
-  ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"# Greenplum 4.3.10.0\n",
-"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n",
-"\n",
-"# PostgreSQL local\n",
-"%sql postgresql://fmcquillan@localhost:5432/madlib\n",
-"\n",
-"# Greenplum 4.2.3.0\n",
-"#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {
-"collapsed": false
-   },
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "1 rows affected.\n"
- ]
-},
-{
- "data": {
-  "text/html": [
-   "\n",
-   "\n",
-   "version\n",
-   "\n",
-   "\n",
-   "MADlib version: 1.11-dev, git revision: 
rc/v1.9alpha-rc1-138-gcc5ce09, cmake configuration time: Tue Apr 11 20:47:30 
UTC 2017, build type: Release, build system: 
Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: 
g++ 4.4.0\n",
-   "\n",
-   ""
-  ],
-  "text/plain": [
-   "[(u'MADlib version: 1.11-dev, git revision: 
rc/v1.9alpha-rc1-138-gcc5ce09, cmake configuration time: Tue Apr 11 20:47:30 
UTC 2017, build type: Release, build system: 
Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: 
g++ 4.4.0',)]"
-  ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"%sql select madlib.version();\n",
-"#%sql select version();"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# 1.  Create vertex and edge tables"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {
-"collapsed": false
-   },
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "Done.\n",
-  "Done.\n",
-  "Done.\n",
-  "7 rows affected.\n",
-  "22 rows affected.\n",
-  "22 rows affected.\n"
- ]
-},
-{
- "data": {
-  "text/html": [
-

[09/15] madlib-site git commit: jupyter notebooks for 1.14 release

2018-04-23 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/MLP.ipynb
--
diff --git a/community-artifacts/MLP.ipynb b/community-artifacts/MLP.ipynb
deleted file mode 100644
index dcd0cdb..000
--- a/community-artifacts/MLP.ipynb
+++ /dev/null
@@ -1,514 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"- This demo uses the popular MNIST dataset, which consists of 70,000 hand 
written digits and is used for \n",
-"classification.\n",
-"\n",
-"## Current best accuracy on postgres\n",
-"\n",
-"### train_accuracy\n",
-"\n",
-"- 99.64%\n",
-"\n",
-"### test_accuracy\n",
-"\n",
-"- 96.79%\n",
-"\n",
-"### Parameters\n",
-"- Hidden layers: [200,200], tanh activation, n_iterations=10, 
learning_rate_init=0.001, learning_rate_policy=constant, lambda=0.0001, 
tolerance=0"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-{
- "data": {
-  "text/plain": [
-   "u'Connected: csloan@postgres'"
-  ]
- },
- "execution_count": 1,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"%load_ext sql\n",
-"%sql postgresql://csloan@localhost:5432/postgres"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "DROP TABLE\n",
-  "CREATE TABLE\n",
-  "COPY 6\n",
-  "DROP TABLE\n",
-  "CREATE TABLE\n",
-  "COPY 1\n"
- ]
-}
-   ],
-   "source": [
-"%%bash\n",
-"# Note that these datasets are available from 
https://github.com/apache/incubator-madlib-site\n",
-"gunzip -c ../data/mnist_train.sql.gz > ../data/mnist_train.sql\n",
-"gunzip -c ../data/mnist_test.sql.gz > ../data/mnist_test.sql\n",
-"psql -f ../data/mnist_train.sql\n",
-"psql -f ../data/mnist_test.sql"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-{
- "data": {
-  "image/png": 
"iVBORw0KGgoNSUhEUgAAA9YAAAGDCAIAAABBVx+IAABFLklEQVR42u3df3AU953n/7ElukHM\nGCOBsSQ2TFivZZ8FxsjEsXyxoZxS4iOK76I9fw/OW18l32/B9+vvslvl2dqUtZVjvHVfcanyuGqB\nS1Wou0TZJOZqK3IdYflWzMYb4hxKiKzlh0UBwpYH2xLIFgIxg2BaI/Pt6Y7aTXfPaDQaTXd/5vn4\ng5GE0OfT/Zn59IuP3vPpO28BKKE7AwBKiAgOEMEBIjgAIjgAAABABAcA\nAABABAcAAACI4AARHAARHCCCAwAAACCCA0ARpYb7jmT3zihnCADK55IQz3FJ6DmX\nIIIDQHEk+mJ/ld2+gRSnCADKxcUj0RyXhB8VdkkgggMlRQQHgGykx1/5xVG7V5plzg0AlItw\n+8/sV4Jf/fTPVxHBAWB+MngoJDvgxABAOXG4EIRC0px+ZOER/A4AyILpGgCAeYngAIjg\nQBlE8FsAoGFKBQCgRBEcABEcAEonFT8YjXREY7FoJNodT4nRlngHJeQwud6oB/vghW4I/wRg\nIiKCA4DrAbxv1/Z9oW07o5HIS+1S146951L+b0u8gxJymFxv1IN98EI3hH8CMBERwQHA/QTe3334\ncl1TXWa3cDncXHPx4P55u4F9ydoS76CEHCbXG/VgH7zQDeGfAExERHAAcF8iHlek4B/u0SBJNVIy\nPqz4vS3xDkrIYXK9UQ/2wQvdEP4JwEREBAcA9ykp80pJZgUlkUz4vS3xDkrIYXK9UQ/2wQvdEP4J\nwEREBAcA90m33bA+M4nLkuT3tsQ7KCGHyfVGPdgHL3RD+CcAExERHADcFwrXSkoyof/GUlGSSjBc\nF/J7W+IdlJDD5HqjHuyDF7oh/BOAiYgIDgDukxu3t
 
tQM92g7WKXiR+K1LVsaZb+3Jd5BCTlMrjfq\nwT54oRvCPwGYiIjgAOCFDN4U2d2e3NfRGYvt6gpseyWyRvZ/W+IdlJDD5HqjHuyDF7oh/BOAiYgI\nDgBeEGpo69wT64hEorFoW4MsRlviHZSQw+R6ox7sgxe6IfwTgImICA4AAAD4DBEcIIID\nRHAARHCACA4AAACACA4QwQEiOAAiOEAEBwAAAEAEBwAAAIjg\nAIjgABEcIIIDIIIDgEel4gejkY5oLBaNRLvjKTHaEu+ghBwm1xv1YB+80A3hnwBM\nRERwAHA9gPft2r4vtG1nNBJ5qV3q2rH3XMr/bYl3UEIOk+uNerAPXuiG8E8AJiIiOAC4n8D7uw9f\nrmuqk9WP5XBzzcWD+wdSfm9LvIMScphcb9SDffBCN4R/AjAREcEBwH2JeFyRgiFJ+0SSaqRkfFjx\ne1viHZSQw+R6ox7sgxe6IfwTgImICA4A7lNS5pWSzApKIpnwe1viHZSQw+R6ox7sgxe6IfwTgImI\nCA4A7pNk2fRZZhKXJcnvbYl3UEIOk+uNerAPXuiG8E8AJiIiOAC4LxSulZRkQv+NpaIklWC4LuT3\ntsQ7KCGHyfVGPdgHL3RD+CcAExERHADcJzdubakZ7tF2sErFj8RrW7Y0yn5vS7yDEnKYXG/Ug33w\nQjeEfwIwERHBAcALGbwpsrs9ua+jMxbb1RXY9kpkjez/tsQ7KCGHyfVGPdgHL3RD+CcAExERHAC8\nINTQ1rkn1hGJRGPRtgZZjLbEOyghh8n1Rj3YBy90Q/gnABMRERwAAADwGSI4QAQHiOAA\niOAAERwARCCtWLF48dIVSyVOBQDASejepeqF4o/uLuxCQQQHADtlZOT69SsjVxROBQDA\nSeLSFfVC8eHVwi4URPBy8alm
 

[01/15] madlib-site git commit: jupyter notebooks for 1.14 release

2018-04-23 Thread fmcquillan
Repository: madlib-site
Updated Branches:
  refs/heads/notebook-updates-1dot14 [created] 3f849b9e4


http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/mlp-v3.ipynb
--
diff --git a/community-artifacts/mlp-v3.ipynb b/community-artifacts/mlp-v3.ipynb
new file mode 100644
index 000..8c585a6
--- /dev/null
+++ b/community-artifacts/mlp-v3.ipynb
@@ -0,0 +1,4584 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Multilayer Perceptron\n",
+"\n",
+"Multilayer Perceptron (MLP) is a type of neural network that can be used 
for regression and classification.\n",
+"\n",
+"This version of the workbook includes mini-batching which was added in 
the 1.14 release."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+"scrolled": true
+   },
+   "outputs": [
+{
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
+  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
+  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: gpadmin@madlib'"
+  ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
+"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+"\n",
+"# Greenplum Database 4.3.10.0\n",
+"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.14-dev, git revision: 
rc/1.13-rc1-66-g4cced1b, cmake configuration time: Mon Apr 23 16:26:17 UTC 
2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C 
compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-66-g4cced1b, 
cmake configuration time: Mon Apr 23 16:26:17 UTC 2018, build type: release, 
build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ 
compiler: g++ 4.4.7',)]"
+  ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();\n",
+"#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Classification without Mini-Batching\n",
+"\n",
+"# 1.  Create input table for classification"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "52 rows affected.\n",
+  "52 rows affected.\n"
+ ]
+},
+{
+ "d

[06/15] madlib-site git commit: jupyter notebooks for 1.14 release

2018-04-23 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/Random-forest-v1.ipynb
--
diff --git a/community-artifacts/Random-forest-v1.ipynb 
b/community-artifacts/Random-forest-v1.ipynb
new file mode 100644
index 000..bac8363
--- /dev/null
+++ b/community-artifacts/Random-forest-v1.ipynb
@@ -0,0 +1,2899 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Random forest\n",
+"\n",
+"Random forests build an ensemble of classifiers, each of which is a tree 
model constructed using bootstrapped samples from the input data. The results 
of these models are then combined to yield a single prediction, which, at the 
expense of some loss in interpretation, has been found to be highly 
accurate.\n",
+"\n",
+"Please also refer to the decision tree user documentation for information 
relevant to the implementation of random forests in MADlib."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "The sql extension is already loaded. To reload it, use:\n",
+  "  %reload_ext sql\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: gpadmin@madlib'"
+  ]
+     },
+     "execution_count": 73,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
+"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+"\n",
+"# Greenplum Database 4.3.10.0\n",
+"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 75,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.14-dev, git revision: 
rc/1.13-rc1-40-ga1360f3, cmake configuration time: Wed Mar 28 18:16:08 UTC 
2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C 
compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-40-ga1360f3, 
cmake configuration time: Wed Mar 28 18:16:08 UTC 2018, build type: release, 
build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ 
compiler: g++ 4.4.7',)]"
+  ]
+ },
+ "execution_count": 75,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();\n",
+"#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Random forest classification examples"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# 1. Load data\n",
+"Data set related to whether to play golf or not."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 76,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "14 rows affected.\n",
+  "14 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "id\n",
+   "OUTLOOK\n",
+   "temperature\

[11/15] madlib-site git commit: jupyter notebooks for 1.14 release

2018-04-23 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/Encoding-categorical-variables-v2.ipynb
--
diff --git a/community-artifacts/Encoding-categorical-variables-v2.ipynb 
b/community-artifacts/Encoding-categorical-variables-v2.ipynb
new file mode 100644
index 000..5e4cb6f
--- /dev/null
+++ b/community-artifacts/Encoding-categorical-variables-v2.ipynb
@@ -0,0 +1,4026 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Encoding categorical variables\n",
+"This is the new module that replaces create_indicator_variables() which 
was deprecated as of MADlib v1.10"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
+  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
+  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: gpadmin@madlib'"
+  ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
+"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+"\n",
+"# Greenplum Database 4.3.10.0\n",
+"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.14-dev, git revision: 
rc/1.13-rc1-21-g3af2d70, cmake configuration time: Mon Feb 26 18:00:54 UTC 
2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C 
compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-21-g3af2d70, 
cmake configuration time: Mon Feb 26 18:00:54 UTC 2018, build type: release, 
build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ 
compiler: g++ 4.4.7',)]"
+  ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();\n",
+"#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"## 1.  Load data set\n",
+"Use a subset of the abalone dataset:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "20 rows affected.\n",
+  "20 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "id\n",
+   "sex\n",
+   "length\n",
+   "

[12/15] madlib-site git commit: jupyter notebooks for 1.14 release

2018-04-23 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/Encoding-categorical-variables-1dot10-v1.ipynb
--
diff --git a/community-artifacts/Encoding-categorical-variables-1dot10-v1.ipynb 
b/community-artifacts/Encoding-categorical-variables-1dot10-v1.ipynb
deleted file mode 100644
index 409de20..000
--- a/community-artifacts/Encoding-categorical-variables-1dot10-v1.ipynb
+++ /dev/null
@@ -1,2748 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# Encoding categorical variables (MADlib v1.10+)\n",
-"This is the new module that replaces create_indicator_variables() which 
has been deprecated as of MADlib v1.10"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 44,
-   "metadata": {
-"collapsed": false
-   },
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "The sql extension is already loaded. To reload it, use:\n",
-  "  %reload_ext sql\n"
- ]
-}
-   ],
-   "source": [
-"%load_ext sql"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 45,
-   "metadata": {
-"collapsed": false
-   },
-   "outputs": [
-{
- "data": {
-  "text/plain": [
-   "u'Connected: gpdbchina@madlib'"
-  ]
- },
- "execution_count": 45,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"%sql postgresql://gpdbchina@10.194.10.68:55000/madlib\n",
-"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
-"#%sql postgresql://gpadmin@54.197.30.46:10432/gpadmin"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 46,
-   "metadata": {
-"collapsed": false
-   },
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "1 rows affected.\n"
- ]
-},
-{
- "data": {
-  "text/html": [
-   "\n",
-   "\n",
-   "version\n",
-   "\n",
-   "\n",
-   "MADlib version: 1.10.0-dev, git revision: 
rel/v1.9.1-47-g2d5a5ed, cmake configuration time: Tue Feb  7 19:45:19 UTC 2017, 
build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C 
compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0\n",
-   "\n",
-   ""
-  ],
-  "text/plain": [
-   "[(u'MADlib version: 1.10.0-dev, git revision: rel/v1.9.1-47-g2d5a5ed, 
cmake configuration time: Tue Feb  7 19:45:19 UTC 2017, build type: Release, 
build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, 
C++ compiler: g++ 4.4.0',)]"
-  ]
- },
- "execution_count": 46,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"%sql select madlib.version();\n",
-"#%sql select version();"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"## 1.  Load data set\n",
-"Use a subset of the abalone dataset:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 47,
-   "metadata": {
-"collapsed": false
-   },
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "Done.\n",
-  "Done.\n",
-  "20 rows affected.\n",
-  "20 rows affected.\n"
- ]
-},
-{
- "data": {
-  "text/html": [
-   "\n",
-   "\n",
-   "id\n",
-   "sex\n",
-   "length\n",
-   "diameter\n",
-   "height\n",
-   "rings\n",
-   "\n",
-   "\n",
-   "1\n",
-   "M\n",
-   "0.455\n",
-   "0.365\n",
-   "0.095\n",
-   "15\n",
-   "\n",
-   "\n",
-   "2\n",
-  

[13/15] madlib-site git commit: jupyter notebooks for 1.14 release

2018-04-23 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/Decision-trees-v1.ipynb
--
diff --git a/community-artifacts/Decision-trees-v1.ipynb 
b/community-artifacts/Decision-trees-v1.ipynb
new file mode 100644
index 000..e97b943
--- /dev/null
+++ b/community-artifacts/Decision-trees-v1.ipynb
@@ -0,0 +1,1590 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Decision trees\n",
+"\n",
+"A decision tree is a supervised learning method that can be used for 
classification and regression. It consists of a structure in which internal 
nodes represent tests on attributes, and the branches from nodes represent the 
result of those tests. Each leaf node is a class label and the paths from root 
to leaf nodes define the set of classification or regression rules."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "The sql extension is already loaded. To reload it, use:\n",
+  "  %reload_ext sql\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: fmcquillan@madlib'"
+  ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
+"#%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+"\n",
+"# Greenplum Database 4.3.10.0\n",
+"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+"%sql select madlib.version();\n",
+"#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Decision tree classification examples"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# 1. Load data\n",
+"Data set related to whether to play golf or not."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "14 rows affected.\n",
+  "14 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "id\n",
+   "OUTLOOK\n",
+   "temperature\n",
+   "humidity\n",
+   "Temp_Humidity\n",
+   "clouds_airquality\n",
+   "windy\n",
+   "class\n",
+   "observation_weight\n",
+   "\n",
+   "\n",
+   "1\n",
+   "sunny\n",
+   "85.0\n",
+   "85.0\n",
+   "[85.0, 85.0]\n",
+   "[u'none', u'unhealthy']\n",
+   "False\n",
+   "Don't Play\n",
+   "5.0\n",
+   "\n",
+   "\n",
+   "2\n",
+   "sunny\n",
+   "80.0\n",
+   "90.0\n",
+   "[80.0, 90.0]\n",
+   "[u'none', u'moderate']\n",
+   "True\n",
+   "Don't Play\n",
+   "5.0\n",
+   "\n",
+   "\n",
+   "3\n",
+   "overcast\n",
+   "83.0\n",
+   "78.0\n",
+   

[03/15] madlib-site git commit: jupyter notebooks for 1.14 release

2018-04-23 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/mlp-mnist-v2.ipynb
--
diff --git a/community-artifacts/mlp-mnist-v2.ipynb 
b/community-artifacts/mlp-mnist-v2.ipynb
new file mode 100644
index 000..3c1ad14
--- /dev/null
+++ b/community-artifacts/mlp-mnist-v2.ipynb
@@ -0,0 +1,1154 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Neural networks\n",
+"\n",
+"Multilayer perceptron (MLP) using the well known MNIST data set.\n",
+"\n",
+"Updated to include mini-batching which was added in the 1.14 release.\n",
+"\n",
+"# Intro"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "image/jpeg": 
"/9j/4R5fRXhpZgAATU0AKggABwESAAMBAAEAAAEaAAUBYgEbAAUB\nagEoAAMBAAIAAAExAAIccgEyAAIUjodpAAQBpNAACvyA\nAAAnEAAK/IAAACcQQWRvYmUgUGhvdG9zaG9wIENTNSBXaW5kb3dzADIwMTU6MDc6MjQgMTA6NTk6\nNTEAA6ABAAMBAAEAAKACAAQBAAACoKADAAQBAAABcwAGAQMAAwAA\nAAEABgAAARoABQEAAAEeARsABQEAAAEmASgAAwEAAgAAAgEABAEAAAEuAgIA\nBAEAAB0pAEgBSAH/2P/tAAxBZG9iZV9DTQAB/+4ADkFkb2JlAGSA\nAf/bAIQADAgICAkIDAkJDBELCgsRFQ8MDA8VGBMTFRMTGBEMDAwMDAwRDAwMDAwMDAwMDAwM\nDAwMDAwMDAwMDAwMDAwMDAENCwsNDg0QDg4QFA4ODhQUDg4ODhQRDAwMDAwREQwMDAwMDBEMDAwM\nDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwM/8AAEQgAWACgAwEiAAIRAQMRAf/dAAQACv/EAT8AAAEF\nAQEBAQEBAAMAAQIEBQYHCAkKCwEAAQUBAQEBAQEAAQACAwQFBgcICQoLEAAB\nBAEDAgQCBQcGCAUDDDMBAAIRAwQhEjEFQVFhEyJxgTIGFJGhsUIjJBVSwWIzNHKC0UMHJZJT8OHx\nY3M1FqKygyZEk1RkRcKjdDYX0lXiZfKzhMPTdePzRieUpIW0lcTU5PSltcXV5fVWZnaGlqa2xtbm\n9jdHV2d3h5ent8fX5/cRAAICAQIEBAMEBQYHBwYF
 
NQEAAhEDITESBEFRYXEiEwUygZEUobFCI8FS\n0fAzJGLhcoKSQ1MVY3M08SUGFqKygwcmNcLSRJNUoxdkRVU2dGXi8rOEw9N14/NGlKSFtJXE1OT0\npbXF1eX1VmZ2hpamtsbW5vYnN0dXZ3eHl6e3x//aAAwDAQACEQMRAD8A8+yL677Bjsa4VU+1pr5c\nZ997qHex2/8AkvqVmrEDunZFjr2OqqurbVZqNjnh+/1G7fUZQ7+bf/w3p2s/PR+o9Lz+kVMuusdm\nY1h9mVSG2UknX0vXs3Pqs/kW1/8AFb0OjM9YNu2FhYSygXuL91h/wTXN9KnZs/nv0P8AwX+EVwRH\nERI1OvlP8v0WrxXEGGsL+Yfi6n1b+qNudU578dz6GkObbYfTZPZ9su2V4j/of4Sy/wDwXpLQ+tXS\nOidDbVl0Xs6hmOALNAcdriZda9u57MjZ7fRxv5j/AE3s/QrAOX1nIoi8bq2GHAlpx/3dwZLfsX/W\n/wBH/wAUhWU+ptPUdlTK2D1Qx732j6XpsrbvuZ6j2t9m5VZ8vmMwYnhiP3gYxr+t+jxf3nRx83y8\ncUoyjxEg7cMpX/e+fh/2f/VHP9PJ6hkvfY51j3AudY4+0azue7/BsRsu+uPtNW71AQL9hibOPVtd\n9JzLo3Naz9H/ADnvVhl7La/s2JS19FmjaCSywfvW2tZ77/67LX/9bUR07Ira66k1uraD6raSCA3u\n17bW73s/66rAgRHT1dZS31aEpgy19P7sfBosyxbIbtxbXab2NAYf65hz6/625W+k5WS3Os6dkOc6\nvPY7FtredwBcZoewas3svbW6t6rkV2GWFjmNGrbWw5o8fUq3OsZ/L/z1ZxaDe0Vh4FteuPaTIkcU\nG5m5vu/wPqemm2QQbutfCX9X/CZBAT9OwP8AzfH/AAXO22Y73V3s3ODi19TuxB2O/lMf7UW+trqz\nbS42AfSB+nWB/pGt+kz/AId
 
ns/4pdD9b+gHBycfMqE09Rory2gCNrntHrV/9ublzM2UvFlbi17T7\nSNCFBDNGVgai6/rNjLy04Ue4v+qUYrLQCSId2lELQGyHNDDoDrz3Uox8ru3HyI+Fdh/6nHf/AOAf\n8Qp112Ue8si9mlbXabf+HP8A6K/z/wDBqQD7O7AT337Mr7NlDGma3NeQA0agDbE/y91aLVjV5YL2\nAG0iLGAR/wChDG/+fa0G5jWUVusJBcBHftvc7/PuR6qMmusCtjqrLoDCdH7eXP8A3m7voMTrAJMv\nlA1WxgZUIfMTo3cetwz2VUVB9bQC+x2jWNI29/5KNlYFLb6rWPDgAfc7RrgP39v57VY6b0rqTG2W\nZ9fpYEB92Y7QV/ubv9I6z6Laq/0j0LO6sy+jKq6QQPQaLG3Fv6Z1YivJZW1381Tsd6/s9/6L9Ips\nOXBkxmQPFqTED+r+7+8s5jl8+HJwyiYmgDekfV++hf0NptORfdXTV/g6LPa6I3fzbvd/nIb8zpWP\nXNFbr36tftbtAP8AJts3PZ/Yas3EzH7ybALHO/Odq6f3nOSNFlgL2PLGEk+72Nn+S523d/ZQOSNf\nq4Cz39Ulvtm6nIkDt6Ypm9YbUf1WivHIduFhm2wT/wAJb/5FA+1W+u251jrTIIJ4Gu72fu+5RdTc\n0x6lTncwXt/79tU62vYSL2+m0CZI0I/kOHtUfFM6GwB4cMf+9ZOGI1Gv1uX/AHz/AP/Q5Tp+bfhd\nQNFDt+Na4NdU8B9dlbiN1N1T5rtb+Yl9Yei04edQ2uxuLhWVCzFYSS5u91nrV/n3Xena3+d/0foq\n5iYWFTe1vrnLvYQ5lTKHtqY8wd5teTZbWzd7WVM+mq31nysPL6kxl15NePWzGZ6bdzy6ou9d3q2P\npZ7rbH/yFp5Yj27IF8VCzXp/d/uubCROb02Bwni0/wAU/wB5yrc/ErtF1Hvyg0NdkWNhriNN9dbC\n51Vm3/
 
WtTr6jk10NfXY5jQTuLX7AS76Xs9rfcosxeie5nqZFV4gtN7Q2l2vua5+P9otr9v0H7LP+\ntqX2BmVWa6wxkRse29trASf8I2WWVb/9a1XByEmqvX0w/a2CMdCwfOf7GVGRg5FzXiaskNg2D3F3\n9lv538v1N607KMT0baKrWnMsbNo8h+839/8A8+f8YsNmBkY9pY7HvfYzuGnaD/J27t3tVlvSC8Nf\nYy0thpLnVlsal7/e76Wytjv7akhOfCYmAJOh/R/lJbMR6TqOhH6X8otW3Cdj2OBvre6r6fpl7XVu\n/wBGfVrZ/nfzX/CKz07GnIrscfQfIhzSNjp/lsOyh/8AX/RP/wCCTMz8k+y5hlsltpbuewk7thkO\nd6as/YnmoZEso3fzdm4sZaZ1Yxz9/p2/29n/ABagnhE4nhvby4Wzgz+3IGVb+fE+kk9Mz/q2K7nl\n2fgsdFThHtmYdW4ek7a1eddV+r2RWW5Lsd+NjWFw9dw2tLgRuZTjndfb6e737G+/9J+4rGH1HMbf\nXiZRsqJikWD81p09LLriux1Tf33s/Rfy6lW64d/Ucm6x9hvLx7gXBhaGtb+lDHOv/M2+p9BV+X5I\nwuc5cW0K+T5dpzv9Nu81z8cg9vGOEEyy6+vh498eLh4f1fF6nOpq6I2zZZ9otaP5zIY5lZYP324l\njLPV2/ufaP0v/BqfUS/CvOLfGTjPa1+JazQmp38zbjv9zms/0mO/ez1vU/wqDk5mdG0trfXyWFu+\nQPzv0+65zW/1lp9KnqeIMWzF9O7FJtwDXvAe4/pLMT3+vt9X+ep/m6vUVqIs8EdD/d08v0uL/Ccy\nZI9ctY9fV/zv0eFp5NIOTQ3HLbW4VMbZAO8Tb7mE+/8ASWN/fVn6vY9Z6gcnqdr8XDxh6+Vbtmxw\nnayqhr/5zJybXbGfufpL/wCbpeh19LynMyLL22V22kMHqV7LPcd9n6K1zf8AO9VXX9N
 

[15/15] madlib-site git commit: jupyter notebooks for 1.14 release

2018-04-23 Thread fmcquillan
jupyter notebooks for 1.14 release


Project: http://git-wip-us.apache.org/repos/asf/madlib-site/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib-site/commit/3f849b9e
Tree: http://git-wip-us.apache.org/repos/asf/madlib-site/tree/3f849b9e
Diff: http://git-wip-us.apache.org/repos/asf/madlib-site/diff/3f849b9e

Branch: refs/heads/notebook-updates-1dot14
Commit: 3f849b9e496063ab70adf139fece33c2e32583eb
Parents: 6c103d3
Author: Frank McQuillan 
Authored: Mon Apr 23 14:56:06 2018 -0700
Committer: Frank McQuillan 
Committed: Mon Apr 23 14:56:06 2018 -0700

--
 community-artifacts/Balanced-sampling-v1.ipynb  | 3706 ++
 community-artifacts/Decision-trees-v1.ipynb | 1590 ++
 ...coding-categorical-variables-1dot10-v1.ipynb | 2748 ---
 .../Encoding-categorical-variables-v2.ipynb | 4026 +++
 community-artifacts/LDA-v1.ipynb| 2034 
 community-artifacts/MLP.ipynb   |  514 --
 .../Minibatch-preprocessor-v1.ipynb | 1330 +
 community-artifacts/PageRank-v1.ipynb   |  774 ---
 community-artifacts/PageRank-v2.ipynb   |  889 
 community-artifacts/Random-forest-v1.ipynb  | 2899 +++
 community-artifacts/Summary-v1.ipynb| 1026 
 community-artifacts/Summary-v2.ipynb| 1017 
 community-artifacts/Term-frequency-v1.ipynb | 1062 
 community-artifacts/kNN-v2.ipynb|  751 ---
 community-artifacts/kNN-v3.ipynb|  857 
 community-artifacts/mlp-mnist-v2.ipynb  | 1154 +
 community-artifacts/mlp-v2.ipynb| 3755 --
 community-artifacts/mlp-v3.ipynb| 4584 ++
 images/neural-net-head.jpg  |  Bin 0 -> 326157 bytes
 19 files changed, 25148 insertions(+), 9568 deletions(-)
--




[05/15] madlib-site git commit: jupyter notebooks for 1.14 release

2018-04-23 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/Summary-v1.ipynb
--
diff --git a/community-artifacts/Summary-v1.ipynb 
b/community-artifacts/Summary-v1.ipynb
deleted file mode 100644
index 57c3611..000
--- a/community-artifacts/Summary-v1.ipynb
+++ /dev/null
@@ -1,1026 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {},
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "The sql extension is already loaded. To reload it, use:\n",
-  "  %reload_ext sql\n"
- ]
-}
-   ],
-   "source": [
-"%load_ext sql"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
-   "outputs": [
-{
- "data": {
-  "text/plain": [
-   "u'Connected: fmcquillan@madlib'"
-  ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"# Greenplum 4.3.10.0\n",
-"# %sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n",
-"\n",
-"# PostgreSQL local\n",
-"%sql postgresql://fmcquillan@localhost:5432/madlib"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {},
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "1 rows affected.\n"
- ]
-},
-{
- "data": {
-  "text/html": [
-   "\n",
-   "\n",
-   "version\n",
-   "\n",
-   "\n",
-   "MADlib version: 1.12, git revision: unknown, cmake 
configuration time: Wed Aug 23 23:07:18 UTC 2017, build type: Release, build 
system: Darwin-16.7.0, C compiler: Clang, C++ compiler: Clang\n",
-   "\n",
-   ""
-  ],
-  "text/plain": [
-   "[(u'MADlib version: 1.12, git revision: unknown, cmake configuration 
time: Wed Aug 23 23:07:18 UTC 2017, build type: Release, build system: 
Darwin-16.7.0, C compiler: Clang, C++ compiler: Clang',)]"
-  ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"%sql select madlib.version();\n",
-"#%sql select version();"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# 1. On-line help"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {},
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "1 rows affected.\n"
- ]
-},
-{
- "data": {
-  "text/html": [
-   "\n",
-   "\n",
-   "summary\n",
-   "\n",
-   "\n",
-   "'summary' is a generic function used to 
produce summary statisticsof any data table.  The function 
invokes particular 'methods' fromthe MADlib library to provide 
an overview of the data.---For an overview 
on usage, run:SELECT madlib.summary('usage');
---For an example, run:SELECT 
madlib.summary('example')\n",
-   "\n",
-   ""
-  ],
-  "text/plain": [
-   "[(u\"\\n'summary' is a generic function used to produce 
summary statistics\\nof any data table.  The function invokes 
particular 'methods' from\\nthe MADlib library to provide an 
overview of the data.\\n---\\nFor an overview on 
usage, run:\\nSELECT madlib.summary('usage');\\n
---\\nFor an example, run:\\nSELECT 
madlib.summary('example')\\n\",)]"
-  ]
- },
- "execution_count": 16,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"%sql SELECT * FROM madlib.summary();"
-   ]
-  },

[07/15] madlib-site git commit: jupyter notebooks for 1.14 release

2018-04-23 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/418f361c/community-artifacts/PageRank-v1.ipynb
--
diff --git a/community-artifacts/PageRank-v1.ipynb 
b/community-artifacts/PageRank-v1.ipynb
deleted file mode 100644
index 32b1caf..000
--- a/community-artifacts/PageRank-v1.ipynb
+++ /dev/null
@@ -1,774 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# PageRank\n",
-"The PageRank algorithm produces a probability distribution representing 
the likelihood that a person randomly traversing a graph will arrive at any 
particular vertex. PageRank was added in MADlib 1.11."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {
-"collapsed": false
-   },
-   "outputs": [
-{
- "name": "stderr",
- "output_type": "stream",
- "text": [
-  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
-  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
-  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
-  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
- ]
-}
-   ],
-   "source": [
-"%load_ext sql"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {
-"collapsed": false
-   },
-   "outputs": [
-{
- "data": {
-  "text/plain": [
-   "u'Connected: fmcquillan@madlib'"
-  ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"# Greenplum 4.3.10.0\n",
-"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n",
-"\n",
-"# PostgreSQL local\n",
-"%sql postgresql://fmcquillan@localhost:5432/madlib\n",
-"\n",
-"# Greenplum 4.2.3.0\n",
-"#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {
-"collapsed": false
-   },
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "1 rows affected.\n"
- ]
-},
-{
- "data": {
-  "text/html": [
-   "\n",
-   "\n",
-   "version\n",
-   "\n",
-   "\n",
-   "MADlib version: 1.11-dev, git revision: 
rc/v1.9alpha-rc1-138-gcc5ce09, cmake configuration time: Tue Apr 11 20:47:30 
UTC 2017, build type: Release, build system: 
Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: 
g++ 4.4.0\n",
-   "\n",
-   ""
-  ],
-  "text/plain": [
-   "[(u'MADlib version: 1.11-dev, git revision: 
rc/v1.9alpha-rc1-138-gcc5ce09, cmake configuration time: Tue Apr 11 20:47:30 
UTC 2017, build type: Release, build system: 
Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: 
g++ 4.4.0',)]"
-  ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"%sql select madlib.version();\n",
-"#%sql select version();"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# 1.  Create vertex and edge tables"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {
-"collapsed": false
-   },
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "Done.\n",
-  "Done.\n",
-  "Done.\n",
-  "7 rows affected.\n",
-  "22 rows affected.\n",
-  "22 rows affected.\n"
- ]
-},
-{
- "data": {
-  "text/html": [
-

[10/15] madlib-site git commit: jupyter notebooks for 1.14 release

2018-04-23 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/418f361c/community-artifacts/LDA-v1.ipynb
--
diff --git a/community-artifacts/LDA-v1.ipynb b/community-artifacts/LDA-v1.ipynb
new file mode 100644
index 000..19a199c
--- /dev/null
+++ b/community-artifacts/LDA-v1.ipynb
@@ -0,0 +1,2034 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Latent Dirichlet Allocation \n",
+"\n",
+"Latent Dirichlet Allocation (LDA) is a generative probabilistic model for 
natural texts. It is used in problems such as automated topic discovery, 
collaborative filtering, and document classification.\n",
+"\n",
+"In addition to an implementation of LDA, this MADlib module also provides 
a number of additional helper functions to interpret results of the LDA output."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
+  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
+  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: gpadmin@madlib'"
+  ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
+"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+"\n",
+"# Greenplum Database 4.3.10.0\n",
+"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.14-dev, git revision: 
rc/1.13-rc1-15-g7ffad03, cmake configuration time: Wed Feb 21 01:33:31 UTC 
2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C 
compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-15-g7ffad03, 
cmake configuration time: Wed Feb 21 01:33:31 UTC 2018, build type: release, 
build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ 
compiler: g++ 4.4.7',)]"
+  ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();\n",
+"#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# 1.  Prepare documents\n",
+"The examples below are short strings extracted from various Wikipedia 
documents. First we create a document table with one document per row:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "

[03/15] madlib-site git commit: jupyter notebooks for 1.14 release

2018-04-23 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/418f361c/community-artifacts/mlp-mnist-v2.ipynb
--
diff --git a/community-artifacts/mlp-mnist-v2.ipynb 
b/community-artifacts/mlp-mnist-v2.ipynb
new file mode 100644
index 000..3c1ad14
--- /dev/null
+++ b/community-artifacts/mlp-mnist-v2.ipynb
@@ -0,0 +1,1154 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Neural networks\n",
+"\n",
+"Multilayer perceptron (MLP) using the well known MNIST data set.\n",
+"\n",
+"Updated to include mini-batching which was added in the 1.14 release.\n",
+"\n",
+"# Intro"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "image/jpeg": 
"/9j/4R5fRXhpZgAATU0AKggABwESAAMBAAEAAAEaAAUBYgEbAAUB\nagEoAAMBAAIAAAExAAIccgEyAAIUjodpAAQBpNAACvyA\nAAAnEAAK/IAAACcQQWRvYmUgUGhvdG9zaG9wIENTNSBXaW5kb3dzADIwMTU6MDc6MjQgMTA6NTk6\nNTEAA6ABAAMBAAEAAKACAAQBAAACoKADAAQBAAABcwAGAQMAAwAA\nAAEABgAAARoABQEAAAEeARsABQEAAAEmASgAAwEAAgAAAgEABAEAAAEuAgIA\nBAEAAB0pAEgBSAH/2P/tAAxBZG9iZV9DTQAB/+4ADkFkb2JlAGSA\nAf/bAIQADAgICAkIDAkJDBELCgsRFQ8MDA8VGBMTFRMTGBEMDAwMDAwRDAwMDAwMDAwMDAwM\nDAwMDAwMDAwMDAwMDAwMDAENCwsNDg0QDg4QFA4ODhQUDg4ODhQRDAwMDAwREQwMDAwMDBEMDAwM\nDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwM/8AAEQgAWACgAwEiAAIRAQMRAf/dAAQACv/EAT8AAAEF\nAQEBAQEBAAMAAQIEBQYHCAkKCwEAAQUBAQEBAQEAAQACAwQFBgcICQoLEAAB\nBAEDAgQCBQcGCAUDDDMBAAIRAwQhEjEFQVFhEyJxgTIGFJGhsUIjJBVSwWIzNHKC0UMHJZJT8OHx\nY3M1FqKygyZEk1RkRcKjdDYX0lXiZfKzhMPTdePzRieUpIW0lcTU5PSltcXV5fVWZnaGlqa2xtbm\n9jdHV2d3h5ent8fX5/cRAAICAQIEBAMEBQYHBwYF
 
NQEAAhEDITESBEFRYXEiEwUygZEUobFCI8FS\n0fAzJGLhcoKSQ1MVY3M08SUGFqKygwcmNcLSRJNUoxdkRVU2dGXi8rOEw9N14/NGlKSFtJXE1OT0\npbXF1eX1VmZ2hpamtsbW5vYnN0dXZ3eHl6e3x//aAAwDAQACEQMRAD8A8+yL677Bjsa4VU+1pr5c\nZ997qHex2/8AkvqVmrEDunZFjr2OqqurbVZqNjnh+/1G7fUZQ7+bf/w3p2s/PR+o9Lz+kVMuusdm\nY1h9mVSG2UknX0vXs3Pqs/kW1/8AFb0OjM9YNu2FhYSygXuL91h/wTXN9KnZs/nv0P8AwX+EVwRH\nERI1OvlP8v0WrxXEGGsL+Yfi6n1b+qNudU578dz6GkObbYfTZPZ9su2V4j/of4Sy/wDwXpLQ+tXS\nOidDbVl0Xs6hmOALNAcdriZda9u57MjZ7fRxv5j/AE3s/QrAOX1nIoi8bq2GHAlpx/3dwZLfsX/W\n/wBH/wAUhWU+ptPUdlTK2D1Qx732j6XpsrbvuZ6j2t9m5VZ8vmMwYnhiP3gYxr+t+jxf3nRx83y8\ncUoyjxEg7cMpX/e+fh/2f/VHP9PJ6hkvfY51j3AudY4+0azue7/BsRsu+uPtNW71AQL9hibOPVtd\n9JzLo3Naz9H/ADnvVhl7La/s2JS19FmjaCSywfvW2tZ77/67LX/9bUR07Ira66k1uraD6raSCA3u\n17bW73s/66rAgRHT1dZS31aEpgy19P7sfBosyxbIbtxbXab2NAYf65hz6/625W+k5WS3Os6dkOc6\nvPY7FtredwBcZoewas3svbW6t6rkV2GWFjmNGrbWw5o8fUq3OsZ/L/z1ZxaDe0Vh4FteuPaTIkcU\nG5m5vu/wPqemm2QQbutfCX9X/CZBAT9OwP8AzfH/AAXO22Y73V3s3ODi19TuxB2O/lMf7UW+trqz\nbS42AfSB+nWB/pGt+kz/AId
 
ns/4pdD9b+gHBycfMqE09Rory2gCNrntHrV/9ublzM2UvFlbi17T7\nSNCFBDNGVgai6/rNjLy04Ue4v+qUYrLQCSId2lELQGyHNDDoDrz3Uox8ru3HyI+Fdh/6nHf/AOAf\n8Qp112Ue8si9mlbXabf+HP8A6K/z/wDBqQD7O7AT337Mr7NlDGma3NeQA0agDbE/y91aLVjV5YL2\nAG0iLGAR/wChDG/+fa0G5jWUVusJBcBHftvc7/PuR6qMmusCtjqrLoDCdH7eXP8A3m7voMTrAJMv\nlA1WxgZUIfMTo3cetwz2VUVB9bQC+x2jWNI29/5KNlYFLb6rWPDgAfc7RrgP39v57VY6b0rqTG2W\nZ9fpYEB92Y7QV/ubv9I6z6Laq/0j0LO6sy+jKq6QQPQaLG3Fv6Z1YivJZW1381Tsd6/s9/6L9Ips\nOXBkxmQPFqTED+r+7+8s5jl8+HJwyiYmgDekfV++hf0NptORfdXTV/g6LPa6I3fzbvd/nIb8zpWP\nXNFbr36tftbtAP8AJts3PZ/Yas3EzH7ybALHO/Odq6f3nOSNFlgL2PLGEk+72Nn+S523d/ZQOSNf\nq4Cz39Ulvtm6nIkDt6Ypm9YbUf1WivHIduFhm2wT/wAJb/5FA+1W+u251jrTIIJ4Gu72fu+5RdTc\n0x6lTncwXt/79tU62vYSL2+m0CZI0I/kOHtUfFM6GwB4cMf+9ZOGI1Gv1uX/AHz/AP/Q5Tp+bfhd\nQNFDt+Na4NdU8B9dlbiN1N1T5rtb+Yl9Yei04edQ2uxuLhWVCzFYSS5u91nrV/n3Xena3+d/0foq\n5iYWFTe1vrnLvYQ5lTKHtqY8wd5teTZbWzd7WVM+mq31nysPL6kxl15NePWzGZ6bdzy6ou9d3q2P\npZ7rbH/yFp5Yj27IF8VCzXp/d/uubCROb02Bwni0/wAU/wB5yrc/ErtF1Hvyg0NdkWNhriNN9dbC\n51Vm3/
 
WtTr6jk10NfXY5jQTuLX7AS76Xs9rfcosxeie5nqZFV4gtN7Q2l2vua5+P9otr9v0H7LP+\ntqX2BmVWa6wxkRse29trASf8I2WWVb/9a1XByEmqvX0w/a2CMdCwfOf7GVGRg5FzXiaskNg2D3F3\n9lv538v1N607KMT0baKrWnMsbNo8h+839/8A8+f8YsNmBkY9pY7HvfYzuGnaD/J27t3tVlvSC8Nf\nYy0thpLnVlsal7/e76Wytjv7akhOfCYmAJOh/R/lJbMR6TqOhH6X8otW3Cdj2OBvre6r6fpl7XVu\n/wBGfVrZ/nfzX/CKz07GnIrscfQfIhzSNjp/lsOyh/8AX/RP/wCCTMz8k+y5hlsltpbuewk7thkO\nd6as/YnmoZEso3fzdm4sZaZ1Yxz9/p2/29n/ABagnhE4nhvby4Wzgz+3IGVb+fE+kk9Mz/q2K7nl\n2fgsdFThHtmYdW4ek7a1eddV+r2RWW5Lsd+NjWFw9dw2tLgRuZTjndfb6e737G+/9J+4rGH1HMbf\nXiZRsqJikWD81p09LLriux1Tf33s/Rfy6lW64d/Ucm6x9hvLx7gXBhaGtb+lDHOv/M2+p9BV+X5I\nwuc5cW0K+T5dpzv9Nu81z8cg9vGOEEyy6+vh498eLh4f1fF6nOpq6I2zZZ9otaP5zIY5lZYP324l\njLPV2/ufaP0v/BqfUS/CvOLfGTjPa1+JazQmp38zbjv9zms/0mO/ez1vU/wqDk5mdG0trfXyWFu+\nQPzv0+65zW/1lp9KnqeIMWzF9O7FJtwDXvAe4/pLMT3+vt9X+ep/m6vUVqIs8EdD/d08v0uL/Ccy\nZI9ctY9fV/zv0eFp5NIOTQ3HLbW4VMbZAO8Tb7mE+/8ASWN/fVn6vY9Z6gcnqdr8XDxh6+Vbtmxw\nnayqhr/5zJybXbGfufpL/wCbpeh19LynMyLL22V22kMHqV7LPcd9n6K1zf8AO9VXX9N
 

[06/15] madlib-site git commit: jupyter notebooks for 1.14 release

2018-04-23 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/418f361c/community-artifacts/Random-forest-v1.ipynb
--
diff --git a/community-artifacts/Random-forest-v1.ipynb 
b/community-artifacts/Random-forest-v1.ipynb
new file mode 100644
index 000..bac8363
--- /dev/null
+++ b/community-artifacts/Random-forest-v1.ipynb
@@ -0,0 +1,2899 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Random forest\n",
+"\n",
+"Random forests build an ensemble of classifiers, each of which is a tree 
model constructed using bootstrapped samples from the input data. The results 
of these models are then combined to yield a single prediction, which, at the 
expense of some loss in interpretation, has been found to be highly 
accurate.\n",
+"\n",
+"Please also refer to the decision tree user documentation for information 
relevant to the implementation of random forests in MADlib."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "The sql extension is already loaded. To reload it, use:\n",
+  "  %reload_ext sql\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: gpadmin@madlib'"
+  ]
+     },
+     "execution_count": 73,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
+"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+"\n",
+"# Greenplum Database 4.3.10.0\n",
+"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 75,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.14-dev, git revision: 
rc/1.13-rc1-40-ga1360f3, cmake configuration time: Wed Mar 28 18:16:08 UTC 
2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C 
compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-40-ga1360f3, 
cmake configuration time: Wed Mar 28 18:16:08 UTC 2018, build type: release, 
build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ 
compiler: g++ 4.4.7',)]"
+  ]
+ },
+ "execution_count": 75,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();\n",
+"#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Random forest classification examples"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# 1. Load data\n",
+"Data set related to whether to play golf or not."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 76,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "14 rows affected.\n",
+  "14 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "id\n",
+   "OUTLOOK\n",
+   "temperature\

[14/15] madlib-site git commit: jupyter notebooks for 1.14 release

2018-04-23 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/418f361c/community-artifacts/Balanced-sampling-v1.ipynb
--
diff --git a/community-artifacts/Balanced-sampling-v1.ipynb 
b/community-artifacts/Balanced-sampling-v1.ipynb
new file mode 100644
index 000..5f6ec23
--- /dev/null
+++ b/community-artifacts/Balanced-sampling-v1.ipynb
@@ -0,0 +1,3706 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Balanced sampling\n",
+"\n",
+"This module offers a number of re-sampling techniques including 
under-sampling majority classes, over-sampling minority classes, and 
combinations of the two.\n",
+"\n",
+"Balanced sampling was added in MADlib 1.14."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "The sql extension is already loaded. To reload it, use:\n",
+  "  %reload_ext sql\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: gpadmin@madlib'"
+  ]
+ },
+     "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
+"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+"\n",
+"# Greenplum Database 4.3.10.0\n",
+"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.14-dev, git revision: 
rc/1.13-rc1-22-g0bfcaf5, cmake configuration time: Wed Mar 14 21:35:16 UTC 
2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C 
compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-22-g0bfcaf5, 
cmake configuration time: Wed Mar 14 21:35:16 UTC 2018, build type: release, 
build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ 
compiler: g++ 4.4.7',)]"
+  ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();\n",
+"#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# 1. Load data\n",
+"Based in part on the flags data set from 
https://archive.ics.uci.edu/ml/datasets/Flags"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "22 rows affected.\n",
+  "22 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "id\n",
+   "name\n",
+   "landmass\n",
+   "zone\n",
+   "area\n",
+   "population\n",
+   "language\n",
+   "colours\n",
+   "mainhue\n",
+   "\n",
+   "\n",
+   "1\n",
+   "Argentina\n",

[madlib-site] Git Push Summary

2018-04-23 Thread fmcquillan
Repository: madlib-site
Updated Branches:
  refs/heads/notebook-updates-1dot14 [deleted] 3f849b9e4


[11/15] madlib-site git commit: jupyter notebooks for 1.14 release

2018-04-23 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/418f361c/community-artifacts/Encoding-categorical-variables-v2.ipynb
--
diff --git a/community-artifacts/Encoding-categorical-variables-v2.ipynb 
b/community-artifacts/Encoding-categorical-variables-v2.ipynb
new file mode 100644
index 000..5e4cb6f
--- /dev/null
+++ b/community-artifacts/Encoding-categorical-variables-v2.ipynb
@@ -0,0 +1,4026 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Encoding categorical variables\n",
+"This is the new module that replaces create_indicator_variables() which 
was deprecated as of MADlib v1.10"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
+  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
+  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: gpadmin@madlib'"
+  ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
+"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+"\n",
+"# Greenplum Database 4.3.10.0\n",
+"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.14-dev, git revision: 
rc/1.13-rc1-21-g3af2d70, cmake configuration time: Mon Feb 26 18:00:54 UTC 
2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C 
compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-21-g3af2d70, 
cmake configuration time: Mon Feb 26 18:00:54 UTC 2018, build type: release, 
build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ 
compiler: g++ 4.4.7',)]"
+  ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();\n",
+"#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"## 1.  Load data set\n",
+"Use a subset of the abalone dataset:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "20 rows affected.\n",
+  "20 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "id\n",
+   "sex\n",
+   "length\n",
+   "

[04/15] madlib-site git commit: jupyter notebooks for 1.14 release

2018-04-23 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/418f361c/community-artifacts/Term-frequency-v1.ipynb
--
diff --git a/community-artifacts/Term-frequency-v1.ipynb 
b/community-artifacts/Term-frequency-v1.ipynb
new file mode 100644
index 000..99a0cd0
--- /dev/null
+++ b/community-artifacts/Term-frequency-v1.ipynb
@@ -0,0 +1,1062 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Term Frequency\n",
+"Term frequency computes the number of times that a word or term occurs in 
a document.  Term frequency is often used as part of a larger text processing 
pipeline, which may include operations such as stemming, stop word removal and 
topic modelling."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "The sql extension is already loaded. To reload it, use:\n",
+  "  %reload_ext sql\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: fmcquillan@madlib'"
+  ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum 4.3.10.0\n",
+"# %sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"%sql postgresql://fmcquillan@localhost:5432/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.13, git revision: unknown, cmake 
configuration time: Wed Dec 20 08:02:21 UTC 2017, build type: Release, build 
system: Darwin-17.3.0, C compiler: Clang, C++ compiler: Clang\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.13, git revision: unknown, cmake configuration 
time: Wed Dec 20 08:02:21 UTC 2017, build type: Release, build system: 
Darwin-17.3.0, C compiler: Clang, C++ compiler: Clang',)]"
+  ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();\n",
+"#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# 1.  Prepare documents\n",
+"First we create a document table with one document per row:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "4 rows affected.\n",
+  "4 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "docid\n",
+   "contents\n",
+   "\n",
+   "\n",
+   "0\n",
+   "I like to eat broccoli and bananas. I ate a banana and 
spinach smoothie for breakfast.\n",
+   "\n",
+   "\n",
+   "1\n",
+   "Chinchillas and kittens are cute.\n",
+   "\n",
+   "\n",
+   "2\n",
+   "My sister adopted two kittens yesterday.\n",
+   "\n",
+   "\n",
+   "3\n",
+   "Look at this cute hamster munching on a piece of 
broccoli.\n",
+   "  

madlib-site git commit: update website for 1.15.1 release

2018-10-15 Thread fmcquillan
Repository: madlib-site
Updated Branches:
  refs/heads/asf-site 127c0b7e7 -> 6d7f908b5


update website for 1.15.1 release


Project: http://git-wip-us.apache.org/repos/asf/madlib-site/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib-site/commit/6d7f908b
Tree: http://git-wip-us.apache.org/repos/asf/madlib-site/tree/6d7f908b
Diff: http://git-wip-us.apache.org/repos/asf/madlib-site/diff/6d7f908b

Branch: refs/heads/asf-site
Commit: 6d7f908b550848b438cf94b3176ce963814bf367
Parents: 127c0b7
Author: Frank McQuillan 
Authored: Mon Oct 15 10:50:17 2018 -0700
Committer: Frank McQuillan 
Committed: Mon Oct 15 10:50:17 2018 -0700

--
 documentation.html |  1 +
 download.html  | 37 -
 index.html | 16 
 3 files changed, 45 insertions(+), 9 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/madlib-site/blob/6d7f908b/documentation.html
--
diff --git a/documentation.html b/documentation.html
index 0d01094..8727541 100644
--- a/documentation.html
+++ b/documentation.html
@@ -55,6 +55,7 @@ jQuery(document).ready(function() {
 The primary documentation reference material providing 
detailed information on the functions and algorithms within MADlib as well as 
background theory and references into the literature.
 
 Older Documentation
+MADlib v1.15
 MADlib v1.14
 MADlib v1.13
 MADlib v1.12

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/6d7f908b/download.html
--
diff --git a/download.html b/download.html
index ce790ad..1781c9d 100644
--- a/download.html
+++ b/download.html
@@ -58,7 +58,7 @@
Current Release


-   v1.15
+   v1.15.1
Source Code and Convenience 
Binaries
 
MADlib source code 
and convenience binaries are available from the Apache distribution site.
@@ -66,13 +66,15 @@
Latest 
stable release:
 

-   http://apache.org/dyn/closer.cgi?filename=madlib/1.15/apache-madlib-1.15-src.tar.gz=download;>Source
 code tar.gz (https://www.apache.org/dist/madlib/1.15/apache-madlib-1.15-src.tar.gz.asc;>pgp,
 https://www.apache.org/dist/madlib/1.15/apache-madlib-1.15-src.tar.gz.sha512;>sha512)
 
+   https://dist.apache.org/repos/dist/release/madlib/1.15.1/apache-madlib-1.15.1-src.tar.gz;>Source
 code tar.gz (https://www.apache.org/dist/madlib/1.15.1/apache-madlib-1.15.1-src.tar.gz.asc;>pgp,
 https://www.apache.org/dist/madlib/1.15.1/apache-madlib-1.15.1-src.tar.gz.sha512;>sha512)
 
+
+   https://dist.apache.org/repos/dist/release/madlib/1.15.1/apache-madlib-1.15.1-bin-Linux-GPDB43.rpm;>Linux
   (https://www.apache.org/dist/madlib/1.15.1/apache-madlib-1.15.1-bin-Linux-GPDB43.rpm.asc;>pgp,
  https://www.apache.org/dist/madlib/1.15.1/apache-madlib-1.15.1-bin-Linux-GPDB43.rpm.sha512;>sha512)
 — CentOS / Red Hat 5 and higher (64 bit). GPDB 4.3.x.
+
+   https://dist.apache.org/repos/dist/release/madlib/1.15.1/apache-madlib-1.15.1-bin-Linux.rpm;>Linux
   (https://www.apache.org/dist/madlib/1.15.1/apache-madlib-1.15.1-bin-Linux.rpm.asc;>pgp,
  https://www.apache.org/dist/madlib/1.15.1/apache-madlib-1.15.1-bin-Linux.rpm.sha512;>sha512)
 — CentOS / Red Hat 6 and higher (64 bit). GPDB 5.x, PostgreSQL 9.6 and 
10.x.
 
-   http://apache.org/dyn/closer.cgi?filename=madlib/1.15/apache-madlib-1.15-bin-Linux-GPDB43.rpm=download;>Linux
   (https://www.apache.org/dist/madlib/1.15/apache-madlib-1.15-bin-Linux-GPDB43.rpm.asc;>pgp,
  https://www.apache.org/dist/madlib/1.15/apache-madlib-1.15-bin-Linux-GPDB43.rpm.sha512;>sha512)
 — CentOS / Red Hat 5 and higher (64 bit). GPDB 4.3.x.
+   https://dist.apache.org/repos/dist/release/madlib/1.15.1/apache-madlib-1.15.1-bin-Linux.deb;>Linux
   (https://www.apache.org/dist/madlib/1.15.1/apache-madlib-1.15.1-bin-Linux.deb.asc;>pgp,
  https://www.apache.org/dist/madlib/1.15.1/apache-madlib-1.15.1-bin-Linux.deb.sha512;>sha512)
 — Ubuntu 16.04. GPDB 5.x, PostgreSQL 9.6 and 10.x.
 
-   http://apache.org/dyn/closer.cgi?filename=madlib/1.15/apache-madlib-1.15-bin-Linux.rpm=download;>Linux
   (https://www.apache.org/dist/madlib/1.15/apache-madlib-1.15-bin-Linux.rpm.asc;>pgp,
  

madlib git commit: update NOTICE file to 2019

2019-01-09 Thread fmcquillan
Repository: madlib
Updated Branches:
  refs/heads/master 70afde269 -> d00f09166


update NOTICE file to 2019


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/d00f0916
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/d00f0916
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/d00f0916

Branch: refs/heads/master
Commit: d00f09166fbb06c8a6ac9a3eb6d75fc20cc6fef8
Parents: 70afde2
Author: Frank McQuillan 
Authored: Wed Jan 9 17:34:11 2019 -0800
Committer: Frank McQuillan 
Committed: Wed Jan 9 17:34:11 2019 -0800

--
 NOTICE | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/madlib/blob/d00f0916/NOTICE
--
diff --git a/NOTICE b/NOTICE
index feb18f0..7cbfa51 100644
--- a/NOTICE
+++ b/NOTICE
@@ -1,5 +1,5 @@
 Apache MADlib
-Copyright 2016-2018 The Apache Software Foundation.
+Copyright 2016-2019 The Apache Software Foundation.
 
 This product includes software developed at
 The Apache Software Foundation (http://www.apache.org/).



[madlib] branch master updated: update user docs for loading model arch

2019-03-28 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
 new fd04db3  update user docs for loading model arch
fd04db3 is described below

commit fd04db3f96c07a865a345b3072f9d0b4c3cc5bda
Author: Frank McQuillan 
AuthorDate: Thu Mar 28 17:53:32 2019 -0700

update user docs for loading model arch
---
 doc/mainpage.dox.in|   8 +-
 .../deep_learning/keras_model_arch_table.sql_in| 224 +++--
 .../utilities/minibatch_preprocessing_dl.sql_in|   3 +
 3 files changed, 211 insertions(+), 24 deletions(-)

diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index 826e8d7..e221319 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -287,11 +287,11 @@ Interface and implementation are subject to change.
 @{
 @defgroup grp_cg Conjugate Gradient
 @defgroup grp_dl Deep Learning
-@brief A collection of deep learning interfaces.
-@details A collection of deep learning interfaces.
+@brief A collection of modules for deep learning.
+@details A collection of modules for deep learning.
 @{
-@defgroup grp_minibatch_preprocessing_dl Mini-Batch Preprocessor for 
Image Data
-@defgroup grp_keras_model_arch Helper Function to Load Model 
Architectures to Table
+@defgroup grp_keras_model_arch Load Model Architecture
+@defgroup grp_minibatch_preprocessing_dl Mini-Batch Preprocessor for 
Images
 @}
 @defgroup grp_bayes Naive Bayes Classification
 @defgroup grp_sample Random Sampling
diff --git 
a/src/ports/postgres/modules/deep_learning/keras_model_arch_table.sql_in 
b/src/ports/postgres/modules/deep_learning/keras_model_arch_table.sql_in
index 7626107..bb734ab 100644
--- a/src/ports/postgres/modules/deep_learning/keras_model_arch_table.sql_in
+++ b/src/ports/postgres/modules/deep_learning/keras_model_arch_table.sql_in
@@ -30,22 +30,28 @@ m4_include(`SQLCommon.m4')
 /**
 @addtogroup grp_keras_model_arch
 
+@brief Utility function to load model architectures and weights into a table 
for
+use by deep learning algorithms.
+
 Contents
-Helper Function to Load Model 
Architectures to Table
-Helper Function to Delete 
Model Architectures from Table
+Load Model Architecture
+Delete Model 
Architecture
 Examples
 
 
-The architecture of the model to be used in madlib_keras_train()
-function must be stored in a table, the details of which must be
-provided as parameters to the madlib_keras_train module. load_keras_model is
-a helper function to help users insert JSON blobs of Keras model
-architectures into a table. If the output table already exists, the model_arch
-specified will be added as a new row into the table. The output table could 
thus
-act as a repository of Keras model architectures.
+This utility function loads model architectures and
+weights into a table for use by deep learning algorithms.
+Model architecture is in JSON form
+and model weights are in the form of double precision arrays.
+If the output table already exists, a new row is inserted
+into the table so it can act as a repository for multiple model
+architectures.
+
+There is also a utility function to delete a model architecture
+from the model architecture table.
 
-delete_keras_model can be used to delete the model architecture corresponding
-to the provided model_id from the model architecture repository table 
(keras_model_arch_table).
+@anchor load_keras_model
+@par Load Model Architecture
 
 
 load_keras_model(
@@ -56,17 +62,17 @@ load_keras_model(
 \b Arguments
 
   keras_model_arch_table
-  VARCHAR. Output table to load keras model arch.
+  VARCHAR. Output table to load keras model architecture.
   
 
   model_arch
-  JSON. JSON of the model architecture to insert.
+  JSON. JSON of the model architecture to load.
   
 
 
 Output table
 
-The output table produced by load_keras_model contains the following 
columns:
+The output table contains the following columns:
 
   
 model_id
@@ -80,17 +86,19 @@ load_keras_model(
   
   
 model_weights
-DOUBLE PRECISION[]. Weights of the model for warm start.
+DOUBLE PRECISION[]. Weights of the model which may be used for warm 
start.
 
   
   
 __internal_madlib_id__
-TEXT. Unique id for model arch.
+TEXT. Unique id for model arch.  This is an id used internally by 
MADlib.
 
   
 
 
 
+@anchor delete_keras_model
+@par Delete Model Architecture
 
 
 delete_keras_model(
@@ -101,18 +109,194 @@ delete_keras_model(
 \b Arguments
 
   keras_model_arch_table
-  VARCHAR. Table containing Keras model architectures.
+  VARCHAR. Table containing model architectures.
   
 
   model_id
-  INTEGER. The id of the model arch to be deleted.
+  INTEGER. The id of the model architecture

[madlib] branch master updated: add comment to graph user docs to distribute edge table by source vertex id

2019-05-17 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
 new 874d189  add comment to graph user docs to distribute edge table by 
source vertex id
874d189 is described below

commit 874d1892c5e35436c6e5bfc46ad9983a6587b159
Author: Frank McQuillan 
AuthorDate: Fri May 17 14:10:30 2019 -0700

add comment to graph user docs to distribute edge table by source vertex id
---
 src/ports/postgres/modules/graph/apsp.sql_in | 2 ++
 src/ports/postgres/modules/graph/bfs.sql_in  | 3 +++
 src/ports/postgres/modules/graph/hits.sql_in | 3 +++
 src/ports/postgres/modules/graph/pagerank.sql_in | 3 +++
 src/ports/postgres/modules/graph/sssp.sql_in | 3 +++
 src/ports/postgres/modules/graph/wcc.sql_in  | 5 +++--
 6 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/src/ports/postgres/modules/graph/apsp.sql_in 
b/src/ports/postgres/modules/graph/apsp.sql_in
index c7bf210..7cd77d3 100644
--- a/src/ports/postgres/modules/graph/apsp.sql_in
+++ b/src/ports/postgres/modules/graph/apsp.sql_in
@@ -55,6 +55,8 @@ for this implementation is O(V^2 * E) where V is the
 number of vertices and E is the number of edges.  In
 practice, run-time will generally be
 much less than this, but it depends on the graph.
+On a Greenplum cluster, the edge table should be distributed
+by the source vertex id column for better performance.
 
 @anchor apsp
 @par APSP
diff --git a/src/ports/postgres/modules/graph/bfs.sql_in 
b/src/ports/postgres/modules/graph/bfs.sql_in
index c1c27fe..ea991fa 100644
--- a/src/ports/postgres/modules/graph/bfs.sql_in
+++ b/src/ports/postgres/modules/graph/bfs.sql_in
@@ -130,6 +130,9 @@ and a single BFS result is generated.
 
 
 
+@note On a Greenplum cluster, the edge table should be distributed
+by the source vertex id column for better performance.
+
 @anchor notes
 @par Notes
 
diff --git a/src/ports/postgres/modules/graph/hits.sql_in 
b/src/ports/postgres/modules/graph/hits.sql_in
index 96a507c..83f838d 100644
--- a/src/ports/postgres/modules/graph/hits.sql_in
+++ b/src/ports/postgres/modules/graph/hits.sql_in
@@ -127,6 +127,9 @@ parameter.
 
 
 
+@note On a Greenplum cluster, the edge table should be distributed
+by the source vertex id column for better performance.
+
 @anchor notes
 @par Notes
 
diff --git a/src/ports/postgres/modules/graph/pagerank.sql_in 
b/src/ports/postgres/modules/graph/pagerank.sql_in
index b81b58e..cd239bd 100644
--- a/src/ports/postgres/modules/graph/pagerank.sql_in
+++ b/src/ports/postgres/modules/graph/pagerank.sql_in
@@ -132,6 +132,9 @@ for personalized PageRank. When this parameter is provided, 
personalized PageRan
 will run.  In the absence of this parameter, regular PageRank will run.
 
 
+@note On a Greenplum cluster, the edge table should be distributed
+by the source vertex id column for better performance.
+
 @anchor examples
 @examp
 
diff --git a/src/ports/postgres/modules/graph/sssp.sql_in 
b/src/ports/postgres/modules/graph/sssp.sql_in
index 372f1fb..8175624 100644
--- a/src/ports/postgres/modules/graph/sssp.sql_in
+++ b/src/ports/postgres/modules/graph/sssp.sql_in
@@ -104,6 +104,9 @@ A summary table named _summary is also created. 
This is an internal t
 TEXT, default = NULL. List of columns used to group the input into 
discrete subgraphs. These columns must exist in the edge table. When this value 
is null, no grouping is used and a single SSSP result is generated. 
 
 
+@note On a Greenplum cluster, the edge table should be distributed
+by the source vertex id column for better performance.
+
 @par Path Retrieval
 
 The path retrieval function returns the shortest path from the
diff --git a/src/ports/postgres/modules/graph/wcc.sql_in 
b/src/ports/postgres/modules/graph/wcc.sql_in
index 1c3808b..bc6ce7a 100644
--- a/src/ports/postgres/modules/graph/wcc.sql_in
+++ b/src/ports/postgres/modules/graph/wcc.sql_in
@@ -115,8 +115,9 @@ weakly connected components are generated for all data
 
 
 
-@note On Greenplum cluster, the edge table should be distributed on the src
-column for better performance. In addition, the user should note that this
+@note On a Greenplum cluster, the edge table should be distributed
+by the source vertex id column for better performance.
+In addition, the user should note that this
 function creates a duplicate of the edge table (on Greenplum cluster) for
 better performance.
 



[madlib] branch master updated: add sections to RF and DT user docs on run-time and memory usage

2019-04-19 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
 new 20c87fa  add sections to RF and DT user docs on run-time and memory 
usage
20c87fa is described below

commit 20c87faefd3a166c5456112fba1c8b6ab107ad18
Author: Frank McQuillan 
AuthorDate: Fri Apr 19 17:23:51 2019 -0700

add sections to RF and DT user docs on run-time and memory usage
---
 .../deep_learning/keras_model_arch_table.sql_in|  2 +-
 .../recursive_partitioning/decision_tree.sql_in| 34 +
 .../recursive_partitioning/random_forest.sql_in| 43 +-
 .../modules/regress/clustered_variance.sql_in  |  6 +--
 .../postgres/modules/sample/balance_sample.sql_in  |  2 +-
 src/ports/postgres/modules/svm/svm.sql_in  |  4 +-
 6 files changed, 67 insertions(+), 24 deletions(-)

diff --git 
a/src/ports/postgres/modules/deep_learning/keras_model_arch_table.sql_in 
b/src/ports/postgres/modules/deep_learning/keras_model_arch_table.sql_in
index bb734ab..16037c2 100644
--- a/src/ports/postgres/modules/deep_learning/keras_model_arch_table.sql_in
+++ b/src/ports/postgres/modules/deep_learning/keras_model_arch_table.sql_in
@@ -129,7 +129,7 @@ model.add(Dense(3, name='dense_2'))
 model.to_json
 
 This is represented by the following JSON:
-
+
 '{"class_name": "Sequential", "keras_version": "2.1.6",
 "config": [{"class_name": "Dense", "config": {"kernel_initializer":
 {"class_name": "VarianceScaling", "config": {"distribution": "uniform",
diff --git 
a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in 
b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
index 8ad7a9d..bf1c883 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
@@ -17,6 +17,7 @@ m4_include(`SQLCommon.m4')
 
 Contents
 Training Function
+Run-time and Memory Usage
 Prediction Function
 Tree Display
 Importance Display
@@ -109,7 +110,7 @@ tree_train(
   by their value.
   
 
-  list_of_features_to_exclude
+  list_of_features_to_exclude (optional)
   TEXT. Comma-separated string of column names to exclude from the 
predictors
   list. If the dependent_variable is an expression (including 
cast of a column name),
   then this list should include the columns present in the
@@ -118,7 +119,7 @@ tree_train(
   The names in this parameter should be identical to the names used in the 
table and
   quoted appropriately. 
 
-  split_criterion
+  split_criterion (optional)
   TEXT, default = 'gini' for classification, 'mse' for regression.
   Impurity function to compute the feature to use to split a node.
   Supported criteria are 'gini', 'entropy', 'misclassification' for
@@ -148,7 +149,8 @@ tree_train(
   INTEGER, default: 7. Maximum depth of any node of the final tree,
   with the root node counted as depth 0. A deeper tree can
   lead to better prediction but will also result in
-  longer processing time and higher memory usage.
+  longer processing time and higher memory usage.
+  Current allowed maximum is 100.
 
   min_split (optional)
   INTEGER, default: 20. Minimum number of observations that must exist
@@ -475,11 +477,27 @@ provided cp and explore all possible sub-trees 
(up to a single-node tre
 to compute the optimal sub-tree. The optimal sub-tree and the 'cp' 
corresponding
 to this optimal sub-tree is placed in the output_table, with the
 columns named as tree and pruning_cp respectively.
-- The main parameters that affect memory usage are: depth of
-tree (‘max_depth’), number of features, number of values per
-categorical feature, and number of bins for continuous features (‘num_splits’).
-If you are hitting memory limits, consider reducing one or
-more of these parameters.
+
+@anchor runtime
+@par Run-time and Memory Usage
+
+The number of features and the number of class values per categorical feature 
have a direct
+impact on run-time and memory.  In addition, here is a summary of the main 
parameters
+in the training function that affect run-time and memory:
+
+| Parameter | Run-time | Memory | Notes |
+| :-- | :-- | :-- | :-- |
+| 'max_depth' | High | High | Deeper trees can take longer to run and use more 
memory. |
+| 'min_split' | No or little effect, unless very small. | No or little effect, 
unless very small. | If too small, can impact run-time by building trees that 
are very thick. |
+| 'min_bucket' | No or little effect, unless very small. | No or little 
effect, unless very small. | If too small, can impact run-time by building 
trees that are very thick. |
+| 'num_splits' | High | High | D

[madlib] branch master updated: add examples for generalize cross validation

2019-05-02 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
 new e4b53a7  add examples for generalize cross validation
e4b53a7 is described below

commit e4b53a75f62e9e6688d611fc3bc029af26961b0f
Author: Frank McQuillan 
AuthorDate: Thu May 2 15:32:28 2019 -0700

add examples for generalize cross validation
---
 .../modules/validation/cross_validation.sql_in | 198 ++---
 1 file changed, 173 insertions(+), 25 deletions(-)

diff --git a/src/ports/postgres/modules/validation/cross_validation.sql_in 
b/src/ports/postgres/modules/validation/cross_validation.sql_in
index a5eeeff..77b2c2b 100644
--- a/src/ports/postgres/modules/validation/cross_validation.sql_in
+++ b/src/ports/postgres/modules/validation/cross_validation.sql_in
@@ -28,7 +28,8 @@ m4_include(`SQLCommon.m4')
 
 
 
-Estimates the fit of a predictive model given a data set and specifications 
for the training, prediction, and error estimation functions.
+Estimates the fit of a predictive model given a data set and specifications for
+the training, prediction, and error estimation functions.
 
 Cross validation, sometimes called rotation estimation, is a technique for
 assessing how the results of a statistical analysis will generalize to an
@@ -56,12 +57,12 @@ output table. The prediction function should take a unique 
ID column name in
 the data table as one of the inputs, so that the prediction result can be
 compared with the validation values.
 Note: Prediction functions in some MADlib modules do not save results into an 
output
-table. These prediction functions are not suitable for cross-validation.
+table. These prediction functions are not suitable for this cross-validation 
module.
 
 - The error metric function compares the prediction results with the known
 values of the dependent variables in the data set that was fed into the
 prediction function. It computes the error metric using the specified error
-metric function, storing the  results in a table.
+metric function, and stores the results in a table.
 
 Other inputs include the output table name, k value for the k-fold
 cross validation, and how many folds to try. For example, you can choose to 
run a
@@ -94,40 +95,54 @@ cross_validation_general( modelling_func,
 
 modelling_func
 VARCHAR. The name of the function that trains the model.
+
 modelling_params
 VARCHAR[]. An array of parameters to supply to the modelling function.
+
 modelling_params_type
 VARCHAR[]. An array of data type names for each of the parameters supplied 
to the modelling function.
+
 param_explored
 VARCHAR. The name of the parameter that will be checked to find the 
optimum value. The name must appear in the \e modelling_params array.
+
 explore_values
 VARCHAR. The name of the parameter whose values are to be studied.
+
 predict_func
 VARCHAR. The name of the prediction function.
+
 predict_params
 VARCHAR[]. An array of parameters to supply to the prediction 
function.
+
 predict_params_type
 VARCHAR[]. An array of data type names for each of the parameters supplied 
to the prediction function.
+
 metric_func
 VARCHAR. The name of the function for measuring errors.
+
 metric_params
 VARCHAR[]. An array of parameters to supply to the error metric 
function.
+
 metric_params_type
 VARCHAR[]. An array of data type names for each of the parameters supplied 
to the metric function.
+
 data_tbl
 VARCHAR. The name of the data table that will be split into training and 
validation parts.
+
 data_id
 VARCHAR. The name of the column containing a unique ID associated with
 each row, or NULL if the table has no such column.
 
-Ideally, the data set has a unique ID for each row, so that it is easier to
+Ideally, the data set has a unique ID for each row so that it is easier to
 partition the data set into the training part and the validation part. Set the
 \e id_is_random argument to inform the cross-validation function whether
 the ID value is randomly assigned to each row. If it is not randomly
 assigned, the cross-validation function generates a random ID for each row.
 
+
 id_is_random
 BOOLEAN. TRUE if the provided ID is randomly assigned to each row.
+
 validation_result
 VARCHAR. The name of the table to store the output of the cross-validation 
function. The output table has the following columns:
 
@@ -146,6 +161,7 @@ same name specified in the \e param_explored argument of 
the \e cross_validation
 
 
 
+
 data_cols
 A comma-separated list of names of data columns to use in the calculation.
 When its value is NULL, the function will automatically figure out all the 
column names of the data table.
@@ -183,42 +199,174 @@ The parameter arrays for the modelling, prediction and 
metric functions can incl
 @anchor examples
 @examp
 
-This example uses cross validation with an elastic net regression to find

[madlib-site] branch asf-site updated: add links to deep learning notes and Jupyter notebooks

2019-07-11 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/madlib-site.git


The following commit(s) were added to refs/heads/asf-site by this push:
 new c30b4ca  add links to deep learning notes and Jupyter notebooks
c30b4ca is described below

commit c30b4cab2616d8633ee3fdd0d32662033ef2973a
Author: Frank McQuillan 
AuthorDate: Thu Jul 11 13:34:12 2019 -0700

add links to deep learning notes and Jupyter notebooks
---
 index.html | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/index.html b/index.html
index 9060809..3a4c7a0 100644
--- a/index.html
+++ b/index.html
@@ -84,7 +84,10 @@
 K-nearest neighbors - Improve performance with kd-tree 
approximate method.
 Association rules - Set default maximum itemset rules to 
10 to reduce runtime.
 
-   You are invited to <a href="https://dist.apache.org/repos/dist/release/madlib/1.16/">download the 
1.16 release</a> and <a href="https://github.com/apache/madlib/blob/master/RELEASE_NOTES">review the 
release notes</a>.
+   You are invited to <a href="https://dist.apache.org/repos/dist/release/madlib/1.16/">download the 
1.16 release</a> and <a href="https://github.com/apache/madlib/blob/master/RELEASE_NOTES">review the 
release notes</a>.
+   For more details about the new deep learning feature, please refer to 
the
+   <a href="https://cwiki.apache.org/confluence/display/MADLIB/Deep+Learning">Apache 
MADlib deep learning notes</a> and
+   the <a href="https://github.com/apache/madlib-site/tree/asf-site/community-artifacts/Deep-learning">Jupyter
 notebook examples</a>.
  

 



[madlib] branch keras_byom created (now 49cf31f)

2019-08-15 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a change to branch keras_byom
in repository https://gitbox.apache.org/repos/asf/madlib.git.


  at 49cf31f  updated user docs for madlib-keras with BYOM inference

This branch includes the following new commits:

 new 49cf31f  updated user docs for madlib-keras with BYOM inference

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.




[madlib-site] 01/02: Disable --no-temp-files|-m option, since it doesn't work

2019-08-27 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/madlib-site.git

commit 6a530b1b23b609aefd2dc5cb3ca9098ea7849c81
Author: Domino Valdano 
AuthorDate: Mon Aug 26 11:58:42 2019 -0700

Disable --no-temp-files|-m option, since it doesn't work
---
 .../Deep-learning/madlib_image_loader.py   | 42 +-
 1 file changed, 16 insertions(+), 26 deletions(-)

diff --git a/community-artifacts/Deep-learning/madlib_image_loader.py 
b/community-artifacts/Deep-learning/madlib_image_loader.py
index 09a170d..1dc45b3 100755
--- a/community-artifacts/Deep-learning/madlib_image_loader.py
+++ b/community-artifacts/Deep-learning/madlib_image_loader.py
@@ -54,7 +54,7 @@
 # 2a. Perform parallel image loading from numpy arrays:
 #
 #   iloader.load_dataset_from_np(data_x, data_y, table_name,
-#append=False, no_temp_files=False)
+#append=False)
 #
 #   data_x contains image data in np.array format, and data_y is a 1D 
np.array
 #   of the image categories (labels).
@@ -73,18 +73,12 @@
 #   name instead.  This avoids needing to pass the table_name again 
every
 #   time, but also allows it to be changed at any time.
 #
-#   EXPERIMENTAL:  If no_temp_files=True, the operation will happen without
-#  writing out the tables to temporary files before 
loading them.
-#  Instead, an in-memory filelike buffer (StringIO) will 
be used
-#  to build the tables before loading.  Currently not 
working,
-#  for unknown reason.
-#
 #  or,
 #
 # 2b. Perform parallel image loading from disk:
 #
 #   load_dataset_from_disk(self, root_dir, table_name, 
num_labels='all',
-#   append=False, no_temp_files=False):
+#   append=False):
 #
 #   Calling this function instead will look in root_dir on the local disk 
of
 #   wherever this is being run.  It will skip over any files in that
@@ -93,7 +87,7 @@
 #   where the name of each subdirectory is the label for the images
 #   contained within it.
 #
-#   The table_name, append, and no_temp_files parameters are the same as
+#   The table_name and append parameters are the same as described
 #   above.  num_labels is an optional parameter which can be used to
 #   restrict the number of labels (image classes) loaded, even if more
 #   are found in root_dir.  For example, for a large dataset you may
@@ -107,7 +101,7 @@
 #
 # usage: madlib_image_loader.py [-h] [-r ROOT_DIR] [-n NUM_LABELS] [-d DB_NAME]
 #   [-a] [-w NUM_WORKERS] [-p PORT] [-U USERNAME]
-#   [-t HOST] [-P PASSWORD] [-m]
+#   [-t HOST] [-P PASSWORD]
 #   table_name
 #
 # positional arguments:
@@ -247,7 +241,7 @@ class ImageLoader:
 self.table_name = table_name
 self.root_dir = None
 self.pool = None
-self.no_temp_files = None
+self.no_temp_files = False
 
 global iloader  # Singleton per process
 iloader = self
@@ -435,7 +429,7 @@ class ImageLoader:
 self.db_close()
 
 def load_dataset_from_np(self, data_x, data_y, table_name=None,
- append=False, no_temp_files=False):
+ append=False):
 """
 Loads a numpy array into db.  For append=False, creates a new table and
 loads the data.  For append=True, appends data to existing table.
@@ -450,14 +444,12 @@ class ImageLoader:
 @table_name Name of table in db to load data into
 @append Whether to create a new table (False) or append to an existing
 one (True).  If unspecified, default is False
-@no_temp_files If specified, no temporary files are written--all
-operations are performed in-memory.
-
 """
 start_time = time.time()
 self.mother = True
 self.from_disk = False
 self.append = append
+
 if table_name:
 self.table_name = table_name
 
@@ -477,7 +469,7 @@ class ImageLoader:
  initargs=(current_process().pid,
self.table_name,
self.append,
-   no_temp_files,
+   False,
self.db_creds,
False))
 
@@ -539,7 +531,7 @@ class ImageLoader:
 _call_np_worker(data)
 
 def load_dataset_from_disk(self, root_dir, table_name, num_labels='all',
-   append=False, no_temp_files=False):
+   append=False):
 &quo

[madlib] branch load_mst_user_docs created (now 2059ddb)

2019-09-04 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a change to branch load_mst_user_docs
in repository https://gitbox.apache.org/repos/asf/madlib.git.


  at 2059ddb  user docs for setting up model selection table

This branch includes the following new commits:

 new 2059ddb  user docs for setting up model selection table

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.




[madlib-site] branch asf-site updated: update jupyter notebooks for new image loader script

2019-07-23 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/madlib-site.git


The following commit(s) were added to refs/heads/asf-site by this push:
 new b53d4bd  update jupyter notebooks for new image loader script
b53d4bd is described below

commit b53d4bd6b95373c5b05db554d52c3ef5fd094f0c
Author: Frank McQuillan 
AuthorDate: Tue Jul 23 10:41:42 2019 -0700

update jupyter notebooks for new image loader script
---
 .../Deep-learning/Load-images-v1.ipynb | 662 +
 ...-v1.ipynb => MADlib-Keras-cifar10-cnn-v2.ipynb} | 348 ++-
 ...ynb => MADlib-Keras-transfer-learning-v2.ipynb} | 516 
 .../Deep-learning/Madlib Image Loader Demo.ipynb   | 488 ---
 4 files changed, 1088 insertions(+), 926 deletions(-)

diff --git a/community-artifacts/Deep-learning/Load-images-v1.ipynb 
b/community-artifacts/Deep-learning/Load-images-v1.ipynb
new file mode 100644
index 000..15aa948
--- /dev/null
+++ b/community-artifacts/Deep-learning/Load-images-v1.ipynb
@@ -0,0 +1,662 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Load images into table\n",
+"\n",
+"This demonstrates different ways to load images into a database table.\n",
+"\n",
+"We use the script called madlib_image_loader.py located at 
https://github.com/apache/madlib-site/tree/asf-site/community-artifacts/Deep-learning
 which uses the Python Imaging Library so supports multiple formats 
http://www.pythonware.com/products/pil/\n",
+"\n",
+"## Table of contents\n",
+"\n",
+"1. Setup image loader\n",
+"\n",
+"2. Fetch images then load NumPy array into 
table\n",
+"\n",
+"3. Load from file system into table"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated since IPython 
4.0. You should import from traitlets.config instead.\n",
+  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
+  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: fmcquillan@madlib'"
+  ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum Database 5.x on GCP for deep learning (PM demo machine)\n",
+"#%sql postgresql://gpadmin@35.239.240.26:5432/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"%sql postgresql://fmcquillan@localhost:5432/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.16, git revision: rc/1.16-rc1, cmake 
configuration time: Mon Jul  1 17:45:09 UTC 2019, build type: Release, build 
system: Darwin-16.7.0, C compiler: Clang, C++ compiler: Clang\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.16, git revision: rc/1.16-rc1, cmake 
configuration time: Mon Jul  1 17:45:09 UTC 2019, build type: Release, build 
system: Darwin-16.7.0, C compiler: Clang, C++ compiler: Clang',)]"
+  ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": &

[madlib] branch master updated: updated DL preprocessor docs for bytea (#445)

2019-10-01 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
 new 63f40e7  updated DL preprocessor docs for bytea (#445)
63f40e7 is described below

commit 63f40e70f8dbb6c9ed2b1b91c847fd3819b1a627
Author: Frank McQuillan 
AuthorDate: Tue Oct 1 13:52:40 2019 -0700

updated DL preprocessor docs for bytea (#445)

* updated DL preprocessor docs for bytea

* address review comments
---
 .../deep_learning/input_data_preprocessor.sql_in   | 210 ++---
 1 file changed, 98 insertions(+), 112 deletions(-)

diff --git 
a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.sql_in 
b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.sql_in
index a3f4281..8d70431 100644
--- a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.sql_in
+++ b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.sql_in
@@ -18,7 +18,7 @@
  * under the License.
  *
  * @file input_preprocessor_dl.sql_in
- * @brief TODO
+ * @brief Utilities to prepare input image data for use by deep learning 
modules.
  * @date December 2018
  *
  */
@@ -86,9 +86,10 @@ training_preprocessor_dl(source_table,
   TEXT.  Name of the output table from the training preprocessor which
   will be used as input to algorithms that support mini-batching.
   Note that the arrays packed into the output table are shuffled
-  and normalized (by dividing each element in the independent variable array
-  by the optional 'normalizing_const' parameter), so they will not match
-  up in an obvious way with the rows in the source table.
+  and normalized, by dividing each element in the independent variable array
+  by the optional 'normalizing_const' parameter. For performance reasons,
+  packed arrays are converted to PostgreSQL bytea format, which is a
+  variable-length binary string.
 
   In the case a validation data set is used (see
   later on this page), this output table is also used
@@ -158,11 +159,15 @@ validation_preprocessor_dl(source_table,
 
   output_table
   TEXT.  Name of the output table from the validation
-  preprocessor which will be used as input to algorithms that support 
mini-batching.  The arrays packed into the output table are
+  preprocessor which will be used as input to algorithms that support 
mini-batching.
+  The arrays packed into the output table are
   normalized using the same normalizing constant from the
   training preprocessor as specified in
   the 'training_preprocessor_table' parameter described below.
   Validation data is not shuffled.
+  For performance reasons,
+  packed arrays are converted to PostgreSQL bytea format, which is a
+  variable-length binary string.
   
 
   dependent_varname
@@ -209,25 +214,43 @@ validation_preprocessor_dl(source_table,
 validation_preprocessor_dl() contain the following columns:
 
   
-buffer_id
-INTEGER. Unique id for each row in the packed table.
+independent_var
+BYTEA. Packed array of independent variables in PostgreSQL bytea 
format.
+Arrays of independent variables packed into the output table are
+normalized by dividing each element in the independent variable array 
by the
+optional 'normalizing_const' parameter.  Training data is shuffled, but
+validation data is not.
 
   
   
 dependent_var
-ANYARRAY[]. Packed array of dependent variables.
+BYTEA. Packed array of dependent variables in PostgreSQL bytea 
format.
 The dependent variable is always one-hot encoded as an
-INTEGER[] array. For now, we are assuming that
+integer array. For now, we are assuming that
 input_preprocessor_dl() will be used
 only for classification problems using deep learning. So
 the dependent variable is one-hot encoded, unless it's already a
 numeric array in which case we assume it's already one-hot
-encoded and just cast it to an INTEGER[] array.
+encoded and just cast it to an integer array.
 
   
   
-independent_var
-REAL[]. Packed array of independent variables.
+independent_var_shape
+INTEGER[]. Shape of the independent variable array after 
preprocessing.
+The first element is the number of images packed per row, and 
subsequent
+elements will depend on how the image is described (e.g., channels 
first or last).
+
+  
+  
+dependent_var_shape
+INTEGER[]. Shape of the dependent variable array after 
preprocessing.
+The first element is the number of images packed per row, and the 
second
+element is the number of class values.
+
+  
+  
+buffer_id
+INTEGER. Unique id for each row in the packed table

[madlib-site] branch automl updated: hyperband diagonal E2E update

2019-11-22 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch automl
in repository https://gitbox.apache.org/repos/asf/madlib-site.git


The following commit(s) were added to refs/heads/automl by this push:
 new 94a7f7e  hyperband diagonal E2E update
94a7f7e is described below

commit 94a7f7e81077ccd67710648850b696e2344e39d9
Author: Frank McQuillan 
AuthorDate: Fri Nov 22 16:29:51 2019 -0800

hyperband diagonal E2E update
---
 .../hyperband_diag_v2_mnist-checkpoint.ipynb   | 157 +
 .../automl/hyperband_diag_v2_mnist.ipynb   | 130 -
 2 files changed, 135 insertions(+), 152 deletions(-)

diff --git 
a/community-artifacts/Deep-learning/automl/.ipynb_checkpoints/hyperband_diag_v2_mnist-checkpoint.ipynb
 
b/community-artifacts/Deep-learning/automl/.ipynb_checkpoints/hyperband_diag_v2_mnist-checkpoint.ipynb
index 091e6fd..b62f8d5 100644
--- 
a/community-artifacts/Deep-learning/automl/.ipynb_checkpoints/hyperband_diag_v2_mnist-checkpoint.ipynb
+++ 
b/community-artifacts/Deep-learning/automl/.ipynb_checkpoints/hyperband_diag_v2_mnist-checkpoint.ipynb
@@ -30,19 +30,17 @@
   },
   {
"cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 16,
"metadata": {
 "scrolled": true
},
"outputs": [
 {
- "name": "stderr",
+ "name": "stdout",
  "output_type": "stream",
  "text": [
-  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated since IPython 
4.0. You should import from traitlets.config instead.\n",
-  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
-  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
-  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
+  "The sql extension is already loaded. To reload it, use:\n",
+  "  %reload_ext sql\n"
  ]
 }
],
@@ -52,7 +50,7 @@
   },
   {
"cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
@@ -74,7 +72,7 @@
   },
   {
"cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 19,
"metadata": {},
"outputs": [
 {
@@ -100,7 +98,7 @@
"[(u'MADlib version: 1.17-dev, git revision: rel/v1.16-47-g5a1717e, 
cmake configuration time: Tue Nov 19 01:02:39 UTC 2019, build type: release, 
build system: Linux-3.10.0-957.27.2.el7.x86_64, C compiler: gcc 4.8.5, C++ 
compiler: g++ 4.8.5',)]"
   ]
  },
- "execution_count": 3,
+ "execution_count": 19,
  "metadata": {},
  "output_type": "execute_result"
 }
@@ -121,24 +119,9 @@
   },
   {
"cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 20,
"metadata": {},
-   "outputs": [
-{
- "name": "stderr",
- "output_type": "stream",
- "text": [
-  "Using TensorFlow backend.\n"
- ]
-},
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "Couldn't import dot_parser, loading of dot files will not be 
possible.\n"
- ]
-}
-   ],
+   "outputs": [],
"source": [
 "from __future__ import print_function\n",
 "\n",
@@ -180,7 +163,7 @@
   },
   {
"cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
@@ -794,7 +777,7 @@
   },
   {
"cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 22,
"metadata": {},
"outputs": [
 {
@@ -821,7 +804,7 @@
"[]"
   ]
  },
- "execution_count": 17,
+ "execution_count": 22,
  "metadata": {},
  "output_type": "execute_result"
 }
@@ -896,7 +879,7 @@
   },
   {
"cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
@@ -924,7 +907,7 @@
   },
   {
   

[madlib-site] branch automl updated: hyperband diagonal E2E still in work...

2019-11-21 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch automl
in repository https://gitbox.apache.org/repos/asf/madlib-site.git


The following commit(s) were added to refs/heads/automl by this push:
 new c606abc  hyperband diagonal E2E still in work...
c606abc is described below

commit c606abcf87684808eaa68fc47b700ae247a7f20c
Author: Frank McQuillan 
AuthorDate: Thu Nov 21 17:20:43 2019 -0800

hyperband diagonal E2E still in work...
---
 .../hyperband_diag_v2_mnist-checkpoint.ipynb   | 924 ++---
 .../automl/hyperband_diag_v2_mnist.ipynb   | 924 ++---
 2 files changed, 866 insertions(+), 982 deletions(-)

diff --git 
a/community-artifacts/Deep-learning/automl/.ipynb_checkpoints/hyperband_diag_v2_mnist-checkpoint.ipynb
 
b/community-artifacts/Deep-learning/automl/.ipynb_checkpoints/hyperband_diag_v2_mnist-checkpoint.ipynb
index 09598ea..091e6fd 100644
--- 
a/community-artifacts/Deep-learning/automl/.ipynb_checkpoints/hyperband_diag_v2_mnist-checkpoint.ipynb
+++ 
b/community-artifacts/Deep-learning/automl/.ipynb_checkpoints/hyperband_diag_v2_mnist-checkpoint.ipynb
@@ -23,7 +23,9 @@
 "\n",
 "5. Hyperband diagonal\n",
 "\n",
-"6. Plot results"
+"6. Plot results\n",
+"\n",
+"7. Print run schedules"
]
   },
   {
@@ -792,7 +794,7 @@
   },
   {
"cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 17,
"metadata": {},
"outputs": [
 {
@@ -819,7 +821,7 @@
"[]"
   ]
  },
- "execution_count": 6,
+ "execution_count": 17,
  "metadata": {},
  "output_type": "execute_result"
 }
@@ -894,7 +896,7 @@
   },
   {
"cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
@@ -917,344 +919,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
-"Pretty print reg Hyperband run schedule"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "max_iter = 3\n",
-  "eta = 3\n",
-  "B = 2*max_iter = 6\n",
-  " \n",
-  "s=1\n",
-  "n_i  r_i\n",
-  "\n",
-  "3 1.0\n",
-  "1.0 3.0\n",
-  " \n",
-  "s=0\n",
-  "n_i  r_i\n",
-  "\n",
-  "2 3\n",
-  " \n",
-  "sum of configurations at leaf nodes across all s = 3.0\n",
-  "(if have more workers than this, they may not be 100% busy)\n"
- ]
-}
-   ],
-   "source": [
-"import numpy as np\n",
-"from math import log, ceil\n",
-"\n",
-"#input\n",
-"max_iter = 3  # maximum iterations/epochs per configuration\n",
-"eta = 3  # defines downsampling rate (default=3)\n",
-"\n",
-"logeta = lambda x: log(x)/log(eta)\n",
-"s_max = int(logeta(max_iter))  # number of unique executions of 
Successive Halving (minus one)\n",
-"B = (s_max+1)*max_iter  # total number of iterations (without reuse) per 
execution of Successive Halving (n,r)\n",
-"\n",
-"#echo output\n",
-"print (\"max_iter = \" + str(max_iter))\n",
-"print (\"eta = \" + str(eta))\n",
-"print (\"B = \" + str(s_max+1) + \"*max_iter = \" + str(B))\n",
-"\n",
-"sum_leaf_n_i = 0 # count configurations at leaf nodes across all s\n",
-"\n",
-" Begin Finite Horizon Hyperband outlerloop. Repeat indefinitely.\n",
-"for s in reversed(range(s_max+1)):\n",
-"\n",
-"print (\" \")\n",
-"print (\"s=\" + str(s))\n",
-"print (\"n_i  r_i\")\n",
-"print (\"\")\n",
-"counter = 0\n",
-"\n",
-"n = int(ceil(int(B/max_iter/(s+1))*eta**s)) # initial number of 
configurations\n",
-"r = max_iter*eta**(-s) # initial number of iterations to run 
configurations for\n",
-"\n",
-" Begin Finite Ho

[madlib] branch master updated: misc user doc updates for 1dot17

2019-12-17 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
 new ec5614f  misc user doc updates for 1dot17
ec5614f is described below

commit ec5614fe34fc4e410ac226a60985051fc166ee8e
Author: Frank McQuillan 
AuthorDate: Tue Dec 17 12:38:01 2019 -0800

misc user doc updates for 1dot17
---
 doc/mainpage.dox.in|  6 +--
 .../deep_learning/input_data_preprocessor.sql_in   |  4 +-
 .../deep_learning/keras_model_arch_table.sql_in|  9 ++--
 .../modules/deep_learning/madlib_keras.sql_in  | 57 +++---
 .../madlib_keras_fit_multiple_model.sql_in | 28 ++-
 src/ports/postgres/modules/knn/knn.sql_in  |  4 ++
 6 files changed, 69 insertions(+), 39 deletions(-)

diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index 0e7b426..82be4a5 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -292,9 +292,9 @@ Interface and implementation are subject to change.
 @defgroup grp_gpu_configuration GPU Configuration
 @defgroup grp_keras Keras
 @defgroup grp_keras_model_arch Load Models
-@defgroup grp_model_selection Model Selection
-@brief Train multiple deep learning models at the same time.
-@details Train multiple deep learning models at the same time.
+@defgroup grp_model_selection Model Selection for DL
+@brief Train multiple deep learning models at the same time for model 
architecture search and hyperparameter selection.
+@details Train multiple deep learning models at the same time for 
model architecture search and hyperparameter selection.
 @{
 @defgroup grp_automl AutoML
 @defgroup grp_keras_run_model_selection Run Model Selection
diff --git 
a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.sql_in 
b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.sql_in
index ddc356f..f243417 100644
--- a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.sql_in
+++ b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.sql_in
@@ -853,7 +853,9 @@ Geoffrey Hinton with Nitish Srivastava and Kevin Swersky, 
http://www.cs.toronto.
 @anchor related
 @par Related Topics
 
-minibatch_preprocessing.sql_in
+training_preprocessor_dl()
+
+validation_preprocessor_dl()
 
 gpu_configuration()
 
diff --git 
a/src/ports/postgres/modules/deep_learning/keras_model_arch_table.sql_in 
b/src/ports/postgres/modules/deep_learning/keras_model_arch_table.sql_in
index b1bf150..cc915bb 100644
--- a/src/ports/postgres/modules/deep_learning/keras_model_arch_table.sql_in
+++ b/src/ports/postgres/modules/deep_learning/keras_model_arch_table.sql_in
@@ -275,11 +275,10 @@ SELECT COUNT(*) FROM model_arch_library WHERE 
model_weights IS NOT NULL;
 ---+
  1
 
-Load weights from Keras using psycopg2.
-(Psycopg is a PostgreSQL database adapter for the
-Python programming language.) As above we need to
-flatten then serialize the weights to store as a
-PostgreSQL binary data type.
+Load weights from Keras using psycopg2.  (Psycopg is a PostgreSQL database 
adapter for the
+Python programming language.) As above we need to flatten then serialize the 
weights to store as a
+PostgreSQL binary data type.  Note that the psycopg2.Binary function used 
below will increase the size of the
+Python object for the weights, so if your model is large it might be better to 
use a PL/Python function as above.
 
 import psycopg2
 import psycopg2 as p2
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
index 6127031..0a395e8 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
@@ -737,7 +737,12 @@ madlib_keras_predict_byom(
   class_values (optional)
   TEXT[], default: NULL.
 List of class labels that were used while training the model. See the 
'output_table'
-column for more details.
+column above for more details.
+
+@note
+If you specify the class values parameter,
+it must reflect how the dependent variable was 1-hot encoded for training. 
If you accidentally
+pick another order that does not match the 1-hot encoding, the predictions 
would be wrong.
   
 
   normalizing_const (optional)
@@ -1166,7 +1171,7 @@ WHERE iris_predict.estimated_class_text != 
iris_test.class_text;
  6
 (1 row)
 
-Percent missclassifications:
+Accuracy:
 
 SELECT round(count(*)*100/(150*0.2),2) as test_accuracy_percent from
 (select iris_test.class_text as actual, iris_predict.estimated_class_text 
as estimated
@@ -1188,10 +1193,18 @@ syntax. See load_keras_model
 for details on how to load the model architecture and weights.
 In this example we will use

[madlib] branch master updated (35e959d -> 72dfd30)

2019-10-28 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git.


from 35e959d  DL: Remove quote_ident to allow tables on schemas
 new 24c6e73  Add keras version to the docs and release notes
 new 72dfd30  Address review comments

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 RELEASE_NOTES| 8 
 src/ports/postgres/modules/deep_learning/madlib_keras.sql_in | 4 
 2 files changed, 12 insertions(+)



[madlib] 01/02: Add keras version to the docs and release notes

2019-10-28 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git

commit 24c6e730c5dd4faa2fc60fd054a88d85643cf63c
Author: Orhan Kislal 
AuthorDate: Mon Oct 21 14:10:12 2019 -0400

Add keras version to the docs and release notes
---
 RELEASE_NOTES| 8 
 src/ports/postgres/modules/deep_learning/madlib_keras.sql_in | 3 +++
 2 files changed, 11 insertions(+)

diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 49a4cd6..d4296ec 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -10,6 +10,14 @@ commit history located at 
https://github.com/apache/madlib/commits/master.
 Current list of bugs and issues can be found at 
https://issues.apache.org/jira/browse/MADLIB.
 
 —-
+MADlib v1.17:
+
+Release Date:
+
+Other:
+- DL: Supported keras version is fixed to 2.2.4
+
+—-
 MADlib v1.16:
 
 Release Date: 2019-Jul-02
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
index cf4f2d1..9c4f39a 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
@@ -77,6 +77,9 @@ typically resulting faster and smoother convergence [3].
 You can also do inference on models that have not been trained with MADlib,
 but rather imported from an external source.
 
+Note that the following MADlib functions are targeting a specific Keras
+version (2.2.4). Using a newer or older version may or may not work as 
intended.
+
 @brief Solves image classification problems by calling
 the Keras API
 



[madlib-site] branch automl created (now 0c8e677)

2019-11-18 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a change to branch automl
in repository https://gitbox.apache.org/repos/asf/madlib-site.git.


  at 0c8e677  hyperband in work

This branch includes the following new commits:

 new 0c8e677  hyperband in work

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.




[madlib] branch master updated: SVM: Lower bound the default for n_components

2019-10-07 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
 new 1b5ba4a  SVM: Lower bound the default for n_components
1b5ba4a is described below

commit 1b5ba4afd58ca9b263ccac47769ef281b45e3466
Author: Orhan Kislal 
AuthorDate: Fri Oct 4 14:45:06 2019 -0400

SVM: Lower bound the default for n_components

JIRA: MADLIB-1384
---
 src/ports/postgres/modules/svm/svm.py_in  |  2 +-
 src/ports/postgres/modules/svm/svm.sql_in | 30 ++
 2 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/src/ports/postgres/modules/svm/svm.py_in 
b/src/ports/postgres/modules/svm/svm.py_in
index b4f4f45..1532cb2 100644
--- a/src/ports/postgres/modules/svm/svm.py_in
+++ b/src/ports/postgres/modules/svm/svm.py_in
@@ -1330,7 +1330,7 @@ def _process_epsilon(is_svc, args):
 def _extract_kernel_params(kernel_params='', n_features=10):
 params_default = {
 # common params
-'n_components': 2 * n_features,
+'n_components': max(100, 2 * n_features),
 'fit_intercept': False,
 'random_state': 1,
 
diff --git a/src/ports/postgres/modules/svm/svm.sql_in 
b/src/ports/postgres/modules/svm/svm.sql_in
index cb6b69e..ba05e86 100644
--- a/src/ports/postgres/modules/svm/svm.sql_in
+++ b/src/ports/postgres/modules/svm/svm.sql_in
@@ -319,23 +319,22 @@ to the end of the feature list - thus the last element of 
the coefficient list
 is the intercept.
 
 n_components
-Default: 2*num_features. The dimensionality of the transformed feature 
space.
+Default: max(100, 2*num_features). The dimensionality of the transformed 
feature space.
 A larger value lowers the variance of the estimate of the kernel but requires
 more memory and takes longer to train.
 @note
-Setting the \e n_components kernel parameter properly is important
-to generate an accurate decision boundary.  This parameter
-is the dimensionality of the transformed feature space that arises
-from using the primal formulation.  We use primal in MADlib
-because we are implementing in a distributed system,
-compared to an R or other single node implementations
-that can use the dual formulation.  The primal approach
-implements an approximation of the kernel function using random
-feature maps, so in the case of a gaussian kernel, the
-dimensionality of the transformed feature space is not
-infinite (as in dual), but rather of size \e n_components.
-Try increasing \e n_components higher than the default if you are
-not getting an accurate decision boundary.
+Setting the \e n_components kernel parameter properly is important to
+generate an accurate decision boundary and can make the difference between a
+good model and a useless model. Try increasing the value of \e n_components
+ if you are not getting an accurate decision boundary. This parameter arises
+from using the primal formulation, in which we map data into a relatively
+low-dimensional randomized feature space [2, 3]. The parameter
+\e n_components is the dimension of that feature space.  We use the primal in
+MADlib to support scaling to large data sets, compared to R or other single
+node implementations that use the dual formulation and hence do not have this
+type of mapping, since the dimensionality of the transformed feature
+space in the dual is effectively infinite.
+
 random_state
 Default: 1. Seed used by a random number generator. 
 
@@ -641,8 +640,7 @@ WHERE houses_pred.prediction != (houses.price < 10);
 -# Train using Gaussian kernel. This time we specify
 the initial step size and maximum number of iterations to run. As part of the
 kernel parameter, we choose 10 as the dimension of the space where we train
-SVM. A larger number will lead to a more powerful model but run the risk of
-overfitting. As a result, the model will be a 10 dimensional vector, instead
+SVM. As a result, the model will be a 10 dimensional vector, instead
 of 4 as in the case of linear model.
 
 DROP TABLE IF EXISTS houses_svm_gaussian, houses_svm_gaussian_summary, 
houses_svm_gaussian_random;



[madlib] branch master updated: correct fit function call in user docs for multi fit

2019-12-19 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
 new fc9cd64  correct fit function call in user docs for multi fit
fc9cd64 is described below

commit fc9cd64ea53433353d7db205113f0e499d920f14
Author: Frank McQuillan 
AuthorDate: Thu Dec 19 16:11:58 2019 -0800

correct fit function call in user docs for multi fit
---
 .../modules/deep_learning/madlib_keras_fit_multiple_model.sql_in| 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git 
a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
 
b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
index c0a68b3..669c5db 100644
--- 
a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
+++ 
b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
@@ -93,7 +93,7 @@ of model architectures, compile and fit parameters.
 The fit (training) function has the following format:
 
 
-madlib_keras_fit(
+madlib_keras_fit_multiple_model(
 source_table,
 model_output_table,
 model_selection_table,



[madlib] branch master updated: indicate optional param in elastic net

2020-02-12 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
 new 62e2a46  indicate optional param in elastic net
62e2a46 is described below

commit 62e2a46173761e1d6ef4db8304e15506f724a708
Author: Frank McQuillan 
AuthorDate: Wed Feb 12 17:30:19 2020 -0800

indicate optional param in elastic net
---
 src/ports/postgres/modules/elastic_net/elastic_net.sql_in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ports/postgres/modules/elastic_net/elastic_net.sql_in 
b/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
index 157851d..c1aaebf 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
@@ -163,7 +163,7 @@ empty string, no columns are excluded.
 max_iter (optional)
 INTEGER, default: 1000. The maximum number of iterations allowed.
 
-tolerance
+tolerance (optional)
 FLOAT8, default: 1e-6. This is the criterion to stop iterating. Both the
 'fista' and 'igd' optimizers compute the difference between the
 log likelihood of two consecutive iterations, and when the difference is 
smaller



[madlib] branch master updated: indicate optional params in DR and RF

2020-02-12 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
 new ac30a3c  indicate optional params in DR and RF
ac30a3c is described below

commit ac30a3c508509a6996f872b0a7505b215c94fd85
Author: Frank McQuillan 
AuthorDate: Wed Feb 12 17:06:19 2020 -0800

indicate optional params in DR and RF
---
 .../postgres/modules/recursive_partitioning/decision_tree.sql_in| 6 +++---
 .../postgres/modules/recursive_partitioning/random_forest.sql_in| 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git 
a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in 
b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
index 2408770..04f7b82 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
@@ -537,7 +537,7 @@ tree_predict(tree_model,
   'estimated_prob_dep_value', where dep_value represents each
   value of the response variable.
 
-  type
+  type (optional)
   TEXT, optional, default: 'response'. For regression trees, the output is
   always the predicted value of the dependent variable. For classification
   trees, the type variable can be 'response', giving the
@@ -580,10 +580,10 @@ split for a tuple.
 
 tree_model
 TEXT. Name of the table containing the decision tree model.
-dot_format
+dot_format (optional)
 BOOLEAN, default = TRUE. Output can either be in a dot format or a text
 format. If TRUE, the result is in the dot format, else output is in text 
format.
-verbosity
+verbosity (optional)
 BOOLEAN, default = FALSE. If set to TRUE, the dot format output will 
contain
 additional information (impurity, sample size, number of weighted rows
 for each response variable, classification or prediction if the tree
diff --git 
a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in 
b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
index 251dfbc..888388c 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
@@ -545,7 +545,7 @@ forest_predict(random_forest_model,
   'estimated_prob_dep_value', where dep_value represents each
   value of the response variable.
 
-  type
+  type (optional)
   TEXT, optional, default: 'response'. For regression trees, the output is
   always the predicted value of the dependent variable. For classification
   trees, the type variable can be 'response', giving the



[madlib] branch master updated: misc user doc clarifications

2020-01-15 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
 new 515dc25  misc user doc clarifications
515dc25 is described below

commit 515dc2574f2800c0459ec2f0b10d17071f456186
Author: Frank McQuillan 
AuthorDate: Wed Jan 15 16:41:10 2020 -0800

misc user doc clarifications
---
 .../madlib_keras_fit_multiple_model.sql_in | 97 --
 .../madlib_keras_model_selection.sql_in| 10 +++
 2 files changed, 64 insertions(+), 43 deletions(-)

diff --git 
a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
 
b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
index 0468942..33699a4 100644
--- 
a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
+++ 
b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
@@ -130,6 +130,13 @@ madlib_keras_fit_multiple_model(
 
   num_iterations
   INTEGER.  Number of iterations to train.
+
+@note
+This parameter is different than the number of passes over the dataset,
+which is commonly referred to as the number of epochs.  Since MADlib 
operates
+in a distributed system, the total number of
+epochs is actually equal to 'num_iterations' multiplied by the 'epochs' value
+specified in the Keras fit parameters.
   
 
   use_gpus (optional)
@@ -1016,18 +1023,18 @@ SELECT * FROM iris_multi_model_info ORDER BY 
training_metrics_final DESC, traini
 
  mst_key | model_id | compile_params   
   |  fit_params   |  model_type  |  model_size  | 
metrics_elapsed_time | metrics_type | training_metrics_final | 
training_loss_final |  training_metrics   |training_loss| 
validation_metrics_final | validation_loss_final | validation_metrics | 
validation_loss
 
-+--+-+---+--+--+--+--++-+-+-+--+---++-
-   9 |2 | loss='categorical_crossentropy', 
optimizer='Adam(lr=0.01)',metrics=['accuracy'] | batch_size=4,epochs=1 | 
madlib_keras | 1.2197265625 | {0.189763069152832}  | {accuracy}   | 
0.98349228 |  0.102392569184 | {0.98349227905} | 
{0.102392569184303} |  |   |
|
-   4 |1 | loss='categorical_crossentropy', 
optimizer='Adam(lr=0.01)',metrics=['accuracy'] | batch_size=8,epochs=1 | 
madlib_keras | 0.7900390625 | {0.170287847518921}  | {accuracy}   | 
0.97523842 |  0.159002527595 | {0.97523841858} | 
{0.159002527594566} |  |   |
|
-   3 |1 | loss='categorical_crossentropy', 
optimizer='Adam(lr=0.01)',metrics=['accuracy'] | batch_size=4,epochs=1 | 
madlib_keras | 0.7900390625 | {0.165465116500854}  | {accuracy}   | 
0.96638851 |   0.10245500505 | {0.96638851166} | 
{0.102455005049706} |  |   |
|
-  10 |2 | loss='categorical_crossentropy', 
optimizer='Adam(lr=0.01)',metrics=['accuracy'] | batch_size=8,epochs=1 | 
madlib_keras | 1.2197265625 | {0.199872970581055}  | {accuracy}   | 
0.94162693 |   0.12242924422 | {0.94162693024} | 
{0.122429244220257} |  |   |
|
-   5 |1 | 
loss='categorical_crossentropy',optimizer='Adam(lr=0.001)',metrics=['accuracy'] 
| batch_size=4,epochs=1 | madlib_keras | 0.7900390625 | {0.16815185546875}   | 
{accuracy}   | 0.88325386 |  0.437314987183 | 
{0.88325386047} | {0.437314987182617} |  |  
 ||
-  11 |2 | 
loss='categorical_crossentropy',optimizer='Adam(lr=0.001)',metrics=['accuracy'] 
| batch_size=4,epochs=1 | madlib_keras | 1.2197265625 | {0.430488109588623}  | 
{accuracy}   | 0.85849228 |  0.400548309088 | 
{0.85849227905} | {0.400548309087753} |  |  
 ||
-   6 |1 | 
loss='categorical_crossentropy',optimizer='Adam(lr=0.001)',metrics=['accuracy'] 
| batch_size=8,epochs=1 | madlib_keras | 0.7900390625 | {0.154508113861084}  | 
{accuracy}   | 0.68337307 |  0.634458899498 | 
{0.68337306976} | {0.634458899497986} |  |  
 ||
-  12 |2 | 
loss

[madlib] branch master updated (7625ae0 -> 273301e)

2020-01-15 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git.


from 7625ae0  DL: Fix failure on GPDB6 for preprocessor
 new 96a4424  Decrease the learning rate for transfer learning test
 new 273301e  Update Apache Copyright date

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 NOTICE| 2 +-
 .../modules/deep_learning/test/madlib_keras_transfer_learning.sql_in  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)



[madlib] 02/02: Update Apache Copyright date

2020-01-15 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git

commit 273301e3c2e150b9648a886761607695a04ce236
Author: Domino Valdano 
AuthorDate: Wed Jan 15 10:42:52 2020 -0800

Update Apache Copyright date
---
 NOTICE | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NOTICE b/NOTICE
index 7cbfa51..10b9387 100644
--- a/NOTICE
+++ b/NOTICE
@@ -1,5 +1,5 @@
 Apache MADlib
-Copyright 2016-2019 The Apache Software Foundation.
+Copyright 2016-2020 The Apache Software Foundation.
 
 This product includes software developed at
 The Apache Software Foundation (http://www.apache.org/).



[madlib] branch master updated: clarify warm start with model selection in user docs

2020-01-07 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
 new 1fa020b  clarify warm start with model selection in user docs
1fa020b is described below

commit 1fa020b08c2a4a8971d1957674794894d6c71783
Author: Frank McQuillan 
AuthorDate: Tue Jan 7 17:33:42 2020 -0800

clarify warm start with model selection in user docs
---
 .../modules/deep_learning/madlib_keras_fit_multiple_model.sql_in  | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git 
a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
 
b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
index fbf3497..0468942 100644
--- 
a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
+++ 
b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
@@ -84,7 +84,7 @@ for the training data.  For example, you may only want
 to train models on segments that reside on hosts that are GPU enabled.
 
 You can set up the models and hyperparameters to try with the
-Setup
+Setup
 Model Selection utility to define the unique combinations
 of model architectures, compile and fit parameters.
 
@@ -1320,6 +1320,8 @@ set the 'warm_start' parameter to TRUE in the fit 
function.
 Transfer learning uses initial model state (weights) stored in the 
'model_arch_table' - in this case set the
 'warm_start' parameter to FALSE in the fit function.
 
+4. Here are some more details on how warm start works.  These details are 
mostly applicable when implementing autoML algorithms on top of MADlib's model 
selection.  In short, the 'model_selection_table' dictates which models get 
trained and output to the 'model_output_table' and associated summary and info 
tables.  When 'warm_start' is TRUE, models are built for each 'mst_key' in the 
'model_selection_table'.  If there are prior runs for an 'mst_key' then the 
weights from that run will be [...]
+
 @anchor background
 @par Technical Background
 



[madlib-site] branch asf-site updated: update download page to say ubuntu 18 for 1.17.0

2020-04-14 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/madlib-site.git


The following commit(s) were added to refs/heads/asf-site by this push:
 new 002cf96  update download page to say ubuntu 18 for 1.17.0
002cf96 is described below

commit 002cf96cd02ffc461c7adf3a0b99128ebda3371c
Author: Frank McQuillan 
AuthorDate: Tue Apr 14 11:43:13 2020 -0700

update download page to say ubuntu 18 for 1.17.0
---
 download.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/download.html b/download.html
index 76af104..0d89739 100644
--- a/download.html
+++ b/download.html
@@ -72,7 +72,7 @@
 
https://dist.apache.org/repos/dist/release/madlib/1.17.0/apache-madlib-1.17.0-bin-Linux.rpm;>Linux
   (https://www.apache.org/dist/madlib/1.17.0/apache-madlib-1.17.0-bin-Linux.rpm.asc;>pgp,
  https://www.apache.org/dist/madlib/1.17.0/apache-madlib-1.17.0-bin-Linux.rpm.sha512;>sha512)
 — CentOS / Red Hat 6 and higher (64 bit). GPDB 5.x, GPD [...]
 
-   https://dist.apache.org/repos/dist/release/madlib/1.17.0/apache-madlib-1.17.0-bin-Linux.deb;>Linux
   (https://www.apache.org/dist/madlib/1.17.0/apache-madlib-1.17.0-bin-Linux.deb.asc;>pgp,
  https://www.apache.org/dist/madlib/1.17.0/apache-madlib-1.17.0-bin-Linux.deb.sha512;>sha512)
 — Ubuntu 16.04. GPDB 5.x, GPDB 6.x, PostgreSQL 11.x and [...]
+   https://dist.apache.org/repos/dist/release/madlib/1.17.0/apache-madlib-1.17.0-bin-Linux.deb;>Linux
   (https://www.apache.org/dist/madlib/1.17.0/apache-madlib-1.17.0-bin-Linux.deb.asc;>pgp,
  https://www.apache.org/dist/madlib/1.17.0/apache-madlib-1.17.0-bin-Linux.deb.sha512;>sha512)
 — Ubuntu 18.04. GPDB 5.x, GPDB 6.x, PostgreSQL 11.x and [...]
 
https://dist.apache.org/repos/dist/release/madlib/1.17.0/apache-madlib-1.17.0-bin-Darwin.dmg;>Mac
 OS X   (https://www.apache.org/dist/madlib/1.17.0/apache-madlib-1.17.0-bin-Darwin.dmg.asc;>pgp,
 https://www.apache.org/dist/madlib/1.17.0/apache-madlib-1.17.0-bin-Darwin.dmg.sha512;>sha512)
 — OS 10.6 and higher.  PostgreSQL 11.x and 12.x.




[madlib-site] branch asf-site updated: fix download links for archived 1.16 release artifacts

2020-04-17 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/madlib-site.git


The following commit(s) were added to refs/heads/asf-site by this push:
 new e2df99d  fix download links for archived 1.16 release artifacts
e2df99d is described below

commit e2df99d059fd2224e5bdfc1119d4afc71c93efee
Author: Frank McQuillan 
AuthorDate: Fri Apr 17 11:32:29 2020 -0700

fix download links for archived 1.16 release artifacts
---
 download.html | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/download.html b/download.html
index 0d89739..66f5052 100644
--- a/download.html
+++ b/download.html
@@ -109,15 +109,15 @@
Latest 
stable release:
 

-   https://dist.apache.org/repos/dist/release/madlib/1.16/apache-madlib-1.16-src.tar.gz;>Source
 code tar.gz (https://www.apache.org/dist/madlib/1.16/apache-madlib-1.16-src.tar.gz.asc;>pgp,
 https://www.apache.org/dist/madlib/1.16/apache-madlib-1.16-src.tar.gz.sha512;>sha512)
 
+   https://archive.apache.org/dist/madlib/1.16/apache-madlib-1.16-src.tar.gz;>Source
 code tar.gz (https://archive.apache.org/dist/madlib/1.16/apache-madlib-1.16-src.tar.gz.asc;>pgp,
 https://archive.apache.org/dist/madlib/1.16/apache-madlib-1.16-src.tar.gz.sha512;>sha512)
 
 
-   https://dist.apache.org/repos/dist/release/madlib/1.16/apache-madlib-1.16-bin-Linux-GPDB43.rpm;>Linux
   (https://www.apache.org/dist/madlib/1.16/apache-madlib-1.16-bin-Linux-GPDB43.rpm.asc;>pgp,
  https://www.apache.org/dist/madlib/1.16/apache-madlib-1.16-bin-Linux-GPDB43.rpm.sha512;>sha512)
 — CentOS / Red Hat 5 and higher (64 bit). [...]
+   https://archive.apache.org/dist/madlib/1.16/apache-madlib-1.16-bin-Linux-GPDB43.rpm;>Linux
   (https://archive.apache.org/dist/madlib/1.16/apache-madlib-1.16-bin-Linux-GPDB43.rpm.asc;>pgp,
  https://archive.apache.org/dist/madlib/1.16/apache-madlib-1.16-bin-Linux-GPDB43.rpm.sha512;>sha512)
 — CentOS / Red Hat 5 and higher (64 bit). GP [...]
 
-   https://dist.apache.org/repos/dist/release/madlib/1.16/apache-madlib-1.16-bin-Linux.rpm;>Linux
   (https://www.apache.org/dist/madlib/1.16/apache-madlib-1.16-bin-Linux.rpm.asc;>pgp,
  https://www.apache.org/dist/madlib/1.16/apache-madlib-1.16-bin-Linux.rpm.sha512;>sha512)
 — CentOS / Red Hat 6 and higher (64 bit). GPDB 5.x, PostgreSQL 10.x a [...]
+   https://archive.apache.org/dist/madlib/1.16/apache-madlib-1.16-bin-Linux.rpm;>Linux
   (https://archive.apache.org/dist/madlib/1.16/apache-madlib-1.16-bin-Linux.rpm.asc;>pgp,
  https://archive.apache.org/dist/madlib/1.16/apache-madlib-1.16-bin-Linux.rpm.sha512;>sha512)
 — CentOS / Red Hat 6 and higher (64 bit). GPDB 5.x, PostgreSQL 10.x and  [...]
 
-   https://dist.apache.org/repos/dist/release/madlib/1.16/apache-madlib-1.16-bin-Linux.deb;>Linux
   (https://www.apache.org/dist/madlib/1.16/apache-madlib-1.16-bin-Linux.deb.asc;>pgp,
  https://www.apache.org/dist/madlib/1.16/apache-madlib-1.16-bin-Linux.deb.sha512;>sha512)
 — Ubuntu 16.04. GPDB 5.x, PostgreSQL 10.x and 11.x.
+   https://archive.apache.org/dist/madlib/1.16/apache-madlib-1.16-bin-Linux.deb;>Linux
   (https://archive.apache.org/dist/madlib/1.16/apache-madlib-1.16-bin-Linux.deb.asc;>pgp,
  https://archive.apache.org/dist/madlib/1.16/apache-madlib-1.16-bin-Linux.deb.sha512;>sha512)
 — Ubuntu 16.04. GPDB 5.x, PostgreSQL 10.x and 11.x.
 
-   https://dist.apache.org/repos/dist/release/madlib/1.16/apache-madlib-1.16-bin-Darwin.dmg;>Mac
 OS X   (https://www.apache.org/dist/madlib/1.16/apache-madlib-1.16-bin-Darwin.dmg.asc;>pgp,
 https://www.apache.org/dist/madlib/1.16/apache-madlib-1.16-bin-Darwin.dmg.sha512;>sha512)
 — OS 10.6 and higher.  PostgreSQL 10.x and 11.x.
+   https://archive.apache.org/dist/madlib/1.16/apache-madlib-1.16-bin-Darwin.dmg;>Mac
 OS X   (https://archive.apache.org/dist/madlib/1.16/apache-madlib-1.16-bin-Darwin.dmg.asc;>pgp,
 https://archive.apache.org/dist/madlib/1.16/apache-madlib-1.16-bin-Darwin.dmg.sha512;>sha512)
 — OS 10.6 and higher.  PostgreSQL 10.x and 11.x.

 
v1.15.1



[madlib-site] branch asf-site updated: update website for 1.17.0 release

2020-04-10 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/madlib-site.git


The following commit(s) were added to refs/heads/asf-site by this push:
 new 1d4f9be  update website for 1.17.0 release
 new 81dc279  Merge branch 'asf-site' of github.com:apache/madlib-site into 
asf-site
1d4f9be is described below

commit 1d4f9bec751b1b2323f05d8e512a60cb8240aa92
Author: Frank McQuillan 
AuthorDate: Fri Apr 10 11:08:02 2020 -0700

update website for 1.17.0 release
---
 _media/logos/vmw.png | Bin 0 -> 5231 bytes
 community.html   |   4 +--
 documentation.html   |   1 +
 download.html|  31 
 index.html   |  81 ---
 5 files changed, 73 insertions(+), 44 deletions(-)

diff --git a/_media/logos/vmw.png b/_media/logos/vmw.png
new file mode 100644
index 000..c216caa
Binary files /dev/null and b/_media/logos/vmw.png differ
diff --git a/community.html b/community.html
index 4cec998..62ef336 100644
--- a/community.html
+++ b/community.html
@@ -58,7 +58,7 @@
 
 http://pivotal.io/; class="center">
 
-
+
 
 Providing core development and scalability 
testing
 Learn More
@@ -210,7 +210,7 @@
 
 http://postgresql.org;>PostgreSQL
 http://greenplum.org/;>Greenplum 
Database
-http://hawq.incubator.apache.org;>Apache 
HAWQ
+http://hawq.apache.org;>Apache 
HAWQ
 http://cran.r-project.org/web/packages/PivotalR/;>PivotalR
 
 
diff --git a/documentation.html b/documentation.html
index b5d93bf..7d92d11 100644
--- a/documentation.html
+++ b/documentation.html
@@ -55,6 +55,7 @@ jQuery(document).ready(function() {
 The primary documentation reference material providing 
detailed information on the functions and algorithms within MADlib as well as 
background theory and references into the literature.
 
 Older Documentation
+MADlib v1.16
 MADlib v1.15.1
 MADlib v1.15
 MADlib v1.14
diff --git a/download.html b/download.html
index b68b47f..76af104 100644
--- a/download.html
+++ b/download.html
@@ -58,7 +58,7 @@
Current Release


-   v1.16
+   v1.17.0
Source Code and Convenience 
Binaries
 
MADlib source code 
and convenience binaries are available from the Apache distribution site.
@@ -66,15 +66,15 @@
Latest 
stable release:
 

-   https://dist.apache.org/repos/dist/release/madlib/1.16/apache-madlib-1.16-src.tar.gz;>Source
 code tar.gz (https://www.apache.org/dist/madlib/1.16/apache-madlib-1.16-src.tar.gz.asc;>pgp,
 https://www.apache.org/dist/madlib/1.16/apache-madlib-1.16-src.tar.gz.sha512;>sha512)
 
+   https://dist.apache.org/repos/dist/release/madlib/1.17.0/apache-madlib-1.17.0-src.tar.gz;>Source
 code tar.gz (https://www.apache.org/dist/madlib/1.17.0/apache-madlib-1.17.0-src.tar.gz.asc;>pgp,
 https://www.apache.org/dist/madlib/1.17.0/apache-madlib-1.17.0-src.tar.gz.sha512;>sha512)
 
 
-   https://dist.apache.org/repos/dist/release/madlib/1.16/apache-madlib-1.16-bin-Linux-GPDB43.rpm;>Linux
   (https://www.apache.org/dist/madlib/1.16/apache-madlib-1.16-bin-Linux-GPDB43.rpm.asc;>pgp,
  https://www.apache.org/dist/madlib/1.16/apache-madlib-1.16-bin-Linux-GPDB43.rpm.sha512;>sha512)
 — CentOS / Red Hat 5 and higher (64 bit). [...]
+   https://dist.apache.org/repos/dist/release/madlib/1.17.0/apache-madlib-1.17.0-bin-Linux-GPDB43.rpm;>Linux
   (https://www.apache.org/dist/madlib/1.17.0/apache-madlib-1.17.0-bin-Linux-GPDB43.rpm.asc;>pgp,
  https://www.apache.org/dist/madlib/1.17.0/apache-madlib-1.17.0-bin-Linux-GPDB43.rpm.sha512;>sha512)
 — CentOS / Red Hat 5 and hi [...]
 
-   https://dist.apache.org/repos/dist/release/madlib/1.16/apache-madlib-1.16-bin-Linux.rpm;>Linux
   (https://www.apache.org/dist/madlib/1.16/apache-madlib-1.16-bin-Linux.rpm.asc;>pgp,
  https://www.apache.org/dist/madlib/1.16/apache-madlib-1.16-bin-Linux.rpm.sha512;>sha512)
 — CentOS / Red Hat 6 and higher (64 bit). GPDB 5.x, PostgreSQL 10.x a [...]
+   https://dist.apache.or

[madlib] branch master updated: correct disk space comment for gp5 and 6 in keras multi fit user docs

2020-03-26 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
 new 2f0bb2e  correct disk space comment for gp5 and 6 in keras multi fit 
user docs
2f0bb2e is described below

commit 2f0bb2e0b01e060150b443c43f00c5e1d664a5c6
Author: Frank McQuillan 
AuthorDate: Thu Mar 26 15:25:49 2020 -0700

correct disk space comment for gp5 and 6 in keras multi fit user docs
---
 .../modules/deep_learning/madlib_keras_fit_multiple_model.sql_in | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git 
a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
 
b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
index 4d1eb09..9238652 100644
--- 
a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
+++ 
b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
@@ -88,10 +88,11 @@ You can set up the models and hyperparameters to try with 
the
 Model Selection utility to define the unique combinations
 of model architectures, compile and fit parameters.
 
-@note If 'madlib_keras_fit_multiple_model()' is running on GPDB 5, the 
database will
+@note If 'madlib_keras_fit_multiple_model()' is running on GPDB 5 and some 
versions
+of GPDB 6, the database will
 keep adding to the disk space (in proportion to model size) and will only
 release the disk space once the fit multiple query has completed execution.
-This is not the case for GPDB 6+ where disk space is released during the
+This is not the case for GPDB 6.5.0+ where disk space is released during the
 fit multiple query.
 
 @note CUDA GPU memory cannot be released until the process holding it is 
terminated.



[madlib-site] branch asf-site updated: fix broken links for datasets on community page

2020-03-30 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/madlib-site.git


The following commit(s) were added to refs/heads/asf-site by this push:
 new bda2c9b  fix broken links for datasets on community page
bda2c9b is described below

commit bda2c9b2335a80914332ebf16fdc16009988f71f
Author: Frank McQuillan 
AuthorDate: Mon Mar 30 15:34:03 2020 -0700

fix broken links for datasets on community page
---
 community.html | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/community.html b/community.html
index 6f892a8..4cec998 100644
--- a/community.html
+++ b/community.html
@@ -197,10 +197,12 @@
 
 
 Datasets
+There is a growing set of publicly available 
datasets.  Here are some examples:
 
-http://archive.ics.uci.edu/ml/datasets.html; title="UCI Machine Learning 
Repository: Data Sets">http://archive.ics.uci.edu/ml/datasets.html
-http://mlcomp.org/datasets; title="MLcomp 
- Viewing All Datasets">http://mlcomp.org/datasets
-http://mldata.org/; title="mldata :: 
Welcome">http://mldata.org/
+https://archive.ics.uci.edu/ml/index.php;>UCI Machine Learning 
Repository
+https://datasetsearch.research.google.com/;>Google Dataset Search
+https://www.kaggle.com/datasets;>Kaggle 
Datasets
+https://www.kdnuggets.com/datasets/index.html;>KDnuggets List of 
Datasets
 
 
 



[madlib-site] branch asf-site updated: add ipython notebook for window functions

2020-04-24 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/madlib-site.git


The following commit(s) were added to refs/heads/asf-site by this push:
 new 3a7f9ed  add ipython notebook for window functions
3a7f9ed is described below

commit 3a7f9ed2e8dbaa6d0b0a406593f00b0598e1bbf0
Author: Frank McQuillan 
AuthorDate: Fri Apr 24 12:45:54 2020 -0700

add ipython notebook for window functions
---
 .../Time-series/Window-functions-v1.ipynb  | 1910 
 1 file changed, 1910 insertions(+)

diff --git a/community-artifacts/Time-series/Window-functions-v1.ipynb 
b/community-artifacts/Time-series/Window-functions-v1.ipynb
new file mode 100644
index 000..9c30a40
--- /dev/null
+++ b/community-artifacts/Time-series/Window-functions-v1.ipynb
@@ -0,0 +1,1910 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Time series example - window functions\n",
+"\n",
+"Some example queries on time series data using aggregates and window 
functions.  Thanks to Divya Bhargov from VMware for this example notebook.\n",
+"\n",
+"Data from 
https://data.cityofchicago.org/Transportation/Potholes-Patched/wqdh-9gek/data 
which is loaded from CSV format.\n",
+"\n",
+"## Table of contents \n",
+"\n",
+"1. Connect to database\n",
+"\n",
+"2. Load data\n",
+"\n",
+"3. Window functions\n",
+"\n",
+"4. Mapping for gap filling"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"\n",
+"## 1. Connect to database"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "The sql extension is already loaded. To reload it, use:\n",
+  "  %reload_ext sql\n",
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "PostgreSQL 8.3.23 (Greenplum Database 5.18.0 build 
commit:6aec9959d367d46c6b4391eb9ffc82c735d20102) on x86_64-pc-linux-gnu, 
compiled by GCC gcc (GCC) 6.2.0, 64-bit compiled on Apr  3 2019 
14:45:51\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'PostgreSQL 8.3.23 (Greenplum Database 5.18.0 build 
commit:6aec9959d367d46c6b4391eb9ffc82c735d20102) on x86_64-pc-linux-gnu, 
compiled by GCC gcc (GCC) 6.2.0, 64-bit compiled on Apr  3 2019 14:45:51',)]"
+  ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%load_ext sql\n",
+"\n",
+"# Greenplum Database 5.x on GCP (PM demo machine) - via tunnel\n",
+"%sql postgresql://gpadmin@localhost:8000/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+"\n",
+"%sql SELECT version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"\n",
+"## 2. Load data\n",
+"Load from CSV.  You will need to change the path to the location of the 
CSV file."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "65544 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/plain": [
+   "[]"
+  ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%%sql\n",
+"DROP TABLE IF EXISTS chicago_potholes_patched;\n",
+"CREATE TABLE chicago_potholes_patched (\n",
+"id serial NOT NULL,\n",
+"address TEXT,\n",
+"request_date TIMESTAMP,\n",

[madlib] branch master updated: fix error in marginal effects example

2021-02-02 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
 new a70a877  fix error in marginal effects example
a70a877 is described below

commit a70a8776fea111afef353f91f0bad93ffa13b6ab
Author: Frank McQuillan 
AuthorDate: Tue Feb 2 11:48:55 2021 -0800

fix error in marginal effects example
---
 src/ports/postgres/modules/regress/marginal.sql_in | 12 +---
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/ports/postgres/modules/regress/marginal.sql_in 
b/src/ports/postgres/modules/regress/marginal.sql_in
index a19424e..3cb3f8a 100644
--- a/src/ports/postgres/modules/regress/marginal.sql_in
+++ b/src/ports/postgres/modules/regress/marginal.sql_in
@@ -38,9 +38,7 @@ computed is the average of the marginal effect at every data 
point present in th
 source table.
 
 MADlib provides marginal effects regression functions for linear, logistic and
-multinomial logistic regressions.
-
-@warning The margins_logregr() and margins_mlogregr() functions have been 
deprecated in favor of the margins() function.
+multinomial logistic regressions. The implementation is similar to reference 
[1].
 
 @anchor margins
 @par Marginal Effects with Interaction Terms
@@ -321,11 +319,11 @@ DROP TABLE IF EXISTS margins_table;
 SELECT madlib.logregr_train( 'patients',
  'model_table',
  'second_attack',
- 'ARRAY[1, treatment, trait_anxiety, treatment^2, 
treatment * trait_anxiety]'
+ 'ARRAY[1, treatment, trait_anxiety, treatment * 
trait_anxiety]'
);
 SELECT madlib.margins( 'model_table',
'margins_table',
-   'intercept, treatment, trait_anxiety, treatment^2, 
treatment*trait_anxiety',
+   'intercept, treatment, trait_anxiety, 
treatment*trait_anxiety',
NULL,
NULL
  );
@@ -347,7 +345,7 @@ and view the results (using different names in 'x_design').
 DROP TABLE IF EXISTS result_table;
 SELECT madlib.margins( 'model_table',
'result_table',
-   'i, tre, tra, tre^2, tre*tra',
+   'i, tre, tra, tre*tra',
NULL,
'tre'
  );
@@ -475,7 +473,7 @@ We use the delta method for calculating standard errors on 
the marginal effects.
 @literature
 
 
-[1] mfx function in STATA: http://www.stata.com/help.cgi?mfx_option
+[1] Marginal effects in Stata: https://www.stata.com/
 
 @anchor related
 @par Related Topics



[madlib] branch master updated: clarify grouping not part of arima currently in user docs

2021-02-02 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
 new 2e75913  clarify grouping not part of arima currently in user docs
2e75913 is described below

commit 2e75913b32d6ee6282da5fd7e77c2fee80befd6a
Author: Frank McQuillan 
AuthorDate: Tue Feb 2 12:45:15 2021 -0800

clarify grouping not part of arima currently in user docs
---
 src/ports/postgres/modules/tsa/arima.sql_in | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/ports/postgres/modules/tsa/arima.sql_in 
b/src/ports/postgres/modules/tsa/arima.sql_in
index 48f0abd..12930a6 100644
--- a/src/ports/postgres/modules/tsa/arima.sql_in
+++ b/src/ports/postgres/modules/tsa/arima.sql_in
@@ -158,13 +158,17 @@ arima_train( input_table,
 TEXT. The name of the column containing the time series data. This 
data is
 currently restricted to DOUBLE PRECISION.
 
-grouping_columns (optional)
-TEXT, default: NULL. Not currently implemented. Any non-NULL value 
is ignored.
+grouping_columns (not currently implemented)
+TEXT, default: NULL.
 
 A comma-separated list of column names used to group the input dataset
 into discrete groups, training one ARIMA model per group. It is similar to
 the SQL GROUP BY clause. When this value is null, no grouping is
-used and a single result model is generated.
+used and a single result model is generated.
+
+@note Grouping is not currently implemented for ARIMA, but 
+will be added in the future.  Any non-NULL value for this parameter
+is ignored.
 
 include_mean (optional)
 BOOLEAN, default: FALSE. Mean value of the data series is added in the 
ARIMA model



[madlib] branch master updated: move notes to bottom of page for consistency in user docs

2021-02-08 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
 new f29674b  move notes to bottom of page for consistency in user docs
f29674b is described below

commit f29674b8bf3d500b3dcca38e81356a4a39591bec
Author: Frank McQuillan 
AuthorDate: Mon Feb 8 12:58:26 2021 -0800

move notes to bottom of page for consistency in user docs
---
 src/ports/postgres/modules/graph/apsp.sql_in | 26 +++---
 src/ports/postgres/modules/graph/bfs.sql_in  | 28 
 src/ports/postgres/modules/graph/hits.sql_in | 22 +--
 src/ports/postgres/modules/graph/pagerank.sql_in | 10 ++---
 src/ports/postgres/modules/graph/sssp.sql_in | 28 
 src/ports/postgres/modules/graph/wcc.sql_in  |  7 ++
 6 files changed, 66 insertions(+), 55 deletions(-)

diff --git a/src/ports/postgres/modules/graph/apsp.sql_in 
b/src/ports/postgres/modules/graph/apsp.sql_in
index 893cd79..bab6d83 100644
--- a/src/ports/postgres/modules/graph/apsp.sql_in
+++ b/src/ports/postgres/modules/graph/apsp.sql_in
@@ -34,8 +34,8 @@ m4_include(`SQLCommon.m4')
 Contents
 
 APSP
-Notes
 Examples
+Notes
 Literature
 
 
@@ -159,18 +159,6 @@ It contains a row for every group and has the following 
columns:
 
 
 
-@anchor notes
-@par Notes
-
-Graphs with negative edges are supported but graphs with negative cycles are 
not.
-
-The implementation is analogous to a matrix multiplication procedure.
-Please refer to the MADlib design document and references [1] and [2]
-for more details.
-
-Also see the Grail project [3] for more background on graph analytics 
processing
-in relational databases.
-
 @anchor examples
 @examp
 
@@ -369,6 +357,18 @@ SELECT * FROM out_gr_path ORDER BY grp;
1 | {0,4,5}
 
 
+@anchor notes
+@par Notes
+
+1. Graphs with negative edges are supported but graphs with negative cycles 
are not.
+
+2. The implementation for APSP is analogous to a matrix multiplication 
operation.
+Please refer to the MADlib design document and references [1] and [2]
+for more details.
+
+3. Also see the Grail project [3] for more background on graph analytics 
processing
+in relational databases.
+
 @anchor literature
 @par Literature
 
diff --git a/src/ports/postgres/modules/graph/bfs.sql_in 
b/src/ports/postgres/modules/graph/bfs.sql_in
index f9507d9..d2474f0 100644
--- a/src/ports/postgres/modules/graph/bfs.sql_in
+++ b/src/ports/postgres/modules/graph/bfs.sql_in
@@ -33,8 +33,8 @@ m4_include(`SQLCommon.m4')
 Contents
 
 Breadth-First Search
-Notes
 Examples
+Notes
 Literature
 
 
@@ -130,19 +130,6 @@ and a single BFS result is generated.
 
 
 
-@note On a Greenplum cluster, the edge table should be distributed
-by the source vertex id column for better performance.
-
-@anchor notes
-@par Notes
-
-The graph_bfs function is a SQL implementation of the well-known breadth-first
-search algorithm [1] modified appropriately for a relational database. It will
-find any node in the graph reachable from the source_vertex only once. If a 
node
-is reachable by many different paths from the source_vertex (i.e. has more than
-one parent), then only one of those parents is present in the output table.
-The BFS result will, in general, be different for different choices of 
source_vertex.
-
 @anchor examples
 @examp
 
@@ -388,6 +375,19 @@ SELECT * FROM out_gr ORDER BY g1,g2,dist,id;
 (7 rows)
 
 
+@anchor notes
+@par Notes
+
+1. On a Greenplum cluster, the edge table should be distributed
+by the source vertex id column for better performance.
+
+2. The graph_bfs function is a SQL implementation of the well-known 
breadth-first
+search algorithm [1] modified appropriately for a relational database. It will
+find any node in the graph reachable from the 'source_vertex' only once. If a 
node
+is reachable by many different paths from the 'source_vertex' (i.e. has more 
than
+one parent), then only one of those parents is present in the output table.
+The BFS result will, in general, be different for different choices of 
'source_vertex'.
+
 @anchor literature
 @par Literature
 
diff --git a/src/ports/postgres/modules/graph/hits.sql_in 
b/src/ports/postgres/modules/graph/hits.sql_in
index d2d6cfc..6f140c8 100644
--- a/src/ports/postgres/modules/graph/hits.sql_in
+++ b/src/ports/postgres/modules/graph/hits.sql_in
@@ -34,13 +34,13 @@ m4_include(`SQLCommon.m4')
 Contents
 
 HITS
-Notes
 Examples
+Notes
 Literature
 
 
 
-@brief Find the HITS scores(authority and hub) of all vertices in a directed
+@brief Find the HITS scores (authority and hub) of all vertices in a directed
 graph.
 
 Given a graph, the HITS (Hyperlink-Induced Topic Search) algorithm outputs the
@@ -127,15 +127,6 @@ parameter.
 
 
 
-@note On a Greenplum cluster, the edge table should be distributed
-by the source vertex id column

[madlib-site] branch asf-site updated (d27cc96 -> 222c2eb)

2021-04-02 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a change to branch asf-site
in repository https://gitbox.apache.org/repos/asf/madlib-site.git.


from d27cc96  Add docs for 1.18.0 RC1
 new 1b38647  updated jupyter notebooks for 1dot18dot0 release
 new e6b2ff6  2nd update jupyter notebooks for 1dot18dot0 release
 new 8bb79b8  trivial update
 new 36413e3  minor edits to multiple workbooks
 new 222c2eb  Merge pull request #21 from 
fmcquillan99/1dot18dot0-jupyter-notebooks

The 113 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../Encoding-categorical-variables-v2.ipynb|   57 +-
 .../Path-demo-4.ipynb  |  856 ++-
 .../Load-model-selection-table-v1.ipynb|  955 ---
 .../Deep-learning/MADlib-Keras-MLP-v2.ipynb| 4057 
 .../MADlib-Keras-cifar10-inference-v1.ipynb|  601 --
 .../MADlib-Keras-model-selection-MLP-v1.ipynb  | 5709 
 .../Define-custom-functions-v1.ipynb   |  531 ++
 .../Define-model-architecture-v2.ipynb}|  270 +-
 ...rocessor-for-images-distribution-rules-v1.ipynb |   10 +-
 .../Preprocessor-for-images-v2.ipynb   |  739 +--
 .../Train-multiple-models/AutoML-MLP-v1.ipynb  | 6937 
 .../Define-model-configurations-v2.ipynb   | 2025 ++
 ...Dlib-Keras-model-selection-CNN-cifar10-v1.ipynb |0
 .../MADlib-Keras-model-selection-MLP-v1.ipynb  | 6279 ++
 .../Train-single-model/MADlib-Keras-MLP-v2.ipynb   | 5025 ++
 .../MADlib-Keras-cifar10-cnn-v3.ipynb  |   74 +-
 .../MADlib-Keras-cifar10-inference-v1.ipynb|  829 +++
 .../MADlib-Keras-imagenet-inference-v1.ipynb   |2 +-
 .../MADlib-Keras-transfer-learning-v3.ipynb|  778 ++-
 .../{ => Utilities}/Load-images-v1.ipynb   |  382 +-
 .../{ => Utilities}/madlib_image_loader.py |0
 .../automl/hyperband-diag-cifar10-v1.ipynb | 5288 ---
 .../MADlib-e2e-ds-workflow-abalone.ipynb   | 2181 +++---
 community-artifacts/Graph/PageRank-v2.ipynb|   93 +-
 .../Supervised-learning/Decision-trees-v2.ipynb|  346 +-
 .../Supervised-learning/Linear-regression-v1.ipynb |  100 +-
 .../Supervised-learning/MLP-mnist-v3.ipynb |  386 +-
 .../SVM-binary-classification-v1.ipynb |  201 +-
 .../SVM-novelty-detection-v2.ipynb |  484 +-
 .../Kmeans-auto-k-selection-v1.ipynb   |  241 +-
 30 files changed, 25559 insertions(+), 19877 deletions(-)
 delete mode 100644 
community-artifacts/Deep-learning/Load-model-selection-table-v1.ipynb
 delete mode 100644 community-artifacts/Deep-learning/MADlib-Keras-MLP-v2.ipynb
 delete mode 100644 
community-artifacts/Deep-learning/MADlib-Keras-cifar10-inference-v1.ipynb
 delete mode 100644 
community-artifacts/Deep-learning/MADlib-Keras-model-selection-MLP-v1.ipynb
 create mode 100755 
community-artifacts/Deep-learning/Model-preparation/Define-custom-functions-v1.ipynb
 rename community-artifacts/Deep-learning/{Load-model-architecture-v2.ipynb => 
Model-preparation/Define-model-architecture-v2.ipynb} (68%)
 rename community-artifacts/Deep-learning/{ => 
Model-preparation}/Preprocessor-for-images-distribution-rules-v1.ipynb (98%)
 rename community-artifacts/Deep-learning/{ => 
Model-preparation}/Preprocessor-for-images-v2.ipynb (61%)
 create mode 100755 
community-artifacts/Deep-learning/Train-multiple-models/AutoML-MLP-v1.ipynb
 create mode 100755 
community-artifacts/Deep-learning/Train-multiple-models/Define-model-configurations-v2.ipynb
 rename community-artifacts/Deep-learning/{ => 
Train-multiple-models}/MADlib-Keras-model-selection-CNN-cifar10-v1.ipynb (100%)
 create mode 100644 
community-artifacts/Deep-learning/Train-multiple-models/MADlib-Keras-model-selection-MLP-v1.ipynb
 create mode 100644 
community-artifacts/Deep-learning/Train-single-model/MADlib-Keras-MLP-v2.ipynb
 rename community-artifacts/Deep-learning/{ => 
Train-single-model}/MADlib-Keras-cifar10-cnn-v3.ipynb (99%)
 create mode 100644 
community-artifacts/Deep-learning/Train-single-model/MADlib-Keras-cifar10-inference-v1.ipynb
 rename community-artifacts/Deep-learning/{ => 
Train-single-model}/MADlib-Keras-imagenet-inference-v1.ipynb (99%)
 mode change 100644 => 100755
 rename community-artifacts/Deep-learning/{ => 
Train-single-model}/MADlib-Keras-transfer-learning-v3.ipynb (68%)
 rename community-artifacts/Deep-learning/{ => Utilities}/Load-images-v1.ipynb 
(83%)
 rename community-artifacts/Deep-learning/{ => 
Utilities}/madlib_image_loader.py (100%)
 delete mode 100644 
community-artifacts/Deep-learning/automl/hyperband-diag-cifar10-v1.ipynb


[madlib] branch master updated: release notes for 1dot18dot0

2021-03-10 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
 new c6a5883  release notes for 1dot18dot0
c6a5883 is described below

commit c6a5883e193a8f89d1b29dd0317f7976e7a969fa
Author: Frank McQuillan 
AuthorDate: Tue Mar 9 11:19:26 2021 -0800

release notes for 1dot18dot0
---
 RELEASE_NOTES | 52 
 1 file changed, 52 insertions(+)

diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 030d28c..918cdf4 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -10,6 +10,58 @@ commit history located at 
https://github.com/apache/madlib/commits/master.
 Current list of bugs and issues can be found at 
https://issues.apache.org/jira/browse/MADLIB.
 
 —-
+MADlib v1.18.0:
+
+Release Date: 2021-Mar-16
+
+New features
+- DL: setup methods for grid search and random search (MADLIB-1439)  
+- DL: Add support for custom loss functions (MADLIB-1441) 
+- DL: Hyperband phase 1 - print run schedule (MADLIB-1445)
+- DL: Hyperband phase 2 - generate MST table (MADLIB-1446)
+- DL: Hyperband phase 3 - logic for diagonal runs (MADLIB-1447)   
+- DL: Hyperband phase 4 - implement full logic with default params 
(MADLIB-1448)  
+- DL: Hyperband phase 5 - implement full logic with optional params 
(MADLIB-1449) 
+- AutoML: add Hyperopt for deep learning (MADLIB-1453)
+- DL: Add Multiple input/output support to load, fit, and evaluate 
(MADLIB-1457)  
+- DL: Add multiple input/output support on advanced features (MADLIB-1458) 
   
+- DL: add caching param to autoML interface (MADLIB-1461) 
+- DL: Add support for TensorBoard (MADLIB-1474)
+- DBSCAN clustering algo - phase 1 (MADLIB-1017)  
+
+Improvements:
+- DL: cache data to speed training (MADLIB-1427) 
+- DL: reduce GPU idle time between hops (MADLIB-1428)
+- DL: utility to load and delete custom Python functions (MADLIB-1429)   
+- DL: support custom loss functions (MADLIB-1432)
+- DL: support custom metrics (MADLIB-1433)   
+- DL: Fit multiple does not print timing for validation evaluate 
(MADLIB-1462)   
+- DL: Fix gpu_memory_fraction for distribution_policy != 'all_segments' 
(MADLIB-1463) 
+- DL: add object table info in load MST table utility function 
(MADLIB-1430) 
+- DL: improve speed of evaluate for multiple model training (MADLIB-1431)  
  
+- DL: improve existing grid search method (MADLIB-1440)
+- DL: Remove dependency on keras (MADLIB-1450)
+- DL: Improve output of predict (MADLIB-1451) 
+- DL: Add top n to evaluate() (MADLIB-1452)
+- DL - Write best so far to console for autoML methods (MADLIB-1454)  
+- Do not try to drop output tables (MADLIB-1442)
+- Prevent an "integer out of range" exception in linear regression train 
(MADLIB-1460)
+
+Bug fixes:
+- DL: Fix fit_multiple when output_table or mst_table is passed as NULL 
(MADLIB-1464) 
+- DL: Iris predict accuracy has regressed (MADLIB-1465)   
+- DL: madlib_keras_fit_multiple_model goes down with an IndexError: tuple 
index out of range (MADLIB-1467)
+- DL: Crash in fit_multiple when any model reaches loss=nan (MADLIB-1443) 
+- DL: BYOM fails at get_num_classes (MADLIB-1472) 
+- DL: Hyperband cumulative output time is not correct (MADLIB-1456)  
+- check bigint support for all graph methods (MADLIB-1444)   
+- MLP: weights param not working (MADLIB-1471)  
+
+Other:
+- Create build trigger jobs on cloudbees (MADLIB-1466)
+
+
+—-
 MADlib v1.17.0:
 
 Release Date: 2020-Mar-31



[madlib] branch master updated: clarify example in user docs for loading model arch

2021-03-04 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
 new 7eeb29c  clarify example in user docs for loading model arch
7eeb29c is described below

commit 7eeb29c6827ff6968e9533536d6f32f8bc6de3c8
Author: Frank McQuillan 
AuthorDate: Thu Mar 4 15:41:33 2021 -0800

clarify example in user docs for loading model arch
---
 .../deep_learning/keras_model_arch_table.sql_in| 46 +-
 1 file changed, 28 insertions(+), 18 deletions(-)

diff --git 
a/src/ports/postgres/modules/deep_learning/keras_model_arch_table.sql_in 
b/src/ports/postgres/modules/deep_learning/keras_model_arch_table.sql_in
index ee30f94..0c099e0 100644
--- a/src/ports/postgres/modules/deep_learning/keras_model_arch_table.sql_in
+++ b/src/ports/postgres/modules/deep_learning/keras_model_arch_table.sql_in
@@ -237,8 +237,15 @@ output table 'iris_model' from a previous run
 of 'madlib_keras_fit()' :
 
 UPDATE model_arch_library SET model_weights = model_weights FROM iris_model 
WHERE model_id = 2;
+SELECT model_id, name, description, (model_weights IS NOT NULL) AS 
has_model_weights FROM model_arch_library ORDER BY model_id;
 
-To load weights from Keras using a PL/Python function,
+
+ model_id |  name  | description | has_model_weights 
+--++-+---
+1 | Sophie | A simple model  | f
+2 | Maria  | Also a simple model | t
+
+-# To load weights from Keras using a PL/Python function,
 we need to flatten then serialize the weights to store
 as a PostgreSQL binary data type. Byte format is more
 efficient on space and memory compared to a numeric array.
@@ -273,15 +280,16 @@ plpy.execute(load_query, [model.to_json(), weights_bytea])
 $$ language plpythonu;
 -- Call load function
 SELECT load_weights();
--- Check weights loaded OK
-SELECT COUNT(*) FROM model_arch_library WHERE model_weights IS NOT NULL;
+SELECT model_id, name, description, (model_weights IS NOT NULL) AS 
has_model_weights FROM model_arch_library ORDER BY model_id;
 
 
- count
+
- 1
+ model_id |  name  | description | has_model_weights 
+--++-+---
+1 | Sophie | A simple model  | f
+2 | Maria  | Also a simple model | t
+3 | Ella   | Model x | t
 
-Load weights from Keras using psycopg2.  (Psycopg is a PostgreSQL database 
adapter for the
+-# Load weights from Keras using psycopg2.  (Psycopg is a PostgreSQL database 
adapter for the
 Python programming language.) As above we need to flatten then serialize the 
weights to store as a
 PostgreSQL binary data type.  Note that the psycopg2.Binary function used 
below will increase the size of the
 Python object for the weights, so if your model is large it might be better to 
use a PL/Python function as above.
@@ -310,27 +318,29 @@ weights_bytea = psycopg2.Binary(weights1d.tostring())
 query = "SELECT madlib.load_keras_model('model_arch_library', %s,%s)"
 cur.execute(query,[model.to_json(),weights_bytea])
 conn.commit()
-
-From SQL check if weights loaded OK:
-
-SELECT COUNT(*) FROM model_arch_library WHERE model_weights IS NOT NULL;
+SELECT model_id, name, description, (model_weights IS NOT NULL) AS 
has_model_weights FROM model_arch_library ORDER BY model_id;
 
 
- count
+
- 2
+ model_id |  name  | description | has_model_weights 
+--++-+---
+1 | Sophie | A simple model  | f
+2 | Maria  | Also a simple model | t
+3 | Ella   | Model x | t
+4 | Grace  | Model y | t
 
 -# Delete one of the models:
 
 SELECT madlib.delete_keras_model('model_arch_library',   -- Output table
   1  -- Model id
 );
-SELECT COUNT(*) FROM model_arch_library;
+SELECT model_id, name, description, (model_weights IS NOT NULL) AS 
has_model_weights FROM model_arch_library ORDER BY model_id;
 
 
- count
+
- 2
+ model_id | name  | description | has_model_weights 
+--+---+-+---
+2 | Maria | Also a simple model | t
+3 | Ella  | Model x | t
+4 | Grace | Model y | t
 
 
 @anchor related



[madlib] branch master updated: clarify input row weights vs network weights in user docs for MLP

2021-03-08 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
 new fe1c1f5  clarify input row weights vs network weights in user docs for 
MLP
fe1c1f5 is described below

commit fe1c1f5915cc7c5c0dfa7422e3b6a7713402524f
Author: Frank McQuillan 
AuthorDate: Mon Mar 8 15:35:08 2021 -0800

clarify input row weights vs network weights in user docs for MLP
---
 src/ports/postgres/modules/convex/mlp.sql_in | 26 ++
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/src/ports/postgres/modules/convex/mlp.sql_in 
b/src/ports/postgres/modules/convex/mlp.sql_in
index d6ce7ce..d98f8c4 100644
--- a/src/ports/postgres/modules/convex/mlp.sql_in
+++ b/src/ports/postgres/modules/convex/mlp.sql_in
@@ -152,19 +152,20 @@ mlp_classification(
 
   weights (optional)
   TEXT, default: 1.
-Weights for input rows. Column name which specifies the weight for each 
input row.
-This weight will be incorporated into the update during stochastic gradient
-descent (SGD), but will not be used for loss calculations. If not 
specified,
- weight for each row will default to 1 (equal weights).  Column should be a
-  numeric type.
+Column name for giving different weights to different rows during training.
+E.g., a weight of two for a specific row is equivalent to duplicating that 
row.
+This weight is incorporated into the update during stochastic gradient
+descent (SGD), but is not used for loss calculations. If not specified,
+weight for each row will default to 1 (equal weights).  Column should be a
+numeric type.
 @note
-The 'weights' parameter is not currently for mini-batching.
+The 'weights' parameter cannot be used if you use mini-batching of the 
source dataset.
   
 
   warm_start (optional)
   BOOLEAN, default: FALSE.
-Initalize weights with the coefficients from the last call of the training
-function. If set to true, weights will be initialized from the output_table
+Initialize neural network weights with the coefficients from the last call 
of the training
+function. If set to true, neural network weights will be initialized from 
the output_table
 generated by the previous run. Note that all parameters other than
 optimizer_params and verbose must remain constant between calls when
 warm_start is used.
@@ -173,7 +174,7 @@ mlp_classification(
 The warm start feature works based on the name of the output_table.
 When using warm start, do not drop the output table or the output table 
summary
 before calling the training function, since these are needed to obtain the
-weights from the previous run.
+neural network weights from the previous run.
 If you are not using warm start, the output table and the output table
 summary must be dropped in the usual way before calling the training 
function.
 
@@ -294,7 +295,8 @@ A summary table named \_summary is also 
created, which has the fo
 
 
 weights
-The weight column used during training.
+The weight column used during training for giving different
+weights to different rows.
 
 
 grouping_col
@@ -421,7 +423,7 @@ a factor of gamma.  Valid for learning rate policy = 'step'.
 
 n_tries
 Default: 1. Number of times to retrain the network with randomly 
initialized
-weights.
+neural network weights.
 
 
 lambda
@@ -954,7 +956,7 @@ num_iterations | 450
 
 Notice that the loss is lower compared to the previous example, despite
 having the same values for every other parameter. This is because the algorithm
-learnt three different models starting with a different set of initial weights
+learned three different models starting with a different set of initial weights
 for the coefficients, and chose the best model among them as the initial
 weights for the coefficients when run with warm start.
 



[madlib] branch master updated: update example in multi-fit to use new model config generator

2021-03-05 Thread fmcquillan
This is an automated email from the ASF dual-hosted git repository.

fmcquillan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
 new 33ad16c  update example in multi-fit to use new model config generator
33ad16c is described below

commit 33ad16c29af1e99a02a8a153671a9a16608e74c6
Author: Frank McQuillan 
AuthorDate: Fri Mar 5 16:54:34 2021 -0800

update example in multi-fit to use new model config generator
---
 .../madlib_keras_fit_multiple_model.sql_in | 69 --
 1 file changed, 37 insertions(+), 32 deletions(-)

diff --git 
a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
 
b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
index 67ee2c7..e8c4d51 100644
--- 
a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
+++ 
b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
@@ -999,41 +999,44 @@ $$
'MLP with 2 hidden layers'   -- Descr
 );
 
--# Define model selection tuples and load.  Select the model(s) from the model 
architecture
-table that you want to run, along with the compile and fit parameters. 
Combinations will be
-created for the set of model selection parameters will be loaded:
+-# Generate model configurations using grid search. The output table for grid 
+search contains the unique combinations of model architectures, compile and 
+fit parameters.
 
 DROP TABLE IF EXISTS mst_table, mst_table_summary;
-SELECT madlib.load_model_selection_table('model_arch_library', -- model 
architecture table
- 'mst_table',  -- model 
selection table output
-  ARRAY[1,2],  -- model 
ids from model architecture table
-  ARRAY[   -- compile 
params
-  
$$loss='categorical_crossentropy',optimizer='Adam(lr=0.1)',metrics=['accuracy']$$,
-  
$$loss='categorical_crossentropy', 
optimizer='Adam(lr=0.01)',metrics=['accuracy']$$,
-  
$$loss='categorical_crossentropy',optimizer='Adam(lr=0.001)',metrics=['accuracy']$$
-  ],
-  ARRAY[-- fit 
params
-  $$batch_size=4,epochs=1$$,
-  $$batch_size=8,epochs=1$$
-  ]
+SELECT madlib.generate_model_configs(
+'model_arch_library', -- model 
architecture table
+'mst_table',  -- model 
selection table output
+ ARRAY[1,2],  -- model ids 
from model architecture table
+ $$
+{'loss': 
['categorical_crossentropy'],
+ 'optimizer_params_list': [ 
{'optimizer': ['Adam'], 'lr': [0.001, 0.01, 0.1]} ],
+ 'metrics': ['accuracy']}
+ $$,  -- 
compile_param_grid
+ $$
+ { 'batch_size': [4, 8],
+   'epochs': [1]
+ }
+ $$,  -- fit_param_grid
+ 'grid'   -- search_type
  );
 SELECT * FROM mst_table ORDER BY mst_key;
 
 
- mst_key | model_id | compile_params   
   |  fit_params
+ mst_key | model_id | compile_params   
   |  fit_params   
 
-+--+-+---
-   1 |1 | 
loss='categorical_crossentropy',optimizer='Adam(lr=0.1)',metrics=['accuracy']   
| batch_size=4,epochs=1
-   2 |1 | 
loss='categorical_crossentropy',optimizer='Adam(lr=0.1)',metrics=['accuracy']   
| batch_size=8,epochs=1
-   3 |1 | loss='categorical_crossentropy', 
optimizer='Adam(lr=0.01)',metrics=['accuracy'] | batch_size=4,epochs=1
-   4 |1 | loss='categorical_crossentropy', 
optimizer='Adam(lr=0.01)',metrics=['accuracy'] | batch_size=8,epochs=1
-   5 |1 | 
loss='categorical_crossentropy',optimizer='Adam(lr=0.001)',metrics=['accuracy'] 
| batch_size=4,epochs=1
-   6