[12/18] madlib-site git commit: update jupyter notebooks for 1dot15
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/KNN-v4.ipynb -- diff --git a/community-artifacts/KNN-v4.ipynb b/community-artifacts/KNN-v4.ipynb new file mode 100644 index 000..a4b3304 --- /dev/null +++ b/community-artifacts/KNN-v4.ipynb @@ -0,0 +1,857 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# k-Nearest Neighbors\n", +"Finds k nearest data points to a given data point and outputs majority vote value of output classes in case of classification, and average value of target values in case of regression. KNN was first added in MADlib 1.10 with updates in 1.13 and 1.14." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ +{ + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", + " \"You should import from traitlets.config instead.\", ShimWarning)\n", + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", + " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: gpadmin@madlib'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum Database 5.4.0 on GCP (demo machine)\n", +"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", +"\n", +"# PostgreSQL local\n", +"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", +"\n", +"# Greenplum Database 4.3.10.0\n", +"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" + ] + }, + { + "cell_type": "code", + 
"execution_count": 67, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "version\n", + "\n", + "\n", + "MADlib version: 1.14-dev, git revision: rc/1.13-rc1-12-gb8a306e, cmake configuration time: Mon Feb 12 19:57:54 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n", + "\n", + "" + ], + "text/plain": [ + "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-12-gb8a306e, cmake configuration time: Mon Feb 12 19:57:54 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%sql select madlib.version();\n", +"#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# 1. Load data for classification" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "9 rows affected.\n", + "9 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "id\n", + "data\n", + "label\n", + "\n", + "\n", + "1\n", + "[1, 1]\n", + "1\n", + "\n", + "\n", + "2\n", + "[2, 2]\n", + "1\n", + "\n", + "\n", + "3\n", + "[3, 3]\n", + "1\n", + "\n", + "\n", + "4\n", + "[4, 4]\n", + "1\n", + "\n", + "\n", + "5\n", + "[4, 5]\n", + "1\n", + "\n", + "\n", + "6\n", + "[20, 50]\n", + "0\n", + "\n", + "\n", + "7\n", + "[10, 31]\n", + "0\n", + "\n", + "\n", + "8\n", + "[81, 13]\n", + "0\n", + "\n", + "\n", + "9\n", + "[1, 111]\n", + "0\n", + "\n", + "" + ], + "text/plain
[08/18] madlib-site git commit: update jupyter notebooks for 1dot15
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/Random-forest-v1.ipynb -- diff --git a/community-artifacts/Random-forest-v1.ipynb b/community-artifacts/Random-forest-v1.ipynb deleted file mode 100644 index bac8363..000 --- a/community-artifacts/Random-forest-v1.ipynb +++ /dev/null @@ -1,2899 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# Random forest\n", -"\n", -"Random forests build an ensemble of classifiers, each of which is a tree model constructed using bootstrapped samples from the input data. The results of these models are then combined to yield a single prediction, which, at the expense of some loss in interpretation, have been found to be highly accurate.\n", -"\n", -"Please also refer to the decision tree user documentation for information relevant to the implementation of random forests in MADlib." - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": {}, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "The sql extension is already loaded. 
To reload it, use:\n", - " %reload_ext sql\n" - ] -} - ], - "source": [ -"%load_ext sql" - ] - }, - { - "cell_type": "code", - "execution_count": 73, - "metadata": {}, - "outputs": [ -{ - "data": { - "text/plain": [ - "u'Connected: gpadmin@madlib'" - ] - }, - "execution_count": 73, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"# Greenplum Database 5.4.0 on GCP (demo machine)\n", -"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", -"\n", -"# PostgreSQL local\n", -"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", -"\n", -"# Greenplum Database 4.3.10.0\n", -"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "metadata": {}, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "1 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "version\n", - "\n", - "\n", - "MADlib version: 1.14-dev, git revision: rc/1.13-rc1-40-ga1360f3, cmake configuration time: Wed Mar 28 18:16:08 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n", - "\n", - "" - ], - "text/plain": [ - "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-40-ga1360f3, cmake configuration time: Wed Mar 28 18:16:08 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" - ] - }, - "execution_count": 75, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"%sql select madlib.version();\n", -"#%sql select version();" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# Random forest classification examples" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# 1. Load data\n", -"Data set related to whether to play golf or not." 
- ] - }, - { - "cell_type": "code", - "execution_count": 76, - "metadata": {}, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n", - "Done.\n", - "14 rows affected.\n", - "14 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "id\n", - "OUTLOOK\n", - "temperature\n", - "humidity\n", - "Temp_Humidity\n", - "clouds_airquality\n", - "windy\n", - "class\n", - "\n", - "\n", - "1\n", - "sunny\n", - "85.0\n", - "85.0\n", - "[85.0, 85.0]\n", - "[u'none', u'unhealthy']\n", - "False\n", - "Don't Play\n", - "\n", - "\n", - "2\n", - "sunny\n", - "80.0\n", - "90.0\n", - "[80.0, 90.0]\n", - "[u'none', u'moderate']\n", - "True\n", - "Don't Play\n", - "\n", - "\n", - "3\n", - "overcast\n", - "83.0\n", - "78.0\n", - "[83.0, 78.0]\n", - "[u'low', u'moderate']\n", - "False\n", - "Play\n", - "\n", - "\n", - "4\n", - "rain\n", -
[17/18] madlib-site git commit: update jupyter notebooks for 1dot15
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/Covariance-and-correlation-v1.ipynb -- diff --git a/community-artifacts/Covariance-and-correlation-v1.ipynb b/community-artifacts/Covariance-and-correlation-v1.ipynb new file mode 100644 index 000..aa17628 --- /dev/null +++ b/community-artifacts/Covariance-and-correlation-v1.ipynb @@ -0,0 +1,1318 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Covariance and Correlation\n", +"\n", +"Generates a covariance or Pearson correlation matrix for pairs of numeric columns in a table. Grouping added in 1.15." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { +"scrolled": true + }, + "outputs": [ +{ + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", + " \"You should import from traitlets.config instead.\", ShimWarning)\n", + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", + " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: gpadmin@madlib'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum Database 5.4.0 on GCP (demo machine)\n", +"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", +"\n", +"# PostgreSQL local\n", +"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", +"\n", +"# Greenplum Database 4.3.10.0\n", +"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" + ] + }, + { + "cell_type": "code", + 
"execution_count": 3, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "version\n", + "\n", + "\n", + "MADlib version: 1.15-dev, git revision: rc/1.14-rc1-6-g3b80a32, cmake configuration time: Wed May 16 19:29:52 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n", + "\n", + "" + ], + "text/plain": [ + "[(u'MADlib version: 1.15-dev, git revision: rc/1.14-rc1-6-g3b80a32, cmake configuration time: Wed May 16 19:29:52 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%sql select madlib.version();\n", +"#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# 1. Load data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "53 rows affected.\n", + "53 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "id\n", + "outlook\n", + "temperature\n", + "humidity\n", + "windy\n", + "class\n", + "day\n", + "\n", + "\n", + "1\n", + "sunny\n", + "85.0\n", + "85.0\n", + "false\n", + "Dont Play\n", + "Mon\n", + "\n", + "\n", + "2\n", + "sunny\n", + "80.0\n", + "90.0\n", + "true\n", + "Dont Play\n", + "Mon\n", + "\n", + "\n", + "3\n", + "overcast\n", + "83.0\n", + "78.0\n", + "false\n", + "Play\n", + "Mon\n", + "\n", + "\n", + "4\n", + "rain\n", + "70.0\n", + "96.0\n", + "false\n", + "Play\n", + "Mon\n", + "\n", + "\n", + "5\n", + "rain
[13/18] madlib-site git commit: update jupyter notebooks for 1dot15
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/Elastic-net-v3.ipynb -- diff --git a/community-artifacts/Elastic-net-v3.ipynb b/community-artifacts/Elastic-net-v3.ipynb new file mode 100644 index 000..7592fe6 --- /dev/null +++ b/community-artifacts/Elastic-net-v3.ipynb @@ -0,0 +1,2049 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Elastic net (MADlib v1.10+)\n", +"Demonstrates elastic net, including these updates:\n", +"- in MADlib 1.10: grouping and cross validation introduced \n", +"- in MADlib 1.13: report negative root mean squared error instead of the negative mean squared error" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "The sql extension is already loaded. To reload it, use:\n", + " %reload_ext sql\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: gpadmin@madlib'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum Database 5.4.0 on GCP (demo machine)\n", +"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", +"\n", +"# PostgreSQL local\n", +"#%sql postgresql://fmcquillan@localhost:5432/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "version\n", + "\n", + "\n", + "MADlib version: 1.15-dev, git revision: rc/1.14-rc1-23-gabafa66, cmake configuration time: Wed Jul 11 00:36:05 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n", + "\n", + "" + ], + "text/plain": [ + "[(u'MADlib version: 1.15-dev, 
git revision: rc/1.14-rc1-23-gabafa66, cmake configuration time: Wed Jul 11 00:36:05 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%sql select madlib.version();\n", +"#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"## 1. Create data set\n", +"House prices and characteristics." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "27 rows affected.\n", + "27 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "id\n", + "tax\n", + "bedroom\n", + "bath\n", + "price\n", + "size\n", + "lot\n", + "zipcode\n", + "\n", + "\n", + "1\n", + "590\n", + "2\n", + "1.0\n", + "5\n", + "770\n", + "22100\n", + "94301\n", + "\n", + "\n", + "2\n", + "1050\n", + "3\n", + "2.0\n", + "85000\n", + "1410\n", + "12000\n", + "94301\n", + "\n", + "\n", + "3\n", + "20\n", + "3\n", + "1.0\n", + "22500\n", + "1060\n", + "3500\n", + "94301\n", + "\n", + "\n", + "4\n", + "870\n", + "2\n", + "2.0\n", + "9\n", + "1300\n", + "17500\n", + "94301\n", + "\n", + "\n", + "5\n", + "1320\n", + "3\n", + "2.0\n", + "133000\n", + "1500\n", + "3\n", + "94301\n", + "\n", + "\n", + "6\n", + "1350\n", + "2\n", + "1.0\n", + "90500\n", + "820\n", + "25700\n", + "94301\n", + "
[09/18] madlib-site git commit: update jupyter notebooks for 1dot15
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/Novelty-detection-demo-1.ipynb -- diff --git a/community-artifacts/Novelty-detection-demo-1.ipynb b/community-artifacts/Novelty-detection-demo-1.ipynb deleted file mode 100755 index 563bda4..000 --- a/community-artifacts/Novelty-detection-demo-1.ipynb +++ /dev/null @@ -1,478 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# Novelty detection using 1-class SVM\n", -"\n", -"Classifies new data as similar or different to the training set. This method is an unsupervised method that builds a decision boundary between the data and origin in kernel space and can be used as a novelty detector." - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": { -"collapsed": false - }, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "The sql extension is already loaded. To reload it, use:\n", - " %reload_ext sql\n" - ] -} - ], - "source": [ -"# Setup\n", -"%load_ext sql\n", -"# %sql postgresql://gpdbchina@10.194.10.68:55000/madlib\n", -"%sql postgresql://fmcquillan@localhost:5432/madlib\n", -"%matplotlib inline\n", -"\n", -"import pandas as pd\n", -"import numpy as np\n", -"import matplotlib.pyplot as plt\n", -"import matplotlib.font_manager" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": { -"collapsed": false - }, - "outputs": [ -{ - "data": { - "image/png": 
"iVBORw0KGgoNSUhEUgAAAW8AAAD7CAYAAAClvBX1BHNCSVQICAgIfAhkiAlwSFlz\nAAALEgAACxIB0t1+/AAAHdNJREFUeJzt3W9wXNWZ5/HvkWWRTrADso2J48SAnRk2hICMijJFaqVN\n0mqGqWhG0hvCwDSwi3Zq+WOsNiiOCHFheRUnESQwM8WYsEhhirCVYTUjZid9LZKSqkSF7LA2lJeB\nAHaGTUIYYpydGOiJsHX2xbndarW69cfq7tu3+/ep6qL76va9R23z+PRznnOOsdYiIiLhUhd0A0RE\nZOkUvEVEQkjBW0QkhBS8RURCSMFbRCSEFLxFREKovlw3MsaoJlFE5DRYa03usbL2vK21gT6++tWv\nBt6GSnnos9Bnoc8iHJ9FIUqbiIiEkIK3iEgI1VTwbm1tDboJFUOfxQx9FjP0Wcyo9M/CzJdTKeqN\njLHlupeISLUwxmCDHrAUEZHiUPAWEQkhBW8RkRBS8BYRCSEFbxGREFLwFhEJIQVvEZEQUvAWEQkh\nBW8RkRAKffD2PI+utja62trwPC/o5oiIlEWop8d7nke8o4N9qRQAvZEIwyMjxGKxot5HRCQohabH\nhzp4d7W10T42Rtx/PQyMRqM8eeBAUe8jIhKUkq9tYoypM8YcNMaMFuuaIiKSXzG3QdsO/BOwuojX\nnFd3IkF8chKy0yaJRLluLyISmKL0vI0xG4Grge8U43qLFYvFGB4ZYTQaZTQaVb5bQk2D77IURcl5\nG2O+D+wFPgwkrLXtec7Ret4iBWjwXQoplPNedtrEGPOHwL9Ya583xrQCc26Stnv37szz1tbWit+p\nQqRc9g8Osi+Vygy+k0qxf3BQwbsGjY+PMz4+vuB5y+55G2P+K3AdcBKIAKuA/2Gt/dOc89TzFilA\nlVNSSFlKBY0xLShtIrJkSptIISVLm4jI8qUH3/cPD gIwnEgocMu8Qj1JR0Sk2mkDYhGRKqLgLSIS\nQgreIiIhpOAtIhJCCt4iIiGk4C0iUkKlWrNGpYIiIiVSjMlXVbkZg4hIJSvGsgeq8xYRqSKaHi8i\nUiKl3DBGaRMRkRLyPC+zZk33aaxZo5y3iEgIKectIlJFFLxFpCy0R2dxKW0iIiWnzSZOn3LeIhIY\nbfN2+pTzFhE8z6OtrYu2ti6lLkJOdd4iNcLzPDo64qRS+wCYnIwzMjJcltRFKeuda5XSJiI1oq2t\ni7GxdshKXkSjoxw48GRZ7r/ceudapQ2IRSQvz/MYHNwPQCLRXbKgGovFFLCLSMFbpEYkEt1MTsbT\nmQsikV5aWm4LLJUiy6O0iUgNye1lDw7uDzSVIgtT2kRE5qQu0oFcwkelgiI1Zu/evaxZs4U1a7aw\nYcMqIpFeXOX1MJFIL4lEd9BNlEVQ2kSkhuzdu5e77/468IB/5Hbi8Q7eeOMEUNoBSzk9mmEpIqxZ\ns4Xjx79Cdo67sXEPb7/9WpDNknlohqWILJpmYlY+BW8JBa1IN7/FBtuenhuB20nnuOF2/9jsa3V0\nxBkba2dsrJ2Ojrg+80pkrS3Lw91KZOmSyaRdH4nYIbBDYNdHIjaZTAbdrIqRTCZtJLLewpCFIRuJ\nrC/4+SSTSbt58ydtff05dtWqj9v+/v4550Sjnf61rP8YstFoZ+b9TU0ttrFxs21qulJ/DmXgx865\nMTXfwVI8FLzldHVGo3ZoJpLYIbCd0WjQzaoY8wXbbPMF+WQyaaPRThuNdtqmppY511u16mM2Ho9b\nYxotbLOQsLDaGnO2bWpqURAvoULBW3XeIjVi164BfyalG6xMpWbqvLNnWTY03MGKFTs4dSr9zp2c\nOJFiePgJ4FbgYuAOoB5r7+PQIfd+zcwsL+W8peJ1JxJuFTpclrY3EqG7BlekK5TXTiS689Zq33DD\nDaxcuZ6VK9cTjUZ54YX/k74S 
0AU8xNGjr3LttbeQSp0PnAucy9TURk6deh94CBgF/hr4C+BDwLP+\neWcCv+c/d4FfE37KLF93vBQPlDaRZUgmk7YzGrWd0WhNfkVfKK+dnfZIJpM2Ho9bWJ053z2/0sKZ\nFtbmHE/4z8+ysM5/vm1O6sQd+4iF9VnvX28hWTBVI8uHct4i4TVfXru/v982Nm62kci5NhJZZxsb\nN1tjPphzfsIPzp/KE5Q3+wE4O2Anc4L82f41zi0Q1FfnHfyU5SsUvJU2EQmx9IzJ48e/Qir1NVIp\ny/Hjf4y19cBTWWc+A3wLl+rItQ64BvhnXKrEA2K43PhOXH77s8AQkG+i3a+Bm5mYOFicX0oWZdkD\nlsaYjcB3gfXANPCwtfaB+d8lIkuRbznXRGKYa6+9BTfVPZ519qh/bAcuDw7wsv/f7qxzD+MC8hrg\nJHAecCVwLXAO8BbwH4CngQNAJ/ATYHvWvdK59jeBnxXjV5VFKkbP+yTQY629CLgCuMUYc2ERrisi\nvlgsxsiIW641Gh1dVGVHQ0MD9fV3UV9/F5//fLM/qPkmcB2uauQRYBD4MvABXOB+BNgAvAPcCEwC\n/wnYBPwt8CX/9Xagx7/Wm6e1oJVmcS5P0dc2Mcb8LfCgtfaHOcdtse8lUuvmLjS1E9ezfpj+/rsA\nuO++RwH4whc+k1mA6ujRoxw5cgfM2s/9YeCnwDf9Y7244PwM8Aug3z/fA3ZTV/cSH/rQarZsuYCB\ngV1LKhPM3U8zEulVqWEBZVnP2xhzHnAp7ruViJRYX18fAPfdt4dU6l3AUF//fc45ZyOPPvo4R478\nM/BpAIaHn2Dz5n/HBRdcwOrVH8650mHgVVzgzk7BfA2XPpn2z/H8n+9jehpOnNjJiy++sOR2Dw7u\nz1tzruC9eEUbsDTGnAn8DbDdWvtOsa4rIrPlphv6+vp4++3XeO+9X9HXt513332XI0dW+4G7Afgz\n/3EGR45 8jrGx8zl06HncQORO//EwcEHOnQ4DbwD34QY7H8b1xK/D5dVHgThTUxeqxjsARel5G2Pq\ncYH7MWvt3xU6b/fu3Znnra2ttLa2FuP2IjUjN92Qveek53ncc88g09P34wLvy7igey6wH/gk8ANc\ndci3/StuxwX4B0hPuJnxCHMHQ3v849/yX+8E1uLy5ItXaABWYHx8nPHx8YVPzFc/uNQHrtrkvgXO\nKWElpEhtmK/ee/bPOrPWIMmeVHN2njrtj+fUd2+zsLFATfjGOceMOfu0Jk7lTiyS/CjV2ibGmCuB\nPwEOG2MO4QpBv2ytTS732iK
[03/18] madlib-site git commit: update jupyter notebooks for 1dot15
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/mlp-mnist-v2.ipynb -- diff --git a/community-artifacts/mlp-mnist-v2.ipynb b/community-artifacts/mlp-mnist-v2.ipynb deleted file mode 100644 index 3c1ad14..000 --- a/community-artifacts/mlp-mnist-v2.ipynb +++ /dev/null @@ -1,1154 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# Neural networks\n", -"\n", -"Multilayer perceptron (MLP) using the well known MNIST data set.\n", -"\n", -"Updated to include mini-batching which was added in the 1.14 release.\n", -"\n", -"# Intro" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ -{ - "data": { - "image/jpeg": "/9j/4R5fRXhpZgAATU0AKggABwESAAMBAAEAAAEaAAUBYgEbAAUB\nagEoAAMBAAIAAAExAAIccgEyAAIUjodpAAQBpNAACvyA\nAAAnEAAK/IAAACcQQWRvYmUgUGhvdG9zaG9wIENTNSBXaW5kb3dzADIwMTU6MDc6MjQgMTA6NTk6\nNTEAA6ABAAMBAAEAAKACAAQBAAACoKADAAQBAAABcwAGAQMAAwAA\nAAEABgAAARoABQEAAAEeARsABQEAAAEmASgAAwEAAgAAAgEABAEAAAEuAgIA\nBAEAAB0pAEgBSAH/2P/tAAxBZG9iZV9DTQAB/+4ADkFkb2JlAGSA\nAf/bAIQADAgICAkIDAkJDBELCgsRFQ8MDA8VGBMTFRMTGBEMDAwMDAwRDAwMDAwMDAwMDAwM\nDAwMDAwMDAwMDAwMDAwMDAENCwsNDg0QDg4QFA4ODhQUDg4ODhQRDAwMDAwREQwMDAwMDBEMDAwM\nDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwM/8AAEQgAWACgAwEiAAIRAQMRAf/dAAQACv/EAT8AAAEF\nAQEBAQEBAAMAAQIEBQYHCAkKCwEAAQUBAQEBAQEAAQACAwQFBgcICQoLEAAB\nBAEDAgQCBQcGCAUDDDMBAAIRAwQhEjEFQVFhEyJxgTIGFJGhsUIjJBVSwWIzNHKC0UMHJZJT8OHx\nY3M1FqKygyZEk1RkRcKjdDYX0lXiZfKzhMPTdePzRieUpIW0lcTU5PSltcXV5fVWZnaGlqa2xtbm\n9jdHV2d3h5ent8fX5/cRAAICAQIEBAMEBQYHBwYF 
NQEAAhEDITESBEFRYXEiEwUygZEUobFCI8FS\n0fAzJGLhcoKSQ1MVY3M08SUGFqKygwcmNcLSRJNUoxdkRVU2dGXi8rOEw9N14/NGlKSFtJXE1OT0\npbXF1eX1VmZ2hpamtsbW5vYnN0dXZ3eHl6e3x//aAAwDAQACEQMRAD8A8+yL677Bjsa4VU+1pr5c\nZ997qHex2/8AkvqVmrEDunZFjr2OqqurbVZqNjnh+/1G7fUZQ7+bf/w3p2s/PR+o9Lz+kVMuusdm\nY1h9mVSG2UknX0vXs3Pqs/kW1/8AFb0OjM9YNu2FhYSygXuL91h/wTXN9KnZs/nv0P8AwX+EVwRH\nERI1OvlP8v0WrxXEGGsL+Yfi6n1b+qNudU578dz6GkObbYfTZPZ9su2V4j/of4Sy/wDwXpLQ+tXS\nOidDbVl0Xs6hmOALNAcdriZda9u57MjZ7fRxv5j/AE3s/QrAOX1nIoi8bq2GHAlpx/3dwZLfsX/W\n/wBH/wAUhWU+ptPUdlTK2D1Qx732j6XpsrbvuZ6j2t9m5VZ8vmMwYnhiP3gYxr+t+jxf3nRx83y8\ncUoyjxEg7cMpX/e+fh/2f/VHP9PJ6hkvfY51j3AudY4+0azue7/BsRsu+uPtNW71AQL9hibOPVtd\n9JzLo3Naz9H/ADnvVhl7La/s2JS19FmjaCSywfvW2tZ77/67LX/9bUR07Ira66k1uraD6raSCA3u\n17bW73s/66rAgRHT1dZS31aEpgy19P7sfBosyxbIbtxbXab2NAYf65hz6/625W+k5WS3Os6dkOc6\nvPY7FtredwBcZoewas3svbW6t6rkV2GWFjmNGrbWw5o8fUq3OsZ/L/z1ZxaDe0Vh4FteuPaTIkcU\nG5m5vu/wPqemm2QQbutfCX9X/CZBAT9OwP8AzfH/AAXO22Y73V3s3ODi19TuxB2O/lMf7UW+trqz\nbS42AfSB+nWB/pGt+kz/AId ns/4pdD9b+gHBycfMqE09Rory2gCNrntHrV/9ublzM2UvFlbi17T7\nSNCFBDNGVgai6/rNjLy04Ue4v+qUYrLQCSId2lELQGyHNDDoDrz3Uox8ru3HyI+Fdh/6nHf/AOAf\n8Qp112Ue8si9mlbXabf+HP8A6K/z/wDBqQD7O7AT337Mr7NlDGma3NeQA0agDbE/y91aLVjV5YL2\nAG0iLGAR/wChDG/+fa0G5jWUVusJBcBHftvc7/PuR6qMmusCtjqrLoDCdH7eXP8A3m7voMTrAJMv\nlA1WxgZUIfMTo3cetwz2VUVB9bQC+x2jWNI29/5KNlYFLb6rWPDgAfc7RrgP39v57VY6b0rqTG2W\nZ9fpYEB92Y7QV/ubv9I6z6Laq/0j0LO6sy+jKq6QQPQaLG3Fv6Z1YivJZW1381Tsd6/s9/6L9Ips\nOXBkxmQPFqTED+r+7+8s5jl8+HJwyiYmgDekfV++hf0NptORfdXTV/g6LPa6I3fzbvd/nIb8zpWP\nXNFbr36tftbtAP8AJts3PZ/Yas3EzH7ybALHO/Odq6f3nOSNFlgL2PLGEk+72Nn+S523d/ZQOSNf\nq4Cz39Ulvtm6nIkDt6Ypm9YbUf1WivHIduFhm2wT/wAJb/5FA+1W+u251jrTIIJ4Gu72fu+5RdTc\n0x6lTncwXt/79tU62vYSL2+m0CZI0I/kOHtUfFM6GwB4cMf+9ZOGI1Gv1uX/AHz/AP/Q5Tp+bfhd\nQNFDt+Na4NdU8B9dlbiN1N1T5rtb+Yl9Yei04edQ2uxuLhWVCzFYSS5u91nrV/n3Xena3+d/0foq\n5iYWFTe1vrnLvYQ5lTKHtqY8wd5teTZbWzd7WVM+mq31nysPL6kxl15NePWzGZ6bdzy6ou9d3q2P\npZ7rbH/yFp5Yj27IF8VCzXp/d/uubCROb02Bwni0/wAU/wB5yrc/ErtF1Hvyg0NdkWNhriNN9dbC\n51Vm3/ 
WtTr6jk10NfXY5jQTuLX7AS76Xs9rfcosxeie5nqZFV4gtN7Q2l2vua5+P9otr9v0H7LP+\ntqX2BmVWa6wxkRse29trASf8I2WWVb/9a1XByEmqvX0w/a2CMdCwfOf7GVGRg5FzXiaskNg2D3F3\n9lv538v1N607KMT0baKrWnMsbNo8h+839/8A8+f8YsNmBkY9pY7HvfYzuGnaD/J27t3tVlvSC8Nf\nYy0thpLnVlsal7/e76Wytjv7akhOfCYmAJOh/R/lJbMR6TqOhH6X8otW3Cdj2OBvre6r6fpl7XVu\n/wBGfVrZ/nfzX/CKz07GnIrscfQfIhzSNjp/lsOyh/8AX/RP/wCCTMz8k+y5hlsltpbuewk7thkO\nd6as/YnmoZEso3fzdm4sZaZ1Yxz9/p2/29n/ABagnhE4nhvby4Wzgz+3IGVb+fE+kk9Mz/q2K7nl\n2fgsdFThHtmYdW4ek7a1eddV+r2RWW5Lsd+NjWFw9dw2tLgRuZTjndfb6e737G+/9J+4rGH1HMbf\nXiZRsqJikWD81p09LLriux1Tf33s/Rfy6lW64d/Ucm6x9hvLx7gXBhaGtb+lDHOv/M2+p9BV+X5I\nwuc5cW0K+T5dpzv9Nu81z8cg9vGOEEyy6+vh498eLh4f1fF6nOpq6I2zZZ9otaP5zIY5lZYP324l\njLPV2/ufaP0v/BqfUS/CvOLfGTjPa1+JazQmp38zbjv9zms/0mO/ez1vU/wqDk5mdG0trfXyWFu+\nQPzv0+65zW/1lp9KnqeIMWzF9O7FJtwDXvAe4/pLMT3+vt9X+ep/m6vUVqIs8EdD/d08v0uL/Ccy\nZI9ctY9fV/zv0eFp5NIOTQ3HLbW4VMbZAO8Tb7mE+/8ASWN/fVn6vY9Z6gcnqdr8XDxh6+Vbtmxw\nnayqhr/5zJybXbGfufpL/wCbpeh19LynMyLL22V22kMHqV7LPcd9n6K1zf8AO9VXX9N zMTCrxBi2\nZFYd9ozi8gMBINePW6zcfTtopc+2yqv+bfk/ziWXAcg2MRLXv/U4aX4OaGGYIIkYn6/vcTf+sX15\nd1ikYNdBqopG3GoZ7obG0b3aufZs/PXJY9jsPKrue4M2E7gDLy1w2WAbPou2O/PVi+uu
[11/18] madlib-site git commit: update jupyter notebooks for 1dot15
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/MLP-mnist-v3.ipynb -- diff --git a/community-artifacts/MLP-mnist-v3.ipynb b/community-artifacts/MLP-mnist-v3.ipynb new file mode 100644 index 000..1fa6210 --- /dev/null +++ b/community-artifacts/MLP-mnist-v3.ipynb @@ -0,0 +1,1329 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Neural networks\n", +"\n", +"Multilayer perceptron (MLP) using the well known MNIST data set.\n", +"\n", +"Updated to include mini-batching which was added in 1.14. Momentum was added in 1.15.\n", +"\n", +"# Intro" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ +{ + "data": { + "image/jpeg": "/9j/4R5fRXhpZgAATU0AKggABwESAAMBAAEAAAEaAAUBYgEbAAUB\nagEoAAMBAAIAAAExAAIccgEyAAIUjodpAAQBpNAACvyA\nAAAnEAAK/IAAACcQQWRvYmUgUGhvdG9zaG9wIENTNSBXaW5kb3dzADIwMTU6MDc6MjQgMTA6NTk6\nNTEAA6ABAAMBAAEAAKACAAQBAAACoKADAAQBAAABcwAGAQMAAwAA\nAAEABgAAARoABQEAAAEeARsABQEAAAEmASgAAwEAAgAAAgEABAEAAAEuAgIA\nBAEAAB0pAEgBSAH/2P/tAAxBZG9iZV9DTQAB/+4ADkFkb2JlAGSA\nAf/bAIQADAgICAkIDAkJDBELCgsRFQ8MDA8VGBMTFRMTGBEMDAwMDAwRDAwMDAwMDAwMDAwM\nDAwMDAwMDAwMDAwMDAwMDAENCwsNDg0QDg4QFA4ODhQUDg4ODhQRDAwMDAwREQwMDAwMDBEMDAwM\nDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwM/8AAEQgAWACgAwEiAAIRAQMRAf/dAAQACv/EAT8AAAEF\nAQEBAQEBAAMAAQIEBQYHCAkKCwEAAQUBAQEBAQEAAQACAwQFBgcICQoLEAAB\nBAEDAgQCBQcGCAUDDDMBAAIRAwQhEjEFQVFhEyJxgTIGFJGhsUIjJBVSwWIzNHKC0UMHJZJT8OHx\nY3M1FqKygyZEk1RkRcKjdDYX0lXiZfKzhMPTdePzRieUpIW0lcTU5PSltcXV5fVWZnaGlqa2xtbm\n9jdHV2d3h5ent8fX5/cRAAICAQIEBAMEBQYHBwYF 
NQEAAhEDITESBEFRYXEiEwUygZEUobFCI8FS\n0fAzJGLhcoKSQ1MVY3M08SUGFqKygwcmNcLSRJNUoxdkRVU2dGXi8rOEw9N14/NGlKSFtJXE1OT0\npbXF1eX1VmZ2hpamtsbW5vYnN0dXZ3eHl6e3x//aAAwDAQACEQMRAD8A8+yL677Bjsa4VU+1pr5c\nZ997qHex2/8AkvqVmrEDunZFjr2OqqurbVZqNjnh+/1G7fUZQ7+bf/w3p2s/PR+o9Lz+kVMuusdm\nY1h9mVSG2UknX0vXs3Pqs/kW1/8AFb0OjM9YNu2FhYSygXuL91h/wTXN9KnZs/nv0P8AwX+EVwRH\nERI1OvlP8v0WrxXEGGsL+Yfi6n1b+qNudU578dz6GkObbYfTZPZ9su2V4j/of4Sy/wDwXpLQ+tXS\nOidDbVl0Xs6hmOALNAcdriZda9u57MjZ7fRxv5j/AE3s/QrAOX1nIoi8bq2GHAlpx/3dwZLfsX/W\n/wBH/wAUhWU+ptPUdlTK2D1Qx732j6XpsrbvuZ6j2t9m5VZ8vmMwYnhiP3gYxr+t+jxf3nRx83y8\ncUoyjxEg7cMpX/e+fh/2f/VHP9PJ6hkvfY51j3AudY4+0azue7/BsRsu+uPtNW71AQL9hibOPVtd\n9JzLo3Naz9H/ADnvVhl7La/s2JS19FmjaCSywfvW2tZ77/67LX/9bUR07Ira66k1uraD6raSCA3u\n17bW73s/66rAgRHT1dZS31aEpgy19P7sfBosyxbIbtxbXab2NAYf65hz6/625W+k5WS3Os6dkOc6\nvPY7FtredwBcZoewas3svbW6t6rkV2GWFjmNGrbWw5o8fUq3OsZ/L/z1ZxaDe0Vh4FteuPaTIkcU\nG5m5vu/wPqemm2QQbutfCX9X/CZBAT9OwP8AzfH/AAXO22Y73V3s3ODi19TuxB2O/lMf7UW+trqz\nbS42AfSB+nWB/pGt+kz/AId ns/4pdD9b+gHBycfMqE09Rory2gCNrntHrV/9ublzM2UvFlbi17T7\nSNCFBDNGVgai6/rNjLy04Ue4v+qUYrLQCSId2lELQGyHNDDoDrz3Uox8ru3HyI+Fdh/6nHf/AOAf\n8Qp112Ue8si9mlbXabf+HP8A6K/z/wDBqQD7O7AT337Mr7NlDGma3NeQA0agDbE/y91aLVjV5YL2\nAG0iLGAR/wChDG/+fa0G5jWUVusJBcBHftvc7/PuR6qMmusCtjqrLoDCdH7eXP8A3m7voMTrAJMv\nlA1WxgZUIfMTo3cetwz2VUVB9bQC+x2jWNI29/5KNlYFLb6rWPDgAfc7RrgP39v57VY6b0rqTG2W\nZ9fpYEB92Y7QV/ubv9I6z6Laq/0j0LO6sy+jKq6QQPQaLG3Fv6Z1YivJZW1381Tsd6/s9/6L9Ips\nOXBkxmQPFqTED+r+7+8s5jl8+HJwyiYmgDekfV++hf0NptORfdXTV/g6LPa6I3fzbvd/nIb8zpWP\nXNFbr36tftbtAP8AJts3PZ/Yas3EzH7ybALHO/Odq6f3nOSNFlgL2PLGEk+72Nn+S523d/ZQOSNf\nq4Cz39Ulvtm6nIkDt6Ypm9YbUf1WivHIduFhm2wT/wAJb/5FA+1W+u251jrTIIJ4Gu72fu+5RdTc\n0x6lTncwXt/79tU62vYSL2+m0CZI0I/kOHtUfFM6GwB4cMf+9ZOGI1Gv1uX/AHz/AP/Q5Tp+bfhd\nQNFDt+Na4NdU8B9dlbiN1N1T5rtb+Yl9Yei04edQ2uxuLhWVCzFYSS5u91nrV/n3Xena3+d/0foq\n5iYWFTe1vrnLvYQ5lTKHtqY8wd5teTZbWzd7WVM+mq31nysPL6kxl15NePWzGZ6bdzy6ou9d3q2P\npZ7rbH/yFp5Yj27IF8VCzXp/d/uubCROb02Bwni0/wAU/wB5yrc/ErtF1Hvyg0NdkWNhriNN9dbC\n51Vm3/ 
WtTr6jk10NfXY5jQTuLX7AS76Xs9rfcosxeie5nqZFV4gtN7Q2l2vua5+P9otr9v0H7LP+\ntqX2BmVWa6wxkRse29trASf8I2WWVb/9a1XByEmqvX0w/a2CMdCwfOf7GVGRg5FzXiaskNg2D3F3\n9lv538v1N607KMT0baKrWnMsbNo8h+839/8A8+f8YsNmBkY9pY7HvfYzuGnaD/J27t3tVlvSC8Nf\nYy0thpLnVlsal7/e76Wytjv7akhOfCYmAJOh/R/lJbMR6TqOhH6X8otW3Cdj2OBvre6r6fpl7XVu\n/wBGfVrZ/nfzX/CKz07GnIrscfQfIhzSNjp/lsOyh/8AX/RP/wCCTMz8k+y5hlsltpbuewk7thkO\nd6as/YnmoZEso3fzdm4sZaZ1Yxz9/p2/29n/ABagnhE4nhvby4Wzgz+3IGVb+fE+kk9Mz/q2K7nl\n2fgsdFThHtmYdW4ek7a1eddV+r2RWW5Lsd+NjWFw9dw2tLgRuZTjndfb6e737G+/9J+4rGH1HMbf\nXiZRsqJikWD81p09LLriux1Tf33s/Rfy6lW64d/Ucm6x9hvLx7gXBhaGtb+lDHOv/M2+p9BV+X5I\nwuc5cW0K+T5dpzv9Nu81z8cg9vGOEEyy6+vh498eLh4f1fF6nOpq6I2zZZ9otaP5zIY5lZYP324l\njLPV2/ufaP0v/BqfUS/CvOLfGTjPa1+JazQmp38zbjv9zms/0mO/ez1vU/wqDk5mdG0trfXyWFu+\nQPzv0+65zW/1lp9KnqeIMWzF9O7FJtwDXvAe4/pLMT3+vt9X+ep/m6vUVqIs8EdD/d08v0uL/Ccy\nZI9ctY9fV/zv0eFp5NIOTQ3HLbW4VMbZAO8Tb7mE+/8ASWN/fVn6vY9Z6gcnqdr8XDxh6+Vbtmxw\nnayqhr/5zJybXbGfufpL/wCbpeh19LynMyLL22V22kMHqV7LPcd9n6K1zf8AO9VXX9N zMTCrxBi2\nZFYd9ozi8gMBINePW6zcfTtopc+2yqv+bfk/ziWXAcg2MRLXv/U4aX4OaGGYIIkYn6/vcTf+sX15\nd1ikYNdBqopG3GoZ7obG0b3aufZs/PXJY9jsPKrue4M2E7gDLy1w2W
[14/18] madlib-site git commit: update jupyter notebooks for 1dot15
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/Elastic-net-v2.ipynb -- diff --git a/community-artifacts/Elastic-net-v2.ipynb b/community-artifacts/Elastic-net-v2.ipynb deleted file mode 100644 index b6082f0..000 --- a/community-artifacts/Elastic-net-v2.ipynb +++ /dev/null @@ -1,2078 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# Elastic net (MADlib v1.10+)\n", -"Demonstrates elastic net, including these updates:\n", -"- in MADlib 1.10: grouping and cross validation which were introduced \n", -"- in MADlib 1.13: report negative root mean squared error instead of the negative mean squared error" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ -{ - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", - " \"You should import from traitlets.config instead.\", ShimWarning)\n", - "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", - " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" - ] -} - ], - "source": [ -"%load_ext sql" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ -{ - "data": { - "text/plain": [ - "u'Connected: gpdbchina@madlib'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"# Greenplum 4.3.10.0\n", -"%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n", -"\n", -"# PostgreSQL local\n", -"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", -"\n", -"# Greenplum 4.2.3.0\n", -"#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib" - ] - }, - { - "cell_type": "code", - 
"execution_count": 3, - "metadata": {}, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "1 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "version\n", - "\n", - "\n", - "MADlib version: 1.13-dev, git revision: rel/v1.12-42-gedc93f5, cmake configuration time: Fri Dec 8 18:28:18 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0\n", - "\n", - "" - ], - "text/plain": [ - "[(u'MADlib version: 1.13-dev, git revision: rel/v1.12-42-gedc93f5, cmake configuration time: Fri Dec 8 18:28:18 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0',)]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"%sql select madlib.version();\n", -"#%sql select version();" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"## 1. Create data set\n", -"House prices and characteristics." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n", - "Done.\n", - "27 rows affected.\n", - "27 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "id\n", - "tax\n", - "bedroom\n", - "bath\n", - "price\n", - "size\n", - "lot\n", - "zipcode\n", - "\n", - "\n", - "1\n", - "590\n", - "2\n", - "1.0\n", - "5\n", - "770\n", - "22100\n", - "94301\n", - "\n", - "\n", - "2\n", - "1050\n", - "3\n", - "2.0\n", - "85000\n", - "1410\n", - "12000\n", - "94301\n", - "\n", - "\n", - "3\n", - "20\n", - "3\n", - "1.0\n", - "22500\n", - "1060\n", - "3500\n", - "94301\n", - "\n", - "\n", - "4\n", - "870\n", - "2\n", - "2.0\n", - "9\n",
[04/18] madlib-site git commit: update jupyter notebooks for 1dot15
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/Stratified-sampling-v2.ipynb -- diff --git a/community-artifacts/Stratified-sampling-v2.ipynb b/community-artifacts/Stratified-sampling-v2.ipynb new file mode 100644 index 000..daa417b --- /dev/null +++ b/community-artifacts/Stratified-sampling-v2.ipynb @@ -0,0 +1,672 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Stratified sampling\n", +"Stratified sampling is a method for sampling subpopulations (strata) independently. It is commonly used to reduce sampling error by ensuring that subgroups are adequately represented in the sample.\n", +"\n", +"Stratified sampling was added in MADlib 1.12." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { +"scrolled": true + }, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "The sql extension is already loaded. To reload it, use:\n", + " %reload_ext sql\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: gpdbchina@madlib'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum Database 5.4.0 on GCP (demo machine)\n", +"#%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", +"\n", +"# PostgreSQL local\n", +"%sql postgresql://fmcquillan@localhost:5432/madlib\n", +"\n", +"# Greenplum Database 4.3.10.0\n", +"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "version\n", + "\n", + "\n", + "MADlib version: 1.12-dev, git revision: rel/v1.11-23-gfdf7b6d, cmake configuration time: Wed Jun 28 18:06:35 UTC 2017, build 
type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0\n", + "\n", + "" + ], + "text/plain": [ + "[(u'MADlib version: 1.12-dev, git revision: rel/v1.11-23-gfdf7b6d, cmake configuration time: Wed Jun 28 18:06:35 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0',)]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%sql select madlib.version();\n", +"#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# 1. Create input table" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "25 rows affected.\n", + "25 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "id1\n", + "id2\n", + "gr1\n", + "gr2\n", + "\n", + "\n", + "1\n", + "0\n", + "1\n", + "1\n", + "\n", + "\n", + "2\n", + "0\n", + "1\n", + "1\n", + "\n", + "\n", + "3\n", + "0\n", + "1\n", + "1\n", + "\n", + "\n", + "4\n", + "0\n", + "1\n", + "1\n", + "\n", + "\n", + "5\n", + "0\n", + "1\n", + "1\n", + "\n", + "\n", + "6\n", + "0\n", + "1\n", + "1\n", + "\n", + "\n", + "7\n", + "0\n", + "1\n", + "1\n", + "\n", + "\n", + "8\n", + "0\n", + "1\n", + "1\n", + "\n", + "\n", + "9\n", + "0\n", + "1\n", + "1\n", + "\n", + "\n", + "9\n", + "0\n", + "1\n", + "1\n", + "\n", + "\n", + "9\n", + "0\n", + "
[01/18] madlib-site git commit: update jupyter notebooks for 1dot15
Repository: madlib-site Updated Branches: refs/heads/asf-site 5fa1ac070 -> acd339f65 http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/stratified-sampling-v1.ipynb -- diff --git a/community-artifacts/stratified-sampling-v1.ipynb b/community-artifacts/stratified-sampling-v1.ipynb deleted file mode 100644 index 75e02fd..000 --- a/community-artifacts/stratified-sampling-v1.ipynb +++ /dev/null @@ -1,672 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# Stratified sampling\n", -"Stratified sampling is a method for sampling subpopulations (strata) independently. It is commonly used to reduce sampling error by ensuring that subgroups are adequately represented in the sample.\n", -"\n", -"Stratified sampling was added in MADlib 1.12." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { -"scrolled": true - }, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "The sql extension is already loaded. 
To reload it, use:\n", - " %reload_ext sql\n" - ] -} - ], - "source": [ -"%load_ext sql" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ -{ - "data": { - "text/plain": [ - "u'Connected: gpdbchina@madlib'" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"# Greenplum 4.3.10.0\n", -"%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n", -"\n", -"# PostgreSQL local\n", -"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", -"\n", -"# Greenplum 4.2.3.0\n", -"#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "1 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "version\n", - "\n", - "\n", - "MADlib version: 1.12-dev, git revision: rel/v1.11-23-gfdf7b6d, cmake configuration time: Wed Jun 28 18:06:35 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0\n", - "\n", - "" - ], - "text/plain": [ - "[(u'MADlib version: 1.12-dev, git revision: rel/v1.11-23-gfdf7b6d, cmake configuration time: Wed Jun 28 18:06:35 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0',)]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"%sql select madlib.version();\n", -"#%sql select version();" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# 1. 
Create input table" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n", - "Done.\n", - "25 rows affected.\n", - "25 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "id1\n", - "id2\n", - "gr1\n", - "gr2\n", - "\n", - "\n", - "1\n", - "0\n", - "1\n", - "1\n", - "\n", - "\n", - "2\n", - "0\n", - "1\n", - "1\n", - "\n", - "\n", - "3\n", - "0\n", - "1\n", - "1\n", - "\n", - "\n", - "4\n", - "0\n", - "1\n", - "1\n", - "\n", - "\n", - "5\n", - "0\n", - "1\n", - "1\n", - "\n", - "\n", - "6\n", - "0\n", - "1\n", - "1\n", - "\n", - "\n", - "7\n", - "0\n", - "1\n", - "1\n", - "\n", - "\n", - "8\n", - "0\n", - "1\n", - "1\n", - "\n", - "\n", - "9\n", - "0\n", - "1\n", - "1\n", - "\n", - "\n", - "9\n", - "0\n", - "1\n", - "1\n", - "\n", - "\n", -
[06/18] madlib-site git commit: update jupyter notebooks for 1dot15
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/SVM-novelty-detection-v2.ipynb -- diff --git a/community-artifacts/SVM-novelty-detection-v2.ipynb b/community-artifacts/SVM-novelty-detection-v2.ipynb new file mode 100755 index 000..678d7c9 --- /dev/null +++ b/community-artifacts/SVM-novelty-detection-v2.ipynb @@ -0,0 +1,511 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Novelty detection using 1-class SVM\n", +"\n", +"Classifies new data as similar or different to the training set. This method is an unsupervised method that builds a decision boundary between the data and origin in kernel space and can be used as a novelty detector." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ +{ + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. 
You should import from traitlets.config instead.\n", + " \"You should import from traitlets.config instead.\", ShimWarning)\n", + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", + " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: gpadmin@madlib'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum Database 5.4.0 on GCP (demo machine)\n", +"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", +"\n", +"# PostgreSQL local\n", +"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", +"\n", +"# Greenplum Database 4.3.10.0\n", +"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { +"collapsed": true + }, + "outputs": [], + "source": [ +"# Setup\n", +"%matplotlib inline\n", +"\n", +"import pandas as pd\n", +"import numpy as np\n", +"import matplotlib.pyplot as plt\n", +"import matplotlib.font_manager" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ +{ + "data": { + "image/png": 
"iVBORw0KGgoNSUhEUgAAAW8AAAD7CAYAAAClvBX1BHNCSVQICAgIfAhkiAlwSFlz\nAAALEgAACxIB0t1+/AAAHfZJREFUeJzt3X9wXOV97/H3IwulS2yMZTmG4OCACOMCHiOby3DHnWsN\nYXcZOlUq6x9CSJUfjSZznZofx0RQU6IEcYkTtvnVtB6RTKzAMPQmvmrVznSPlXbEHTE3vQk2lDhQ\niIcyIQZSYXJBwyay0XP/eM6uVqtdayWt9uzZ/bxmdtgfZ88+LObjZ7/Pj2OstYiISLQ0hd0AERFZ\nPIW3iEgEKbxFRCJI4S0iEkEKbxGRCFJ4i4hEUHO1PsgYozmJIiJLYK01hc9VtedtrQ319oUvfCH0\nNtTKTd+Fvgt9F9H4LkpR2UREJIIU3iIiEdRQ4d3Z2Rl2E2qGvotZ+i5m6buYVevfhTlbTaWiH2SM\nrdZniYjUC2MMNuwBSxERqQyFt4hIBCm8RUQiSOEtIhJBCm+RMvm+T08iQU8ige/7YTdHGpxmm4iU\nwfd9eru7OZDJANAfizE8MkIymQy5ZVLvNNtEZBmGUikOZDL0Ar3AgUyGoVQq7GZFin65VFbVNqYS\nkcZV+Muld2JCv1yWSeEtUoY+z6N3YgLyyyaeF3KroiP/lwsAwS8XhffSKbxFypBMJhkeGcmVSoY9\nT8EjodKApTQ83/dzodynUF4RGvBdulIDlgpvaWgKlerRX5JLs+LhbYxpAn4KvGKt7SryusJbak5P\nIkHX2FiuFjsMjMbjHD5yJMxmieRUY6rgbcDPK3g+EREpoSLhbYzZBNwEfKcS5xOplj7Pc6USXK+7\nPxajT7NIJAIqUjYxxvwAeABYC3gqm0iUqBYrtaxU2WTZUwWNMX8IvG6tfdoY0wnM+5CsgYGB3P3O\nzs6av1KFNIZkMqnAlpoxPj7O+Pj4gsctu+dtjPkfw K3AGSAGrAH+l7X2TwqOU89bRGSRqjJV0Biz\nC5VNREQqRhtTiYjUES3SERGpYep5i4jUEYV3hWnPYhGpBoV3BWX3yegaG3NLrru7FeAiDWqlO3IK\n7wrS1VYam351SVY1OnLaz1ukAnSlGMlXjYtPKLwrSFdbaVy6UoxUm8K7gnS1FRGB6nTkNM9bpAJ0\nUQcpVKkNz3QlHZEVpt0JZSUovEVEIkgrLEVE6ojCW0QkghTeIiIRpPAWEYkghbeISAQpvEVEIkjh\nLSISQQpvkQainQ/rhxbpiDQILeGPJq2wFGlwPYmE21s6eDwMjMbjHD5yJMxmyQK0wlJEpI4ovEUa\nRJ/nuVIJrtfdH4vRt8htSn3fJ5HoIZHoUc08ZCqbiDSQ5ex86Ps+3d29ZDIHAIjF+hkZGVbNfIWp\n5i0i8/i+Tyo1BIDn9Z01iBOJHsbGuiCvah6Pj3LkyOGVb2gDKxXeupKOSAPJD+tdu7bzwAPfyvWk\nJyZ61ZOOEIW3SIMoLHuMjd0BfI1sTzqTgVRqqGR4e14fExO92St7EYv143nDVWi5FKMBS5EG4Ps+\nt9yyh0zmEuCC4PbeRZ0jmUwyMuJKJfH4KCMjLri3b/8D1q+/jO3bOzWIWUWqeYvUucIeN9wFnAY+\njZt38hCw+AFI3/fp6rqZ6enm3DlaWu5idPQRlV4qSDVvkQaVSg0Fwd2b9+xBXODGgQFaW/+Txx4r\nHtylBjVTqSGmp7cAn82de3r67KUXqRyFt0hDmgr+mQReY8eO0ZLBnd9rL2dQc3LyDRKJHsANij7x\nxFFg4dkssjgqm4hE0GKm+LnyxseZnv5q8Mw+4G3gQ8Bqmpuf4x//8W8B12uenHyDt976T95883ec\nPn2at9/+L0D2/91LiMdf4siRw0XLJs3NHk1N7zI9/XXgWeBh4JuA5oUvVamyCdbaqtzcR4nIcqXT\naRuLbbRwyMIhG4tttOl0+qzH 
r159oYUtFtotXG3hvNz7Ya1tb7/CtrRsyHvuPAuehR4L51u4Lnjc\nZlevvjD3eel02nZ07LStre22vf3q4HOus5C2sDs4lw1uh2w8vrtaX1PdCLJzXqZqtolIxMytYbuS\nRrYXXihb9piaWgtMAn8B3A604Gac9ALf4MSJt4KeeW9w+ybwU+BfgK/j6tqPAr1MTZ3LTTd9jO3b\nOwE4enSCxx77NidPvsrU1IPM1sDfW
[07/18] madlib-site git commit: update jupyter notebooks for 1dot15
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/Random-forest-v2.ipynb -- diff --git a/community-artifacts/Random-forest-v2.ipynb b/community-artifacts/Random-forest-v2.ipynb new file mode 100644 index 000..87605b7 --- /dev/null +++ b/community-artifacts/Random-forest-v2.ipynb @@ -0,0 +1,3082 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Random forest\n", +"\n", +"Random forests build an ensemble of classifiers, each of which is a tree model constructed using bootstrapped samples from the input data. The results of these models are then combined to yield a single prediction, which, at the expense of some loss in interpretation, have been found to be highly accurate.\n", +"\n", +"Please also refer to the decision tree user documentation for information relevant to the implementation of random forests in MADlib.\n", +"\n", +"This notebook includes impurity importance which was added in 1.15." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ +{ + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. 
You should import from traitlets.config instead.\n", + " \"You should import from traitlets.config instead.\", ShimWarning)\n", + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", + " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: gpadmin@madlib'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum Database 5.4.0 on GCP (demo machine)\n", +"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", +"\n", +"# PostgreSQL local\n", +"#%sql postgresql://fmcquillan@localhost:5432/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "version\n", + "\n", + "\n", + "MADlib version: 1.15-dev, git revision: rc/1.14-rc1-45-g3ab7554, cmake configuration time: Wed Aug 1 18:34:10 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n", + "\n", + "" + ], + "text/plain": [ + "[(u'MADlib version: 1.15-dev, git revision: rc/1.14-rc1-45-g3ab7554, cmake configuration time: Wed Aug 1 18:34:10 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%sql select madlib.version();\n", +"#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Random forest classification examples" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ +"# 1. Load data\n", +"Data set related to whether to play golf or not." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "14 rows affected.\n", + "14 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "id\n", + "OUTLOOK\n", + "temperature\n", + "humidity\n", + "Temp_Humidity\n", + "clouds_airquality\n", + "windy\n", + "class\n", + "\n", + "\n", + "1\n", + "sunny\n", + "85.0\n", + "85.0\n", + "[85.0, 85.0]\n", + "[u'none', u'unhealthy']\n", + "False\n", + "Don't Play\n", + "\n", + "\n", + "2\n", + "sunny\n", + "80.0\n", + "90.0\n", + "[80.0, 90.0]\n", + "[u'none', u'moderate'
[10/18] madlib-site git commit: update jupyter notebooks for 1dot15
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/MLP-v4.ipynb -- diff --git a/community-artifacts/MLP-v4.ipynb b/community-artifacts/MLP-v4.ipynb new file mode 100644 index 000..a6b62d6 --- /dev/null +++ b/community-artifacts/MLP-v4.ipynb @@ -0,0 +1,4588 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Multilayer Perceptron\n", +"\n", +"Multilayer Perceptron (MLP) is a type of neural network that can be used for regression and classification.\n", +"\n", +"This version of the workbook includes mini-batching added in 1.14 and momentum added in 1.15" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { +"scrolled": true + }, + "outputs": [ +{ + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", + " \"You should import from traitlets.config instead.\", ShimWarning)\n", + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", + " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: gpadmin@madlib'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum Database 5.4.0 on GCP (demo machine)\n", +"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", +"\n", +"# PostgreSQL local\n", +"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", +"\n", +"# Greenplum Database 4.3.10.0\n", +"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" + ] + }, + { + "cell_type": "code", + 
"execution_count": 4, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "version\n", + "\n", + "\n", + "MADlib version: 1.15-dev, git revision: rc/1.14-rc1-23-g5c4331d, cmake configuration time: Thu Jul 5 17:46:06 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n", + "\n", + "" + ], + "text/plain": [ + "[(u'MADlib version: 1.15-dev, git revision: rc/1.14-rc1-23-g5c4331d, cmake configuration time: Thu Jul 5 17:46:06 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%sql select madlib.version();\n", +"#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Classification without Mini-Batching\n", +"\n", +"# 1. Create input table for classification" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "52 rows affected.\n", + "52 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "id\n", + "attributes\n", + "class_text\n", + "class\n", + "state\n", + "\n", + "\n", + "1\n", + "[Decimal('5.0'), Decimal('3.2'), Decimal('1.2'), Decimal('0.2')]\n", + "Iris_setosa\n", + "1\n", + "Alaska\n", + "\n", + "\n", + "2\n", + "[Decimal('5.5'), Decimal('3.5'), Decimal('1.3'), Decimal('0.2')]\n", + "Iris_setosa\n", + "1\n", + "Alaska\n", + "\n", + "\n", + "3\n", + "[Decimal('4.9'), Decimal('3.1'), Decimal('1.5'), Decimal('0.1')]\n", + "Iris_setosa\n", + "1\n", + "Alaska\n", + "\n", + "\n", + "4\n", + "[Decimal('4.4'), Decimal('3.0'), Decimal('1.3'), Decimal('0.2')]\n", + "Iris_setosa\n", + "1\n", + "A
[05/18] madlib-site git commit: update jupyter notebooks for 1dot15
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/SVM-v1.ipynb -- diff --git a/community-artifacts/SVM-v1.ipynb b/community-artifacts/SVM-v1.ipynb new file mode 100644 index 000..405710d --- /dev/null +++ b/community-artifacts/SVM-v1.ipynb @@ -0,0 +1,2806 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Support Vector Machines\n", +"Support Vector Machines (SVMs) are models for regression and classification tasks. SVM models have two particularly desirable features: robustness in the presence of noisy data and applicability to a variety of data configurations. At its core, a linear SVM model is a hyperplane separating two distinct classes of data (in the case of classification problems), in such a way that the distance between the hyperplane and the nearest training data point (called the margin) is maximized. Vectors that lie on this margin are called support vectors. With the support vectors fixed, perturbations of vectors beyond the margin will not affect the model; this contributes to the model's robustness. By substituting a kernel function for the usual inner product, one can approximate a large variety of decision boundaries in addition to linear hyperplanes." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "The sql extension is already loaded. 
To reload it, use:\n", + " %reload_ext sql\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: gpadmin@madlib'" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum Database 5.4.0 on GCP (demo machine)\n", +"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", +"\n", +"# PostgreSQL local\n", +"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", +"\n", +"# Greenplum Database 4.3.10.0\n", +"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "version\n", + "\n", + "\n", + "MADlib version: 1.15-dev, git revision: rc/1.14-rc1-25-gda13eb7, cmake configuration time: Tue Jul 10 21:37:52 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n", + "\n", + "" + ], + "text/plain": [ + "[(u'MADlib version: 1.15-dev, git revision: rc/1.14-rc1-25-gda13eb7, cmake configuration time: Tue Jul 10 21:37:52 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%sql select madlib.version();" + ] + }, + { + "cell_type": "markdown", + "metadata": { +"collapsed": true + }, + "source": [ +"# Classification\n", +"# 1. 
Create input data set" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "15 rows affected.\n", + "15 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "id\n", + "tax\n", + "bedroom\n", + "bath\n", + "price\n", + "size\n", + "lot\n", + "\n", + "\n", + "1\n", + "590\n", + "2\n", + "1.0\n", + "5\n", + "770\n", + "22100\n", + "\n", + "\n", + "2\n", + "1050\n", + "3\n", + "2.0\n", + "85000\n", + "1410\n", + "12000\n", + "\n", + "\n", + "3\n", + "20\n", + "3\n", + "1.0\n", + "22500\n", + "1060\n", + "3500\n", + "\n", + "\n", + "4\n", + "870\n", + "2\n", + "2.0\n", + "
[18/18] madlib-site git commit: update jupyter notebooks for 1dot15
update jupyter notebooks for 1dot15 Project: http://git-wip-us.apache.org/repos/asf/madlib-site/repo Commit: http://git-wip-us.apache.org/repos/asf/madlib-site/commit/acd339f6 Tree: http://git-wip-us.apache.org/repos/asf/madlib-site/tree/acd339f6 Diff: http://git-wip-us.apache.org/repos/asf/madlib-site/diff/acd339f6 Branch: refs/heads/asf-site Commit: acd339f65ab5b6b9c2f95ca370cc1fb8460fd7c6 Parents: 5fa1ac0 Author: Frank McQuillan Authored: Wed Aug 1 13:13:25 2018 -0700 Committer: Frank McQuillan Committed: Wed Aug 1 13:13:25 2018 -0700 -- .../Column-vector-operations-v1.ipynb | 2553 ++ .../Covariance-and-correlation-v1.ipynb | 1318 + community-artifacts/Decision-trees-v1.ipynb | 3051 community-artifacts/Decision-trees-v2.ipynb | 3208 community-artifacts/Elastic-net-v2.ipynb| 2078 community-artifacts/Elastic-net-v3.ipynb| 2049 community-artifacts/KNN-v4.ipynb| 857 community-artifacts/MLP-mnist-v3.ipynb | 1329 + community-artifacts/MLP-v4.ipynb| 4588 ++ .../Novelty-detection-demo-1.ipynb | 478 -- community-artifacts/Random-forest-v1.ipynb | 2899 --- community-artifacts/Random-forest-v2.ipynb | 3082 .../SVM-novelty-detection-v2.ipynb | 511 ++ community-artifacts/SVM-v1.ipynb| 2806 +++ .../Stratified-sampling-v2.ipynb| 672 +++ community-artifacts/kNN-v3.ipynb| 857 community-artifacts/mlp-mnist-v2.ipynb | 1154 - community-artifacts/mlp-v3.ipynb| 4584 - .../stratified-sampling-v1.ipynb| 672 --- 19 files changed, 22973 insertions(+), 15773 deletions(-) -- http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/Column-vector-operations-v1.ipynb -- diff --git a/community-artifacts/Column-vector-operations-v1.ipynb b/community-artifacts/Column-vector-operations-v1.ipynb new file mode 100644 index 000..147b328 --- /dev/null +++ b/community-artifacts/Column-vector-operations-v1.ipynb @@ -0,0 +1,2553 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Column and vector operations\n", +"\n", +"Column and vector 
operations were added in 1.15.\n", +"\n", +"* cols2vec\n", +"* vec2cols\n", +"* drop columns" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { +"scrolled": true + }, + "outputs": [ +{ + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", + " \"You should import from traitlets.config instead.\", ShimWarning)\n", + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", + " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: gpadmin@madlib'" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum Database 5.4.0 on GCP (demo machine)\n", +"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", +"\n", +"# PostgreSQL local\n", +"#%sql postgresql://fmcquillan@localhost:5432/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "version\n", + "\n", + "\n", + "MADlib version: 1.15-dev, git revision: rc/1.14-rc1-52-g1a7c756, cmake configuration time: Tue Jul 31 20:31:52 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n", + "\n", + "" + ], + "text/plain": [ + "[(u'MADlib version: 1.15-dev, git revision: rc/1.14-rc1-52-g1a7c756, cmake configuration time: Tue Jul 31 20:31:52 UTC 2018, build type: 
release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C comp
[02/18] madlib-site git commit: update jupyter notebooks for 1dot15
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/mlp-v3.ipynb -- diff --git a/community-artifacts/mlp-v3.ipynb b/community-artifacts/mlp-v3.ipynb deleted file mode 100644 index 8c585a6..000 --- a/community-artifacts/mlp-v3.ipynb +++ /dev/null @@ -1,4584 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# Multilayer Perceptron\n", -"\n", -"Multilayer Perceptron (MLP) is a type of neural network that can be used for regression and classification.\n", -"\n", -"This version of the workbook includes mini-batching which was added in the 1.14 release." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { -"scrolled": true - }, - "outputs": [ -{ - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", - " \"You should import from traitlets.config instead.\", ShimWarning)\n", - "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", - " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" - ] -} - ], - "source": [ -"%load_ext sql" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ -{ - "data": { - "text/plain": [ - "u'Connected: gpadmin@madlib'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"# Greenplum Database 5.4.0 on GCP (demo machine)\n", -"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", -"\n", -"# PostgreSQL local\n", -"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", -"\n", -"# Greenplum Database 4.3.10.0\n", -"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" - ] - }, - { - "cell_type": "code", - 
"execution_count": 3, - "metadata": {}, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "1 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "version\n", - "\n", - "\n", - "MADlib version: 1.14-dev, git revision: rc/1.13-rc1-66-g4cced1b, cmake configuration time: Mon Apr 23 16:26:17 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n", - "\n", - "" - ], - "text/plain": [ - "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-66-g4cced1b, cmake configuration time: Mon Apr 23 16:26:17 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"%sql select madlib.version();\n", -"#%sql select version();" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# Classification without Mini-Batching\n", -"\n", -"# 1. Create input table for classification" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n", - "Done.\n", - "52 rows affected.\n", - "52 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "id\n", - "attributes\n", - "class_text\n", - "class\n", - "state\n", - "\n", - "\n", - "1\n", - "[Decimal('5.0'), Decimal('3.2'), Decimal('1.2'), Decimal('0.2')]\n", - "Iris_setosa\n", - "1\n", - "Alaska\n", - "\n", - "\n", - "2\n", - "[Decimal('5.5'), Decimal('3.5'), Decimal('1.3'), Decimal('0.2')]\n", - "Iris_setosa\n", - "1\n", - "Alaska\n", - "\n", - "\n", - "3\n", - "[Decimal('4.9'), Decimal('3.1'), Decimal('1.5'), Decimal('0.1')]\n", - "Iris_setosa\n", - "1\n", - "Alaska\n", - "\n", - "\n", - "4\n", - "[Decimal('4.4'), Decimal('3.0'), Decimal('1.3'), Decimal('0.2')]\n", - "Iris_setosa\n", - "1\n", - "A
[16/18] madlib-site git commit: update jupyter notebooks for 1dot15
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/Decision-trees-v1.ipynb -- diff --git a/community-artifacts/Decision-trees-v1.ipynb b/community-artifacts/Decision-trees-v1.ipynb deleted file mode 100644 index 02a60ef..000 --- a/community-artifacts/Decision-trees-v1.ipynb +++ /dev/null @@ -1,3051 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# Decision trees\n", -"\n", -"A decision tree is a supervised learning method that can be used for classification and regression. It consists of a structure in which internal nodes represent tests on attributes, and the branches from nodes represent the result of those tests. Each leaf node is a class label and the paths from root to leaf nodes define the set of classification or regression rules." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ -{ - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. 
You should import from traitlets.config instead.\n", - " \"You should import from traitlets.config instead.\", ShimWarning)\n", - "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", - " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" - ] -} - ], - "source": [ -"%load_ext sql" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ -{ - "data": { - "text/plain": [ - "u'Connected: gpadmin@madlib'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"# Greenplum Database 5.4.0 on GCP (demo machine)\n", -"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", -"\n", -"# PostgreSQL local\n", -"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", -"\n", -"# Greenplum Database 4.3.10.0\n", -"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "1 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "version\n", - "\n", - "\n", - "MADlib version: 1.14, git revision: rc/1.13-rc1-68-g1c81cb1, cmake configuration time: Tue Apr 24 15:54:15 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n", - "\n", - "" - ], - "text/plain": [ - "[(u'MADlib version: 1.14, git revision: rc/1.13-rc1-68-g1c81cb1, cmake configuration time: Tue Apr 24 15:54:15 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"%sql select madlib.version();\n", -"#%sql select version();" - ] - }, - { - "cell_type": "markdown", - 
"metadata": {}, - "source": [ -"# Decision tree classification examples" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# 1. Load data\n", -"Data set related to whether to play golf or not." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n", - "Done.\n", - "14 rows affected.\n", - "14 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "id\n", - "OUTLOOK\n", - "temperature\n", - "humidity\n", - "Temp_Humidity\n", - "clouds_airquality\n", - "windy\n", - "class\n", - "observation_weight\n", - "\n", - "\n", - "1\n", - "sunny\n", - "85.0\n", - "85.0\n", - "[85.0, 85.0]\n", - "[u'none', u'unhealthy']\n", - "False\n", - "Don't Play\n", - "5.0\n", - "\n", - "\n", - "2\n", - "sunny\n", - "80.0\n", - "90.0\n", - "[80.0, 90.0]\n", - "[u'none', u'moderate'
[15/18] madlib-site git commit: update jupyter notebooks for 1dot15
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/Decision-trees-v2.ipynb -- diff --git a/community-artifacts/Decision-trees-v2.ipynb b/community-artifacts/Decision-trees-v2.ipynb new file mode 100644 index 000..5b55b03 --- /dev/null +++ b/community-artifacts/Decision-trees-v2.ipynb @@ -0,0 +1,3208 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Decision trees\n", +"\n", +"A decision tree is a supervised learning method that can be used for classification and regression. It consists of a structure in which internal nodes represent tests on attributes, and the branches from nodes represent the result of those tests. Each leaf node is a class label and the paths from root to leaf nodes define the set of classification or regression rules.\n", +"\n", +"This notebook includes impurity importance which was added in 1.15." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ +{ + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. 
You should import from traitlets.config instead.\n", + " \"You should import from traitlets.config instead.\", ShimWarning)\n", + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", + " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: gpadmin@madlib'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum Database 5.4.0 on GCP (demo machine)\n", +"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", +"\n", +"# PostgreSQL local\n", +"#%sql postgresql://fmcquillan@localhost:5432/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "version\n", + "\n", + "\n", + "MADlib version: 1.15-dev, git revision: rc/1.14-rc1-45-g3ab7554, cmake configuration time: Wed Aug 1 18:34:10 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n", + "\n", + "" + ], + "text/plain": [ + "[(u'MADlib version: 1.15-dev, git revision: rc/1.14-rc1-45-g3ab7554, cmake configuration time: Wed Aug 1 18:34:10 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%sql select madlib.version();\n", +"#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Decision tree classification examples" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ +"# 1. Load data\n", +"Data set related to whether to play golf or not." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "14 rows affected.\n", + "14 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "id\n", + "OUTLOOK\n", + "temperature\n", + "humidity\n", + "Temp_Humidity\n", + "clouds_airquality\n", + "windy\n", + "class\n", + "observation_weight\n", + "\n", + "\n", + "1\n", + "sunny\n", + "85.0\n", + "85.0\n", + "[85.0, 85.0]\n", + "[u'none', u'unhealthy']\n", + "False\n", + "Don't Play\n", + "5.0\n", + "\n", + "\n", + "2\n", + "sunny\n", + "80.0\n", + "90.0\n", + "[80.0, 90.0]\n", + "[u'none', u'moderate']\n", + "
[1/2] madlib git commit: DT/RF: Add function to report importance scores
Repository: madlib Updated Branches: refs/heads/master e2534e44e -> 186390f7c DT/RF: Add function to report importance scores JIRA: MADLIB-925 This commit adds a new MADlib function (get_var_importance) to report the importance scores in decision tree and random forest by unnesting the importance values along with corresponding features. Closes #295 Co-authored-by: Rahul Iyer Co-authored-by: Jingyi Mei Co-authored-by: Orhan Kislal Project: http://git-wip-us.apache.org/repos/asf/madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/1aac377f Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/1aac377f Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/1aac377f Branch: refs/heads/master Commit: 1aac377f68d20290374c004a3a8bb2da82ab1fcc Parents: e2534e4 Author: Nandish Jayaram Authored: Tue Jul 3 12:22:07 2018 -0700 Committer: Rahul Iyer Committed: Wed Aug 1 12:58:22 2018 -0700 -- .../recursive_partitioning/decision_tree.cpp| 11 +- .../recursive_partitioning/decision_tree.hpp| 2 +- .../recursive_partitioning/random_forest.cpp| 15 ++ .../recursive_partitioning/random_forest.hpp| 1 + .../recursive_partitioning/decision_tree.py_in | 10 +- .../recursive_partitioning/decision_tree.sql_in | 102 +++--- .../recursive_partitioning/random_forest.py_in | 187 ++- .../recursive_partitioning/random_forest.sql_in | 168 + .../test/decision_tree.ic.sql_in| 3 +- .../test/decision_tree.sql_in | 46 - .../test/random_forest.sql_in | 20 +- .../test/unit_tests/plpy_mock.py_in | 43 + .../test/unit_tests/test_random_forest.py_in| 173 + 13 files changed, 697 insertions(+), 84 deletions(-) -- http://git-wip-us.apache.org/repos/asf/madlib/blob/1aac377f/src/modules/recursive_partitioning/decision_tree.cpp -- diff --git a/src/modules/recursive_partitioning/decision_tree.cpp b/src/modules/recursive_partitioning/decision_tree.cpp index d249946..0a7f7a5 100644 --- a/src/modules/recursive_partitioning/decision_tree.cpp +++ 
b/src/modules/recursive_partitioning/decision_tree.cpp @@ -488,7 +488,7 @@ print_decision_tree::run(AnyType &args){ } AnyType -get_variable_importance::run(AnyType &args){ +compute_variable_importance::run(AnyType &args){ Tree dt = args[0].getAs(); const int n_cat_features = args[1].getAs(); const int n_con_features = args[2].getAs(); @@ -497,19 +497,12 @@ get_variable_importance::run(AnyType &args){ ColumnVector con_var_importance = ColumnVector::Zero(n_con_features); dt.computeVariableImportance(cat_var_importance, con_var_importance); -// Variable importance is scaled to represent a percentage. Even though -// the importance values are split between categorical and continuous, the -// percentages are relative to the combined set. ColumnVector combined_var_imp(n_cat_features + n_con_features); combined_var_imp << cat_var_importance, con_var_importance; - -// Avoid divide by zero by adding a small number -double total_var_imp = combined_var_imp.sum(); -double VAR_IMP_EPSILON = 1e-6; -combined_var_imp *= (100.0 / (total_var_imp + VAR_IMP_EPSILON)); return combined_var_imp; } + AnyType display_text_tree::run(AnyType &args){ Tree dt = args[0].getAs(); http://git-wip-us.apache.org/repos/asf/madlib/blob/1aac377f/src/modules/recursive_partitioning/decision_tree.hpp -- diff --git a/src/modules/recursive_partitioning/decision_tree.hpp b/src/modules/recursive_partitioning/decision_tree.hpp index ae62bfa..8cb6703 100644 --- a/src/modules/recursive_partitioning/decision_tree.hpp +++ b/src/modules/recursive_partitioning/decision_tree.hpp @@ -14,7 +14,7 @@ DECLARE_UDF(recursive_partitioning, compute_surr_stats_transition) DECLARE_UDF(recursive_partitioning, dt_surr_apply) DECLARE_UDF(recursive_partitioning, print_decision_tree) -DECLARE_UDF(recursive_partitioning, get_variable_importance) +DECLARE_UDF(recursive_partitioning, compute_variable_importance) DECLARE_UDF(recursive_partitioning, predict_dt_response) DECLARE_UDF(recursive_partitioning, predict_dt_prob) 
http://git-wip-us.apache.org/repos/asf/madlib/blob/1aac377f/src/modules/recursive_partitioning/random_forest.cpp -- diff --git a/src/modules/recursive_partitioning/random_forest.cpp b/src/modules/recursive_partitioning/random_forest.cpp index 70ebbaa..a12f095 100644 --- a/src/modules/recursive_partitioning/random_forest.cpp +++ b/src/modules/recursive_partitioning/random_forest.cpp @@ -204,6 +204,21 @@ rf_con_imp_score::run(AnyType &ar
[2/2] madlib git commit: DT/RF: Fix user doc examples
DT/RF: Fix user doc examples Project: http://git-wip-us.apache.org/repos/asf/madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/186390f7 Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/186390f7 Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/186390f7 Branch: refs/heads/master Commit: 186390f7c2af5ad886a4d5b77d0792b68cd3414d Parents: 1aac377 Author: Frank McQuillan Authored: Wed Aug 1 12:49:10 2018 -0700 Committer: Rahul Iyer Committed: Wed Aug 1 12:58:44 2018 -0700 -- .../recursive_partitioning/decision_tree.sql_in | 16 ++-- .../recursive_partitioning/random_forest.sql_in | 12 +++- 2 files changed, 17 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/madlib/blob/186390f7/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in -- diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in index 469f1b2..5926152 100644 --- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in +++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in @@ -284,14 +284,17 @@ tree_train( impurity_var_importance DOUBLE PRECISION[]. Impurity importance of each variable. The order of the variables is the same as - that of 'independent_varnames' column in the summary table (see below). + that of the 'independent_varnames' column in the summary table (see below). The impurity importance of any feature is the decrease in impurity by a node containing the feature as a primary split, summed over the whole tree. If surrogates are used, then the importance value includes the impurity decrease scaled by the adjusted surrogate agreement. - Reported importance values are normalized to sum to 100 across - all variables. + Importance values are displayed as raw values as per the 'split_criterion' + parameter. 
+ To see importance values normalized to sum to 100 across + all variables, use the importance display helper function + described later on this page. Please refer to [1] for more information on variable importance. @@ -727,7 +730,7 @@ independent_var_types | text, boolean, double precision n_folds | 0 null_proxy | -View the impurity importance table using the helper function: +View the normalized impurity importance table using the helper function: \\x off DROP TABLE IF EXISTS imp_output; @@ -,10 +1114,11 @@ which shows ordering of levels of categorical variables 'vs' and 'cyl': SELECT pruning_cp, cat_levels_in_text, cat_n_levels, impurity_var_importance, tree_depth FROM train_output; +-[ RECORD 1 ]---+ pruning_cp | 0 cat_levels_in_text | {0,1,4,6,8} cat_n_levels| {2,3} -impurity_var_importance | {0,51.8593201959496,10.976977929129,5.31897402755374,31.8447278473677} +impurity_var_importance | {0,22.6309172500675,4.79024943310651,2.321153,13.8967382920111} tree_depth | 4 View the summary table: @@ -1147,7 +1151,7 @@ independent_var_types | integer, integer, double precision, double precisi n_folds | 0 null_proxy | -View the impurity importance table using the helper function: +View the normalized impurity importance table using the helper function: \\x off DROP TABLE IF EXISTS imp_output; http://git-wip-us.apache.org/repos/asf/madlib/blob/186390f7/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in -- diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in index 39b6f5d..5b5a0f0 100644 --- a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in +++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in @@ -164,7 +164,9 @@ forest_train(training_table_name, Due to nature of permutation, the importance value can end up being negative if the number of levels for a categorical variable is small and is unbalanced. 
In such a scenario, the importance values are shifted to ensure -that the lowest importance value is 0. +that the lowest importance value is 0. To see importance values normalized +to sum to 100 across all variables, use the importance display helper function +described later on this page. @@ -758,7 +760,7 @@ the variables in 'independent_varnames'
madlib git commit: DT/RF: Don't eliminate single-level cat variable
Repository: madlib Updated Branches: refs/heads/master 20f95b33b -> e2534e44e DT/RF: Don't eliminate single-level cat variable JIRA: MADLIB-1258 When DT/RF is run with grouping, a subset of the groups could eliminate a categorical variable leading to multiple issues downstream, including invalid importance values and incorrect prediction. This commit keeps all categorical variables (even if it contains just one level). The accumulator state would use additional space during tree_train for this categorical variable, even though the variable is never consumed by the tree. This inefficiency is still preferred since it yields clean code and error-free prediction/importance reporting. Additional changes: - get_expr_type (validate_args.py) has been updated to return type for multiple expressions at the same time. This prevents calling a separate query for each expression, thus saving time. - Cat features are not stored per tree (in the grouping case) anymore since the features are now consistent across trees. 
Closes #301 Co-authored-by: Nandish Jayaram Project: http://git-wip-us.apache.org/repos/asf/madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/e2534e44 Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/e2534e44 Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/e2534e44 Branch: refs/heads/master Commit: e2534e44ea36aedec843a3a7c48236d0e1104e2c Parents: 20f95b3 Author: Rahul Iyer Authored: Thu Jul 26 12:17:58 2018 -0700 Committer: Rahul Iyer Committed: Wed Aug 1 12:51:13 2018 -0700 -- src/modules/recursive_partitioning/DT_impl.hpp | 91 .../recursive_partitioning/decision_tree.cpp| 21 +- .../recursive_partitioning/decision_tree.py_in | 217 +-- .../recursive_partitioning/random_forest.py_in | 120 +- .../test/decision_tree.sql_in | 83 +++ .../test/random_forest.sql_in | 46 ++-- .../modules/utilities/validate_args.py_in | 49 +++-- 7 files changed, 319 insertions(+), 308 deletions(-) -- http://git-wip-us.apache.org/repos/asf/madlib/blob/e2534e44/src/modules/recursive_partitioning/DT_impl.hpp -- diff --git a/src/modules/recursive_partitioning/DT_impl.hpp b/src/modules/recursive_partitioning/DT_impl.hpp index 69bdc88..75e4ce4 100644 --- a/src/modules/recursive_partitioning/DT_impl.hpp +++ b/src/modules/recursive_partitioning/DT_impl.hpp @@ -518,6 +518,7 @@ DecisionTree::expand(const Accumulator &state, double gain = impurityGain( state.cat_stats.row(stats_i). segment(fv_index, sps * 2), sps); + if (gain > max_impurity_gain){ max_impurity_gain = gain; max_feat = f; @@ -665,21 +666,29 @@ DecisionTree::pickSurrogates( // 1. 
Compute the max count and corresponding split threshold for // each categorical and continuous feature + ColumnVector cat_max_thres = ColumnVector::Zero(n_cats); ColumnVector cat_max_count = ColumnVector::Zero(n_cats); IntegerVector cat_max_is_reverse = IntegerVector::Zero(n_cats); Index prev_cum_levels = 0; for (Index each_cat=0; each_cat < n_cats; each_cat++){ Index n_levels = state.cat_levels_cumsum(each_cat) - prev_cum_levels; -Index max_label; -(cat_stats_counts.row(stats_i).segment( -prev_cum_levels * 2, n_levels * 2)).maxCoeff(&max_label); -cat_max_thres(each_cat) = static_cast(max_label / 2); -cat_max_count(each_cat) = -cat_stats_counts(stats_i, prev_cum_levels*2 + max_label); -// every odd col is for reverse, hence i % 2 == 1 for reverse index i -cat_max_is_reverse(each_cat) = (max_label % 2 == 1) ? 1 : 0; -prev_cum_levels = state.cat_levels_cumsum(each_cat); +if (n_levels > 0){ +Index max_label; +(cat_stats_counts.row(stats_i).segment( +prev_cum_levels * 2, n_levels * 2)).maxCoeff(&max_label); + +// For each split, there are two stats => +// max_label / 2 gives the split index. A floor +// operation is unnecessary since the threshold will yield +// the same results for n and n+0.5. +cat_max_thres(each_cat) = static_cast(max_label / 2); +cat_max_count(each_cat) = +cat_stats_counts(stats_i, prev_cum_levels*2 + max_label); +// every odd col is for reverse, hence i % 2 == 1 for reverse
[2/2] madlib git commit: Utilities: Add module transform_vec_cols for column-vector conversion
Utilities: Add module transform_vec_cols for column-vector conversion JIRA: MADLIB-1240 This commit adds a new SQL function called vec2cols and refactors the current function cols2vec, providing greater integration between the two modules. We now have a single Python file with separate classes for each feature. We also have unified unit-tests and dev-check/install-check tests. The vec2cols function enables users to split up a single column into multiple columns, given that the input column contains array entries. For example, if the input column contained ARRAY[1, 2, 3] in one of its rows, the output table will contain 3 different columns, one for each element of the array. Co-authored-by: Nandish Jayaram Co-authored-by: Rahul Iyer Co-authored-by: Nikhil Kak Co-authored-by: Orhan Kislal Co-authored-by: Frank McQuillan Closes #291 Project: http://git-wip-us.apache.org/repos/asf/madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/20f95b33 Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/20f95b33 Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/20f95b33 Branch: refs/heads/master Commit: 20f95b33bcbd05b154a566c81958091c66258858 Parents: a0cfcf8 Author: Arvind Sridhar Authored: Wed Aug 1 11:22:27 2018 -0700 Committer: Orhan Kislal Committed: Wed Aug 1 11:22:27 2018 -0700 -- doc/mainpage.dox.in | 1 + .../postgres/modules/internal/db_utils.py_in| 9 + .../postgres/modules/utilities/cols2vec.py_in | 128 - .../postgres/modules/utilities/cols2vec.sql_in | 345 ++--- .../modules/utilities/test/cols2vec.sql_in | 91 .../utilities/test/transform_vec_cols.ic.sql_in | 68 +++ .../utilities/test/transform_vec_cols.sql_in| 470 ++ .../unit_tests/test_transform_vec_cols.py_in| 226 + .../modules/utilities/transform_vec_cols.py_in | 496 +++ .../postgres/modules/utilities/utilities.py_in | 14 +- .../postgres/modules/utilities/vec2cols.sql_in | 348 + 11 files changed, 1908 insertions(+), 288 deletions(-) -- 
http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/doc/mainpage.dox.in -- diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in index 8f97491..d174ab7 100644 --- a/doc/mainpage.dox.in +++ b/doc/mainpage.dox.in @@ -276,6 +276,7 @@ complete matrix stored as a distributed table. @defgroup grp_minibatch_preprocessing Mini-Batch Preprocessor @defgroup grp_pmml PMML Export @defgroup grp_text_utilities Term Frequency +@defgroup grp_vec2cols Vector to Columns @} @defgroup grp_early_stage Early Stage Development http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/src/ports/postgres/modules/internal/db_utils.py_in -- diff --git a/src/ports/postgres/modules/internal/db_utils.py_in b/src/ports/postgres/modules/internal/db_utils.py_in index c75babf..45477ef 100644 --- a/src/ports/postgres/modules/internal/db_utils.py_in +++ b/src/ports/postgres/modules/internal/db_utils.py_in @@ -79,3 +79,12 @@ def quote_literal(input_str): return "{qd}{input_str}{qd}".format(qd=QUOTE_DELIMITER, input_str=input_str) # -- + +def is_col_1d_array(source_table, col_name): +query = """ +SELECT array_upper({0}, 2) IS NULL AS n_y +FROM {1} +LIMIT 1 +""".format(col_name, source_table) +result = plpy.execute(query) +return result[0]["n_y"] http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/src/ports/postgres/modules/utilities/cols2vec.py_in -- diff --git a/src/ports/postgres/modules/utilities/cols2vec.py_in b/src/ports/postgres/modules/utilities/cols2vec.py_in deleted file mode 100644 index 4f2b1c9..000 --- a/src/ports/postgres/modules/utilities/cols2vec.py_in +++ /dev/null @@ -1,128 +0,0 @@ -# coding=utf-8 -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -#
[1/2] madlib git commit: Utilities: Add module transform_vec_cols for column-vector conversion
Repository: madlib Updated Branches: refs/heads/master a0cfcf8f7 -> 20f95b33b http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/src/ports/postgres/modules/utilities/vec2cols.sql_in -- diff --git a/src/ports/postgres/modules/utilities/vec2cols.sql_in b/src/ports/postgres/modules/utilities/vec2cols.sql_in new file mode 100644 index 000..989074c --- /dev/null +++ b/src/ports/postgres/modules/utilities/vec2cols.sql_in @@ -0,0 +1,348 @@ +/* --- */ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * @file vec2cols.sql_in + * + * @brief SQL functions for converting a vector column into multiple columns + * @date May 2016 + * + */ +/* --- */ + +m4_include(`SQLCommon.m4') + +/** +@addtogroup grp_vec2cols + +@brief Converts a feature array in a single column of a source table into multiple columns of an output table. + +Contents + +Syntax +Usage +Examples + + + +@about +Converts a feature array in a single column into multiple columns. +This process can be used to reverse the function cols2vec. + +Given a table with a column of type array, this function will create an output +table that splits this array into multiple columns, one per array element. 
It includes the option to name the new feature columns, and to include +columns from the original table in the output. + +@anchor vec2cols_usage +@usage + + +vec2cols( +source_table, +output_table, +vector_col, +feature_names, +cols_to_output +) + + +\b Arguments + +source_table +TEXT. Name of the table containing the source data. + +output_table +TEXT. Name of the generated table containing the output. If a table with the +same name already exists, an error will be returned. + +vector_col +TEXT. Name of the column containing the feature array. +Must be a one-dimensional array. + +feature_names (optional) +TEXT[]. Array of names associated with the feature array. +Note that this array exists in the +summary table created by the function 'cols2vec'. +If the 'feature_names' array is not specified, +column names will be automatically generated of +the form 'f1, f2, ...fn'. + +cols_to_output (optional) +TEXT, default NULL. Comma-separated string of column names +from the source table to keep in the +output table, in addition to the feature columns. +To keep all columns from the source table, use '*'. +Note: total number of columns in a table cannot exceed the +PostgreSQL limits. + + + +Output table + +The output table produced by the vec2cols function contains the following columns: + + +<...> +Columns from source table, depending on which ones are kept (if any). + + + +feature columns +Columns for each of the features in 'vector_col'. Column type +will depend on the feature array type in the source table. Column +naming will depend on whether the parameter 'feature_names' is used. 
+ + + + +@anchor vec2cols_example +@par Examples +-# Load sample data: + +DROP TABLE IF EXISTS golf CASCADE; +CREATE TABLE golf ( +id integer NOT NULL, +"OUTLOOK" text, +temperature double precision, +humidity double precision, +"Temp_Humidity" double precision[], +clouds_airquality text[], +windy boolean, +class text, +observation_weight double precision +); +INSERT INTO golf VALUES +(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0), +(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0), +(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5), +(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0), +(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0), +(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0), +(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5), +(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t P