http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/4fcb60ed/src/ports/postgres/modules/convex/mlp.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/convex/mlp.sql_in b/src/ports/postgres/modules/convex/mlp.sql_in new file mode 100644 index 0000000..400f892 --- /dev/null +++ b/src/ports/postgres/modules/convex/mlp.sql_in @@ -0,0 +1,752 @@ +/* ----------------------------------------------------------------------- *//** + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ * + * + * @file mlp.sql_in + * + * @brief SQL functions for multilayer perceptron + * @date June 2012 + * + * + *//* ----------------------------------------------------------------------- */ + +m4_include(`SQLCommon.m4') + +/** +@addtogroup grp_mlp + +<div class="toc"><b>Contents</b><ul> +<li class="level1"><a href="#mlp_classification">Classification</a></li> +<li class="level1"><a href="#mlp_regression">Regression</a></li> +<li class="level1"><a href="#optimizer_params">Optimizer Parameters</a></li> +<li class="level1"><a href="#predict">Prediction Functions</a></li> +<li class="level1"><a href="#example">Examples</a></li> +<li class="level1"><a href="#background">Technical Background</a></li> +<li class="level1"><a href="#literature">Literature</a></li> +<li class="level1"><a href="#related">Related Topics</a></li> +</ul></div> + +Multilayer Perceptron (MLP) is a model for regression and +classification. + +Also called "vanilla neural networks", they consist of several +fully connected hidden layers with non-linear activation +functions. In the case of classification, the final layer of the +neural net has as many nodes as classes, and the output of the +neural net can be interpreted as the probability of a given input +feature belonging to a specific class. + + +@brief Solves classification and regression problems with several +fully connected layers and nonlinear activation functions. + +@anchor mlp_classification +@par Classification Training Function +The mlp classification training function has the following format: +<pre class="syntax"> +mlp_classification( + source_table, + output_table, + independent_varname, + dependent_varname, + hidden_layer_sizes, + optimizer_params, + activation + ) +</pre> +\b Arguments +<DL class="arglist"> + <DT>source_table</DT> + <DD>TEXT. Name of the table containing the training data.</DD> + + <DT>output_table</DT> + <DD>TEXT. Name of the output table containing the model. 
Details of the output + tables are provided below. + </DD> + + <DT>independent_varname</DT> + <DD>TEXT. Expression list to evaluate for the + independent variables. An intercept variable should not be included as part + of this expression. Please note that the expression should be able to be cast + to DOUBLE PRECISION[]. + </DD> + + <DT>dependent_varname</DT> + <DD> TEXT. Name of the dependent variable column. For classification, supported types are: + text, varchar, character varying, char, character, + integer, smallint, bigint, and boolean. </DD> + + <DT>hidden_layer_sizes (optional)</DT> + <DD>INTEGER[], default: ARRAY[]. + The number of neurons in each hidden layer. The length of this array will + determine the number of hidden layers. Empty for no hidden layers. + </DD> + + + <DT>optimizer_params (optional)</DT> + <DD>TEXT, default: NULL. + Parameters for optimization in a comma-separated string + of key-value pairs. See the description below for details. + </DD> + + <DT>activation (optional)</DT> + <DD>TEXT, default: 'sigmoid'. + Activation function. Currently three functions are supported: 'sigmoid' (default), + 'relu', and 'tanh'. The text can be any prefix of the three + strings; for e.g., activation='s' will use the sigmoid activation. + </DD> +</DL> + +<b>Output tables</b> +<br> + The model table produced by mlp contains the following columns: + <table class="output"> + <tr> + <th>coeffs</th> + <td>FLOAT8[]. Flat array containing the weights of the neural net</td> + </tr> + <tr> + <th>n_iterations</th> + <td>INTEGER. Number of iterations completed by stochastic gradient descent + algorithm. The algorithm either converged in this number of iterations + or hit the maximum number specified in the optimization parameters. </td> + </tr> + <tr> + <th>loss</th> + <td>FLOAT8. The cross entropy over the training data. 
+ See Technical Background section below for more details.</td> + </tr> + </table> + + +A summary table named \<output_table\>_summary is also created, which has the following columns: + <table class="output"> + <tr> + <th>source_table</th> + <td>The source table.</td> + </tr> + <tr> + <th>dependent_varname</th> + <td>The dependent variable.</td> + </tr> + <tr> + <th>independent_varname</th> + <td>The independent variables.</td> + </tr> + <tr> + <th>tolerance</th> + <td>The tolerance as given in optimizer_params.</td> + </tr> + <tr> + <th>step_size</th> + <td>The step size as given in optimizer_params.</td> + </tr> + <tr> + <th>n_iterations</th> + <td>The number of iterations run</td> + </tr> + <tr> + <th>n_tries</th> + <td>The number of tries as given in optimizer_params.</td> + </tr> + <tr> + <th>layer_sizes</th> + <td>The number of units in each layer including the input and output layer.</td> + </tr> + <tr> + <th>activation_function</th> + <td>The activation function.</td> + </tr> + <tr> + <th>is_classification</th> + <td>True if the model was trained for classification, False if it was trained + for regression</td> + </tr> + <tr> + <th>classes</th> + <td>The classes which were trained against (empty for regression)</td> + </tr> + + </table> + + +@anchor mlp_regression +@par Regression Training Function +The mlp regression training function has the following format: +<pre class="syntax"> +mlp_regression( + source_table, + output_table, + independent_varname, + dependent_varname, + hidden_layer_sizes, + optimizer_params, + activation + ) +</pre> + +\b Arguments + +Specifications for regression are largely the same as for classification. In the +model table, the loss will refer to mean square error instead of cross entropy. In the +summary table, the classes column will be empty. The following +arguments have specifications which differ from mlp_classification: +<DL class="arglist"> +<DT>dependent_varname</DT> + <DD>TEXT. 
Name of the dependent variable column. + For regression, supported types are any numeric type, or array + of numeric types (for multiple regression). + </DD> +</DL> + + +@anchor optimizer_params +@par Optimizer Parameters +Parameters in this section are supplied in the \e optimizer_params argument as a string +containing a comma-delimited list of name-value pairs. All of these named +parameters are optional, and their order does not matter. You must use the +format "<param_name> = <value>" to specify the value of a parameter, otherwise +the parameter is ignored. + + +<pre class="syntax"> + 'step_size = <value>, + n_iterations = <value>, + n_tries = <value>, + tolerance = <value>' +</pre> +\b Optimizer Parameters +<DL class="arglist"> + +<DT>step_size</dt> +<DD>Default: [0.001]. +Also known as the learning rate. A small value is usually desirable to +ensure convergence, while a large value provides more room for progress during +training. Since the best value depends on the condition number of the data, in +practice one often tunes this parameter. +</DD> + + +<DT>n_iterations</dt> +<DD>Default: [100]. The maximum number of iterations allowed. +</DD> +<DT>n_tries</dt> +<DD>Default: [1]. Number of times to retrain the network with randomly initialized +weights +</DD> + +<DT>tolerance</dt> +<DD>Default: 0.001. The criterion to end iterations. The training stops whenever +the difference between the training models of two consecutive iterations is +smaller than \e tolerance or the iteration number is larger than \e n_iterations. +</DD> + +</DL> + +@anchor predict +@par Prediction Function +Used to generate predictions given a previously trained model on novel data. +The same syntax is used for classification and regression. +<pre class="syntax"> +mlp_predict(model_table, + data_table, + id_col_name, + output_table, + pred_type) +</pre> + +\b Arguments +<DL class="arglist"> + <DT>model_table</DT> + <DD>TEXT. 
Model table produced by the training function.</DD> + + <DT>data_table</DT> + <DD>TEXT. Name of the table containing the data for prediction. This table is expected + to contain the same input features that were used during training. The table should + also contain id_col_name used for identifying each row.</DD> + + <DT>id_col_name</DT> + <DD>TEXT. The name of the id column in the input table.</DD> + + <DT>output_table</DT> + <DD>TEXT. Name of the table where output predictions are written. If this +table name is already in use, then an error is returned. Table contains:</DD> + <table class="output"> + <tr> + <th>id</th> + <td>Gives the 'id' for each prediction, corresponding to each row from the data_table.</td> + </tr> + <tr> + <th>estimated_<COL_NAME></th> + <td> + (For pred_type='response') The estimated class + for classification or value for regression, where + <COL_NAME> is the name of the column to be + predicted from training data + </td> + </tr> + <tr> + <th>prob_<CLASS></th> + <td> + (For pred_type='prob' for classification) The + probability of a given class <CLASS> as given by + softmax. There will be one column for each class + in the training data. + </td> + </tr> + </table> + + <DT>pred_type</DT> + <DD>TEXT. + +The type of output requested: +'response' gives the actual prediction, +'prob' gives the probability of each class. +For regression, only pred_type='response' is defined.</DD> +</DL> + +@anchor example +@par Examples +-# Create an input data set. 
+<pre class="example"> +CREATE TABLE iris_data( + id integer, + attributes numeric[], + class_text varchar, + class integer +); +INSERT INTO iris_data VALUES +(1,ARRAY[5.1,3.5,1.4,0.2],'Iris-setosa',1), +(2,ARRAY[4.9,3.0,1.4,0.2],'Iris-setosa',1), +(3,ARRAY[4.7,3.2,1.3,0.2],'Iris-setosa',1), +(4,ARRAY[4.6,3.1,1.5,0.2],'Iris-setosa',1), +(5,ARRAY[5.0,3.6,1.4,0.2],'Iris-setosa',1), +(6,ARRAY[5.4,3.9,1.7,0.4],'Iris-setosa',1), +(7,ARRAY[4.6,3.4,1.4,0.3],'Iris-setosa',1), +(8,ARRAY[5.0,3.4,1.5,0.2],'Iris-setosa',1), +(9,ARRAY[4.4,2.9,1.4,0.2],'Iris-setosa',1), +(10,ARRAY[4.9,3.1,1.5,0.1],'Iris-setosa',1), +(11,ARRAY[7.0,3.2,4.7,1.4],'Iris-versicolor',2), +(12,ARRAY[6.4,3.2,4.5,1.5],'Iris-versicolor',2), +(13,ARRAY[6.9,3.1,4.9,1.5],'Iris-versicolor',2), +(14,ARRAY[5.5,2.3,4.0,1.3],'Iris-versicolor',2), +(15,ARRAY[6.5,2.8,4.6,1.5],'Iris-versicolor',2), +(16,ARRAY[5.7,2.8,4.5,1.3],'Iris-versicolor',2), +(17,ARRAY[6.3,3.3,4.7,1.6],'Iris-versicolor',2), +(18,ARRAY[4.9,2.4,3.3,1.0],'Iris-versicolor',2), +(19,ARRAY[6.6,2.9,4.6,1.3],'Iris-versicolor',2), +(20,ARRAY[5.2,2.7,3.9,1.4],'Iris-versicolor',2); +</pre> +-# Generate a multilayer perceptron with a single hidden layer of 5 units. +Use the attributes column as the independent variables, and use the class +column as the classification. Set the tolerance to 0 so that 5000 +iterations will be run. Use a hyperbolic tangent activation function. +The model will be written to mlp_model. +<pre class="example"> +DROP TABLE IF EXISTS mlp_model; +DROP TABLE IF EXISTS mlp_model_summary; +SELECT madlib.mlp_classification( + 'iris_data', -- Source table + 'mlp_model', -- Destination table + 'attributes', -- Input features + 'class_text', -- Label + ARRAY[5], -- Number of units per layer + 'step_size=0.003, + n_iterations=5000, + tolerance=0', -- Optimizer params + 'tanh'); -- Activation function +</pre> +-# View the result for the model. 
+<pre class="example"> +-- Set extended display on for easier reading of output +\\x ON +-- Neural net Initialization is non-deterministic, so your results may vary +SELECT * FROM mlp_model; +</pre> +Result: +<pre class="result"> +-[ RECORD 1 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +coeff | {1,1,1,1,1,0.136374930803,0.188739676875,0.662387810001,-1.03381622734,-0.469961067046,0.0614006983397,0.0811504589436,0.299008228258,-0.47391918521,-0.215098143699,0.10519213944,0.145844617525,0.511683525606,-0.800215552382,-0.36417142683,0.120751709056,0.167531106521,0.587074895969,-0.916946198095,-0.417055067449,0.0539541885146,0.0694359704131,0.262598585854,-0.419234805076,-0.189915344282,1,1,1,1,1,1,0.105645702152,1.46247470474,0.484457903226,0.965962824478,1.19361986431,0.419805760087,-0.105696503487,-1.46245956666,-0.484427811691,-0.965730981426,-1.19365280555,-0.419973628863} +loss | 0.0184092375519 +num_iterations | 5000 +</pre> +-# Next train a regression example. First create some test data. This dataset +contains housing prices data. +<pre class="example"> +CREATE TABLE lin_housing (id serial, x float8[], grp_by_col int, y float8); +COPY lin_housing (x, grp_by_col, y) FROM STDIN NULL '?' 
DELIMITER '|'; +{1,0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,396.90,4.98}|1|24.00 +{1,0.02731,0.00,7.070,0,0.4690,6.4210,78.90,4.9671,2,242.0,17.80,396.90,9.14}|1|21.60 +{1,0.02729,0.00,7.070,0,0.4690,7.1850,61.10,4.9671,2,242.0,17.80,392.83,4.03}|1|34.70 +{1,0.03237,0.00,2.180,0,0.4580,6.9980,45.80,6.0622,3,222.0,18.70,394.63,2.94}|1|33.40 +{1,0.06905,0.00,2.180,0,0.4580,7.1470,54.20,6.0622,3,222.0,18.70,396.90,5.33}|1|36.20 +{1,0.02985,0.00,2.180,0,0.4580,6.4300,58.70,6.0622,3,222.0,18.70,394.12,5.21}|1|28.70 +{1,0.08829,12.50,7.870,0,0.5240,6.0120,66.60,5.5605,5,311.0,15.20,395.60,12.43}|1|22.90 +{1,0.14455,12.50,7.870,0,0.5240,6.1720,96.10,5.9505,5,311.0,15.20,396.90,19.15}|1|27.10 +{1,0.21124,12.50,7.870,0,0.5240,5.6310,100.00,6.0821,5,311.0,15.20,386.63,29.93}|1|16.50 +{1,0.17004,12.50,7.870,0,0.5240,6.0040,85.90,6.5921,5,311.0,15.20,386.71,17.10}|1|18.90 +{1,0.22489,12.50,7.870,0,0.5240,6.3770,94.30,6.3467,5,311.0,15.20,392.52,20.45}|1|15.00 +{1,0.11747,12.50,7.870,0,0.5240,6.0090,82.90,6.2267,5,311.0,15.20,396.90,13.27}|1|18.90 +{1,0.09378,12.50,7.870,0,0.5240,5.8890,39.00,5.4509,5,311.0,15.20,390.50,15.71}|1|21.70 +{1,0.62976,0.00,8.140,0,0.5380,5.9490,61.80,4.7075,4,307.0,21.00,396.90,8.26}|1|20.40 +{1,0.63796,0.00,8.140,0,0.5380,6.0960,84.50,4.4619,4,307.0,21.00,380.02,10.26}|1|18.20 +{1,0.62739,0.00,8.140,0,0.5380,5.8340,56.50,4.4986,4,307.0,21.00,395.62,8.47}|1|19.90 +{1,1.05393,0.00,8.140,0,0.5380,5.9350,29.30,4.4986,4,307.0,21.00,386.85,6.58}|1| 23.10 +{1,0.78420,0.00,8.140,0,0.5380,5.9900,81.70,4.2579,4,307.0,21.00,386.75,14.67}|1|17.50 +{1,0.80271,0.00,8.140,0,0.5380,5.4560,36.60,3.7965,4,307.0,21.00,288.99,11.69}|1|20.20 +{1,0.72580,0.00,8.140,0,0.5380,5.7270,69.50,3.7965,4,307.0,21.00,390.95,11.28}|1|18.20 +\\. +</pre> +-# Now train a regression model using a multilayer perceptron a single hidden layer of two nodes. 
+<pre class="example"> +DROP TABLE IF EXISTS mlp_regress; +DROP TABLE IF EXISTS mlp_regress_summary; +SELECT madlib.mlp_regression( + 'lin_housing', -- Source table + 'mlp_regress', -- Desination table + 'x', -- Input features + 'y', -- Dependent variable + ARRAY[5,5], -- Number of units per layer + 'step_size=0.000007, + n_iterations=10000, + tolerance=0', + 'relu'); +</pre> +-# Check the results of the model +<pre class="example"> +-- Set extended display on for easier reading of output +\\x ON +-- Neural net Initialization is non-deterministic, so your results may vary +SELECT * FROM mlp_regress; +</pre> +Result: +<pre class="result"> +-[ RECORD 1 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ----------------------------------- +coeff | 
{1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2.79506311399e-05,3.56715008915e-05,-6.09333559685e-05,0.000251228318768,-0.000224772841379,-3.71863030857e-05,-3.5757865148e-06,5.27936784854e-05,-2.48474166186e-05,6.19731184294e-05,3.07638968743e-05,6.8964698578e-06,0.000106016701083,-1.71484730318e-05,1.18691881812e-05,-0.000163975464208,0.000170026304906,3.11688265279e-05,0.000177050148787,-1.58265976603e-05,2.70144422657e-05,0.000112667883422,3.77575139073e-05,8.12474658795e-05,-7.90458917626e-05,0.000107566386158,-2.63771171506e-06,2.47996880915e-05,-0.00012642310887,0.000203827391081,0.000139315565565,4.86147243454e-05,-0.000176126471913,-6.47820782916e-05,-8.51592776447e-06,-6.60601176758e-05,2.91421874156e-05,6.3556873752e-05,0.000197557443129,0.000220531367259,0.000135036310289,0.000143735913975,-4.75034117786e-05,-0.000179547345838,-1.6919846786e-05,0.000162784312994,0.000268595819851,-0.000460066553287,8.69756071591e-05,-0.00311762727057,0.000126024763103,0.000205988242921 ,0.003463432426,-0.00729789075286,0.00151625867549,-0.000890852767597,-0.00525016037249,0.0031043106659,0.00798041103839,-0.00552693050079,0.0232180415786,0.0230489850143,-0.0437890272341,0.0165765426407,-0.248554261758,-7.81336427846e-05,0.00558145591752,0.283465844585,-0.571699956182,0.133474351994,-0.0785181945605,-0.419269930709,0.249547772912,0.631761009875,-0.431305975666,1,1,1,1,1,1,0.0158747497572,-9.02809160806e-05,0.00015574347618,4.10805373863e-06,0.00121532434965,0.101790351335,0.0647558401493,-0.00013654998677,-9.92872075948e-06,-5.5319694394e-05,0.00519320756484,0.412736586036,0.0011998026977,-1.53688189815e-05,1.94817888201e-05,-4.63111489966e-05,7.24547899029e-05,0.00880394144485,5.45309822095e-05,-0.000140943219275,-7.96211486227e-05,-1.04337307472e-05,0.000161936762028,0.00136273797767,-4.54737243585e-05,-3.4083840736e-05,3.69286883662e-05,9.9047243188e-08,3.75014011824e-06,-9.45366086368e-08,1,1,1,1,1,1,6.67488547054,0.102754199001,0.41668912471,0.00886867296479,0 
.00136206007228,-9.88642499013e-05} +loss | 144.965776158 +num_iterations | 10000 +</pre> +-# Now let's look at the prediction functions. In the following examples we will +use the training data set for prediction as well, which is not usual but serves to +show the syntax. First we will test the classification example. +The prediction is in the the estimated_class_text column with the +actual value in the class_text column. +<pre class="example"> +DROP TABLE IF EXISTS mlp_prediction; +SELECT madlib.mlp_predict( + 'mlp_model', -- Model table + 'iris_data', -- Test data table + 'id', -- Id column in test table + 'mlp_prediction', -- Output table for predictions + 'response' -- Output classes, not probabilities + ); +-# View results +<pre class="example"> +SELECT * FROM mlp_prediction JOIN iris_data USING (id); +</pre> +Result for the classification model: +<pre class="result"> + id | estimated_class_text | attributes | class_text | class +----+----------------------+-------------------+-----------------+------- + 1 | Iris-setosa | {5.1,3.5,1.4,0.2} | Iris-setosa | 1 + 2 | Iris-setosa | {4.9,3.0,1.4,0.2} | Iris-setosa | 1 + 3 | Iris-setosa | {4.7,3.2,1.3,0.2} | Iris-setosa | 1 + 4 | Iris-setosa | {4.6,3.1,1.5,0.2} | Iris-setosa | 1 + 5 | Iris-setosa | {5.0,3.6,1.4,0.2} | Iris-setosa | 1 + 6 | Iris-setosa | {5.4,3.9,1.7,0.4} | Iris-setosa | 1 + 7 | Iris-setosa | {4.6,3.4,1.4,0.3} | Iris-setosa | 1 + 8 | Iris-setosa | {5.0,3.4,1.5,0.2} | Iris-setosa | 1 + 9 | Iris-setosa | {4.4,2.9,1.4,0.2} | Iris-setosa | 1 + 10 | Iris-setosa | {4.9,3.1,1.5,0.1} | Iris-setosa | 1 + 11 | Iris-versicolor | {7.0,3.2,4.7,1.4} | Iris-versicolor | 2 + 12 | Iris-versicolor | {6.4,3.2,4.5,1.5} | Iris-versicolor | 2 + 13 | Iris-versicolor | {6.9,3.1,4.9,1.5} | Iris-versicolor | 2 + 14 | Iris-versicolor | {5.5,2.3,4.0,1.3} | Iris-versicolor | 2 + 15 | Iris-versicolor | {6.5,2.8,4.6,1.5} | Iris-versicolor | 2 + 16 | Iris-versicolor | {5.7,2.8,4.5,1.3} | Iris-versicolor | 2 + 17 | Iris-versicolor 
| {6.3,3.3,4.7,1.6} | Iris-versicolor | 2 + 18 | Iris-versicolor | {4.9,2.4,3.3,1.0} | Iris-versicolor | 2 + 19 | Iris-versicolor | {6.6,2.9,4.6,1.3} | Iris-versicolor | 2 + 20 | Iris-versicolor | {5.2,2.7,3.9,1.4} | Iris-versicolor | 2 +</pre> +Prediction using the regression model: +<pre class="example"> +DROP TABLE IF EXISTS mlp_regress_prediction; +SELECT madlib.mlp_predict( + 'mlp_regress', -- Model table + 'lin_housing', -- Test data table + 'id', -- Id column in test table + 'mlp_regress_prediction', -- Output table for predictions + 'response' -- Output values, not probabilities + ); +</pre> +-# View results +<pre class="example"> +SELECT * FROM lin_housing JOIN mlp_regress_prediction USING (id); +</pre> +Result for the regression model: +<pre class="result"> + id | x | grp_by_col | y | estimated_y +----+-------------------------------------------------------------------------+------------+------+-------------------- + 1 | {1,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98} | 1 | 24 | {23.2627062018087} + 2 | {1,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14} | 1 | 21.6 | {25.7088419115781} + 3 | {1,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03} | 1 | 34.7 | {27.5587003901404} + 4 | {1,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94} | 1 | 33.4 | {31.1812237427816} + 5 | {1,0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33} | 1 | 36.2 | {30.3696873085477} + 6 | {1,0.02985,0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21} | 1 | 28.7 | {29.5290259241882} + 7 | {1,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43} | 1 | 22.9 | {21.1576051716888} + 8 | {1,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15} | 1 | 27.1 | {17.6194200563055} + 9 | {1,0.21124,12.5,7.87,0,0.524,5.631,100,6.0821,5,311,15.2,386.63,29.93} | 1 | 16.5 | {15.1366297774139} +10 | {1,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1} | 1 | 18.9 | 
{17.6528662199369} +11 | {1,0.22489,12.5,7.87,0,0.524,6.377,94.3,6.3467,5,311,15.2,392.52,20.45} | 1 | 15 | {17.2017487668181} +12 | {1,0.11747,12.5,7.87,0,0.524,6.009,82.9,6.2267,5,311,15.2,396.9,13.27} | 1 | 18.9 | {19.4893860319992} +13 | {1,0.09378,12.5,7.87,0,0.524,5.889,39,5.4509,5,311,15.2,390.5,15.71} | 1 | 21.7 | {23.2917226708039} +14 | {1,0.62976,0,8.14,0,0.538,5.949,61.8,4.7075,4,307,21,396.9,8.26} | 1 | 20.4 | {22.8904812605193} +15 | {1,0.63796,0,8.14,0,0.538,6.096,84.5,4.4619,4,307,21,380.02,10.26} | 1 | 18.2 | {18.2386754423677} +16 | {1,0.62739,0,8.14,0,0.538,5.834,56.5,4.4986,4,307,21,395.62,8.47} | 1 | 19.9 | {23.28949550874} +17 | {1,1.05393,0,8.14,0,0.538,5.935,29.3,4.4986,4,307,21,386.85,6.58} | 1 | 23.1 | {25.3288762085473} +18 | {1,0.7842,0,8.14,0,0.538,5.99,81.7,4.2579,4,307,21,386.75,14.67} | 1 | 17.5 | {19.0203738118451} +19 | {1,0.80271,0,8.14,0,0.538,5.456,36.6,3.7965,4,307,21,288.99,11.69} | 1 | 20.2 | {12.3162005347545} +20 | {1,0.7258,0,8.14,0,0.538,5.727,69.5,3.7965,4,307,21,390.95,11.28} | 1 | 18.2 | {21.0902211848747} +</pre> +Note that the results you get for all examples may vary with the platform you are using. + +@anchor background +@par Technical Background + +To train a neural net, the respective loss function is minimized using stochastic gradient descent. +In the case of classification, the loss function is cross entropy. For regression, mean square error +is used. Weights in the neural net are updated via the backpropogation process, which uses dynamic +programming to compute the partial derivative of each weight with respect to the overall loss. This +partial derivative incorporates the respective activation function used, so this requires that the +activation function be differentiable. + +For an overview of multilayer perceptrons, see website [1]. + +For details on backpropogation, see the notes at [2]. + +@anchor literature +@literature + +@anchor mlp-lit-1 +[1] "Multilayer Perceptron." Wikipedia. 
Wikimedia Foundation,
    12 July 2017. Web. 12 July 2017.

[2] Yu Hen Hu. "Lecture 11. MLP (III): Back-Propagation."
    University of Wisconsin Madison: Computer-Aided Engineering. Web. 12 July 2017,
    http://homepages.cae.wisc.edu/~ece539/videocourse/notes/pdf/lec%2011%20MLP%20(3)%20BP.pdf

@anchor related
@par Related Topics

File mlp.sql_in documenting the training function

*/


-- Composite result produced by the IGD optimizer: the flat array of
-- network weights plus the loss value over the training data.
CREATE TYPE MADLIB_SCHEMA.mlp_result AS (
    coeff DOUBLE PRECISION[],
    loss DOUBLE PRECISION
);

--------------------------------------------------------------------------
-- create SQL functions for IGD optimizer
--------------------------------------------------------------------------
-- Transition function of the mlp_igd_step aggregate: folds one training
-- row into the running optimizer state. Implemented in C (MODULE_PATHNAME).
-- Deliberately not STRICT: the first call receives the INITCOND state and
-- NULL-handling is done by the C implementation.
CREATE FUNCTION MADLIB_SCHEMA.mlp_igd_transition(
        state              DOUBLE PRECISION[],  -- running IGD state
        start_vec          DOUBLE PRECISION[],  -- independent-variable row
        end_vec            DOUBLE PRECISION[],  -- dependent-variable row
        previous_state     DOUBLE PRECISION[],  -- state from the previous iteration
        layer_sizes        DOUBLE PRECISION[],  -- units per layer, incl. input/output
        stepsize           DOUBLE PRECISION,    -- learning rate
        activation         INTEGER,             -- numeric code for the activation function
        is_classification  INTEGER)             -- nonzero for classification, 0 for regression
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE;

-- Merge function: combines two partial IGD states (used when the aggregate
-- runs distributed on Greenplum via the prefunc registration below).
CREATE FUNCTION MADLIB_SCHEMA.mlp_igd_merge(
        state1 DOUBLE PRECISION[],
        state2 DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;

-- Final function: converts the accumulated state into the per-iteration
-- state array handed back to the Python driver.
CREATE FUNCTION MADLIB_SCHEMA.mlp_igd_final(
        state DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;

/**
 * @internal
 * @brief Perform one iteration of backprop
 */
CREATE AGGREGATE MADLIB_SCHEMA.mlp_igd_step(
        /* start_vec         */ DOUBLE PRECISION[],
        /* end_vec           */ DOUBLE PRECISION[],
        /* previous_state    */ DOUBLE PRECISION[],
        /* layer_sizes       */ DOUBLE PRECISION[],
        /* stepsize          */ DOUBLE PRECISION,
        /* activation        */ INTEGER,
        /* is_classification */ INTEGER )(
    STYPE=DOUBLE PRECISION[],
    SFUNC=MADLIB_SCHEMA.mlp_igd_transition,
    -- On Greenplum, register the merge function so partial states computed
    -- on separate segments can be combined.
    m4_ifdef(`GREENPLUM',`prefunc=MADLIB_SCHEMA.mlp_igd_merge,')
    FINALFUNC=MADLIB_SCHEMA.mlp_igd_final,
    -- Initial aggregate state; interpreted and expanded by the C
    -- transition function on first call.
    INITCOND='{0,0,0,0,0,0,0,0}'
);
-------------------------------------------------------------------------

-- Distance between two IGD states; used by the driver's convergence test
-- (training stops once the distance falls below 'tolerance').
CREATE FUNCTION MADLIB_SCHEMA.internal_mlp_igd_distance(
    /*+ state1 */ DOUBLE PRECISION[],
    /*+ state2 */ DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION AS
'MODULE_PATHNAME'
LANGUAGE c IMMUTABLE STRICT;

-- Unpacks a final IGD state into the (coeff, loss) mlp_result type.
CREATE FUNCTION MADLIB_SCHEMA.internal_mlp_igd_result(
    /*+ state */ DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.mlp_result AS
'MODULE_PATHNAME'
LANGUAGE c IMMUTABLE STRICT;
-------------------------------------------------------------------------

-- Train an MLP classifier. Thin SQL wrapper that delegates to the Python
-- driver mlp_igd.mlp() with is_classification=True; results are written to
-- output_table and <output_table>_summary.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
    source_table VARCHAR,         -- table holding the training data
    output_table VARCHAR,         -- model output table name
    independent_varname VARCHAR,  -- expression castable to DOUBLE PRECISION[]
    dependent_varname VARCHAR,    -- label column
    hidden_layer_sizes INTEGER[], -- neurons per hidden layer
    optimizer_params VARCHAR,     -- 'key=value, ...' optimizer settings
    activation VARCHAR            -- 'sigmoid' | 'relu' | 'tanh' (any prefix)
) RETURNS VOID AS $$
    PythonFunctionBodyOnly(`convex', `mlp_igd')
    mlp_igd.mlp(
        schema_madlib,
        source_table,
        output_table,
        independent_varname,
        dependent_varname,
        hidden_layer_sizes,
        optimizer_params,
        activation,
        True
    )
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

-- Train an MLP regression model. Identical wrapper to mlp_classification
-- except it calls the Python driver with is_classification=False.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
    source_table VARCHAR,
    output_table VARCHAR,
    independent_varname VARCHAR,
    dependent_varname VARCHAR,
    hidden_layer_sizes INTEGER[],
    optimizer_params VARCHAR,
    activation VARCHAR
) RETURNS VOID AS $$
    PythonFunctionBodyOnly(`convex', `mlp_igd')
    mlp_igd.mlp(
        schema_madlib,
        source_table,
        output_table,
        independent_varname,
        dependent_varname,
        hidden_layer_sizes,
        optimizer_params,
        activation,
        False
    )
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

-- Score new rows with a previously trained model; delegates to the Python
-- driver mlp_igd.mlp_predict(), which writes predictions to output_table.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_predict(
    model_table VARCHAR,   -- model table produced by training
    data_table VARCHAR,    -- rows to score; must contain id_col_name
    id_col_name VARCHAR,   -- name of the id column in data_table
    output_table VARCHAR,  -- table to create with the predictions
    pred_type VARCHAR      -- 'response' or 'prob'
) RETURNS VOID AS $$
    PythonFunctionBodyOnly(`convex', `mlp_igd')
    mlp_igd.mlp_predict(
        schema_madlib,
        model_table,
        data_table,
        id_col_name,
        output_table,
        pred_type)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

-- Low-level per-row prediction, implemented in C.
-- NOTE(review): is_classification and activation_function are declared
-- DOUBLE PRECISION although they appear to carry integer codes -- confirm
-- against the C implementation before changing the signature.
CREATE FUNCTION MADLIB_SCHEMA.internal_predict_mlp(
        coeff DOUBLE PRECISION[],               -- trained weights
        independent_varname DOUBLE PRECISION[], -- feature vector for one row
        is_classification DOUBLE PRECISION,
        activation_function DOUBLE PRECISION,
        layer_sizes DOUBLE PRECISION[],
        is_response INTEGER                     -- 1: class/value, 0: probabilities
    )
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;


-- Online help: mlp_classification('<message>') returns a usage string
-- generated by the Python driver.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
    message TEXT
) RETURNS TEXT AS $$
    PythonFunctionBodyOnly(`convex', `mlp_igd')
    return mlp_igd.mlp_help(schema_madlib,message,True)
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');

-- Zero-argument help variant: equivalent to passing a NULL message.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification()
RETURNS TEXT AS $$
    SELECT MADLIB_SCHEMA.mlp_classification(NULL::TEXT)
$$ LANGUAGE SQL IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');

-- Online help for the regression trainer.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
    message TEXT
) RETURNS TEXT AS $$
    PythonFunctionBodyOnly(`convex', `mlp_igd')
    return mlp_igd.mlp_help(schema_madlib,message,False)
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');

-- Zero-argument help variant: equivalent to passing a NULL message.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression()
RETURNS TEXT AS $$
    SELECT MADLIB_SCHEMA.mlp_regression(NULL::TEXT)
$$ LANGUAGE SQL IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');

-- Online help for the prediction function.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_predict(
    message TEXT
) RETURNS TEXT AS $$
    PythonFunctionBodyOnly(`convex', `mlp_igd')
    return mlp_igd.mlp_predict_help(schema_madlib,message)
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');

-- Zero-argument help variant: equivalent to passing a NULL message.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_predict()
RETURNS TEXT AS $$
    SELECT MADLIB_SCHEMA.mlp_predict(NULL::TEXT)
$$ LANGUAGE SQL IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/4fcb60ed/src/ports/postgres/modules/convex/mlp_igd.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/convex/mlp_igd.py_in b/src/ports/postgres/modules/convex/mlp_igd.py_in new file mode 100644 index 0000000..6cea7b0 --- /dev/null +++ b/src/ports/postgres/modules/convex/mlp_igd.py_in @@ -0,0 +1,752 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +""" +@file mlp_igd.py_in + +@brief Multilayer perceptron using IGD: Driver functions + +@namespace mlp_igd +""" +import plpy + +from utilities.control import MinWarning +from utilities.utilities import add_postfix +from utilities.utilities import py_list_to_sql_string +from utilities.utilities import extract_keyvalue_params +from utilities.utilities import _assert +from utilities.utilities import unique_string +from utilities.utilities import strip_end_quotes + +from utilities.validate_args import cols_in_tbl_valid +from utilities.validate_args import input_tbl_valid +from utilities.validate_args import is_var_valid +from utilities.validate_args import output_tbl_valid +from utilities.validate_args import get_expr_type +from utilities.validate_args import array_col_has_same_dimension +from utilities.validate_args import array_col_dimension + + +def mlp(schema_madlib, source_table, output_table, independent_varname, + dependent_varname, hidden_layer_sizes, + optimizer_param_str, activation, is_classification, **kwargs): + """ + Args: + @param schema_madlib + @param source_table + @param output_table + @param independent_varname + @param dependent_varname + @param hidden_layer_sizes + @param optimizer_param_str + + Returns: + None + """ + with MinWarning('warning'): + optimizer_params = _get_optimizer_params(optimizer_param_str or "") + summary_table = add_postfix(output_table, "_summary") + _validate_args(source_table, output_table, summary_table, independent_varname, + dependent_varname, hidden_layer_sizes, + optimizer_params, is_classification) + + current_iteration = 1 + prev_state = None + tolerance = optimizer_params["tolerance"] + n_iterations = optimizer_params["n_iterations"] + step_size = optimizer_params["step_size"] + n_tries = optimizer_params["n_tries"] + activation_name = _get_activation_function_name(activation) + activation_index = _get_activation_index(activation_name) + num_input_nodes = array_col_dimension( + source_table, independent_varname) 
+ num_output_nodes = 0 + classes = [] + dependent_type = get_expr_type(dependent_varname, source_table) + original_dependent_varname = dependent_varname + + if is_classification: + dependent_variable_sql = """ + SELECT DISTINCT {dependent_varname} + FROM {source_table} + """.format(dependent_varname=dependent_varname, + source_table=source_table) + labels = plpy.execute(dependent_variable_sql) + one_hot_dependent_varname = 'ARRAY[' + num_output_nodes = len(labels) + for label_obj in labels: + label = _format_label(label_obj[dependent_varname]) + classes.append(label) + one_hot_dependent_varname += dependent_varname + \ + "=" + str(label) + "," + # Remove the last comma + one_hot_dependent_varname = one_hot_dependent_varname[:-1] + one_hot_dependent_varname += ']::integer[]' + dependent_varname = one_hot_dependent_varname + else: + if "[]" not in dependent_type: + dependent_varname = "ARRAY[" + dependent_varname + "]" + num_output_nodes = array_col_dimension( + source_table, dependent_varname) + layer_sizes = [num_input_nodes] + \ + hidden_layer_sizes + [num_output_nodes] + + while True: + if prev_state: + prev_state_str = py_list_to_sql_string( + prev_state, array_type="double precision") + else: + prev_state_str = "(NULL)::DOUBLE PRECISION[]" + train_sql = """ + SELECT + {schema_madlib}.mlp_igd_step( + ({independent_varname})::DOUBLE PRECISION[], + ({dependent_varname})::DOUBLE PRECISION[], + {prev_state}, + {layer_sizes}, + ({step_size})::FLOAT8, + {activation}, + {is_classification}) as curr_state + FROM {source_table} AS _src + """.format(schema_madlib=schema_madlib, + independent_varname=independent_varname, + dependent_varname=dependent_varname, + prev_state=prev_state_str, + # C++ uses double internally + layer_sizes=py_list_to_sql_string(layer_sizes, + array_type="double precision"), + step_size=step_size, + source_table=source_table, + activation=activation_index, + is_classification=int(is_classification)) + curr_state = 
plpy.execute(train_sql)[0]["curr_state"] + dist_sql = """ + SELECT {schema_madlib}.internal_mlp_igd_distance( + {prev_state}, + {curr_state}) as state_dist + """.format(schema_madlib=schema_madlib, + prev_state=prev_state_str, + curr_state=py_list_to_sql_string(curr_state, "double precision")) + state_dist = plpy.execute(dist_sql)[0]["state_dist"] + if ((state_dist and state_dist < tolerance) or + current_iteration > n_iterations): + break + prev_state = curr_state + current_iteration += 1 + _build_model_table(schema_madlib, output_table, + curr_state, n_iterations) + layer_sizes_str = py_list_to_sql_string( + layer_sizes, array_type="integer") + classes_str = py_list_to_sql_string( + [strip_end_quotes(cl, "'") for cl in classes], + array_type=dependent_type) + summary_table_creation_query = """ + CREATE TABLE {summary_table}( + source_table TEXT, + independent_varname TEXT, + dependent_varname TEXT, + tolerance FLOAT, + step_size FLOAT, + n_iterations INTEGER, + n_tries INTEGER, + layer_sizes INTEGER[], + activation_function TEXT, + is_classification BOOLEAN, + classes {dependent_type}[] + )""".format(summary_table=summary_table, + dependent_type=dependent_type) + + summary_table_update_query = """ + INSERT INTO {summary_table} VALUES( + '{source_table}', + '{independent_varname}', + '{original_dependent_varname}', + {tolerance}, + {step_size}, + {n_iterations}, + {n_tries}, + {layer_sizes_str}, + '{activation_name}', + {is_classification}, + {classes_str} + ) + """.format(**locals()) + plpy.execute(summary_table_creation_query) + plpy.execute(summary_table_update_query) +# ---------------------------------------------------------------------- + + +def _build_model_table(schema_madlib, output_table, final_state, n_iterations): + final_state_str = py_list_to_sql_string( + final_state, array_type="double precision") + + model_table_query = """ + CREATE TABLE {output_table} AS + SELECT + (result).coeff AS coeff, + (result).loss AS loss, + {n_iterations} AS 
num_iterations + -- (result).num_rows_processed AS num_rows_processed, + -- n_tuples_including_nulls - (result).num_rows_processed + FROM ( + SELECT + {schema_madlib}.internal_mlp_igd_result( + {final_state_str} + ) AS result + ) rel_state_subq + """.format(**locals()) + plpy.execute(model_table_query) +# ---------------------------------------------------------------------- + + +def _get_optimizer_params(param_str): + params_defaults = { + "step_size": (0.001, float), + "n_iterations": (100, int), + "n_tries": (1, int), + "tolerance": (0.001, float), + } + param_defaults = dict([(k, v[0]) for k, v in params_defaults.items()]) + param_types = dict([(k, v[1]) for k, v in params_defaults.items()]) + + if not param_str: + return param_defaults + + name_value = extract_keyvalue_params(param_str, param_types, param_defaults, + ignore_invalid=False) + return name_value +# ---------------------------------------------------------------------- + + +def _validate_args_classification(source_table, dependent_varname): + expr_type = get_expr_type(dependent_varname, source_table) + int_types = ['integer', 'smallint', 'bigint'] + text_types = ['text', 'varchar', 'character varying', 'char', 'character'] + boolean_types = ['boolean'] + _assert("[]" in expr_type or expr_type in int_types + text_types + boolean_types, + "Dependent variable column should refer to an " + "integer, boolean, text, varchar, or character type.") +# ---------------------------------------------------------------------- + + +def _validate_args_regression(source_table, dependent_varname): + expr_type = get_expr_type(dependent_varname, source_table) + int_types = ['integer', 'smallint', 'bigint'] + float_types = ['double precision', 'real'] + _assert("[]" in expr_type or expr_type in int_types + float_types, + "Dependent variable column should refer to an array or numeric type") + if "[]" in expr_type: + _assert(array_col_has_same_dimension(source_table, dependent_varname), + "Dependent variable column 
should refer to arrays of the same length") +# ---------------------------------------------------------------------- + + +def _validate_args(source_table, output_table, summary_table, independent_varname, + dependent_varname, hidden_layer_sizes, + optimizer_params, is_classification): + input_tbl_valid(source_table, "MLP") + output_tbl_valid(output_table, "MLP") + output_tbl_valid(summary_table, "MLP") + _assert(is_var_valid(source_table, independent_varname), + "MLP error: invalid independent_varname " + "('{independent_varname}') for source_table " + "({source_table})!".format(independent_varname=independent_varname, + source_table=source_table)) + + _assert(is_var_valid(source_table, dependent_varname), + "MLP error: invalid dependent_varname " + "('{dependent_varname}') for source_table " + "({source_table})!".format(dependent_varname=dependent_varname, + source_table=source_table)) + _assert(hidden_layer_sizes is not None, + "hidden_layer_sizes may not be null") + _assert(isinstance(hidden_layer_sizes, list), + "hidden_layer_sizes must be an array of integers") + _assert(all(isinstance(value, int) for value in hidden_layer_sizes), + "MLP error: Hidden layers sizes must be integers") + _assert(all(value >= 0 for value in hidden_layer_sizes), + "MLP error: Hidden layers sizes must be greater than 0.") + _assert(optimizer_params["tolerance"] >= 0, + "MLP error: Tolerance should be greater than or equal to 0.") + _assert(optimizer_params["n_tries"] >= 1, + "MLP error: Number of tries should be greater than or equal to 1") + _assert(optimizer_params["n_iterations"] >= 1, + "MLP error: Number of iterations should be greater than or equal to 1") + _assert(optimizer_params["step_size"] > 0, + "MLP error: Stepsize should be greater than 0.") + _assert("[]" in get_expr_type(independent_varname, source_table), + "Independent variable column should refer to an array") + _assert(array_col_has_same_dimension(source_table, independent_varname), + "Independent variable 
column should refer to arrays of the same length") + + if is_classification: + _validate_args_classification(source_table, dependent_varname) + else: + _validate_args_regression(source_table, dependent_varname) +# ---------------------------------------------------------------------- + + +def _get_activation_function_name(activation_function): + if not activation_function: + activation_function = 'sigmoid' + else: + # Add non-linear kernels below after implementing them. + supported_activation_function = ['sigmoid', 'tanh', 'relu'] + try: + # allow user to specify a prefix substring of + # supported kernels. This works because the supported + # kernels have unique prefixes. + activation_function = next(x for x in supported_activation_function + if x.startswith(activation_function)) + except StopIteration: + # next() returns a StopIteration if no element found + plpy.error("MLP Error: Invalid activation function: " + "{0}. Supported activation functions are ({1})" + .format(activation_function, ','.join( + sorted(supported_activation_function)))) + return activation_function +# ------------------------------------------------------------------------------ + + +def _get_activation_index(activation_name): + table = {"relu": 0, "sigmoid": 1, "tanh": 2} + return table[activation_name] + + +def _format_label(label): + if isinstance(label, str): + return "'" + label + "'" + return label +# ------------------------------------------------------------------------- + + +def mlp_predict(schema_madlib, model_table, data_table, + id_col_name, output_table, + pred_type='response', **kwargs): + """ Score new observations using a trained neural network + + @param schema_madlib Name of the schema where MADlib is installed + @param model_table Name of learned model + @param data_table Name of table/view containing the data + points to be scored + @param id_col_name Name of column in source_table containing + (integer) identifier for data point + @param output_table Name of table to 
store the results + @param pred_type: str, The type of output required: + 'response' gives the actual response values, + 'prob' gives the probability of the classes in a + For regression, only type='response' is defined. + """ + input_tbl_valid(model_table, 'MLP') + cols_in_tbl_valid(model_table, ['coeff'], 'MLP') + summary_table = add_postfix(model_table, "_summary") + input_tbl_valid(summary_table, 'MLP') + cols_in_tbl_valid(summary_table, + ['dependent_varname', 'independent_varname', + 'activation_function', + 'tolerance', 'step_size', 'n_iterations', + 'n_tries', 'classes', 'layer_sizes', 'source_table'], + 'MLP') + + summary = plpy.execute("SELECT * FROM {0}".format(summary_table))[0] + coeff = py_list_to_sql_string(plpy.execute( + "SELECT * FROM {0}".format(model_table))[0]["coeff"]) + dependent_varname = summary['dependent_varname'] + independent_varname = summary['independent_varname'] + source_table = summary['source_table'] + activation_function = _get_activation_index(summary['activation_function']) + layer_sizes = py_list_to_sql_string( + summary['layer_sizes'], array_type="DOUBLE PRECISION") + is_classification = int(summary["is_classification"]) + is_response = int(pred_type == 'response') + + pred_name = ('"prob_{0}"' if pred_type == "prob" else + '"estimated_{0}"').format(dependent_varname.replace('"', '').strip()) + + input_tbl_valid(data_table, 'MLP') + + _assert(is_var_valid(data_table, independent_varname), + "MLP Error: independent_varname ('{0}') is invalid for data_table ({1})". + format(independent_varname, data_table)) + _assert(id_col_name is not None, "MLP Error: id_col_name is NULL") + _assert(is_var_valid(data_table, id_col_name), + "MLP Error: id_col_name ('{0}') is invalid for {1}". 
+ format(id_col_name, data_table)) + output_tbl_valid(output_table, 'MLP') + + with MinWarning("warning"): + header = "CREATE TABLE " + output_table + " AS " + # Regression + if not is_classification: + dependent_type = get_expr_type(dependent_varname, source_table) + unnest_if_not_array = "" + # Return the same type as the user provided. Internally we always use an array, but + # if they provided a scaler, unnest it for the user + if "[]" not in dependent_type: + unnest_if_not_array = "UNNEST" + sql = header + """ + SELECT {id_col_name}, + {unnest_if_not_array}({schema_madlib}.internal_predict_mlp( + {coeff}, + {independent_varname}::DOUBLE PRECISION[], + {is_classification}, + {activation_function}, + {layer_sizes}, + {is_response} + )) as {pred_name} + FROM {data_table} + """ + else: + summary_query = """ + SELECT classes FROM {0} + """.format(summary_table) + classes = plpy.execute(summary_query)[0]['classes'] + if pred_type == "response": + # This join is to recover the class name from the summary table, + # as prediction just returns an index + classes_with_index_table = unique_string() + classes_table = unique_string() + sql = header + """ + SELECT + q.{id_col_name} + ,(ARRAY{classes})[pred_idx[1]+1] as {pred_name} + FROM ( + SELECT + {id_col_name}, + {schema_madlib}.internal_predict_mlp( + {coeff}::DOUBLE PRECISION[], + {independent_varname}::DOUBLE PRECISION[], + {is_classification}, + {activation_function}, + {layer_sizes}, + {is_response} + ) + as pred_idx + FROM {data_table} + ) q + """ + else: + # Incomplete + intermediate_col = unique_string() + score_format = ',\n'.join([ + 'CAST({interim}[{j}] as DOUBLE PRECISION) as "estimated_prob_{c_str}"'. 
+ format(j=i + 1, c_str=str(c).strip(' "'), + interim=intermediate_col) + for i, c in enumerate(classes)]) + sql = header + """ + SELECT + {id_col_name}, + {score_format} + FROM ( + SELECT {id_col_name}, + {schema_madlib}.internal_predict_mlp( + {coeff}::DOUBLE PRECISION[], + {independent_varname}::DOUBLE PRECISION[], + {is_classification}, + {activation_function}, + {layer_sizes}, + {is_response} + )::TEXT[] + AS {intermediate_col} + FROM {data_table} + ) q + """ + sql = sql.format(**locals()) + plpy.execute(sql) +# ---------------------------------------------------------------------- + + +def mlp_help(schema_madlib, message, is_classification): + method = 'mlp_classification' if is_classification else 'mlp_regression' + int_types = ['integer', 'smallint', 'bigint'] + text_types = ['text', 'varchar', 'character varying', 'char', 'character'] + boolean_types = ['boolean'] + supported_types = " " * 33 + ", ".join(text_types) + "\n" +\ + " " * 33 + ", ".join(int_types + boolean_types) + label_description_classification = "Name of a column which specifies label.\n" +\ + " " * 33 + "Supported types are:\n" + supported_types + label_description_regression = "Dependent variable. May be an array for multiple\n" +\ + " " * 33 + "regression or the name of a column which is any\n" + " " * 33 +\ + "numeric type for single regression" + label_description = label_description_classification if is_classification\ + else label_description_regression + args = dict(schema_madlib=schema_madlib, method=method, + label_description=label_description) + + summary = """ + ---------------------------------------------------------------- + SUMMARY + ---------------------------------------------------------------- + Multilayer Perceptron (MLP) is a model for regression and + classification + + Also called "vanilla neural networks", they consist of several + fully connected hidden layers with non-linear activation + functions. 
+ + For more details on function usage: + SELECT {schema_madlib}.{method}('usage') + + For a small example on using the function: + SELECT {schema_madlib}.{method}('example')""".format(**args) + + usage = """ + --------------------------------------------------------------------------- + USAGE + --------------------------------------------------------------------------- + SELECT {schema_madlib}.{method}( + source_table, -- name of input table + output_table, -- name of output model table + independent_varname, -- name of independent variable + dependent_varname, -- {label_description} + hidden_layer_sizes, -- Array of integers indicating the + number of hidden units per layer. + Length equal to the number of hidden layers. + optimizer_params, -- optional, default NULL + parameters for optimization in + a comma-separated string of key-value pairs. + + step_size DOUBLE PRECISION, -- Default: 0.001 + Learning rate + n_iterations INTEGER, -- Default: 100 + Number of iterations per try + n_tries INTEGER, -- Default: 1 + Total number of training cycles, + with random initializations to avoid + local minima. + tolerance DOUBLE PRECISION, -- Default: 0.001 + If the distance in loss between + two iterations is less than the + tolerance training will stop, even if + n_iterations has not been reached + + activation -- optional, default: 'sigmoid'. + supported activations: 'relu', 'sigmoid', + and 'tanh' + ); + + + --------------------------------------------------------------------------- + OUTPUT + --------------------------------------------------------------------------- + The model table produced by MLP contains the following columns: + + coeffs -- Flat array containing the weights of the neural net + + loss -- The total loss over the training data. 
Cross entropy + for classification and MSE for regression + + num_iterations -- The total number of training iterations + + """.format(**args) + + regression_example = """ + - Create input table + + CREATE TABLE lin_housing_wi (id serial, x float8[], grp_by_col int, y float8); + COPY lin_housing_wi (x, grp_by_col, y) FROM STDIN NULL '?' DELIMITER '|'; + {1,0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,396.90,4.98} | 1 | 24.00 + {1,0.02731,0.00,7.070,0,0.4690,6.4210,78.90,4.9671,2,242.0,17.80,396.90,9.14} | 1 | 21.60 + {1,0.02729,0.00,7.070,0,0.4690,7.1850,61.10,4.9671,2,242.0,17.80,392.83,4.03} | 1 | 34.70 + {1,0.03237,0.00,2.180,0,0.4580,6.9980,45.80,6.0622,3,222.0,18.70,394.63,2.94} | 1 | 33.40 + {1,0.06905,0.00,2.180,0,0.4580,7.1470,54.20,6.0622,3,222.0,18.70,396.90,5.33} | 1 | 36.20 + {1,0.02985,0.00,2.180,0,0.4580,6.4300,58.70,6.0622,3,222.0,18.70,394.12,5.21} | 1 | 28.70 + {1,0.08829,12.50,7.870,0,0.5240,6.0120,66.60,5.5605,5,311.0,15.20,395.60,12.43} | 1 | 22.90 + {1,0.14455,12.50,7.870,0,0.5240,6.1720,96.10,5.9505,5,311.0,15.20,396.90,19.15} | 1 | 27.10 + {1,0.21124,12.50,7.870,0,0.5240,5.6310,100.00,6.0821,5,311.0,15.20,386.63,29.93} | 1 | 16.50 + {1,0.17004,12.50,7.870,0,0.5240,6.0040,85.90,6.5921,5,311.0,15.20,386.71,17.10} | 1 | 18.90 + {1,0.22489,12.50,7.870,0,0.5240,6.3770,94.30,6.3467,5,311.0,15.20,392.52,20.45} | 1 | 15.00 + {1,0.11747,12.50,7.870,0,0.5240,6.0090,82.90,6.2267,5,311.0,15.20,396.90,13.27} | 1 | 18.90 + {1,0.09378,12.50,7.870,0,0.5240,5.8890,39.00,5.4509,5,311.0,15.20,390.50,15.71} | 1 | 21.70 + \. + + - Generate a multilayer perception with a two hidden layers of 5 units + each. Use the x column as the independent variables, and use the class + column as the classification. Set the tolerance to 0 so that 300 + iterations will be run. Use a sigmoid activation function. + The model will be written to mlp_regress_result. 
+ + SELECT mlp_regression( + 'lin_housing_wi', -- Source table + 'mlp_regress_result', -- Desination table + 'x', -- Independent variable + 'y', -- Dependent variable + ARRAY[5,5], -- Number of hidden units per layer + 'step_size=0.007, + n_iterations=300, + tolerance=0', + 'sigmoid'); -- Activation + + """ + + classification_example = """ + -- Create input table + + CREATE TABLE iris_data( + id integer, + attributes numeric[], + class_text varchar, + class integer + ); + + INSERT INTO iris_data VALUES + (1,ARRAY[5.1,3.5,1.4,0.2],'Iris-setosa',1), + (2,ARRAY[4.9,3.0,1.4,0.2],'Iris-setosa',1), + (3,ARRAY[4.7,3.2,1.3,0.2],'Iris-setosa',1), + (4,ARRAY[4.6,3.1,1.5,0.2],'Iris-setosa',1), + (5,ARRAY[5.0,3.6,1.4,0.2],'Iris-setosa',1), + (6,ARRAY[5.4,3.9,1.7,0.4],'Iris-setosa',1), + (7,ARRAY[4.6,3.4,1.4,0.3],'Iris-setosa',1), + (8,ARRAY[5.0,3.4,1.5,0.2],'Iris-setosa',1), + (9,ARRAY[4.4,2.9,1.4,0.2],'Iris-setosa',1), + (10,ARRAY[4.9,3.1,1.5,0.1],'Iris-setosa',1), + (11,ARRAY[7.0,3.2,4.7,1.4],'Iris-versicolor',2), + (12,ARRAY[6.4,3.2,4.5,1.5],'Iris-versicolor',2), + (13,ARRAY[6.9,3.1,4.9,1.5],'Iris-versicolor',2), + (14,ARRAY[5.5,2.3,4.0,1.3],'Iris-versicolor',2), + (15,ARRAY[6.5,2.8,4.6,1.5],'Iris-versicolor',2), + (16,ARRAY[5.7,2.8,4.5,1.3],'Iris-versicolor',2), + (17,ARRAY[6.3,3.3,4.7,1.6],'Iris-versicolor',2), + (18,ARRAY[4.9,2.4,3.3,1.0],'Iris-versicolor',2), + (19,ARRAY[6.6,2.9,4.6,1.3],'Iris-versicolor',2), + (20,ARRAY[5.2,2.7,3.9,1.4],'Iris-versicolor',2); + + + -- Generate a multilayer perception with a single hidden layer of 5 units. + Use the attributes column as the independent variables, and use the class + column as the classification. Set the tolerance to 0 so that 1000 + iterations will be run. Use a hyperbolic tangent activation function. + The model will be written to mlp_result. 
+ + SELECT {schema_madlib}.mlp_classification( + 'iris_data', -- Source table + 'mlp_model', -- Destination table + 'attributes', -- Input features + 'class_text', -- Label + ARRAY[5], -- Number of units per layer + 'step_size=0.003, + n_iterations=5000, + tolerance=0', -- Optimizer params + 'tanh'); -- Activation function + + """.format(**args) + example = classification_example if is_classification else regression_example + if not message: + return summary + elif message.lower() in ('usage', 'help', '?'): + return usage + elif message.lower() == 'example': + return example + return """ + No such option. Use "SELECT {schema_madlib}.{method}()" for help. + """.format(**args) + + +def mlp_predict_help(schema_madlib, message): + args = dict(schema_madlib=schema_madlib) + + summary = """ + ---------------------------------------------------------------- + SUMMARY + ---------------------------------------------------------------- + Multilayer Perceptron (MLP) is a model for regression and + classification + + Also called "vanilla neural networks", they consist of several + fully connected hidden layers with non-linear activation + functions. + + For more details on function usage: + SELECT {schema_madlib}.mlp_predict('usage') + + For a small example on using the function: + SELECT {schema_madlib}.mlp_predict('example')""".format(**args) + + usage = """ + --------------------------------------------------------------------------- + USAGE + --------------------------------------------------------------------------- + SELECT {schema_madlib}.mlp_predict( + model_table, -- name of model table + data_table, -- name of data table + id_col_name, -- id column for data table + output_table, -- name of output table + pred_type -- the type of output requested: + -- 'response' gives the actual prediction, + -- 'prob' gives the probability of each class. + -- for regression, only type='response' is defined. 
+ ); + + + --------------------------------------------------------------------------- + OUTPUT + --------------------------------------------------------------------------- + The model table produced by mlp contains the following columns: + + id -- The provided id for the given input vector + + estimated_<COL_NAME> -- (For pred_type='response') The estimated class + for classification or value for regression, where + <COL_NAME> is the name of the column to be + predicted from training data + + prob_<CLASS> -- (For pred_type='prob' for classification) The + probability of a given class <CLASS> as given by + softmax. There will be one column for each class + in the training data. + + """.format(**args) + + example = """ + -- See {schema_madlib}.mlp_classification('example') for test + -- and model tables + + -- Predict classes using + SELECT {schema_madlib}.mlp_predict( + 'mlp_model', -- Model table + 'iris_data', -- Test data table + 'id', -- Id column in test table + 'mlp_prediction', -- Output table for predictions + 'response' -- Output classes, not probabilities + ); + + SELECT * FROM mlp_prediction; + + WITH total_count AS (SELECT count(*) AS c FROM iris_data) + SELECT count(*)/((SELECT c FROM total_count)::DOUBLE PRECISION) + AS train_accuracy + FROM + ( + SELECT iris_data.class_text AS actual_label, + mlp_prediction.estimated_class_text AS predicted_label + FROM mlp_prediction + INNER JOIN iris_data ON iris_data.id=mlp_prediction.id + ) q + WHERE q.actual_label=q.predicted_label; + """.format(**args) + + if not message: + return summary + elif message.lower() in ('usage', 'help', '?'): + return usage + elif message.lower() == 'example': + return example + return """ + No such option. Use "SELECT {schema_madlib}.mlp_predict()" for help. + """.format(**args)