http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/4fcb60ed/src/ports/postgres/modules/convex/mlp.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/convex/mlp.sql_in b/src/ports/postgres/modules/convex/mlp.sql_in new file mode 100644 index 0000000..400f892 --- /dev/null +++ b/src/ports/postgres/modules/convex/mlp.sql_in @@ -0,0 +1,752 @@ +/* ----------------------------------------------------------------------- *//** + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ * + * + * @file mlp.sql_in + * + * @brief SQL functions for multilayer perceptron + * @date June 2012 + * + * + *//* ----------------------------------------------------------------------- */ + +m4_include(`SQLCommon.m4') + +/** +@addtogroup grp_mlp + +<div class="toc"><b>Contents</b><ul> +<li class="level1"><a href="#mlp_classification">Classification</a></li> +<li class="level1"><a href="#mlp_regression">Regression</a></li> +<li class="level1"><a href="#optimizer_params">Optimizer Parameters</a></li> +<li class="level1"><a href="#predict">Prediction Functions</a></li> +<li class="level1"><a href="#example">Examples</a></li> +<li class="level1"><a href="#background">Technical Background</a></li> +<li class="level1"><a href="#literature">Literature</a></li> +<li class="level1"><a href="#related">Related Topics</a></li> +</ul></div> + +Multilayer Perceptron (MLP) is a model for regression and +classification. + +Also called "vanilla neural networks", they consist of several +fully connected hidden layers with non-linear activation +functions. In the case of classification, the final layer of the +neural net has as many nodes as classes, and the output of the +neural net can be interpreted as the probability of a given input +feature belonging to a specific class. + + +@brief Solves classification and regression problems with several +fully connected layers and nonlinear activation functions. + +@anchor mlp_classification +@par Classification Training Function +The mlp classification training function has the following format: +<pre class="syntax"> +mlp_classification( + source_table, + output_table, + independent_varname, + dependent_varname, + hidden_layer_sizes, + optimizer_params, + activation + ) +</pre> +\b Arguments +<DL class="arglist"> + <DT>source_table</DT> + <DD>TEXT. Name of the table containing the training data.</DD> + + <DT>output_table</DT> + <DD>TEXT. Name of the output table containing the model. 
Details of the output + tables are provided below. + </DD> + + <DT>independent_varname</DT> + <DD>TEXT. Expression list to evaluate for the + independent variables. An intercept variable should not be included as part + of this expression. Please note that the expression should be able to be cast + to DOUBLE PRECISION[]. + </DD> + + <DT>dependent_varname</DT> + <DD> TEXT. Name of the dependent variable column. For classification, supported types are: + text, varchar, character varying, char, character, + integer, smallint, bigint, and boolean. </DD> + + <DT>hidden_layer_sizes (optional)</DT> + <DD>INTEGER[], default: ARRAY[]. + The number of neurons in each hidden layer. The length of this array will + determine the number of hidden layers. Empty for no hidden layers. + </DD> + + + <DT>optimizer_params (optional)</DT> + <DD>TEXT, default: NULL. + Parameters for optimization in a comma-separated string + of key-value pairs. See the description below for details. + </DD> + + <DT>activation (optional)</DT> + <DD>TEXT, default: 'sigmoid'. + Activation function. Currently three functions are supported: 'sigmoid' (default), + 'relu', and 'tanh'. The text can be any prefix of the three + strings; for e.g., activation='s' will use the sigmoid activation. + </DD> +</DL> + +<b>Output tables</b> +<br> + The model table produced by mlp contains the following columns: + <table class="output"> + <tr> + <th>coeffs</th> + <td>FLOAT8[]. Flat array containing the weights of the neural net</td> + </tr> + <tr> + <th>n_iterations</th> + <td>INTEGER. Number of iterations completed by stochastic gradient descent + algorithm. The algorithm either converged in this number of iterations + or hit the maximum number specified in the optimization parameters. </td> + </tr> + <tr> + <th>loss</th> + <td>FLOAT8. The cross entropy over the training data. 
+ See Technical Background section below for more details.</td> + </tr> + </table> + + +A summary table named \<output_table\>_summary is also created, which has the following columns: + <table class="output"> + <tr> + <th>source_table</th> + <td>The source table.</td> + </tr> + <tr> + <th>dependent_varname</th> + <td>The dependent variable.</td> + </tr> + <tr> + <th>independent_varname</th> + <td>The independent variables.</td> + </tr> + <tr> + <th>tolerance</th> + <td>The tolerance as given in optimizer_params.</td> + </tr> + <tr> + <th>step_size</th> + <td>The step size as given in optimizer_params.</td> + </tr> + <tr> + <th>n_iterations</th> + <td>The number of iterations run</td> + </tr> + <tr> + <th>n_tries</th> + <td>The number of tries as given in optimizer_params.</td> + </tr> + <tr> + <th>layer_sizes</th> + <td>The number of units in each layer including the input and output layer.</td> + </tr> + <tr> + <th>activation_function</th> + <td>The activation function.</td> + </tr> + <tr> + <th>is_classification</th> + <td>True if the model was trained for classification, False if it was trained + for regression</td> + </tr> + <tr> + <th>classes</th> + <td>The classes which were trained against (empty for regression)</td> + </tr> + + </table> + + +@anchor mlp_regression +@par Regression Training Function +The mlp regression training function has the following format: +<pre class="syntax"> +mlp_regression( + source_table, + output_table, + independent_varname, + dependent_varname, + hidden_layer_sizes, + optimizer_params, + activation + ) +</pre> + +\b Arguments + +Specifications for regression are largely the same as for classification. In the +model table, the loss will refer to mean square error instead of cross entropy. In the +summary table, the classes column will be empty. The following +arguments have specifications which differ from mlp_classification: +<DL class="arglist"> +<DT>dependent_varname</DT> + <DD>TEXT. 
Name of the dependent variable column. + For regression, supported types are any numeric type, or array + of numeric types (for multiple regression). + </DD> +</DL> + + +@anchor optimizer_params +@par Optimizer Parameters +Parameters in this section are supplied in the \e optimizer_params argument as a string +containing a comma-delimited list of name-value pairs. All of these named +parameters are optional, and their order does not matter. You must use the +format "<param_name> = <value>" to specify the value of a parameter, otherwise +the parameter is ignored. + + +<pre class="syntax"> + 'step_size = <value>, + n_iterations = <value>, + n_tries = <value>, + tolerance = <value>' +</pre> +\b Optimizer Parameters +<DL class="arglist"> + +<DT>step_size</dt> +<DD>Default: [0.001]. +Also known as the learning rate. A small value is usually desirable to +ensure convergence, while a large value provides more room for progress during +training. Since the best value depends on the condition number of the data, in +practice one often tunes this parameter. +</DD> + + +<DT>n_iterations</dt> +<DD>Default: [100]. The maximum number of iterations allowed. +</DD> +<DT>n_tries</dt> +<DD>Default: [1]. Number of times to retrain the network with randomly initialized +weights +</DD> + +<DT>tolerance</dt> +<DD>Default: 0.001. The criterion to end iterations. The training stops whenever +the difference between the training models of two consecutive iterations is +smaller than \e tolerance or the iteration number is larger than \e n_iterations. +</DD> + +</DL> + +@anchor predict +@par Prediction Function +Used to generate predictions given a previously trained model on novel data. +The same syntax is used for classification and regression. +<pre class="syntax"> +mlp_predict(model_table, + data_table, + id_col_name, + output_table, + pred_type) +</pre> + +\b Arguments +<DL class="arglist"> + <DT>model_table</DT> + <DD>TEXT. 
Model table produced by the training function.</DD> + + <DT>data_table</DT> + <DD>TEXT. Name of the table containing the data for prediction. This table is expected + to contain the same input features that were used during training. The table should + also contain id_col_name used for identifying each row.</DD> + + <DT>id_col_name</DT> + <DD>TEXT. The name of the id column in the input table.</DD> + + <DT>output_table</DT> + <DD>TEXT. Name of the table where output predictions are written. If this +table name is already in use, then an error is returned. Table contains:</DD> + <table class="output"> + <tr> + <th>id</th> + <td>Gives the 'id' for each prediction, corresponding to each row from the data_table.</td> + </tr> + <tr> + <th>estimated_<COL_NAME></th> + <td> + (For pred_type='response') The estimated class + for classification or value for regression, where + <COL_NAME> is the name of the column to be + predicted from training data + </td> + </tr> + <tr> + <th>prob_<CLASS></th> + <td> + (For pred_type='prob' for classification) The + probability of a given class <CLASS> as given by + softmax. There will be one column for each class + in the training data. + </td> + </tr> + </table> + + <DT>pred_type</DT> + <DD>TEXT. + +The type of output requested: +'response' gives the actual prediction, +'prob' gives the probability of each class. +For regression, only pred_type='response' is defined.</DD> +</DL> + +@anchor example +@par Examples +-# Create an input data set. 
+<pre class="example"> +CREATE TABLE iris_data( + id integer, + attributes numeric[], + class_text varchar, + class integer +); +INSERT INTO iris_data VALUES +(1,ARRAY[5.1,3.5,1.4,0.2],'Iris-setosa',1), +(2,ARRAY[4.9,3.0,1.4,0.2],'Iris-setosa',1), +(3,ARRAY[4.7,3.2,1.3,0.2],'Iris-setosa',1), +(4,ARRAY[4.6,3.1,1.5,0.2],'Iris-setosa',1), +(5,ARRAY[5.0,3.6,1.4,0.2],'Iris-setosa',1), +(6,ARRAY[5.4,3.9,1.7,0.4],'Iris-setosa',1), +(7,ARRAY[4.6,3.4,1.4,0.3],'Iris-setosa',1), +(8,ARRAY[5.0,3.4,1.5,0.2],'Iris-setosa',1), +(9,ARRAY[4.4,2.9,1.4,0.2],'Iris-setosa',1), +(10,ARRAY[4.9,3.1,1.5,0.1],'Iris-setosa',1), +(11,ARRAY[7.0,3.2,4.7,1.4],'Iris-versicolor',2), +(12,ARRAY[6.4,3.2,4.5,1.5],'Iris-versicolor',2), +(13,ARRAY[6.9,3.1,4.9,1.5],'Iris-versicolor',2), +(14,ARRAY[5.5,2.3,4.0,1.3],'Iris-versicolor',2), +(15,ARRAY[6.5,2.8,4.6,1.5],'Iris-versicolor',2), +(16,ARRAY[5.7,2.8,4.5,1.3],'Iris-versicolor',2), +(17,ARRAY[6.3,3.3,4.7,1.6],'Iris-versicolor',2), +(18,ARRAY[4.9,2.4,3.3,1.0],'Iris-versicolor',2), +(19,ARRAY[6.6,2.9,4.6,1.3],'Iris-versicolor',2), +(20,ARRAY[5.2,2.7,3.9,1.4],'Iris-versicolor',2); +</pre> +-# Generate a multilayer perceptron with a single hidden layer of 5 units. +Use the attributes column as the independent variables, and use the class +column as the classification. Set the tolerance to 0 so that 5000 +iterations will be run. Use a hyperbolic tangent activation function. +The model will be written to mlp_model. +<pre class="example"> +DROP TABLE IF EXISTS mlp_model; +DROP TABLE IF EXISTS mlp_model_summary; +SELECT madlib.mlp_classification( + 'iris_data', -- Source table + 'mlp_model', -- Destination table + 'attributes', -- Input features + 'class_text', -- Label + ARRAY[5], -- Number of units per layer + 'step_size=0.003, + n_iterations=5000, + tolerance=0', -- Optimizer params + 'tanh'); -- Activation function +</pre> +-# View the result for the model. 
+<pre class="example"> +-- Set extended display on for easier reading of output +\\x ON +-- Neural net Initialization is non-deterministic, so your results may vary +SELECT * FROM mlp_model; +</pre> +Result: +<pre class="result"> +-[ RECORD 1 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +coeff | {1,1,1,1,1,0.136374930803,0.188739676875,0.662387810001,-1.03381622734,-0.469961067046,0.0614006983397,0.0811504589436,0.299008228258,-0.47391918521,-0.215098143699,0.10519213944,0.145844617525,0.511683525606,-0.800215552382,-0.36417142683,0.120751709056,0.167531106521,0.587074895969,-0.916946198095,-0.417055067449,0.0539541885146,0.0694359704131,0.262598585854,-0.419234805076,-0.189915344282,1,1,1,1,1,1,0.105645702152,1.46247470474,0.484457903226,0.965962824478,1.19361986431,0.419805760087,-0.105696503487,-1.46245956666,-0.484427811691,-0.965730981426,-1.19365280555,-0.419973628863} +loss | 0.0184092375519 +num_iterations | 5000 +</pre> +-# Next train a regression example. First create some test data. This dataset +contains housing prices data. +<pre class="example"> +CREATE TABLE lin_housing (id serial, x float8[], grp_by_col int, y float8); +COPY lin_housing (x, grp_by_col, y) FROM STDIN NULL '?' 
DELIMITER '|'; +{1,0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,396.90,4.98}|1|24.00 +{1,0.02731,0.00,7.070,0,0.4690,6.4210,78.90,4.9671,2,242.0,17.80,396.90,9.14}|1|21.60 +{1,0.02729,0.00,7.070,0,0.4690,7.1850,61.10,4.9671,2,242.0,17.80,392.83,4.03}|1|34.70 +{1,0.03237,0.00,2.180,0,0.4580,6.9980,45.80,6.0622,3,222.0,18.70,394.63,2.94}|1|33.40 +{1,0.06905,0.00,2.180,0,0.4580,7.1470,54.20,6.0622,3,222.0,18.70,396.90,5.33}|1|36.20 +{1,0.02985,0.00,2.180,0,0.4580,6.4300,58.70,6.0622,3,222.0,18.70,394.12,5.21}|1|28.70 +{1,0.08829,12.50,7.870,0,0.5240,6.0120,66.60,5.5605,5,311.0,15.20,395.60,12.43}|1|22.90 +{1,0.14455,12.50,7.870,0,0.5240,6.1720,96.10,5.9505,5,311.0,15.20,396.90,19.15}|1|27.10 +{1,0.21124,12.50,7.870,0,0.5240,5.6310,100.00,6.0821,5,311.0,15.20,386.63,29.93}|1|16.50 +{1,0.17004,12.50,7.870,0,0.5240,6.0040,85.90,6.5921,5,311.0,15.20,386.71,17.10}|1|18.90 +{1,0.22489,12.50,7.870,0,0.5240,6.3770,94.30,6.3467,5,311.0,15.20,392.52,20.45}|1|15.00 +{1,0.11747,12.50,7.870,0,0.5240,6.0090,82.90,6.2267,5,311.0,15.20,396.90,13.27}|1|18.90 +{1,0.09378,12.50,7.870,0,0.5240,5.8890,39.00,5.4509,5,311.0,15.20,390.50,15.71}|1|21.70 +{1,0.62976,0.00,8.140,0,0.5380,5.9490,61.80,4.7075,4,307.0,21.00,396.90,8.26}|1|20.40 +{1,0.63796,0.00,8.140,0,0.5380,6.0960,84.50,4.4619,4,307.0,21.00,380.02,10.26}|1|18.20 +{1,0.62739,0.00,8.140,0,0.5380,5.8340,56.50,4.4986,4,307.0,21.00,395.62,8.47}|1|19.90 +{1,1.05393,0.00,8.140,0,0.5380,5.9350,29.30,4.4986,4,307.0,21.00,386.85,6.58}|1| 23.10 +{1,0.78420,0.00,8.140,0,0.5380,5.9900,81.70,4.2579,4,307.0,21.00,386.75,14.67}|1|17.50 +{1,0.80271,0.00,8.140,0,0.5380,5.4560,36.60,3.7965,4,307.0,21.00,288.99,11.69}|1|20.20 +{1,0.72580,0.00,8.140,0,0.5380,5.7270,69.50,3.7965,4,307.0,21.00,390.95,11.28}|1|18.20 +\\. +</pre> +-# Now train a regression model using a multilayer perceptron a single hidden layer of two nodes. 
+<pre class="example"> +DROP TABLE IF EXISTS mlp_regress; +DROP TABLE IF EXISTS mlp_regress_summary; +SELECT madlib.mlp_regression( + 'lin_housing', -- Source table + 'mlp_regress', -- Desination table + 'x', -- Input features + 'y', -- Dependent variable + ARRAY[5,5], -- Number of units per layer + 'step_size=0.000007, + n_iterations=10000, + tolerance=0', + 'relu'); +</pre> +-# Check the results of the model +<pre class="example"> +-- Set extended display on for easier reading of output +\\x ON +-- Neural net Initialization is non-deterministic, so your results may vary +SELECT * FROM mlp_regress; +</pre> +Result: +<pre class="result"> +-[ RECORD 1 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ----------------------------------- +coeff | 
{1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2.79506311399e-05,3.56715008915e-05,-6.09333559685e-05,0.000251228318768,-0.000224772841379,-3.71863030857e-05,-3.5757865148e-06,5.27936784854e-05,-2.48474166186e-05,6.19731184294e-05,3.07638968743e-05,6.8964698578e-06,0.000106016701083,-1.71484730318e-05,1.18691881812e-05,-0.000163975464208,0.000170026304906,3.11688265279e-05,0.000177050148787,-1.58265976603e-05,2.70144422657e-05,0.000112667883422,3.77575139073e-05,8.12474658795e-05,-7.90458917626e-05,0.000107566386158,-2.63771171506e-06,2.47996880915e-05,-0.00012642310887,0.000203827391081,0.000139315565565,4.86147243454e-05,-0.000176126471913,-6.47820782916e-05,-8.51592776447e-06,-6.60601176758e-05,2.91421874156e-05,6.3556873752e-05,0.000197557443129,0.000220531367259,0.000135036310289,0.000143735913975,-4.75034117786e-05,-0.000179547345838,-1.6919846786e-05,0.000162784312994,0.000268595819851,-0.000460066553287,8.69756071591e-05,-0.00311762727057,0.000126024763103,0.000205988242921 ,0.003463432426,-0.00729789075286,0.00151625867549,-0.000890852767597,-0.00525016037249,0.0031043106659,0.00798041103839,-0.00552693050079,0.0232180415786,0.0230489850143,-0.0437890272341,0.0165765426407,-0.248554261758,-7.81336427846e-05,0.00558145591752,0.283465844585,-0.571699956182,0.133474351994,-0.0785181945605,-0.419269930709,0.249547772912,0.631761009875,-0.431305975666,1,1,1,1,1,1,0.0158747497572,-9.02809160806e-05,0.00015574347618,4.10805373863e-06,0.00121532434965,0.101790351335,0.0647558401493,-0.00013654998677,-9.92872075948e-06,-5.5319694394e-05,0.00519320756484,0.412736586036,0.0011998026977,-1.53688189815e-05,1.94817888201e-05,-4.63111489966e-05,7.24547899029e-05,0.00880394144485,5.45309822095e-05,-0.000140943219275,-7.96211486227e-05,-1.04337307472e-05,0.000161936762028,0.00136273797767,-4.54737243585e-05,-3.4083840736e-05,3.69286883662e-05,9.9047243188e-08,3.75014011824e-06,-9.45366086368e-08,1,1,1,1,1,1,6.67488547054,0.102754199001,0.41668912471,0.00886867296479,0 
.00136206007228,-9.88642499013e-05} +loss | 144.965776158 +num_iterations | 10000 +</pre> +-# Now let's look at the prediction functions. In the following examples we will +use the training data set for prediction as well, which is not usual but serves to +show the syntax. First we will test the classification example. +The prediction is in the the estimated_class_text column with the +actual value in the class_text column. +<pre class="example"> +DROP TABLE IF EXISTS mlp_prediction; +SELECT madlib.mlp_predict( + 'mlp_model', -- Model table + 'iris_data', -- Test data table + 'id', -- Id column in test table + 'mlp_prediction', -- Output table for predictions + 'response' -- Output classes, not probabilities + ); +-# View results +<pre class="example"> +SELECT * FROM mlp_prediction JOIN iris_data USING (id); +</pre> +Result for the classification model: +<pre class="result"> + id | estimated_class_text | attributes | class_text | class +----+----------------------+-------------------+-----------------+------- + 1 | Iris-setosa | {5.1,3.5,1.4,0.2} | Iris-setosa | 1 + 2 | Iris-setosa | {4.9,3.0,1.4,0.2} | Iris-setosa | 1 + 3 | Iris-setosa | {4.7,3.2,1.3,0.2} | Iris-setosa | 1 + 4 | Iris-setosa | {4.6,3.1,1.5,0.2} | Iris-setosa | 1 + 5 | Iris-setosa | {5.0,3.6,1.4,0.2} | Iris-setosa | 1 + 6 | Iris-setosa | {5.4,3.9,1.7,0.4} | Iris-setosa | 1 + 7 | Iris-setosa | {4.6,3.4,1.4,0.3} | Iris-setosa | 1 + 8 | Iris-setosa | {5.0,3.4,1.5,0.2} | Iris-setosa | 1 + 9 | Iris-setosa | {4.4,2.9,1.4,0.2} | Iris-setosa | 1 + 10 | Iris-setosa | {4.9,3.1,1.5,0.1} | Iris-setosa | 1 + 11 | Iris-versicolor | {7.0,3.2,4.7,1.4} | Iris-versicolor | 2 + 12 | Iris-versicolor | {6.4,3.2,4.5,1.5} | Iris-versicolor | 2 + 13 | Iris-versicolor | {6.9,3.1,4.9,1.5} | Iris-versicolor | 2 + 14 | Iris-versicolor | {5.5,2.3,4.0,1.3} | Iris-versicolor | 2 + 15 | Iris-versicolor | {6.5,2.8,4.6,1.5} | Iris-versicolor | 2 + 16 | Iris-versicolor | {5.7,2.8,4.5,1.3} | Iris-versicolor | 2 + 17 | Iris-versicolor 
| {6.3,3.3,4.7,1.6} | Iris-versicolor | 2 + 18 | Iris-versicolor | {4.9,2.4,3.3,1.0} | Iris-versicolor | 2 + 19 | Iris-versicolor | {6.6,2.9,4.6,1.3} | Iris-versicolor | 2 + 20 | Iris-versicolor | {5.2,2.7,3.9,1.4} | Iris-versicolor | 2 +</pre> +Prediction using the regression model: +<pre class="example"> +DROP TABLE IF EXISTS mlp_regress_prediction; +SELECT madlib.mlp_predict( + 'mlp_regress', -- Model table + 'lin_housing', -- Test data table + 'id', -- Id column in test table + 'mlp_regress_prediction', -- Output table for predictions + 'response' -- Output values, not probabilities + ); +</pre> +-# View results +<pre class="example"> +SELECT * FROM lin_housing JOIN mlp_regress_prediction USING (id); +</pre> +Result for the regression model: +<pre class="result"> + id | x | grp_by_col | y | estimated_y +----+-------------------------------------------------------------------------+------------+------+-------------------- + 1 | {1,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98} | 1 | 24 | {23.2627062018087} + 2 | {1,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14} | 1 | 21.6 | {25.7088419115781} + 3 | {1,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03} | 1 | 34.7 | {27.5587003901404} + 4 | {1,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94} | 1 | 33.4 | {31.1812237427816} + 5 | {1,0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33} | 1 | 36.2 | {30.3696873085477} + 6 | {1,0.02985,0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21} | 1 | 28.7 | {29.5290259241882} + 7 | {1,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43} | 1 | 22.9 | {21.1576051716888} + 8 | {1,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15} | 1 | 27.1 | {17.6194200563055} + 9 | {1,0.21124,12.5,7.87,0,0.524,5.631,100,6.0821,5,311,15.2,386.63,29.93} | 1 | 16.5 | {15.1366297774139} +10 | {1,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1} | 1 | 18.9 | 
{17.6528662199369} +11 | {1,0.22489,12.5,7.87,0,0.524,6.377,94.3,6.3467,5,311,15.2,392.52,20.45} | 1 | 15 | {17.2017487668181} +12 | {1,0.11747,12.5,7.87,0,0.524,6.009,82.9,6.2267,5,311,15.2,396.9,13.27} | 1 | 18.9 | {19.4893860319992} +13 | {1,0.09378,12.5,7.87,0,0.524,5.889,39,5.4509,5,311,15.2,390.5,15.71} | 1 | 21.7 | {23.2917226708039} +14 | {1,0.62976,0,8.14,0,0.538,5.949,61.8,4.7075,4,307,21,396.9,8.26} | 1 | 20.4 | {22.8904812605193} +15 | {1,0.63796,0,8.14,0,0.538,6.096,84.5,4.4619,4,307,21,380.02,10.26} | 1 | 18.2 | {18.2386754423677} +16 | {1,0.62739,0,8.14,0,0.538,5.834,56.5,4.4986,4,307,21,395.62,8.47} | 1 | 19.9 | {23.28949550874} +17 | {1,1.05393,0,8.14,0,0.538,5.935,29.3,4.4986,4,307,21,386.85,6.58} | 1 | 23.1 | {25.3288762085473} +18 | {1,0.7842,0,8.14,0,0.538,5.99,81.7,4.2579,4,307,21,386.75,14.67} | 1 | 17.5 | {19.0203738118451} +19 | {1,0.80271,0,8.14,0,0.538,5.456,36.6,3.7965,4,307,21,288.99,11.69} | 1 | 20.2 | {12.3162005347545} +20 | {1,0.7258,0,8.14,0,0.538,5.727,69.5,3.7965,4,307,21,390.95,11.28} | 1 | 18.2 | {21.0902211848747} +</pre> +Note that the results you get for all examples may vary with the platform you are using. + +@anchor background +@par Technical Background + +To train a neural net, the respective loss function is minimized using stochastic gradient descent. +In the case of classification, the loss function is cross entropy. For regression, mean square error +is used. Weights in the neural net are updated via the backpropogation process, which uses dynamic +programming to compute the partial derivative of each weight with respect to the overall loss. This +partial derivative incorporates the respective activation function used, so this requires that the +activation function be differentiable. + +For an overview of multilayer perceptrons, see website [1]. + +For details on backpropogation, see the notes at [2]. + +@anchor literature +@literature + +@anchor mlp-lit-1 +[1] "Multilayer Perceptron." Wikipedia. 
Wikimedia Foundation,
    12 July 2017. Web. 12 July 2017.

[2] Yu Hen Hu. "Lecture 11. MLP (III): Back-Propagation."
    University of Wisconsin Madison: Computer-Aided Engineering. Web. 12 July 2017,
    http://homepages.cae.wisc.edu/~ece539/videocourse/notes/pdf/lec%2011%20MLP%20(3)%20BP.pdf

@anchor related
@par Related Topics

File mlp.sql_in documenting the training function

*/


-- Composite result produced by the IGD optimizer: the flat array of
-- network weights plus the loss value over the training data.
CREATE TYPE MADLIB_SCHEMA.mlp_result AS (
    coeff DOUBLE PRECISION[],
    loss DOUBLE PRECISION
);

--------------------------------------------------------------------------
-- create SQL functions for IGD optimizer
--------------------------------------------------------------------------
-- Transition function of the mlp_igd_step aggregate: folds one training
-- row into the running optimizer state. Implemented in C (MODULE_PATHNAME).
-- Deliberately not STRICT: the first call receives the INITCOND state and
-- NULL-handling is done by the C implementation.
CREATE FUNCTION MADLIB_SCHEMA.mlp_igd_transition(
        state              DOUBLE PRECISION[],  -- running IGD state
        start_vec          DOUBLE PRECISION[],  -- independent-variable row
        end_vec            DOUBLE PRECISION[],  -- dependent-variable row
        previous_state     DOUBLE PRECISION[],  -- state from the previous iteration
        layer_sizes        DOUBLE PRECISION[],  -- units per layer, incl. input/output
        stepsize           DOUBLE PRECISION,    -- learning rate
        activation         INTEGER,             -- numeric code for the activation function
        is_classification  INTEGER)             -- nonzero for classification, 0 for regression
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE;

-- Merge function: combines two partial IGD states (used when the aggregate
-- runs distributed on Greenplum via the prefunc registration below).
CREATE FUNCTION MADLIB_SCHEMA.mlp_igd_merge(
        state1 DOUBLE PRECISION[],
        state2 DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;

-- Final function: converts the accumulated state into the per-iteration
-- state array handed back to the Python driver.
CREATE FUNCTION MADLIB_SCHEMA.mlp_igd_final(
        state DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;

/**
 * @internal
 * @brief Perform one iteration of backprop
 */
CREATE AGGREGATE MADLIB_SCHEMA.mlp_igd_step(
        /* start_vec         */ DOUBLE PRECISION[],
        /* end_vec           */ DOUBLE PRECISION[],
        /* previous_state    */ DOUBLE PRECISION[],
        /* layer_sizes       */ DOUBLE PRECISION[],
        /* stepsize          */ DOUBLE PRECISION,
        /* activation        */ INTEGER,
        /* is_classification */ INTEGER )(
    STYPE=DOUBLE PRECISION[],
    SFUNC=MADLIB_SCHEMA.mlp_igd_transition,
    -- On Greenplum, register the merge function so partial states computed
    -- on separate segments can be combined.
    m4_ifdef(`GREENPLUM',`prefunc=MADLIB_SCHEMA.mlp_igd_merge,')
    FINALFUNC=MADLIB_SCHEMA.mlp_igd_final,
    -- Initial aggregate state; interpreted and expanded by the C
    -- transition function on first call.
    INITCOND='{0,0,0,0,0,0,0,0}'
);
-------------------------------------------------------------------------

-- Distance between two IGD states; used by the driver's convergence test
-- (training stops once the distance falls below 'tolerance').
CREATE FUNCTION MADLIB_SCHEMA.internal_mlp_igd_distance(
    /*+ state1 */ DOUBLE PRECISION[],
    /*+ state2 */ DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION AS
'MODULE_PATHNAME'
LANGUAGE c IMMUTABLE STRICT;

-- Unpacks a final IGD state into the (coeff, loss) mlp_result type.
CREATE FUNCTION MADLIB_SCHEMA.internal_mlp_igd_result(
    /*+ state */ DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.mlp_result AS
'MODULE_PATHNAME'
LANGUAGE c IMMUTABLE STRICT;
-------------------------------------------------------------------------

-- Train an MLP classifier. Thin SQL wrapper that delegates to the Python
-- driver mlp_igd.mlp() with is_classification=True; results are written to
-- output_table and <output_table>_summary.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
    source_table VARCHAR,         -- table holding the training data
    output_table VARCHAR,         -- model output table name
    independent_varname VARCHAR,  -- expression castable to DOUBLE PRECISION[]
    dependent_varname VARCHAR,    -- label column
    hidden_layer_sizes INTEGER[], -- neurons per hidden layer
    optimizer_params VARCHAR,     -- 'key=value, ...' optimizer settings
    activation VARCHAR            -- 'sigmoid' | 'relu' | 'tanh' (any prefix)
) RETURNS VOID AS $$
    PythonFunctionBodyOnly(`convex', `mlp_igd')
    mlp_igd.mlp(
        schema_madlib,
        source_table,
        output_table,
        independent_varname,
        dependent_varname,
        hidden_layer_sizes,
        optimizer_params,
        activation,
        True
    )
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

-- Train an MLP regression model. Identical wrapper to mlp_classification
-- except it calls the Python driver with is_classification=False.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
    source_table VARCHAR,
    output_table VARCHAR,
    independent_varname VARCHAR,
    dependent_varname VARCHAR,
    hidden_layer_sizes INTEGER[],
    optimizer_params VARCHAR,
    activation VARCHAR
) RETURNS VOID AS $$
    PythonFunctionBodyOnly(`convex', `mlp_igd')
    mlp_igd.mlp(
        schema_madlib,
        source_table,
        output_table,
        independent_varname,
        dependent_varname,
        hidden_layer_sizes,
        optimizer_params,
        activation,
        False
    )
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

-- Score new rows with a previously trained model; delegates to the Python
-- driver mlp_igd.mlp_predict(), which writes predictions to output_table.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_predict(
    model_table VARCHAR,   -- model table produced by training
    data_table VARCHAR,    -- rows to score; must contain id_col_name
    id_col_name VARCHAR,   -- name of the id column in data_table
    output_table VARCHAR,  -- table to create with the predictions
    pred_type VARCHAR      -- 'response' or 'prob'
) RETURNS VOID AS $$
    PythonFunctionBodyOnly(`convex', `mlp_igd')
    mlp_igd.mlp_predict(
        schema_madlib,
        model_table,
        data_table,
        id_col_name,
        output_table,
        pred_type)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

-- Low-level per-row prediction, implemented in C.
-- NOTE(review): is_classification and activation_function are declared
-- DOUBLE PRECISION although they appear to carry integer codes -- confirm
-- against the C implementation before changing the signature.
CREATE FUNCTION MADLIB_SCHEMA.internal_predict_mlp(
        coeff DOUBLE PRECISION[],               -- trained weights
        independent_varname DOUBLE PRECISION[], -- feature vector for one row
        is_classification DOUBLE PRECISION,
        activation_function DOUBLE PRECISION,
        layer_sizes DOUBLE PRECISION[],
        is_response INTEGER                     -- 1: class/value, 0: probabilities
    )
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;


-- Online help: mlp_classification('<message>') returns a usage string
-- generated by the Python driver.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
    message TEXT
) RETURNS TEXT AS $$
    PythonFunctionBodyOnly(`convex', `mlp_igd')
    return mlp_igd.mlp_help(schema_madlib,message,True)
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');

-- Zero-argument help variant: equivalent to passing a NULL message.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification()
RETURNS TEXT AS $$
    SELECT MADLIB_SCHEMA.mlp_classification(NULL::TEXT)
$$ LANGUAGE SQL IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');

-- Online help for the regression trainer.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression(
    message TEXT
) RETURNS TEXT AS $$
    PythonFunctionBodyOnly(`convex', `mlp_igd')
    return mlp_igd.mlp_help(schema_madlib,message,False)
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');

-- Zero-argument help variant: equivalent to passing a NULL message.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_regression()
RETURNS TEXT AS $$
    SELECT MADLIB_SCHEMA.mlp_regression(NULL::TEXT)
$$ LANGUAGE SQL IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');

-- Online help for the prediction function.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_predict(
    message TEXT
) RETURNS TEXT AS $$
    PythonFunctionBodyOnly(`convex', `mlp_igd')
    return mlp_igd.mlp_predict_help(schema_madlib,message)
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');

-- Zero-argument help variant: equivalent to passing a NULL message.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_predict()
RETURNS TEXT AS $$
    SELECT MADLIB_SCHEMA.mlp_predict(NULL::TEXT)
$$ LANGUAGE SQL IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/4fcb60ed/src/ports/postgres/modules/convex/mlp_igd.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/convex/mlp_igd.py_in b/src/ports/postgres/modules/convex/mlp_igd.py_in new file mode 100644 index 0000000..6cea7b0 --- /dev/null +++ b/src/ports/postgres/modules/convex/mlp_igd.py_in @@ -0,0 +1,752 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +""" +@file mlp_igd.py_in + +@brief Multilayer perceptron using IGD: Driver functions + +@namespace mlp_igd +""" +import plpy + +from utilities.control import MinWarning +from utilities.utilities import add_postfix +from utilities.utilities import py_list_to_sql_string +from utilities.utilities import extract_keyvalue_params +from utilities.utilities import _assert +from utilities.utilities import unique_string +from utilities.utilities import strip_end_quotes + +from utilities.validate_args import cols_in_tbl_valid +from utilities.validate_args import input_tbl_valid +from utilities.validate_args import is_var_valid +from utilities.validate_args import output_tbl_valid +from utilities.validate_args import get_expr_type +from utilities.validate_args import array_col_has_same_dimension +from utilities.validate_args import array_col_dimension + + +def mlp(schema_madlib, source_table, output_table, independent_varname, + dependent_varname, hidden_layer_sizes, + optimizer_param_str, activation, is_classification, **kwargs): + """ + Args: + @param schema_madlib + @param source_table + @param output_table + @param independent_varname + @param dependent_varname + @param hidden_layer_sizes + @param optimizer_param_str + + Returns: + None + """ + with MinWarning('warning'): + optimizer_params = _get_optimizer_params(optimizer_param_str or "") + summary_table = add_postfix(output_table, "_summary") + _validate_args(source_table, output_table, summary_table, independent_varname, + dependent_varname, hidden_layer_sizes, + optimizer_params, is_classification) + + current_iteration = 1 + prev_state = None + tolerance = optimizer_params["tolerance"] + n_iterations = optimizer_params["n_iterations"] + step_size = optimizer_params["step_size"] + n_tries = optimizer_params["n_tries"] + activation_name = _get_activation_function_name(activation) + activation_index = _get_activation_index(activation_name) + num_input_nodes = array_col_dimension( + source_table, independent_varname) 
+ num_output_nodes = 0 + classes = [] + dependent_type = get_expr_type(dependent_varname, source_table) + original_dependent_varname = dependent_varname + + if is_classification: + dependent_variable_sql = """ + SELECT DISTINCT {dependent_varname} + FROM {source_table} + """.format(dependent_varname=dependent_varname, + source_table=source_table) + labels = plpy.execute(dependent_variable_sql) + one_hot_dependent_varname = 'ARRAY[' + num_output_nodes = len(labels) + for label_obj in labels: + label = _format_label(label_obj[dependent_varname]) + classes.append(label) + one_hot_dependent_varname += dependent_varname + \ + "=" + str(label) + "," + # Remove the last comma + one_hot_dependent_varname = one_hot_dependent_varname[:-1] + one_hot_dependent_varname += ']::integer[]' + dependent_varname = one_hot_dependent_varname + else: + if "[]" not in dependent_type: + dependent_varname = "ARRAY[" + dependent_varname + "]" + num_output_nodes = array_col_dimension( + source_table, dependent_varname) + layer_sizes = [num_input_nodes] + \ + hidden_layer_sizes + [num_output_nodes] + + while True: + if prev_state: + prev_state_str = py_list_to_sql_string( + prev_state, array_type="double precision") + else: + prev_state_str = "(NULL)::DOUBLE PRECISION[]" + train_sql = """ + SELECT + {schema_madlib}.mlp_igd_step( + ({independent_varname})::DOUBLE PRECISION[], + ({dependent_varname})::DOUBLE PRECISION[], + {prev_state}, + {layer_sizes}, + ({step_size})::FLOAT8, + {activation}, + {is_classification}) as curr_state + FROM {source_table} AS _src + """.format(schema_madlib=schema_madlib, + independent_varname=independent_varname, + dependent_varname=dependent_varname, + prev_state=prev_state_str, + # C++ uses double internally + layer_sizes=py_list_to_sql_string(layer_sizes, + array_type="double precision"), + step_size=step_size, + source_table=source_table, + activation=activation_index, + is_classification=int(is_classification)) + curr_state = 
plpy.execute(train_sql)[0]["curr_state"] + dist_sql = """ + SELECT {schema_madlib}.internal_mlp_igd_distance( + {prev_state}, + {curr_state}) as state_dist + """.format(schema_madlib=schema_madlib, + prev_state=prev_state_str, + curr_state=py_list_to_sql_string(curr_state, "double precision")) + state_dist = plpy.execute(dist_sql)[0]["state_dist"] + if ((state_dist and state_dist < tolerance) or + current_iteration > n_iterations): + break + prev_state = curr_state + current_iteration += 1 + _build_model_table(schema_madlib, output_table, + curr_state, n_iterations) + layer_sizes_str = py_list_to_sql_string( + layer_sizes, array_type="integer") + classes_str = py_list_to_sql_string( + [strip_end_quotes(cl, "'") for cl in classes], + array_type=dependent_type) + summary_table_creation_query = """ + CREATE TABLE {summary_table}( + source_table TEXT, + independent_varname TEXT, + dependent_varname TEXT, + tolerance FLOAT, + step_size FLOAT, + n_iterations INTEGER, + n_tries INTEGER, + layer_sizes INTEGER[], + activation_function TEXT, + is_classification BOOLEAN, + classes {dependent_type}[] + )""".format(summary_table=summary_table, + dependent_type=dependent_type) + + summary_table_update_query = """ + INSERT INTO {summary_table} VALUES( + '{source_table}', + '{independent_varname}', + '{original_dependent_varname}', + {tolerance}, + {step_size}, + {n_iterations}, + {n_tries}, + {layer_sizes_str}, + '{activation_name}', + {is_classification}, + {classes_str} + ) + """.format(**locals()) + plpy.execute(summary_table_creation_query) + plpy.execute(summary_table_update_query) +# ---------------------------------------------------------------------- + + +def _build_model_table(schema_madlib, output_table, final_state, n_iterations): + final_state_str = py_list_to_sql_string( + final_state, array_type="double precision") + + model_table_query = """ + CREATE TABLE {output_table} AS + SELECT + (result).coeff AS coeff, + (result).loss AS loss, + {n_iterations} AS 
num_iterations + -- (result).num_rows_processed AS num_rows_processed, + -- n_tuples_including_nulls - (result).num_rows_processed + FROM ( + SELECT + {schema_madlib}.internal_mlp_igd_result( + {final_state_str} + ) AS result + ) rel_state_subq + """.format(**locals()) + plpy.execute(model_table_query) +# ---------------------------------------------------------------------- + + +def _get_optimizer_params(param_str): + params_defaults = { + "step_size": (0.001, float), + "n_iterations": (100, int), + "n_tries": (1, int), + "tolerance": (0.001, float), + } + param_defaults = dict([(k, v[0]) for k, v in params_defaults.items()]) + param_types = dict([(k, v[1]) for k, v in params_defaults.items()]) + + if not param_str: + return param_defaults + + name_value = extract_keyvalue_params(param_str, param_types, param_defaults, + ignore_invalid=False) + return name_value +# ---------------------------------------------------------------------- + + +def _validate_args_classification(source_table, dependent_varname): + expr_type = get_expr_type(dependent_varname, source_table) + int_types = ['integer', 'smallint', 'bigint'] + text_types = ['text', 'varchar', 'character varying', 'char', 'character'] + boolean_types = ['boolean'] + _assert("[]" in expr_type or expr_type in int_types + text_types + boolean_types, + "Dependent variable column should refer to an " + "integer, boolean, text, varchar, or character type.") +# ---------------------------------------------------------------------- + + +def _validate_args_regression(source_table, dependent_varname): + expr_type = get_expr_type(dependent_varname, source_table) + int_types = ['integer', 'smallint', 'bigint'] + float_types = ['double precision', 'real'] + _assert("[]" in expr_type or expr_type in int_types + float_types, + "Dependent variable column should refer to an array or numeric type") + if "[]" in expr_type: + _assert(array_col_has_same_dimension(source_table, dependent_varname), + "Dependent variable column 
should refer to arrays of the same length") +# ---------------------------------------------------------------------- + + +def _validate_args(source_table, output_table, summary_table, independent_varname, + dependent_varname, hidden_layer_sizes, + optimizer_params, is_classification): + input_tbl_valid(source_table, "MLP") + output_tbl_valid(output_table, "MLP") + output_tbl_valid(summary_table, "MLP") + _assert(is_var_valid(source_table, independent_varname), + "MLP error: invalid independent_varname " + "('{independent_varname}') for source_table " + "({source_table})!".format(independent_varname=independent_varname, + source_table=source_table)) + + _assert(is_var_valid(source_table, dependent_varname), + "MLP error: invalid dependent_varname " + "('{dependent_varname}') for source_table " + "({source_table})!".format(dependent_varname=dependent_varname, + source_table=source_table)) + _assert(hidden_layer_sizes is not None, + "hidden_layer_sizes may not be null") + _assert(isinstance(hidden_layer_sizes, list), + "hidden_layer_sizes must be an array of integers") + _assert(all(isinstance(value, int) for value in hidden_layer_sizes), + "MLP error: Hidden layers sizes must be integers") + _assert(all(value >= 0 for value in hidden_layer_sizes), + "MLP error: Hidden layers sizes must be greater than 0.") + _assert(optimizer_params["tolerance"] >= 0, + "MLP error: Tolerance should be greater than or equal to 0.") + _assert(optimizer_params["n_tries"] >= 1, + "MLP error: Number of tries should be greater than or equal to 1") + _assert(optimizer_params["n_iterations"] >= 1, + "MLP error: Number of iterations should be greater than or equal to 1") + _assert(optimizer_params["step_size"] > 0, + "MLP error: Stepsize should be greater than 0.") + _assert("[]" in get_expr_type(independent_varname, source_table), + "Independent variable column should refer to an array") + _assert(array_col_has_same_dimension(source_table, independent_varname), + "Independent variable 
column should refer to arrays of the same length") + + if is_classification: + _validate_args_classification(source_table, dependent_varname) + else: + _validate_args_regression(source_table, dependent_varname) +# ---------------------------------------------------------------------- + + +def _get_activation_function_name(activation_function): + if not activation_function: + activation_function = 'sigmoid' + else: + # Add non-linear kernels below after implementing them. + supported_activation_function = ['sigmoid', 'tanh', 'relu'] + try: + # allow user to specify a prefix substring of + # supported kernels. This works because the supported + # kernels have unique prefixes. + activation_function = next(x for x in supported_activation_function + if x.startswith(activation_function)) + except StopIteration: + # next() returns a StopIteration if no element found + plpy.error("MLP Error: Invalid activation function: " + "{0}. Supported activation functions are ({1})" + .format(activation_function, ','.join( + sorted(supported_activation_function)))) + return activation_function +# ------------------------------------------------------------------------------ + + +def _get_activation_index(activation_name): + table = {"relu": 0, "sigmoid": 1, "tanh": 2} + return table[activation_name] + + +def _format_label(label): + if isinstance(label, str): + return "'" + label + "'" + return label +# ------------------------------------------------------------------------- + + +def mlp_predict(schema_madlib, model_table, data_table, + id_col_name, output_table, + pred_type='response', **kwargs): + """ Score new observations using a trained neural network + + @param schema_madlib Name of the schema where MADlib is installed + @param model_table Name of learned model + @param data_table Name of table/view containing the data + points to be scored + @param id_col_name Name of column in source_table containing + (integer) identifier for data point + @param output_table Name of table to 
store the results + @param pred_type: str, The type of output required: + 'response' gives the actual response values, + 'prob' gives the probability of the classes in a + For regression, only type='response' is defined. + """ + input_tbl_valid(model_table, 'MLP') + cols_in_tbl_valid(model_table, ['coeff'], 'MLP') + summary_table = add_postfix(model_table, "_summary") + input_tbl_valid(summary_table, 'MLP') + cols_in_tbl_valid(summary_table, + ['dependent_varname', 'independent_varname', + 'activation_function', + 'tolerance', 'step_size', 'n_iterations', + 'n_tries', 'classes', 'layer_sizes', 'source_table'], + 'MLP') + + summary = plpy.execute("SELECT * FROM {0}".format(summary_table))[0] + coeff = py_list_to_sql_string(plpy.execute( + "SELECT * FROM {0}".format(model_table))[0]["coeff"]) + dependent_varname = summary['dependent_varname'] + independent_varname = summary['independent_varname'] + source_table = summary['source_table'] + activation_function = _get_activation_index(summary['activation_function']) + layer_sizes = py_list_to_sql_string( + summary['layer_sizes'], array_type="DOUBLE PRECISION") + is_classification = int(summary["is_classification"]) + is_response = int(pred_type == 'response') + + pred_name = ('"prob_{0}"' if pred_type == "prob" else + '"estimated_{0}"').format(dependent_varname.replace('"', '').strip()) + + input_tbl_valid(data_table, 'MLP') + + _assert(is_var_valid(data_table, independent_varname), + "MLP Error: independent_varname ('{0}') is invalid for data_table ({1})". + format(independent_varname, data_table)) + _assert(id_col_name is not None, "MLP Error: id_col_name is NULL") + _assert(is_var_valid(data_table, id_col_name), + "MLP Error: id_col_name ('{0}') is invalid for {1}". 
+ format(id_col_name, data_table)) + output_tbl_valid(output_table, 'MLP') + + with MinWarning("warning"): + header = "CREATE TABLE " + output_table + " AS " + # Regression + if not is_classification: + dependent_type = get_expr_type(dependent_varname, source_table) + unnest_if_not_array = "" + # Return the same type as the user provided. Internally we always use an array, but + # if they provided a scaler, unnest it for the user + if "[]" not in dependent_type: + unnest_if_not_array = "UNNEST" + sql = header + """ + SELECT {id_col_name}, + {unnest_if_not_array}({schema_madlib}.internal_predict_mlp( + {coeff}, + {independent_varname}::DOUBLE PRECISION[], + {is_classification}, + {activation_function}, + {layer_sizes}, + {is_response} + )) as {pred_name} + FROM {data_table} + """ + else: + summary_query = """ + SELECT classes FROM {0} + """.format(summary_table) + classes = plpy.execute(summary_query)[0]['classes'] + if pred_type == "response": + # This join is to recover the class name from the summary table, + # as prediction just returns an index + classes_with_index_table = unique_string() + classes_table = unique_string() + sql = header + """ + SELECT + q.{id_col_name} + ,(ARRAY{classes})[pred_idx[1]+1] as {pred_name} + FROM ( + SELECT + {id_col_name}, + {schema_madlib}.internal_predict_mlp( + {coeff}::DOUBLE PRECISION[], + {independent_varname}::DOUBLE PRECISION[], + {is_classification}, + {activation_function}, + {layer_sizes}, + {is_response} + ) + as pred_idx + FROM {data_table} + ) q + """ + else: + # Incomplete + intermediate_col = unique_string() + score_format = ',\n'.join([ + 'CAST({interim}[{j}] as DOUBLE PRECISION) as "estimated_prob_{c_str}"'. 
+ format(j=i + 1, c_str=str(c).strip(' "'), + interim=intermediate_col) + for i, c in enumerate(classes)]) + sql = header + """ + SELECT + {id_col_name}, + {score_format} + FROM ( + SELECT {id_col_name}, + {schema_madlib}.internal_predict_mlp( + {coeff}::DOUBLE PRECISION[], + {independent_varname}::DOUBLE PRECISION[], + {is_classification}, + {activation_function}, + {layer_sizes}, + {is_response} + )::TEXT[] + AS {intermediate_col} + FROM {data_table} + ) q + """ + sql = sql.format(**locals()) + plpy.execute(sql) +# ---------------------------------------------------------------------- + + +def mlp_help(schema_madlib, message, is_classification): + method = 'mlp_classification' if is_classification else 'mlp_regression' + int_types = ['integer', 'smallint', 'bigint'] + text_types = ['text', 'varchar', 'character varying', 'char', 'character'] + boolean_types = ['boolean'] + supported_types = " " * 33 + ", ".join(text_types) + "\n" +\ + " " * 33 + ", ".join(int_types + boolean_types) + label_description_classification = "Name of a column which specifies label.\n" +\ + " " * 33 + "Supported types are:\n" + supported_types + label_description_regression = "Dependent variable. May be an array for multiple\n" +\ + " " * 33 + "regression or the name of a column which is any\n" + " " * 33 +\ + "numeric type for single regression" + label_description = label_description_classification if is_classification\ + else label_description_regression + args = dict(schema_madlib=schema_madlib, method=method, + label_description=label_description) + + summary = """ + ---------------------------------------------------------------- + SUMMARY + ---------------------------------------------------------------- + Multilayer Perceptron (MLP) is a model for regression and + classification + + Also called "vanilla neural networks", they consist of several + fully connected hidden layers with non-linear activation + functions. 
+ + For more details on function usage: + SELECT {schema_madlib}.{method}('usage') + + For a small example on using the function: + SELECT {schema_madlib}.{method}('example')""".format(**args) + + usage = """ + --------------------------------------------------------------------------- + USAGE + --------------------------------------------------------------------------- + SELECT {schema_madlib}.{method}( + source_table, -- name of input table + output_table, -- name of output model table + independent_varname, -- name of independent variable + dependent_varname, -- {label_description} + hidden_layer_sizes, -- Array of integers indicating the + number of hidden units per layer. + Length equal to the number of hidden layers. + optimizer_params, -- optional, default NULL + parameters for optimization in + a comma-separated string of key-value pairs. + + step_size DOUBLE PRECISION, -- Default: 0.001 + Learning rate + n_iterations INTEGER, -- Default: 100 + Number of iterations per try + n_tries INTEGER, -- Default: 1 + Total number of training cycles, + with random initializations to avoid + local minima. + tolerance DOUBLE PRECISION, -- Default: 0.001 + If the distance in loss between + two iterations is less than the + tolerance training will stop, even if + n_iterations has not been reached + + activation -- optional, default: 'sigmoid'. + supported activations: 'relu', 'sigmoid', + and 'tanh' + ); + + + --------------------------------------------------------------------------- + OUTPUT + --------------------------------------------------------------------------- + The model table produced by MLP contains the following columns: + + coeffs -- Flat array containing the weights of the neural net + + loss -- The total loss over the training data. 
Cross entropy + for classification and MSE for regression + + num_iterations -- The total number of training iterations + + """.format(**args) + + regression_example = """ + - Create input table + + CREATE TABLE lin_housing_wi (id serial, x float8[], grp_by_col int, y float8); + COPY lin_housing_wi (x, grp_by_col, y) FROM STDIN NULL '?' DELIMITER '|'; + {1,0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,396.90,4.98} | 1 | 24.00 + {1,0.02731,0.00,7.070,0,0.4690,6.4210,78.90,4.9671,2,242.0,17.80,396.90,9.14} | 1 | 21.60 + {1,0.02729,0.00,7.070,0,0.4690,7.1850,61.10,4.9671,2,242.0,17.80,392.83,4.03} | 1 | 34.70 + {1,0.03237,0.00,2.180,0,0.4580,6.9980,45.80,6.0622,3,222.0,18.70,394.63,2.94} | 1 | 33.40 + {1,0.06905,0.00,2.180,0,0.4580,7.1470,54.20,6.0622,3,222.0,18.70,396.90,5.33} | 1 | 36.20 + {1,0.02985,0.00,2.180,0,0.4580,6.4300,58.70,6.0622,3,222.0,18.70,394.12,5.21} | 1 | 28.70 + {1,0.08829,12.50,7.870,0,0.5240,6.0120,66.60,5.5605,5,311.0,15.20,395.60,12.43} | 1 | 22.90 + {1,0.14455,12.50,7.870,0,0.5240,6.1720,96.10,5.9505,5,311.0,15.20,396.90,19.15} | 1 | 27.10 + {1,0.21124,12.50,7.870,0,0.5240,5.6310,100.00,6.0821,5,311.0,15.20,386.63,29.93} | 1 | 16.50 + {1,0.17004,12.50,7.870,0,0.5240,6.0040,85.90,6.5921,5,311.0,15.20,386.71,17.10} | 1 | 18.90 + {1,0.22489,12.50,7.870,0,0.5240,6.3770,94.30,6.3467,5,311.0,15.20,392.52,20.45} | 1 | 15.00 + {1,0.11747,12.50,7.870,0,0.5240,6.0090,82.90,6.2267,5,311.0,15.20,396.90,13.27} | 1 | 18.90 + {1,0.09378,12.50,7.870,0,0.5240,5.8890,39.00,5.4509,5,311.0,15.20,390.50,15.71} | 1 | 21.70 + \. + + - Generate a multilayer perception with a two hidden layers of 5 units + each. Use the x column as the independent variables, and use the class + column as the classification. Set the tolerance to 0 so that 300 + iterations will be run. Use a sigmoid activation function. + The model will be written to mlp_regress_result. 
+ + SELECT mlp_regression( + 'lin_housing_wi', -- Source table + 'mlp_regress_result', -- Desination table + 'x', -- Independent variable + 'y', -- Dependent variable + ARRAY[5,5], -- Number of hidden units per layer + 'step_size=0.007, + n_iterations=300, + tolerance=0', + 'sigmoid'); -- Activation + + """ + + classification_example = """ + -- Create input table + + CREATE TABLE iris_data( + id integer, + attributes numeric[], + class_text varchar, + class integer + ); + + INSERT INTO iris_data VALUES + (1,ARRAY[5.1,3.5,1.4,0.2],'Iris-setosa',1), + (2,ARRAY[4.9,3.0,1.4,0.2],'Iris-setosa',1), + (3,ARRAY[4.7,3.2,1.3,0.2],'Iris-setosa',1), + (4,ARRAY[4.6,3.1,1.5,0.2],'Iris-setosa',1), + (5,ARRAY[5.0,3.6,1.4,0.2],'Iris-setosa',1), + (6,ARRAY[5.4,3.9,1.7,0.4],'Iris-setosa',1), + (7,ARRAY[4.6,3.4,1.4,0.3],'Iris-setosa',1), + (8,ARRAY[5.0,3.4,1.5,0.2],'Iris-setosa',1), + (9,ARRAY[4.4,2.9,1.4,0.2],'Iris-setosa',1), + (10,ARRAY[4.9,3.1,1.5,0.1],'Iris-setosa',1), + (11,ARRAY[7.0,3.2,4.7,1.4],'Iris-versicolor',2), + (12,ARRAY[6.4,3.2,4.5,1.5],'Iris-versicolor',2), + (13,ARRAY[6.9,3.1,4.9,1.5],'Iris-versicolor',2), + (14,ARRAY[5.5,2.3,4.0,1.3],'Iris-versicolor',2), + (15,ARRAY[6.5,2.8,4.6,1.5],'Iris-versicolor',2), + (16,ARRAY[5.7,2.8,4.5,1.3],'Iris-versicolor',2), + (17,ARRAY[6.3,3.3,4.7,1.6],'Iris-versicolor',2), + (18,ARRAY[4.9,2.4,3.3,1.0],'Iris-versicolor',2), + (19,ARRAY[6.6,2.9,4.6,1.3],'Iris-versicolor',2), + (20,ARRAY[5.2,2.7,3.9,1.4],'Iris-versicolor',2); + + + -- Generate a multilayer perception with a single hidden layer of 5 units. + Use the attributes column as the independent variables, and use the class + column as the classification. Set the tolerance to 0 so that 1000 + iterations will be run. Use a hyperbolic tangent activation function. + The model will be written to mlp_result. 
+ + SELECT {schema_madlib}.mlp_classification( + 'iris_data', -- Source table + 'mlp_model', -- Destination table + 'attributes', -- Input features + 'class_text', -- Label + ARRAY[5], -- Number of units per layer + 'step_size=0.003, + n_iterations=5000, + tolerance=0', -- Optimizer params + 'tanh'); -- Activation function + + """.format(**args) + example = classification_example if is_classification else regression_example + if not message: + return summary + elif message.lower() in ('usage', 'help', '?'): + return usage + elif message.lower() == 'example': + return example + return """ + No such option. Use "SELECT {schema_madlib}.{method}()" for help. + """.format(**args) + + +def mlp_predict_help(schema_madlib, message): + args = dict(schema_madlib=schema_madlib) + + summary = """ + ---------------------------------------------------------------- + SUMMARY + ---------------------------------------------------------------- + Multilayer Perceptron (MLP) is a model for regression and + classification + + Also called "vanilla neural networks", they consist of several + fully connected hidden layers with non-linear activation + functions. + + For more details on function usage: + SELECT {schema_madlib}.mlp_predict('usage') + + For a small example on using the function: + SELECT {schema_madlib}.mlp_predict('example')""".format(**args) + + usage = """ + --------------------------------------------------------------------------- + USAGE + --------------------------------------------------------------------------- + SELECT {schema_madlib}.mlp_predict( + model_table, -- name of model table + data_table, -- name of data table + id_col_name, -- id column for data table + output_table, -- name of output table + pred_type -- the type of output requested: + -- 'response' gives the actual prediction, + -- 'prob' gives the probability of each class. + -- for regression, only type='response' is defined. 
+ ); + + + --------------------------------------------------------------------------- + OUTPUT + --------------------------------------------------------------------------- + The model table produced by mlp contains the following columns: + + id -- The provided id for the given input vector + + estimated_<COL_NAME> -- (For pred_type='response') The estimated class + for classification or value for regression, where + <COL_NAME> is the name of the column to be + predicted from training data + + prob_<CLASS> -- (For pred_type='prob' for classification) The + probability of a given class <CLASS> as given by + softmax. There will be one column for each class + in the training data. + + """.format(**args) + + example = """ + -- See {schema_madlib}.mlp_classification('example') for test + -- and model tables + + -- Predict classes using + SELECT {schema_madlib}.mlp_predict( + 'mlp_model', -- Model table + 'iris_data', -- Test data table + 'id', -- Id column in test table + 'mlp_prediction', -- Output table for predictions + 'response' -- Output classes, not probabilities + ); + + SELECT * FROM mlp_prediction; + + WITH total_count AS (SELECT count(*) AS c FROM iris_data) + SELECT count(*)/((SELECT c FROM total_count)::DOUBLE PRECISION) + AS train_accuracy + FROM + ( + SELECT iris_data.class_text AS actual_label, + mlp_prediction.estimated_class_text AS predicted_label + FROM mlp_prediction + INNER JOIN iris_data ON iris_data.id=mlp_prediction.id + ) q + WHERE q.actual_label=q.predicted_label; + """.format(**args) + + if not message: + return summary + elif message.lower() in ('usage', 'help', '?'): + return usage + elif message.lower() == 'example': + return example + return """ + No such option. Use "SELECT {schema_madlib}.mlp_predict()" for help. + """.format(**args)