This is an automated email from the ASF dual-hosted git repository.
domino pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git
The following commit(s) were added to refs/heads/master by this push:
new 33988ce Add generate mst table utility. This commit adds a new
function to generate the model selection table. It generates such table as the
naive combinations of the three input lists of parameter choices. This utility
also validates and de-duplicates the inputs.
new e0c2bad Merge pull request #430 from makemebitter/dl/add_mst_utility
33988ce is described below
commit 33988ce4654477f12c3140bab7340a2cba16251a
Author: Domino Valdano <[email protected]>
AuthorDate: Wed Aug 7 12:04:25 2019 -0700
Add generate mst table utility.
This commit adds a new function to generate the model selection table.
The table is generated as the naive combination (Cartesian product) of
the three input lists of parameter choices. This utility also validates
and de-duplicates the inputs.
Also splits off madlib_keras_model_hopper.sql_in and
test/madlib_keras_model_hopper.sql_in from corresponding
madlib_keras.sql files.
Closes #430
Co-authored-by: Yuhao Zhang <[email protected]>
---
.../modules/deep_learning/madlib_keras.sql_in | 4 +-
.../madlib_keras_fit_multiple_model.py_in | 152 ++++++++++++++++++++
.../madlib_keras_fit_multiple_model.sql_in | 46 ++++++
.../deep_learning/madlib_keras_validator.py_in | 65 +++++++++
.../test/madlib_keras_fit_multiple_model.sql_in | 154 +++++++++++++++++++++
.../test/unit_tests/test_madlib_keras.py_in | 109 ++++++++++++++-
6 files changed, 527 insertions(+), 3 deletions(-)
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
index 84c1054..6ff0da0 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
@@ -20,8 +20,8 @@
*
* @file madlib_keras.sql_in
*
- * @brief SQL functions for multilayer perceptron
- * @date June 2012
+ * @brief SQL functions for distributed deep learning with keras
+ * @date June 2019
*
*
*//* -----------------------------------------------------------------------
*/
diff --git
a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.py_in
b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.py_in
new file mode 100644
index 0000000..52fd3fb
--- /dev/null
+++
b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.py_in
@@ -0,0 +1,152 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import plpy
+from collections import OrderedDict
+from madlib_keras_validator import MstLoaderInputValidator
+from utilities.control import MinWarning
+from madlib_keras_wrapper import convert_string_of_args_to_dict
+
+@MinWarning("warning")
+class MstLoader():
+ """Build and load a model selection table (mst table) of model parameters.
+
+ Takes every combination of the model architecture ids, compile params and
+ fit params passed in. The raw inputs are validated by constructing a
+ MstLoaderInputValidator, and duplicate choices are dropped before the
+ combinations are generated.
+
+ Attributes:
+ compile_params_list (list): The de-duplicated compile params choices.
+ fit_params_list (list): The de-duplicated fit params choices.
+ model_arch_id_list (list): The sorted, de-duplicated model id choices.
+ model_arch_table (str): The name of the model architecture table.
+ model_selection_table (str): The name of the output mst table.
+ msts (list): The list of generated msts, one dict per combination.
+
+ """
+
+ def __init__(self,
+ model_arch_table,
+ model_selection_table,
+ model_arch_id_list,
+ compile_params_list,
+ fit_params_list,
+ **kwargs):
+ """Validate the inputs and generate all msts in memory.
+
+ No table is created or written here; that happens in load().
+
+ Args:
+ model_arch_table (str): The name of the model architecture table.
+ model_selection_table (str): The name of the output mst table.
+ model_arch_id_list (list): Model ids; duplicates are dropped.
+ compile_params_list (list): Compile params strings.
+ fit_params_list (list): Fit params strings.
+ **kwargs: Accepted but not used by this method, so that callers
+ may pass extra keyword arguments.
+ """
+
+ self.model_arch_table = model_arch_table
+ self.model_selection_table = model_selection_table
+ # De-duplicate the ids before validation: the validator compares the
+ # number of matching rows against the length of this list.
+ self.model_arch_id_list = sorted(list(set(model_arch_id_list)))
+ # Constructing the validator runs all of its checks; the instance
+ # itself is not kept.
+ MstLoaderInputValidator(
+ model_arch_table=self.model_arch_table,
+ model_selection_table=self.model_selection_table,
+ model_arch_id_list=self.model_arch_id_list,
+ compile_params_list=compile_params_list,
+ fit_params_list=fit_params_list
+ )
+ # The params strings are de-duplicated only after the raw lists have
+ # been validated.
+ self.compile_params_list = self.params_preprocessed(
+ compile_params_list)
+ self.fit_params_list = self.params_preprocessed(fit_params_list)
+
+ self.msts = []
+
+ self.find_combinations()
+
+ def load(self):
+ """Create the model selection table and insert every generated mst.
+ """
+ # All of the side effects happen in this function.
+ self.create_mst_table()
+ self.insert_into_mst_table()
+
+ def params_preprocessed(self, list_strs):
+ """De-duplicate a list of params strings.
+
+ Each string is parsed into a dict and reduced to a canonical key made
+ of its 'name = value' pairs ordered by name. Strings that share a
+ canonical key are duplicates; the last such string in list_strs is
+ the one that is kept, unmodified.
+
+ Args:
+ list_strs (list): A list of params strings.
+
+ Returns:
+ list: The original strings with duplicates removed. The values
+ come straight from a plain dict, so the result is not sorted.
+ """
+
+ dict_dedup = {}
+ for string in list_strs:
+ # presumably parses 'k1=v1, k2=v2' into {k1: v1, k2: v2} -- confirm
+ # against madlib_keras_wrapper
+ d = convert_string_of_args_to_dict(string)
+ # Sorting the names makes the key independent of argument order.
+ hash_tuple = tuple( '{0} = {1}'\
+ .format(x, d[x]) for x in sorted(d.keys()))
+ dict_dedup[hash_tuple] = string
+
+ return dict_dedup.values()
+
+ def find_combinations(self):
+ """Fill self.msts with every combination of the three parameter lists.
+
+ Each mst is a dict with the keys 'model_arch_id', 'compile_params' and
+ 'fit_params'. model_arch_id varies slowest and fit_params fastest.
+ """
+ param_grid = OrderedDict([
+ ('model_arch_id', self.model_arch_id_list),
+ ('compile_params', self.compile_params_list),
+ ('fit_params', self.fit_params_list)
+ ])
+
+ # msts: output list, p: the partially filled combination,
+ # i: index of the parameter name to assign next.
+ def find_combinations_helper(msts, p, i):
+ # keys() is indexed below, so it has to return a list here.
+ param_names = param_grid.keys()
+ if i < len(param_names):
+ for x in param_grid[param_names[i]]:
+ p[param_names[i]] = x
+ find_combinations_helper(msts, p, i + 1)
+ else:
+ # p is reused across iterations, so store a copy.
+ msts.append(p.copy())
+ find_combinations_helper(self.msts, {}, 0)
+
+ def create_mst_table(self):
+ """Create the output mst table.
+
+ The table gets a serial mst_key plus the model_arch_table,
+ model_arch_id, compile_params and fit_params columns, with a unique
+ constraint on (model_arch_id, compile_params, fit_params).
+ """
+ create_query = """
+ CREATE TABLE {self.model_selection_table} (
+ mst_key SERIAL,
+ model_arch_table VARCHAR,
+ model_arch_id INTEGER,
+ compile_params VARCHAR,
+ fit_params VARCHAR,
+ unique (model_arch_id, compile_params, fit_params)
+ );
+ """.format(self=self)
+ with MinWarning('warning'):
+ plpy.execute(create_query)
+
+ def insert_into_mst_table(self):
+ """Insert everything in self.msts into the mst table.
+
+ One INSERT statement is executed per mst.
+ """
+ for mst in self.msts:
+ model_arch_id = mst['model_arch_id']
+ compile_params = mst['compile_params']
+ fit_params = mst['fit_params']
+ # NOTE(review): the string values are embedded between $$ dollar
+ # quotes, so a params string that itself contains $$ would end the
+ # literal early -- confirm such input cannot reach this point.
+ insert_query = """
+ INSERT INTO
+ {self.model_selection_table}(
+ model_arch_table,
+ model_arch_id,
+ compile_params,
+ fit_params
+ )
+ VALUES (
+ $${self.model_arch_table}$$,
+ {model_arch_id},
+ $${compile_params}$$,
+ $${fit_params}$$
+ )
+ """.format(**locals())
+ plpy.execute(insert_query)
+
diff --git
a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
new file mode 100644
index 0000000..472ff61
--- /dev/null
+++
b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
@@ -0,0 +1,46 @@
+/* -----------------------------------------------------------------------
*//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *
+ * @file madlib_keras_fit_multiple_model.sql_in
+ *
+ * @brief SQL functions for model hopper distributed training
+ * @date August 2019
+ *
+ *
+ *//* -----------------------------------------------------------------------
*/
+
+m4_include(`SQLCommon.m4')
+
+
+-- Generate a model selection table.
+--
+-- model_arch_table      : name of the existing model architecture table
+-- model_selection_table : name of the output table to create
+-- model_arch_id_list    : model ids to include
+-- compile_params_list   : compile params strings to include
+-- fit_params_list       : fit params strings to include
+--
+-- The body builds an MstLoader from the function arguments and calls its
+-- load method, which creates the output table and inserts one row per
+-- combination of the three lists.
+-- NOTE(review): the arguments reach the loader through globals, which
+-- presumably relies on PL/Python exposing function arguments as global
+-- variables -- confirm.
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.load_model_selection_table(
+ model_arch_table VARCHAR,
+ model_selection_table VARCHAR,
+ model_arch_id_list INTEGER[],
+ compile_params_list VARCHAR[],
+ fit_params_list VARCHAR[]
+) RETURNS VOID AS $$
+ PythonFunctionBodyOnly(`deep_learning', `madlib_keras_fit_multiple_model')
+ with AOControl(False):
+ mst_loader = madlib_keras_fit_multiple_model.MstLoader(**globals())
+ mst_loader.load()
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+
diff --git
a/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in
b/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in
index e9d7d14..b4c5c4b 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in
@@ -45,6 +45,8 @@ from utilities.validate_args import columns_exist_in_table
from utilities.validate_args import get_expr_type
from utilities.validate_args import input_tbl_valid
from utilities.validate_args import output_tbl_valid
+from madlib_keras_wrapper import parse_and_validate_fit_params
+from madlib_keras_wrapper import parse_and_validate_compile_params
class InputValidator:
@staticmethod
@@ -309,3 +311,66 @@ class FitInputValidator:
self.validation_table, self.independent_varname,
input_shape, 2)
+
+
+class MstLoaderInputValidator():
+ """Validate the arguments of load_model_selection_table.
+
+ All checks run from the constructor, so creating an instance is enough
+ to validate the inputs.
+ """
+ def __init__(self,
+ model_arch_table,
+ model_selection_table,
+ model_arch_id_list,
+ compile_params_list,
+ fit_params_list
+ ):
+ """Store the arguments and run every validation step.
+
+ Args:
+ model_arch_table (str): The name of the model architecture table.
+ model_selection_table (str): The name of the output mst table.
+ model_arch_id_list (list): Model ids to look up in
+ model_arch_table.
+ compile_params_list (list): Compile params strings to check.
+ fit_params_list (list): Fit params strings to check.
+ """
+ self.model_arch_table = model_arch_table
+ self.model_selection_table = model_selection_table
+ self.model_arch_id_list = model_arch_id_list
+ self.compile_params_list = compile_params_list
+ self.fit_params_list = fit_params_list
+ # Name used as a prefix in error messages.
+ self.module_name = 'load_model_selection_table'
+ self._validate_input_args()
+
+ def _validate_input_args(self):
+ """Run the table, model id and params checks, in that order."""
+ self._validate_input_output_tables()
+ self._validate_model_arch_ids()
+ self._validate_compile_and_fit_params()
+
+ def _validate_model_arch_ids(self):
+ """Check that every requested model id is in the model arch table.
+
+ Counts the rows whose model_id is in model_arch_id_list and asserts
+ that the count equals the length of the list. The comparison is only
+ meaningful when the list holds no duplicates.
+ """
+ # NOTE(review): an empty model_arch_id_list renders as 'IN ()' in the
+ # query below -- confirm callers never pass an empty list.
+ model_arch_id_str = '({0})'\
+ .format(','.join([str(x) for x in self.model_arch_id_list]))
+ query = """
+ SELECT count(model_id)
+ FROM {self.model_arch_table}
+ WHERE model_id IN {model_arch_id_str}
+ """.format(**locals())
+ res = int(plpy.execute(query)[0]['count'])
+ _assert(
+ res == len(self.model_arch_id_list),
+ "{0}: One or more model_id of {1} not found in table {2}".format(
+ self.module_name,
+ model_arch_id_str,
+ self.model_arch_table
+ )
+ )
+ def _validate_compile_and_fit_params(self):
+ """Parse every fit and compile params string, reporting failures.
+
+ Any exception raised by a parser is passed to plpy.error together
+ with the offending params string.
+ """
+ for fit_params in self.fit_params_list:
+ try:
+ # The parsed result is not used; only a failure matters here.
+ res = parse_and_validate_fit_params(fit_params)
+ except Exception as e:
+ plpy.error(
+ """Fit param check failed for: {} \n
+ {}
+ """.format(fit_params, str(e)))
+ for compile_params in self.compile_params_list:
+ try:
+ # The parsed result is not used; only a failure matters here.
+ res = parse_and_validate_compile_params(compile_params)
+ except Exception as e:
+ plpy.error(
+ """Compile param check failed for: {} \n
+ {}
+ """.format(compile_params, str(e)))
+
+
+ def _validate_input_output_tables(self):
+ """Check the input and output table names.
+
+ presumably input_tbl_valid requires the table to exist and
+ output_tbl_valid requires that it does not yet exist -- confirm
+ against utilities.validate_args.
+ """
+ input_tbl_valid(self.model_arch_table, self.module_name)
+ output_tbl_valid(self.model_selection_table, self.module_name)
+
diff --git
a/src/ports/postgres/modules/deep_learning/test/madlib_keras_fit_multiple_model.sql_in
b/src/ports/postgres/modules/deep_learning/test/madlib_keras_fit_multiple_model.sql_in
new file mode 100644
index 0000000..7b3b1f5
--- /dev/null
+++
b/src/ports/postgres/modules/deep_learning/test/madlib_keras_fit_multiple_model.sql_in
@@ -0,0 +1,154 @@
+/* ---------------------------------------------------------------------*//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *//* ---------------------------------------------------------------------*/
+
+-- MST table generation tests
+
+-- First set up model arch table, to use as input
+DROP TABLE IF EXISTS iris_model_arch;
+-- NOTE: The seed is set to 0 for every layer.
+SELECT load_keras_model('iris_model_arch', -- Output table,
+$$
+{
+"class_name": "Sequential",
+"keras_version": "2.1.6",
+"config":
+ [{"class_name": "Dense", "config": {"kernel_initializer": {"class_name":
"VarianceScaling",
+ "config": {"distribution": "uniform", "scale": 1.0, "seed": 0, "mode":
"fan_avg"}},
+ "name": "dense_1", "kernel_constraint": null, "bias_regularizer": null,
+ "bias_constraint": null, "dtype": "float32", "activation": "relu",
"trainable": true,
+ "kernel_regularizer": null, "bias_initializer": {"class_name": "Zeros",
+ "config": {}}, "units": 10, "batch_input_shape": [null, 4], "use_bias":
true,
+ "activity_regularizer": null}}, {"class_name": "Dense",
+ "config": {"kernel_initializer": {"class_name": "VarianceScaling",
+ "config": {"distribution": "uniform", "scale": 1.0, "seed": 0, "mode":
"fan_avg"}},
+ "name": "dense_2", "kernel_constraint": null, "bias_regularizer": null,
+ "bias_constraint": null, "activation": "relu", "trainable": true,
"kernel_regularizer": null,
+ "bias_initializer": {"class_name": "Zeros", "config": {}}, "units": 10,
"use_bias": true,
+ "activity_regularizer": null}}, {"class_name": "Dense", "config":
{"kernel_initializer":
+ {"class_name": "VarianceScaling", "config": {"distribution": "uniform",
"scale": 1.0,
+ "seed": 0, "mode": "fan_avg"}}, "name": "dense_3", "kernel_constraint":
null,
+ "bias_regularizer": null, "bias_constraint": null, "activation": "softmax",
+ "trainable": true, "kernel_regularizer": null, "bias_initializer":
{"class_name": "Zeros",
+ "config": {}}, "units": 3, "use_bias": true, "activity_regularizer":
null}}],
+ "backend": "tensorflow"}
+$$
+);
+
+-- Valid inputs should pass and yield 6 msts in the table
+-- (1 model arch id x 3 compile params x 2 fit params).
+DROP TABLE IF EXISTS mst_table;
+SELECT load_model_selection_table(
+ 'iris_model_arch',
+ 'mst_table',
+ ARRAY[1],
+ ARRAY[
+
$$loss='categorical_crossentropy',optimizer='Adam(lr=0.1)',metrics=['accuracy']$$,
+ $$loss='categorical_crossentropy',
optimizer='Adam(lr=0.01)',metrics=['accuracy']$$,
+
$$loss='categorical_crossentropy',optimizer='Adam(lr=0.001)',metrics=['accuracy']$$
+ ],
+ ARRAY[
+ $$batch_size=5,epochs=1$$,
+ $$batch_size=10,epochs=1$$
+ ]
+);
+SELECT assert(
+ COUNT(*)=6,
+ 'The length of mst table does not match with the inputs'
+)
+FROM mst_table;
+
+-- Invalid arguments must be errored out
+
+DROP TABLE IF EXISTS mst_table;
+SELECT assert(trap_error($TRAP$
+ SELECT load_model_selection_table(
+ 'iris_model_arch',
+ 'mst_table',
+ ARRAY[-1],
+ ARRAY[
+
$$loss='categorical_crossentropy',optimizer='Adam(lr=0.1)',metrics=['accuracy']$$
+ ],
+ ARRAY[
+ $$batch_size=5,epochs=1$$
+ ]
+ );
+$TRAP$)=1, 'Should error out if model_id is not in the model arch table');
+
+DROP TABLE IF EXISTS mst_table;
+SELECT assert(trap_error($TRAP$
+ SELECT load_model_selection_table(
+ 'iris_model_arch',
+ 'mst_table',
+ ARRAY[1],
+ ARRAY[
+ $$foo='bar'$$
+ ],
+ ARRAY[
+ $$batch_size='bar'$$
+ ]
+ );
+$TRAP$)=1, 'Should error out if the provided parameters are not valid');
+
+-- Must deduplicate: options that differ only by extra white space should not
+-- be considered distinct params. Expect 4 msts
+-- (1 model arch id x 2 distinct compile params x 2 fit params).
+
+DROP TABLE IF EXISTS mst_table;
+SELECT load_model_selection_table(
+ 'iris_model_arch',
+ 'mst_table',
+ ARRAY[1],
+ ARRAY[
+
$$loss='categorical_crossentropy',optimizer='Adam(lr=0.1)',metrics=['accuracy']$$,
+ $$ loss='categorical_crossentropy',
optimizer='Adam(lr=0.1)',metrics=['accuracy'] $$,
+
$$loss='categorical_crossentropy',optimizer='Adam(lr=0.001)',metrics=['accuracy']$$
+ ],
+ ARRAY[
+ $$batch_size=5,epochs=1$$,
+ $$batch_size=10,epochs=1$$
+ ]
+);
+SELECT assert(
+ COUNT(*)=4,
+ 'The length of mst table (' || COUNT(*) || ')does not match with the
inputs due to deduplication failure'
+)
+FROM mst_table;
+
+-- Must also handle duplicates where the order of key/value pairs is
+-- rearranged. Expect 2 msts
+-- (1 model arch id x 2 distinct compile params x 1 distinct fit params).
+DROP TABLE IF EXISTS mst_table;
+SELECT load_model_selection_table(
+ 'iris_model_arch',
+ 'mst_table',
+ ARRAY[1],
+ ARRAY[
+
$$loss='categorical_crossentropy',optimizer='Adam(lr=0.1)',metrics=['accuracy']$$,
+ $$metrics= ['accuracy'], loss='categorical_crossentropy',
optimizer='Adam(lr=0.1)'$$,
+ $$loss='mse',optimizer='Adam(lr=0.001)', metrics=['accuracy']$$
+ ],
+ ARRAY[
+ $$batch_size=5,epochs=1$$,
+ $$epochs=1, batch_size=5$$
+ ]
+);
+SELECT assert(
+ COUNT(*)=2,
+ 'The length of mst table (' || COUNT(*) || ') does not match with the
inputs due to deduplication failure'
+)
+FROM mst_table;
+
diff --git
a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
index 9cce86a..8467cbb 100644
---
a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
+++
b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
@@ -17,6 +17,8 @@
# specific language governing permissions and limitations
# under the License.
+m4_changequote(`<!', `!>')
+
import sys
import numpy as np
import os
@@ -32,7 +34,7 @@ import unittest
from mock import *
import plpy_mock as plpy
-m4_changequote(`<!', `!>')
+
# helper for multiplying array by int
def mult(k,arr):
@@ -1295,6 +1297,111 @@ class MadlibKerasEvaluationTestCase(unittest.TestCase):
with self.assertRaises(plpy.PLPYException):
result = self.subject.internal_keras_eval_final(input_state)
+
+class LoadModelSelectionTableTestCase(unittest.TestCase):
+    """Unit tests for the in-memory mst generation done by MstLoader.
+
+    Input validation is replaced by a mock and plpy.execute is mocked, so
+    only the de-duplication and combination logic is exercised here.
+    """
+
+    def setUp(self):
+        # The side effects of this class (writing to the output table) are
+        # not tested here. They are tested in dev-check.
+        self.plpy_mock = Mock(spec='error')
+        patches = {
+            'plpy': plpy
+        }
+
+        self.plpy_mock_execute = MagicMock()
+        plpy.execute = self.plpy_mock_execute
+
+        self.module_patcher = patch.dict('sys.modules', patches)
+        self.module_patcher.start()
+        import deep_learning.madlib_keras_fit_multiple_model
+        self.module = deep_learning.madlib_keras_fit_multiple_model
+        # Replace validation through a patcher, so that the real method is
+        # restored in tearDown instead of staying overwritten on the class
+        # for every test that runs afterwards.
+        self.validator_patcher = patch.object(
+            self.module.MstLoaderInputValidator, '_validate_input_args')
+        self.validator_patcher.start()
+
+        self.subject = self.module.MstLoader
+        self.model_selection_table = 'mst_table'
+        self.model_arch_table = 'model_arch_library'
+        self.model_arch_id_list = [1]
+        self.compile_params_list = [
+            """
+                loss='categorical_crossentropy',
+                optimizer='Adam(lr=0.1)',
+                metrics=['accuracy']
+            """,
+            """
+                loss='categorical_crossentropy',
+                optimizer='Adam(lr=0.01)',
+                metrics=['accuracy']
+            """,
+            """
+                loss='categorical_crossentropy',
+                optimizer='Adam(lr=0.001)',
+                metrics=['accuracy']
+            """
+        ]
+        self.fit_params_list = [
+            "batch_size=5,epochs=1",
+            "batch_size=10,epochs=1"
+        ]
+
+    def _make_loader(self):
+        """Build an MstLoader from the current fixture attributes.
+
+        Keyword arguments are used so that each table name reaches the
+        parameter it is meant for; MstLoader takes model_arch_table first
+        and model_selection_table second.
+        """
+        return self.subject(
+            model_arch_table=self.model_arch_table,
+            model_selection_table=self.model_selection_table,
+            model_arch_id_list=self.model_arch_id_list,
+            compile_params_list=self.compile_params_list,
+            fit_params_list=self.fit_params_list
+        )
+
+    def test_mst_table_dimension(self):
+        # 1 model arch id x 3 compile params x 2 fit params
+        generate_mst = self._make_loader()
+        self.assertEqual(6, len(generate_mst.msts))
+
+    def test_invalid_input_args(self):
+        # A validation failure must propagate out of the constructor.
+        self.module.MstLoaderInputValidator \
+            ._validate_input_args \
+            .side_effect = plpy.PLPYException('Invalid input args')
+        with self.assertRaises(plpy.PLPYException):
+            generate_mst = self._make_loader()
+
+    def test_duplicate_params(self):
+        self.model_arch_id_list = [1, 1, 2]
+        self.compile_params_list = [
+            """
+                loss='categorical_crossentropy',
+                optimizer='Adam(lr=0.1)',
+                metrics=['accuracy']
+            """,
+            """
+                loss='categorical_crossentropy',
+                optimizer='Adam(lr=0.1)',
+                metrics=['accuracy']
+            """,
+            """
+                loss='categorical_crossentropy',
+                optimizer='Adam(lr=0.001)',
+                metrics=['accuracy']
+            """
+        ]
+        self.fit_params_list = [
+            "batch_size= 5,epochs=1",
+            "epochs=1 ,batch_size=5",
+            "batch_size=10,epochs =1"
+        ]
+        generate_mst = self._make_loader()
+        # 2 distinct model arch ids x 2 distinct compile params
+        # x 2 distinct fit params
+        self.assertEqual(8, len(generate_mst.msts))
+
+    def tearDown(self):
+        # Undo the patches in the reverse order of setUp.
+        self.validator_patcher.stop()
+        self.module_patcher.stop()
+
+
if __name__ == '__main__':
unittest.main()
# ---------------------------------------------------------------------