[madlib] branch master updated: DL: Naming improvements for dependent and independent varname

jingyimei Thu, 09 May 2019 00:22:47 -0700

This is an automated email from the ASF dual-hosted git repository.

jingyimei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git



The following commit(s) were added to refs/heads/master by this push:
     new d9c2025  DL: Naming improvements for dependent and independent varname
d9c2025 is described below

commit d9c2025958a5f58625b26700ca003ee3feefa2ec
Author: Jingyi Mei <[email protected]>
AuthorDate: Wed May 1 17:04:23 2019 -0700

    DL: Naming improvements for dependent and independent varname
    
    JIRA: MADLIB-1324
    
    Previously, the names/values of 'dependent_varname' and 'independent_varname'
    in madlib_keras_fit() were a bit confusing. This commit does the following:
    
    1. Remove dependent_varname and independent_varname from the fit()
    interface, since we know what these are from the minibatch summary table
    and the user must use minibatching for DL.
    2. Use the original column names from the source table (i.e., pre-minibatch)
    in the fit summary table, not the generic names dependent_var and
    independent_var.
    3. As a result, the predict summary table will also have the
    original column names instead of dependent_var and independent_var.
    
    Closes #387
---
 .../modules/deep_learning/madlib_keras.py_in       | 22 +++++++-----
 .../modules/deep_learning/madlib_keras.sql_in      | 15 ++------
 .../modules/deep_learning/test/madlib_keras.sql_in | 40 +++++-----------------
 .../test/unit_tests/test_madlib_keras.py_in        |  3 +-
 4 files changed, 28 insertions(+), 52 deletions(-)

diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
index 8b2a747..d08bad6 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
@@ -36,6 +36,8 @@ from keras.models import *
 from keras.optimizers import *
 from keras.regularizers import *
 import madlib_keras_serializer
+from input_data_preprocessor import MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL
+from input_data_preprocessor import MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL
 from madlib_keras_helper import CLASS_VALUES_COLNAME
 from madlib_keras_helper import DEPENDENT_VARTYPE_COLNAME
 from madlib_keras_helper import expand_input_dims
@@ -52,14 +54,14 @@ from utilities.utilities import madlib_version
 from utilities.validate_args import get_col_value_and_type
 from utilities.validate_args import quote_ident
 
-def fit(schema_madlib, source_table, model, dependent_varname,
-        independent_varname, model_arch_table, model_arch_id, compile_params,
-        fit_params, num_iterations, gpus_per_host = 0,
-        validation_table=None, name="", description="", **kwargs):
 
+def fit(schema_madlib, source_table, model,model_arch_table,
+        model_arch_id, compile_params, fit_params, num_iterations,
+        gpus_per_host = 0, validation_table=None, name="",
+        description="", **kwargs):
     source_table = quote_ident(source_table)
-    dependent_varname = quote_ident(dependent_varname)
-    independent_varname = quote_ident(independent_varname)
+    dependent_varname = MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL
+    independent_varname = MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL
     model_arch_table = quote_ident(model_arch_table)
 
     fit_validator = FitInputValidator(
@@ -227,6 +229,10 @@ def fit(schema_madlib, source_table, model, 
dependent_varname,
         fit_validator.source_summary_table, NORMALIZING_CONST_COLNAME)
     dep_vartype = plpy.execute("SELECT {0} AS dep FROM {1}".format(
         DEPENDENT_VARTYPE_COLNAME, 
fit_validator.source_summary_table))[0]['dep']
+    dependent_varname_in_source_table = quote_ident(plpy.execute("SELECT {0} 
FROM {1}".format(
+        'dependent_varname', 
fit_validator.source_summary_table))[0]['dependent_varname'])
+    independent_varname_in_source_table = quote_ident(plpy.execute("SELECT {0} 
FROM {1}".format(
+        'independent_varname', 
fit_validator.source_summary_table))[0]['independent_varname'])
     create_output_summary_table = plpy.prepare("""
         CREATE TABLE {0}_summary AS
         SELECT
@@ -279,8 +285,8 @@ def fit(schema_madlib, source_table, model, 
dependent_varname,
             "madlib_keras",
             start_training_time, end_training_time,
             source_table, validation_table,
-            model, dependent_varname,
-            independent_varname, name, description,
+            model, dependent_varname_in_source_table,
+            independent_varname_in_source_table, name, description,
             sys.getsizeof(model), version, compile_params,
             fit_params, num_iterations, num_classes,
             aggregate_accuracy[-1],
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
index a492d14..1844a29 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
@@ -31,8 +31,6 @@ m4_include(`SQLCommon.m4')
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit(
     source_table            VARCHAR,
     model                   VARCHAR,
-    dependent_varname       VARCHAR,
-    independent_varname     VARCHAR,
     model_arch_table        VARCHAR,
     model_arch_id           INTEGER,
     compile_params          VARCHAR,
@@ -53,8 +51,6 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', 
`');
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit(
     source_table            VARCHAR,
     model                   VARCHAR,
-    dependent_varname       VARCHAR,
-    independent_varname     VARCHAR,
     model_arch_table        VARCHAR,
     model_arch_id           INTEGER,
     compile_params          VARCHAR,
@@ -63,16 +59,13 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit(
     gpus_per_host           INTEGER,
     validation_table        VARCHAR
 ) RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8, $9,
-                                          $10, $11, NULL, NULL);
+    SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8, $9, 
NULL, NULL);
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit(
     source_table            VARCHAR,
     model                   VARCHAR,
-    dependent_varname       VARCHAR,
-    independent_varname     VARCHAR,
     model_arch_table        VARCHAR,
     model_arch_id           INTEGER,
     compile_params          VARCHAR,
@@ -80,22 +73,20 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit(
     num_iterations          INTEGER,
     gpus_per_host           INTEGER
 ) RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8, $9, 
$10, NULL, NULL, NULL);
+    SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8, 
NULL, NULL, NULL);
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit(
     source_table            VARCHAR,
     model                   VARCHAR,
-    dependent_varname       VARCHAR,
-    independent_varname     VARCHAR,
     model_arch_table        VARCHAR,
     model_arch_id           INTEGER,
     compile_params          VARCHAR,
     fit_params              VARCHAR,
     num_iterations          INTEGER
 ) RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8, $9, 
0, NULL, NULL, NULL);
+    SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, 0, NULL, 
NULL, NULL);
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
 
diff --git a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in 
b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
index 15d4725..9d34303 100644
--- a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
+++ b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
@@ -105,8 +105,6 @@ DROP TABLE IF EXISTS keras_saved_out, 
keras_saved_out_summary;
 SELECT madlib_keras_fit(
     'cifar_10_sample_batched',
     'keras_saved_out',
-    'dependent_var',
-    'independent_var',
     'model_arch',
     1,
     $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
@@ -124,9 +122,9 @@ SELECT assert(
         source_table = 'cifar_10_sample_batched' AND
         validation_table = 'cifar_10_sample_val' AND
         model = 'keras_saved_out' AND
-        dependent_varname = 'dependent_var' AND
+        dependent_varname = 'y' AND
         dependent_vartype = 'smallint' AND
-        independent_varname = 'independent_var' AND
+        independent_varname = 'x' AND
         normalizing_const = 255.0 AND
         name is NULL AND
         description is NULL AND
@@ -158,8 +156,6 @@ DROP TABLE IF EXISTS keras_saved_out_gpu, 
keras_saved_out_gpu_summary;
 SELECT assert(trap_error($TRAP$madlib_keras_fit(
     'cifar_10_sample_batched',
     'keras_saved_out_gpu',
-    'dependent_var',
-    'independent_var',
     'model_arch',
     1,
     $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
@@ -195,8 +191,6 @@ DROP TABLE IF EXISTS keras_out, keras_out_summary;
 SELECT madlib_keras_fit(
     'cifar_10_sample_batched',
     'keras_out',
-    'dependent_var',
-    'independent_var',
     'model_arch',
     1,
     $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
@@ -214,9 +208,9 @@ SELECT assert(
     source_table = 'cifar_10_sample_batched' AND
     validation_table is NULL AND
     model = 'keras_out' AND
-    dependent_varname = 'dependent_var' AND
+    dependent_varname = 'y' AND
     dependent_vartype = 'smallint' AND
-    independent_varname = 'independent_var' AND
+    independent_varname = 'x' AND
     name = 'model name' AND
     description = 'model desc' AND
     model_size > 0 AND
@@ -258,13 +252,13 @@ SELECT assert(UPPER(atttypid::regtype::TEXT) = 'INTEGER', 
'id column should be I
 SELECT assert(UPPER(atttypid::regtype::TEXT) =
     'SMALLINT', 'prediction column should be SMALLINT type')
     FROM pg_attribute WHERE attrelid = 'cifar10_predict'::regclass
-        AND attname = 'estimated_dependent_var';
+        AND attname = 'estimated_y';
 
 -- Validate correct number of rows returned.
 SELECT assert(COUNT(*)=2, 'Output table of madlib_keras_predict should have 
two rows') FROM cifar10_predict;
 
 -- First test that all values are in set of class values; if this breaks, it's 
definitely a problem.
-SELECT assert(estimated_dependent_var IN (0,1),
+SELECT assert(estimated_y IN (0,1),
               'Predicted value not in set of defined class values for model')
 FROM cifar10_predict;
 
@@ -284,8 +278,6 @@ DROP TABLE IF EXISTS keras_out, keras_out_summary;
 SELECT madlib_keras_fit(
     'cifar_10_sample_batched',
     'keras_out',
-    'dependent_var',
-    'independent_var',
     'model_arch',
     1,
     $$ optimizer='SGD', loss=losses.categorical_crossentropy, 
metrics=['accuracy']$$::text,
@@ -299,8 +291,6 @@ DROP TABLE IF EXISTS keras_out, keras_out_summary;
 SELECT madlib_keras_fit(
     'cifar_10_sample_batched',
     'keras_out',
-    'dependent_var',
-    'independent_var',
     'model_arch',
     1,
     $$ optimizer='Adam()', loss=losses.categorical_crossentropy, 
metrics=['accuracy']$$::text,
@@ -314,8 +304,6 @@ DROP TABLE IF EXISTS keras_out, keras_out_summary;
 SELECT madlib_keras_fit(
     'cifar_10_sample_batched',
     'keras_out',
-    'dependent_var',
-    'independent_var',
     'model_arch',
     1,
     $$ optimizer=Adam(epsilon=None), loss=losses.categorical_crossentropy, 
metrics=['accuracy']$$::text,
@@ -330,8 +318,6 @@ DROP TABLE IF EXISTS keras_out, keras_out_summary;
 SELECT madlib_keras_fit(
     'cifar_10_sample_batched',
     'keras_out',
-    'dependent_var',
-    'independent_var',
     'model_arch',
     1,
     $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
metrics=['accuracy'], loss_weights=[2], sample_weight_mode=None, 
loss='categorical_crossentropy' $$::text,
@@ -350,8 +336,6 @@ DROP TABLE IF EXISTS keras_out, keras_out_summary;
 select assert(trap_error($TRAP$madlib_keras_fit(
            'cifar_10_sample_batched',
            'keras_out',
-           'dependent_var',
-           'independent_var',
            'model_arch',
            1,
            $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
@@ -431,8 +415,6 @@ DROP TABLE IF EXISTS keras_saved_out, 
keras_saved_out_summary;
 SELECT madlib_keras_fit(
     'cifar_10_sample_text_batched',
     'keras_saved_out',
-    'dependent_var',
-    'independent_var',
     'model_arch',
     1,
     $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
@@ -497,7 +479,7 @@ SELECT assert(UPPER(atttypid::regtype::TEXT) =
     'TEXT', 'prediction column should be TEXT type')
 FROM pg_attribute
 WHERE attrelid = 'cifar10_predict'::regclass
-      AND attname = 'estimated_dependent_var';
+      AND attname = 'estimated_y';
 
 -- Tests where the assumption is user has one-hot encoded, so class_values
 -- in input summary table will be NULL.
@@ -541,7 +523,7 @@ SELECT assert(UPPER(atttypid::regtype::TEXT) =
     'DOUBLE PRECISION', 'prediction column should be double precision type')
 FROM pg_attribute
 WHERE attrelid = 'cifar10_predict'::regclass
-      AND attname = 'estimated_dependent_var';
+      AND attname = 'estimated_y';
 
 -- Test predict with INTEGER class_values
 -- with NULL as a valid class value
@@ -563,8 +545,6 @@ DROP TABLE IF EXISTS keras_saved_out, 
keras_saved_out_summary;
 SELECT madlib_keras_fit(
     'cifar_10_sample_int_batched',
     'keras_saved_out',
-    'dependent_var',
-    'independent_var',
     'model_arch',
     1,
     $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
@@ -620,7 +600,7 @@ SELECT madlib_keras_predict(
 SELECT assert(UPPER(atttypid::regtype::TEXT) =
     'SMALLINT', 'prediction column should be smallint type')
 FROM pg_attribute
-WHERE attrelid = 'cifar10_predict'::regclass AND attname = 
'estimated_dependent_var';
+WHERE attrelid = 'cifar10_predict'::regclass AND attname = 'estimated_y';
 
 -- Test case with a different input shape (3, 32, 32) instead of (32, 32, 3).
 -- Create a new table with image shape 3, 32, 32
@@ -661,8 +641,6 @@ DROP TABLE IF EXISTS keras_saved_out, 
keras_saved_out_summary;
 SELECT madlib_keras_fit(
     'cifar_10_sample_test_shape_batched',
     'keras_saved_out',
-    'dependent_var',
-    'independent_var',
     'model_arch',
     1,
     $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
diff --git 
a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
 
b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
index 5eedd2e..d6a8e2b 100644
--- 
a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
+++ 
b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
@@ -42,7 +42,8 @@ class MadlibKerasFitTestCase(unittest.TestCase):
     def setUp(self):
         self.plpy_mock = Mock(spec='error')
         patches = {
-            'plpy': plpy
+            'plpy': plpy,
+            'utilities.minibatch_preprocessing': Mock()
         }
 
         self.plpy_mock_execute = MagicMock()

[madlib] branch master updated: DL: Naming improvements for dependent and independent varname

Reply via email to