This is an automated email from the ASF dual-hosted git repository.
jingyimei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git
The following commit(s) were added to refs/heads/master by this push:
new d9c2025 DL: Naming improvements for dependent and independent varname
d9c2025 is described below
commit d9c2025958a5f58625b26700ca003ee3feefa2ec
Author: Jingyi Mei <[email protected]>
AuthorDate: Wed May 1 17:04:23 2019 -0700
DL: Naming improvements for dependent and independent varname
JIRA: MADLIB-1324
Previously, the names/values of 'dependent_varname' and 'independent_varname'
in madlib_keras_fit() were a bit confusing. This commit does the following:
1. Remove dependent_varname and independent_varname from the fit()
interface, since we know what these are from the minibatch summary table
and the user must use minibatching for DL.
2. Use the original column names from the source table (i.e., pre-minibatch) in
the fit summary table, not the generic names dependent_var and independent_var.
3. As a consequence, the predict summary table will also have the
original column names instead of dependent_var and independent_var.
Closes #387
---
.../modules/deep_learning/madlib_keras.py_in | 22 +++++++-----
.../modules/deep_learning/madlib_keras.sql_in | 15 ++------
.../modules/deep_learning/test/madlib_keras.sql_in | 40 +++++-----------------
.../test/unit_tests/test_madlib_keras.py_in | 3 +-
4 files changed, 28 insertions(+), 52 deletions(-)
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
index 8b2a747..d08bad6 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
@@ -36,6 +36,8 @@ from keras.models import *
from keras.optimizers import *
from keras.regularizers import *
import madlib_keras_serializer
+from input_data_preprocessor import MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL
+from input_data_preprocessor import MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL
from madlib_keras_helper import CLASS_VALUES_COLNAME
from madlib_keras_helper import DEPENDENT_VARTYPE_COLNAME
from madlib_keras_helper import expand_input_dims
@@ -52,14 +54,14 @@ from utilities.utilities import madlib_version
from utilities.validate_args import get_col_value_and_type
from utilities.validate_args import quote_ident
-def fit(schema_madlib, source_table, model, dependent_varname,
- independent_varname, model_arch_table, model_arch_id, compile_params,
- fit_params, num_iterations, gpus_per_host = 0,
- validation_table=None, name="", description="", **kwargs):
+def fit(schema_madlib, source_table, model,model_arch_table,
+ model_arch_id, compile_params, fit_params, num_iterations,
+ gpus_per_host = 0, validation_table=None, name="",
+ description="", **kwargs):
source_table = quote_ident(source_table)
- dependent_varname = quote_ident(dependent_varname)
- independent_varname = quote_ident(independent_varname)
+ dependent_varname = MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL
+ independent_varname = MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL
model_arch_table = quote_ident(model_arch_table)
fit_validator = FitInputValidator(
@@ -227,6 +229,10 @@ def fit(schema_madlib, source_table, model,
dependent_varname,
fit_validator.source_summary_table, NORMALIZING_CONST_COLNAME)
dep_vartype = plpy.execute("SELECT {0} AS dep FROM {1}".format(
DEPENDENT_VARTYPE_COLNAME,
fit_validator.source_summary_table))[0]['dep']
+ dependent_varname_in_source_table = quote_ident(plpy.execute("SELECT {0}
FROM {1}".format(
+ 'dependent_varname',
fit_validator.source_summary_table))[0]['dependent_varname'])
+ independent_varname_in_source_table = quote_ident(plpy.execute("SELECT {0}
FROM {1}".format(
+ 'independent_varname',
fit_validator.source_summary_table))[0]['independent_varname'])
create_output_summary_table = plpy.prepare("""
CREATE TABLE {0}_summary AS
SELECT
@@ -279,8 +285,8 @@ def fit(schema_madlib, source_table, model,
dependent_varname,
"madlib_keras",
start_training_time, end_training_time,
source_table, validation_table,
- model, dependent_varname,
- independent_varname, name, description,
+ model, dependent_varname_in_source_table,
+ independent_varname_in_source_table, name, description,
sys.getsizeof(model), version, compile_params,
fit_params, num_iterations, num_classes,
aggregate_accuracy[-1],
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
index a492d14..1844a29 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
@@ -31,8 +31,6 @@ m4_include(`SQLCommon.m4')
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit(
source_table VARCHAR,
model VARCHAR,
- dependent_varname VARCHAR,
- independent_varname VARCHAR,
model_arch_table VARCHAR,
model_arch_id INTEGER,
compile_params VARCHAR,
@@ -53,8 +51,6 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA',
`');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit(
source_table VARCHAR,
model VARCHAR,
- dependent_varname VARCHAR,
- independent_varname VARCHAR,
model_arch_table VARCHAR,
model_arch_id INTEGER,
compile_params VARCHAR,
@@ -63,16 +59,13 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit(
gpus_per_host INTEGER,
validation_table VARCHAR
) RETURNS VOID AS $$
- SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8, $9,
- $10, $11, NULL, NULL);
+ SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8, $9,
NULL, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit(
source_table VARCHAR,
model VARCHAR,
- dependent_varname VARCHAR,
- independent_varname VARCHAR,
model_arch_table VARCHAR,
model_arch_id INTEGER,
compile_params VARCHAR,
@@ -80,22 +73,20 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit(
num_iterations INTEGER,
gpus_per_host INTEGER
) RETURNS VOID AS $$
- SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8, $9,
$10, NULL, NULL, NULL);
+ SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8,
NULL, NULL, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit(
source_table VARCHAR,
model VARCHAR,
- dependent_varname VARCHAR,
- independent_varname VARCHAR,
model_arch_table VARCHAR,
model_arch_id INTEGER,
compile_params VARCHAR,
fit_params VARCHAR,
num_iterations INTEGER
) RETURNS VOID AS $$
- SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8, $9,
0, NULL, NULL, NULL);
+ SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, 0, NULL,
NULL, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
diff --git a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
index 15d4725..9d34303 100644
--- a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
+++ b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
@@ -105,8 +105,6 @@ DROP TABLE IF EXISTS keras_saved_out,
keras_saved_out_summary;
SELECT madlib_keras_fit(
'cifar_10_sample_batched',
'keras_saved_out',
- 'dependent_var',
- 'independent_var',
'model_arch',
1,
$$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True),
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
@@ -124,9 +122,9 @@ SELECT assert(
source_table = 'cifar_10_sample_batched' AND
validation_table = 'cifar_10_sample_val' AND
model = 'keras_saved_out' AND
- dependent_varname = 'dependent_var' AND
+ dependent_varname = 'y' AND
dependent_vartype = 'smallint' AND
- independent_varname = 'independent_var' AND
+ independent_varname = 'x' AND
normalizing_const = 255.0 AND
name is NULL AND
description is NULL AND
@@ -158,8 +156,6 @@ DROP TABLE IF EXISTS keras_saved_out_gpu,
keras_saved_out_gpu_summary;
SELECT assert(trap_error($TRAP$madlib_keras_fit(
'cifar_10_sample_batched',
'keras_saved_out_gpu',
- 'dependent_var',
- 'independent_var',
'model_arch',
1,
$$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True),
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
@@ -195,8 +191,6 @@ DROP TABLE IF EXISTS keras_out, keras_out_summary;
SELECT madlib_keras_fit(
'cifar_10_sample_batched',
'keras_out',
- 'dependent_var',
- 'independent_var',
'model_arch',
1,
$$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True),
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
@@ -214,9 +208,9 @@ SELECT assert(
source_table = 'cifar_10_sample_batched' AND
validation_table is NULL AND
model = 'keras_out' AND
- dependent_varname = 'dependent_var' AND
+ dependent_varname = 'y' AND
dependent_vartype = 'smallint' AND
- independent_varname = 'independent_var' AND
+ independent_varname = 'x' AND
name = 'model name' AND
description = 'model desc' AND
model_size > 0 AND
@@ -258,13 +252,13 @@ SELECT assert(UPPER(atttypid::regtype::TEXT) = 'INTEGER',
'id column should be I
SELECT assert(UPPER(atttypid::regtype::TEXT) =
'SMALLINT', 'prediction column should be SMALLINT type')
FROM pg_attribute WHERE attrelid = 'cifar10_predict'::regclass
- AND attname = 'estimated_dependent_var';
+ AND attname = 'estimated_y';
-- Validate correct number of rows returned.
SELECT assert(COUNT(*)=2, 'Output table of madlib_keras_predict should have
two rows') FROM cifar10_predict;
-- First test that all values are in set of class values; if this breaks, it's
definitely a problem.
-SELECT assert(estimated_dependent_var IN (0,1),
+SELECT assert(estimated_y IN (0,1),
'Predicted value not in set of defined class values for model')
FROM cifar10_predict;
@@ -284,8 +278,6 @@ DROP TABLE IF EXISTS keras_out, keras_out_summary;
SELECT madlib_keras_fit(
'cifar_10_sample_batched',
'keras_out',
- 'dependent_var',
- 'independent_var',
'model_arch',
1,
$$ optimizer='SGD', loss=losses.categorical_crossentropy,
metrics=['accuracy']$$::text,
@@ -299,8 +291,6 @@ DROP TABLE IF EXISTS keras_out, keras_out_summary;
SELECT madlib_keras_fit(
'cifar_10_sample_batched',
'keras_out',
- 'dependent_var',
- 'independent_var',
'model_arch',
1,
$$ optimizer='Adam()', loss=losses.categorical_crossentropy,
metrics=['accuracy']$$::text,
@@ -314,8 +304,6 @@ DROP TABLE IF EXISTS keras_out, keras_out_summary;
SELECT madlib_keras_fit(
'cifar_10_sample_batched',
'keras_out',
- 'dependent_var',
- 'independent_var',
'model_arch',
1,
$$ optimizer=Adam(epsilon=None), loss=losses.categorical_crossentropy,
metrics=['accuracy']$$::text,
@@ -330,8 +318,6 @@ DROP TABLE IF EXISTS keras_out, keras_out_summary;
SELECT madlib_keras_fit(
'cifar_10_sample_batched',
'keras_out',
- 'dependent_var',
- 'independent_var',
'model_arch',
1,
$$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True),
metrics=['accuracy'], loss_weights=[2], sample_weight_mode=None,
loss='categorical_crossentropy' $$::text,
@@ -350,8 +336,6 @@ DROP TABLE IF EXISTS keras_out, keras_out_summary;
select assert(trap_error($TRAP$madlib_keras_fit(
'cifar_10_sample_batched',
'keras_out',
- 'dependent_var',
- 'independent_var',
'model_arch',
1,
$$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True),
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
@@ -431,8 +415,6 @@ DROP TABLE IF EXISTS keras_saved_out,
keras_saved_out_summary;
SELECT madlib_keras_fit(
'cifar_10_sample_text_batched',
'keras_saved_out',
- 'dependent_var',
- 'independent_var',
'model_arch',
1,
$$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True),
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
@@ -497,7 +479,7 @@ SELECT assert(UPPER(atttypid::regtype::TEXT) =
'TEXT', 'prediction column should be TEXT type')
FROM pg_attribute
WHERE attrelid = 'cifar10_predict'::regclass
- AND attname = 'estimated_dependent_var';
+ AND attname = 'estimated_y';
-- Tests where the assumption is user has one-hot encoded, so class_values
-- in input summary table will be NULL.
@@ -541,7 +523,7 @@ SELECT assert(UPPER(atttypid::regtype::TEXT) =
'DOUBLE PRECISION', 'prediction column should be double precision type')
FROM pg_attribute
WHERE attrelid = 'cifar10_predict'::regclass
- AND attname = 'estimated_dependent_var';
+ AND attname = 'estimated_y';
-- Test predict with INTEGER class_values
-- with NULL as a valid class value
@@ -563,8 +545,6 @@ DROP TABLE IF EXISTS keras_saved_out,
keras_saved_out_summary;
SELECT madlib_keras_fit(
'cifar_10_sample_int_batched',
'keras_saved_out',
- 'dependent_var',
- 'independent_var',
'model_arch',
1,
$$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True),
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
@@ -620,7 +600,7 @@ SELECT madlib_keras_predict(
SELECT assert(UPPER(atttypid::regtype::TEXT) =
'SMALLINT', 'prediction column should be smallint type')
FROM pg_attribute
-WHERE attrelid = 'cifar10_predict'::regclass AND attname =
'estimated_dependent_var';
+WHERE attrelid = 'cifar10_predict'::regclass AND attname = 'estimated_y';
-- Test case with a different input shape (3, 32, 32) instead of (32, 32, 3).
-- Create a new table with image shape 3, 32, 32
@@ -661,8 +641,6 @@ DROP TABLE IF EXISTS keras_saved_out,
keras_saved_out_summary;
SELECT madlib_keras_fit(
'cifar_10_sample_test_shape_batched',
'keras_saved_out',
- 'dependent_var',
- 'independent_var',
'model_arch',
1,
$$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True),
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
diff --git
a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
index 5eedd2e..d6a8e2b 100644
---
a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
+++
b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
@@ -42,7 +42,8 @@ class MadlibKerasFitTestCase(unittest.TestCase):
def setUp(self):
self.plpy_mock = Mock(spec='error')
patches = {
- 'plpy': plpy
+ 'plpy': plpy,
+ 'utilities.minibatch_preprocessing': Mock()
}
self.plpy_mock_execute = MagicMock()