This is an automated email from the ASF dual-hosted git repository.

njayaram pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git

commit 16b18e3d6f10122b12079e309b211a81e1e6c673
Author: Nandish Jayaram <[email protected]>
AuthorDate: Tue Apr 30 17:18:04 2019 -0700

    DL: Dev-check for validation_preprocessor_dl
    
    JIRA: MADLIB-1333
    Add necessary dev check tests, and minor changes in unit test file for
    input_preprocessor_dl.
    
    Co-authored-by: Ekta Khanna <[email protected]>
---
 .../test/input_data_preprocessor.sql_in            | 198 ++++++++++++++++++++-
 .../unit_tests/test_input_data_preprocessor.py_in  |  22 +--
 2 files changed, 201 insertions(+), 19 deletions(-)

diff --git 
a/src/ports/postgres/modules/deep_learning/test/input_data_preprocessor.sql_in 
b/src/ports/postgres/modules/deep_learning/test/input_data_preprocessor.sql_in
index 62014d9..ff0452e 100644
--- 
a/src/ports/postgres/modules/deep_learning/test/input_data_preprocessor.sql_in
+++ 
b/src/ports/postgres/modules/deep_learning/test/input_data_preprocessor.sql_in
@@ -60,6 +60,27 @@ FROM data_preprocessor_input_batch WHERE buffer_id=1;
 SELECT assert(array_upper(independent_var, 1)=4, 'Incorrect buffer size.')
 FROM data_preprocessor_input_batch WHERE buffer_id=3;
 
+DROP TABLE IF EXISTS validation_out, validation_out_summary;
+SELECT validation_preprocessor_dl(
+  'data_preprocessor_input',
+  'validation_out',
+  'id',
+  'x',
+  'data_preprocessor_input_batch',
+  5);
+
+SELECT assert(count(*)=4, 'Incorrect number of buffers in validation_out.')
+FROM validation_out;
+
+SELECT assert(array_upper(independent_var, 2)=6, 'Incorrect buffer size.')
+FROM validation_out WHERE buffer_id=0;
+
+SELECT assert(array_upper(independent_var, 1)=5, 'Incorrect buffer size.')
+FROM validation_out WHERE buffer_id=1;
+
+SELECT assert(array_upper(independent_var, 1)=4, 'Incorrect buffer size.')
+FROM validation_out WHERE buffer_id=3;
+
 DROP TABLE IF EXISTS data_preprocessor_input_batch, 
data_preprocessor_input_batch_summary;
 SELECT training_preprocessor_dl(
   'data_preprocessor_input',
@@ -116,10 +137,50 @@ SELECT assert
         dependent_vartype   = 'integer' AND
         class_values        = '{-6,-3,-1,0,2,3,4,5,6,7,8,9,10,12,NULL,NULL}' 
AND
         buffer_size         = 4 AND  -- we sort the class values in python
-        normalizing_const   = 5,
+        normalizing_const   = 5 AND
+        num_classes         = 16,
         'Summary Validation failed. Actual:' || __to_char(summary)
         ) from (select * from data_preprocessor_input_batch_summary) summary;
 
+-- Test for validation data where the input table has only a subset of
+-- the classes compared to the original training data
+-- (data_preprocessor_input_batch). The one hot encoding must be based
+-- on class_values from data_preprocessor_input_batch_summary rather
+-- than the class levels found in validation_input.
+DROP TABLE IF EXISTS validation_input;
+CREATE TABLE validation_input(id serial, x_new double precision[], y_new 
INTEGER, y1 BOOLEAN, y2 TEXT, y3 DOUBLE PRECISION, y4 DOUBLE PRECISION[], y5 
INTEGER[]);
+INSERT INTO validation_input(x_new, y_new, y1, y2, y3, y4, y5) VALUES
+(ARRAY[1,2,3,4,5,6], 4, TRUE, 'a', 4.0, ARRAY[1.0, 0.0], ARRAY[1,0]);
+DROP TABLE IF EXISTS validation_out, validation_out_summary;
+SELECT validation_preprocessor_dl(
+  'validation_input',
+  'validation_out',
+  'y_new',
+  'x_new',
+  'data_preprocessor_input_batch');
+-- Hard code 5.0 as the normalizing constant, based on the previous
+-- query's input param, to test if normalization is correct.
+SELECT assert(abs(x_new[1]/5.0-independent_var[1][1]) < 0.0000001, 'Incorrect 
normalizing in validation table.')
+FROM validation_input, validation_out;
+-- Validate if one hot encoding is as expected.
+SELECT assert(dependent_var = '{{0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0}}', 
'Incorrect one-hot encode dimension with num_classes') FROM
+  validation_out WHERE buffer_id = 0;
+
+-- Test summary table
+SELECT assert
+        (
+        source_table        = 'validation_input' AND
+        output_table        = 'validation_out' AND
+        dependent_varname   = 'y_new' AND
+        independent_varname = 'x_new' AND
+        dependent_vartype   = 'integer' AND
+        class_values        = '{-6,-3,-1,0,2,3,4,5,6,7,8,9,10,12,NULL,NULL}' 
AND
+        buffer_size         = 1 AND  -- we sort the class values in python
+        normalizing_const   = 5 AND
+        num_classes         = 16,
+        'Summary Validation failed. Actual:' || __to_char(summary)
+        ) from (select * from validation_out_summary) summary;
+
 -- Test one-hot encoding for dependent_var
 -- test boolean type
 DROP TABLE IF EXISTS data_preprocessor_input_batch, 
data_preprocessor_input_batch_summary;
@@ -135,10 +196,30 @@ SELECT assert(array_upper(dependent_var, 2) = 2, 
'Incorrect one-hot encode dimen
   data_preprocessor_input_batch WHERE buffer_id = 0;
 SELECT assert(SUM(y) = 1, 'Incorrect one-hot encode format') FROM (SELECT 
buffer_id, UNNEST(dependent_var[1:1]) as y FROM data_preprocessor_input_batch) 
a WHERE buffer_id = 0;
 SELECT assert (dependent_vartype   = 'boolean' AND
-               class_values        = '{f,t}',
+               class_values        = '{f,t}' AND
+               num_classes         = 2,
                'Summary Validation failed. Actual:' || __to_char(summary)
               ) from (select * from data_preprocessor_input_batch_summary) 
summary;
 
+-- Test to assert the output summary table for validation has the correct
+-- num_classes and class_values
+DROP TABLE IF EXISTS validation_input;
+CREATE TABLE validation_input(id serial, x_new double precision[], y INTEGER, 
y_new BOOLEAN, y2 TEXT, y3 DOUBLE PRECISION, y4 DOUBLE PRECISION[], y5 
INTEGER[]);
+INSERT INTO validation_input(x_new, y, y_new, y2, y3, y4, y5) VALUES
+(ARRAY[1,2,3,4,5,6], 4, TRUE, 'a', 4.0, ARRAY[1.0, 0.0], ARRAY[1,0]);
+DROP TABLE IF EXISTS validation_out, validation_out_summary;
+SELECT validation_preprocessor_dl(
+  'validation_input',
+  'validation_out',
+  'y_new',
+  'x_new',
+  'data_preprocessor_input_batch');
+
+SELECT assert (dependent_vartype   = 'boolean' AND
+               class_values        = '{f,t}' AND
+               num_classes         = 2,
+               'Summary Validation failed. Actual:' || __to_char(summary)
+              ) from (select * from validation_out_summary) summary;
 -- test text type
 DROP TABLE IF EXISTS data_preprocessor_input_batch, 
data_preprocessor_input_batch_summary;
 SELECT training_preprocessor_dl(
@@ -153,10 +234,23 @@ SELECT assert(array_upper(dependent_var, 2) = 3, 
'Incorrect one-hot encode dimen
   data_preprocessor_input_batch WHERE buffer_id = 0;
 SELECT assert(SUM(y) = 1, 'Incorrect one-hot encode format') FROM (SELECT 
buffer_id, UNNEST(dependent_var[1:1]) as y FROM data_preprocessor_input_batch) 
a WHERE buffer_id = 0;
 SELECT assert (dependent_vartype   = 'text' AND
-               class_values        = '{a,b,c}',
+               class_values        = '{a,b,c}' AND
+               num_classes         = 3,
                'Summary Validation failed. Actual:' || __to_char(summary)
               ) from (select * from data_preprocessor_input_batch_summary) 
summary;
 
+-- data_preprocessor_input_batch was created for boolean dependent var,
+-- trying to create validation data using this table for reference, but
+-- passing a non-boolean dep var should error out.
+select assert(trap_error($TRAP$validation_preprocessor_dl(
+  'validation_input',
+  'validation_out',
+  'y_new',
+  'x_new',
+  'data_preprocessor_input_batch');$TRAP$) = 1,
+    'Passing integer dep var while expecting boolean dep var should error 
out.');
+
+
 -- test double precision type
 DROP TABLE IF EXISTS data_preprocessor_input_batch, 
data_preprocessor_input_batch_summary;
 SELECT training_preprocessor_dl(
@@ -171,7 +265,8 @@ SELECT assert(array_upper(dependent_var, 2) = 3, 'Incorrect 
one-hot encode dimen
   data_preprocessor_input_batch WHERE buffer_id = 0;
 SELECT assert(SUM(y) = 1, 'Incorrect one-hot encode format') FROM (SELECT 
buffer_id, UNNEST(dependent_var[1:1]) as y FROM data_preprocessor_input_batch) 
a WHERE buffer_id = 0;
 SELECT assert (dependent_vartype   = 'double precision' AND
-               class_values        = '{4.0,4.2,5.0}',
+               class_values        = '{4.0,4.2,5.0}' AND
+               num_classes         = 3,
                'Summary Validation failed. Actual:' || __to_char(summary)
               ) from (select * from data_preprocessor_input_batch_summary) 
summary;
 
@@ -189,10 +284,22 @@ SELECT assert(array_upper(dependent_var, 2) = 2, 
'Incorrect one-hot encode dimen
   data_preprocessor_input_batch WHERE buffer_id = 0;
 SELECT assert(relative_error(SUM(y), SUM(y4)) < 0.000001, 'Incorrect one-hot 
encode value') FROM (SELECT UNNEST(dependent_var) AS y FROM 
data_preprocessor_input_batch) a, (SELECT UNNEST(y4) as y4 FROM 
data_preprocessor_input) b;
 SELECT assert (dependent_vartype   = 'double precision[]' AND
-               class_values        IS NULL,
+               class_values        IS NULL AND
+               num_classes         IS NULL,
                'Summary Validation failed. Actual:' || __to_char(summary)
               ) from (select * from data_preprocessor_input_batch_summary) 
summary;
 
+DROP TABLE IF EXISTS validation_out, validation_out_summary;
+SELECT validation_preprocessor_dl(
+  'validation_input',
+  'validation_out',
+  'y4',
+  'x_new',
+  'data_preprocessor_input_batch');
+
+SELECT assert(dependent_var = '{{1,0}}', 'Incorrect one-hot encoding for 
already encoded dep var') FROM
+  validation_out WHERE buffer_id = 0;
+
 -- test integer array type
 DROP TABLE IF EXISTS data_preprocessor_input_batch, 
data_preprocessor_input_batch_summary;
 SELECT training_preprocessor_dl(
@@ -207,7 +314,8 @@ SELECT assert(array_upper(dependent_var, 2) = 2, 'Incorrect 
one-hot encode dimen
   data_preprocessor_input_batch WHERE buffer_id = 0;
 SELECT assert(relative_error(SUM(y), SUM(y5)) < 0.000001, 'Incorrect one-hot 
encode value') FROM (SELECT UNNEST(dependent_var) AS y FROM 
data_preprocessor_input_batch) a, (SELECT UNNEST(y5) as y5 FROM 
data_preprocessor_input) b;
 SELECT assert (dependent_vartype   = 'integer[]' AND
-               class_values        IS NULL,
+               class_values        IS NULL AND
+               num_classes         IS NULL,
                'Summary Validation failed. Actual:' || __to_char(summary)
               ) from (select * from data_preprocessor_input_batch_summary) 
summary;
 
@@ -255,6 +363,42 @@ SELECT assert
 SELECT assert(array_upper(dependent_var, 2) = 5, 'Incorrect one-hot encode 
dimension with NULL data') FROM
   data_preprocessor_input_batch WHERE buffer_id = 0;
 
+-- The same tests, but for validation.
+DROP TABLE IF EXISTS data_preprocessor_input_validation_null;
+CREATE TABLE data_preprocessor_input_validation_null(id serial, x double 
precision[], label TEXT);
+INSERT INTO data_preprocessor_input_validation_null(x, label) VALUES
+(ARRAY[1,2,3,4,5,6], 'a'),
+(ARRAY[11,2,3,4,5,6], 'b'),
+(ARRAY[11,2,3,4,5,6], 'b'),
+(ARRAY[111,22,33,4,5,6], NULL);
+DROP TABLE IF EXISTS validation_out_batch, validation_out_batch_summary;
+SELECT validation_preprocessor_dl(
+  'data_preprocessor_input_validation_null',
+  'validation_out_batch',
+  'label',
+  'x',
+  'data_preprocessor_input_batch',
+  1
+  );
+
+-- Test summary table if class_values has NULL as a legitimate
+-- class label, and also two other NULLs because num_classes=5
+-- but table has only 3 distinct class labels (including NULL)
+SELECT assert
+        (
+        class_values        = '{NULL,a,b,NULL,NULL}',
+        'Summary Validation failed with NULL data. Actual:' || 
__to_char(summary)
+        ) from (select * from validation_out_batch_summary) summary;
+
+-- Validate one hot encoding for specific row is correct
+SELECT assert(dependent_var = '{{0,1,0,0,0}}', 'Incorrect normalizing in 
validation table.')
+FROM data_preprocessor_input_validation_null, validation_out_batch
+WHERE x[1]=1 AND abs(x[1]/5.0 - independent_var[1][1]) < 0.000001;
+-- Assert one-hot encoding for NULL label
+SELECT assert(dependent_var = '{{1,0,0,0,0}}', 'Incorrect normalizing in 
validation table.')
+FROM data_preprocessor_input_validation_null, validation_out_batch
+WHERE x[1]=111 AND abs(x[1]/5.0 - independent_var[1][1]) < 0.000001;
+
 -- Test the content of 1-hot encoded dep var when NULL is the
 -- class label.
 DROP TABLE IF EXISTS data_preprocessor_input_null;
@@ -288,3 +432,45 @@ SELECT assert(array_upper(dependent_var, 2) = 3, 
'Incorrect one-hot encode dimen
 -- first index
 SELECT assert(dependent_var = '{{1,0,0}}', 'Incorrect one-hot encode dimension 
with NULL data') FROM
   data_preprocessor_input_batch WHERE buffer_id = 0;
+
+-- The same tests for validation.
+DROP TABLE IF EXISTS validation_out_batch, validation_out_batch_summary;
+SELECT validation_preprocessor_dl(
+  'data_preprocessor_input_null',
+  'validation_out_batch',
+  'label',
+  'x',
+  'data_preprocessor_input_batch',
+  4
+  );
+
+-- class_values must be '{NULL,NULL,NULL}' where the first NULL
+-- is for the class label seen in data, and the other two NULLs
+-- are added as num_classes=3.
+SELECT assert
+        (
+        class_values        = '{NULL,NULL,NULL}',
+        'Summary Validation failed with NULL data. Actual:' || 
__to_char(summary)
+        ) from (select * from validation_out_batch_summary) summary;
+
+SELECT assert(array_upper(dependent_var, 2) = 3, 'Incorrect one-hot encode 
dimension with NULL data') FROM
+  validation_out_batch WHERE buffer_id = 0;
+-- NULL is treated as a class label, so it should show '1' for the
+-- first index
+SELECT assert(dependent_var = '{{1,0,0}}', 'Incorrect one-hot encode dimension 
with NULL data') FROM
+  validation_out_batch WHERE buffer_id = 0;
+
+-- Test if validation class values is not a subset of training data class 
values.
+DROP TABLE IF EXISTS data_preprocessor_input_validation_null;
+CREATE TABLE data_preprocessor_input_validation_null(id serial, x double 
precision[], label TEXT);
+INSERT INTO data_preprocessor_input_validation_null(x, label) VALUES
+(ARRAY[11,2,3,4,5,6], 'bbbb');
+DROP TABLE IF EXISTS validation_out_batch, validation_out_batch_summary;
+select assert(trap_error($TRAP$validation_preprocessor_dl(
+  'data_preprocessor_input_validation_null',
+  'validation_out_batch',
+  'label',
+  'x',
+  'data_preprocessor_input_batch',
+  1);$TRAP$) = 1,
+    'validation class values should be a subset of training data class 
values.');
diff --git 
a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_input_data_preprocessor.py_in
 
b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_input_data_preprocessor.py_in
index 3cb735a..4eb835c 100644
--- 
a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_input_data_preprocessor.py_in
+++ 
b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_input_data_preprocessor.py_in
@@ -31,10 +31,6 @@ import plpy_mock as plpy
 
 m4_changequote(`<!', `!>')
 
-class AnyStringWith(str):
-    def __eq__(self, other):
-        return self in other
-
 class InputPreProcessorDLTestCase(unittest.TestCase):
     def setUp(self):
         self.plpy_mock = Mock(spec='error')
@@ -276,16 +272,16 @@ class InputPreProcessorDLTestCase(unittest.TestCase):
     def test_validate_num_classes_lesser(self):
         self.module.get_expr_type = Mock(side_effect = ['integer[]', 'text'])
         self.module.dependent_levels = Mock(return_value = ["'a'", "'b'", 
"'c'"])
+        obj = self.module.TrainingDataPreprocessorDL(
+            self.default_schema_madlib,
+            self.default_source_table,
+            self.default_output_table,
+            self.default_dep_var,
+            self.default_ind_var,
+            self.default_buffer_size,
+            self.default_normalizing_const,
+            2)
         with self.assertRaises(plpy.PLPYException):
-            obj = self.module.TrainingDataPreprocessorDL(
-                self.default_schema_madlib,
-                self.default_source_table,
-                self.default_output_table,
-                self.default_dep_var,
-                self.default_ind_var,
-                self.default_buffer_size,
-                self.default_normalizing_const,
-                2)
             obj._set_one_hot_encoding_variables()
 
 if __name__ == '__main__':

Reply via email to