fmcquillan99 edited a comment on issue #381: DL: Add input preprocessor for validation data URL: https://github.com/apache/madlib/pull/381#issuecomment-489358794 I think we should fix: (10) Wondering about: (6) (9) (14) (1) create image data ``` DROP TABLE IF EXISTS image_data; CREATE TABLE image_data AS ( SELECT ARRAY[ ARRAY[ ARRAY[(random() * 256)::integer, -- pixel (1,1) (random() * 256)::integer, (random() * 256)::integer], ARRAY[(random() * 256)::integer, -- pixel (2,1) (random() * 256)::integer, (random() * 256)::integer] ], ARRAY[ ARRAY[(random() * 256)::integer, -- pixel (1,2) (random() * 256)::integer, (random() * 256)::integer], ARRAY[(random() * 256)::integer, -- pixel (2,1) (random() * 256)::integer, (random() * 256)::integer] ] ] as x, ('{cat,dog,bird}'::text[])[ceil(random()*3)] as y FROM generate_series(1, 7) ); SELECT * FROM image_data; x | y ------------------------------------------------------------+------ {{{1,118,10},{149,103,151}},{{120,193,118},{14,228,52}}} | cat {{{16,255,166},{69,118,164}},{{209,61,189},{230,21,1}}} | bird {{{28,34,240},{138,17,241}},{{256,27,134},{103,178,255}}} | cat {{{151,96,176},{229,4,158}},{{24,119,196},{89,105,113}}} | cat {{{181,161,175},{253,75,39}},{{140,227,99},{255,158,191}}} | bird {{{190,14,99},{63,66,250}},{{159,242,223},{162,144,247}}} | cat {{{84,80,130},{197,122,56}},{{102,41,53},{177,80,193}}} | dog (7 rows) ``` (2) minibatch training ``` DROP TABLE IF EXISTS train_tiny1_packed, train_tiny1_packed_summary; SELECT madlib.training_preprocessor_dl('image_data', -- Source table 'train_tiny1_packed', -- Output table 'y', -- Dependent variable 'x', -- Independent variable NULL, -- Buffer size 1 -- Normalizing constant ); SELECT * FROM train_tiny1_packed ORDER BY buffer_id; -[ RECORD 1 
]---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- independent_var | {{{{190,14,99},{63,66,250}},{{159,242,223},{162,144,247}}},{{{151,96,176},{229,4,158}},{{24,119,196},{89,105,113}}},{{{16,255,166},{69,118,164}},{{209,61,189},{230,21,1}}}} dependent_var | {{0,1,0},{0,1,0},{1,0,0}} buffer_id | 0 -[ RECORD 2 ]---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- independent_var | {{{{84,80,130},{197,122,56}},{{102,41,53},{177,80,193}}},{{{181,161,175},{253,75,39}},{{140,227,99},{255,158,191}}},{{{28,34,240},{138,17,241}},{{256,27,134},{103,178,255}}},{{{1,118,10},{149,103,151}},{{120,193,118},{14,228,52}}}} dependent_var | {{0,0,1},{1,0,0},{0,1,0},{0,1,0}} buffer_id | 1 ``` train row order buffer 0: 6, 4, 2 row order buffer 1: 7, 5, 3, 1 OK (3) rerun minibatch training ``` madlib=# SELECT * FROM train_tiny1_packed ORDER BY buffer_id; -[ RECORD 1 ]---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- independent_var | {{{{16,255,166},{69,118,164}},{{209,61,189},{230,21,1}}},{{{190,14,99},{63,66,250}},{{159,242,223},{162,144,247}}},{{{151,96,176},{229,4,158}},{{24,119,196},{89,105,113}}}} dependent_var | {{1,0,0},{0,1,0},{0,1,0}} buffer_id | 0 -[ RECORD 2 ]---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- independent_var | 
{{{{28,34,240},{138,17,241}},{{256,27,134},{103,178,255}}},{{{84,80,130},{197,122,56}},{{102,41,53},{177,80,193}}},{{{181,161,175},{253,75,39}},{{140,227,99},{255,158,191}}},{{{1,118,10},{149,103,151}},{{120,193,118},{14,228,52}}}} dependent_var | {{0,1,0},{0,0,1},{1,0,0},{0,1,0}} buffer_id | 1 ``` train row order buffer 0: 2, 6, 4 row order buffer 1: 3, 7, 5, 1 OK ^^^ is a new random order for training (4) minibatch validation ``` DROP TABLE IF EXISTS test_tiny1_packed, test_tiny1_packed_summary; SELECT madlib.validation_preprocessor_dl('image_data', -- Source table 'test_tiny1_packed', -- Output table 'y', -- Dependent variable 'x', -- Independent variable 'train_tiny1_packed' ); SELECT * FROM test_tiny1_packed ORDER BY buffer_id; -[ RECORD 1 ]---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- independent_var | {{{{16,255,166},{69,118,164}},{{209,61,189},{230,21,1}}},{{{151,96,176},{229,4,158}},{{24,119,196},{89,105,113}}},{{{190,14,99},{63,66,250}},{{159,242,223},{162,144,247}}}} dependent_var | {{1,0,0},{0,1,0},{0,1,0}} buffer_id | 0 -[ RECORD 2 ]---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- independent_var | {{{{1,118,10},{149,103,151}},{{120,193,118},{14,228,52}}},{{{28,34,240},{138,17,241}},{{256,27,134},{103,178,255}}},{{{181,161,175},{253,75,39}},{{140,227,99},{255,158,191}}},{{{84,80,130},{197,122,56}},{{102,41,53},{177,80,193}}}} dependent_var | {{0,1,0},{0,1,0},{1,0,0},{0,0,1}} buffer_id | 1 ``` validation row order buffer 0: 2, 4, 6 row order buffer 1: 1, 3, 5, 7 OK (5) rerun minibatch validation ``` -[ RECORD 1 
]---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- independent_var | {{{{16,255,166},{69,118,164}},{{209,61,189},{230,21,1}}},{{{151,96,176},{229,4,158}},{{24,119,196},{89,105,113}}},{{{190,14,99},{63,66,250}},{{159,242,223},{162,144,247}}}} dependent_var | {{1,0,0},{0,1,0},{0,1,0}} buffer_id | 0 -[ RECORD 2 ]---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- independent_var | {{{{1,118,10},{149,103,151}},{{120,193,118},{14,228,52}}},{{{28,34,240},{138,17,241}},{{256,27,134},{103,178,255}}},{{{181,161,175},{253,75,39}},{{140,227,99},{255,158,191}}},{{{84,80,130},{197,122,56}},{{102,41,53},{177,80,193}}}} dependent_var | {{0,1,0},{0,1,0},{1,0,0},{0,0,1}} buffer_id | 1 ``` validation row order buffer 0: 2, 4, 6 row order buffer 1: 1, 3, 5, 7 OK ^^^ is same order for validation data (not shuffled) (6) output table NULL ``` DROP TABLE IF EXISTS test_tiny1_packed, test_tiny1_packed_summary; SELECT madlib.validation_preprocessor_dl('image_data', -- Source table NULL, -- Output table 'y', -- Dependent variable 'x', -- Independent variable 'train_tiny1_packed' ); ERROR: AttributeError: 'NoneType' object has no attribute 'strip' (plpython.c:5038) CONTEXT: Traceback (most recent call last): PL/Python function "validation_preprocessor_dl", line 23, in <module> validation_preprocessor_obj = input_data_preprocessor.ValidationDataPreprocessorDL(**globals()) PL/Python function "validation_preprocessor_dl", line 297, in __init__ PL/Python function "validation_preprocessor_dl", line 71, in __init__ PL/Python function "validation_preprocessor_dl", line 183, in add_postfix PL/Python function 
"validation_preprocessor_dl" ``` That is a cryptic error, is what would normally be expected for a mandatory parameter that is set to NULL? Might be OK, just asking. (7) minibatch training table does not exist ``` DROP TABLE IF EXISTS test_tiny1_packed, test_tiny1_packed_summary; SELECT madlib.validation_preprocessor_dl('image_data', -- Source table 'test_tiny1_packed', -- Output table 'y', -- Dependent variable 'x', -- Independent variable 'xyz' ); ERROR: plpy.Error: validation_preprocessor_DL error: Input table 'xyz' does not exist (plpython.c:5038) CONTEXT: Traceback (most recent call last): PL/Python function "validation_preprocessor_dl", line 23, in <module> validation_preprocessor_obj = input_data_preprocessor.ValidationDataPreprocessorDL(**globals()) PL/Python function "validation_preprocessor_dl", line 291, in __init__ PL/Python function "validation_preprocessor_dl", line 340, in _validate_and_process_training_preprocessor_table PL/Python function "validation_preprocessor_dl", line 670, in input_tbl_valid PL/Python function "validation_preprocessor_dl" ``` OK (8) minibatch training table exists but is the wrong format ``` DROP TABLE IF EXISTS test_tiny1_packed, test_tiny1_packed_summary; SELECT madlib.validation_preprocessor_dl('image_data', -- Source table 'test_tiny1_packed', -- Output table 'y', -- Dependent variable 'x', -- Independent variable 'image_data' ); ERROR: plpy.Error: validation_preprocessor_DL error: Input table 'image_data_summary' does not exist (plpython.c:5038) CONTEXT: Traceback (most recent call last): PL/Python function "validation_preprocessor_dl", line 23, in <module> validation_preprocessor_obj = input_data_preprocessor.ValidationDataPreprocessorDL(**globals()) PL/Python function "validation_preprocessor_dl", line 291, in __init__ PL/Python function "validation_preprocessor_dl", line 343, in _validate_and_process_training_preprocessor_table PL/Python function "validation_preprocessor_dl", line 670, in input_tbl_valid PL/Python 
function "validation_preprocessor_dl" ``` OK (9) use minibatch validation table instead of training table ``` DROP TABLE IF EXISTS test_tiny2_packed, test_tiny2_packed_summary; SELECT madlib.validation_preprocessor_dl('image_data', -- Source table 'test_tiny2_packed', -- Output table 'y', -- Dependent variable 'x', -- Independent variable 'train_tiny1_packed' ); SELECT * FROM test_tiny2_packed ORDER BY buffer_id; DROP TABLE IF EXISTS test_tiny2_packed, test_tiny2_packed_summary; SELECT madlib.validation_preprocessor_dl('image_data', -- Source table 'test_tiny2_packed', -- Output table 'y', -- Dependent variable 'x', -- Independent variable 'test_tiny1_packed' ); SELECT * FROM test_tiny2_packed ORDER BY buffer_id; ``` NOT OK??? ^^^ It is taking a validation data table and it still works? Should it only accept a training table, or can it use either? (10) Summary table missing num_classes from training and validation summary tables: ``` SELECT * FROM train_tiny1_packed_summary; -[ RECORD 1 ]-------+------------------- source_table | image_data output_table | train_tiny1_packed dependent_varname | y independent_varname | x dependent_vartype | text class_values | {bird,cat,dog} buffer_size | 4 normalizing_const | 1.0 num_classes | SELECT * FROM test_tiny1_packed_summary; -[ RECORD 1 ]-------+------------------ source_table | image_data output_table | test_tiny1_packed dependent_varname | y independent_varname | x dependent_vartype | text class_values | {bird,cat,dog} buffer_size | 4 normalizing_const | 1.0 num_classes | ``` num_classes should not be blank in the summary table even when the num_classes parameter is left unspecified; it should be the computed value from the input data. (11) padding number of classes ``` DROP TABLE IF EXISTS train_tiny1_packed, train_tiny1_packed_summary; SELECT madlib.training_preprocessor_dl('image_data', -- Source table 'train_tiny1_packed', -- Output table 'y', -- Dependent variable 'x', -- Independent variable NULL, -- Buffer size 1, -- Normalizing constant 5 -- Number of classes ); SELECT * FROM 
train_tiny1_packed ORDER BY buffer_id; -[ RECORD 1 ]---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- independent_var | {{{{181,161,175},{253,75,39}},{{140,227,99},{255,158,191}}},{{{16,255,166},{69,118,164}},{{209,61,189},{230,21,1}}},{{{84,80,130},{197,122,56}},{{102,41,53},{177,80,193}}}} dependent_var | {{1,0,0,0,0},{1,0,0,0,0},{0,0,1,0,0}} buffer_id | 0 -[ RECORD 2 ]---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- independent_var | {{{{151,96,176},{229,4,158}},{{24,119,196},{89,105,113}}},{{{28,34,240},{138,17,241}},{{256,27,134},{103,178,255}}},{{{190,14,99},{63,66,250}},{{159,242,223},{162,144,247}}},{{{1,118,10},{149,103,151}},{{120,193,118},{14,228,52}}}} dependent_var | {{0,1,0,0,0},{0,1,0,0,0},{0,1,0,0,0},{0,1,0,0,0}} buffer_id | 1 ``` OK ``` DROP TABLE IF EXISTS test_tiny1_packed, test_tiny1_packed_summary; SELECT madlib.validation_preprocessor_dl('image_data', -- Source table 'test_tiny1_packed', -- Output table 'y', -- Dependent variable 'x', -- Independent variable 'train_tiny1_packed' ); SELECT * FROM test_tiny1_packed ORDER BY buffer_id; -[ RECORD 1 ]---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- independent_var | {{{{16,255,166},{69,118,164}},{{209,61,189},{230,21,1}}},{{{151,96,176},{229,4,158}},{{24,119,196},{89,105,113}}},{{{190,14,99},{63,66,250}},{{159,242,223},{162,144,247}}}} dependent_var | {{1,0,0,0,0},{0,1,0,0,0},{0,1,0,0,0}} buffer_id | 0 -[ RECORD 2 
]---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- independent_var | {{{{1,118,10},{149,103,151}},{{120,193,118},{14,228,52}}},{{{28,34,240},{138,17,241}},{{256,27,134},{103,178,255}}},{{{181,161,175},{253,75,39}},{{140,227,99},{255,158,191}}},{{{84,80,130},{197,122,56}},{{102,41,53},{177,80,193}}}} dependent_var | {{0,1,0,0,0},{0,1,0,0,0},{1,0,0,0,0},{0,0,1,0,0}} buffer_id | 1 ``` OK (12) mismatch number of classes ``` DROP TABLE IF EXISTS train_tiny1_packed, train_tiny1_packed_summary; SELECT madlib.training_preprocessor_dl('image_data', -- Source table 'train_tiny1_packed', -- Output table 'y', -- Dependent variable 'x', -- Independent variable NULL, -- Buffer size 1, -- Normalizing constant 2 -- Number of classes ); SELECT * FROM train_tiny1_packed ORDER BY buffer_id; ERROR: plpy.Error: training_preprocessor_DL: Invalid num_classes value specified. It must be equal to or greater than distinct class values found in table (3). 
(plpython.c:5038) CONTEXT: Traceback (most recent call last): PL/Python function "training_preprocessor_dl", line 24, in <module> training_preprocessor_obj.training_preprocessor_dl() PL/Python function "training_preprocessor_dl", line 397, in training_preprocessor_dl PL/Python function "training_preprocessor_dl", line 154, in input_preprocessor_dl PL/Python function "training_preprocessor_dl", line 94, in _set_one_hot_encoding_variables PL/Python function "training_preprocessor_dl", line 105, in _validate_num_classes PL/Python function "training_preprocessor_dl" madlib=# madlib=# SELECT * FROM train_tiny1_packed ORDER BY buffer_id; ERROR: relation "train_tiny1_packed" does not exist LINE 1: SELECT * FROM train_tiny1_packed ORDER BY buffer_id; ``` OK (13) fewer validation classes than training classes ``` DROP TABLE IF EXISTS image_data_val; CREATE TABLE image_data_val AS ( SELECT ARRAY[ ARRAY[ ARRAY[(random() * 256)::integer, -- pixel (1,1) (random() * 256)::integer, (random() * 256)::integer], ARRAY[(random() * 256)::integer, -- pixel (2,1) (random() * 256)::integer, (random() * 256)::integer] ], ARRAY[ ARRAY[(random() * 256)::integer, -- pixel (1,2) (random() * 256)::integer, (random() * 256)::integer], ARRAY[(random() * 256)::integer, -- pixel (2,1) (random() * 256)::integer, (random() * 256)::integer] ] ] as x, ('{cat,dog}'::text[])[ceil(random()*2)] as y FROM generate_series(1, 7) ); SELECT * FROM image_data_val; x | y -----------------------------------------------------------+----- {{{49,222,101},{50,80,102}},{{127,183,216},{72,24,232}}} | dog {{{150,169,249},{84,150,76}},{{97,191,81},{21,236,104}}} | dog {{{176,236,161},{116,252,209}},{{82,97,4},{162,199,130}}} | cat {{{159,203,113},{135,78,8}},{{48,71,91},{198,147,188}}} | dog {{{152,109,231},{109,133,9}},{{195,22,97},{251,188,250}}} | cat {{{97,12,240},{58,6,253}},{{65,97,35},{103,24,174}}} | dog {{{253,153,241},{156,68,137}},{{10,43,247},{143,52,186}}} | dog (7 rows) DROP TABLE IF EXISTS 
test_tiny1_packed, test_tiny1_packed_summary; SELECT madlib.validation_preprocessor_dl('image_data_val', -- Source table 'test_tiny1_packed', -- Output table 'y', -- Dependent variable 'x', -- Independent variable 'train_tiny1_packed' ); SELECT * FROM test_tiny1_packed ORDER BY buffer_id; -[ RECORD 1 ]---+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- independent_var | {{{{97,12,240},{58,6,253}},{{65,97,35},{103,24,174}}},{{{49,222,101},{50,80,102}},{{127,183,216},{72,24,232}}},{{{176,236,161},{116,252,209}},{{82,97,4},{162,199,130}}}} dependent_var | {{0,0,1},{0,0,1},{0,1,0}} buffer_id | 0 -[ RECORD 2 ]---+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- independent_var | {{{{152,109,231},{109,133,9}},{{195,22,97},{251,188,250}}},{{{253,153,241},{156,68,137}},{{10,43,247},{143,52,186}}},{{{150,169,249},{84,150,76}},{{97,191,81},{21,236,104}}},{{{159,203,113},{135,78,8}},{{48,71,91},{198,147,188}}}} dependent_var | {{0,1,0},{0,0,1},{0,0,1},{0,0,1}} buffer_id | 1 ``` OK (14) more validation classes than training classes ``` DROP TABLE IF EXISTS train_tiny1_packed, train_tiny1_packed_summary; SELECT madlib.training_preprocessor_dl('image_data_val', -- Source table 'train_tiny1_packed', -- Output table 'y', -- Dependent variable 'x', -- Independent variable NULL, -- Buffer size 1 -- Normalizing constant ); SELECT * FROM train_tiny1_packed ORDER BY buffer_id; -[ RECORD 1 ]---+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
independent_var | {{{{176,236,161},{116,252,209}},{{82,97,4},{162,199,130}}},{{{97,12,240},{58,6,253}},{{65,97,35},{103,24,174}}},{{{49,222,101},{50,80,102}},{{127,183,216},{72,24,232}}}} dependent_var | {{1,0},{0,1},{0,1}} buffer_id | 0 -[ RECORD 2 ]---+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- independent_var | {{{{159,203,113},{135,78,8}},{{48,71,91},{198,147,188}}},{{{152,109,231},{109,133,9}},{{195,22,97},{251,188,250}}},{{{253,153,241},{156,68,137}},{{10,43,247},{143,52,186}}},{{{150,169,249},{84,150,76}},{{97,191,81},{21,236,104}}}} dependent_var | {{0,1},{1,0},{0,1},{0,1}} buffer_id | 1 DROP TABLE IF EXISTS test_tiny1_packed, test_tiny1_packed_summary; SELECT madlib.validation_preprocessor_dl('image_data', -- Source table 'test_tiny1_packed', -- Output table 'y', -- Dependent variable 'x', -- Independent variable 'train_tiny1_packed' ); SELECT * FROM test_tiny1_packed ORDER BY buffer_id; -[ RECORD 1 ]---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- independent_var | {{{{181,161,175},{253,75,39}},{{140,227,99},{255,158,191}}},{{{84,80,130},{197,122,56}},{{102,41,53},{177,80,193}}},{{{16,255,166},{69,118,164}},{{209,61,189},{230,21,1}}}} dependent_var | {{0,0},{0,1},{0,0}} buffer_id | 0 -[ RECORD 2 ]---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- independent_var | 
{{{{151,96,176},{229,4,158}},{{24,119,196},{89,105,113}}},{{{190,14,99},{63,66,250}},{{159,242,223},{162,144,247}}},{{{1,118,10},{149,103,151}},{{120,193,118},{14,228,52}}},{{{28,34,240},{138,17,241}},{{256,27,134},{103,178,255}}}} dependent_var | {{1,0},{1,0},{1,0},{1,0}} buffer_id | 1 ``` I wonder if this should error out: the training class values are only {cat,dog}, so the validation rows labeled bird are encoded as all-zero vectors ({0,0}). (15) `buffer_size` in `validation_preprocessor_dl` < 1 ``` DROP TABLE IF EXISTS test_tiny1_packed, test_tiny1_packed_summary; SELECT madlib.validation_preprocessor_dl('image_data', -- Source table 'test_tiny1_packed', -- Output table 'y', -- Dependent variable 'x', -- Independent variable 'train_tiny1_packed', 0 -- buffer size ); SELECT * FROM test_tiny1_packed ORDER BY buffer_id; ERROR: plpy.Error: validation_preprocessor_DL: The buffer size has to be a positive integer or NULL. (plpython.c:5038) CONTEXT: Traceback (most recent call last): PL/Python function "validation_preprocessor_dl", line 23, in <module> validation_preprocessor_obj = input_data_preprocessor.ValidationDataPreprocessorDL(**globals()) PL/Python function "validation_preprocessor_dl", line 297, in __init__ PL/Python function "validation_preprocessor_dl", line 77, in __init__ PL/Python function "validation_preprocessor_dl", line 264, in _validate_args PL/Python function "validation_preprocessor_dl", line 78, in _assert PL/Python function "validation_preprocessor_dl" madlib=# madlib=# SELECT * FROM test_tiny1_packed ORDER BY buffer_id; ERROR: relation "test_tiny1_packed" does not exist LINE 1: SELECT * FROM test_tiny1_packed ORDER BY buffer_id; ``` OK
---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org With regards, Apache Git Services
