Vec2Cols: Allow arrays of different lengths JIRA: MADLIB-1270
Added support to split arrays of different lengths in the vector_col. If the user does not provide feature names, we pad each array to the maximum length and split across the maximum possible number of features. If the user does provide feature names, we truncate/pad the arrays according to the number of features the user desires. Closes #311 Co-authored-by: Arvind Sridhar <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/5ccf12e1 Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/5ccf12e1 Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/5ccf12e1 Branch: refs/heads/master Commit: 5ccf12e113f04b02b9ccf8c9aee107a4feb4bd88 Parents: ec328db Author: Rahul Iyer <[email protected]> Authored: Thu Aug 16 20:08:32 2018 -0700 Committer: Rahul Iyer <[email protected]> Committed: Thu Aug 16 20:08:32 2018 -0700 ---------------------------------------------------------------------- .../utilities/test/transform_vec_cols.sql_in | 47 ++++++++++++++++++++ .../unit_tests/test_transform_vec_cols.py_in | 14 +----- .../modules/utilities/transform_vec_cols.py_in | 34 +++++++------- 3 files changed, 64 insertions(+), 31 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/madlib/blob/5ccf12e1/src/ports/postgres/modules/utilities/test/transform_vec_cols.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/utilities/test/transform_vec_cols.sql_in b/src/ports/postgres/modules/utilities/test/transform_vec_cols.sql_in index 47ab299..b43b39f 100644 --- a/src/ports/postgres/modules/utilities/test/transform_vec_cols.sql_in +++ b/src/ports/postgres/modules/utilities/test/transform_vec_cols.sql_in @@ -104,6 +104,53 @@ SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE table_name SELECT assert ((SELECT clouds_airquality[1] FROM dt_golf WHERE id = 1) = (SELECT clouds FROM out_table WHERE id = 1), 'Split values do not match up'); SELECT assert ((SELECT clouds_airquality[2] FROM dt_golf WHERE id = 1) = (SELECT air_quality FROM out_table WHERE id = 1), 'Split values do not match up'); +-- Testing splitting arrays of different lengths into features + +DROP TABLE IF EXISTS diff_lengths_test; +CREATE TABLE diff_lengths_test( + "id" INTEGER, + "arr" TEXT[]); +INSERT INTO diff_lengths_test VALUES (1, '{a, b}'), (2, '{c, d}'), (3, '{e, f, g, h}'), (4, '{i}'), (5, '{}'); + +DROP TABLE IF EXISTS out_table; +SELECT vec2cols( + 'diff_lengths_test', + 'out_table', + 'arr' +); + +SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE table_name='out_table') = (SELECT max(array_upper(arr, 1)) from diff_lengths_test), 'Number of split columns does not match'); + +DROP TABLE IF EXISTS out_table; +SELECT vec2cols( + 'diff_lengths_test', + 'out_table', + 'arr', + ARRAY['a'] +); + +SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE table_name='out_table') = 1, 'Number of split columns does not match'); + +DROP TABLE IF EXISTS out_table; +SELECT vec2cols( + 'diff_lengths_test', + 'out_table', + 'arr', + ARRAY['a', 'b', 'c'] +); + +SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE table_name='out_table') = 3, 'Number of split columns does not match'); + +DROP TABLE IF EXISTS out_table; +SELECT vec2cols( + 'diff_lengths_test', + 'out_table', + 'arr', + ARRAY['a', 'b', 'c', 'd', 'e', 'f', 'g'] +); + +SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE table_name='out_table') = 7, 'Number of split columns does not match'); + -- Special character tests DROP TABLE IF EXISTS special_char_check; http://git-wip-us.apache.org/repos/asf/madlib/blob/5ccf12e1/src/ports/postgres/modules/utilities/test/unit_tests/test_transform_vec_cols.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/utilities/test/unit_tests/test_transform_vec_cols.py_in b/src/ports/postgres/modules/utilities/test/unit_tests/test_transform_vec_cols.py_in index 6475f9b..3020309 100644 --- a/src/ports/postgres/modules/utilities/test/unit_tests/test_transform_vec_cols.py_in +++ b/src/ports/postgres/modules/utilities/test/unit_tests/test_transform_vec_cols.py_in @@ -125,23 +125,13 @@ class Vec2ColsTestSuite(unittest.TestCase): def test_get_names_for_split_output_cols_feature_names_none(self): self.plpy_mock_execute.return_value = [{"n_x": 3}] - new_cols = self.subject.get_names_for_split_output_cols(self.default_source_table, 'foobar', None) + new_cols = self.subject.get_names_for_split_output_cols(self.default_source_table, 'foobar') self.assertEqual(['f1', 'f2', 'f3'], new_cols) - def test_get_names_for_split_output_cols_feature_names_not_none(self): - self.plpy_mock_execute.return_value = [{"n_x": 3}] - new_cols = self.subject.get_names_for_split_output_cols(self.default_source_table, 'foobar', ['a', 'b', 'c']) - self.assertEqual(['a', 'b', 'c'], new_cols) - def test_get_names_for_split_output_cols_array_upper_returns_none(self): self.plpy_mock_execute.return_value = [{"n_x": None}] with self.assertRaises(plpy.PLPYException): - self.subject.get_names_for_split_output_cols(self.default_source_table, 'foobar', None) - - def test_get_names_for_split_output_cols_feature_names_size_matches_vector_col(self): - self.plpy_mock_execute.return_value = [{"n_x": 3}] - with self.assertRaises(plpy.PLPYException): - self.subject.get_names_for_split_output_cols(self.default_source_table, 'foobar', ['a', 'b']) + self.subject.get_names_for_split_output_cols(self.default_source_table, 'foobar') def test_validate_output_cols_max_cols_exception(self): with self.assertRaises(plpy.PLPYException): http://git-wip-us.apache.org/repos/asf/madlib/blob/5ccf12e1/src/ports/postgres/modules/utilities/transform_vec_cols.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/utilities/transform_vec_cols.py_in b/src/ports/postgres/modules/utilities/transform_vec_cols.py_in index bf9229e..49064d3 100644 --- a/src/ports/postgres/modules/utilities/transform_vec_cols.py_in +++ b/src/ports/postgres/modules/utilities/transform_vec_cols.py_in @@ -83,7 +83,7 @@ class vec2cols: _assert(is_col_1d_array(source_table, vector_col), "{0}: vector_col must be a 1-dimensional array.".format(self.module_name)) - def get_names_for_split_output_cols(self, source_table, vector_col, feature_names): + def get_names_for_split_output_cols(self, source_table, vector_col): """ Get list of names for the newly-split columns to include in the output table. @@ -94,24 +94,15 @@ class vec2cols: use for the split elements in the vector_col array """ query = """ - SELECT array_upper({0}, 1) AS n_x + SELECT max(array_upper({0}, 1)) AS n_x FROM {1} - LIMIT 1 """.format(vector_col, source_table) - result = plpy.execute(query)[0]["n_x"] - if not result: + max_array_length = plpy.execute(query)[0]["n_x"] + if not max_array_length: plpy.error('{0}: Column to split ({1}) must not be an empty array' .format(self.module_name, vector_col)) - if not feature_names: - # Create custom col names for output columns, with prefix "f". - feature_names = ["f{0}".format(i+1) for i in range(result)] - else: - # Check if the array dimension is equal to the number of col names - # specified in feature_names. - _assert(result == len(feature_names), - "{0}: Mismatch between size of vector_col and number of " - "cols in feature_names.".format(self.module_name)) - return feature_names + # Create custom col names for output columns, with prefix "f". + return ["f{0}".format(i+1) for i in range(max_array_length)] def validate_output_cols(self, features_to_unnest, cols_to_keep): # If there are more than 1600 columns for the output table, we give a @@ -146,8 +137,13 @@ class vec2cols: cols_to_output) # Get names of columns to use for the split vector_col - features_to_unnest = self.get_names_for_split_output_cols(source_table, - vector_col, feature_names) + if not feature_names: + features_to_unnest = self.get_names_for_split_output_cols( + source_table, vector_col) + else: + features_to_unnest = feature_names + + # Get the columns to keep from the source table cols_to_keep = self.get_cols_helper.get_cols_as_list(cols_to_output, source_table) @@ -155,8 +151,8 @@ class vec2cols: # Construct the output query and populate the output table with all the # correct parameters - select_new_cols = ', '.join(["{0}[{1}] AS {2}".format(vector_col, - i+1, features_to_unnest[i]) for i in range(len(features_to_unnest))]) + select_new_cols = (', '.join(["{0}[{1}] AS {2}".format(vector_col, i + 1, f) + for i, f in enumerate(features_to_unnest)])) cols_from_src_table = ', '.join(cols_to_keep)+', ' if cols_to_keep else '' query = """ CREATE TABLE {output_table} AS
