Vec2Cols: Allow arrays of different lengths

JIRA: MADLIB-1270

Added support to split arrays of different lengths in the vector_col.
If the user does not provide feature names, we pad each array to the
maximum length and split across the maximum possible number of features.
If the user does provide feature names, we truncate/pad the arrays
according to the number of features the user desires.

Closes #311

Co-authored-by: Arvind Sridhar <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/5ccf12e1
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/5ccf12e1
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/5ccf12e1

Branch: refs/heads/master
Commit: 5ccf12e113f04b02b9ccf8c9aee107a4feb4bd88
Parents: ec328db
Author: Rahul Iyer <[email protected]>
Authored: Thu Aug 16 20:08:32 2018 -0700
Committer: Rahul Iyer <[email protected]>
Committed: Thu Aug 16 20:08:32 2018 -0700

----------------------------------------------------------------------
 .../utilities/test/transform_vec_cols.sql_in    | 47 ++++++++++++++++++++
 .../unit_tests/test_transform_vec_cols.py_in    | 14 +-----
 .../modules/utilities/transform_vec_cols.py_in  | 34 +++++++-------
 3 files changed, 64 insertions(+), 31 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/5ccf12e1/src/ports/postgres/modules/utilities/test/transform_vec_cols.sql_in
----------------------------------------------------------------------
diff --git 
a/src/ports/postgres/modules/utilities/test/transform_vec_cols.sql_in 
b/src/ports/postgres/modules/utilities/test/transform_vec_cols.sql_in
index 47ab299..b43b39f 100644
--- a/src/ports/postgres/modules/utilities/test/transform_vec_cols.sql_in
+++ b/src/ports/postgres/modules/utilities/test/transform_vec_cols.sql_in
@@ -104,6 +104,53 @@ SELECT assert ((SELECT count(*) FROM 
information_schema.columns WHERE table_name
 SELECT assert ((SELECT clouds_airquality[1] FROM dt_golf WHERE id = 1) = 
(SELECT clouds FROM out_table WHERE id = 1), 'Split values do not match up');
 SELECT assert ((SELECT clouds_airquality[2] FROM dt_golf WHERE id = 1) = 
(SELECT air_quality FROM out_table WHERE id = 1), 'Split values do not match 
up');
 
+-- Testing splitting arrays of different lengths into features
+
+DROP TABLE IF EXISTS diff_lengths_test;
+CREATE TABLE diff_lengths_test(
+    "id" INTEGER,
+    "arr" TEXT[]);
+INSERT INTO diff_lengths_test VALUES (1, '{a, b}'), (2, '{c, d}'), (3, '{e, f, 
g, h}'), (4, '{i}'), (5, '{}');
+
+DROP TABLE IF EXISTS out_table;
+SELECT vec2cols(
+    'diff_lengths_test',
+    'out_table',
+    'arr'
+);
+
+SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE 
table_name='out_table') = (SELECT max(array_upper(arr, 1)) from 
diff_lengths_test), 'Number of split columns does not match');
+
+DROP TABLE IF EXISTS out_table;
+SELECT vec2cols(
+    'diff_lengths_test',
+    'out_table',
+    'arr',
+    ARRAY['a']
+);
+
+SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE 
table_name='out_table') = 1, 'Number of split columns does not match');
+
+DROP TABLE IF EXISTS out_table;
+SELECT vec2cols(
+    'diff_lengths_test',
+    'out_table',
+    'arr',
+    ARRAY['a', 'b', 'c']
+);
+
+SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE 
table_name='out_table') = 3, 'Number of split columns does not match');
+
+DROP TABLE IF EXISTS out_table;
+SELECT vec2cols(
+    'diff_lengths_test',
+    'out_table',
+    'arr',
+    ARRAY['a', 'b', 'c', 'd', 'e', 'f', 'g']
+);
+
+SELECT assert ((SELECT count(*) FROM information_schema.columns WHERE 
table_name='out_table') = 7, 'Number of split columns does not match');
+
 -- Special character tests
 
 DROP TABLE IF EXISTS special_char_check;

http://git-wip-us.apache.org/repos/asf/madlib/blob/5ccf12e1/src/ports/postgres/modules/utilities/test/unit_tests/test_transform_vec_cols.py_in
----------------------------------------------------------------------
diff --git 
a/src/ports/postgres/modules/utilities/test/unit_tests/test_transform_vec_cols.py_in
 
b/src/ports/postgres/modules/utilities/test/unit_tests/test_transform_vec_cols.py_in
index 6475f9b..3020309 100644
--- 
a/src/ports/postgres/modules/utilities/test/unit_tests/test_transform_vec_cols.py_in
+++ 
b/src/ports/postgres/modules/utilities/test/unit_tests/test_transform_vec_cols.py_in
@@ -125,23 +125,13 @@ class Vec2ColsTestSuite(unittest.TestCase):
 
     def test_get_names_for_split_output_cols_feature_names_none(self):
         self.plpy_mock_execute.return_value = [{"n_x": 3}]
-        new_cols = 
self.subject.get_names_for_split_output_cols(self.default_source_table, 
'foobar', None)
+        new_cols = 
self.subject.get_names_for_split_output_cols(self.default_source_table, 
'foobar')
         self.assertEqual(['f1', 'f2', 'f3'], new_cols)
 
-    def test_get_names_for_split_output_cols_feature_names_not_none(self):
-        self.plpy_mock_execute.return_value = [{"n_x": 3}]
-        new_cols = 
self.subject.get_names_for_split_output_cols(self.default_source_table, 
'foobar', ['a', 'b', 'c'])
-        self.assertEqual(['a', 'b', 'c'], new_cols)
-
     def test_get_names_for_split_output_cols_array_upper_returns_none(self):
         self.plpy_mock_execute.return_value = [{"n_x": None}]
         with self.assertRaises(plpy.PLPYException):
-            
self.subject.get_names_for_split_output_cols(self.default_source_table, 
'foobar', None)
-
-    def 
test_get_names_for_split_output_cols_feature_names_size_matches_vector_col(self):
-        self.plpy_mock_execute.return_value = [{"n_x": 3}]
-        with self.assertRaises(plpy.PLPYException):
-            
self.subject.get_names_for_split_output_cols(self.default_source_table, 
'foobar', ['a', 'b'])
+            
self.subject.get_names_for_split_output_cols(self.default_source_table, 
'foobar')
 
     def test_validate_output_cols_max_cols_exception(self):
         with self.assertRaises(plpy.PLPYException):

http://git-wip-us.apache.org/repos/asf/madlib/blob/5ccf12e1/src/ports/postgres/modules/utilities/transform_vec_cols.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/transform_vec_cols.py_in 
b/src/ports/postgres/modules/utilities/transform_vec_cols.py_in
index bf9229e..49064d3 100644
--- a/src/ports/postgres/modules/utilities/transform_vec_cols.py_in
+++ b/src/ports/postgres/modules/utilities/transform_vec_cols.py_in
@@ -83,7 +83,7 @@ class vec2cols:
         _assert(is_col_1d_array(source_table, vector_col),
             "{0}: vector_col must be a 1-dimensional 
array.".format(self.module_name))
 
-    def get_names_for_split_output_cols(self, source_table, vector_col, 
feature_names):
+    def get_names_for_split_output_cols(self, source_table, vector_col):
         """
             Get list of names for the newly-split columns to include in the
             output table.
@@ -94,24 +94,15 @@ class vec2cols:
                     use for the split elements in the vector_col array
         """
         query = """
-            SELECT array_upper({0}, 1) AS n_x
+            SELECT max(array_upper({0}, 1)) AS n_x
             FROM {1}
-            LIMIT 1
         """.format(vector_col, source_table)
-        result = plpy.execute(query)[0]["n_x"]
-        if not result:
+        max_array_length = plpy.execute(query)[0]["n_x"]
+        if not max_array_length:
             plpy.error('{0}: Column to split ({1}) must not be an empty array'
                 .format(self.module_name, vector_col))
-        if not feature_names:
-            # Create custom col names for output columns, with prefix "f".
-            feature_names = ["f{0}".format(i+1) for i in range(result)]
-        else:
-            # Check if the array dimension is equal to the number of col names
-            # specified in feature_names.
-            _assert(result == len(feature_names),
-                    "{0}: Mismatch between size of vector_col and number of "
-                    "cols in feature_names.".format(self.module_name))
-        return feature_names
+        # Create custom col names for output columns, with prefix "f".
+        return ["f{0}".format(i+1) for i in range(max_array_length)]
 
     def validate_output_cols(self, features_to_unnest, cols_to_keep):
         # If there are more than 1600 columns for the output table, we give a
@@ -146,8 +137,13 @@ class vec2cols:
                            cols_to_output)
 
         # Get names of columns to use for the split vector_col
-        features_to_unnest = self.get_names_for_split_output_cols(source_table,
-                             vector_col, feature_names)
+        if not feature_names:
+            features_to_unnest = self.get_names_for_split_output_cols(
+                source_table, vector_col)
+        else:
+            features_to_unnest = feature_names
+
+        # Get the columns to keep from the source table
         cols_to_keep = self.get_cols_helper.get_cols_as_list(cols_to_output,
                        source_table)
 
@@ -155,8 +151,8 @@ class vec2cols:
 
         # Construct the output query and populate the output table with all the
         # correct parameters
-        select_new_cols = ', '.join(["{0}[{1}] AS {2}".format(vector_col,
-            i+1, features_to_unnest[i]) for i in 
range(len(features_to_unnest))])
+        select_new_cols = (', '.join(["{0}[{1}] AS {2}".format(vector_col, i + 
1, f)
+                                     for i, f in 
enumerate(features_to_unnest)]))
         cols_from_src_table = ', '.join(cols_to_keep)+', ' if cols_to_keep 
else ''
         query = """
         CREATE TABLE {output_table} AS

Reply via email to