[GitHub] madlib pull request #291: Feature: Vector-Column Transformations

2018-08-01 Thread asfgit
Github user asfgit closed the pull request at:

https://github.com/apache/madlib/pull/291


---


[GitHub] madlib pull request #291: Feature: Vector-Column Transformations

2018-07-27 Thread iyerr3
Github user iyerr3 commented on a diff in the pull request:

https://github.com/apache/madlib/pull/291#discussion_r205920988
  
--- Diff: src/ports/postgres/modules/utilities/validate_args.py_in ---
@@ -513,11 +513,12 @@ def array_col_has_same_dimension(tbl, col):
 # 
 
 
-def explicit_bool_to_text(tbl, cols, schema_madlib):
+def explicit_bool_to_text(tbl, cols, schema_madlib, is_forced=False):
--- End diff --

Well, you don't need this function - you might just need `::TEXT`. But I believe
we decided that we'll let the platform fail if the array cannot be built by it.
Wouldn't this build a successful array in cases where the platform would fail?


---


[GitHub] madlib pull request #291: Feature: Vector-Column Transformations

2018-07-27 Thread iyerr3
Github user iyerr3 commented on a diff in the pull request:

https://github.com/apache/madlib/pull/291#discussion_r205897258
  
--- Diff: src/ports/postgres/modules/utilities/transform_vec_cols.py_in ---
@@ -0,0 +1,495 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import plpy
+from control import MinWarning
+from internal.db_utils import is_col_1d_array
+from internal.db_utils import quote_literal
+from utilities import _assert
+from utilities import add_postfix
+from utilities import ANY_ARRAY
+from utilities import is_valid_psql_type
+from utilities import py_list_to_sql_string
+from utilities import split_quoted_delimited_str
+from validate_args import is_var_valid
+from validate_args import explicit_bool_to_text
+from validate_args import get_cols
+from validate_args import get_cols_and_types
+from validate_args import get_expr_type
+from validate_args import input_tbl_valid
+from validate_args import output_tbl_valid
+from validate_args import table_exists
+
+class vec_cols_helper:
+def __init__(self):
+self.all_cols = None
+
+def get_cols_as_list(self, cols_to_process, source_table=None, 
exclude_cols=None):
+"""
+Get a list of columns based on the value of cols_to_process
+Args:
+@param cols_to_process: str, Either a * or a comma-separated 
list of col names
+@param source_table: str, optional. Source table name
+@param exclude_cols: str, optional. Comma-separated list of 
the col(s) to exclude
+ from the source table, only used if 
cols_to_process is *
+Returns:
+A list of column names (or an empty list)
+"""
+# If cols_to_process is empty/None, return empty list
+if not cols_to_process:
+return []
+if cols_to_process.strip() != "*":
+# If cols_to_process is a comma separated list of names, 
return list
+# of column names in cols_to_process.
+return [col for col in 
split_quoted_delimited_str(cols_to_process)
+if col not in split_quoted_delimited_str(exclude_cols)]
+if source_table:
+if not self.all_cols:
+self.all_cols = get_cols(source_table)
+return [col for col in self.all_cols
+if col not in split_quoted_delimited_str(exclude_cols)]
+return []
+
+class vec2cols:
+def __init__(self):
+self.get_cols_helper = vec_cols_helper()
+self.module_name = self.__class__.__name__
+
+def validate_args(self, source_table, output_table, vector_col, 
feature_names,
+  cols_to_output):
+"""
+Validate args for vec2cols
+"""
+input_tbl_valid(source_table, self.module_name)
+output_tbl_valid(output_table, self.module_name)
+is_var_valid(source_table, cols_to_output)
+is_var_valid(source_table, vector_col)
+_assert(is_valid_psql_type(get_expr_type(vector_col, 
source_table), ANY_ARRAY),
+"{0}: vector_col should refer to an 
array.".format(self.module_name))
+_assert(is_col_1d_array(source_table, vector_col),
+"{0}: vector_col must be a 1-dimensional 
array.".format(self.module_name))
+
+def get_names_for_split_output_cols(self, source_table, vector_col, 
feature_names):
+"""
+Get list of names for the newly-split columns to include in the
+output table.
+Args:
+@param: source_table, str. Source table
+@param: vector_col, str. Column name containing the array input
+@param: feature_names, list. Python list of the feature names 
to
+use for the split elements in the vector_col array
+"""
+query = """
+

[GitHub] madlib pull request #291: Feature: Vector-Column Transformations

2018-07-23 Thread iyerr3
Github user iyerr3 commented on a diff in the pull request:

https://github.com/apache/madlib/pull/291#discussion_r204607008
  
--- Diff: src/ports/postgres/modules/utilities/transform_vec_cols.py_in ---
@@ -0,0 +1,513 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import plpy
+from control import MinWarning
+from internal.db_utils import is_col_1d_array
+from internal.db_utils import quote_literal
+from utilities import _assert
+from utilities import add_postfix
+from utilities import ANY_ARRAY
+from utilities import is_psql_boolean_type
+from utilities import is_psql_char_type
+from utilities import is_psql_numeric_type
+from utilities import is_valid_psql_type
+from utilities import py_list_to_sql_string
+from utilities import split_quoted_delimited_str
+from validate_args import is_var_valid
+from validate_args import get_cols
+from validate_args import get_cols_and_types
+from validate_args import get_expr_type
+from validate_args import input_tbl_valid
+from validate_args import output_tbl_valid
+from validate_args import table_exists
+
+class vec_cols_helper:
+def __init__(self):
+self.all_cols = None
+
+def get_cols_as_list(self, cols_to_process, source_table=None, 
exclude_cols=None):
+"""
+Get a list of columns based on the value of cols_to_process
+Args:
+@param cols_to_process: str, Either a * or a comma-separated 
list of col names
+@param source_table: str, optional. Source table name
+@param exclude_cols: str, optional. Comma-separated list of 
the col(s) to exclude
+ from the source table, only used if 
cols_to_process is *
+Returns:
+A list of column names (or an empty list)
+"""
+# If cols_to_process is empty/None, return empty list
+if not cols_to_process:
+return []
+if cols_to_process.strip() != "*":
+# If cols_to_process is a comma separated list of names, 
return list
+# of column names in cols_to_process.
+return [col for col in 
split_quoted_delimited_str(cols_to_process)
+if col not in split_quoted_delimited_str(exclude_cols)]
+if source_table:
+if not self.all_cols:
+self.all_cols = get_cols(source_table)
+return [col for col in self.all_cols
+if col not in split_quoted_delimited_str(exclude_cols)]
+return []
+
+def get_type_class(self, arg):
+if is_psql_numeric_type(arg):
+return "double precision"
+elif is_psql_char_type(arg):
+return "text"
+else:
+return arg
+
+class vec2cols:
+def __init__(self):
+self.get_cols_helper = vec_cols_helper()
+self.module_name = self.__class__.__name__
+
+def validate_args(self, source_table, output_table, vector_col, 
feature_names,
+  cols_to_output):
+"""
+Validate args for vec2cols
+"""
+input_tbl_valid(source_table, self.module_name)
+output_tbl_valid(output_table, self.module_name)
+is_var_valid(source_table, cols_to_output)
+is_var_valid(source_table, vector_col)
+_assert(is_valid_psql_type(get_expr_type(vector_col, 
source_table), ANY_ARRAY),
+"{0}: vector_col should refer to an 
array.".format(self.module_name))
+_assert(is_col_1d_array(source_table, vector_col),
+"{0}: vector_col must be a 1-dimensional 
array.".format(self.module_name))
+
+def get_names_for_split_output_cols(self, source_table, vector_col, 
feature_names):
+"""
+Get list of names for the newly-split columns to include in the
+output table.
+Args:
+   

[GitHub] madlib pull request #291: Feature: Vector-Column Transformations

2018-07-23 Thread njayaram2
Github user njayaram2 commented on a diff in the pull request:

https://github.com/apache/madlib/pull/291#discussion_r204589559
  
--- Diff: src/ports/postgres/modules/utilities/transform_vec_cols.py_in ---
@@ -0,0 +1,513 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import plpy
+from control import MinWarning
+from internal.db_utils import is_col_1d_array
+from internal.db_utils import quote_literal
+from utilities import _assert
+from utilities import add_postfix
+from utilities import ANY_ARRAY
+from utilities import is_psql_boolean_type
+from utilities import is_psql_char_type
+from utilities import is_psql_numeric_type
+from utilities import is_valid_psql_type
+from utilities import py_list_to_sql_string
+from utilities import split_quoted_delimited_str
+from validate_args import is_var_valid
+from validate_args import get_cols
+from validate_args import get_cols_and_types
+from validate_args import get_expr_type
+from validate_args import input_tbl_valid
+from validate_args import output_tbl_valid
+from validate_args import table_exists
+
+class vec_cols_helper:
+def __init__(self):
+self.all_cols = None
+
+def get_cols_as_list(self, cols_to_process, source_table=None, 
exclude_cols=None):
+"""
+Get a list of columns based on the value of cols_to_process
+Args:
+@param cols_to_process: str, Either a * or a comma-separated 
list of col names
+@param source_table: str, optional. Source table name
+@param exclude_cols: str, optional. Comma-separated list of 
the col(s) to exclude
+ from the source table, only used if 
cols_to_process is *
+Returns:
+A list of column names (or an empty list)
+"""
+# If cols_to_process is empty/None, return empty list
+if not cols_to_process:
+return []
+if cols_to_process.strip() != "*":
+# If cols_to_process is a comma separated list of names, 
return list
+# of column names in cols_to_process.
+return [col for col in 
split_quoted_delimited_str(cols_to_process)
+if col not in split_quoted_delimited_str(exclude_cols)]
+if source_table:
+if not self.all_cols:
+self.all_cols = get_cols(source_table)
+return [col for col in self.all_cols
+if col not in split_quoted_delimited_str(exclude_cols)]
+return []
+
+def get_type_class(self, arg):
+if is_psql_numeric_type(arg):
+return "double precision"
+elif is_psql_char_type(arg):
+return "text"
+else:
+return arg
+
+class vec2cols:
+def __init__(self):
+self.get_cols_helper = vec_cols_helper()
+self.module_name = self.__class__.__name__
+
+def validate_args(self, source_table, output_table, vector_col, 
feature_names,
+  cols_to_output):
+"""
+Validate args for vec2cols
+"""
+input_tbl_valid(source_table, self.module_name)
+output_tbl_valid(output_table, self.module_name)
+is_var_valid(source_table, cols_to_output)
+is_var_valid(source_table, vector_col)
+_assert(is_valid_psql_type(get_expr_type(vector_col, 
source_table), ANY_ARRAY),
+"{0}: vector_col should refer to an 
array.".format(self.module_name))
+_assert(is_col_1d_array(source_table, vector_col),
+"{0}: vector_col must be a 1-dimensional 
array.".format(self.module_name))
+
+def get_names_for_split_output_cols(self, source_table, vector_col, 
feature_names):
+"""
+Get list of names for the newly-split columns to include in the
+output table.
+Args:
+