[GitHub] madlib pull request #291: Feature: Vector-Column Transformations
Github user asfgit closed the pull request at: https://github.com/apache/madlib/pull/291 ---
[GitHub] madlib pull request #291: Feature: Vector-Column Transformations
Github user iyerr3 commented on a diff in the pull request: https://github.com/apache/madlib/pull/291#discussion_r205920988 --- Diff: src/ports/postgres/modules/utilities/validate_args.py_in --- @@ -513,11 +513,12 @@ def array_col_has_same_dimension(tbl, col): # -def explicit_bool_to_text(tbl, cols, schema_madlib): +def explicit_bool_to_text(tbl, cols, schema_madlib, is_forced=False): --- End diff -- Well, you don't need this function - you might need `::TEXT`. But I believe we decided that we'll let the platform fail if the array cannot be built by it. Wouldn't this build a successful array when the platform is failing? ---
[GitHub] madlib pull request #291: Feature: Vector-Column Transformations
Github user iyerr3 commented on a diff in the pull request: https://github.com/apache/madlib/pull/291#discussion_r205897258 --- Diff: src/ports/postgres/modules/utilities/transform_vec_cols.py_in --- @@ -0,0 +1,495 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import plpy +from control import MinWarning +from internal.db_utils import is_col_1d_array +from internal.db_utils import quote_literal +from utilities import _assert +from utilities import add_postfix +from utilities import ANY_ARRAY +from utilities import is_valid_psql_type +from utilities import py_list_to_sql_string +from utilities import split_quoted_delimited_str +from validate_args import is_var_valid +from validate_args import explicit_bool_to_text +from validate_args import get_cols +from validate_args import get_cols_and_types +from validate_args import get_expr_type +from validate_args import input_tbl_valid +from validate_args import output_tbl_valid +from validate_args import table_exists + +class vec_cols_helper: +def __init__(self): +self.all_cols = None + +def get_cols_as_list(self, cols_to_process, source_table=None, exclude_cols=None): +""" +Get a list of columns based on the value of cols_to_process +Args: +@param cols_to_process: str, Either a * or a comma-separated list of col names +@param source_table: str, optional. Source table name +@param exclude_cols: str, optional. Comma-separated list of the col(s) to exclude + from the source table, only used if cols_to_process is * +Returns: +A list of column names (or an empty list) +""" +# If cols_to_process is empty/None, return empty list +if not cols_to_process: +return [] +if cols_to_process.strip() != "*": +# If cols_to_process is a comma separated list of names, return list +# of column names in cols_to_process. 
+return [col for col in split_quoted_delimited_str(cols_to_process) +if col not in split_quoted_delimited_str(exclude_cols)] +if source_table: +if not self.all_cols: +self.all_cols = get_cols(source_table) +return [col for col in self.all_cols +if col not in split_quoted_delimited_str(exclude_cols)] +return [] + +class vec2cols: +def __init__(self): +self.get_cols_helper = vec_cols_helper() +self.module_name = self.__class__.__name__ + +def validate_args(self, source_table, output_table, vector_col, feature_names, + cols_to_output): +""" +Validate args for vec2cols +""" +input_tbl_valid(source_table, self.module_name) +output_tbl_valid(output_table, self.module_name) +is_var_valid(source_table, cols_to_output) +is_var_valid(source_table, vector_col) +_assert(is_valid_psql_type(get_expr_type(vector_col, source_table), ANY_ARRAY), +"{0}: vector_col should refer to an array.".format(self.module_name)) +_assert(is_col_1d_array(source_table, vector_col), +"{0}: vector_col must be a 1-dimensional array.".format(self.module_name)) + +def get_names_for_split_output_cols(self, source_table, vector_col, feature_names): +""" +Get list of names for the newly-split columns to include in the +output table. +Args: +@param: source_table, str. Source table +@param: vector_col, str. Column name containing the array input +@param: feature_names, list. Python list of the feature names to +use for the split elements in the vector_col array +""" +query = """ +
[GitHub] madlib pull request #291: Feature: Vector-Column Transformations
Github user iyerr3 commented on a diff in the pull request: https://github.com/apache/madlib/pull/291#discussion_r204607008 --- Diff: src/ports/postgres/modules/utilities/transform_vec_cols.py_in --- @@ -0,0 +1,513 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import plpy +from control import MinWarning +from internal.db_utils import is_col_1d_array +from internal.db_utils import quote_literal +from utilities import _assert +from utilities import add_postfix +from utilities import ANY_ARRAY +from utilities import is_psql_boolean_type +from utilities import is_psql_char_type +from utilities import is_psql_numeric_type +from utilities import is_valid_psql_type +from utilities import py_list_to_sql_string +from utilities import split_quoted_delimited_str +from validate_args import is_var_valid +from validate_args import get_cols +from validate_args import get_cols_and_types +from validate_args import get_expr_type +from validate_args import input_tbl_valid +from validate_args import output_tbl_valid +from validate_args import table_exists + +class vec_cols_helper: +def __init__(self): +self.all_cols = None + +def get_cols_as_list(self, cols_to_process, source_table=None, exclude_cols=None): +""" +Get a list of columns based on the value of cols_to_process +Args: +@param cols_to_process: str, Either a * or a comma-separated list of col names +@param source_table: str, optional. Source table name +@param exclude_cols: str, optional. Comma-separated list of the col(s) to exclude + from the source table, only used if cols_to_process is * +Returns: +A list of column names (or an empty list) +""" +# If cols_to_process is empty/None, return empty list +if not cols_to_process: +return [] +if cols_to_process.strip() != "*": +# If cols_to_process is a comma separated list of names, return list +# of column names in cols_to_process. 
+return [col for col in split_quoted_delimited_str(cols_to_process) +if col not in split_quoted_delimited_str(exclude_cols)] +if source_table: +if not self.all_cols: +self.all_cols = get_cols(source_table) +return [col for col in self.all_cols +if col not in split_quoted_delimited_str(exclude_cols)] +return [] + +def get_type_class(self, arg): +if is_psql_numeric_type(arg): +return "double precision" +elif is_psql_char_type(arg): +return "text" +else: +return arg + +class vec2cols: +def __init__(self): +self.get_cols_helper = vec_cols_helper() +self.module_name = self.__class__.__name__ + +def validate_args(self, source_table, output_table, vector_col, feature_names, + cols_to_output): +""" +Validate args for vec2cols +""" +input_tbl_valid(source_table, self.module_name) +output_tbl_valid(output_table, self.module_name) +is_var_valid(source_table, cols_to_output) +is_var_valid(source_table, vector_col) +_assert(is_valid_psql_type(get_expr_type(vector_col, source_table), ANY_ARRAY), +"{0}: vector_col should refer to an array.".format(self.module_name)) +_assert(is_col_1d_array(source_table, vector_col), +"{0}: vector_col must be a 1-dimensional array.".format(self.module_name)) + +def get_names_for_split_output_cols(self, source_table, vector_col, feature_names): +""" +Get list of names for the newly-split columns to include in the +output table. +Args: +
[GitHub] madlib pull request #291: Feature: Vector-Column Transformations
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/291#discussion_r204589559 --- Diff: src/ports/postgres/modules/utilities/transform_vec_cols.py_in --- @@ -0,0 +1,513 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import plpy +from control import MinWarning +from internal.db_utils import is_col_1d_array +from internal.db_utils import quote_literal +from utilities import _assert +from utilities import add_postfix +from utilities import ANY_ARRAY +from utilities import is_psql_boolean_type +from utilities import is_psql_char_type +from utilities import is_psql_numeric_type +from utilities import is_valid_psql_type +from utilities import py_list_to_sql_string +from utilities import split_quoted_delimited_str +from validate_args import is_var_valid +from validate_args import get_cols +from validate_args import get_cols_and_types +from validate_args import get_expr_type +from validate_args import input_tbl_valid +from validate_args import output_tbl_valid +from validate_args import table_exists + +class vec_cols_helper: +def __init__(self): +self.all_cols = None + +def get_cols_as_list(self, cols_to_process, source_table=None, exclude_cols=None): +""" +Get a list of columns based on the value of cols_to_process +Args: +@param cols_to_process: str, Either a * or a comma-separated list of col names +@param source_table: str, optional. Source table name +@param exclude_cols: str, optional. Comma-separated list of the col(s) to exclude + from the source table, only used if cols_to_process is * +Returns: +A list of column names (or an empty list) +""" +# If cols_to_process is empty/None, return empty list +if not cols_to_process: +return [] +if cols_to_process.strip() != "*": +# If cols_to_process is a comma separated list of names, return list +# of column names in cols_to_process. 
+return [col for col in split_quoted_delimited_str(cols_to_process) +if col not in split_quoted_delimited_str(exclude_cols)] +if source_table: +if not self.all_cols: +self.all_cols = get_cols(source_table) +return [col for col in self.all_cols +if col not in split_quoted_delimited_str(exclude_cols)] +return [] + +def get_type_class(self, arg): +if is_psql_numeric_type(arg): +return "double precision" +elif is_psql_char_type(arg): +return "text" +else: +return arg + +class vec2cols: +def __init__(self): +self.get_cols_helper = vec_cols_helper() +self.module_name = self.__class__.__name__ + +def validate_args(self, source_table, output_table, vector_col, feature_names, + cols_to_output): +""" +Validate args for vec2cols +""" +input_tbl_valid(source_table, self.module_name) +output_tbl_valid(output_table, self.module_name) +is_var_valid(source_table, cols_to_output) +is_var_valid(source_table, vector_col) +_assert(is_valid_psql_type(get_expr_type(vector_col, source_table), ANY_ARRAY), +"{0}: vector_col should refer to an array.".format(self.module_name)) +_assert(is_col_1d_array(source_table, vector_col), +"{0}: vector_col must be a 1-dimensional array.".format(self.module_name)) + +def get_names_for_split_output_cols(self, source_table, vector_col, feature_names): +""" +Get list of names for the newly-split columns to include in the +output table. +Args: +