This is an automated email from the ASF dual-hosted git repository. okislal pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/madlib.git
commit acb9e50f280dc0efd96f1236712f6643824e2acd Author: Orhan Kislal <[email protected]> AuthorDate: Tue Mar 3 18:41:44 2020 -0500 Graph: Fix column type concat bug JIRA: MADLIB-1408 The apsp get path function fails if the the input columns are of bigint type. The issue stems from the concatenate operation we use to build the array that represents the path. Adding an integer to an integer array works as expected but adding a bigint to an int array (or vice versa) doesn't. This commit fixes the issue by detecting the appropriate type and adding the proper cast operations. Closes #483 --- src/ports/postgres/modules/graph/apsp.py_in | 18 +++++-------- src/ports/postgres/modules/graph/apsp.sql_in | 16 ++++++------ src/ports/postgres/modules/graph/graph_utils.py_in | 24 +++++++++++++++++ src/ports/postgres/modules/graph/sssp.py_in | 16 ++++++++---- src/ports/postgres/modules/graph/sssp.sql_in | 16 ++++++------ .../postgres/modules/graph/test/measures.sql_in | 30 ++++++++++++++++++++++ src/ports/postgres/modules/graph/test/sssp.sql_in | 1 + 7 files changed, 89 insertions(+), 32 deletions(-) diff --git a/src/ports/postgres/modules/graph/apsp.py_in b/src/ports/postgres/modules/graph/apsp.py_in index 2d50929..1d55696 100644 --- a/src/ports/postgres/modules/graph/apsp.py_in +++ b/src/ports/postgres/modules/graph/apsp.py_in @@ -31,6 +31,7 @@ import plpy from graph_utils import validate_graph_coding from graph_utils import get_graph_usage +from graph_utils import get_edge_params from utilities.control import MinWarning from utilities.utilities import _assert from utilities.utilities import _check_groups @@ -412,17 +413,12 @@ def graph_apsp_get_path(schema_madlib, apsp_table, edge_args = summary[0]['edge_args'] grouping_cols = summary[0]['grouping_cols'] - params_types = {'src': str, 'dest': str, 'weight': str} - default_args = {'src': 'src', 'dest': 'dest', 'weight': 'weight'} - edge_params = extract_keyvalue_params(edge_args, - params_types, - default_args) + edge_params, final_type = get_edge_params(schema_madlib, apsp_table, edge_args) src = edge_params["src"] dest = edge_params["dest"] weight = edge_params["weight"] - if not vertex_id: vertex_id = "id" @@ -450,7 +446,7 @@ def graph_apsp_get_path(schema_madlib, apsp_table, if source_vertex == dest_vertex: plpy.execute(""" CREATE TABLE {path_table} AS - SELECT {grp_comma} '{{{dest_vertex}}}'::INT[] AS path + SELECT {grp_comma} '{{{dest_vertex}}}'::{final_type}[] AS path FROM {apsp_table} WHERE {src} = {source_vertex} AND {dest} = {dest_vertex} """.format(**locals())) @@ -462,7 +458,7 @@ def graph_apsp_get_path(schema_madlib, apsp_table, # Initialize the temporary tables out = plpy.execute(""" CREATE TEMP TABLE {temp1_name} AS SELECT {grp_comma} {apsp_table}.parent AS {vertex_id}, - ARRAY[{dest_vertex}] AS path + ARRAY[{dest_vertex}]::{final_type}[] AS path FROM {apsp_table} WHERE {src} = {source_vertex} AND {dest} = {dest_vertex} AND {apsp_table}.parent IS NOT NULL @@ -513,15 +509,15 @@ def graph_apsp_get_path(schema_madlib, apsp_table, # Path with 1 vertex (src == dest) has been handled before plpy.execute(""" CREATE TABLE {path_table} AS - SELECT {grp_comma} {source_vertex} || ({vertex_id} || path) AS path + SELECT {grp_comma} {source_vertex}::{final_type} || ({vertex_id} || path) AS path FROM {temp2_name} WHERE {vertex_id} <> {dest_vertex} UNION - SELECT {grp_comma} {source_vertex} || path AS path + SELECT {grp_comma} {source_vertex}::{final_type} || path AS path FROM {temp2_name} WHERE {vertex_id} = {dest_vertex} UNION - SELECT {grp_comma} '{{}}'::INT[] AS path + SELECT {grp_comma} '{{}}'::{final_type}[] AS path FROM {apsp_table} WHERE {src} = {source_vertex} AND {dest} = {dest_vertex} AND {apsp_table}.parent IS NULL diff --git a/src/ports/postgres/modules/graph/apsp.sql_in b/src/ports/postgres/modules/graph/apsp.sql_in index 7cd77d3..66d1f74 100644 --- a/src/ports/postgres/modules/graph/apsp.sql_in +++ b/src/ports/postgres/modules/graph/apsp.sql_in @@ -78,7 +78,7 @@ contain the column specified in the 'vertex_id' parameter below.</dd> <dt>vertex_id</dt> <dd>TEXT, default = 'id'. Name of the column in 'vertex_table' containing -vertex ids. The vertex ids are of type INTEGER with no duplicates. +vertex ids. The vertex ids are of type BIGINT with no duplicates. They do not need to be contiguous.</dd> <dt>edge_table</dt> @@ -90,9 +90,9 @@ Column naming convention is described below in the 'edge_args' parameter.</dd> <dd>TEXT. A comma-delimited string containing multiple named arguments of the form "name=value". The following parameters are supported for this string argument: - - src (INTEGER): Name of the column containing the source vertex ids in the + - src (BIGINT): Name of the column containing the source vertex ids in the edge table. Default column name is 'src'. - - dest (INTEGER): Name of the column containing the destination vertex ids + - dest (BIGINT): Name of the column containing the destination vertex ids in the edge table. Default column name is 'dest'. - weight (FLOAT8): Name of the column containing the edge weights in the edge table. Default column name is 'weight'.</dd> @@ -142,10 +142,10 @@ graph_apsp_get_path( apsp_table, <dd>TEXT. Name of the table that contains the APSP output.</dd> <dt>source_vertex</dt> -<dd>INTEGER. The vertex that will be the source of the desired path.</dd> +<dd>BIGINT. The vertex that will be the source of the desired path.</dd> <dt>dest_vertex</dt> -<dd>INTEGER. The vertex that will be the destination of the desired path.</dd> +<dd>BIGINT. The vertex that will be the destination of the desired path.</dd> <dt>path_table</dt> <dd>TEXT. Name of the output table that contains the path. @@ -412,9 +412,9 @@ m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_apsp_get_path( apsp_table TEXT, - source_vertex INT, - dest_vertex INT, - path_table TEXT + source_vertex BIGINT, + dest_vertex BIGINT, + path_table TEXT ) RETURNS VOID AS $$ PythonFunction(graph, apsp, graph_apsp_get_path) diff --git a/src/ports/postgres/modules/graph/graph_utils.py_in b/src/ports/postgres/modules/graph/graph_utils.py_in index c104f8e..889ef88 100644 --- a/src/ports/postgres/modules/graph/graph_utils.py_in +++ b/src/ports/postgres/modules/graph/graph_utils.py_in @@ -29,11 +29,13 @@ import plpy from utilities.utilities import _assert, add_postfix from utilities.utilities import get_filtered_cols_subquery_str +from utilities.utilities import extract_keyvalue_params from utilities.validate_args import get_cols from utilities.validate_args import unquote_ident from utilities.validate_args import table_exists from utilities.validate_args import columns_exist_in_table from utilities.validate_args import table_is_empty +from utilities.validate_args import get_cols_and_types def validate_output_and_summary_tables(model_out_table, module_name, out_table=None): @@ -221,3 +223,25 @@ def get_graph_usage(schema_madlib, func_name, other_text): other_text=other_text, comma=',' if other_text is not None else ' ') return usage + + +def get_edge_params(schema_madlib, table, edge_args): + + params_types = {'src': str, 'dest': str, 'weight': str} + default_args = {'src': 'src', 'dest': 'dest', 'weight': 'weight'} + edge_params = extract_keyvalue_params(edge_args, + params_types, + default_args) + src = edge_params["src"] + dest = edge_params["dest"] + weight = edge_params["weight"] + + # Find the appropriate column type for correct concat operations + names_and_types = get_cols_and_types(table) + final_type = 'integer' + for col_name,col_type in names_and_types: + # We want to check the column types except the weight column + if col_type == 'bigint' and col_name != weight: + final_type = 'bigint' + + return edge_params, final_type diff --git a/src/ports/postgres/modules/graph/sssp.py_in b/src/ports/postgres/modules/graph/sssp.py_in index d7f2c80..0819132 100644 --- a/src/ports/postgres/modules/graph/sssp.py_in +++ b/src/ports/postgres/modules/graph/sssp.py_in @@ -30,6 +30,7 @@ import plpy from graph_utils import validate_graph_coding from graph_utils import get_graph_usage +from graph_utils import get_edge_params from utilities.control import MinWarning from utilities.utilities import _assert @@ -411,6 +412,11 @@ def graph_sssp_get_path(schema_madlib, sssp_table, dest_vertex, path_table, summary = plpy.execute("SELECT * FROM {0}".format(summary_table)) vertex_id = summary[0]['vertex_id'] source_vertex = summary[0]['source_vertex'] + edge_args = summary[0]['edge_args'] + + edge_params, final_type = get_edge_params(schema_madlib, sssp_table, edge_args) + + weight = edge_params["weight"] if not vertex_id: vertex_id = "id" @@ -432,7 +438,7 @@ def graph_sssp_get_path(schema_madlib, sssp_table, dest_vertex, path_table, if source_vertex == dest_vertex: plpy.execute(""" CREATE TABLE {path_table} AS - SELECT {grp_comma} '{{{dest_vertex}}}'::INT[] AS path + SELECT {grp_comma} '{{{dest_vertex}}}'::{final_type}[] AS path FROM {sssp_table} WHERE {vertex_id} = {dest_vertex} """.format(**locals())) return @@ -441,7 +447,7 @@ def graph_sssp_get_path(schema_madlib, sssp_table, dest_vertex, path_table, format(temp1_name, temp2_name)) out = plpy.execute(""" CREATE TEMP TABLE {temp1_name} AS SELECT {grp_comma} {sssp_table}.parent AS {vertex_id}, - ARRAY[{dest_vertex}] AS path + ARRAY[{dest_vertex}]::{final_type}[] AS path FROM {sssp_table} WHERE {vertex_id} = {dest_vertex} AND {sssp_table}.parent IS NOT NULL @@ -481,10 +487,10 @@ def graph_sssp_get_path(schema_madlib, sssp_table, dest_vertex, path_table, # the destination vertex plpy.execute(""" CREATE TABLE {path_table} AS - SELECT {grp_comma} {source_vertex} || path AS path + SELECT {grp_comma} {source_vertex}::{final_type} || path AS path FROM {temp2_name} UNION - SELECT {grp_comma} '{{}}'::INT[] AS path + SELECT {grp_comma} '{{}}'::{final_type}[] AS path FROM {sssp_table} WHERE {vertex_id} = {dest_vertex} AND {sssp_table}.parent IS NULL @@ -508,7 +514,7 @@ def _validate_sssp(vertex_table, vertex_id, edge_table, edge_params, validate_graph_coding(vertex_table, vertex_id, edge_table, edge_params, out_table, 'SSSP') - _assert(isinstance(source_vertex, int), + _assert(isinstance(source_vertex, int) or isinstance(source_vertex, long), """Graph SSSP: Source vertex {source_vertex} has to be an integer.""". format(**locals())) src_exists = plpy.execute(""" diff --git a/src/ports/postgres/modules/graph/sssp.sql_in b/src/ports/postgres/modules/graph/sssp.sql_in index 8175624..fd05fcf 100644 --- a/src/ports/postgres/modules/graph/sssp.sql_in +++ b/src/ports/postgres/modules/graph/sssp.sql_in @@ -68,7 +68,7 @@ column specified in the 'vertex_id' parameter below.</dd> <dt>vertex_id</dt> <dd>TEXT, default = 'id'. Name of the column in 'vertex_table' containing -vertex ids. The vertex ids are of type INTEGER with no duplicates. +vertex ids. The vertex ids are of type BIGINT with no duplicates. They do not need to be contiguous.</dd> <dt>edge_table</dt> @@ -80,12 +80,12 @@ Column naming convention is described below in the 'edge_args' parameter.</dd> <dd>TEXT. A comma-delimited string containing multiple named arguments of the form "name=value". The following parameters are supported for this string argument: - - src (INTEGER): Name of the column containing the source vertex ids in the edge table. Default column name is 'src'. - - dest (INTEGER): Name of the column containing the destination vertex ids in the edge table. Default column name is 'dest'. + - src (BIGINT): Name of the column containing the source vertex ids in the edge table. Default column name is 'src'. + - dest (BIGINT): Name of the column containing the destination vertex ids in the edge table. Default column name is 'dest'. - weight (FLOAT8): Name of the column containing the edge weights in the edge table. Default column name is 'weight'.</dd> <dt>source_vertex</dt> -<dd>INTEGER. The source vertex id for the algorithm to start. This vertex id must +<dd>BIGINT. The source vertex id for the algorithm to start. This vertex id must exist in the 'vertex_id' column of 'vertex_table'.</dd> <dt>out_table</dt> @@ -125,7 +125,7 @@ graph_sssp_get_path( sssp_table, <dd>TEXT. Name of the table that contains the SSSP output.</dd> <dt>dest_vertex</dt> -<dd>INTEGER. The vertex that will be the destination of the desired path.</dd> +<dd>BIGINT. The vertex that will be the destination of the desired path.</dd> <dt>path_table</dt> <dd>TEXT. Name of the output table that contains the path. @@ -332,7 +332,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_sssp( vertex_id TEXT, edge_table TEXT, edge_args TEXT, - source_vertex INT, + source_vertex BIGINT, out_table TEXT, grouping_cols TEXT @@ -346,7 +346,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_sssp( vertex_id TEXT, edge_table TEXT, edge_args TEXT, - source_vertex INT, + source_vertex BIGINT, out_table TEXT ) RETURNS VOID AS $$ @@ -356,7 +356,7 @@ m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); ------------------------------------------------------------------------- CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_sssp_get_path( sssp_table TEXT, - dest_vertex INT, + dest_vertex BIGINT, path_table TEXT ) RETURNS VOID AS $$ diff --git a/src/ports/postgres/modules/graph/test/measures.sql_in b/src/ports/postgres/modules/graph/test/measures.sql_in index f6e87bd..3b3886a 100644 --- a/src/ports/postgres/modules/graph/test/measures.sql_in +++ b/src/ports/postgres/modules/graph/test/measures.sql_in @@ -199,3 +199,33 @@ ALTER TABLE vertex RENAME COLUMN src_id TO "DEST_ID"; SELECT graph_vertex_degrees('vertex','"DEST_ID"','"EDGE"', 'src=src_id, dest="DEST_ID", weight=edge_weight','out'); SELECT * FROM out; +ALTER TABLE vertex RENAME COLUMN "DEST_ID" TO id; + +-- Test for bigint columns + +CREATE TABLE v2 AS SELECT id::bigint FROM vertex; +CREATE TABLE e2 AS SELECT src_id::bigint, "DEST_ID"::bigint, edge_weight FROM "EDGE"; + +DROP TABLE IF EXISTS out_apsp, out_apsp_summary; +SELECT graph_apsp('v2', -- Vertex table + 'id', -- Vertix id column (NULL means use default naming) + 'e2', -- "EDGE" table + 'src=src_id, dest="DEST_ID", weight=edge_weight', + -- "EDGE" arguments (NULL means use default naming) + 'out_apsp'); -- Output table of shortest paths + +DROP TABLE IF EXISTS out_closeness; +SELECT graph_closeness('out_apsp', 'out_closeness'); + +DROP TABLE IF EXISTS out_diameter; +SELECT graph_diameter('out_apsp', 'out_diameter'); + +DROP TABLE IF EXISTS out; +SELECT graph_avg_path_length('out_apsp', 'out'); + +DROP TABLE IF EXISTS out_degrees; +SELECT graph_vertex_degrees('v2', -- Vertex table + 'id', -- Vertix id column (NULL means use default naming) + 'e2', -- "EDGE" table + 'src=src_id, dest="DEST_ID", weight=edge_weight', + 'out_degrees'); diff --git a/src/ports/postgres/modules/graph/test/sssp.sql_in b/src/ports/postgres/modules/graph/test/sssp.sql_in index eab2b38..a147976 100644 --- a/src/ports/postgres/modules/graph/test/sssp.sql_in +++ b/src/ports/postgres/modules/graph/test/sssp.sql_in @@ -142,6 +142,7 @@ SELECT graph_sssp_get_path('"out_Q"',5,'"out_Q_path"'); SELECT * FROM "out_Q_path"; -- Test for common column names in vertex and edge tables + DROP TABLE IF EXISTS out, out_summary; ALTER TABLE vertex RENAME COLUMN id TO src;
