Repository: incubator-madlib Updated Branches: refs/heads/master 492fc7e41 -> 8c9b955cd
Graph: Add Breadth-first Search algorithm with grouping support. JIRA: MADLIB-1102 This algorithm searches or traverses connected nodes in a graph in breadth-first order starting at a user-specified origin node. Grouping support is also included. Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/8c9b955c Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/8c9b955c Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/8c9b955c Branch: refs/heads/master Commit: 8c9b955cd2e3150ad935ad1581e164670723184f Parents: 492fc7e Author: Rashmi Raghu <rra...@pivotal.io> Authored: Wed Jul 12 15:15:21 2017 -0700 Committer: Rashmi Raghu <rra...@pivotal.io> Committed: Wed Jul 12 15:24:16 2017 -0700 ---------------------------------------------------------------------- doc/mainpage.dox.in | 3 + src/ports/postgres/modules/graph/bfs.py_in | 477 +++++++++++++++++++ src/ports/postgres/modules/graph/bfs.sql_in | 471 ++++++++++++++++++ .../postgres/modules/graph/graph_utils.py_in | 11 + .../postgres/modules/graph/test/bfs.sql_in | 278 +++++++++++ 5 files changed, 1240 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8c9b955c/doc/mainpage.dox.in ---------------------------------------------------------------------- diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in index e16f9b2..52afd48 100644 --- a/doc/mainpage.dox.in +++ b/doc/mainpage.dox.in @@ -129,6 +129,9 @@ complete matrix stored as a distributed table. @defgroup grp_apsp All Pairs Shortest Path @ingroup grp_graph + @defgroup grp_bfs Breadth-first Search + @ingroup grp_graph + @defgroup grp_pagerank PageRank @ingroup grp_graph http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8c9b955c/src/ports/postgres/modules/graph/bfs.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/graph/bfs.py_in b/src/ports/postgres/modules/graph/bfs.py_in new file mode 100644 index 0000000..a5631ea --- /dev/null +++ b/src/ports/postgres/modules/graph/bfs.py_in @@ -0,0 +1,477 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Breadth-First Search + +# Please refer to the bfs.sql_in file for the documentation + +""" +@file bfs.py_in + +@namespace graph +""" + +import plpy +from graph_utils import validate_graph_coding +from graph_utils import get_graph_usage +from graph_utils import _grp_null_checks +from utilities.control import MinWarning +from utilities.utilities import _assert +from utilities.utilities import extract_keyvalue_params +from utilities.utilities import split_quoted_delimited_str +from utilities.validate_args import table_exists +from utilities.validate_args import columns_exist_in_table + +m4_changequote(`<!', `!>') + +def _validate_bfs(vertex_table, vertex_id, edge_table, edge_params, + source_vertex, out_table, max_distance, directed, grouping_cols_list, **kwargs): + + validate_graph_coding(vertex_table, vertex_id, edge_table, edge_params, + out_table,'BFS') + + _assert((max_distance >= 0) and isinstance(max_distance,int), + """Graph BFS: Invalid max_distance type or value ({0}), must be integer, + be greater than or equal to 0 and be less than max allowable integer + (2147483647).""". + format(max_distance)) + + _assert(isinstance(directed,bool), + """Graph BFS: Invalid value for directed ({0}), must be boolean.""". + format(directed)) + + _assert(isinstance(source_vertex,int), + """Graph BFS: Source vertex {source_vertex} has to be an integer.""". + format(**locals())) + src_exists = plpy.execute(""" + SELECT * FROM {vertex_table} WHERE {vertex_id}={source_vertex} + """.format(**locals())) + if src_exists.nrows() == 0: + plpy.error( + """Graph BFS: Source vertex {source_vertex} is not present in the + vertex table {vertex_table}.""". + format(**locals())) + + vt_error = plpy.execute( + """ SELECT {vertex_id} + FROM {vertex_table} + WHERE {vertex_id} IS NOT NULL + GROUP BY {vertex_id} + HAVING count(*) > 1 """.format(**locals())) + if vt_error.nrows() != 0: + plpy.error( + """Graph BFS: Source vertex table {vertex_table} contains duplicate + vertex id's.""". + format(**locals())) + + _assert(not table_exists(out_table+"_summary"), + "Graph BFS: Output summary table already exists!") + + if grouping_cols_list is not None: + _assert(columns_exist_in_table(edge_table, grouping_cols_list), + """Graph BFS: Not all columns from {grouping_cols_list} are present + in edge table ({edge_table}).""". + format(**locals())) + + return None + + +def graph_bfs(schema_madlib, vertex_table, vertex_id, edge_table, + edge_args, source_vertex, out_table, max_distance, directed, grouping_cols, + **kwargs): + + """ + Breadth First Search algorithm for graphs [1]. + Args: + @param vertex_table Name of the table that contains the vertex data. + @param vertex_id Name of the column containing the vertex ids. + @param edge_table Name of the table that contains the edge data. + @param edge_args A comma-delimited string containing multiple + named arguments of the form "name=value". + @param source_vertex The source vertex id for the algorithm to start. + @param out_table Name of the table to store the result of BFS. + @param max_distance Maximum distance from the source_vertex to search for. + @param directed Graph will be treated as directed if this boolean flag + is set to TRUE. Graph is treated as undirected by default. + @param grouping_cols The list of grouping columns. + + [1] https://en.wikipedia.org/wiki/Breadth-first_search + """ + + with MinWarning("warning"): + + INT_MAX = 2147483647 + + params_types = {'src': str, 'dest': str} + default_args = {'src': 'src', 'dest': 'dest'} + edge_params = extract_keyvalue_params(edge_args, + params_types, + default_args) + + # Prepare the input for recording in the summary table + if vertex_id is None: + v_st= "NULL" + vertex_id = "id" + else: + v_st = vertex_id + if edge_args is None: + e_st = "NULL" + else: + e_st = edge_args + if max_distance is None: + d_st= "NULL" + max_distance = INT_MAX + else: + d_st = max_distance + if directed is None: + dir_st= "NULL" + directed = False + else: + dir_st = directed + if grouping_cols is None or grouping_cols is '': + g_st = "NULL" + glist = None + else: + g_st = grouping_cols + glist = split_quoted_delimited_str(grouping_cols) + + src = edge_params["src"] + dest = edge_params["dest"] + + distribution = m4_ifdef(<!__POSTGRESQL__!>, <!''!>, + <!"DISTRIBUTED BY ({0})".format(vertex_id)!>) + local_distribution = m4_ifdef(<!__POSTGRESQL__!>, <!''!>, + <!"DISTRIBUTED RANDOMLY"!>) + + _validate_bfs(vertex_table, vertex_id, edge_table, + edge_params, source_vertex, out_table, max_distance, directed, glist) + + # Initialize grouping related variables + grp_comma = "" + and_grp_null_checks = "" + + if grouping_cols is not None and grouping_cols is not '': + grp_comma = grouping_cols + ", " + and_grp_null_checks = " AND " + _grp_null_checks(glist) + + # We keep a table of every vertex, the distance to that vertex from source + # and the parent in the path to the vertex. + # This table will be updated throughout the execution. + dist_col = "dist" + parent_col = "parent" + curr_dist_val = 0 + + # Creating the output table with the appropriate columns and data types + plpy.execute(""" + CREATE TABLE {out_table} AS ( + SELECT + {grp_comma} + {src} AS {vertex_id}, + {curr_dist_val}::INT AS {dist_col}, + {src} AS {parent_col} + FROM {edge_table} + LIMIT 0 + ) {distribution}""".format(**locals())) + + # We keep a summary table to keep track of the parameters used for this + # BFS run + plpy.execute( """ + CREATE TABLE {out_table}_summary ( + vertex_table TEXT, + vertex_id TEXT, + edge_table TEXT, + edge_args TEXT, + source_vertex INTEGER, + out_table TEXT, + max_distance INTEGER, + directed BOOLEAN, + grouping_cols TEXT + )""".format(**locals())) + + plpy.execute(""" + INSERT INTO {out_table}_summary VALUES + ('{vertex_table}', '{v_st}', '{edge_table}', '{e_st}', + {source_vertex}, '{out_table}', {d_st}, {dir_st}, '{g_st}') + """.format(**locals())) + + + # The queries for directed and undirected graphs share a common section. + # There are additional clauses added to the undirected graph queries. + # In the undirected case edges can be considered to go from {src} to + # {dest} and {dest} to {src} + + insert_qry_undirected_init = "" + count_qry_undirected = "" + insert_qry_undirected_loop = "" + + if not directed: + insert_qry_undirected_init = """ OR {dest} = {source_vertex} + """.format(**locals()) + + count_qry_undirected = """ OR ( + ({grp_comma} {dest}) IN ( + SELECT {grp_comma} {vertex_id} FROM {out_table} + WHERE {dist_col}={{curr_dist_val}} + ) + AND + ({grp_comma} {src}) NOT IN ( + SELECT {grp_comma} {vertex_id} FROM {out_table} + ) + ) + """.format(**locals()) + + insert_qry_undirected_loop = """ UNION + SELECT {grp_comma} + {src} AS {vertex_id}, + {{curr_dist_val}}+1 AS {dist_col}, + {dest} AS {parent_col} + FROM {edge_table} + WHERE ( + ({grp_comma} {dest}) IN ( + SELECT {grp_comma} {vertex_id} FROM {out_table} + WHERE {dist_col}={{curr_dist_val}} + ) + AND + ({grp_comma} {src}) NOT IN ( + SELECT {grp_comma} {vertex_id} FROM {out_table} + ) + ) + """.format(**locals()) + + # This step inserts into the output table the source vertex for each + # group in which it is present. Grouping behavior is not predictable + # when there are NULLs in any grouping column. Therefore those rows + # are explicitly removed from analysis + insert_qry_init = """ + INSERT INTO {out_table} + SELECT {grp_comma} + {source_vertex} AS {vertex_id}, + {curr_dist_val} AS {dist_col}, + NULL AS {parent_col} + FROM {edge_table} + WHERE ({src} = {source_vertex} {insert_qry_undirected_init}) + {and_grp_null_checks} + GROUP BY {grp_comma} {vertex_id}, {dist_col} + """.format(**locals()) + plpy.execute(insert_qry_init.format(**locals())) + + # After initialization of the output table, number of nodes connected + # by edges to the source vertex in each group is counted. This is also used + # below in the BFS iteration while-loop + count_qry = """ + SELECT COUNT(*) + FROM {edge_table} + WHERE ( + ({grp_comma} {src}) IN ( + SELECT {grp_comma} {vertex_id} FROM {out_table} + WHERE {dist_col}={{curr_dist_val}} + ) + AND + ({grp_comma} {dest}) NOT IN ( + SELECT {grp_comma} {vertex_id} FROM {out_table} + ) + ) {count_qry_undirected} + """.format(**locals()) + + vct = plpy.execute(count_qry.format(**locals()))[0]['count'] + + # This insert statement is executed within the BFS iteration while-loop + # below. It is used to discover and store all nodes (not already found) + # connected to those found in the immediate previous iteration. + insert_qry_loop = """ + INSERT INTO {out_table} + SELECT {grp_comma} {vertex_id}, {dist_col}, min({parent_col}) + FROM ( + SELECT {grp_comma} + {dest} AS {vertex_id}, + {{curr_dist_val}}+1 AS {dist_col}, + {src} AS {parent_col} + FROM {edge_table} + WHERE ( + ({grp_comma} {src}) IN ( + SELECT {grp_comma} {vertex_id} FROM {out_table} + WHERE {dist_col}={{curr_dist_val}} + ) + AND + ({grp_comma} {dest}) NOT IN ( + SELECT {grp_comma} {vertex_id} FROM {out_table} + ) + ) + {insert_qry_undirected_loop} + ) t1 + GROUP BY {grp_comma} {vertex_id}, {dist_col} + """.format(**locals()) + + # Main loop for traversing the graph + while vct > 0 and curr_dist_val < max_distance: + # The loop consists of two steps: + # 1) Disover and store all nodes that are linked to nodes found in + # the immediate previous iteration of the loop that have not already + # been found in all previous iterations + # 2) Check for any nodes linked to those discovered in Step 1 above + # that have not yet been discovered + # + # If a node has multiple possible parents then the parent with the + # smallest ID is chosen for output + + # In the directed graph case only nodes in the {dest} column of + # the edge table are searched to find new nodes reachable from + # previously discovered nodes + + # In the undirected graph case edges are treated as non-directional + # (or bidirectional). Nodes in both the {src} and {dest} columns of + # the edge table are searched to find new nodes reachable from + # previously discovered nodes. + # + # This approach does NOT require the user to provide a forward edge + # and a reverse edge between the same two nodes to indicate the + # graph's undirected nature. However, it will work in that scenario + # as well. + + # Discover and store all nodes (not already found) connected to + # those found in the immediate previous iteration + plpy.execute(insert_qry_loop.format(**locals())) + + # Update distance value for next iteration + curr_dist_val = curr_dist_val + 1 + + # Count / find any nodes that are connected to those discovered and + # stored in this iteration. This is used to check if the iterations + # need to continue. + vct = plpy.execute(count_qry.format(**locals()))[0]['count'] + + return None + +def graph_bfs_help(schema_madlib, message, **kwargs): + """ + Help function for graph_bfs + + Args: + @param schema_madlib + @param message: string, Help message string + @param kwargs + + Returns: + String. Help/usage information + """ + if not message: + help_string = """ +----------------------------------------------------------------------- + SUMMARY +----------------------------------------------------------------------- + +Given a graph and a source vertex, the Breadth-first Search (BFS) algorithm +finds all nodes reachable from the source vertex. + +For more details on function usage: + SELECT {schema_madlib}.graph_bfs('usage') + """ + elif message.lower() in ['usage', 'help', '?']: + help_string = """ +Given a graph and a source vertex, the Breadth-first Search (BFS) algorithm +finds all nodes reachable from the source vertex. + +{graph_usage} + +---------------------------------------------------------------------------- + OUTPUT +---------------------------------------------------------------------------- +The output of BFS ('out_table' above) contains a row for every vertex of that is +reachable from the source_vertex. In the presence of grouping columns, only those +edges are used for which there are no NULL values in any grouping column. +The output table will have the following columns (in addition to the +grouping columns): + - vertex_id : The id for any node reachable from source_vertex in addition to + the source_vertex. Will use the input parameter 'vertex_id' for + column naming. + - dist : The distance in number of edges (or hops) from the source_vertex + to where this vertex is located. + - parent : The parent of this vertex in BFS traversal of the graph from + source_vertex. Will use 'parent' for column naming. For the + case where vertex_id = source_vertex, the value for parent is NULL. +""" + elif message.lower() in ("example", "examples"): + help_string = """ +---------------------------------------------------------------------------- + EXAMPLES +---------------------------------------------------------------------------- +-- Create a graph, represented as vertex and edge tables. +DROP TABLE IF EXISTS vertex, edge; +CREATE TABLE vertex( + id INTEGER + ); +CREATE TABLE edge( + src INTEGER, + dest INTEGER + ); +INSERT INTO vertex VALUES +(0), +(1), +(2), +(3), +(4), +(5), +(6), +(7), +(8), +(9), +(10), +(11) +; +INSERT INTO edge VALUES +(0, 5), +(1, 0), +(1, 3), +(2, 6), +(3, 4), +(3, 5), +(4, 2), +(8, 9), +(9, 10), +(9, 11), +(10, 8) +; + +-- Traverse undirected graph from vertex 3: +DROP TABLE IF EXISTS out, out_summary; +SELECT madlib.graph_bfs( + 'vertex', -- Vertex table + NULL, -- Vertix id column (NULL means use default naming) + 'edge', -- Edge table + NULL, -- Edge arguments (NULL means use default naming) + 3, -- Source vertex for BFS + 'out' -- Output table of nodes reachable from source_vertex + ); + -- Default values used for the other arguments +SELECT * FROM out ORDER BY dist,id; + +SELECT * FROM out_summary; + +""" + else: + help_string = "No such option. Use {schema_madlib}.graph_bfs()" + + return help_string.format(schema_madlib=schema_madlib, + graph_usage=get_graph_usage(schema_madlib, 'graph_bfs', + """source_vertex INT, -- The source vertex id for the algorithm to start. + out_table TEXT, -- Name of the table to store the result of BFS. + max_distance INT, -- Maximum distance from source_vertex to search through in the graph. + directed INT, -- If TRUE the graph will be treated as directed. + grouping_cols TEXT -- A comma-separated list of grouping columns.""")) +# --------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8c9b955c/src/ports/postgres/modules/graph/bfs.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/graph/bfs.sql_in b/src/ports/postgres/modules/graph/bfs.sql_in new file mode 100644 index 0000000..fd7a396 --- /dev/null +++ b/src/ports/postgres/modules/graph/bfs.sql_in @@ -0,0 +1,471 @@ +/* ----------------------------------------------------------------------- *//** + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * + * @file bfs.sql_in + * + * @brief SQL functions for graph analytics + * @date Jun 2017 + * + * @sa Provides Breadth First Search graph algorithm. + * + *//* ----------------------------------------------------------------------- */ +m4_include(`SQLCommon.m4') +/** +@addtogroup grp_bfs + +<div class="toc"><b>Contents</b> +<ul> +<li><a href="#bfs">Breadth-first Search</a></li> +<li><a href="#notes">Notes</a></li> +<li><a href="#examples">Examples</a></li> +<li><a href="#literature">Literature</a></li> +</ul> +</div> + +@brief Finds the nodes reachable from a given source vertex using a breadth-first approach. + +Given a graph and a source vertex, the Breadth-first Search (BFS) algorithm +finds all nodes reachable from the source vertex by searching / traversing the graph +in a breadth-first manner. + +@anchor bfs +@par BFS +<pre class="syntax"> +graph_bfs( vertex_table, + vertex_id, + edge_table, + edge_args, + source_vertex, + out_table, + max_distance, + directed, + grouping_cols + ) +</pre> + +\b Arguments +<dl class="arglist"> +<dt>vertex_table</dt> +<dd>TEXT. Name of the table containing the vertex data for the graph. Must contain the +column specified in the 'vertex_id' parameter below.</dd> + +<dt>vertex_id</dt> +<dd>TEXT, default = 'id'. Name of the column in 'vertex_table' containing +vertex ids. The vertex ids are of type INTEGER with no duplicates. +They do not need to be contiguous.</dd> + +<dt>edge_table</dt> +<dd>TEXT. Name of the table containing the edge data. The edge table must contain +columns for source vertex and destination vertex. Column naming convention is +described below in the 'edge_args' parameter. +In addition to vertex columns, if grouping is used then the columns specified +in the 'grouping_cols' parameter must be present. </dd> + +<dt>edge_args</dt> +<dd>TEXT. A comma-delimited string containing multiple named arguments of +the form "name=value". The following parameters are supported for +this string argument: + - src (INTEGER): Name of the column containing the source vertex ids in the edge table. + Default column name is 'src'. + (Not to be confused with the source_vertex argument passed to the BFS function) + - dest (INTEGER): Name of the column containing the destination vertex ids in + the edge table. Default column name is 'dest'. + +<dt>source_vertex</dt> +<dd>INTEGER. The source vertex id for the algorithm to start. This vertex id must +exist in the 'vertex_id' column of 'vertex_table'.</dd> + +<dt>out_table</dt> +<dd>TEXT. Name of the table to store the result of BFS. +It contains a row for every vertex that is reachable from the source_vertex. +In the presence of grouping columns, only those edges are used for which there are no NULL values +in any grouping column. +The output table will have the following columns (in addition to the grouping columns): + - vertex_id : The id for any node reachable from source_vertex in addition to + the source_vertex. Will use the input parameter 'vertex_id' for + column naming. + - dist : The distance in number of edges (or hops) from the source_vertex + to where this vertex is located. + - parent : The parent of this vertex in BFS traversal of the graph from source_vertex. + Will use 'parent' for column naming. For the + case where vertex_id = source_vertex, the value for parent is NULL. + +A summary table named <out_table>_summary is also created. This is an internal table that keeps a record of the input parameters. +</dd> + +<dt>max_distance</dt> +<dd>INT, default = NULL. Maximum distance (number of edges) from source_vertex to search through in the graph.</dd> + +<dt>directed</dt> +<dd>BOOLEAN, default = FALSE. If TRUE the graph will be treated as directed, else it will be treated as an undirected graph.</dd> + +<dt>grouping_cols</dt> +<dd>TEXT, default = NULL. A comma-separated list of columns used to group the +input into discrete subgraphs. +These columns must exist in the edge table. When this value is NULL, no grouping is used +and a single BFS result is generated. +@note Expressions are not currently supported for 'grouping_cols'.</dd> + +</dl> + +@anchor notes +@par Notes + +The graph_bfs function is a SQL implementation of the well-known Breadth-first +Search algorithm [1] modified appropriately for a relational database. It will +find any node in the graph reachable from the source_vertex only once. If a node +is reachable by many different paths from the source_vertex (i.e. has more than +one parent), then only one of those parents is present in the output table. +The BFS result will, in general, be different for different choices of source_vertex. + +@anchor examples +@examp + +-# Create vertex and edge tables to represent the graph: +<pre class="syntax"> +DROP TABLE IF EXISTS vertex, edge; +CREATE TABLE vertex( + id INTEGER + ); +CREATE TABLE edge( + src INTEGER, + dest INTEGER + ); +INSERT INTO vertex VALUES +(0), +(1), +(2), +(3), +(4), +(5), +(6), +(7), +(8), +(9), +(10), +(11) +; +INSERT INTO edge VALUES +(0, 5), +(1, 0), +(1, 3), +(2, 6), +(3, 4), +(3, 5), +(4, 2), +(8, 9), +(9, 10), +(9, 11), +(10, 8) +; +</pre> + +-# Traverse undirected graph from vertex 3: +<pre class="syntax"> +DROP TABLE IF EXISTS out, out_summary; +SELECT madlib.graph_bfs( + 'vertex', -- Vertex table + NULL, -- Vertix id column (NULL means use default naming) + 'edge', -- Edge table + NULL, -- Edge arguments (NULL means use default naming) + 3, -- Source vertex for BFS + 'out'); -- Output table of nodes reachable from source_vertex + -- Default values used for the other arguments +SELECT * FROM out ORDER BY dist,id; +</pre> +<pre class="result"> + id | dist | parent +----+------+-------- + 3 | 0 | + 1 | 1 | 3 + 4 | 1 | 3 + 5 | 1 | 3 + 0 | 2 | 1 + 2 | 2 | 4 + 6 | 3 | 2 +(7 rows) +</pre> +<pre class="syntax"> +SELECT * FROM out_summary; +</pre> +<pre class="result"> + vertex_table | vertex_id | edge_table | edge_args | source_vertex | out_table | max_distance | directed | grouping_cols +--------------+-----------+------------+-----------+---------------+-----------+--------------+----------+--------------- + vertex | NULL | edge | NULL | 3 | out | | | NULL +(1 row) +</pre> + +-# In this example, we use max_distance to limit the search distance. +<pre class="syntax"> +DROP TABLE IF EXISTS out_max, out_max_summary; +SELECT madlib.graph_bfs( + 'vertex', -- Vertex table + NULL, -- Vertix id column (NULL means use default naming) + 'edge', -- Edge table + NULL, -- Edge arguments (NULL means use default naming) + 3, -- Source vertex for BFS + 'out_max', -- Output table of nodes reachable from source_vertex + 2); -- Maximum distance to traverse from source_vertex + -- Default values used for the other arguments +SELECT * FROM out_max ORDER BY dist,id; +</pre> +<pre class="result"> + id | dist | parent +----+------+-------- + 3 | 0 | + 1 | 1 | 3 + 4 | 1 | 3 + 5 | 1 | 3 + 0 | 2 | 1 + 2 | 2 | 4 +(6 rows) +</pre> + +-# Now let's do an example using +different column names in the tables (i.e., not the defaults). +Create the vertex and edge tables: +<pre class="syntax"> +DROP TABLE IF EXISTS vertex_alt, edge_alt; +CREATE TABLE vertex_alt AS SELECT id AS v_id FROM vertex; +CREATE TABLE edge_alt AS SELECT src AS n1, dest AS n2 FROM edge; +</pre> + +-# Run BFS from vertex 8: +<pre class="syntax"> +DROP TABLE IF EXISTS out_alt, out_alt_summary; +SELECT madlib.graph_bfs( + 'vertex_alt', -- Vertex table + 'v_id', -- Vertex id column (NULL means use default naming) + 'edge_alt', -- Edge table + 'src=n1, dest=n2', -- Edge arguments (NULL means use default naming) + 8, -- Source vertex for BFS + 'out_alt'); -- Output table of nodes reachable from source_vertex +SELECT * FROM out_alt ORDER BY v_id; +</pre> +<pre class="result"> + v_id | dist | parent +------+------+-------- + 8 | 0 | + 9 | 1 | 8 + 10 | 1 | 8 + 11 | 2 | 9 +</pre> + +-# Now we show an example where the graph is treated as a directed graph. +<pre class="syntax"> +DROP TABLE IF EXISTS out_alt_dir, out_alt_dir_summary; +SELECT madlib.graph_bfs( + 'vertex_alt', -- Vertex table + 'v_id', -- Vertex id column (NULL means use default naming) + 'edge_alt', -- Edge table + 'src=n1, dest=n2', -- Edge arguments (NULL means use default naming) + 8, -- Source vertex for BFS + 'out_alt_dir', -- Output table of nodes reachable from source_vertex + NULL, -- Maximum distance to traverse from source_vertex + TRUE); -- Flag for specifying directed graph +SELECT * FROM out_alt_dir ORDER BY v_id; +</pre> +<pre class="result"> + v_id | dist | parent +------+------+-------- + 8 | 0 | + 9 | 1 | 8 + 10 | 2 | 9 + 11 | 2 | 9 +(4 rows) +</pre> +Notice that, with the graph being treated as directed, the parent of v_id=10 +is now vertex 9 and not 8 as in the undirected case. + +-# Create a graph with 2 groups: +<pre class="syntax"> +DROP TABLE IF EXISTS edge_gr; +CREATE TABLE edge_gr( + g1 INTEGER, + g2 TEXT, + src INTEGER, + dest INTEGER + ); +INSERT INTO edge_gr VALUES +(100, 'a', 0, 5), +(100, 'a', 1, 0), +(100, 'a', 1, 3), +(100, 'a', 2, 6), +(100, 'a', 3, 4), +(100, 'a', 3, 5), +(100, 'a', 4, 2), +(100, 'a', 8, 9), +(100, 'a', 9, 10), +(100, 'a', 9, 11), +(100, 'a', 10, 8), +(202, 'c', 8, 9), +(202, 'c', 9, 10), +(202, 'c', 9, 11), +(202, 'c', 10, 8) +; +</pre> + +-# Run BFS for all groups from a given source_vertex. +<pre class="syntax"> +DROP TABLE IF EXISTS out_gr, out_gr_summary; +SELECT madlib.graph_bfs( + 'vertex', -- Vertex table + NULL, -- Vertex id column (NULL means use default naming) + 'edge_gr', -- Edge table + NULL, -- Edge arguments (NULL means use default naming) + 8, -- Source vertex for BFS + 'out_gr', -- Output table of nodes reachable from source_vertex + NULL, -- Maximum distance to traverse from source_vertex + NULL, -- Flag for specifying directed graph + 'g1,g2' -- Grouping columns +); +SELECT * FROM out_gr ORDER BY g1,g2,dist,id; +</pre> +<pre class="result"> + g1 | g2 | id | dist | parent +-----+----+----+------+-------- + 100 | a | 8 | 0 | + 100 | a | 9 | 1 | 8 + 100 | a | 10 | 1 | 8 + 100 | a | 11 | 2 | 9 + 202 | c | 8 | 0 | + 202 | c | 9 | 1 | 8 + 202 | c | 10 | 1 | 8 + 202 | c | 11 | 2 | 9 +(8 rows) +</pre> +If source_vertex is not present in +a group, then that group will not appear in the output table. +<pre class="syntax"> +DROP TABLE IF EXISTS out_gr, out_gr_summary; +SELECT madlib.graph_bfs( + 'vertex', -- Vertex table + NULL, -- Vertex id column (NULL means use default naming) + 'edge_gr', -- Edge table + NULL, -- Edge arguments (NULL means use default naming) + 3, -- Source vertex for BFS + 'out_gr', -- Output table of nodes reachable from source_vertex + NULL, -- Maximum distance to traverse from source_vertex + NULL, -- Flag for specifying directed graph + 'g1,g2' -- Grouping columns +); +SELECT * FROM out_gr ORDER BY g1,g2,dist,id; +</pre> +<pre class="result"> + g1 | g2 | id | dist | parent +-----+----+----+------+-------- + 100 | a | 3 | 0 | + 100 | a | 1 | 1 | 3 + 100 | a | 4 | 1 | 3 + 100 | a | 5 | 1 | 3 + 100 | a | 0 | 2 | 1 + 100 | a | 2 | 2 | 4 + 100 | a | 6 | 3 | 2 +(7 rows) +</pre> + +@anchor literature +@par Literature + +[1] Breadth-first Search algorithm. https://en.wikipedia.org/wiki/Breadth-first_search +*/ + +------------------------------------------------------------------------- + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_bfs( + vertex_table TEXT, + vertex_id TEXT, + edge_table TEXT, + edge_args TEXT, + source_vertex INT, + out_table TEXT, + max_distance INT, + directed BOOLEAN, + grouping_cols TEXT +) RETURNS VOID AS $$ + PythonFunction(graph, bfs, graph_bfs) +$$ LANGUAGE plpythonu VOLATILE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); + +------------------------------------------------------------------------- + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_bfs( + vertex_table TEXT, + vertex_id TEXT, + edge_table TEXT, + edge_args TEXT, + source_vertex INT, + out_table TEXT, + max_distance INT, + directed BOOLEAN +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.graph_bfs($1, $2, $3, $4, $5, $6, $7, $8, NULL); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); + +------------------------------------------------------------------------- + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_bfs( + vertex_table TEXT, + vertex_id TEXT, + edge_table TEXT, + edge_args TEXT, + source_vertex INT, + out_table TEXT, + max_distance INT +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.graph_bfs($1, $2, $3, $4, $5, $6, $7, NULL, NULL); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); + +------------------------------------------------------------------------- + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_bfs( + vertex_table TEXT, + vertex_id TEXT, + edge_table TEXT, + edge_args TEXT, + source_vertex INT, + out_table TEXT +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.graph_bfs($1, $2, $3, $4, $5, $6, NULL, NULL, NULL); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); + +------------------------------------------------------------------------- + +-- Online help +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_bfs( + message VARCHAR +) RETURNS VARCHAR AS $$ + PythonFunction(graph, bfs, graph_bfs_help) +$$ LANGUAGE plpythonu IMMUTABLE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); + +-------------------------------------------------------------------------------- + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_bfs() +RETURNS VARCHAR AS $$ + SELECT MADLIB_SCHEMA.graph_bfs(''); +$$ LANGUAGE sql IMMUTABLE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); + +-------------------------------------------------------------------------------- + http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8c9b955c/src/ports/postgres/modules/graph/graph_utils.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/graph/graph_utils.py_in b/src/ports/postgres/modules/graph/graph_utils.py_in index 77378d6..944c301 100644 --- a/src/ports/postgres/modules/graph/graph_utils.py_in +++ b/src/ports/postgres/modules/graph/graph_utils.py_in @@ -38,6 +38,17 @@ from utilities.validate_args import table_exists from utilities.validate_args import columns_exist_in_table from utilities.validate_args import table_is_empty +def _grp_null_checks(grp_list): + + """ + Helper function for generating NULL checks for grouping columns + to be used within a WHERE clause + Args: + @param grp_list The list of grouping columns + """ + return ' AND '.join([" {i} IS NOT NULL ".format(**locals()) + for i in grp_list]) + def _check_groups(tbl1, tbl2, grp_list): """ http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8c9b955c/src/ports/postgres/modules/graph/test/bfs.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/graph/test/bfs.sql_in b/src/ports/postgres/modules/graph/test/bfs.sql_in new file mode 100644 index 0000000..223ab84 --- /dev/null +++ b/src/ports/postgres/modules/graph/test/bfs.sql_in @@ -0,0 +1,278 @@ +/* ----------------------------------------------------------------------- *//** + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + *//* ----------------------------------------------------------------------- */ + + +DROP TABLE IF EXISTS vertex,edge,edge_grp; + +CREATE TABLE vertex( + id INTEGER + ); +INSERT INTO vertex VALUES +(0), +(1), +(2), +(3), +(4), +(5), +(6), +(7), +(8), +(9), +(10), +(11) +; + +CREATE TABLE edge( + src INTEGER, + dest INTEGER, + weight DOUBLE PRECISION + ); +INSERT INTO edge VALUES +(0, 5, 1), +(1, 0, 1), +(1, 3, 1), +(2, 6, 1), +(3, 4, 1), +(3, 5, 1), +(4, 2, 1), +(8, 9, 1), +(9, 10, 1), +(9, 11, 1), +(10, 8, 1) +; + +CREATE TABLE edge_grp( + g1 INTEGER, + g2 TEXT, + src INTEGER, + dest INTEGER, + weight DOUBLE PRECISION + ); +INSERT INTO edge_grp VALUES +(100, 'a', 0, 5, 1), +(100, 'a', 1, 0, 1), +(100, 'a', 1, 3, 1), +(100, 'a', 2, 6, 1), +(100, 'a', 3, 4, 1), +(100, 'a', 3, 5, 1), +(100, 'a', 4, 2, 1), +(100, 'a', 8, 9, 1), +(100, 'a', 9, 10, 1), +(100, 'a', 9, 11, 1), +(100, 'a', 10, 8, 1), +(100, 'b', 0, 5, 1), +(100, 'b', 1, 0, 1), +(100, 'b', 1, 3, 1), +(100, 'b', 2, 6, 1), +(100, 'b', 3, 4, 1), +(100, 'b', 3, 5, 1), +(100, 'b', 4, 2, 1), +(100, 'b', 8, 9, 1), +(100, 'b', 9, 10, 1), +(100, 'b', 9, 11, 1), +(100, 'b', 10, 8, 1), +(202, 'c', 8, 9, 1), +(202, 'c', 9, 10, 1), +(202, 'c', 9, 11, 1), +(202, 'c', 10, 8, 1), +(NULL, 'b', 8, 9, 1), +(NULL, 'b', 9, 10, 1), +(NULL, 'b', 9, 11, 1), +(NULL, 'b', 10, 8, 1) +; +; + +--------------------- +-- Undirected Graph +--------------------- +-- Results to check against - uncomment if needed +-- -- source_vertex = 3 +-- DROP TABLE IF EXISTS out_actual; +-- CREATE TABLE out_actual (src INT, id INT, dist INT, parent INT); +-- INSERT INTO out_actual VALUES +-- (3,3,0,NULL), +-- (3,1,1,3), +-- (3,4,1,3), +-- (3,5,1,3), +-- (3,0,2,1), +-- (3,2,2,4), +-- (3,6,3,2) +-- ; +-- -- source_vertex = 7 +-- DROP TABLE IF EXISTS out_actual; +-- CREATE TABLE out_actual (src INT, id INT, dist INT, parent INT); +-- +-- -- source_vertex = 8 +-- DROP TABLE IF EXISTS out_actual; +-- CREATE TABLE out_actual (src INT, id INT, dist INT, parent INT); +-- INSERT INTO out_actual VALUES +-- (8,8,0,NULL), +-- (8,9,1,8), +-- (8,10,1,8), +-- (8,11,2,9) +-- ; + +-- Undirected graph +-- source_vertex = 3 +DROP TABLE IF EXISTS out_frombfs, out_frombfs_summary; +SELECT graph_bfs('vertex',NULL,'edge',NULL,3,'out_frombfs'); + +SELECT assert( + (array_agg(id ORDER BY id) = ARRAY[1,4,5]::INT[]) + AND + (array_agg(parent ORDER BY id) = ARRAY[3,3,3]::INT[]), + 'Wrong output in graph (BFS)') + FROM out_frombfs WHERE dist = 1 GROUP BY dist; + +SELECT assert(max(dist) = 3, 'Wrong output in graph (BFS)') + FROM out_frombfs; + +-- source_vertex = 7 +DROP TABLE IF EXISTS out_frombfs, out_frombfs_summary; +SELECT graph_bfs('vertex',NULL,'edge',NULL,7,'out_frombfs'); + +SELECT assert(COUNT(*) = 0, 'Wrong output in graph (BFS)') FROM out_frombfs; + +-- source_vertex = 8 +DROP TABLE IF EXISTS out_frombfs, out_frombfs_summary; +SELECT graph_bfs('vertex',NULL,'edge',NULL,8,'out_frombfs',NULL,FALSE,NULL); + +SELECT assert(parent = 8, 'Wrong output in graph (BFS)') + FROM out_frombfs WHERE dist = 2 AND id = 10; + +------------------- +-- Directed graph +------------------- +-- Results to check against - uncomment if needed + +-- -- source_vertex = 3 +-- DROP TABLE IF EXISTS out_actual; +-- CREATE TABLE out_actual (src INT, id INT, dist INT, parent INT); +-- INSERT INTO out_actual VALUES +-- (3,3,0,NULL), +-- (3,4,1,3), +-- (3,5,1,3), +-- (3,2,2,4), +-- (3,6,3,2) +-- ; +-- -- source_vertex = 7 +-- DROP TABLE IF EXISTS out_actual; +-- CREATE TABLE out_actual (src INT, id INT, dist INT, parent INT); +-- -- source_vertex = 8 +-- DROP TABLE IF EXISTS out_actual; +-- CREATE TABLE out_actual (src INT, id INT, dist INT, parent INT); +-- (8, 8, 0, NULL), +-- (8, 9, 1, 8), +-- (8, 10, 2, 9), +-- (8, 11, 2, 9) +-- ; + +-- source_vertex = 3 +DROP TABLE IF EXISTS out_frombfs, out_frombfs_summary; +SELECT graph_bfs('vertex','id','edge','src,dest',3,'out_frombfs',NULL,TRUE,NULL); + +SELECT assert( + (array_agg(id ORDER BY id) = ARRAY[4,5]::INT[]) + AND + (array_agg(parent ORDER BY id) = ARRAY[3,3]::INT[]), + 'Wrong output in graph (BFS)') + FROM out_frombfs WHERE dist = 1 GROUP BY dist; +SELECT assert(COUNT(*) = 5, 'Wrong output in graph (BFS)') FROM out_frombfs; + +-- source_vertex = 7 +DROP TABLE IF EXISTS out_frombfs, out_frombfs_summary; +SELECT graph_bfs('vertex','id','edge','src,dest',7,'out_frombfs',NULL,TRUE,NULL); + +SELECT assert(COUNT(*) = 0, 'Wrong output in graph (BFS)') FROM out_frombfs; + +-- source_vertex = 8 +DROP TABLE IF EXISTS out_frombfs, out_frombfs_summary; +SELECT graph_bfs('vertex',NULL,'edge',NULL,8,'out_frombfs',NULL,TRUE); + +SELECT assert(COUNT(*) = 2, 'Wrong output in graph (BFS)') + FROM out_frombfs WHERE dist = 2; + +----------------------- +-- max_distance check +----------------------- +-- source_vertex = 3 +-- Undirected graph +DROP TABLE IF EXISTS out_frombfs, out_frombfs_summary; +SELECT graph_bfs('vertex','id','edge','src,dest',3,'out_frombfs',2,FALSE); + +SELECT assert(MAX(dist) = 2 AND COUNT(*) = 6, + 'Wrong output in graph (BFS)') FROM out_frombfs; + +-- Directed graph +DROP TABLE IF EXISTS out_frombfs, out_frombfs_summary; +SELECT graph_bfs('vertex',NULL,'edge',NULL,3,'out_frombfs',2,TRUE); + +SELECT assert(MAX(dist) = 2 AND COUNT(*) = 4, + 'Wrong output in graph (BFS)') FROM out_frombfs; + +--------------------- +-- Grouping columns +--------------------- +-- -- source_vertex = 3 +-- -- Undirected graph +-- DROP TABLE IF EXISTS out_actual; +-- CREATE TABLE out_actual (src INT, id INT, dist INT, parent INT); +-- INSERT INTO out_actual VALUES +-- (3,100,'a',3,0,NULL), +-- (3,100,'a',1,1,3), +-- (3,100,'a',4,1,3), +-- (3,100,'a',5,1,3), +-- (3,100,'a',0,2,1), +-- (3,100,'a',2,2,4), +-- (3,100,'a',6,3,2), +-- (3,100,'b',3,0,NULL), +-- (3,100,'b',1,1,3), +-- (3,100,'b',4,1,3), +-- (3,100,'b',5,1,3), +-- (3,100,'b',0,2,1), +-- (3,100,'b',2,2,4), +-- (3,100,'b',6,3,2) +-- ; + +DROP TABLE IF EXISTS out_frombfs, out_frombfs_summary; +SELECT graph_bfs('vertex',NULL,'edge_grp',NULL,3,'out_frombfs',NULL,NULL,'g1,g2'); + +SELECT assert( COUNT(*) = 14,'Wrong output in graph (BFS)') FROM out_frombfs; +SELECT assert(COUNT(*) = 7, + 'Wrong output in graph (BFS)') FROM out_frombfs WHERE g2 = 'a' +; +SELECT assert( + array_agg(id ORDER BY id) = ARRAY[0,2]::INT[] + AND + array_agg(parent ORDER BY id) = ARRAY[1,4]::INT[], + 'Wrong output in graph (BFS)') + FROM out_frombfs WHERE dist = 2 AND g1 = 100 AND g2 = 'a' +; +SELECT assert( + array_agg(id ORDER BY id) = ARRAY[0,2]::INT[] + AND + array_agg(parent ORDER BY id) = ARRAY[1,4]::INT[], + 'Wrong output in graph (BFS)') + FROM out_frombfs WHERE dist = 2 AND g1 = 100 AND g2 = 'b' +; +SELECT assert(MIN(g1) = 100 AND MAX(g1) = 100, + 'Wrong output in graph (BFS)') FROM out_frombfs GROUP BY g1,g2 +;