Repository: incubator-madlib Updated Branches: refs/heads/master 3f599c943 -> 06788cc48
Graph: Add initial set of centrality measures JIRA: MADLIB-1073 This commit adds the following graph measures: - Closeness (uses APSP) - Graph diameter (uses APSP) - Average path length (uses APSP) - In/out degrees Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/06788cc4 Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/06788cc4 Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/06788cc4 Branch: refs/heads/master Commit: 06788cc486fbd15391e3d477748a37fe0657baf1 Parents: 3f599c9 Author: Rahul Iyer <ri...@apache.org> Authored: Mon Jul 31 13:54:48 2017 -0700 Committer: Rahul Iyer <ri...@apache.org> Committed: Mon Jul 31 13:54:48 2017 -0700 ---------------------------------------------------------------------- doc/mainpage.dox.in | 23 +- src/ports/postgres/modules/graph/apsp.py_in | 13 +- src/ports/postgres/modules/graph/measures.py_in | 639 +++++++++++++++ .../postgres/modules/graph/measures.sql_in | 816 +++++++++++++++++++ .../postgres/modules/graph/test/measures.sql_in | 150 ++++ 5 files changed, 1623 insertions(+), 18 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/06788cc4/doc/mainpage.dox.in ---------------------------------------------------------------------- diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in index de70d5d..30f566d 100644 --- a/doc/mainpage.dox.in +++ b/doc/mainpage.dox.in @@ -123,23 +123,24 @@ complete matrix stored as a distributed table. @defgroup grp_stemmer Stemming @ingroup grp_datatrans -@defgroup grp_graph Graph -@{Contains graph algorithms. @} +@defgroup grp_graph Graph +Contains graph algorithms. +@{ @defgroup grp_apsp All Pairs Shortest Path - @ingroup grp_graph - @defgroup grp_bfs Breadth-First Search - @ingroup grp_graph - + @defgroup grp_graph_measures Measures + Graph Measures + @{ + @defgroup grp_graph_closeness Closeness + @defgroup grp_graph_diameter Graph Diameter + @defgroup grp_graph_avg_path_length Average Path Length + @defgroup grp_graph_vertex_degrees In-Out degree + @} @defgroup grp_pagerank PageRank - @ingroup grp_graph - @defgroup grp_sssp Single Source Shortest Path - @ingroup grp_graph - @defgroup grp_wcc Weakly Connected Components - @ingroup grp_graph +@} @defgroup grp_mdl Model Evaluation @{Contains functions for evaluating accuracy and validation of predictive methods. @} http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/06788cc4/src/ports/postgres/modules/graph/apsp.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/graph/apsp.py_in b/src/ports/postgres/modules/graph/apsp.py_in index ab8a566..42ee698 100644 --- a/src/ports/postgres/modules/graph/apsp.py_in +++ b/src/ports/postgres/modules/graph/apsp.py_in @@ -73,16 +73,16 @@ def graph_apsp(schema_madlib, vertex_table, vertex_id, edge_table, # Prepare the input for recording in the summary table if vertex_id is None: - v_st = "NULL" + v_st = '' vertex_id = "id" else: v_st = vertex_id if edge_args is None: - e_st = "NULL" + e_st = '' else: e_st = edge_args if grouping_cols is None: - g_st = "NULL" + g_st = '' glist = None else: g_st = grouping_cols @@ -127,8 +127,7 @@ def graph_apsp(schema_madlib, vertex_table, vertex_id, edge_table, checkg_vv_sub = "TRUE" grp_by = "" - if grouping_cols is not None: - + if grouping_cols: # We use actual table names in some cases and aliases in others # In some cases, we swap the table names so use of an alias is # necessary. In other cases, they are used to simplify debugging. @@ -526,10 +525,10 @@ def graph_apsp_get_path(schema_madlib, apsp_table, dest = edge_params["dest"] weight = edge_params["weight"] - if vertex_id == "NULL": + if not vertex_id or vertex_id == "NULL": vertex_id = "id" - if grouping_cols == "NULL": + if not grouping_cols or grouping_cols == "NULL": grouping_cols = None select_grps = "" http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/06788cc4/src/ports/postgres/modules/graph/measures.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/graph/measures.py_in b/src/ports/postgres/modules/graph/measures.py_in new file mode 100644 index 0000000..f5c38b4 --- /dev/null +++ b/src/ports/postgres/modules/graph/measures.py_in @@ -0,0 +1,639 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Graph Measures + +""" +@file measures.py_in + +@namespace graph +""" + +import plpy + +from utilities.control import MinWarning +from utilities.utilities import _assert +from utilities.utilities import add_postfix +from utilities.utilities import extract_keyvalue_params + +from utilities.validate_args import get_cols +from utilities.validate_args import unquote_ident +from utilities.validate_args import table_exists +from utilities.validate_args import table_is_empty +from utilities.validate_args import columns_exist_in_table + +from graph_utils import get_graph_usage + +from collections import namedtuple +from functools import partial + + +class Graph(object): + """Class representing a Graph object""" + def __init__(self, + vertex_table, vertex_col_names, + edge_table, edge_col_names, + grouping_cols=None, + should_validate=True, + schema_madlib='madlib'): + self.vertex_table = vertex_table + self.vertex_id_col = vertex_col_names['id'] + + self.edge_table = edge_table + self.edge_params = namedtuple('Edge', 'src dest weight')( + edge_col_names['src'], edge_col_names['dest'], edge_col_names['weight']) + + self.grouping_cols = grouping_cols + self._madlib = schema_madlib + + if should_validate: + self._validate() + + # ---------------------------------------------------------------------- + + @staticmethod + def get_edge_params(edge_arg_str): + params_types = {'src': str, 'dest': str, 'weight': str} + default_args = {'src': 'src', 'dest': 'dest', 'weight': 'weight'} + return extract_keyvalue_params(edge_arg_str, params_types, default_args) + # ---------------------------------------------------------------------- + + def _validate(self): + _assert(self.vertex_table and + self.vertex_table.strip().lower() not in ('null', ''), + "Graph: Invalid vertex table name".format(**locals())) + _assert(table_exists(self.vertex_table), + "Graph: Vertex table ({0}) is missing".format(self.vertex_table)) + _assert(not table_is_empty(self.vertex_table), + "Graph: Vertex table ({0}) is empty".format(self.vertex_table)) + + _assert(self.edge_table and self.edge_table.strip().lower() not in ('null', ''), + "Graph: Invalid edge table name".format(**locals())) + _assert(table_exists(self.edge_table), + "Graph: Edge table ({0}) is missing".format(self.edge_table)) + _assert(not table_is_empty(self.edge_table), + "Graph: Edge table ({0}) is empty".format(self.edge_table)) + + existing_cols = set(unquote_ident(i) for i in get_cols(self.vertex_table)) + _assert(self.vertex_id_col in existing_cols, + "Graph: The vertex column {0} is not present in " + "vertex table ({1})".format(self.vertex_id_col, self.vertex_table)) + + _assert(columns_exist_in_table(self.edge_table, self.edge_params), + "Graph: Not all columns from {0} are present in edge table ({1})". + format(self.edge_params, self.edge_table)) + # ---------------------------------------------------------------------- + + def closeness(self, out_table, apsp_table, vertex_filter_expr=None, **kwargs): + """ Compute various centrality metrics + + Following metrics are computed for a vertex, considering only + reachable vertices for that vertex: + - inverse_sum_dist: Inverse of the sum of shortest distances + - inverse_average_dist: Inverse of the average of shortest distances + - sum_inverse_dist: Sum of the inverse of shortest distances + - k_degree: Total number of reachable vertices + + Args: + @param out_table: str. Name of table to store results in + @param apsp_table: str. APSP output is used to compute these metrics. + @param vertex_filter_expr: str. A WHERE clause can be specified to restrict + the output vertices + Returns: + None + """ + grouping_cols_comma = self.grouping_cols + ", " if self.grouping_cols else '' + e = self.edge_params + filter_clause = "WHERE " + vertex_filter_expr if vertex_filter_expr else '' + + plpy.execute(""" + CREATE TABLE {out_table} AS + SELECT + {grouping_cols_comma} + {e.src}, + 1.0 / sum_dist AS inverse_sum_dist, + CASE WHEN k_degree = 0 THEN NULL + ELSE k_degree::double precision / sum_dist + END AS inverse_avg_dist, + sum_inverse_dist, + k_degree + FROM ( + -- For below measures treat 'Infinity' as NULL so that + -- 'sum' and 'avg' ignore the value. + SELECT + {grouping_cols_comma} + {e.src}, + sum(CASE WHEN {e.weight} = 'Infinity'::double precision THEN NULL + WHEN {e.src} = {e.dest} THEN NULL + ELSE {e.weight} END) AS sum_dist, + sum(CASE WHEN {e.weight} = 'Infinity'::double precision THEN NULL + WHEN {e.weight} = 0::double precision THEN 0. + ELSE 1.0 / {e.weight} END) AS sum_inverse_dist, + count(CASE WHEN {e.weight} = 'Infinity'::double precision THEN NULL + WHEN {e.src} = {e.dest} THEN NULL + ELSE 1 END) AS k_degree + FROM {apsp_table} + {filter_clause} + GROUP BY {grouping_cols_comma} + {e.src} + -- Don't place the 'Infinity' checks above in a WHERE clause + -- since groups with only 'Infinity' rows need to show up + -- in output table + ) AS q + """.format(**locals())) + # -------------------------------------------------------------------------- + + def avg_path_length(self, out_table, apsp_table, vertex_filter_expr=None, **kwargs): + """ Compute the average path length of graph + + This is the average of the shortest path distances between unique vertices. + + Args: + @param out_table: str. Name of table to store results in + @param apsp_table: str. APSP output is used to compute these metrics. + Returns: + None + """ + if self.grouping_cols: + grouping_cols_comma = self.grouping_cols + ", " + group_by_str = 'GROUP BY ' + self.grouping_cols + else: + grouping_cols_comma = group_by_str = '' + e = self.edge_params + filter_clause = "AND " + vertex_filter_expr if vertex_filter_expr else '' + plpy.execute(""" + CREATE TABLE {out_table} AS + SELECT + {grouping_cols_comma} + -- Filtering 'Infinity' occurs in CASE instead of WHERE clause + -- so that the edge is part of the average value i.e. part of + -- the count of paths but zero addition to sum of distances. + AVG(CASE WHEN {e.weight} = 'Infinity'::double precision + THEN 0::double precision + ELSE {e.weight}::double precision + END) as avg_path_length + FROM {apsp_table} + WHERE {e.src} != {e.dest} + {filter_clause} + {group_by_str} + """.format(**locals())) + # ---------------------------------------------------------------------- + + def in_out_degrees(self, out_table, **kwargs): + """ + Args: + @param out_table: str. Name of table to store results + + Returns: + None + """ + # TODO: validate if output columns names are in grouping_cols + if self.grouping_cols: + grouping_cols_comma = self.grouping_cols + ", " + else: + grouping_cols_comma = '' + e = self.edge_params + + plpy.execute(""" + CREATE TABLE {out_table} AS + SELECT + {grouping_cols_comma} + in_q.vertex as {self.vertex_id_col}, + indegree, + outdegree + FROM + ( + SELECT + {grouping_cols_comma} + {e.dest} as vertex, + count(*) as indegree + FROM {self.edge_table} + WHERE {e.src} != {e.dest} + GROUP BY {grouping_cols_comma} + {e.dest} + ) as in_q + JOIN + ( + SELECT + {grouping_cols_comma} + {e.src} as vertex, + count(*) as outdegree + FROM {self.edge_table} + WHERE {e.src} != {e.dest} + GROUP BY {grouping_cols_comma} + {e.src} + ) as out_q + USING ({grouping_cols_comma} vertex) + """.format(**locals())) + # -------------------------------------------------------------------------- + + def diameter(self, out_table, apsp_table, **kwargs): + """ Compute the diameter of graph + + Diameter is defined as the maximum of the shortest path distances between + any pair of vertices + Args: + @param out_table: str. Name of table to store results in + @param apsp_table: str. APSP output is used to compute these metrics. + Returns: + None + """ + if self.grouping_cols: + grouping_cols_comma = self.grouping_cols + ", " + group_by_str = 'GROUP BY ' + self.grouping_cols + else: + grouping_cols_comma = group_by_str = '' + e = self.edge_params + plpy.execute(""" + CREATE TABLE {out_table} AS + SELECT + {grouping_cols_comma} + {e.weight} AS diameter, + {self._madlib}.matrix_agg( + ARRAY[{e.src}, {e.dest}]::double precision[])::integer[] + AS diameter_end_vertices + FROM + {apsp_table} JOIN + ( + SELECT + {grouping_cols_comma} + max({e.weight}) as {e.weight} + FROM {apsp_table} + WHERE {e.weight} != 'Infinity'::double precision + {group_by_str} + ) q + USING ({grouping_cols_comma} {e.weight}) + GROUP BY {grouping_cols_comma} {e.weight} + """.format(**locals())) +# ------------------------------------------------------------------------------ + + +def graph_apsp_measures(schema_madlib, apsp_table, out_table, + measure_name, vertex_filter_expr=None, **kwargs): + """ Define measure that depend on APSP output + + This function acts as a stub for all functions that depend on APSP being run + prior. + + """ + + _assert(table_exists(apsp_table) and not table_is_empty(apsp_table), + "Graph: Invalid APSP table: {0}".format(apsp_table)) + + summary_table_name = add_postfix(apsp_table, "_summary") + summary_cols = ['vertex_table', 'vertex_id', + 'edge_table', 'edge_args', 'grouping_cols'] + _assert(table_exists(summary_table_name), + "Graph: Summary APSP table ({0}) does not exist".format(summary_table_name)) + _assert(columns_exist_in_table(summary_table_name, summary_cols, schema_madlib), + "Graph: Missing some columns from summary table ({0})".format(summary_table_name)) + _assert(out_table and out_table.strip().lower() not in ('null', ''), + "Graph: Invalid output table name ({0})".format(out_table)) + _assert(not table_exists(out_table), + "Graph: Output table ({0}) already exists".format(out_table)) + + with MinWarning('warning'): + s = plpy.execute("SELECT * FROM {0}".format(summary_table_name))[0] + edge_col_names = Graph.get_edge_params(s['edge_args']) + g = Graph(s['vertex_table'], dict([('id', s['vertex_id'])]), + s['edge_table'], edge_col_names, + s['grouping_cols'], + should_validate=False, + schema_madlib=schema_madlib) + try: + measure_func = getattr(g, measure_name) + measure_func(out_table, apsp_table, + vertex_filter_expr=vertex_filter_expr) + except AttributeError: + plpy.error('Measure {0} not implemented yet'.format(measure_name)) +# ---------------------------------------------------------------------- + + +graph_closeness = partial(graph_apsp_measures, measure_name='closeness') +graph_diameter = partial(graph_apsp_measures, measure_name='diameter') +graph_avg_path_length = partial(graph_apsp_measures, measure_name='avg_path_length') + + +def graph_vertex_degrees(schema_madlib, vertex_table, vertex_id, edge_table, + edge_args, out_table, grouping_cols, **kwargs): + """ + Args: + @param schema_madlib: str. Name of MADlib schema + @param vertex_table: str. Table name containing vertex data + @param vertex_id: str. Column name containing ids of vertices + @param edge_table: str. Table name containing edge data + @param edge_args: str. Parameters describing edges + @param out_table: str. Name of table to store results + @param grouping_cols: str. Columns to group computation by + + + Returns: + None + """ + _assert(out_table and out_table.strip().lower() not in ('null', ''), + "Graph: Invalid output table name!".format(**locals())) + _assert(not table_exists(out_table), + "Graph: Output table already exists!".format(**locals())) + + if not vertex_id: + vertex_id = 'id' + edge_col_names = Graph.get_edge_params(edge_args) + g = Graph(vertex_table, dict([('id', vertex_id)]), + edge_table, edge_col_names, grouping_cols, + schema_madlib=schema_madlib) + g.in_out_degrees(out_table) +# ---------------------------------------------------------------------- + +# ----------------------------------------------------------------------- +# All help functions +# ----------------------------------------------------------------------- + + +CREATE_GRAPH_EXAMPLE = """ +-- Create a graph, represented as vertex and edge tables. +DROP TABLE IF EXISTS vertex,edge,out,out_summary,out_path; +CREATE TABLE vertex( + id INTEGER + ); +CREATE TABLE edge( + src INTEGER, + dest INTEGER, + weight DOUBLE PRECISION +); + +INSERT INTO vertex VALUES +(0), +(1), +(2), +(3), +(4), +(5), +(6), +(7) +; +INSERT INTO edge VALUES +(0, 1, 1), +(0, 2, 1), +(0, 4, 10), +(1, 2, 2), +(1, 3, 10), +(2, 3, 1), +(2, 5, 1), +(2, 6, 3), +(3, 0, 1), +(4, 0, -2), +(5, 6, 1), +(6, 7, 1) +; +""" + +COMPUTE_APSP_EXAMPLE = """ +-- Compute the apsp: +DROP TABLE IF EXISTS out; +SELECT graph_apsp( + 'vertex', -- Vertex table + 'id', -- Vertix id column + 'edge', -- Edge table + 'src=src, dest=dest, weight=weight', -- Comma delimited string of edge arguments + 'out' -- Output table of apsp +); +""" + + +def graph_closeness_help(schema_madlib, message, **kwargs): + + intro = """ +The Closeness function returns various closeness centrality measures and the +k-degree for given subset of vertices. The closeness measures are the inverse of +the sum, the inverse of the average, and the sum of inverses of the shortest +distances to all reachable target vertices (excluding the source vertex). + """ + + if not message: + help_string = intro + """ +For more details: + SELECT {schema_madlib}.graph_closeness('usage') + """ + elif message.lower() in ['usage', 'help', '?']: + help_string = intro + """ + +---------------------------------------------------------------------------- + USAGE +---------------------------------------------------------------------------- +SELECT {schema_madlib}.graph_closeness( + apsp_table TEXT, -- Name of table containing APSP results + out_table TEXT, -- Name of table to store Closeness measuress + vertex_filter_expr TEXT -- Valid PostgreSQL expression that describes the + -- vertices to generate closeness measures for. +) + +---------------------------------------------------------------------------- + OUTPUT +---------------------------------------------------------------------------- +The output table contains a row for every vertex of every group and have +the following columns (in addition to the grouping columns): + - inverse_sum_dist : Inverse of the sum of shortest distances to all reachable + vertices. + - inverse_average_dist: Inverse of the average of shortest distances to all + reachable vertices. + - sum_inverse_dist : Sum of the inverse of shortest distances to all reachable + vertices. + - k_degree : Total number of reachable vertices. + + """ + elif message.lower() in ['example', 'examples']: + help_string = """ +---------------------------------------------------------------------------- + EXAMPLES +---------------------------------------------------------------------------- +{create_graph_example} +{compute_apsp_example} + +-# Compute the closeness measure for all nodes: +DROP TABLE IF EXISTS out_closeness; +SELECT {schema_madlib}.graph_closeness('out_apsp', 'out_closeness'); +SELECT * FROM out_closeness; + """ + else: + help_string = "No such option. Use {schema_madlib}.graph_closeness()" + + return help_string.format(schema_madlib=schema_madlib, + create_graph_example=CREATE_GRAPH_EXAMPLE, + compute_apsp_example=COMPUTE_APSP_EXAMPLE) +# ------------------------------------------------------------------------- + + +def graph_diameter_help(schema_madlib, message, **kwargs): + + intro = """ +Diameter is defined as the longest of all shortest paths in a graph. + """ + + if not message: + help_string = intro + """ +For more details: + SELECT {schema_madlib}.graph_diameter('usage') + """ + elif message.lower() in ['usage', 'help', '?']: + help_string = intro + """ + +---------------------------------------------------------------------------- + USAGE +---------------------------------------------------------------------------- +SELECT {schema_madlib}.graph_diameter( + apsp_table TEXT, -- Name of table containing APSP results + out_table TEXT -- Name of table to store Closeness measuress +) + +---------------------------------------------------------------------------- + OUTPUT +---------------------------------------------------------------------------- +It contains a row for every group, the diameter value and the two vertices +that are the farthest apart. + """ + elif message.lower() in ['example', 'examples']: + help_string = """ +---------------------------------------------------------------------------- + EXAMPLES +---------------------------------------------------------------------------- +{create_graph_example} +{compute_apsp_example} + +-# Compute the diameter measure for the graph: +DROP TABLE IF EXISTS out_diameter; +SELECT {schema_madlib}.graph_diameter('out_apsp', 'out_diameter'); +SELECT * FROM out_diameter; + """ + else: + help_string = "No such option. Use {schema_madlib}.graph_diameter()" + + return help_string.format(schema_madlib=schema_madlib, + create_graph_example=CREATE_GRAPH_EXAMPLE, + compute_apsp_example=COMPUTE_APSP_EXAMPLE) +# ------------------------------------------------------------------------- + + +def graph_avg_path_length_help(schema_madlib, message, **kwargs): + + intro = """ +This function computes the average of the shortest paths between each pair of +vertices. Average path length is based on "reachable target vertices", so it +ignores infinite-length paths between vertices that are not connected. + """ + + if not message: + help_string = intro + """ +For more details: + SELECT {schema_madlib}.graph_avg_path_length('usage') + """ + elif message.lower() in ['usage', 'help', '?']: + help_string = intro + """ + +---------------------------------------------------------------------------- + USAGE +---------------------------------------------------------------------------- +SELECT {schema_madlib}.graph_avg_path_length( + apsp_table TEXT, -- Name of table containing APSP results + out_table TEXT -- Name of table to store Closeness measuress +) + +---------------------------------------------------------------------------- + OUTPUT +---------------------------------------------------------------------------- +It contains a row for every group, and the average path value. +""" + elif message.lower() in ['example', 'examples']: + help_string = """ +---------------------------------------------------------------------------- + EXAMPLES +---------------------------------------------------------------------------- +{create_graph_example} +{compute_apsp_example} + +-# Compute the average path length for the graph: +DROP TABLE IF EXISTS out_avg_path_length; +SELECT {schema_madlib}.graph_avg_path_length('out_apsp', 'out_avg_path_length'); +SELECT * FROM out_avg_path_length; + """ + else: + help_string = "No such option. Use {schema_madlib}.graph_avg_path_length()" + + return help_string.format(schema_madlib=schema_madlib, + create_graph_example=CREATE_GRAPH_EXAMPLE, + compute_apsp_example=COMPUTE_APSP_EXAMPLE) +# ------------------------------------------------------------------------- + + +def graph_vertex_degrees_help(schema_madlib, message, **kwargs): + + intro = """ +This function computes the degree of each node. The node degree is the number of +edges adjacent to that node. The node in-degree is the number of edges pointing +in to the node and node out-degree is the number of edges pointing out of the +node. + """ + usage_text = get_graph_usage(schema_madlib, + 'graph_vertex_degrees', + """ + out_table TEXT, -- Name of the table to store the result of apsp. + grouping_cols TEXT -- The list of grouping columns.""") + + if not message: + help_string = intro + """ +For more details: + SELECT {schema_madlib}.graph_vertex_degrees('usage') + """ + elif message.lower() in ['usage', 'help', '?']: + help_string = intro + """ +{graph_usage} + +---------------------------------------------------------------------------- + OUTPUT +---------------------------------------------------------------------------- +It contains a row for every vertex of every group and has the following columns +(in addition to the grouping columns): + - vertex : The id for the source vertex. Will use the input vertex + column 'id' for column naming. + - indegree : Number of incoming edges to the vertex. + - outdegree : Number of outgoing edges from the vertex. + +""" + elif message.lower() in ['example', 'examples']: + help_string = """ +---------------------------------------------------------------------------- + EXAMPLES +---------------------------------------------------------------------------- +{create_graph_example} + +DROP TABLE IF EXISTS degrees; +SELECT {schema_madlib}.graph_vertex_degrees( + 'vertex', -- Vertex table + 'id', -- Vertix id column (NULL means use default naming) + 'edge', -- Edge table + 'src=src, dest=dest, weight=weight', + 'degrees'); -- Output table of shortest paths +SELECT * FROM degrees ORDER BY id; + """ + else: + help_string = "No such option. Use {schema_madlib}.graph_vertex_degrees()" + + return help_string.format(schema_madlib=schema_madlib, + create_graph_example=CREATE_GRAPH_EXAMPLE, + graph_usage=usage_text) +# ------------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/06788cc4/src/ports/postgres/modules/graph/measures.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/graph/measures.sql_in b/src/ports/postgres/modules/graph/measures.sql_in new file mode 100644 index 0000000..1992139 --- /dev/null +++ b/src/ports/postgres/modules/graph/measures.sql_in @@ -0,0 +1,816 @@ +/* ----------------------------------------------------------------------- *//** + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * @file closeness.sql_in + * + * @brief SQL functions for graph analytics + * @date Jun 2017 + * + * @sa Provides analytical measures for graphs + * + *//* ----------------------------------------------------------------------- */ +m4_include(`SQLCommon.m4') + +/** +@addtogroup grp_graph_closeness +@brief Computes the closeness centrality value of each node in the graph. + +<div class="toc"><b>Contents</b> +<ul> +<li><a href="#closeness">Closeness</a></li> +<li><a href="#examples">Examples</a></li> +</ul> +</div> + +The Closeness function returns various closeness centrality measures and the +k-degree for given subset of vertices. The closeness measures are the inverse of +the sum, the inverse of the average, and the sum of inverses of the shortest +distances to all reachable target vertices (excluding the source vertex). + +@note The closeness measures require a valid output from a prior APSP run - both +the APSP table and the associated output summary table. APSP is a +computationally expensive algorithm because it finds the shortest path between +all nodes in the graph. The worst case run-time for this implementation is O(V^2 +* E) where V is the number of vertices and E is the number of edges. In +practice, run-time will be generally be much less than this, depending on the +graph. + +@anchor closeness +@par Closeness +<pre class="syntax"> +graph_closeness( apsp_table, + output_table, + vertex_filter_expr + ) +</pre> + +\b Arguments +<dl class="arglist"> +<dt>apsp_table</dt> +<dd>TEXT. Name of the output table generated by a prior run of all pairs shortest path (APSP). +</dd> + +<dt>out_table</dt> +<dd>TEXT. Name of the table to store the closeness measures. +It contains a row for every vertex of every group and have +the following columns (in addition to the grouping columns): + - inverse_sum_dist: Inverse of the sum of shortest distances to all reachable + vertices. + - inverse_average_dist: Inverse of the average of shortest distances to all + reachable vertices. + - sum_inverse_dist: Sum of the inverse of shortest distances to all reachable + vertices. + - k_degree: Total number of reachable vertices. +</dd> + +<dt>vertex_filter_expr (optional)</dt> + +<dd>TEXT, default = NULL. Valid PostgreSQL expression that describes the + vertices to generate closeness measures for. If this parameter is not +specified, closeness measures are generated for all vertices in the apsp table. +This input should be treated like a WHERE clause. + +Some example inputs: +- If you want a short list of vertices, say 1, 2 and 3: +<pre>vertex_id IN (1, 2, 3)</pre> +- If you want a range of vertices between 1000 and 2000: +<pre>vertix_id BETWEEN 1000 AND 2000</pre> +- If you want a set of vertices from a separate table satisfying to a condition +<pre>EXISTS (SELECT vertex_id FROM vertices_of_interest + WHERE vertex_id > 5000 AND condition = 'xyz') +</pre> + +</dd> +</dl> + +@anchor examples +@examp + +-# Create vertex and edge tables to represent the graph: +<pre class="syntax"> +DROP TABLE IF EXISTS vertex, edge; +CREATE TABLE vertex( + id INTEGER, + name TEXT + ); +CREATE TABLE edge( + src_id INTEGER, + dest_id INTEGER, + edge_weight FLOAT8 + ); +INSERT INTO vertex VALUES +(0, 'A'), +(1, 'B'), +(2, 'C'), +(3, 'D'), +(4, 'E'), +(5, 'F'), +(6, 'G'), +(7, 'H'); +INSERT INTO edge VALUES +(0, 1, 1.0), +(0, 2, 1.0), +(0, 4, 10.0), +(1, 2, 2.0), +(1, 3, 10.0), +(2, 3, 1.0), +(2, 5, 1.0), +(2, 6, 3.0), +(3, 0, 1.0), +(4, 0, -2.0), +(5, 6, 1.0), +(6, 7, 1.0); +</pre> + +-# Calculate the all-pair shortest paths: +<pre class="syntax"> +DROP TABLE IF EXISTS out_apsp, out_apsp_summary; +SELECT madlib.graph_apsp('vertex', -- Vertex table + 'id', -- Vertix id column (NULL means use default naming) + 'edge', -- Edge table + 'src=src_id, dest=dest_id, weight=edge_weight', + -- Edge arguments (NULL means use default naming) + 'out_apsp'); -- Output table of shortest paths +</pre> + +-# Compute the closeness measure for all nodes: +<pre class="syntax"> +DROP TABLE IF EXISTS out_closeness; +SELECT madlib.graph_closeness('out_apsp', 'out_closeness'); +SELECT * FROM out_closeness; +</pre> +<pre class="result"> + src_id | inverse_sum_dist | inverse_avg_dist | sum_inverse_dist | k_degree +\--------+--------------------+-------------------+------------------+---------- + 1 | 0.0285714285714286 | 0.2 | 1.93809523809524 | 7 + 3 | 0.0357142857142857 | 0.25 | 2.87424242424242 | 7 + 4 | -1 | -7 | -1 | 7 + 0 | 0.0434782608695652 | 0.304347826086957 | 3.68333333333333 | 7 + 6 | 1 | 1 | 1 | 1 + 2 | 0.0416666666666667 | 0.291666666666667 | 3.75 | 7 + 5 | 0.333333333333333 | 0.666666666666667 | 1.5 | 2 + 7 | [NULL] | [NULL] | 0 | 0 +(8 rows) +</pre> + +-# Create a graph with 2 groups and find APSP for each group: +<pre class="syntax"> +DROP TABLE IF EXISTS edge_gr; +CREATE TABLE edge_gr AS +( + SELECT *, 0 AS grp FROM edge + UNION + SELECT *, 1 AS grp FROM edge WHERE src_id < 6 AND dest_id < 6 +); +INSERT INTO edge_gr VALUES +(4,5,-20,1); +</pre> + +-# Find APSP for all groups: +<pre class="syntax"> +DROP TABLE IF EXISTS out_gr, out_gr_summary; +SELECT madlib.graph_apsp( + 'vertex', -- Vertex table + NULL, -- Vertex id column (NULL means use default naming) + 'edge_gr', -- Edge table + 'src=src_id, dest=dest_id, weight=edge_weight', + 'out_gr', -- Output table of shortest paths + 'grp' -- Grouping columns +); +</pre> + +-# Compute closeness measure for vertex 0 to vertex 5 in every group +<pre class="syntax"> +DROP TABLE IF EXISTS out_gr_path; +SELECT madlib.graph_closeness('out_gr', 'out_gr_closeness', 'src_id >= 0 and src_id <=5'); +SELECT * FROM out_gr_closeness ORDER BY grp; +</pre> +<pre class="result"> + grp | src_id | inverse_sum_dist | inverse_avg_dist | sum_inverse_dist | k_degree +-------+----------+-----------------------+----------------------+---------------------+------------ + 0 | 0 | 0.0434782608695652 | 0.304347826086957 | 3.68333333333333 | 7 + 0 | 5 | 0.333333333333333 | 0.666666666666667 | 1.5 | 2 + 0 | 4 | -1 | -7 | -1 | 7 + 0 | 3 | 0.0357142857142857 | 0.25 | 2.87424242424242 | 7 + 0 | 1 | 0.0285714285714286 | 0.2 | 1.93809523809524 | 7 + 0 | 2 | 0.0416666666666667 | 0.291666666666667 | 3.75 | 7 + 1 | 3 | 0.142857142857143 | 0.714285714285714 | 1.97979797979798 | 5 + 1 | 5 | [NULL] | [NULL] | 0 | 0 + 1 | 0 | 0.25 | 1.25 | 2.5 | 5 + 1 | 1 | 0.0588235294117647 | 0.294117647058824 | 0.988095238095238 | 5 + 1 | 2 | 0.1 | 0.5 | 1.79166666666667 | 5 + 1 | 4 | -0.0416666666666667 | -0.208333333333333 | -2.55 | 5 +(12 rows) +</pre> + +*/ +------------------------------------------------------------------------- + + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_closeness( + apsp_table TEXT, + out_table TEXT, + vertex_filter_expr TEXT +) RETURNS VOID AS $$ + PythonFunction(graph, measures, graph_closeness) +$$ LANGUAGE plpythonu VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); +------------------------------------------------------------------------------- + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_closeness( + apsp_table TEXT, + out_table TEXT +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.graph_closeness($1, $2, NULL); +$$ LANGUAGE SQL VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); +------------------------------------------------------------------------------- + +-- Online help +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_closeness( + message VARCHAR +) RETURNS VARCHAR AS $$ + PythonFunction(graph, measures, graph_closeness_help) +$$ LANGUAGE plpythonu IMMUTABLE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); + +-------------------------------------------------------------------------------- + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_closeness() +RETURNS VARCHAR AS $$ + SELECT MADLIB_SCHEMA.graph_closeness(''); +$$ LANGUAGE sql IMMUTABLE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); + +/** +@addtogroup grp_graph_diameter +@brief Computes the diameter of a graph. + +<div class="toc"><b>Contents</b> +<ul> +<li><a href="#diameter">Diameter</a></li> +<li><a href="#examples">Examples</a></li> +</ul> +</div> + +Diameter is defined as the longest of all shortest paths in a graph. + +@note This function assumes a valid output from a prior APSP run - both the APSP +table and the associated output summary table. APSP is a computationally +expensive algorithm because it finds the shortest path between all nodes in the +graph. The worst case run-time for this implementation is O(V^2 * E) where V is +the number of vertices and E is the number of edges. In practice, run-time will +be generally be much less than this, depending on the graph. + +@anchor diameter +@par Diameter +<pre class="syntax"> +graph_diameter( apsp_table, + output_table + ) +</pre> + +\b Arguments +<dl class="arglist"> +<dt>apsp_table</dt> +<dd>TEXT. Name of the output table generated by a prior run of all pairs shortest path (APSP). +</dd> + +<dt>out_table</dt> +<dd>TEXT. Name of the table to store the diameter. It contains a row for every +group, the diameter value and the two vertices that are the farthest apart. +</dd> + +</dl> + +@anchor examples +@examp + +-# Create vertex and edge tables to represent the graph: +<pre class="syntax"> +DROP TABLE IF EXISTS vertex, edge; +CREATE TABLE vertex( + id INTEGER, + name TEXT + ); +CREATE TABLE edge( + src_id INTEGER, + dest_id INTEGER, + edge_weight FLOAT8 + ); +INSERT INTO vertex VALUES +(0, 'A'), +(1, 'B'), +(2, 'C'), +(3, 'D'), +(4, 'E'), +(5, 'F'), +(6, 'G'), +(7, 'H'); +INSERT INTO edge VALUES +(0, 1, 1.0), +(0, 2, 1.0), +(0, 4, 10.0), +(1, 2, 2.0), +(1, 3, 10.0), +(2, 3, 1.0), +(2, 5, 1.0), +(2, 6, 3.0), +(3, 0, 1.0), +(4, 0, -2.0), +(5, 6, 1.0), +(6, 7, 1.0); +</pre> + +-# Calculate the all-pair shortest paths: +<pre class="syntax"> +DROP TABLE IF EXISTS out_apsp, out_apsp_summary; +SELECT madlib.graph_apsp('vertex', -- Vertex table + 'id', -- Vertix id column (NULL means use default naming) + 'edge', -- Edge table + 'src=src_id, dest=dest_id, weight=edge_weight', + -- Edge arguments (NULL means use default naming) + 'out_apsp'); -- Output table of shortest paths +</pre> + +-# Compute the diameter measure for the graph: +<pre class="syntax"> +DROP TABLE IF EXISTS out_diameter; +SELECT madlib.graph_diameter('out_apsp', 'out_diameter'); +SELECT * FROM out_diameter; +</pre> +<pre class="result"> +diameter | diameter_end_vertices +\---------+----------------------- + 14 | {{1,4}} +(1 row) +</pre> + +-# Create a graph with 2 groups and find APSP for each group: +<pre class="syntax"> +DROP TABLE IF EXISTS edge_gr; +CREATE TABLE edge_gr AS +( + SELECT *, 0 AS grp FROM edge + UNION + SELECT *, 1 AS grp FROM edge WHERE src_id < 6 AND dest_id < 6 +); +INSERT INTO edge_gr VALUES +(4,5,-20,1); +</pre> + +-# Find APSP for all groups: +<pre class="syntax"> +DROP TABLE IF EXISTS out_gr, out_gr_summary; +SELECT madlib.graph_apsp( + 'vertex', -- Vertex table + NULL, -- Vertex id column (NULL means use default naming) + 'edge_gr', -- Edge table + 'src=src_id, dest=dest_id, weight=edge_weight', + 'out_gr', -- Output table of shortest paths + 'grp' -- Grouping columns +); +</pre> + +-# Find the diameter of graph in every group +<pre class="syntax"> +DROP TABLE IF EXISTS out_gr_path; +SELECT madlib.graph_diameter('out_gr', 'out_gr_diameter'); +SELECT * FROM out_gr_diameter ORDER BY grp; +</pre> +<pre class="result"> +grp | diameter | diameter_end_vertices +\------+------------+------------------------- + 0 | 14 | {{1,4}} + 1 | 14 | {{1,4}} +(2 rows) +</pre> + +*/ + + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_diameter( + apsp_table TEXT, + out_table TEXT +) RETURNS VOID AS $$ + PythonFunction(graph, measures, graph_diameter) +$$ LANGUAGE plpythonu VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); +------------------------------------------------------------------------------- + +-- Online help +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_diameter( + message VARCHAR +) RETURNS VARCHAR AS $$ + PythonFunction(graph, measures, graph_diameter_help) +$$ LANGUAGE plpythonu IMMUTABLE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); + +-------------------------------------------------------------------------------- + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_diameter() +RETURNS VARCHAR AS $$ + SELECT MADLIB_SCHEMA.graph_diameter(''); +$$ LANGUAGE sql IMMUTABLE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); + +/** +@addtogroup grp_graph_avg_path_length +@brief Computes the average shortest-path length of a graph. + +<div class="toc"><b>Contents</b> +<ul> +<li><a href="#avg_path_length">Average Path Length</a></li> +<li><a href="#examples">Examples</a></li> +</ul> +</div> + +This function computes the average of the shortest paths between each pair of +vertices. Average path length is based on "reachable target vertices", so it +ignores infinite-length paths between vertices that are not connected. + +@note This function assumes a valid output from a prior APSP run - both the APSP +table and the associated output summary table. APSP is a computationally +expensive algorithm because it finds the shortest path between all nodes in the +graph. The worst case run-time for this implementation is O(V^2 * E) where V is +the number of vertices and E is the number of edges. In practice, run-time will +be generally be much less than this, depending on the graph. + +@anchor avg_path_length +@par Average Path Length +<pre class="syntax"> +graph_avg_path_length( apsp_table, + output_table + ) +</pre> + +\b Arguments +<dl class="arglist"> +<dt>apsp_table</dt> +<dd>TEXT. Name of the output table generated by a prior run of all pairs +shortest path (APSP). +</dd> + +<dt>out_table</dt> +<dd>TEXT. Name of the table to store the average path length. It contains a row +for every group, and the average path value. +</dd> +</dl> + +@anchor examples +@examp + +-# Create vertex and edge tables to represent the graph: +<pre class="syntax"> +DROP TABLE IF EXISTS vertex, edge; +CREATE TABLE vertex( + id INTEGER, + name TEXT + ); +CREATE TABLE edge( + src_id INTEGER, + dest_id INTEGER, + edge_weight FLOAT8 + ); +INSERT INTO vertex VALUES +(0, 'A'), +(1, 'B'), +(2, 'C'), +(3, 'D'), +(4, 'E'), +(5, 'F'), +(6, 'G'), +(7, 'H'); +INSERT INTO edge VALUES +(0, 1, 1.0), +(0, 2, 1.0), +(0, 4, 10.0), +(1, 2, 2.0), +(1, 3, 10.0), +(2, 3, 1.0), +(2, 5, 1.0), +(2, 6, 3.0), +(3, 0, 1.0), +(4, 0, -2.0), +(5, 6, 1.0), +(6, 7, 1.0); +</pre> + +-# Calculate the all-pair shortest paths: +<pre class="syntax"> +DROP TABLE IF EXISTS out_apsp, out_apsp_summary; +SELECT madlib.graph_apsp('vertex', -- Vertex table + 'id', -- Vertix id column (NULL means use default naming) + 'edge', -- Edge table + 'src=src_id, dest=dest_id, weight=edge_weight', + -- Edge arguments (NULL means use default naming) + 'out_apsp'); -- Output table of shortest paths +</pre> + +-# Compute the average path length measure: +<pre class="syntax"> +DROP TABLE IF EXISTS out_avg_path_length; +SELECT madlib.graph_avg_path_length('out_apsp', 'out_avg_path_length'); +SELECT * FROM out_avg_path_length; +</pre> +<pre class="result"> + avg_path_length +\------------------ + 2.01785714285714 +(1 row) +</pre> + +-# Create a graph with 2 groups and find APSP for each group: +<pre class="syntax"> +DROP TABLE IF EXISTS edge_gr; +CREATE TABLE edge_gr AS +( + SELECT *, 0 AS grp FROM edge + UNION + SELECT *, 1 AS grp FROM edge WHERE src_id < 6 AND dest_id < 6 +); +INSERT INTO edge_gr VALUES +(4,5,-20,1); +</pre> + +-# Find APSP for all groups: +<pre class="syntax"> +DROP TABLE IF EXISTS out_gr, out_gr_summary; +SELECT madlib.graph_apsp( + 'vertex', -- Vertex table + NULL, -- Vertex id column (NULL means use default naming) + 'edge_gr', -- Edge table + 'src=src_id, dest=dest_id, weight=edge_weight', + 'out_gr', -- Output table of shortest paths + 'grp' -- Grouping columns +); +</pre> + +-# Find the average path length in every group +<pre class="syntax"> +DROP TABLE IF EXISTS out_gr_path; +SELECT madlib.graph_avg_path_length('out_gr', 'out_gr_path'); +SELECT * FROM out_gr_path ORDER BY grp; +</pre> +<pre class="result"> + grp | avg_path_length +\-------+-------------------- + 0 | 2.01785714285714 + 1 | 0.466666666666667 +(2 rows) +</pre> +*/ + + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_avg_path_length( + apsp_table TEXT, + out_table TEXT +) RETURNS VOID AS $$ + PythonFunction(graph, measures, graph_avg_path_length) +$$ LANGUAGE plpythonu VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); +------------------------------------------------------------------------------- + +-- Online help +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_avg_path_length( + message VARCHAR +) RETURNS VARCHAR AS $$ + PythonFunction(graph, measures, graph_avg_path_length_help) +$$ LANGUAGE plpythonu IMMUTABLE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); + +-------------------------------------------------------------------------------- + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_avg_path_length() +RETURNS VARCHAR AS $$ + SELECT MADLIB_SCHEMA.graph_avg_path_length(''); +$$ LANGUAGE sql IMMUTABLE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); + +/** +@addtogroup grp_graph_vertex_degrees +@brief Computes the degrees for each vertex + +<div class="toc"><b>Contents</b> +<ul> +<li><a href="#degrees">In-out degrees</a></li> +<li><a href="#examples">Examples</a></li> +</ul> +</div> + +This function computes the degree of each node. The node degree is the number of +edges adjacent to that node. The node in-degree is the number of edges pointing +in to the node and node out-degree is the number of edges pointing out of the +node. + +@anchor degrees +@par In-out degrees +<pre class="syntax"> +graph_vertex_degrees( + vertex_table, + vertex_id, + edge_table, + edge_args, + out_table, + grouping_cols +) +</pre> + +\b Arguments +<dl class="arglist"> +<dt>vertex_table</dt> +<dd>TEXT. Name of the table containing the vertex data for the graph. Must +contain the column specified in the 'vertex_id' parameter below.</dd> + +<dt>vertex_id</dt> +<dd>TEXT, default = 'id'. Name of the column in 'vertex_table' containing +vertex ids. The vertex ids are of type INTEGER with no duplicates. +They do not need to be contiguous.</dd> + +<dt>edge_table</dt> +<dd>TEXT. Name of the table containing the edge data. The edge table must +contain columns for source vertex, destination vertex and edge weight. +Column naming convention is described below in the 'edge_args' parameter.</dd> + +<dt>edge_args</dt> +<dd>TEXT. A comma-delimited string containing multiple named arguments of +the form "name=value". The following parameters are supported for +this string argument: + - src (INTEGER): Name of the column containing the source vertex ids in the + edge table. Default column name is 'src'. + - dest (INTEGER): Name of the column containing the destination vertex ids + in the edge table. Default column name is 'dest'. + - weight (FLOAT8): Name of the column containing the edge weights in the + edge table. Default column name is 'weight'.</dd> + +<dt>out_table</dt> +<dd>TEXT. Name of the table to store the result. +It contains a row for every vertex of every group and has +the following columns (in addition to the grouping columns): + - vertex: The id for the source vertex. Will use the input vertex + column 'id' for column naming. + - indegree: Number of incoming edges to the vertex. + - outdegree: Number of outgoing edges from the vertex. + +<dt>grouping_cols</dt> +<dd>TEXT, default = NULL. List of columns used to group the input into discrete +subgraphs. These columns must exist in the edge table. When this value is null, +no grouping is used and a single result is generated. </dd> +</dl> + +@anchor examples +@examp + +-# Create vertex and edge tables to represent the graph: +<pre class="syntax"> +DROP TABLE IF EXISTS vertex, edge; +CREATE TABLE vertex( + id INTEGER, + name TEXT + ); +CREATE TABLE edge( + src_id INTEGER, + dest_id INTEGER, + edge_weight FLOAT8 + ); +INSERT INTO vertex VALUES +(0, 'A'), +(1, 'B'), +(2, 'C'), +(3, 'D'), +(4, 'E'), +(5, 'F'), +(6, 'G'), +(7, 'H'); +INSERT INTO edge VALUES +(0, 1, 1.0), +(0, 2, 1.0), +(0, 4, 10.0), +(1, 2, 2.0), +(1, 3, 10.0), +(2, 3, 1.0), +(2, 5, 1.0), +(2, 6, 3.0), +(3, 0, 1.0), +(4, 0, -2.0), +(5, 6, 1.0), +(6, 7, 1.0); +</pre> + +-# Calculate the in-out degrees for each node: +<pre class="syntax"> +DROP TABLE IF EXISTS degrees; +SELECT madlib.graph_vertex_degrees( + 'vertex', -- Vertex table + 'id', -- Vertix id column (NULL means use default naming) + 'edge', -- Edge table + 'src=src_id, dest=dest_id, weight=edge_weight', + 'degrees'); -- Output table of shortest paths +SELECT * FROM degrees ORDER BY id; +</pre> +<pre class="result"> + id | indegree | outdegree +\----+----------+----------- + 0 | 2 | 3 + 1 | 1 | 2 + 2 | 2 | 3 + 3 | 2 | 1 + 4 | 1 | 1 + 5 | 1 | 1 + 6 | 2 | 1 +(7 rows) +</pre> + +-# Create a graph with 2 groups and find degrees for each group: +<pre class="syntax"> +DROP TABLE IF EXISTS edge_gr; +CREATE TABLE edge_gr AS +( + SELECT *, 0 AS grp FROM edge + UNION + SELECT *, 1 AS grp FROM edge WHERE src_id < 6 AND dest_id < 6 +); +INSERT INTO edge_gr VALUES +(4,5,-20,1); +</pre> + +-# Find APSP for all groups: +<pre class="syntax"> +DROP TABLE IF EXISTS out_gr; +SELECT madlib.graph_vertex_degrees( + 'vertex', -- Vertex table + NULL, -- Vertex id column (NULL means use default naming) + 'edge_gr', -- Edge table + 'src=src_id, dest=dest_id, weight=edge_weight', + 'out_gr', -- Output table of shortest paths + 'grp' -- Grouping columns +); +SELECT * FROM out_gr WHERE id < 2 ORDER BY grp, id; +</pre> +<pre class="result"> + grp | id | indegree | outdegree +\-------+------+------------+------------- + 0 | 0 | 2 | 3 + 0 | 1 | 1 | 2 + 1 | 0 | 2 | 3 + 1 | 1 | 1 | 2 +(4 rows) +</pre> +*/ + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_vertex_degrees( + vertex_table TEXT, + vertex_id TEXT, + edge_table TEXT, + edge_args TEXT, + out_table TEXT, + grouping_cols TEXT +) RETURNS VOID AS $$ + PythonFunction(graph, measures, graph_vertex_degrees) +$$ LANGUAGE plpythonu VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); +------------------------------------------------------------------------------- + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_vertex_degrees( + vertex_table TEXT, + vertex_id TEXT, + edge_table TEXT, + edge_args TEXT, + out_table TEXT +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.graph_vertex_degrees($1, $2, $3, $4, $5, NULL); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); +------------------------------------------------------------------------------- + +-- Online help +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_vertex_degrees( + message VARCHAR +) RETURNS VARCHAR AS $$ + PythonFunction(graph, measures, graph_vertex_degrees_help) +$$ LANGUAGE plpythonu IMMUTABLE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); + +-------------------------------------------------------------------------------- + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_vertex_degrees() +RETURNS VARCHAR AS $$ + SELECT MADLIB_SCHEMA.graph_vertex_degrees(''); +$$ LANGUAGE sql IMMUTABLE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/06788cc4/src/ports/postgres/modules/graph/test/measures.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/graph/test/measures.sql_in b/src/ports/postgres/modules/graph/test/measures.sql_in new file mode 100644 index 0000000..b0bf95e --- /dev/null +++ b/src/ports/postgres/modules/graph/test/measures.sql_in @@ -0,0 +1,150 @@ +/* ----------------------------------------------------------------------- *//** + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + */ +/* ----------------------------------------------------------------------- */ + +-- Create vertex and edge tables to represent the graph: +DROP TABLE IF EXISTS vertex, edge; +CREATE TABLE vertex( + id INTEGER, + name TEXT + ); +CREATE TABLE edge( + src_id INTEGER, + dest_id INTEGER, + edge_weight FLOAT8 + ); +INSERT INTO vertex VALUES +(0, 'A'), +(1, 'B'), +(2, 'C'), +(3, 'D'), +(4, 'E'), +(5, 'F'), +(6, 'G'), +(7, 'H'); +INSERT INTO edge VALUES +(0, 1, 1.0), +(0, 2, 1.0), +(0, 4, 10.0), +(1, 2, 2.0), +(1, 3, 10.0), +(2, 3, 1.0), +(2, 5, 1.0), +(2, 6, 3.0), +(3, 0, 1.0), +(4, 0, -2.0), +(5, 6, 1.0), +(6, 7, 1.0); + +-- Calculate the all-pair shortest paths: +DROP TABLE IF EXISTS out_apsp, out_apsp_summary; +SELECT graph_apsp('vertex', -- Vertex table + 'id', -- Vertix id column (NULL means use default naming) + 'edge', -- Edge table + 'src=src_id, dest=dest_id, weight=edge_weight', + -- Edge arguments (NULL means use default naming) + 'out_apsp'); -- Output table of shortest paths + +-- Compute the closeness measure for all nodes: +DROP TABLE IF EXISTS out_closeness; +SELECT graph_closeness('out_apsp', 'out_closeness'); +SELECT * FROM out_closeness; + +SELECT assert(relative_error(inverse_sum_dist, 0.04347) < 1e-2 and + relative_error(inverse_avg_dist, 0.3043) < 1e-2 and + relative_error(sum_inverse_dist, 3.6833) < 1e-2 and + k_degree = 7, + 'Incorrect value for closeness') +FROM out_closeness +WHERE src_id = 0; + +-- Compute the diameter measure for graph +DROP TABLE IF EXISTS out_diameter; +SELECT graph_diameter('out_apsp', 'out_diameter'); +SELECT * FROM out_diameter; +SELECT assert(diameter=14, 'Invalid value for diameter') FROM out_diameter; + +-- Compute the average path length measure for graph +DROP TABLE IF EXISTS out_avg_path_length; +SELECT graph_avg_path_length('out_apsp', 'out_avg_path_length'); +SELECT * FROM out_avg_path_length; +SELECT assert(relative_error(avg_path_length, 2.0178) < 1e-2, + 'Invalid value for avg_path_length') FROM out_avg_path_length; + +-- Compute the in and out degrees +DROP TABLE IF EXISTS out_degrees; +SELECT graph_vertex_degrees('vertex', -- Vertex table + 'id', -- Vertix id column (NULL means use default naming) + 'edge', -- Edge table + 'src=src_id, dest=dest_id, weight=edge_weight', + -- Edge arguments (NULL means use default naming) + 'out_degrees'); +SELECT * FROM out_degrees; +SELECT assert(indegree = 2 and outdegree = 3, 'Invalid value for degrees') +FROM out_degrees +WHERE id = 0; +------------------------------------------------------------------------- +-- Grouping ----------------------------------------------------------- +------------------------------------------------------------------------------ + +-- Create a graph with 2 groups and find APSP for each group: +DROP TABLE IF EXISTS edge_gr; +CREATE TABLE edge_gr AS +( + SELECT *, 0 AS grp FROM edge + UNION + SELECT *, 1 AS grp FROM edge WHERE src_id < 6 AND dest_id < 6 +); +INSERT INTO edge_gr VALUES +(4,5,-20,1); + +-- Find APSP for all groups: +DROP TABLE IF EXISTS out_apsp, out_apsp_summary; +SELECT graph_apsp('vertex', -- Vertex table + 'id', -- Vertex id column (NULL means use default naming) + 'edge_gr', -- Edge table + 'src=src_id, dest=dest_id, weight=edge_weight', + 'out_apsp', -- Output table of shortest paths + 'grp' -- Grouping columns +); + +DROP TABLE IF EXISTS out_closeness; +SELECT graph_closeness('out_apsp', 'out_closeness'); +SELECT * FROM out_closeness ORDER BY grp, src_id; + +DROP TABLE IF EXISTS out_diameter; +SELECT graph_diameter('out_apsp', 'out_diameter'); +SELECT * FROM out_diameter ORDER BY grp; + +-- Compute the closeness measure for all nodes: +DROP TABLE IF EXISTS out; +SELECT graph_avg_path_length('out_apsp', 'out'); +SELECT * FROM out ORDER BY grp; + +-- Compute the closeness measure for all nodes: +DROP TABLE IF EXISTS out_degrees; +SELECT graph_vertex_degrees('vertex', -- Vertex table + 'id', -- Vertix id column (NULL means use default naming) + 'edge_gr', -- Edge table + 'src=src_id, dest=dest_id, weight=edge_weight', + 'out_degrees', + 'grp'); +SELECT * FROM out_degrees ORDER BY grp, id;