Feature: PageRank JIRA: MADLIB-1069
- Introduces a new module that computes the PageRank of all nodes in a directed graph. - Implements the original PageRank algorithm that assumes a random surfer model (https://en.wikipedia.org/wiki/PageRank#Damping_factor) - This version does not perform convergence test yet, so the PageRank computation runs through all iterations. Exiting on convergence will be handled as part of MADLIB-1082. The threshold parameter specified will be ignored. Closes #109 Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/6b466ea6 Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/6b466ea6 Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/6b466ea6 Branch: refs/heads/latest_release Commit: 6b466ea6d19731e8500cc89058d1a77bf778121a Parents: d344f1f Author: Nandish Jayaram <njaya...@apache.org> Authored: Thu Mar 16 12:02:40 2017 -0700 Committer: Nandish Jayaram <njaya...@apache.org> Committed: Thu Mar 30 17:06:17 2017 -0700 ---------------------------------------------------------------------- doc/mainpage.dox.in | 3 + .../postgres/modules/graph/graph_utils.py_in | 8 +- src/ports/postgres/modules/graph/pagerank.py_in | 288 +++++++++++++++++++ .../postgres/modules/graph/pagerank.sql_in | 271 +++++++++++++++++ .../postgres/modules/graph/test/pagerank.sql_in | 62 ++++ 5 files changed, 628 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6b466ea6/doc/mainpage.dox.in ---------------------------------------------------------------------- diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in index 9131c10..94950e7 100644 --- a/doc/mainpage.dox.in +++ b/doc/mainpage.dox.in @@ -122,6 +122,9 @@ complete matrix stored as a distributed table. @ingroup grp_datatrans @defgroup grp_graph Graph @{Contains graph algorithms. @} + @defgroup grp_pagerank PageRank + @ingroup grp_graph + @defgroup grp_sssp Single Source Shortest Path @ingroup grp_graph http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6b466ea6/src/ports/postgres/modules/graph/graph_utils.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/graph/graph_utils.py_in b/src/ports/postgres/modules/graph/graph_utils.py_in index fb43491..2d83301 100644 --- a/src/ports/postgres/modules/graph/graph_utils.py_in +++ b/src/ports/postgres/modules/graph/graph_utils.py_in @@ -69,11 +69,11 @@ def validate_graph_coding(vertex_table, vertex_id, edge_table, edge_params, existing_cols = set(unquote_ident(i) for i in get_cols(vertex_table)) _assert(vertex_id in existing_cols, - """Graph {func_name}: The vertex column {vertex_id} is not present in - vertex table ({vertex_table}) """.format(**locals())) + """Graph {func_name}: The vertex column {vertex_id} is not present in vertex table ({vertex_table}) """. + format(**locals())) _assert(columns_exist_in_table(edge_table, edge_params.values()), - """Graph {func_name}: Not all columns from {cols} present in edge - table ({edge_table})""".format(cols=edge_params.values(), **locals())) + """Graph {func_name}: Not all columns from {cols} present in edge table ({edge_table})""". + format(cols=edge_params.values(), **locals())) return None http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6b466ea6/src/ports/postgres/modules/graph/pagerank.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/graph/pagerank.py_in b/src/ports/postgres/modules/graph/pagerank.py_in new file mode 100644 index 0000000..13cdcc5 --- /dev/null +++ b/src/ports/postgres/modules/graph/pagerank.py_in @@ -0,0 +1,288 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# PageRank + +# Please refer to the pagerank.sql_in file for the documentation + +""" +@file pagerank.py_in + +@namespace graph +""" + +import plpy +from utilities.control import MinWarning +from utilities.utilities import _assert +from utilities.utilities import extract_keyvalue_params +from utilities.utilities import unique_string +from utilities.control import IterationController2S +from graph_utils import * + +import time + +m4_changequote(`<!', `!>') + +def validate_pagerank_args(vertex_table, vertex_id, edge_table, edge_params, + out_table, damping_factor, max_iter, threshold, module_name): + """ + Function to validate input parameters for PageRank + """ + validate_graph_coding(vertex_table, vertex_id, edge_table, edge_params, + out_table, module_name) + _assert(damping_factor >= 0.0 and damping_factor <= 1.0, + """PageRank: Invalid damping factor value ({0}), must be between 0 and 1.""" + .format(damping_factor)) + _assert(threshold >= 0.0 and threshold <= 1.0, + """PageRank: Invalid threshold value ({0}), must be between 0 and 1.""" + .format(threshold)) + _assert(max_iter > 0, + """PageRank: Invalid max_iter value ({0}), must be a positive integer. """ + .format(max_iter)) + +def pagerank(schema_madlib, vertex_table, vertex_id, edge_table, edge_args, + out_table, damping_factor, max_iter, threshold, **kwargs): + """ + Function that computes the PageRank + + Args: + @param vertex_table + @param vertex_id + @param edge_table + @param source_vertex + @param dest_vertex + @param out_table + @param damping_factor + @param max_iter + @param threshold + """ + old_msg_level = plpy.execute(""" + SELECT setting + FROM pg_settings + WHERE name='client_min_messages' + """)[0]['setting'] + plpy.execute('SET client_min_messages TO warning') + params_types = {'src': str, 'dest': str} + default_args = {'src': 'src', 'dest': 'dest'} + edge_params = extract_keyvalue_params(edge_args, params_types, default_args) + + # populate default values for optional params if null + if damping_factor is None: + damping_factor = 0.85 + if max_iter is None: + max_iter = 100 + if threshold is None: + threshold = 0.00001 + if vertex_id is None: + vertex_id = "id" + validate_pagerank_args(vertex_table, vertex_id, edge_table, edge_params, + out_table, damping_factor, max_iter, threshold, 'PageRank') + src = edge_params["src"] + dest = edge_params["dest"] + + edge_temp_table = unique_string(desp='temp_edge') + distribution = m4_ifdef(<!__POSTGRESQL__!>, <!''!>, + <!"DISTRIBUTED BY ({0})".format(dest)!>) + plpy.execute(""" + DROP TABLE IF EXISTS {edge_temp_table}; + CREATE TEMP TABLE {edge_temp_table} AS + SELECT * FROM {edge_table} + {distribution} + """.format(**locals())) + # GPDB and HAWQ have distributed by clauses to help them with indexing. + # For Postgres we add the indices manually. + sql_index = m4_ifdef(<!__POSTGRESQL__!>, + <!"""CREATE INDEX ON {edge_temp_table} ({src}); + """.format(**locals())!>, + <!''!>) + plpy.execute(sql_index) + + nvertices = plpy.execute(""" + SELECT COUNT({0}) AS cnt + FROM {1} + """.format(vertex_id, vertex_table))[0]["cnt"] + init_value = 1.0/nvertices + random_prob = (1.0-damping_factor)/nvertices + cur = unique_string(desp='cur') + message = unique_string(desp='message') + plpy.execute(""" + CREATE TEMP TABLE {cur} AS + SELECT {vertex_id}, {init_value}::DOUBLE PRECISION AS pagerank + FROM {vertex_table} + """.format(**locals())) + v1 = unique_string(desp='v1') + + out_cnts = unique_string(desp='out_cnts') + out_cnts_cnt = unique_string(desp='cnt') + # Compute the out-degree of every node in the graph. + cnts_distribution = m4_ifdef(<!__POSTGRESQL__!>, <!''!>, + <!"DISTRIBUTED BY ({0})".format(vertex_id)!>) + + plpy.execute(""" + DROP TABLE IF EXISTS {out_cnts}; + CREATE TEMP TABLE {out_cnts} AS + SELECT {src} AS {vertex_id}, COUNT({dest}) AS {out_cnts_cnt} + FROM {edge_table} + GROUP BY {src} + {cnts_distribution} + """.format(**locals())) + + for i in range(max_iter): + ##################################################################### + # PageRank for node 'A' at any given iteration 'i' is given by: + # PR_i(A) = damping_factor(PR_i-1(B)/degree(B) + PR_i-1(C)/degree(C) + ...) + (1-damping_factor)/N + # where 'N' is the number of vertices in the graph, + # B, C are nodes that have edges to node A, and + # degree(node) represents the number of outgoing edges from 'node' + ##################################################################### + # Essentially, the pagerank for a node is based on an aggregate of a + # fraction of the pagerank values of all the nodes that have incoming + # edges to it, along with a small random probability. + # More information can be found at: + # https://en.wikipedia.org/wiki/PageRank#Damping_factor + + # The query below computes the PageRank of each node using the above formula. + plpy.execute(""" + CREATE TABLE {message} AS + SELECT {edge_temp_table}.{dest} AS {vertex_id}, + SUM({v1}.pagerank/{out_cnts}.{out_cnts_cnt})*{damping_factor}+{random_prob} AS pagerank + FROM {edge_temp_table} + INNER JOIN {cur} ON {edge_temp_table}.{dest}={cur}.{vertex_id} + INNER JOIN {out_cnts} ON {out_cnts}.{vertex_id}={edge_temp_table}.{src} + INNER JOIN {cur} AS {v1} ON {v1}.{vertex_id}={edge_temp_table}.{src} + GROUP BY {edge_temp_table}.{dest} + """.format(**locals())) + # If there are nodes that have no incoming edges, they are not captured in the message table. + # Insert entries for such nodes, with random_prob. + plpy.execute(""" + INSERT INTO {message} + SELECT {vertex_id}, {random_prob}::DOUBLE PRECISION AS pagerank + FROM {cur} + WHERE {vertex_id} NOT IN ( + SELECT {vertex_id} + FROM {message} + ) + """.format(**locals())) + # Check for convergence will be done as part of grouping support for pagerank: + # https://issues.apache.org/jira/browse/MADLIB-1082. So, the threshold parameter + # is a dummy variable at the moment, the PageRank computation happens for + # {max_iter} number of times. + plpy.execute(""" + DROP TABLE IF EXISTS {cur}; + ALTER TABLE {message} RENAME TO {cur} + """.format(**locals())) + + plpy.execute("ALTER TABLE {cur} RENAME TO {out_table}".format(**locals())) + + ## Step 4: Cleanup + plpy.execute(""" + DROP TABLE IF EXISTS {0},{1},{2},{3}; + """.format(out_cnts, edge_temp_table, cur, message)) + plpy.execute("SET client_min_messages TO %s" % old_msg_level) + +def pagerank_help(schema_madlib, message, **kwargs): + """ + Help function for pagerank + + Args: + @param schema_madlib + @param message: string, Help message string + @param kwargs + + Returns: + String. Help/usage information + """ + if message is not None and \ + message.lower() in ("usage", "help", "?"): + help_string = "Get from method below" + help_string = get_graph_usage(schema_madlib, 'PageRank', + """out_table TEXT, -- Name of the output table for PageRank + damping_factor, DOUBLE PRECISION, -- Damping factor in random surfer model + -- (DEFAULT = 0.85) + max_iter, INTEGER, -- Maximum iteration number (DEFAULT = 100) + threshold DOUBLE PRECISION -- Stopping criteria (DEFAULT = 1e-5) +""") + else: + if message is not None and \ + message.lower() in ("example", "examples"): + help_string = """ +---------------------------------------------------------------------------- + EXAMPLES +---------------------------------------------------------------------------- +-- Create a graph, represented as vertex and edge tables. +DROP TABLE IF EXISTS vertex, edge; +CREATE TABLE vertex( + id INTEGER + ); +CREATE TABLE edge( + src INTEGER, + dest INTEGER + ); +INSERT INTO vertex VALUES +(0), +(1), +(2), +(3), +(4), +(5), +(6); +INSERT INTO edge VALUES +(0, 1), +(0, 2), +(0, 4), +(1, 2), +(1, 3), +(2, 3), +(2, 5), +(2, 6), +(3, 0), +(4, 0), +(5, 6), +(6, 3); + +-- Compute the PageRank: +DROP TABLE IF EXISTS pagerank_out; +SELECT madlib.pagerank( + 'vertex', -- Vertex table + 'id', -- Vertix id column + 'edge', -- Edge table + 'src=src, dest=dest', -- Comma delimted string of edge arguments + 'pagerank_out') -- Output table of PageRank + +-- View the PageRank of all vertices, sorted by their scores. +SELECT * FROM pagerank_out ORDER BY pagerank desc; +""" + else: + help_string = """ +---------------------------------------------------------------------------- + SUMMARY +---------------------------------------------------------------------------- +Given a directed graph, pagerank algorithm finds the PageRank score of all +the vertices in the graph. +-- +For an overview on usage, run: +SELECT {schema_madlib}.pagerank('usage'); + +For some examples, run: +SELECT {schema_madlib}.pagerank('example') +-- +""" + + return help_string.format(schema_madlib=schema_madlib) +# --------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6b466ea6/src/ports/postgres/modules/graph/pagerank.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/graph/pagerank.sql_in b/src/ports/postgres/modules/graph/pagerank.sql_in new file mode 100644 index 0000000..712d146 --- /dev/null +++ b/src/ports/postgres/modules/graph/pagerank.sql_in @@ -0,0 +1,271 @@ +/* ----------------------------------------------------------------------- *//** + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * + * @file graph.sql_in + * + * @brief SQL functions for graph analytics + * @date Nov 2016 + * + * @sa Provides various graph algorithms. + * + *//* ----------------------------------------------------------------------- */ +m4_include(`SQLCommon.m4') + + +/** +@addtogroup grp_pagerank + +<div class="toc"><b>Contents</b> +<ul> +<li><a href="#pagerank">PageRank</a></li> +<li><a href="#notes">Notes</a></li> +<li><a href="#examples">Examples</a></li> +<li><a href="#literature">Literature</a></li> +</ul> +</div> + +@brief Find the PageRank of all vertices in a directed graph. + +Given a graph, the PageRank algorithm outputs a probability distribution representing the +likelihood that a person randomly traversing the graph will arrive at any particular vertex. +This algorithm was originally used by Google to rank websites where the World Wide Web was +modeled as a directed graph with the vertices representing the websites. + +@anchor pagerank +@par PageRank +<pre class="syntax"> +pagerank( vertex_table, + vertex_id, + edge_table, + edge_args, + out_table, + damping_factor, + max_iter, + threshold + ) +</pre> + +\b Arguments +<dl class="arglist"> +<dt>vertex_table</dt> +<dd>TEXT. Name of the table containing the vertex data for the graph. Must contain the +column specified in the 'vertex_id' parameter below.</dd> + +<dt>vertex_id</dt> +<dd>TEXT, default = 'id'. Name of the column in 'vertex_table' containing +vertex ids. The vertex ids are of type INTEGER with no duplicates. +They do not need to be contiguous.</dd> + +<dt>edge_table</dt> +<dd>TEXT. Name of the table containing the edge data. The edge table must +contain columns for source vertex and destination vertex.</dd> + +<dt>edge_args</dt> +<dd>TEXT. A comma-delimited string containing multiple named arguments of +the form "name=value". The following parameters are supported for +this string argument: + - src (INTEGER): Name of the column containing the source vertex ids in the edge table. + Default column name is 'src'. + - dest (INTEGER): Name of the column containing the destination vertex ids in the edge table. + Default column name is 'dest'.</dd> + +<dt>out_table</dt> +<dd>TEXT. Name of the table to store the result of PageRank. +It will contain a row for every vertex from 'vertex_table' with +the following columns: + - vertex_id : The id of a vertex. Will use the input parameter 'vertex_id' for column naming. + - pagerank : The vertex's PageRank.</dd> + +<dt>damping_factor</dt> +<dd>FLOAT8, default 0.85. The probability, at any step, that a user will continue following the links in a random surfer model.</dd> + +<dt>max_iter</dt> +<dd>INTEGER, default: 100. The maximum number of iterations allowed.</dd> + +<dt>threshold</dt> +<dd>FLOAT8, default: 1e-5. If the difference between the PageRank of every vertex of two consecutive +iterations is smaller than 'threshold', or the iteration number is larger than 'max_iter', the +computation stops. If you set the threshold to zero, then you will force the algorithm to run for the full number of iterations specified in 'max_iter'.</dd> + +</dl> + +@anchor notes +@par Notes + +The PageRank algorithm proposed by Larry Page and Sergey Brin is used [1]. + +@anchor examples +@examp + +-# Create vertex and edge tables to represent the graph: +<pre class="syntax"> +DROP TABLE IF EXISTS vertex, edge; +CREATE TABLE vertex( + id INTEGER + ); +CREATE TABLE edge( + src INTEGER, + dest INTEGER + ); +INSERT INTO vertex VALUES +(0), +(1), +(2), +(3), +(4), +(5), +(6); +INSERT INTO edge VALUES +(0, 1), +(0, 2), +(0, 4), +(1, 2), +(1, 3), +(2, 3), +(2, 5), +(2, 6), +(3, 0), +(4, 0), +(5, 6), +(6, 3); +</pre> + +-# Compute the PageRank: +<pre class="syntax"> +DROP TABLE IF EXISTS pagerank_out; +SELECT madlib.pagerank( + 'vertex', -- Vertex table + 'id', -- Vertix id column + 'edge', -- Edge table + 'src=src, dest=dest', -- Comma delimted string of edge arguments + 'pagerank_out'); -- Output table of PageRank +SELECT * FROM pagerank_out ORDER BY pagerank desc; +</pre> +<pre class="result"> + id | pagerank +----+-------------------- + 0 | 0.278256122055856 + 3 | 0.201882680839737 + 2 | 0.142878491945534 + 6 | 0.114538731993905 + 1 | 0.100266150276761 + 4 | 0.100266150276761 + 5 | 0.061911672611445 +(7 rows) +</pre> + +-# Run PageRank with a damping factor of 0.5 results in different final values: +<pre class="syntax"> +DROP TABLE IF EXISTS pagerank_out; +SELECT madlib.pagerank( + 'vertex', -- Vertex table + 'id', -- Vertix id column + 'edge', -- Edge table + 'src=src, dest=dest', -- Comma delimted string of edge arguments + 'pagerank_out', -- Output table of PageRank + 0.5); -- Damping factor +SELECT * FROM pagerank_out ORDER BY pagerank desc; +</pre> +<pre class="result"> + id | pagerank +----+------------------- + 0 | 0.221378135793372 + 3 | 0.191574922960784 + 6 | 0.140994575864846 + 2 | 0.135406336658892 + 4 | 0.108324751971412 + 1 | 0.108324751971412 + 5 | 0.093996524779681 +(7 rows) +</pre> + +@anchor literature +@par Literature + +[1] PageRank algorithm. https://en.wikipedia.org/wiki/PageRank +*/ + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank( + vertex_table TEXT, + vertex_id TEXT, + edge_table TEXT, + edge_args TEXT, + out_table TEXT, + damping_factor FLOAT8, + max_iter INTEGER, + threshold FLOAT8 +) RETURNS VOID AS $$ + PythonFunction(graph, pagerank, pagerank) +$$ LANGUAGE plpythonu VOLATILE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); +------------------------------------------------------------------------- +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank( + vertex_table TEXT, + vertex_id TEXT, + edge_table TEXT, + edge_args TEXT, + out_table TEXT, + damping_factor FLOAT8, + max_iter INTEGER +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, $6, $7, 0.00001) +$$ LANGUAGE SQL +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); +------------------------------------------------------------------------- +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank( + vertex_table TEXT, + vertex_id TEXT, + edge_table TEXT, + edge_args TEXT, + out_table TEXT, + damping_factor FLOAT8 +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, $6, 100, 0.00001) +$$ LANGUAGE SQL +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); +------------------------------------------------------------------------- +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank( + vertex_table TEXT, + vertex_id TEXT, + edge_table TEXT, + edge_args TEXT, + out_table TEXT +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, 0.85, 100, 0.00001) +$$ LANGUAGE SQL +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); +------------------------------------------------------------------------- + +-- Online help +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank( + message VARCHAR +) RETURNS VARCHAR AS $$ + PythonFunction(graph, pagerank, pagerank_help) +$$ LANGUAGE plpythonu IMMUTABLE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); + +-------------------------------------------------------------------------------- + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank() +RETURNS VARCHAR AS $$ + SELECT MADLIB_SCHEMA.pagerank(''); +$$ LANGUAGE sql IMMUTABLE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); +-------------------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6b466ea6/src/ports/postgres/modules/graph/test/pagerank.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/graph/test/pagerank.sql_in b/src/ports/postgres/modules/graph/test/pagerank.sql_in new file mode 100644 index 0000000..1d695e2 --- /dev/null +++ b/src/ports/postgres/modules/graph/test/pagerank.sql_in @@ -0,0 +1,62 @@ +/* ----------------------------------------------------------------------- *//** + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + *//* ----------------------------------------------------------------------- */ + +DROP TABLE IF EXISTS vertex, edge, pagerank_out; +CREATE TABLE vertex( + id INTEGER + ); +CREATE TABLE edge( + src INTEGER, + dest INTEGER + ); +INSERT INTO vertex VALUES +(0), +(1), +(2), +(3), +(4), +(5), +(6); +INSERT INTO edge VALUES +(0, 1), +(0, 2), +(0, 4), +(1, 2), +(1, 3), +(2, 3), +(2, 5), +(2, 6), +(3, 0), +(4, 0), +(5, 6), +(6, 3); + +SELECT madlib.pagerank( + 'vertex', -- Vertex table + 'id', -- Vertix id column + 'edge', -- Edge table + 'src=src, dest=dest', -- Edge args + 'pagerank_out'); -- Output table of PageRank + +-- View the PageRank of all vertices, sorted by their scores. +SELECT assert(relative_error(SUM(pagerank), 1) < 0.00001, + 'PageRank: Scores do not sum up to 1.' + ) FROM pagerank_out;