Feature: PageRank

JIRA: MADLIB-1069

- Introduces a new module that computes the PageRank of all nodes
in a directed graph.
- Implements the original PageRank algorithm that assumes a random
surfer model (https://en.wikipedia.org/wiki/PageRank#Damping_factor)
- This version does not perform convergence test yet, so the PageRank
computation runs through all iterations. Exiting on convergence
will be handled as part of MADLIB-1082. The threshold parameter
specified will be ignored.

Closes #109


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/6b466ea6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/6b466ea6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/6b466ea6

Branch: refs/heads/latest_release
Commit: 6b466ea6d19731e8500cc89058d1a77bf778121a
Parents: d344f1f
Author: Nandish Jayaram <njaya...@apache.org>
Authored: Thu Mar 16 12:02:40 2017 -0700
Committer: Nandish Jayaram <njaya...@apache.org>
Committed: Thu Mar 30 17:06:17 2017 -0700

----------------------------------------------------------------------
 doc/mainpage.dox.in                             |   3 +
 .../postgres/modules/graph/graph_utils.py_in    |   8 +-
 src/ports/postgres/modules/graph/pagerank.py_in | 288 +++++++++++++++++++
 .../postgres/modules/graph/pagerank.sql_in      | 271 +++++++++++++++++
 .../postgres/modules/graph/test/pagerank.sql_in |  62 ++++
 5 files changed, 628 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6b466ea6/doc/mainpage.dox.in
----------------------------------------------------------------------
diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index 9131c10..94950e7 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -122,6 +122,9 @@ complete matrix stored as a distributed table.
         @ingroup grp_datatrans
 @defgroup grp_graph Graph
 @{Contains graph algorithms. @}
+    @defgroup grp_pagerank PageRank
+    @ingroup grp_graph
+
     @defgroup grp_sssp Single Source Shortest Path
     @ingroup grp_graph
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6b466ea6/src/ports/postgres/modules/graph/graph_utils.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/graph_utils.py_in 
b/src/ports/postgres/modules/graph/graph_utils.py_in
index fb43491..2d83301 100644
--- a/src/ports/postgres/modules/graph/graph_utils.py_in
+++ b/src/ports/postgres/modules/graph/graph_utils.py_in
@@ -69,11 +69,11 @@ def validate_graph_coding(vertex_table, vertex_id, 
edge_table, edge_params,
 
        existing_cols = set(unquote_ident(i) for i in get_cols(vertex_table))
        _assert(vertex_id in existing_cols,
-               """Graph {func_name}: The vertex column {vertex_id} is not 
present in
-               vertex table ({vertex_table}) """.format(**locals()))
+               """Graph {func_name}: The vertex column {vertex_id} is not 
present in vertex table ({vertex_table}) """.
+               format(**locals()))
        _assert(columns_exist_in_table(edge_table, edge_params.values()),
-               """Graph {func_name}: Not all columns from {cols} present in 
edge
-               table ({edge_table})""".format(cols=edge_params.values(), 
**locals()))
+               """Graph {func_name}: Not all columns from {cols} present in 
edge table ({edge_table})""".
+               format(cols=edge_params.values(), **locals()))
 
        return None
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6b466ea6/src/ports/postgres/modules/graph/pagerank.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/pagerank.py_in 
b/src/ports/postgres/modules/graph/pagerank.py_in
new file mode 100644
index 0000000..13cdcc5
--- /dev/null
+++ b/src/ports/postgres/modules/graph/pagerank.py_in
@@ -0,0 +1,288 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# PageRank
+
+# Please refer to the pagerank.sql_in file for the documentation
+
+"""
+@file pagerank.py_in
+
+@namespace graph
+"""
+
+import plpy
+from utilities.control import MinWarning
+from utilities.utilities import _assert
+from utilities.utilities import extract_keyvalue_params
+from utilities.utilities import unique_string
+from utilities.control import IterationController2S
+from graph_utils import *
+
+import time
+
+m4_changequote(`<!', `!>')
+
+def validate_pagerank_args(vertex_table, vertex_id, edge_table, edge_params,
+        out_table, damping_factor, max_iter, threshold, module_name):
+    """
+    Function to validate input parameters for PageRank
+    """
+    validate_graph_coding(vertex_table, vertex_id, edge_table, edge_params,
+        out_table, module_name)
+    _assert(damping_factor >= 0.0 and damping_factor <= 1.0,
+        """PageRank: Invalid damping factor value ({0}), must be between 0 and 
1."""
+        .format(damping_factor))
+    _assert(threshold >= 0.0 and threshold <= 1.0,
+        """PageRank: Invalid threshold value ({0}), must be between 0 and 1."""
+        .format(threshold))
+    _assert(max_iter > 0,
+        """PageRank: Invalid max_iter value ({0}), must be a positive integer. 
"""
+        .format(max_iter))
+
+def pagerank(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
+    out_table, damping_factor, max_iter, threshold, **kwargs):
+    """
+    Function that computes the PageRank
+
+    Args:
+        @param vertex_table
+        @param vertex_id
+        @param edge_table
+        @param source_vertex
+        @param dest_vertex
+        @param out_table
+        @param damping_factor
+        @param max_iter
+        @param threshold
+    """
+    old_msg_level = plpy.execute("""
+                                  SELECT setting
+                                  FROM pg_settings
+                                  WHERE name='client_min_messages'
+                                  """)[0]['setting']
+    plpy.execute('SET client_min_messages TO warning')
+    params_types = {'src': str, 'dest': str}
+    default_args = {'src': 'src', 'dest': 'dest'}
+    edge_params = extract_keyvalue_params(edge_args, params_types, 
default_args)
+
+    # populate default values for optional params if null
+    if damping_factor is None:
+        damping_factor = 0.85
+    if max_iter is None:
+        max_iter = 100
+    if threshold is None:
+        threshold = 0.00001
+    if vertex_id is None:
+        vertex_id = "id"
+    validate_pagerank_args(vertex_table, vertex_id, edge_table, edge_params,
+        out_table, damping_factor, max_iter, threshold, 'PageRank')
+    src = edge_params["src"]
+    dest = edge_params["dest"]
+
+    edge_temp_table = unique_string(desp='temp_edge')
+    distribution = m4_ifdef(<!__POSTGRESQL__!>, <!''!>,
+        <!"DISTRIBUTED BY ({0})".format(dest)!>)
+    plpy.execute("""
+        DROP TABLE IF EXISTS {edge_temp_table};
+        CREATE TEMP TABLE {edge_temp_table} AS
+        SELECT * FROM {edge_table}
+        {distribution}
+        """.format(**locals()))
+    # GPDB and HAWQ have distributed by clauses to help them with indexing.
+    # For Postgres we add the indices manually.
+    sql_index = m4_ifdef(<!__POSTGRESQL__!>,
+        <!"""CREATE INDEX ON {edge_temp_table} ({src});
+        """.format(**locals())!>,
+        <!''!>)
+    plpy.execute(sql_index)
+
+    nvertices = plpy.execute("""
+            SELECT COUNT({0}) AS cnt
+            FROM {1}
+        """.format(vertex_id, vertex_table))[0]["cnt"]
+    init_value = 1.0/nvertices
+    random_prob = (1.0-damping_factor)/nvertices
+    cur = unique_string(desp='cur')
+    message = unique_string(desp='message')
+    plpy.execute("""
+            CREATE TEMP TABLE {cur} AS
+            SELECT {vertex_id}, {init_value}::DOUBLE PRECISION AS pagerank
+            FROM {vertex_table}
+        """.format(**locals()))
+    v1 = unique_string(desp='v1')
+
+    out_cnts = unique_string(desp='out_cnts')
+    out_cnts_cnt = unique_string(desp='cnt')
+    # Compute the out-degree of every node in the graph.
+    cnts_distribution = m4_ifdef(<!__POSTGRESQL__!>, <!''!>,
+        <!"DISTRIBUTED BY ({0})".format(vertex_id)!>)
+
+    plpy.execute("""
+        DROP TABLE IF EXISTS {out_cnts};
+        CREATE TEMP TABLE {out_cnts} AS
+        SELECT {src} AS {vertex_id}, COUNT({dest}) AS {out_cnts_cnt}
+        FROM {edge_table}
+        GROUP BY {src}
+        {cnts_distribution}
+        """.format(**locals()))
+
+    for i in range(max_iter):
+        #####################################################################
+        # PageRank for node 'A' at any given iteration 'i' is given by:
+        # PR_i(A) = damping_factor(PR_i-1(B)/degree(B) + PR_i-1(C)/degree(C) + 
...) + (1-damping_factor)/N
+        # where 'N' is the number of vertices in the graph,
+        # B, C are nodes that have edges to node A, and
+        # degree(node) represents the number of outgoing edges from 'node'
+        #####################################################################
+        # Essentially, the pagerank for a node is based on an aggregate of a
+        # fraction of the pagerank values of all the nodes that have incoming
+        # edges to it, along with a small random probability.
+        # More information can be found at:
+        # https://en.wikipedia.org/wiki/PageRank#Damping_factor
+
+        # The query below computes the PageRank of each node using the above 
formula.
+        plpy.execute("""
+                CREATE TABLE {message} AS
+                SELECT {edge_temp_table}.{dest} AS {vertex_id},
+                        
SUM({v1}.pagerank/{out_cnts}.{out_cnts_cnt})*{damping_factor}+{random_prob} AS 
pagerank
+                FROM {edge_temp_table}
+                    INNER JOIN {cur} ON 
{edge_temp_table}.{dest}={cur}.{vertex_id}
+                    INNER JOIN {out_cnts} ON 
{out_cnts}.{vertex_id}={edge_temp_table}.{src}
+                    INNER JOIN {cur} AS {v1} ON 
{v1}.{vertex_id}={edge_temp_table}.{src}
+                GROUP BY {edge_temp_table}.{dest}
+            """.format(**locals()))
+        # If there are nodes that have no incoming edges, they are not 
captured in the message table.
+        # Insert entries for such nodes, with random_prob.
+        plpy.execute("""
+                INSERT INTO {message}
+                SELECT {vertex_id}, {random_prob}::DOUBLE PRECISION AS pagerank
+                FROM {cur}
+                WHERE {vertex_id} NOT IN (
+                    SELECT {vertex_id}
+                    FROM {message}
+                )
+            """.format(**locals()))
+        # Check for convergence will be done as part of grouping support for 
pagerank:
+        # https://issues.apache.org/jira/browse/MADLIB-1082. So, the threshold 
parameter
+        # is a dummy variable at the moment, the PageRank computation happens 
for
+        # {max_iter} number of times.
+        plpy.execute("""
+                DROP TABLE IF EXISTS {cur};
+                ALTER TABLE {message} RENAME TO {cur}
+            """.format(**locals()))
+
+    plpy.execute("ALTER TABLE {cur} RENAME TO {out_table}".format(**locals()))
+
+    ## Step 4: Cleanup
+    plpy.execute("""
+        DROP TABLE IF EXISTS {0},{1},{2},{3};
+        """.format(out_cnts, edge_temp_table, cur, message))
+    plpy.execute("SET client_min_messages TO %s" % old_msg_level)
+
+def pagerank_help(schema_madlib, message, **kwargs):
+    """
+    Help function for pagerank
+
+    Args:
+        @param schema_madlib
+        @param message: string, Help message string
+        @param kwargs
+
+    Returns:
+        String. Help/usage information
+    """
+    if message is not None and \
+            message.lower() in ("usage", "help", "?"):
+        help_string = "Get from method below"
+        help_string = get_graph_usage(schema_madlib, 'PageRank',
+            """out_table       TEXT,  -- Name of the output table for PageRank
+    damping_factor, DOUBLE PRECISION, -- Damping factor in random surfer model
+                                      -- (DEFAULT = 0.85)
+    max_iter,       INTEGER,          -- Maximum iteration number (DEFAULT = 
100)
+    threshold       DOUBLE PRECISION  -- Stopping criteria (DEFAULT = 1e-5)
+""")
+    else:
+        if message is not None and \
+                message.lower() in ("example", "examples"):
+            help_string = """
+----------------------------------------------------------------------------
+                                EXAMPLES
+----------------------------------------------------------------------------
+-- Create a graph, represented as vertex and edge tables.
+DROP TABLE IF EXISTS vertex, edge;
+CREATE TABLE vertex(
+        id INTEGER
+        );
+CREATE TABLE edge(
+        src INTEGER,
+        dest INTEGER
+        );
+INSERT INTO vertex VALUES
+(0),
+(1),
+(2),
+(3),
+(4),
+(5),
+(6);
+INSERT INTO edge VALUES
+(0, 1),
+(0, 2),
+(0, 4),
+(1, 2),
+(1, 3),
+(2, 3),
+(2, 5),
+(2, 6),
+(3, 0),
+(4, 0),
+(5, 6),
+(6, 3);
+
+-- Compute the PageRank:
+DROP TABLE IF EXISTS pagerank_out;
+SELECT madlib.pagerank(
+             'vertex',             -- Vertex table
+             'id',                 -- Vertix id column
+             'edge',               -- Edge table
+             'src=src, dest=dest', -- Comma delimted string of edge arguments
+             'pagerank_out')       -- Output table of PageRank
+
+-- View the PageRank of all vertices, sorted by their scores.
+SELECT * FROM pagerank_out ORDER BY pagerank desc;
+"""
+        else:
+            help_string = """
+----------------------------------------------------------------------------
+                                SUMMARY
+----------------------------------------------------------------------------
+Given a directed graph, pagerank algorithm finds the PageRank score of all
+the vertices in the graph.
+--
+For an overview on usage, run:
+SELECT {schema_madlib}.pagerank('usage');
+
+For some examples, run:
+SELECT {schema_madlib}.pagerank('example')
+--
+"""
+
+    return help_string.format(schema_madlib=schema_madlib)
+# ---------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6b466ea6/src/ports/postgres/modules/graph/pagerank.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/pagerank.sql_in 
b/src/ports/postgres/modules/graph/pagerank.sql_in
new file mode 100644
index 0000000..712d146
--- /dev/null
+++ b/src/ports/postgres/modules/graph/pagerank.sql_in
@@ -0,0 +1,271 @@
+/* ----------------------------------------------------------------------- 
*//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *
+ * @file graph.sql_in
+ *
+ * @brief SQL functions for graph analytics
+ * @date Nov 2016
+ *
+ * @sa Provides various graph algorithms.
+ *
+ *//* ----------------------------------------------------------------------- 
*/
+m4_include(`SQLCommon.m4')
+
+
+/**
+@addtogroup grp_pagerank
+
+<div class="toc"><b>Contents</b>
+<ul>
+<li><a href="#pagerank">PageRank</a></li>
+<li><a href="#notes">Notes</a></li>
+<li><a href="#examples">Examples</a></li>
+<li><a href="#literature">Literature</a></li>
+</ul>
+</div>
+
+@brief Find the PageRank of all vertices in a directed graph.
+
+Given a graph, the PageRank algorithm outputs a probability distribution 
representing the
+likelihood that a person randomly traversing the graph will arrive at any 
particular vertex.
+This algorithm was originally used by Google to rank websites where the World 
Wide Web was
+modeled as a directed graph with the vertices representing the websites.
+
+@anchor pagerank
+@par PageRank
+<pre class="syntax">
+pagerank( vertex_table,
+            vertex_id,
+            edge_table,
+            edge_args,
+            out_table,
+            damping_factor,
+            max_iter,
+            threshold
+          )
+</pre>
+
+\b Arguments
+<dl class="arglist">
+<dt>vertex_table</dt>
+<dd>TEXT. Name of the table containing the vertex data for the graph. Must 
contain the
+column specified in the 'vertex_id' parameter below.</dd>
+
+<dt>vertex_id</dt>
+<dd>TEXT, default = 'id'. Name of the column in 'vertex_table' containing
+vertex ids.  The vertex ids are of type INTEGER with no duplicates.
+They do not need to be contiguous.</dd>
+
+<dt>edge_table</dt>
+<dd>TEXT. Name of the table containing the edge data. The edge table must
+contain columns for source vertex and destination vertex.</dd>
+
+<dt>edge_args</dt>
+<dd>TEXT. A comma-delimited string containing multiple named arguments of
+the form "name=value". The following parameters are supported for
+this string argument:
+  - src (INTEGER): Name of the column containing the source vertex ids in the 
edge table.
+                   Default column name is 'src'.
+  - dest (INTEGER): Name of the column containing the destination vertex ids 
in the edge table.
+                    Default column name is 'dest'.</dd>
+
+<dt>out_table</dt>
+<dd>TEXT. Name of the table to store the result of PageRank.
+It will contain a row for every vertex from 'vertex_table' with
+the following columns:
+  - vertex_id : The id of a vertex. Will use the input parameter 'vertex_id' 
for column naming.
+  - pagerank : The vertex's PageRank.</dd>
+
+<dt>damping_factor</dt>
+<dd>FLOAT8, default 0.85. The probability, at any step, that a user will 
continue following the links in a random surfer model.</dd>
+
+<dt>max_iter</dt>
+<dd>INTEGER, default: 100. The maximum number of iterations allowed.</dd>
+
+<dt>threshold</dt>
+<dd>FLOAT8, default: 1e-5. If the difference between the PageRank of every 
vertex of two consecutive
+iterations is smaller than 'threshold', or the iteration number is larger than 
'max_iter', the
+computation stops.  If you set the threshold to zero, then you will force the 
algorithm to run for the full number of iterations specified in 'max_iter'.</dd>
+
+</dl>
+
+@anchor notes
+@par Notes
+
+The PageRank algorithm proposed by Larry Page and Sergey Brin is used [1].
+
+@anchor examples
+@examp
+
+-# Create vertex and edge tables to represent the graph:
+<pre class="syntax">
+DROP TABLE IF EXISTS vertex, edge;
+CREATE TABLE vertex(
+        id INTEGER
+        );
+CREATE TABLE edge(
+        src INTEGER,
+        dest INTEGER
+        );
+INSERT INTO vertex VALUES
+(0),
+(1),
+(2),
+(3),
+(4),
+(5),
+(6);
+INSERT INTO edge VALUES
+(0, 1),
+(0, 2),
+(0, 4),
+(1, 2),
+(1, 3),
+(2, 3),
+(2, 5),
+(2, 6),
+(3, 0),
+(4, 0),
+(5, 6),
+(6, 3);
+</pre>
+
+-# Compute the PageRank:
+<pre class="syntax">
+DROP TABLE IF EXISTS pagerank_out;
+SELECT madlib.pagerank(
+                         'vertex',             -- Vertex table
+                         'id',                 -- Vertix id column
+                         'edge',               -- Edge table
+                         'src=src, dest=dest', -- Comma delimted string of 
edge arguments
+                         'pagerank_out');      -- Output table of PageRank
+SELECT * FROM pagerank_out ORDER BY pagerank desc;
+</pre>
+<pre class="result">
+ id |      pagerank
+----+--------------------
+  0 |  0.278256122055856
+  3 |  0.201882680839737
+  2 |  0.142878491945534
+  6 |  0.114538731993905
+  1 |  0.100266150276761
+  4 |  0.100266150276761
+  5 |  0.061911672611445
+(7 rows)
+</pre>
+
+-# Run PageRank with a damping factor of 0.5 results in different final values:
+<pre class="syntax">
+DROP TABLE IF EXISTS pagerank_out;
+SELECT madlib.pagerank(
+                         'vertex',             -- Vertex table
+                         'id',                 -- Vertix id column
+                         'edge',               -- Edge table
+                         'src=src, dest=dest', -- Comma delimted string of 
edge arguments
+                         'pagerank_out',       -- Output table of PageRank
+                         0.5);                 -- Damping factor
+SELECT * FROM pagerank_out ORDER BY pagerank desc;
+</pre>
+<pre class="result">
+ id |     pagerank      
+----+-------------------
+  0 | 0.221378135793372
+  3 | 0.191574922960784
+  6 | 0.140994575864846
+  2 | 0.135406336658892
+  4 | 0.108324751971412
+  1 | 0.108324751971412
+  5 | 0.093996524779681
+(7 rows)
+</pre>
+
+@anchor literature
+@par Literature
+
+[1] PageRank algorithm. https://en.wikipedia.org/wiki/PageRank
+*/
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
+    vertex_table    TEXT,
+    vertex_id       TEXT,
+    edge_table      TEXT,
+    edge_args       TEXT,
+    out_table       TEXT,
+    damping_factor  FLOAT8,
+    max_iter        INTEGER,
+    threshold       FLOAT8
+) RETURNS VOID AS $$
+    PythonFunction(graph, pagerank, pagerank)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
+-------------------------------------------------------------------------
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
+    vertex_table    TEXT,
+    vertex_id       TEXT,
+    edge_table      TEXT,
+    edge_args       TEXT,
+    out_table       TEXT,
+    damping_factor  FLOAT8,
+    max_iter        INTEGER
+) RETURNS VOID AS $$
+    SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, $6, $7, 0.00001)
+$$ LANGUAGE SQL
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+-------------------------------------------------------------------------
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
+    vertex_table    TEXT,
+    vertex_id       TEXT,
+    edge_table      TEXT,
+    edge_args       TEXT,
+    out_table       TEXT,
+    damping_factor  FLOAT8
+) RETURNS VOID AS $$
+    SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, $6, 100, 0.00001)
+$$ LANGUAGE SQL
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+-------------------------------------------------------------------------
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
+    vertex_table    TEXT,
+    vertex_id       TEXT,
+    edge_table      TEXT,
+    edge_args       TEXT,
+    out_table       TEXT
+) RETURNS VOID AS $$
+    SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, 0.85, 100, 0.00001)
+$$ LANGUAGE SQL
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+-------------------------------------------------------------------------
+
+-- Online help
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
+    message VARCHAR
+) RETURNS VARCHAR AS $$
+    PythonFunction(graph, pagerank, pagerank_help)
+$$ LANGUAGE plpythonu IMMUTABLE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
+
+--------------------------------------------------------------------------------
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank()
+RETURNS VARCHAR AS $$
+    SELECT MADLIB_SCHEMA.pagerank('');
+$$ LANGUAGE sql IMMUTABLE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
+--------------------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6b466ea6/src/ports/postgres/modules/graph/test/pagerank.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/test/pagerank.sql_in 
b/src/ports/postgres/modules/graph/test/pagerank.sql_in
new file mode 100644
index 0000000..1d695e2
--- /dev/null
+++ b/src/ports/postgres/modules/graph/test/pagerank.sql_in
@@ -0,0 +1,62 @@
+/* ----------------------------------------------------------------------- 
*//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *//* ----------------------------------------------------------------------- 
*/
+
+DROP TABLE IF EXISTS vertex, edge, pagerank_out;
+CREATE TABLE vertex(
+        id INTEGER
+        );
+CREATE TABLE edge(
+        src INTEGER,
+        dest INTEGER
+        );
+INSERT INTO vertex VALUES
+(0),
+(1),
+(2),
+(3),
+(4),
+(5),
+(6);
+INSERT INTO edge VALUES
+(0, 1),
+(0, 2),
+(0, 4),
+(1, 2),
+(1, 3),
+(2, 3),
+(2, 5),
+(2, 6),
+(3, 0),
+(4, 0),
+(5, 6),
+(6, 3);
+
+SELECT madlib.pagerank(
+             'vertex',        -- Vertex table
+             'id',            -- Vertix id column
+             'edge',          -- Edge table
+             'src=src, dest=dest', -- Edge args
+             'pagerank_out'); -- Output table of PageRank
+
+-- View the PageRank of all vertices, sorted by their scores.
+SELECT assert(relative_error(SUM(pagerank), 1) < 0.00001,
+        'PageRank: Scores do not sum up to 1.'
+    ) FROM pagerank_out;

Reply via email to