Github user jingyimei commented on a diff in the pull request:
https://github.com/apache/madlib/pull/244#discussion_r177851780
--- Diff: src/ports/postgres/modules/graph/pagerank.py_in ---
@@ -44,29 +44,62 @@ from utilities.utilities import add_postfix
from utilities.utilities import extract_keyvalue_params
from utilities.utilities import unique_string, split_quoted_delimited_str
from utilities.utilities import is_platform_pg
+from utilities.utilities import py_list_to_sql_string
from utilities.validate_args import columns_exist_in_table,
get_cols_and_types
from utilities.validate_args import table_exists
+
def validate_pagerank_args(schema_madlib, vertex_table, vertex_id,
edge_table,
edge_params, out_table, damping_factor,
max_iter,
- threshold, grouping_cols_list):
+ threshold, grouping_cols_list,
nodes_of_interest):
"""
Function to validate input parameters for PageRank
"""
validate_graph_coding(vertex_table, vertex_id, edge_table, edge_params,
out_table, 'PageRank')
- ## Validate args such as threshold and max_iter
+ # Validate args such as threshold and max_iter
validate_params_for_link_analysis(schema_madlib, "PageRank",
- threshold, max_iter,
- edge_table, grouping_cols_list)
+ threshold, max_iter,
+ edge_table, grouping_cols_list)
_assert(damping_factor >= 0.0 and damping_factor <= 1.0,
"PageRank: Invalid damping factor value ({0}), must be between
0 and 1.".
format(damping_factor))
-
-def pagerank(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
- out_table, damping_factor, max_iter, threshold,
grouping_cols, **kwargs):
+ # Validate against the givin set of nodes for Personalized Page Rank
+ if nodes_of_interest:
+ nodes_of_interest_count = len(nodes_of_interest)
+ vertices_count = plpy.execute("""
+ SELECT count(DISTINCT({vertex_id})) AS cnt FROM
{vertex_table}
+ WHERE {vertex_id} = ANY(ARRAY{nodes_of_interest})
+ """.format(**locals()))[0]["cnt"]
+ # Check to see if the given set of nodes exist in vertex table
+ if vertices_count != len(nodes_of_interest):
+ plpy.error("PageRank: Invalid value for {0}, must be a subset
of the vertex_table".format(
--- End diff --
This query tests several invalid scenarios, including duplicate nodes in
nodes_of_interest. In the error message, maybe we can say "Invalid value for {0},
must be a subset of the vertex_table without duplicate nodes".
---