Github user jingyimei commented on a diff in the pull request:

https://github.com/apache/madlib/pull/244#discussion_r177892625

--- Diff: src/ports/postgres/modules/graph/pagerank.py_in ---
@@ -44,29 +44,62 @@ from utilities.utilities import add_postfix
 from utilities.utilities import extract_keyvalue_params
 from utilities.utilities import unique_string, split_quoted_delimited_str
 from utilities.utilities import is_platform_pg
+from utilities.utilities import py_list_to_sql_string
 from utilities.validate_args import columns_exist_in_table, get_cols_and_types
 from utilities.validate_args import table_exists
+

 def validate_pagerank_args(schema_madlib, vertex_table, vertex_id, edge_table,
                            edge_params, out_table, damping_factor, max_iter,
-                           threshold, grouping_cols_list):
+                           threshold, grouping_cols_list, nodes_of_interest):
     """
     Function to validate input parameters for PageRank
     """
     validate_graph_coding(vertex_table, vertex_id, edge_table, edge_params,
                           out_table, 'PageRank')
-    ## Validate args such as threshold and max_iter
+    # Validate args such as threshold and max_iter
     validate_params_for_link_analysis(schema_madlib, "PageRank",
-                                        threshold, max_iter,
-                                        edge_table, grouping_cols_list)
+                                      threshold, max_iter,
+                                      edge_table, grouping_cols_list)
     _assert(damping_factor >= 0.0 and damping_factor <= 1.0,
             "PageRank: Invalid damping factor value ({0}), must be between 0 and 1.".
             format(damping_factor))
-
-def pagerank(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
-             out_table, damping_factor, max_iter, threshold, grouping_cols, **kwargs):
+    # Validate against the givin set of nodes for Personalized Page Rank
+    if nodes_of_interest:
+        nodes_of_interest_count = len(nodes_of_interest)
+        vertices_count = plpy.execute("""
+            SELECT count(DISTINCT({vertex_id})) AS cnt FROM {vertex_table}
+            WHERE {vertex_id} = ANY(ARRAY{nodes_of_interest})
+            """.format(**locals()))[0]["cnt"]
+        # Check to see if the given set of nodes exist in vertex table
+        if vertices_count != len(nodes_of_interest):
+            plpy.error("PageRank: Invalid value for {0}, must be a subset of the vertex_table".format(
+                nodes_of_interest))
+        # Validate given set of nodes against each user group.
+        # If all the given nodes are not present in the user group
+        # then throw an error.
+        if grouping_cols_list:
+            missing_user_grps = ''
+            grp_by_column = get_table_qualified_col_str(
+                edge_table, grouping_cols_list)
+            grps_without_nodes = plpy.execute("""
+                SELECT {grp_by_column} FROM {edge_table}
+                WHERE src = ANY(ARRAY{nodes_of_interest}) group by {grp_by_column}
+                having count(DISTINCT(src)) != {nodes_of_interest_count}
+                """.format(**locals()))
+            for row in range(grps_without_nodes.nrows()):
+                missing_user_grps += str(grps_without_nodes[row]['user_id'])
+                if row < grps_without_nodes.nrows() - 1:
+                    missing_user_grps += ' ,'
+            if grps_without_nodes.nrows() > 0:
+                plpy.error("Nodes for Personalizaed Page Rank are missing from these groups: {0} ".format(
+                    missing_user_grps))
+
--- End diff --

Here, some similar checks are performed twice. When `if nodes_of_interest` holds, there is a `count` operation at line 73 and a comparison at line 77 (this covers the case without grouping). Then, when `if grouping_cols_list` holds, another `count` and compare happen at line 90, once per group. There might be a way to simplify the logic here so that for the grouping case we don't need to do the work twice.
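For example, something along these lines would do a single pass per case. This is only a rough, untested sketch reusing the names from the diff above (`plpy`, `get_table_qualified_col_str`, etc.), not a drop-in patch:

```python
# Rough sketch (untested) of folding the duplicated count/compare into a
# single pass per case, reusing the names from the diff above.
if nodes_of_interest:
    nodes_of_interest_count = len(nodes_of_interest)
    if not grouping_cols_list:
        # Non-grouping case: one global membership check against the
        # vertex table, exactly as in the diff.
        vertices_count = plpy.execute("""
            SELECT count(DISTINCT {vertex_id}) AS cnt
            FROM {vertex_table}
            WHERE {vertex_id} = ANY(ARRAY{nodes_of_interest})
            """.format(**locals()))[0]["cnt"]
        if vertices_count != nodes_of_interest_count:
            plpy.error("PageRank: Invalid value for {0}, must be a subset "
                       "of the vertex_table".format(nodes_of_interest))
    else:
        # Grouping case: rely on the grouped query alone. A node that is
        # missing everywhere shows up in every group reported here, so the
        # separate global count becomes redundant (the trade-off is a less
        # specific message when a node is absent from vertex_table).
        grp_by_column = get_table_qualified_col_str(edge_table,
                                                    grouping_cols_list)
        grps_without_nodes = plpy.execute("""
            SELECT {grp_by_column}
            FROM {edge_table}
            WHERE src = ANY(ARRAY{nodes_of_interest})
            GROUP BY {grp_by_column}
            HAVING count(DISTINCT src) != {nodes_of_interest_count}
            """.format(**locals()))
        if grps_without_nodes.nrows() > 0:
            # Report a count rather than indexing row['user_id'], which
            # only works when the grouping column happens to be user_id.
            plpy.error("PageRank: nodes of interest are missing from "
                       "{0} group(s)".format(grps_without_nodes.nrows()))
```

As a side effect, this also avoids hardcoding `user_id` in the error message, which the current diff does.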
Besides, if these validation queries really slow down performance a lot, I would think about simplifying this further: instead of listing the groups that are missing the special nodes, just emit a warning (optional, depending on how expensive the query above turns out to be).
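A minimal sketch of that warning-only variant (again hypothetical and untested, reusing `grp_by_column` and the other names from the diff): count the offending groups instead of enumerating them, and don't abort the run.

```python
# Rough sketch (untested) of the warning-only variant: count offending
# groups instead of enumerating them, then emit a single warning.
n_missing = plpy.execute("""
    SELECT count(*) AS cnt FROM (
        SELECT {grp_by_column}
        FROM {edge_table}
        WHERE src = ANY(ARRAY{nodes_of_interest})
        GROUP BY {grp_by_column}
        HAVING count(DISTINCT src) != {nodes_of_interest_count}
    ) missing_grps
    """.format(**locals()))[0]["cnt"]
if n_missing > 0:
    plpy.warning("PageRank: {0} group(s) are missing some of the "
                 "nodes of interest".format(n_missing))
```

If even that count is too slow, an `EXISTS`/`LIMIT 1` form of the same query would stop at the first offending group.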
---