Github user jingyimei commented on a diff in the pull request:

    https://github.com/apache/madlib/pull/244#discussion_r177892625
  
    --- Diff: src/ports/postgres/modules/graph/pagerank.py_in ---
    @@ -44,29 +44,62 @@ from utilities.utilities import add_postfix
     from utilities.utilities import extract_keyvalue_params
     from utilities.utilities import unique_string, split_quoted_delimited_str
     from utilities.utilities import is_platform_pg
    +from utilities.utilities import py_list_to_sql_string
     
     from utilities.validate_args import columns_exist_in_table, get_cols_and_types
     from utilities.validate_args import table_exists
     
    +
     def validate_pagerank_args(schema_madlib, vertex_table, vertex_id, edge_table,
                                edge_params, out_table, damping_factor, max_iter,
    -                           threshold, grouping_cols_list):
    +                           threshold, grouping_cols_list, nodes_of_interest):
         """
         Function to validate input parameters for PageRank
         """
         validate_graph_coding(vertex_table, vertex_id, edge_table, edge_params,
                               out_table, 'PageRank')
    -    ## Validate args such as threshold and max_iter
    +    # Validate args such as threshold and max_iter
         validate_params_for_link_analysis(schema_madlib, "PageRank",
    -                                            threshold, max_iter,
    -                                            edge_table, grouping_cols_list)
    +                                      threshold, max_iter,
    +                                      edge_table, grouping_cols_list)
         _assert(damping_factor >= 0.0 and damping_factor <= 1.0,
                 "PageRank: Invalid damping factor value ({0}), must be between 0 and 1.".
                 format(damping_factor))
     
    -
    -def pagerank(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
    -             out_table, damping_factor, max_iter, threshold, grouping_cols, **kwargs):
    +    # Validate against the given set of nodes for Personalized Page Rank
    +    if nodes_of_interest:
    +        nodes_of_interest_count = len(nodes_of_interest)
    +        vertices_count = plpy.execute("""
    +                   SELECT count(DISTINCT({vertex_id})) AS cnt FROM {vertex_table}
    +                   WHERE {vertex_id} = ANY(ARRAY{nodes_of_interest})
    +                   """.format(**locals()))[0]["cnt"]
    +        # Check to see if the given set of nodes exist in vertex table
    +        if vertices_count != len(nodes_of_interest):
    +            plpy.error("PageRank: Invalid value for {0}, must be a subset of the vertex_table".format(
    +                nodes_of_interest))
    +        # Validate given set of nodes against each user group.
    +        # If all the given nodes are not present in the user group
    +        # then throw an error.
    +        if grouping_cols_list:
    +            missing_user_grps = ''
    +            grp_by_column = get_table_qualified_col_str(
    +                edge_table, grouping_cols_list)
    +            grps_without_nodes = plpy.execute("""
    +                   SELECT {grp_by_column} FROM {edge_table}
    +                   WHERE src = ANY(ARRAY{nodes_of_interest}) group by {grp_by_column}
    +                   having count(DISTINCT(src)) != {nodes_of_interest_count}
    +                   """.format(**locals()))
    +            for row in range(grps_without_nodes.nrows()):
    +                missing_user_grps += str(grps_without_nodes[row]['user_id'])
    +                if row < grps_without_nodes.nrows() - 1:
    +                    missing_user_grps += ' ,'
    +            if grps_without_nodes.nrows() > 0:
    +                plpy.error("Nodes for Personalized Page Rank are missing from these groups: {0} ".format(
    +                    missing_user_grps))
    +
    --- End diff ---
    
    Here some similar things are tested twice: when `if nodes_of_interest` holds,
    there is a `count` operation on line 73 and a comparison on line 77 (this
    covers the case without grouping). Then, when `if grouping_cols_list` holds,
    another `count` and comparison happen on line 90 for each group. There might
    be a way to simplify the logic here so that, for the grouping case, we don't
    need to do it twice. Besides, if the query above really slows down performance
    a lot, I would think about making it simpler: instead of listing the groups
    that are missing the special nodes, just raise a warning (optional, depending
    on how expensive the query is).
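
    To make this concrete, here is a rough sketch of one possible shape. It is
    hypothetical, not the PR's code: the helper name `_validate_nodes_of_interest`
    and the exact SQL are assumptions. The idea is to run the plain existence
    check only when there are no grouping columns, and otherwise rely on a single
    per-group query, downgrading the message to a warning if listing the groups
    turns out to be too expensive.

    ```python
    import plpy  # available to MADlib modules executed under PL/Python

    # Hypothetical sketch, not the PR's implementation. Assumes the same module
    # context as pagerank.py_in (e.g. get_table_qualified_col_str is in scope).
    def _validate_nodes_of_interest(vertex_table, vertex_id, edge_table,
                                    grouping_cols_list, nodes_of_interest):
        node_count = len(nodes_of_interest)
        if not grouping_cols_list:
            # No grouping: one existence check against the vertex table.
            found = plpy.execute("""
                SELECT count(DISTINCT {vertex_id}) AS cnt FROM {vertex_table}
                WHERE {vertex_id} = ANY(ARRAY{nodes_of_interest})
                """.format(**locals()))[0]["cnt"]
            if found != node_count:
                plpy.error("PageRank: Invalid value for nodes_of_interest, "
                           "must be a subset of the vertex_table")
        else:
            # Grouping: one per-group query. Assuming edge src values come from
            # the vertex table, a node missing there is also missing from every
            # group, so the global check is not repeated.
            grp_by_column = get_table_qualified_col_str(edge_table,
                                                        grouping_cols_list)
            bad_groups = plpy.execute("""
                SELECT {grp_by_column} FROM {edge_table}
                WHERE src = ANY(ARRAY{nodes_of_interest})
                GROUP BY {grp_by_column}
                HAVING count(DISTINCT src) != {node_count}
                """.format(**locals()))
            if bad_groups.nrows() > 0:
                # Cheaper than assembling the full list of offending groups.
                plpy.warning("PageRank: some groups do not contain all of the "
                             "given nodes_of_interest")
    ```

    Whether to keep the detailed per-group listing or settle for the warning is a
    trade-off between a friendlier error message and the cost of walking the
    result set; either way each code path would only count and compare once.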

