Github user njayaram2 commented on a diff in the pull request:

    https://github.com/apache/madlib/pull/178#discussion_r137864673
  
    --- Diff: src/ports/postgres/modules/graph/hits.py_in ---
    @@ -0,0 +1,427 @@
    +# coding=utf-8
    +#
    +# Licensed to the Apache Software Foundation (ASF) under one
    +# or more contributor license agreements.  See the NOTICE file
    +# distributed with this work for additional information
    +# regarding copyright ownership.  The ASF licenses this file
    +# to you under the Apache License, Version 2.0 (the
    +# "License"); you may not use this file except in compliance
    +# with the License.  You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing,
    +# software distributed under the License is distributed on an
    +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
    +# KIND, either express or implied.  See the License for the
    +# specific language governing permissions and limitations
    +# under the License.
    +
    +# HITS
    +
    +# Please refer to the hits.sql_in file for the documentation
    +
    +"""
    +@file hits.py_in
    +
    +@namespace graph
    +"""
    +
    +import math
    +import plpy
    +from utilities.control import MinWarning
    +from utilities.utilities import _assert
    +from utilities.utilities import add_postfix
    +from utilities.utilities import extract_keyvalue_params
    +from utilities.utilities import unique_string
    +from utilities.utilities import is_platform_pg
    +
    +from graph_utils import *
    +
    +
    +def validate_hits_args(schema_madlib, vertex_table, vertex_id, edge_table,
    +                       edge_params, out_table, max_iter, threshold):
    +    """
    +    Function to validate input parameters for HITS
    +    """
    +    validate_graph_coding(vertex_table, vertex_id, edge_table, edge_params,
    +                          out_table, 'HITS')
    +    _assert(not threshold or (threshold >= 0.0 and threshold <= 1.0),
    +            "HITS: Invalid threshold value ({0}), must be between 0 and 
1.".
    +            format(threshold))
    +    _assert(max_iter > 0,
    +            """HITS: Invalid max_iter value ({0}), must be a positive 
integer.""".
    +            format(max_iter))
    +
    +
    +def hits(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
    +         out_table, max_iter, threshold, **kwargs):
    +    """
    +    Function that computes the HITS scores
    +
    +    Args:
    +        @param vertex_table
    +        @param vertex_id
    +        @param edge_table
    +        @param source_vertex
    +        @param dest_vertex
    +        @param out_table
    +        @param max_iter
    +        @param threshold
    +    """
    +    with MinWarning('warning'):
    +        params_types = {'src': str, 'dest': str}
    +        default_args = {'src': 'src', 'dest': 'dest'}
    +        edge_params = extract_keyvalue_params(
    +            edge_args, params_types, default_args)
    +
    +        # populate default values for optional params if null
    +        if max_iter is None:
    +            max_iter = 100
    +        if not vertex_id:
    +            vertex_id = "id"
    +
    +        validate_hits_args(schema_madlib, vertex_table, vertex_id, 
edge_table,
    +                           edge_params, out_table, max_iter, threshold)
    +        summary_table = add_postfix(out_table, "_summary")
    +        _assert(not table_exists(summary_table),
    +                "Graph HITS: Output summary table ({summary_table}) 
already exists."
    +                .format(**locals()))
    +
    +        src = edge_params["src"]
    +        dest = edge_params["dest"]
    +        n_vertices = plpy.execute("""
    +                SELECT COUNT({0}) AS cnt
    +                FROM {1}
    +            """.format(vertex_id, vertex_table))[0]["cnt"]
    +
    +        # Assign default threshold value based on number of nodes in the 
graph.
    +        if threshold is None:
    +            threshold = 1.0 / (n_vertices * 1000)
    +
    +        edge_temp_table = unique_string(desp='temp_edge')
    +        distribution = ('' if is_platform_pg() else
    +                        "DISTRIBUTED BY ({0})".format(dest))
    +        plpy.execute("DROP TABLE IF EXISTS {0}".format(edge_temp_table))
    +        plpy.execute("""
    +                CREATE TEMP TABLE {edge_temp_table} AS
    +                SELECT * FROM {edge_table}
    +                {distribution}
    +            """.format(**locals()))
    +
    +        # GPDB and HAWQ have distributed by clauses to help them with 
indexing.
    +        # For Postgres we add the index explicitly.
    +        if is_platform_pg():
    +            plpy.execute("CREATE INDEX ON {0}({1})".format(
    +                edge_temp_table, dest))
    +
    +        # Intermediate tables required.
    +        cur = unique_string(desp='cur')
    +        message = unique_string(desp='message')
    +        v1 = unique_string(desp='v1')
    +        message_unconv_authority = unique_string(
    +            desp='message_unconv_authority')
    --- End diff --
    
    Fix the indentation here: you can shorten `message_unconv_authority` to
    `msg_unconv_auth` and pull the call up onto the previous line. The `desp`
    argument need not be very descriptive, as long as the variable name is
    descriptive enough.


---

Reply via email to