Github user njayaram2 commented on a diff in the pull request:
https://github.com/apache/madlib/pull/178#discussion_r137864673
--- Diff: src/ports/postgres/modules/graph/hits.py_in ---
@@ -0,0 +1,427 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# HITS
+
+# Please refer to the hits.sql_in file for the documentation
+
+"""
+@file hits.py_in
+
+@namespace graph
+"""
+
+import math
+import plpy
+from utilities.control import MinWarning
+from utilities.utilities import _assert
+from utilities.utilities import add_postfix
+from utilities.utilities import extract_keyvalue_params
+from utilities.utilities import unique_string
+from utilities.utilities import is_platform_pg
+
+from graph_utils import *
+
+
+def validate_hits_args(schema_madlib, vertex_table, vertex_id, edge_table,
+ edge_params, out_table, max_iter, threshold):
+ """
+ Function to validate input parameters for HITS
+ """
+ validate_graph_coding(vertex_table, vertex_id, edge_table, edge_params,
+ out_table, 'HITS')
+ _assert(not threshold or (threshold >= 0.0 and threshold <= 1.0),
+ "HITS: Invalid threshold value ({0}), must be between 0 and 1.".
+ format(threshold))
+ _assert(max_iter > 0,
+ """HITS: Invalid max_iter value ({0}), must be a positive integer.""".
+ format(max_iter))
+
+
+def hits(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
+ out_table, max_iter, threshold, **kwargs):
+ """
+ Function that computes the HITS scores
+
+ Args:
+ @param vertex_table
+ @param vertex_id
+ @param edge_table
+ @param source_vertex
+ @param dest_vertex
+ @param out_table
+ @param max_iter
+ @param threshold
+ """
+ with MinWarning('warning'):
+ params_types = {'src': str, 'dest': str}
+ default_args = {'src': 'src', 'dest': 'dest'}
+ edge_params = extract_keyvalue_params(
+ edge_args, params_types, default_args)
+
+ # populate default values for optional params if null
+ if max_iter is None:
+ max_iter = 100
+ if not vertex_id:
+ vertex_id = "id"
+
+ validate_hits_args(schema_madlib, vertex_table, vertex_id, edge_table,
+ edge_params, out_table, max_iter, threshold)
+ summary_table = add_postfix(out_table, "_summary")
+ _assert(not table_exists(summary_table),
+ "Graph HITS: Output summary table ({summary_table}) already exists."
+ .format(**locals()))
+
+ src = edge_params["src"]
+ dest = edge_params["dest"]
+ n_vertices = plpy.execute("""
+ SELECT COUNT({0}) AS cnt
+ FROM {1}
+ """.format(vertex_id, vertex_table))[0]["cnt"]
+
+ # Assign default threshold value based on number of nodes in the graph.
+ if threshold is None:
+ threshold = 1.0 / (n_vertices * 1000)
+
+ edge_temp_table = unique_string(desp='temp_edge')
+ distribution = ('' if is_platform_pg() else
+ "DISTRIBUTED BY ({0})".format(dest))
+ plpy.execute("DROP TABLE IF EXISTS {0}".format(edge_temp_table))
+ plpy.execute("""
+ CREATE TEMP TABLE {edge_temp_table} AS
+ SELECT * FROM {edge_table}
+ {distribution}
+ """.format(**locals()))
+
+ # GPDB and HAWQ have distributed by clauses to help them with indexing.
+ # For Postgres we add the index explicitly.
+ if is_platform_pg():
+ plpy.execute("CREATE INDEX ON {0}({1})".format(
+ edge_temp_table, dest))
+
+ # Intermediate tables required.
+ cur = unique_string(desp='cur')
+ message = unique_string(desp='message')
+ v1 = unique_string(desp='v1')
+ message_unconv_authority = unique_string(
+ desp='message_unconv_authority')
--- End diff --
Fix the indentation here. You can also shorten `message_unconv_authority` to
`msg_unconv_auth` and pull the `unique_string` call up onto the previous line.
The `desp` argument need not be very descriptive, as long as the variable name
is descriptive enough.
---