Github user njayaram2 commented on a diff in the pull request:
https://github.com/apache/madlib/pull/178#discussion_r138204869
--- Diff: src/ports/postgres/modules/graph/hits.py_in ---
@@ -0,0 +1,417 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# HITS
+
+# Please refer to the hits.sql_in file for the documentation
+
+"""
+@file hits.py_in
+
+@namespace graph
+"""
+
+import math
+import plpy
+from utilities.control import MinWarning
+from utilities.utilities import _assert
+from utilities.utilities import add_postfix
+from utilities.utilities import extract_keyvalue_params
+from utilities.utilities import unique_string
+from utilities.utilities import is_platform_pg
+
+from graph_utils import *
+
+
+def validate_hits_args(schema_madlib, vertex_table, vertex_id, edge_table,
+ edge_params, out_table, max_iter, threshold):
+ """
+ Function to validate input parameters for HITS
+ """
+ validate_graph_coding(vertex_table, vertex_id, edge_table, edge_params,
+ out_table, 'HITS')
+ _assert(not threshold or (threshold >= 0.0 and threshold <= 1.0),
+ "HITS: Invalid threshold value ({0}), must be between 0 and 1.".
+ format(threshold))
+ _assert(max_iter > 0,
+ """HITS: Invalid max_iter value ({0}), must be a positive integer.""".
+ format(max_iter))
+
+
+def hits(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
+ out_table, max_iter, threshold, **kwargs):
+ """
+ Function that computes the HITS scores
+
+ Args:
+ @param vertex_table
+ @param vertex_id
+ @param edge_table
+ @param source_vertex
+ @param dest_vertex
+ @param out_table
+ @param max_iter
+ @param threshold
+ """
+ with MinWarning('warning'):
+ params_types = {'src': str, 'dest': str}
+ default_args = {'src': 'src', 'dest': 'dest'}
+ edge_params = extract_keyvalue_params(
+ edge_args, params_types, default_args)
+
+ # populate default values for optional params if null
+ if max_iter is None:
+ max_iter = 100
+ if not vertex_id:
+ vertex_id = "id"
+
+ validate_hits_args(schema_madlib, vertex_table, vertex_id, edge_table,
+ edge_params, out_table, max_iter, threshold)
+ summary_table = add_postfix(out_table, "_summary")
+ _assert(not table_exists(summary_table),
+ "Graph HITS: Output summary table ({summary_table}) already exists."
+ .format(**locals()))
+
+ src = edge_params["src"]
+ dest = edge_params["dest"]
+ n_vertices = plpy.execute("""
+ SELECT COUNT({0}) AS cnt
+ FROM {1}
+ """.format(vertex_id, vertex_table))[0]["cnt"]
+
+ # Assign default threshold value based on number of nodes in the graph.
+ if threshold is None:
+ threshold = 1.0 / (n_vertices * 1000)
+
+ edge_temp_table = unique_string(desp='temp_edge')
+ distribution = ('' if is_platform_pg() else
+ "DISTRIBUTED BY ({0})".format(dest))
+ plpy.execute("DROP TABLE IF EXISTS {0}".format(edge_temp_table))
+ plpy.execute("""
+ CREATE TEMP TABLE {edge_temp_table} AS
+ SELECT * FROM {edge_table}
+ {distribution}
+ """.format(**locals()))
+
+ # GPDB and HAWQ have distributed by clauses to help them with indexing.
+ # For Postgres we add the index explicitly.
+ if is_platform_pg():
+ plpy.execute("CREATE INDEX ON {0}({1})".format(
+ edge_temp_table, dest))
+
+ # Intermediate tables required.
+ cur = unique_string(desp='cur')
+ message = unique_string(desp='message')
+ v1 = unique_string(desp='v1')
+ message_unconv_authority = unique_string(
+ desp='message_unconv_authority')
+ message_unconv_hub = unique_string(desp="message_unconv_hub")
+ tmp = unique_string(desp='tmp')
+ tmp2 = unique_string(desp='tmp2')
+ tmp3 = unique_string(desp='tmp3')
+ v2 = unique_string(desp='v2')
+
+ if is_platform_pg():
+ cur_distribution = cnts_distribution = ''
+ else:
+ cur_distribution = cnts_distribution = \
+ "DISTRIBUTED BY ({0})".format(vertex_id)
+ cur_join_clause = " {cur}.{vertex_id} = {edge_temp_table}.{dest}".format(
+ **locals())
+ v1_join_clause = "{v1}.{vertex_id} = {edge_temp_table}.{src}".format(
+ **locals())
+
+ authority_init_value = 1.0
+ hub_init_value = 1.0
+ plpy.execute("""
+ CREATE TEMP TABLE {cur} AS
+ SELECT {vertex_id}, {authority_init_value}::DOUBLE PRECISION AS authority,
+ {hub_init_value}::DOUBLE PRECISION AS hub
+ FROM {vertex_table}
+ {cur_distribution}
+ """.format(**locals()))
+
+ # The summary table contains the total number of iterations.
+ plpy.execute("""
+ CREATE TABLE {summary_table} (
+ __iterations__ INTEGER
+ )
+ """.format(**locals()))
+
+ unconverged_authority_num = 0
+ unconverged_hub_num = 0
+ iteration_num = 0
+ authority_norm = 0
+ hub_norm = 0
--- End diff --
`hub_norm` and `authority_norm` can be initialized to `1`.
---