kaknikhil commented on a change in pull request #441: Kmeans: simplified 
silhouette per point for k-means
URL: https://github.com/apache/madlib/pull/441#discussion_r325342776
 
 

 ##########
 File path: src/ports/postgres/modules/kmeans/kmeans.py_in
 ##########
 @@ -387,5 +396,62 @@ def compute_kmeans(schema_madlib, rel_args, rel_state, 
rel_source,
                             'old_centroid': old_centroid_str}))
     return iterationCtrl.iteration
 
+def simple_silhouette_points(schema_madlib, rel_source, output_table, pid,
+    expr_point, centroids, fn_dist, **kwargs):
+
+    with MinWarning("error"):
+        kmeans_validate_src(schema_madlib, rel_source)
+        output_tbl_valid(output_table, 'kmeans')
+
+        _assert(type(centroids) == list and
+                type(centroids[0]) == list and
+                len(centroids) > 1,
+                'kmeans: invalid centroids shape')
+
+        rel_source, expr_point = _create_temp_view_for_expr(schema_madlib,
+                                                            rel_source,
+                                                            expr_point)
+
+        plpy.execute("""
+            CREATE TABLE {output_table} AS
+                SELECT {pid}, centroids[1] AS centroid_id,
+                centroids[2] AS neighbor_centroid_id,
+                (CASE
+                    WHEN distances[2] = 0 THEN 0
+                    ELSE (distances[2] - distances[1]) / distances[2]
+                END) AS silh
+                FROM
+                (SELECT {pid},
+                       (cc_out).column_ids::integer[] AS centroids,
+                       (cc_out).distances::double precision[] AS distances
+                FROM (
+                    SELECT {pid},
+                           {schema_madlib}._closest_columns(
+                            array{centroids},
+                            {expr_point},
+                            2,
+                            '{fn_dist}'::REGPROC, '{fn_dist}') AS cc_out
+                    FROM {rel_source})q1
+                )q2
+            """.format(**locals()))
+
+def simple_silhouette_points_dbl_wrapper(schema_madlib, rel_source, 
output_table, pid,
+    expr_point, centroids, fn_dist, **kwargs):
+
+    simple_silhouette_points(schema_madlib, rel_source, output_table, pid,
+        expr_point, centroids, fn_dist)
+
+
+def simple_silhouette_points_str_wrapper(schema_madlib, rel_source, 
output_table, pid,
+    expr_point, centroids_table, centroids_col, fn_dist, **kwargs):
+
+    input_tbl_valid(centroids_table, 'kmeans')
+    columns_exist_in_table(centroids_table, centroids_col)
+    centroids = plpy.execute("""
 
 Review comment:
   I am assuming that none of our kmeans functions will return a null centroid
   value, right?

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

Reply via email to