This is an automated email from the ASF dual-hosted git repository.

okislal pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
     new e420686  KNN: Add distances to the output table
e420686 is described below

commit e4206866dc452ac1d36f8419c7de9aa1cce32ca9
Author: Orhan Kislal <[email protected]>
AuthorDate: Wed Jul 31 14:54:44 2019 -0700

    KNN: Add distances to the output table
    
    JIRA: MADLIB-1370
    
    This commit adds a new column to the output table to expose the
    distances for the nearest neighbors. This column shows up only when the
    user actually requests nearest neighbors.
    
    In addition, this commit adds a safeguard for the distances. We take the
    absolute value of the distance function's output in case floating-point
    operations cause the distance function to return a negative value.
    
    Closes #427
    
    Co-authored-by: Frank McQuillan <[email protected]>
---
 src/ports/postgres/modules/knn/knn.py_in       |  30 ++--
 src/ports/postgres/modules/knn/knn.sql_in      | 224 +++++++++++++++++--------
 src/ports/postgres/modules/knn/test/knn.sql_in |   1 +
 3 files changed, 176 insertions(+), 79 deletions(-)

diff --git a/src/ports/postgres/modules/knn/knn.py_in 
b/src/ports/postgres/modules/knn/knn.py_in
index bf64352..6d681e2 100644
--- a/src/ports/postgres/modules/knn/knn.py_in
+++ b/src/ports/postgres/modules/knn/knn.py_in
@@ -51,7 +51,7 @@ from utilities.validate_args import is_col_array
 from utilities.validate_args import is_var_valid
 from utilities.validate_args import quote_ident
 
-WEIGHT_FOR_ZERO_DIST = 1e6
+WEIGHT_FOR_ZERO_DIST = 1e107
 BRUTE_FORCE = 'brute_force'
 KD_TREE = 'kd_tree'
 
@@ -121,11 +121,6 @@ def knn_validate_src(schema_madlib, point_source, 
point_column_name, point_id,
 
     if fn_dist:
         fn_dist = fn_dist.lower().strip()
-        dist_functions = set(["{0}.{1}".format(schema_madlib, dist) for dist in
-                              ('dist_norm1', 'dist_norm2',
-                               'squared_dist_norm2', 'dist_angle',
-                               'dist_tanimoto')])
-
         profunc = ("proisagg = TRUE"
                    if is_pg_major_version_less_than(schema_madlib, 11)
                    else "prokind = 'a'")
@@ -136,9 +131,12 @@ def knn_validate_src(schema_madlib, point_source, 
point_column_name, point_id,
             WHERE oid='{fn_dist}(DOUBLE PRECISION[], DOUBLE 
PRECISION[])'::regprocedure;
             """.format(fn_dist=fn_dist, profunc=profunc))[0]['output']
 
-        if is_invalid_func or (fn_dist not in dist_functions):
-            plpy.error("KNN error: Distance function ({0}) has invalid 
signature "
-                       "or is not a simple function.".format(fn_dist))
+        if is_invalid_func:
+            plpy.error("KNN error: Distance function ({0}). Either the 
distance"\
+                " function does not exist or the signature is wrong or it is"\
+                " not a PostgreSQL type UDF. Also note that to use a MADlib"\
+                " built-in distance function you must prepend with 'madlib',"\
+                " schema name e.g., 'madlib.dist_norm2'".format(fn_dist))
     if not is_brute_force:
         if depth <= 0:
             plpy.error("kNN Error: depth={0} is an invalid value, must be "
@@ -313,7 +311,7 @@ def knn_kd_tree(schema_madlib, kd_out, test_source, 
test_column_name, test_id,
 def _create_interim_tbl(schema_madlib, point_source, point_column_name, 
point_id,
     label_name, test_source, test_column_name, test_id, interim_table, k,
     fn_dist, test_id_temp, train_id, dist_inverse, comma_label_out_alias,
-    label_out, r_id, kd_out, train, t_col_name, **kwargs):
+    label_out, r_id, kd_out, train, t_col_name, dist, **kwargs):
     """
         KNN function to create the interim table
         Args:
@@ -357,6 +355,7 @@ def _create_interim_tbl(schema_madlib, point_source, 
point_column_name, point_id
             @param kd_out
             @param train
             @param t_col_name
+            @param dist
     """
     with MinWarning("error"):
         # If r_id is None, we are using the brute force algorithm.
@@ -368,7 +367,6 @@ def _create_interim_tbl(schema_madlib, point_source, 
point_column_name, point_id
         y_temp_table = unique_string(desp='y_temp_table')
         test = unique_string(desp='test')
         r = unique_string(desp='r')
-        dist = unique_string(desp='dist')
 
         if not is_brute_force:
             point_source = kd_out
@@ -389,6 +387,7 @@ def _create_interim_tbl(schema_madlib, point_source, 
point_column_name, point_id
                                 (PARTITION BY {test_id_temp} ORDER BY {dist}) 
AS {r},
                            {test_id_temp},
                            {train_id},
+                           {dist},
                            CASE WHEN {dist} = 0.0 THEN {weight_for_zero_dist}
                                 ELSE 1.0 / {dist}
                            END AS {dist_inverse}
@@ -523,6 +522,7 @@ def knn(schema_madlib, point_source, point_column_name, 
point_id,
         dist_inverse = unique_string(desp='dist_inverse')
         dim = unique_string(desp='dim')
         t_col_name = unique_string(desp='t_col_name')
+        dist = unique_string(desp='dist')
 
         if not fn_dist:
             fn_dist = '{0}.squared_dist_norm2'.format(schema_madlib)
@@ -611,7 +611,9 @@ def knn(schema_madlib, point_source, point_column_name, 
point_id,
 
         if output_neighbors:
             knn_neighbors = (", array_agg(knn_temp.{train_id} ORDER BY "
-                             "knn_temp.{dist_inverse} DESC) AS 
k_nearest_neighbours ").format(**locals())
+                             "knn_temp.{dist_inverse} DESC) AS 
k_nearest_neighbours "
+                             ", array_agg(knn_temp.{dist} ORDER BY "
+                             "knn_temp.{dist_inverse} DESC) AS 
distance").format(**locals())
         else:
             knn_neighbors = ''
 
@@ -635,7 +637,8 @@ def knn(schema_madlib, point_source, point_column_name, 
point_id,
                             point_id, label_name, test_data, test_column_name,
                             test_id, interim_table, k, fn_dist, test_id_temp,
                             train_id, dist_inverse, comma_label_out_alias,
-                            label_out, r_id, kd_output_table, train, 
t_col_name)
+                            label_out, r_id, kd_output_table, train, 
t_col_name,
+                            dist)
         output_sql = """
             CREATE TABLE {output_table} AS
                 {view_def}
@@ -711,6 +714,7 @@ id                  The ids of test data points.
 test_column_name    The test data points.
 prediction          The output of KNN- label in case of classification, 
average value in case of regression.
 k_nearest_neighbours The list of k-nearest neighbors that were used in the 
voting/averaging.
+distance The list of nearest distances, sorted closest to furthest from the 
corresponding test point.
 """
     else:
         help_string = """
diff --git a/src/ports/postgres/modules/knn/knn.sql_in 
b/src/ports/postgres/modules/knn/knn.sql_in
index 0693e94..daeddc8 100644
--- a/src/ports/postgres/modules/knn/knn.sql_in
+++ b/src/ports/postgres/modules/knn/knn.sql_in
@@ -60,6 +60,9 @@ of task. For classification, the output is the majority vote 
of the classes of
 the \f$k\f$ nearest data points. For regression, the output is the average of 
the
 values of \f$k\f$ nearest neighbors of the given test point.
 
+For unsupervised nearest neighbors, set the training set to match the test set 
so the
+nearest neighbor of each point is the point itself, with zero distance.
+
 Both exact and approximate methods are supported. The approximate methods can 
be
 used in the case that run-time is too long using the exact method.
 
@@ -131,8 +134,8 @@ otherwise the result may depend on ordering of the input 
data.</dd>
 
 <dt>output_neighbors (optional) </dt>
 <dd>BOOLEAN default: TRUE. Outputs the list of k-nearest
-neighbors that were used in the voting/averaging, sorted
-from closest to furthest.</dd>
+neighbors (and their respective distances to the target point) that were used
+in the voting/averaging, sorted from closest to furthest.</dd>
 
 <dt>fn_dist (optional)</dt>
 <dd>TEXT, default: 'squared_dist_norm2'. The name of the function
@@ -145,7 +148,11 @@ The following distance functions can be used:
 <li><b>\ref squared_dist_norm2</b>: squared Euclidean distance</li>
 <li><b>\ref dist_angle</b>: angle</li>
 <li><b>\ref dist_tanimoto</b>: tanimoto</li>
-<li><b>user defined function</b> with signature <tt>DOUBLE PRECISION[] x, 
DOUBLE PRECISION[] y -> DOUBLE PRECISION</tt></li></ul></dd>
+<li><b>user defined function</b> with signature <tt>DOUBLE PRECISION[] x, 
DOUBLE PRECISION[] y -> DOUBLE PRECISION.</tt>
+Must return a value greater than or equal to zero.</li></ul></dd>
+@note
+Always qualify the distance function with the schema name.  For example, if 
you install MADlib in a
+schema called 'madlib' then the 'fn_dist' parameter would be 
'madlib.dist_norm2', etc.
 
 <dt>weighted_avg (optional)</dt>
 <dd>BOOLEAN, default: FALSE. Calculates classification or
@@ -205,11 +212,16 @@ The output of the KNN module is a table with the 
following columns:
         <th>prediction</th>
         <td>INTEGER. Label in case of classification, average value in case of 
regression.</td>
     </tr>
-        <tr>
+    <tr>
         <th>k_nearest_neighbours</th>
         <td>INTEGER[]. List of nearest neighbors, sorted closest to furthest
         from the corresponding test point.</td>
     </tr>
+    <tr>
+        <th>distance</th>
+        <td>DOUBLE PRECISION[]. List of distances to nearest neighbors, sorted 
closest to furthest
+        from the corresponding test point.</td>
+    </tr>
 </table>
 
 
@@ -272,7 +284,8 @@ INSERT INTO knn_test_data VALUES
 (6, '{50,45}');
 </pre>
 
--#  Run KNN for classification:
+-#  Run KNN for classification.  Prepend the distance function parameter with 
the schema
+where MADlib is installed (in this example 'madlib.squared_dist_norm2'):
 <pre class="example">
 DROP TABLE IF EXISTS knn_result_classification;
 SELECT * FROM madlib.knn(
@@ -286,20 +299,20 @@ SELECT * FROM madlib.knn(
                 'knn_result_classification',  -- Output table
                  3,                    -- Number of nearest neighbors
                  True,                 -- True to list nearest-neighbors by id
-                 'madlib.squared_dist_norm2' -- Distance function
+                'madlib.squared_dist_norm2' -- Distance function
                 );
 SELECT * from knn_result_classification ORDER BY id;
 </pre>
 Result:
 <pre class="result">
-  id |  data   | prediction | k_nearest_neighbours
-----+---------+------------+----------------------
-  1 | {2,1}   |          1 | {2,1,3}
-  2 | {2,6}   |          1 | {5,4,3}
-  3 | {15,40} |          0 | {7,6,5}
-  4 | {12,1}  |          1 | {4,5,3}
-  5 | {2,90}  |          0 | {9,6,7}
-  6 | {50,45} |          0 | {6,7,8}
+ id |  data   | prediction | k_nearest_neighbours | distance
+----+---------+------------+----------------------+---------------------
+  1 | {2,1}   |          1 | {1,2,3}              | {1,1,5}
+  2 | {2,6}   |          1 | {5,4,3}              | {5,8,10}
+  3 | {15,40} |          0 | {7,6,5}              | {106,125,1346}
+  4 | {12,1}  |          1 | {4,5,3}              | {73,80,85}
+  5 | {2,90}  |          0 | {9,6,7}              | {442,1924,3545}
+  6 | {50,45} |          0 | {6,7,8}              | {925,1796,1985}
 (6 rows)
 </pre>
 Note that the nearest neighbors are sorted from closest
@@ -318,21 +331,21 @@ SELECT * FROM madlib.knn(
                 'id',                  -- Col name of id in test data
                 'knn_result_regression',  -- Output table
                  3,                    -- Number of nearest neighbors
-                True,                  -- True to list nearest-neighbors by id
+                 True,                 -- True to list nearest-neighbors by id
                 'madlib.dist_norm2'    -- Distance function
                 );
 SELECT * FROM knn_result_regression ORDER BY id;
 </pre>
 Result:
 <pre class="result">
- id |  data   |    prediction     | k_nearest_neighbours
-----+---------+-------------------+----------------------
-  1 | {2,1}   |                 1 | {2,1,3}
-  2 | {2,6}   |                 1 | {5,4,3}
-  3 | {15,40} | 0.333333333333333 | {7,6,5}
-  4 | {12,1}  |                 1 | {4,5,3}
-  5 | {2,90}  |                 0 | {9,6,7}
-  6 | {50,45} |                 0 | {6,7,8}
+ id |  data   |    prediction     | k_nearest_neighbours |                 
distance
+----+---------+-------------------+----------------------+------------------------------------------------------
+  1 | {2,1}   |                 1 | {1,2,3}              | 
{1,1,2.23606797749979}
+  2 | {2,6}   |                 1 | {5,4,3}              | 
{2.23606797749979,2.82842712474619,3.16227766016838}
+  3 | {15,40} | 0.333333333333333 | {7,6,5}              | 
{10.295630140987,11.1803398874989,36.6878726556883}
+  4 | {12,1}  |                 1 | {4,5,3}              | 
{8.54400374531753,8.94427190999916,9.21954445729289}
+  5 | {2,90}  |                 0 | {9,6,7}              | 
{21.0237960416286,43.8634243989226,59.5399025864168}
+  6 | {50,45} |                 0 | {6,7,8}              | 
{30.4138126514911,42.3792402008342,44.5533388198909}
 (6 rows)
 </pre>
 
@@ -344,25 +357,25 @@ SELECT * FROM madlib.knn(
                 'knn_train_data_reg',  -- Table of training data
                 'data',                -- Col name of training data
                 'id',                  -- Col Name of id in train data
-                NULL,                  -- NULL training labels means just list 
neighbors
+                 NULL,                 -- NULL training labels means just list 
neighbors
                 'knn_test_data',       -- Table of test data
                 'data',                -- Col name of test data
                 'id',                  -- Col name of id in test data
                 'knn_result_list_neighbors', -- Output table
-                3                      -- Number of nearest neighbors
+                 3                     -- Number of nearest neighbors
                 );
 SELECT * FROM knn_result_list_neighbors ORDER BY id;
 </pre>
 Result, with neighbors sorted from closest to furthest:
 <pre class="result">
- id |  data   | k_nearest_neighbours
-----+---------+----------------------
-  1 | {2,1}   | {2,1,3}
-  2 | {2,6}   | {5,4,3}
-  3 | {15,40} | {7,6,5}
-  4 | {12,1}  | {4,5,3}
-  5 | {2,90}  | {9,6,7}
-  6 | {50,45} | {6,7,8}
+ id |  data   | k_nearest_neighbours | distance
+----+---------+----------------------+---------------------
+  1 | {2,1}   | {1,2,3}              | {1,1,5}
+  2 | {2,6}   | {5,4,3}              | {5,8,10}
+  3 | {15,40} | {7,6,5}              | {106,125,1346}
+  4 | {12,1}  | {4,5,3}              | {73,80,85}
+  5 | {2,90}  | {9,6,7}              | {442,1924,3545}
+  6 | {50,45} | {6,7,8}              | {925,1796,1985}
 (6 rows)
 </pre>
 
@@ -382,20 +395,20 @@ SELECT * FROM madlib.knn(
                 'knn_result_classification',  -- Output table
                  3,                    -- Number of nearest neighbors
                  True,                 -- True to list nearest-neighbors by id
-                 'madlib.squared_dist_norm2', -- Distance function
+                'madlib.squared_dist_norm2', -- Distance function
                  True                 -- For weighted average
                 );
 SELECT * FROM knn_result_classification ORDER BY id;
 </pre>
 <pre class="result">
- id |  data   |     prediction      | k_nearest_neighbours
-----+---------+---------------------+----------------------
-  1 | {2,1}   |                 1   | {2,1,3}
-  2 | {2,6}   |                 1   | {5,4,3}
-  3 | {15,40} |                 0   | {7,6,5}
-  4 | {12,1}  |                 1   | {4,5,3}
-  5 | {2,90}  |                 0   | {9,6,7}
-  6 | {50,45} |                 0   | {6,7,8}
+ id |  data   | prediction | k_nearest_neighbours | distance
+----+---------+------------+----------------------+---------------------
+  1 | {2,1}   |          1 | {1,2,3}              | {1,1,5}
+  2 | {2,6}   |          1 | {5,4,3}              | {5,8,10}
+  3 | {15,40} |          0 | {7,6,5}              | {106,125,1346}
+  4 | {12,1}  |          1 | {4,5,3}              | {73,80,85}
+  5 | {2,90}  |          0 | {9,6,7}              | {442,1924,3545}
+  6 | {50,45} |          0 | {6,7,8}              | {925,1796,1985}
 (6 rows)
 </pre>
 
@@ -407,29 +420,29 @@ SELECT madlib.knn(
                 'knn_train_data',        -- Table of training data
                 'data',                  -- Col name of training data
                 'id',                    -- Col name of id in train data
-                NULL,                    -- Training labels
+                 NULL,                   -- Training labels
                 'knn_test_data',         -- Table of test data
                 'data',                  -- Col name of test data
                 'id',                    -- Col name of id in test data
                 'knn_result_classification_kd',  -- Output table
                  3,                      -- Number of nearest neighbors
                  True,                   -- True to list nearest-neighbors by 
id
-                 'madlib.squared_dist_norm2', -- Distance function
+                'madlib.squared_dist_norm2', -- Distance function
                  False,                  -- For weighted average
-                 'kd_tree',              -- Use kd-tree
-                 'depth=4, leaf_nodes=8' -- Kd-tree options
+                'kd_tree',               -- Use kd-tree
+                'depth=4, leaf_nodes=8'  -- Kd-tree options
                  );
 SELECT * FROM knn_result_classification_kd ORDER BY id;
 </pre>
 <pre class="result">
- id |  data   | k_nearest_neighbours
-----+---------+----------------------
-  1 | {2,1}   | {1,2,3}
-  2 | {2,6}   | {5,4,3}
-  3 | {15,40} | {7,6,5}
-  4 | {12,1}  | {4,5,3}
-  5 | {2,90}  | {9,6,7}
-  6 | {50,45} | {6,7,8}
+ id |  data   | k_nearest_neighbours | distance
+----+---------+----------------------+---------------------
+  1 | {2,1}   | {1,2,3}              | {1,1,5}
+  2 | {2,6}   | {5,4,3}              | {5,8,10}
+  3 | {15,40} | {7,6,5}              | {106,125,1346}
+  4 | {12,1}  | {4,5,3}              | {73,80,85}
+  5 | {2,90}  | {9,6,7}              | {442,1924,3545}
+  6 | {50,45} | {6,7,8}              | {925,1796,1985}
 (6 rows)
 </pre>
 The result above is the same as brute force. If we search just 1 leaf node,
@@ -441,31 +454,108 @@ SELECT madlib.knn(
                 'knn_train_data',        -- Table of training data
                 'data',                  -- Col name of training data
                 'id',                    -- Col name of id in train data
-                NULL,                    -- Training labels
+                 NULL,                   -- Training labels
                 'knn_test_data',         -- Table of test data
                 'data',                  -- Col name of test data
                 'id',                    -- Col name of id in test data
                 'knn_result_classification_kd',  -- Output table
                  3,                      -- Number of nearest neighbors
                  True,                   -- True to list nearest-neighbors by 
id
-                 'madlib.squared_dist_norm2', -- Distance function
+                'madlib.squared_dist_norm2', -- Distance function
                  False,                  -- For weighted average
-                 'kd_tree',              -- Use kd-tree
-                 'depth=4, leaf_nodes=1' -- Kd-tree options
+                'kd_tree',               -- Use kd-tree
+                'depth=4, leaf_nodes=1'  -- Kd-tree options
                  );
 SELECT * FROM knn_result_classification_kd ORDER BY id;
 </pre>
 <pre class="result">
- id |  data   | k_nearest_neighbours
-----+---------+----------------------
-  1 | {2,1}   | {1}
-  2 | {2,6}   | {3,2}
-  3 | {15,40} | {7}
-  5 | {2,90}  | {3,2}
-  6 | {50,45} | {6,8}
+ id |  data   | k_nearest_neighbours | distance
+----+---------+----------------------+---------------------
+  1 | {2,1}   | {1}                  | {1}
+  2 | {2,6}   | {3,2}                | {10,16}
+  3 | {15,40} | {7}                  | {106}
+  5 | {2,90}  | {3,2}                | {7570,7744}
+  6 | {50,45} | {6,8}                | {925,1985}
 (5 rows)
 </pre>
 
+-#  Unsupervised nearest neighbors.  Here the training set matches the
+test set so the nearest neighbor of each point is the point itself, with a 
zero distance:
+<pre class="example">
+DROP TABLE IF EXISTS knn_result_classification_unsup;
+SELECT * FROM madlib.knn(
+                'knn_train_data',      -- Table of training data
+                'data',                -- Col name of training data
+                'id',                  -- Col name of id in train data
+                 NULL,                 -- NULL training labels means just list 
neighbors
+                'knn_train_data',      -- Table of test data (same as training 
data)
+                'data',                -- Col name of test data
+                'id',                  -- Col name of id in test data
+                'knn_result_classification_unsup',  -- Output table
+                 3,                    -- Number of nearest neighbors
+                 True,                 -- True to list nearest-neighbors by id
+                'madlib.squared_dist_norm2' -- Distance function
+                );
+SELECT * from knn_result_classification_unsup ORDER BY id;
+</pre>
+Result, with point and neighbors sorted from closest to furthest:
+<pre class="result">
+ id |  data   | k_nearest_neighbours |   distance
+----+---------+----------------------+---------------
+  1 | {1,1}   | {1,2,3}              | {0,2,8}
+  2 | {2,2}   | {2,3,1}              | {0,2,2}
+  3 | {3,3}   | {3,2,4}              | {0,2,2}
+  4 | {4,4}   | {4,5,3}              | {0,1,2}
+  5 | {4,5}   | {5,4,3}              | {0,1,5}
+  6 | {20,50} | {6,7,5}              | {0,461,2281}
+  7 | {10,31} | {7,6,5}              | {0,461,712}
+  8 | {81,13} | {8,6,7}              | {0,5090,5365}
+  9 | {1,111} | {9,6,7}              | {0,4082,6481}
+(9 rows)
+</pre>
+
+-#  User defined distance function.  There are several built-in distance
+functions, but you can create your own using a UDF if desired.
+For example, to create a Chebyshev distance function [6], first create the 
function:
+<pre class="example">
+CREATE OR REPLACE FUNCTION chebychev_distance (x double precision[], y double 
precision[])
+  RETURNS double precision
+AS $$
+    from scipy.spatial import distance
+    return distance.chebyshev(x, y)
+$$ LANGUAGE plpythonu;
+</pre>
+Then pass the function as an argument:
+<pre class="example">
+DROP TABLE IF EXISTS knn_result_classification_udf;
+SELECT * FROM madlib.knn(
+                'knn_train_data',      -- Table of training data
+                'data',                -- Col name of training data
+                'id',                  -- Col name of id in train data
+                'label',               -- Training labels
+                'knn_test_data',       -- Table of test data
+                'data',                -- Col name of test data
+                'id',                  -- Col name of id in test data
+                'knn_result_classification_udf',  -- Output table
+                 3,                    -- Number of nearest neighbors
+                 True,                 -- True to list nearest-neighbors by id
+                'chebychev_distance'   -- Distance function
+                );
+SELECT * from knn_result_classification_udf ORDER BY id;
+</pre>
+Result, with point and neighbors sorted from closest to furthest:
+<pre class="result">
+ id |  data   | prediction | k_nearest_neighbours |  distance
+----+---------+------------+----------------------+------------
+  1 | {2,1}   |          1 | {1,2,3}              | {1,1,2}
+  2 | {2,6}   |          1 | {5,4,3}              | {2,2,3}
+  3 | {15,40} |          0 | {7,6,5}              | {9,10,35}
+  4 | {12,1}  |          1 | {5,4,3}              | {8,8,9}
+  5 | {2,90}  |          0 | {9,6,7}              | {21,40,59}
+  6 | {50,45} |          0 | {6,8,7}              | {30,32,40}
+(6 rows)
+</pre>
+
 @anchor background
 @par Technical Background
 
@@ -528,13 +618,15 @@ irregularly-spaced data". Proceedings of the 1968 ACM 
National Conference. pp. 5
 [5] Bentley, J. L. (1975). "Multidimensional binary search trees used for
 associative searching". Communications of the ACM. 18 (9): 509. 
doi:10.1145/361002.361007.
 
+@anchor knn-lit-6
+[6] https://en.wikipedia.org/wiki/Chebyshev_distance
+
 
 @internal
 @sa namespace knn (documenting the implementation in Python)
 @endinternal
 */
 
-
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__knn_validate_src(
     point_source VARCHAR,
     point_column_name VARCHAR,
diff --git a/src/ports/postgres/modules/knn/test/knn.sql_in 
b/src/ports/postgres/modules/knn/test/knn.sql_in
index 86f0eb4..e4cd578 100644
--- a/src/ports/postgres/modules/knn/test/knn.sql_in
+++ b/src/ports/postgres/modules/knn/test/knn.sql_in
@@ -97,6 +97,7 @@ SELECT assert(array_agg(prediction ORDER BY 
id)='{1,1,0,1,0,0}', 'Wrong output i
 DROP TABLE if exists madlib_knn_result_classification;
 SELECT 
knn('knn_train_data','data','id','label','knn_test_data','data','id','madlib_knn_result_classification',3);
 SELECT assert(array_agg(x ORDER BY id)= '{1,2,3}','Wrong output in 
classification with k=3') FROM (SELECT unnest(k_nearest_neighbours) AS x, id 
FROM madlib_knn_result_classification WHERE id = 1 ORDER BY x ASC) y;
+SELECT assert(array_agg(x ORDER BY id)= '{1,1,5}','Wrong distances in 
classification with k=3') FROM (SELECT unnest(distance) AS x, id FROM 
madlib_knn_result_classification WHERE id = 1 ORDER BY x ASC) y;
 
 DROP TABLE if exists madlib_knn_result_regression;
 SELECT 
knn('knn_train_data_reg','data','id','label','knn_test_data','data','id','madlib_knn_result_regression',4,False,'MADLIB_SCHEMA.squared_dist_norm2',False);

Reply via email to