Repository: madlib Updated Branches: refs/heads/master 775afd05d -> c73cf8507
K-NN: Fix minor issues in documentation Closes #208 Project: http://git-wip-us.apache.org/repos/asf/madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/c73cf850 Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/c73cf850 Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/c73cf850 Branch: refs/heads/master Commit: c73cf850791210c47ffea66c87da546c2049265a Parents: 775afd0 Author: Frank McQuillan <[email protected]> Authored: Mon Dec 4 12:24:46 2017 -0800 Committer: Rahul Iyer <[email protected]> Committed: Mon Dec 4 23:44:02 2017 -0800 ---------------------------------------------------------------------- src/ports/postgres/modules/knn/knn.sql_in | 43 +++++++++++++------------- 1 file changed, 22 insertions(+), 21 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/madlib/blob/c73cf850/src/ports/postgres/modules/knn/knn.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/knn/knn.sql_in b/src/ports/postgres/modules/knn/knn.sql_in index 17d81ad..cdc9704 100644 --- a/src/ports/postgres/modules/knn/knn.sql_in +++ b/src/ports/postgres/modules/knn/knn.sql_in @@ -133,21 +133,18 @@ neighbors that were used in the voting/averaging, sorted from closest to furthest.</dd> <dt>fn_dist (optional)</dt> -<dd>TEXT, default: squared_dist_norm2'. The name of the function to use to calculate the distance from a data point to a centroid. +<dd>TEXT, default: 'squared_dist_norm2'. The name of the function +used to calculate the distance between data points. -The following distance functions can be used (computation of barycenter/mean in parentheses): +The following distance functions can be used: <ul> -<li><b>\ref dist_norm1</b>: 1-norm/Manhattan (element-wise median -[Note that MADlib does not provide a median aggregate function for support and -performance reasons.])</li> -<li><b>\ref dist_norm2</b>: 2-norm/Euclidean (element-wise mean)</li> -<li><b>\ref squared_dist_norm2</b>: squared Euclidean distance (element-wise mean)</li> -<li><b>\ref dist_angle</b>: angle (element-wise mean of normalized points)</li> -<li><b>\ref dist_tanimoto</b>: tanimoto (element-wise mean of normalized points <a href="#kmeans-lit-5">[5]</a>)</li> +<li><b>\ref dist_norm1</b>: 1-norm/Manhattan</li> +<li><b>\ref dist_norm2</b>: 2-norm/Euclidean</li> +<li><b>\ref squared_dist_norm2</b>: squared Euclidean distance</li> +<li><b>\ref dist_angle</b>: angle</li> +<li><b>\ref dist_tanimoto</b>: tanimoto</li> <li><b>user defined function</b> with signature <tt>DOUBLE PRECISION[] x, DOUBLE PRECISION[] y -> DOUBLE PRECISION</tt></li></ul></dd> - - </dl> @@ -168,6 +165,11 @@ The output of the KNN module is a table with the following columns: <th>prediction</th> <td>INTEGER. Label in case of classification, average value in case of regression.</td> </tr> + <tr> + <th>k_nearest_neighbours</th> + <td>INTEGER[]. List of nearest neighbors, sorted closest to furthest + from the corresponding test point.</td> + </tr> </table> @@ -236,14 +238,14 @@ DROP TABLE IF EXISTS knn_result_classification; SELECT * FROM madlib.knn( 'knn_train_data', -- Table of training data 'data', -- Col name of training data - 'id', -- Col Name of id in train data + 'id', -- Col name of id in train data 'label', -- Training labels 'knn_test_data', -- Table of test data 'data', -- Col name of test data 'id', -- Col name of id in test data 'knn_result_classification', -- Output table 3, -- Number of nearest neighbors - True -- True if you want to show Nearest-Neighbors by id, False otherwise + True, -- True to list nearest-neighbors by id 'madlib.squared_dist_norm2' -- Distance function ); SELECT * from knn_result_classification ORDER BY id; @@ -260,7 +262,8 @@ Result: 6 | {50,45} | 0 | {6,7,8} (6 rows) </pre> -Note that the nearest neighbors are sorted from closest to furthest from the corresponding test point. +Note that the nearest neighbors are sorted from closest +to furthest from the corresponding test point. -# Run KNN for regression: <pre class="example"> @@ -275,8 +278,8 @@ SELECT * FROM madlib.knn( 'id', -- Col name of id in test data 'knn_result_regression', -- Output table 3, -- Number of nearest neighbors - True -- True if you want to show Nearest-Neighbors, False otherwise - 'madlib.squared_dist_norm2' -- Distance function + True, -- True to list nearest-neighbors by id + 'madlib.dist_norm2' -- Distance function ); SELECT * FROM knn_result_regression ORDER BY id; </pre> @@ -293,7 +296,8 @@ Result: (6 rows) </pre> --# List nearest neighbors only, without doing classification or regression: +-# List nearest neighbors only, without doing classification +or regression: <pre class="example"> DROP TABLE IF EXISTS knn_result_list_neighbors; SELECT * FROM madlib.knn( @@ -334,9 +338,6 @@ vector (a test point) is classified by assigning the label which is most frequent among the k training samples nearest to that test point. In case of regression, average of the values of these k training samples is assigned to the test point. -The only distance metric supported in this version is MADlib's squared_dist_norm2. -Other distance metrics will be added in a future release of this module. - @anchor literature @literature @@ -535,4 +536,4 @@ BEGIN RETURN returnstring; END; $$ LANGUAGE plpgsql VOLATILE -m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
