Repository: madlib
Updated Branches:
  refs/heads/master 47e357446 -> 2a113626d


Modify knn help function for easier use : JIRA-1187

This commit does the following:
1. instead of a `RAISE NOTICE` in the udf definition, implement it in
plpython so the help message can always be printed out.
2. add 'example' as a valid input argument, and make all input
arguments case-insensitive
3. Add examples in the help function where there were no examples before

Closes #215


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/2a113626
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/2a113626
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/2a113626

Branch: refs/heads/master
Commit: 2a113626d37ab62284d93df75b62909b1e6f0a4b
Parents: 47e3574
Author: Jingyi Mei <[email protected]>
Authored: Tue Dec 12 13:55:19 2017 +0800
Committer: Pivotal <[email protected]>
Committed: Wed Dec 13 16:42:06 2017 -0800

----------------------------------------------------------------------
 src/ports/postgres/modules/knn/knn.py_in  | 178 +++++++++++++++++++++++++
 src/ports/postgres/modules/knn/knn.sql_in |  75 +++--------
 2 files changed, 193 insertions(+), 60 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/2a113626/src/ports/postgres/modules/knn/knn.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/knn/knn.py_in 
b/src/ports/postgres/modules/knn/knn.py_in
index 7729d2f..da67952 100644
--- a/src/ports/postgres/modules/knn/knn.py_in
+++ b/src/ports/postgres/modules/knn/knn.py_in
@@ -227,4 +227,182 @@ def knn(schema_madlib, point_source, point_column_name, 
point_id,
 
         plpy.execute("DROP TABLE IF EXISTS {0}".format(interim_table))
         return
+
+def knn_help(schema_madlib, message, **kwargs):
+    """
+    Help function for knn
+
+    Args:
+        @param schema_madlib
+        @param message: string, Help message string
+        @param kwargs
+
+    Returns:
+        String. Help/usage information
+    """
+    if message is not None and \
+            message.lower() in ("usage", "help", "?"):
+        help_string = """
+-----------------------------------------------------------------------
+                            USAGE
+-----------------------------------------------------------------------
+SELECT {schema_madlib}.knn(
+    point_source,       -- Training data table having training features as 
vector column and labels
+    point_column_name,  -- Name of column having feature vectors in training 
data table
+    point_id,           -- Name of column having feature vector Ids in train 
data table
+    label_column_name,  -- Name of column having actual label/vlaue for 
corresponding feature vector in training data table
+    test_source,        -- Test data table having features as vector column. 
Id of features is mandatory
+    test_column_name,   -- Name of column having feature vectors in test data 
table
+    test_id,     -- Name of column having feature vector Ids in test data table
+    output_table,       -- Name of output table
+    k,                  -- value of k. Default will go as 1
+    output_neighbors    -- Outputs the list of k-nearest neighbors that were 
used in the voting/averaging.
+    fn_dist             -- The name of the function to use to calculate the 
distance from a data point to a centroid.
+    );
+
+-----------------------------------------------------------------------
+                            OUTPUT
+-----------------------------------------------------------------------
+The output of the KNN module is a table with the following columns:
+
+id                  The ids of test data points.
+test_column_name    The test data points.
+prediction          The output of KNN- label in case of classification, 
average value in case of regression.
+k_nearest_neighbours The list of k-nearest neighbors that were used in the 
voting/averaging.
+"""
+    else:
+        if message is not None and \
+                message.lower() in ("example", "examples"):
+            help_string = """
+----------------------------------------------------------------------------
+                                EXAMPLES
+----------------------------------------------------------------------------
+--  Prepare some training data for classification:
+DROP TABLE IF EXISTS knn_train_data;
+CREATE TABLE knn_train_data (
+                    id integer,
+                    data integer[],
+                    label integer  -- Integer label means for classification
+                    );
+INSERT INTO knn_train_data VALUES
+(1, '{{1,1}}', 1),
+(2, '{{2,2}}', 1),
+(3, '{{3,3}}', 1),
+(4, '{{4,4}}', 1),
+(5, '{{4,5}}', 1),
+(6, '{{20,50}}', 0),
+(7, '{{10,31}}', 0),
+(8, '{{81,13}}', 0),
+(9, '{{1,111}}', 0);
+
+--  Prepare some training data for regression:
+DROP TABLE IF EXISTS knn_train_data_reg;
+CREATE TABLE knn_train_data_reg (
+                    id integer,
+                    data integer[],
+                    label float  -- Float label means for regression
+                    );
+INSERT INTO knn_train_data_reg VALUES
+(1, '{{1,1}}', 1.0),
+(2, '{{2,2}}', 1.0),
+(3, '{{3,3}}', 1.0),
+(4, '{{4,4}}', 1.0),
+(5, '{{4,5}}', 1.0),
+(6, '{{20,50}}', 0.0),
+(7, '{{10,31}}', 0.0),
+(8, '{{81,13}}', 0.0),
+(9, '{{1,111}}', 0.0);
+
+--  Prepare some testing data:
+DROP TABLE IF EXISTS knn_test_data;
+CREATE TABLE knn_test_data (
+                    id integer,
+                    data integer[]
+                    );
+INSERT INTO knn_test_data VALUES
+(1, '{{2,1}}'),
+(2, '{{2,6}}'),
+(3, '{{15,40}}'),
+(4, '{{12,1}}'),
+(5, '{{2,90}}'),
+(6, '{{50,45}}');
+
+--  Run KNN for classification:
+DROP TABLE IF EXISTS knn_result_classification;
+SELECT * FROM {schema_madlib}.knn(
+                'knn_train_data',      -- Table of training data
+                'data',                -- Col name of training data
+                'id',                  -- Col name of id in train data
+                'label',               -- Training labels
+                'knn_test_data',       -- Table of test data
+                'data',                -- Col name of test data
+                'id',                  -- Col name of id in test data
+                'knn_result_classification',  -- Output table
+                 3,                    -- Number of nearest neighbors
+                 True,                 -- True to list nearest-neighbors by id
+                 'madlib.squared_dist_norm2' -- Distance function
+                );
+SELECT * from knn_result_classification ORDER BY id;
+
+Note that the nearest neighbors are sorted from closest
+to furthest from the corresponding test point.
+
+--  Run KNN for regression:
+DROP TABLE IF EXISTS knn_result_regression;
+SELECT * FROM {schema_madlib}.knn(
+                'knn_train_data_reg',  -- Table of training data
+                'data',                -- Col name of training data
+                'id',                  -- Col Name of id in train data
+                'label',               -- Training labels
+                'knn_test_data',       -- Table of test data
+                'data',                -- Col name of test data
+                'id',                  -- Col name of id in test data
+                'knn_result_regression',  -- Output table
+                 3,                    -- Number of nearest neighbors
+                True,                  -- True to list nearest-neighbors by id
+                'madlib.dist_norm2'    -- Distance function
+                );
+SELECT * FROM knn_result_regression ORDER BY id;
+
+--  List nearest neighbors only, without doing classification
+or regression:
+DROP TABLE IF EXISTS knn_result_list_neighbors;
+SELECT * FROM {schema_madlib}.knn(
+                'knn_train_data_reg',  -- Table of training data
+                'data',                -- Col name of training data
+                'id',                  -- Col Name of id in train data
+                NULL,                  -- NULL training labels means just list 
neighbors
+                'knn_test_data',       -- Table of test data
+                'data',                -- Col name of test data
+                'id',                  -- Col name of id in test data
+                'knn_result_list_neighbors', -- Output table
+                3                      -- Number of nearest neighbors
+                );
+SELECT * FROM knn_result_list_neighbors ORDER BY id;
+"""
+        else:
+            help_string = """
+----------------------------------------------------------------------------
+                                SUMMARY
+----------------------------------------------------------------------------
+k-Nearest Neighbors is a method for finding k closest points to a given data
+point in terms of a given metric. Its input consist of data points as features
+from testing examples. For a given k, it looks for k closest points in
+training set for each of the data points in test set. Algorithm generates one
+output per testing example. The output of KNN depends on the type of task:
+For Classification, the output is majority vote of the classes of the k
+nearest data points. The testing example gets assigned the most popular class
+among nearest neighbors. For Regression, the output is average of the values
+of k nearest neighbors of the given testing example.
+--
+For an overview on usage, run:
+SELECT {schema_madlib}.knn('usage');
+
+For some examples, run:
+SELECT {schema_madlib}.knn('example')
+--
+"""
+
+    return help_string.format(schema_madlib=schema_madlib)
 # 
------------------------------------------------------------------------------
+

http://git-wip-us.apache.org/repos/asf/madlib/blob/2a113626/src/ports/postgres/modules/knn/knn.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/knn/knn.sql_in 
b/src/ports/postgres/modules/knn/knn.sql_in
index d45f0f4..9db702d 100644
--- a/src/ports/postgres/modules/knn/knn.sql_in
+++ b/src/ports/postgres/modules/knn/knn.sql_in
@@ -386,66 +386,6 @@ CREATE OR REPLACE FUNCTION 
MADLIB_SCHEMA.__knn_validate_src(
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
-
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
-    arg1 VARCHAR
-) RETURNS VOID AS $$
-BEGIN
-    IF arg1 = 'help' OR arg1 = 'usage' OR arg1 = '?' THEN
-    RAISE NOTICE
-'
------------------------------------------------------------------------
-                            USAGE
------------------------------------------------------------------------
-SELECT {schema_madlib}.knn(
-    point_source,       -- Training data table having training features as 
vector column and labels
-    point_column_name,  -- Name of column having feature vectors in training 
data table
-    point_id,           -- Name of column having feature vector Ids in train 
data table
-    label_column_name,  -- Name of column having actual label/vlaue for 
corresponding feature vector in training data table
-    test_source,        -- Test data table having features as vector column. 
Id of features is mandatory
-    test_column_name,   -- Name of column having feature vectors in test data 
table
-    test_id,     -- Name of column having feature vector Ids in test data table
-    output_table,       -- Name of output table
-    k,                  -- value of k. Default will go as 1
-    output_neighbors    -- Outputs the list of k-nearest neighbors that were 
used in the voting/averaging.
-    fn_dist             -- The name of the function to use to calculate the 
distance from a data point to a centroid.
-    );
-
------------------------------------------------------------------------
-                            OUTPUT
------------------------------------------------------------------------
-The output of the KNN module is a table with the following columns:
-
-id                  The ids of test data points.
-test_column_name    The test data points.
-prediction          The output of KNN- label in case of classification, 
average value in case of regression.
-k_nearest_neighbours The list of k-nearest neighbors that were used in the 
voting/averaging.
-';
-    END IF;
-END;
-$$ LANGUAGE plpgsql VOLATILE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
-
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
-) RETURNS VOID AS $$
-BEGIN
-    RAISE NOTICE '
-k-Nearest Neighbors is a method for finding k closest points to a given data
-point in terms of a given metric. Its input consist of data points as features
-from testing examples. For a given k, it looks for k closest points in
-training set for each of the data points in test set. Algorithm generates one
-output per testing example. The output of KNN depends on the type of task:
-For Classification, the output is majority vote of the classes of the k
-nearest data points. The testing example gets assigned the most popular class
-among nearest neighbors. For Regression, the output is average of the values
-of k nearest neighbors of the given testing example.
-    ';
-END;
-$$ LANGUAGE plpgsql VOLATILE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
-
-
-
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
     point_source VARCHAR,
     point_column_name VARCHAR,
@@ -540,3 +480,18 @@ BEGIN
 END;
 $$ LANGUAGE plpgsql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+-- Online help
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
+    message VARCHAR
+) RETURNS VARCHAR AS $$
+    PythonFunction(knn, knn, knn_help)
+$$ LANGUAGE plpythonu IMMUTABLE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn()
+RETURNS VARCHAR AS $$
+    SELECT MADLIB_SCHEMA.knn('');
+$$ LANGUAGE sql IMMUTABLE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
+

Reply via email to