[24/34] incubator-madlib git commit: Array Operations: Unnest 2-D arrays by one level.

okislal Fri, 16 Jun 2017 13:59:00 -0700

Array Operations: Unnest 2-D arrays by one level.

JIRA: MADLIB-1086


Unnest 2-D arrays by one level (i.e. into rows of 1-D arrays).
Example usage in k-Means shows how to unnest the 2-D centroid array
to get one centroid per row for follow on operations.


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/3af18a93
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/3af18a93
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/3af18a93

Branch: refs/heads/latest_release
Commit: 3af18a9388d144920d5bca3e5cde27edee6e0eac
Parents: 20b1158
Author: Rashmi Raghu <rra...@pivotal.io>
Authored: Tue Apr 25 14:41:09 2017 -0700
Committer: Rashmi Raghu <rra...@pivotal.io>
Committed: Wed Apr 26 11:35:23 2017 -0700

----------------------------------------------------------------------
 methods/array_ops/src/pg_gp/array_ops.sql_in    | 102 ++++++++-
 .../array_ops/src/pg_gp/test/array_ops.sql_in   | 218 +++++++++++++++++++
 src/ports/postgres/modules/kmeans/kmeans.sql_in |  89 +++++---
 3 files changed, 375 insertions(+), 34 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/3af18a93/methods/array_ops/src/pg_gp/array_ops.sql_in
----------------------------------------------------------------------
diff --git a/methods/array_ops/src/pg_gp/array_ops.sql_in 
b/methods/array_ops/src/pg_gp/array_ops.sql_in
index c83a947..08ba377 100644
--- a/methods/array_ops/src/pg_gp/array_ops.sql_in
+++ b/methods/array_ops/src/pg_gp/array_ops.sql_in
@@ -24,7 +24,7 @@ m4_include(`SQLCommon.m4')
 
 @brief Provides fast array operations supporting other MADlib modules.
 
-This module provides a set of basic array operations implemented in C.
+This module provides a set of basic array operations implemented in C and SQL.
 It is a support module for several machine learning algorithms that
 require fast array operations.
 
@@ -42,6 +42,8 @@ These functions support several numeric types:
     - DOUBLE PRECISION (FLOAT8)
     - NUMERIC (internally casted into FLOAT8, loss of precisions can happen)
 
+Additionally, array_unnest_2d_to_1d() supports other data types such as TEXT 
or VARCHAR.
+
 Several of the function require NO NULL VALUES, while others omit NULLs and 
return results. See details in description of individual functions.
 
 @anchor list
@@ -126,6 +128,11 @@ Several of the function require NO NULL VALUES, while 
others omit NULLs and retu
 
 <tr><th>normalize()</th><td> This function normalizes an array as sum of 
squares to be 1. It requires that the array is 1-D and all the values are 
NON-NULL.
 </td></tr>
+
+<tr><th>array_unnest_2d_to_1d()</th><td> This function takes a 2-D array as 
the input and unnests it by one level. It returns a set of 1-D arrays that 
correspond to rows of
+ the input array as well as an ID column with values corresponding to row 
positions occupied by those 1-D arrays within the 2-D array.
+</td></tr>
+
 </table>
 
 @anchor examples
@@ -220,6 +227,30 @@ Result:
  {1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3}
 (1 row)
 </pre>
+-# Unnest a column of 2-D arrays into sets of 1-D arrays.
+<pre class="example">
+SELECT id, (madlib.array_unnest_2d_to_1d(val)).*
+FROM (
+  SELECT 1::INT AS id, ARRAY[[1.3,2.0,3.2],[10.3,20.0,32.2]]::FLOAT8[][] AS val
+  UNION ALL
+  SELECT 2, ARRAY[[pi(),pi()/2],[2*pi(),pi()],[pi()/4,4*pi()]]::FLOAT8[][]
+) t
+ORDER BY 1,2;
+</pre>
+Result:
+<pre class="result">
+ id | unnest_row_id |            unnest_result
+----+---------------+--------------------------------------
+  1 |             1 | {1.3,2,3.2}
+  1 |             2 | {10.3,20,32.2}
+  2 |             1 | {3.14159265358979,1.5707963267949}
+  2 |             2 | {6.28318530717959,3.14159265358979}
+  2 |             3 | {0.785398163397448,12.5663706143592}
+(5 rows)
+</pre>
+If the function is called without the .* notation then it will return a
+composite record type with two attributes: the row ID and corresponding
+unnested array result.
 
 @anchor related
 @par Related Topics
@@ -636,3 +667,72 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.array_cum_prod(x 
anyarray) RETURNS anya
 AS 'MODULE_PATHNAME', 'array_cum_prod'
 LANGUAGE C IMMUTABLE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
+
+/**
+ * @brief This function takes a 2-D array as the input and unnests it
+ *        by one level.
+ *        It returns a set of 1-D arrays that correspond to rows of the
+ *        input array as well as an ID column containing row positions 
occupied by
+ *        those 1-D arrays within the 2-D array (the ID column values start 
with
+ *        1 and not 0)
+ *
+ * @param x Array x
+ * @returns Set of 1-D arrays that corrspond to rows of x and an ID column.
+ *
+ */
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.array_unnest_2d_to_1d(
+  x ANYARRAY,
+  OUT unnest_row_id INT,
+  OUT unnest_result ANYARRAY
+)
+RETURNS SETOF RECORD
+AS
+$BODY$
+  SELECT t2.r::int, array_agg($1[t2.r][t2.c] order by t2.c) FROM
+  (
+    SELECT generate_series(array_lower($1,2),array_upper($1,2)) as c, t1.r
+    FROM
+    (
+      SELECT generate_series(array_lower($1,1),array_upper($1,1)) as r
+    ) t1
+  ) t2
+GROUP BY t2.r
+$BODY$ LANGUAGE SQL IMMUTABLE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.array_unnest_2d_to_1d()
+RETURNS TEXT AS $$
+    return """
+------------------------------------------------------------------
+                        SUMMARY
+------------------------------------------------------------------
+This function takes a 2-D array as the input and unnests it by
+one level.
+It returns a set of 1-D arrays that correspond to rows of the
+input array as well as an ID column containing row positions occupied by
+those 1-D arrays within the 2-D array (the ID column values start with
+1 and not 0).
+
+------------------------------------------------------------------
+                        USAGE
+------------------------------------------------------------------
+
+ SELECT ({schema_madlib}.array_unnest_2d_to_1d(input_array)).* from 
input_table;
+
+If the function is called without the .* notation then it will return a
+composite record type with two attributes: the row ID and corresponding
+unnested array result.
+
+------------------------------------------------------------------
+                        EXAMPLE
+------------------------------------------------------------------
+SELECT id, (madlib.array_unnest_2d_to_1d(val)).*
+FROM (
+  SELECT 1::INT AS id, ARRAY[[1.3,2.0,3.2],[10.3,20.0,32.2]]::FLOAT8[][] AS val
+  UNION ALL
+  SELECT 2, ARRAY[[pi(),pi()/2],[2*pi(),pi()],[pi()/4,4*pi()]]::FLOAT8[][]
+) t
+ORDER BY 1,2;
+        """.format(schema_madlib='MADLIB_SCHEMA')
+$$ LANGUAGE PLPYTHONU IMMUTABLE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/3af18a93/methods/array_ops/src/pg_gp/test/array_ops.sql_in
----------------------------------------------------------------------
diff --git a/methods/array_ops/src/pg_gp/test/array_ops.sql_in 
b/methods/array_ops/src/pg_gp/test/array_ops.sql_in
index 473e32e..b05d0b7 100644
--- a/methods/array_ops/src/pg_gp/test/array_ops.sql_in
+++ b/methods/array_ops/src/pg_gp/test/array_ops.sql_in
@@ -89,3 +89,221 @@ SELECT array_scalar_mult(
     (1.0/MADLIB_SCHEMA.array_sum(ARRAY[1.,2,3,4]))
 );
 
+--------------------------------------------------------------
+-- TESTING array_unnest_2d_to_1d FUNCTION
+--------------------------------------------------------------
+-- 2-element float8 arrays
+DROP TABLE IF EXISTS unnest_2d_tbl01;
+CREATE TABLE unnest_2d_tbl01 (id INT, val DOUBLE PRECISION[][]);
+INSERT INTO unnest_2d_tbl01 VALUES
+  (1, ARRAY[[1::float8,2],[3::float8,4],[5::float8,6]]),
+  (2, ARRAY[[101::float8,202],[303::float8,404],[505::float8,606]])
+;
+
+DROP TABLE IF EXISTS unnest_2d_tbl01_groundtruth;
+CREATE TABLE unnest_2d_tbl01_groundtruth (
+  id INT,
+  unnest_row_id INT,
+  val DOUBLE PRECISION[]
+);
+INSERT INTO unnest_2d_tbl01_groundtruth VALUES
+  (1, 1, ARRAY[1::float8,2]),
+  (1, 2, ARRAY[3::float8,4]),
+  (1, 3, ARRAY[5::float8,6]),
+  (2, 1, ARRAY[101::float8,202]),
+  (2, 2, ARRAY[303::float8,404]),
+  (2, 3, ARRAY[505::float8,606])
+;
+
+DROP TABLE IF EXISTS unnest_2d_tbl01_out;
+CREATE TABLE unnest_2d_tbl01_out AS
+  SELECT id, (array_unnest_2d_to_1d(val)).* FROM unnest_2d_tbl01;
+
+SELECT assert(
+  unnest_result = val,
+  'array_unnest_2d_to_1d: Wrong results for test table "unnest_2d_tbl01"'
+)
+FROM (
+  SELECT * FROM
+    unnest_2d_tbl01_out t1
+    JOIN
+    unnest_2d_tbl01_groundtruth t2
+    USING (id,unnest_row_id)
+) t3;
+
+-- 3-element float8 arrays
+DROP TABLE IF EXISTS unnest_2d_tbl02;
+CREATE TABLE unnest_2d_tbl02 (id INT, val DOUBLE PRECISION[][]);
+INSERT INTO unnest_2d_tbl02 VALUES
+  (1, ARRAY[[1.57::float8,2,3],[4::float8,5,6]]),
+  (2, ARRAY[[101::float8,202,303],[PI(),505,606]]),
+  (3, ARRAY[[1011::float8,2022,3033],[4044,5055,60.66]])
+;
+
+DROP TABLE IF EXISTS unnest_2d_tbl02_groundtruth;
+CREATE TABLE unnest_2d_tbl02_groundtruth (
+  id INT,
+  unnest_row_id INT,
+  val DOUBLE PRECISION[]
+);
+INSERT INTO unnest_2d_tbl02_groundtruth VALUES
+  (1, 1, array[1.57::float8,2,3]),
+  (1, 2, array[4::float8,5,6]),
+  (2, 1, array[101::float8,202,303]),
+  (2, 2, array[pi(),505,606]),
+  (3, 1, array[1011::float8,2022,3033]),
+  (3, 2, array[4044,5055,60.66])
+;
+
+DROP TABLE IF EXISTS unnest_2d_tbl02_out;
+CREATE TABLE unnest_2d_tbl02_out AS
+  SELECT id, (array_unnest_2d_to_1d(val)).* FROM unnest_2d_tbl02;
+
+SELECT assert(
+  unnest_result = val,
+  'array_unnest_2d_to_1d: Wrong results for test table "unnest_2d_tbl02"'
+)
+FROM (
+  SELECT * FROM
+    unnest_2d_tbl02_out t1
+    JOIN
+    unnest_2d_tbl02_groundtruth t2
+    USING (id,unnest_row_id)
+) t3;
+
+-- 2-element text arrays
+DROP TABLE IF EXISTS unnest_2d_tbl03;
+CREATE TABLE unnest_2d_tbl03 (id INT, val TEXT[][]);
+INSERT INTO unnest_2d_tbl03 VALUES
+  (1, ARRAY[['a','b'],['c','d'],['e','f']]),
+  (2, ARRAY[['apple','banana'],['cherries','kiwi'],['lemon','mango']])
+;
+
+DROP TABLE IF EXISTS unnest_2d_tbl03_groundtruth;
+CREATE TABLE unnest_2d_tbl03_groundtruth (
+  id INT,
+  unnest_row_id INT,
+  val TEXT[]
+);
+INSERT INTO unnest_2d_tbl03_groundtruth VALUES
+  (1, 1, ARRAY['a','b']),
+  (1, 2, ARRAY['c','d']),
+  (1, 3, ARRAY['e','f']),
+  (2, 1, ARRAY['apple','banana']),
+  (2, 2, ARRAY['cherries','kiwi']),
+  (2, 3, ARRAY['lemon','mango'])
+;
+
+DROP TABLE IF EXISTS unnest_2d_tbl03_out;
+CREATE TABLE unnest_2d_tbl03_out AS
+  SELECT id, (array_unnest_2d_to_1d(val)).* FROM unnest_2d_tbl03;
+
+SELECT assert(
+  unnest_result = val,
+  'array_unnest_2d_to_1d: Wrong results for test table "unnest_2d_tbl03"'
+)
+FROM (
+  SELECT * FROM
+    unnest_2d_tbl03_out t1
+    JOIN
+    unnest_2d_tbl03_groundtruth t2
+    USING (id,unnest_row_id)
+) t3;
+
+-- 3-element float8 arrays with some NULLs
+DROP TABLE IF EXISTS unnest_2d_tbl04;
+CREATE TABLE unnest_2d_tbl04 (id INT, val DOUBLE PRECISION[][]);
+INSERT INTO unnest_2d_tbl04 VALUES
+  (1, ARRAY[[1::float8,NULL,3],[4.0,5,NULL]]),
+  (2, ARRAY[[101::float8,NULL,303],
+            [NULL::float8,NULL,NULL]]::double precision[][]),
+  (3, ARRAY[[NULL,2022::float8],[4044::float8,NULL]])
+;
+
+DROP TABLE IF EXISTS unnest_2d_tbl04_groundtruth;
+CREATE TABLE unnest_2d_tbl04_groundtruth (
+  id INT,
+  unnest_row_id INT,
+  val DOUBLE PRECISION[]
+);
+INSERT INTO unnest_2d_tbl04_groundtruth VALUES
+  (1, 1, ARRAY[1::float8,NULL,3]),
+  (1, 2, ARRAY[4.0::float8,5,NULL]),
+  (2, 1, ARRAY[101::float8,NULL,303]),
+  (2, 2, ARRAY[NULL::float8,NULL,NULL]),
+  (3, 1, ARRAY[NULL,2022::float8]),
+  (3, 2, ARRAY[4044::float8,NULL])
+;
+
+DROP TABLE IF EXISTS unnest_2d_tbl04_out;
+CREATE TABLE unnest_2d_tbl04_out AS
+  SELECT id, (array_unnest_2d_to_1d(val)).* FROM unnest_2d_tbl04;
+
+SELECT assert(
+  unnest_result = val,
+  'array_unnest_2d_to_1d: Wrong results for test table "unnest_2d_tbl04"'
+)
+FROM (
+  SELECT * FROM
+    unnest_2d_tbl04_out t1
+    JOIN
+    unnest_2d_tbl04_groundtruth t2
+    USING (id,unnest_row_id)
+) t3;
+
+-- 3-element timestamp arrays with NULLs
+DROP TABLE IF EXISTS unnest_2d_tbl05;
+CREATE TABLE unnest_2d_tbl05 (id INT, val TIMESTAMP WITHOUT TIME ZONE[][]);
+INSERT INTO unnest_2d_tbl05 VALUES
+  (1, array[['2017-01-01 11:00:02'::TIMESTAMP WITHOUT TIME ZONE,
+             '2017-01-01 13:00:05',
+             '2017-01-02 11:55:00'],
+            ['2016-10-12 12:00:02'::TIMESTAMP WITHOUT TIME ZONE,
+             '2016-10-12 13:15:22',
+             NULL]]),
+  (2, NULL),
+  (3, array[['2014-02-01 11:00:02'::TIMESTAMP WITHOUT TIME ZONE,
+             '2014-02-01 13:00:05',
+             '2014-02-02 11:55:00'],
+            ['2013-07-12 12:00:02'::TIMESTAMP WITHOUT TIME ZONE,
+             NULL,
+             '2013-07-12 13:15:22']])
+;
+
+DROP TABLE IF EXISTS unnest_2d_tbl05_groundtruth;
+CREATE TABLE unnest_2d_tbl05_groundtruth (
+  id INT,
+  unnest_row_id INT,
+  val TIMESTAMP WITHOUT TIME ZONE[]
+);
+INSERT INTO unnest_2d_tbl05_groundtruth VALUES
+  (1, 1, ARRAY['2017-01-01 11:00:02'::TIMESTAMP WITHOUT TIME ZONE,
+               '2017-01-01 13:00:05',
+               '2017-01-02 11:55:00']),
+  (1, 2, ARRAY['2016-10-12 12:00:02'::TIMESTAMP WITHOUT TIME ZONE,
+               '2016-10-12 13:15:22',
+               NULL]),
+  (2, NULL, NULL),
+  (3, 1, ARRAY['2014-02-01 11:00:02'::TIMESTAMP WITHOUT TIME ZONE,
+               '2014-02-01 13:00:05',
+               '2014-02-02 11:55:00']),
+  (3, 2, ARRAY['2013-07-12 12:00:02'::TIMESTAMP WITHOUT TIME ZONE,
+               NULL,
+               '2013-07-12 13:15:22'])
+;
+
+DROP TABLE IF EXISTS unnest_2d_tbl05_out;
+CREATE TABLE unnest_2d_tbl05_out AS
+  SELECT id, (array_unnest_2d_to_1d(val)).* FROM unnest_2d_tbl05;
+
+SELECT assert(
+  unnest_result = val,
+  'array_unnest_2d_to_1d: Wrong results for test table "unnest_2d_tbl05"'
+)
+FROM (
+  SELECT * FROM
+    unnest_2d_tbl05_out t1
+    JOIN
+    unnest_2d_tbl05_groundtruth t2
+    USING (id,unnest_row_id)
+) t3;

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/3af18a93/src/ports/postgres/modules/kmeans/kmeans.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/kmeans/kmeans.sql_in 
b/src/ports/postgres/modules/kmeans/kmeans.sql_in
index f689dd6..b3cdd55 100644
--- a/src/ports/postgres/modules/kmeans/kmeans.sql_in
+++ b/src/ports/postgres/modules/kmeans/kmeans.sql_in
@@ -239,75 +239,98 @@ INSERT INTO km_sample VALUES
 </pre>
 -#  Run k-means clustering using kmeans++ for centroid seeding:
 <pre class="example">
+DROP TABLE IF EXISTS km_result;
+-- Run kmeans algorithm
+CREATE TABLE km_result AS
+SELECT * FROM madlib.kmeanspp('km_sample', 'points', 2,
+                           'madlib.squared_dist_norm2',
+                           'madlib.avg', 20, 0.001);
 \\x on;
-SELECT * FROM madlib.kmeanspp( 'km_sample',   -- Table of source data
-                               'points',      -- Column containing point 
co-ordinates 
-                               2,             -- Number of centroids to 
calculate
-                               'madlib.squared_dist_norm2',   -- Distance 
function
-                               'madlib.avg',  -- Aggregate function
-                               20,            -- Number of iterations
-                               0.001          -- Fraction of centroids 
reassigned to keep iterating 
-                             );
+SELECT * FROM km_result;
 </pre>
 Result:
 <pre class="result">
-centroids        | 
{{13.7533333333333,1.905,2.425,16.0666666666667,90.3333333333333,2.805,2.98,0.29,2.005,5.40663333333333,1.04166666666667,
 3.31833333333333,1020.83333333333},
-                   
{14.255,1.9325,2.5025,16.05,110.5,3.055,2.9775,0.2975,1.845,6.2125,0.9975,3.365,1378.75}}
-cluster_variance | {122999.110416013,30561.74805}
-objective_fn     | 153560.858466013
+centroids        | 
{{14.036,2.018,2.536,16.56,108.6,3.004,3.03,0.298,2.038,6.10598,1.004,3.326,1340},{13.872,1.814,2.376,15.56,88.2,2.806,2.928,0.288,1.844,5.35198,1.044,3.348,988}}
+cluster_variance | {60672.638245208,90512.324426408}
+objective_fn     | 151184.962671616
 frac_reassigned  | 0
-num_iterations   | 3
+num_iterations   | 2
 </pre>
 -# Calculate the simplified silhouette coefficient:
 <pre class="example">
 SELECT * FROM madlib.simple_silhouette( 'km_sample',
                                         'points',
-                                        (SELECT centroids FROM
-                                            madlib.kmeanspp('km_sample',
-                                                            'points',
-                                                            2,
-                                                            
'madlib.squared_dist_norm2',
-                                                            'madlib.avg',
-                                                            20,
-                                                            0.001)),
+                                        (SELECT centroids FROM km_result),
                                         'madlib.dist_norm2'
                                       );
 </pre>
 Result:
 <pre class="result">
-simple_silhouette | 0.686314347664694
+simple_silhouette | 0.68978804882941
 </pre>
 
 -#  Find the cluster assignment for each point:
 <pre class="example">
 \\x off;
-DROP TABLE IF EXISTS km_result;
--- Run kmeans algorithm
-CREATE TABLE km_result AS
-SELECT * FROM madlib.kmeanspp('km_sample', 'points', 2,
-                           'madlib.squared_dist_norm2',
-                           'madlib.avg', 20, 0.001); 
 -- Get point assignment
 SELECT data.*,  (madlib.closest_column(centroids, points)).column_id as 
cluster_id
 FROM km_sample as data, km_result
 ORDER BY data.pid;
 </pre>
+Result:
 <pre class="result">
- pid |                               points                               | 
cluster_id 
+ pid |                               points                               | 
cluster_id
 
-----+--------------------------------------------------------------------+------------
-   1 | {14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065}  |    
      0
-   2 | {13.2,1.78,2.14,11.2,1,2.65,2.76,0.26,1.28,4.38,1.05,3.49,1050}    |    
      0
+   1 | {14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065}  |    
      1
+   2 | {13.2,1.78,2.14,11.2,1,2.65,2.76,0.26,1.28,4.38,1.05,3.49,1050}    |    
      1
    3 | {13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.6799,1.03,3.17,1185} |    
      0
    4 | {14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480}   |    
      0
    5 | {13.24,2.59,2.87,21,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735}     |    
      1
    6 | {14.2,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450}  |    
      0
    7 | {14.39,1.87,2.45,14.6,96,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290}    |    
      0
    8 | {14.06,2.15,2.61,17.6,121,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295}  |    
      0
-   9 | {14.83,1.64,2.17,14,97,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045}      |    
      0
-  10 | {13.86,1.35,2.27,16,98,2.98,3.15,0.22,1.85,7.2199,1.01,3.55,1045}  |    
      0
+   9 | {14.83,1.64,2.17,14,97,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045}      |    
      1
+  10 | {13.86,1.35,2.27,16,98,2.98,3.15,0.22,1.85,7.2199,1.01,3.55,1045}  |    
      1
 (10 rows)
 </pre>
 
+-#  Unnest the cluster centroids 2-D array to get a set of 1-D centroid arrays:
+<pre class="example">
+DROP TABLE IF EXISTS km_centroids_unnest;
+-- Run unnest function
+CREATE TABLE km_centroids_unnest AS
+SELECT (madlib.array_unnest_2d_to_1d(centroids)).*
+FROM km_result;
+SELECT * FROM km_centroids_unnest ORDER BY 1;
+</pre>
+Result:
+<pre class="result">
+ unnest_row_id |                                  unnest_result
+---------------+----------------------------------------------------------------------------------
+             1 | 
{14.036,2.018,2.536,16.56,108.6,3.004,3.03,0.298,2.038,6.10598,1.004,3.326,1340}
+             2 | 
{13.872,1.814,2.376,15.56,88.2,2.806,2.928,0.288,1.844,5.35198,1.044,3.348,988}
+(2 rows)
+</pre>
+Note that the ID column returned by array_unnest_2d_to_1d()
+is not guaranteed to be the same as the cluster ID assigned by k-means.
+See below to create the correct cluster IDs.
+
+-#  Create cluster IDs for 1-D centroid arrays so that cluster ID for any 
centroid
+can be matched to the cluster assignment for the data points:
+<pre class="example">
+SELECT cent.*,  (madlib.closest_column(centroids, unnest_result)).column_id as 
cluster_id
+FROM km_centroids_unnest as cent, km_result
+ORDER BY cent.unnest_row_id;
+</pre>
+Result:
+<pre class="result">
+ unnest_row_id |                                  unnest_result                
                   | cluster_id
+---------------+----------------------------------------------------------------------------------+------------
+             1 | 
{14.036,2.018,2.536,16.56,108.6,3.004,3.03,0.298,2.038,6.10598,1.004,3.326,1340}
 |          0
+             2 | 
{13.872,1.814,2.376,15.56,88.2,2.806,2.928,0.288,1.844,5.35198,1.044,3.348,988} 
 |          1
+(2 rows)
+</pre>
+
 -#  Run the same example as above, but using array input.  Create the input 
table:
 <pre class="example">
 DROP TABLE IF EXISTS km_arrayin CASCADE;

[24/34] incubator-madlib git commit: Array Operations: Unnest 2-D arrays by one level.

Reply via email to