Frank McQuillan created MADLIB-1018:
---------------------------------------

             Summary: Fix K-means support for array input for data points
                 Key: MADLIB-1018
                 URL: https://issues.apache.org/jira/browse/MADLIB-1018
             Project: Apache MADlib
          Issue Type: Bug
          Components: Module: k-Means Clustering
            Reporter: Frank McQuillan
             Fix For: v1.9.2


For k-means, normally you should be able to do array[col1, col2…] for the 2nd 
parameter, but that does not work.  This JIRA is to be able to support 
array[col1, col2…].

{code}
expr_point
TEXT. The name of the column with point coordinates.
{code}

{code}
SELECT madlib.kmeans_random('customers_train',
               'array[creditamount, accountbalance]',
               3
             );
{code}

produces

{code}
---------------------------------------------------------------------------
InternalError                             Traceback (most recent call last)
<ipython-input-50-0b939dd162ef> in <module>()
----> 1 get_ipython().run_cell_magic(u'sql', u'', u"\nSELECT 
madlib.kmeans_random('customers_train',\n               'array[creditamount, 
accountbalance]',\n               3\n             );\n")

/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc
 in run_cell_magic(self, magic_name, line, cell)
   2291             magic_arg_s = self.var_expand(line, stack_depth)
   2292             with self.builtin_trap:
-> 2293                 result = fn(magic_arg_s, cell)
   2294             return result
   2295 

/Users/fmcquillan/anaconda/lib/python2.7/site-packages/sql/magic.pyc in 
execute(self, line, cell, local_ns)

/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/core/magic.pyc 
in <lambda>(f, *a, **k)
    191     # but it's overkill for just that one bit of state.
    192     def magic_deco(arg):
--> 193         call = lambda f, *a, **k: f(*a, **k)
    194 
    195         if callable(arg):

/Users/fmcquillan/anaconda/lib/python2.7/site-packages/sql/magic.pyc in 
execute(self, line, cell, local_ns)

/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/core/magic.pyc 
in <lambda>(f, *a, **k)
    191     # but it's overkill for just that one bit of state.
    192     def magic_deco(arg):
--> 193         call = lambda f, *a, **k: f(*a, **k)
    194 
    195         if callable(arg):

/Users/fmcquillan/anaconda/lib/python2.7/site-packages/sql/magic.pyc in 
execute(self, line, cell, local_ns)
     78             return self._persist_dataframe(parsed['sql'], conn, user_ns)
     79         try:
---> 80             result = sql.run.run(conn, parsed['sql'], self, user_ns)
     81             return result
     82         except (ProgrammingError, OperationalError) as e:

/Users/fmcquillan/anaconda/lib/python2.7/site-packages/sql/run.pyc in run(conn, 
sql, config, user_namespace)
    270                 raise Exception("ipython_sql does not support 
transactions")
    271             txt = sqlalchemy.sql.text(statement)
--> 272             result = conn.session.execute(txt, user_namespace)
    273             try:
    274                 conn.session.execute('commit')

/Users/fmcquillan/anaconda/lib/python2.7/site-packages/sqlalchemy/engine/base.pyc
 in execute(self, object, *multiparams, **params)
    912                 type(object))
    913         else:
--> 914             return meth(self, multiparams, params)
    915 
    916     def _execute_function(self, func, multiparams, params):

/Users/fmcquillan/anaconda/lib/python2.7/site-packages/sqlalchemy/sql/elements.pyc
 in _execute_on_connection(self, connection, multiparams, params)
    321 
    322     def _execute_on_connection(self, connection, multiparams, params):
--> 323         return connection._execute_clauseelement(self, multiparams, 
params)
    324 
    325     def unique_params(self, *optionaldict, **kwargs):

/Users/fmcquillan/anaconda/lib/python2.7/site-packages/sqlalchemy/engine/base.pyc
 in _execute_clauseelement(self, elem, multiparams, params)
   1008             compiled_sql,
   1009             distilled_params,
-> 1010             compiled_sql, distilled_params
   1011         )
   1012         if self._has_events or self.engine._has_events:

/Users/fmcquillan/anaconda/lib/python2.7/site-packages/sqlalchemy/engine/base.pyc
 in _execute_context(self, dialect, constructor, statement, parameters, *args)
   1144                 parameters,
   1145                 cursor,
-> 1146                 context)
   1147 
   1148         if self._has_events or self.engine._has_events:

/Users/fmcquillan/anaconda/lib/python2.7/site-packages/sqlalchemy/engine/base.pyc
 in _handle_dbapi_exception(self, e, statement, parameters, cursor, context)
   1339                 util.raise_from_cause(
   1340                     sqlalchemy_exception,
-> 1341                     exc_info
   1342                 )
   1343             else:

/Users/fmcquillan/anaconda/lib/python2.7/site-packages/sqlalchemy/util/compat.pyc
 in raise_from_cause(exception, exc_info)
    197             exc_info = sys.exc_info()
    198         exc_type, exc_value, exc_tb = exc_info
--> 199         reraise(type(exception), exception, tb=exc_tb)
    200 
    201 if py3k:

/Users/fmcquillan/anaconda/lib/python2.7/site-packages/sqlalchemy/engine/base.pyc
 in _execute_context(self, dialect, constructor, statement, parameters, *args)
   1137                         statement,
   1138                         parameters,
-> 1139                         context)
   1140         except Exception as e:
   1141             self._handle_dbapi_exception(

/Users/fmcquillan/anaconda/lib/python2.7/site-packages/sqlalchemy/engine/default.pyc
 in do_execute(self, cursor, statement, parameters, context)
    448 
    449     def do_execute(self, cursor, statement, parameters, context=None):
--> 450         cursor.execute(statement, parameters)
    451 
    452     def do_execute_no_params(self, cursor, statement, context=None):

InternalError: (psycopg2.InternalError) plpy.SPIError: syntax error at or near 
"," (plpython.c:4651)
LINE 44: ...                           _src.array[creditamount, accountb...
                                                              ^
QUERY:  
                SELECT
                    1 AS _iteration,
                    madlib.array_to_1d((_state).centroids) AS centroids,
                    (_state).old_centroid_ids,
                    (_state).objective_fn,
                    (_state).frac_reassigned
                FROM
                (
                    SELECT (
                SELECT
                    CAST((
                        madlib.matrix_agg(
                            _centroid::FLOAT8[]
                            ORDER BY _new_centroid_id),
                        array_agg(_new_centroid_id ORDER BY _new_centroid_id),
                        sum(_objective_fn),
                        CAST(sum(_num_reassigned) AS DOUBLE PRECISION)
                            / sum(_num_points)
                    ) AS madlib.kmeans_state)
                FROM (
                    SELECT
                        (_new_centroid).column_id AS _new_centroid_id,
                        sum((_new_centroid).distance) AS _objective_fn,
                        count(*) AS _num_points,
                        sum(
                            CAST(
                                coalesce(
                                    (CAST(
                                        (SELECT (CAST ((madlib.array_to_2d($1), 
$2, $3, $4)
                            AS madlib.kmeans_state)).old_centroid_ids) AS 
INTEGER[]
                                    ))[(_new_centroid).column_id + 1] != 
_old_centroid_id,
                                    TRUE
                                )
                                AS INTEGER
                            )
                        ) AS _num_reassigned,
                        madlib.avg(_point::FLOAT8[]) AS _centroid
                    FROM (
                        SELECT
                            -- PostgreSQL/Greenplum tuning:
                            -- VOLATILE function as optimization fence
                            madlib.noop(),
                            _src.array[creditamount, accountbalance] AS _point,
                            madlib.closest_column(
                                (SELECT (CAST ((madlib.array_to_2d($1), $2, $3, 
$4)
                            AS madlib.kmeans_state)).centroids)
                                , _src.array[creditamount, 
accountbalance]::FLOAT8[]
                                , 'madlib.squared_dist_norm2'
                                
                                )
                            AS _new_centroid,
                            (madlib.closest_column((SELECT (CAST 
((madlib.array_to_2d($5), $6, $7, $8)
                                AS madlib.kmeans_state)).centroids)
                                    , _src.array[creditamount, 
accountbalance]::FLOAT8[]
                                    , 'madlib.squared_dist_norm2'
                                    
                                    )
                                ).column_id
                             AS _old_centroid_id
                        FROM customers_train AS _src
                        WHERE 
abs(coalesce(madlib.svec_elsum(array[creditamount, accountbalance]), 
'Infinity'::FLOAT8)) < 'Infinity'::FLOAT8
                        AND NOT 
madlib.array_contains_null(_src.array[creditamount, accountbalance]::FLOAT8[])
                    ) AS _points_with_assignments
                    GROUP BY (_new_centroid).column_id
                ) AS _new_centroids
                ) AS _state
                ) q
                
CONTEXT:  Traceback (most recent call last):
  PL/Python function "internal_compute_kmeans", line 22, in <module>
    return kmeans.compute_kmeans(**globals())
  PL/Python function "internal_compute_kmeans", line 332, in compute_kmeans
  PL/Python function "internal_compute_kmeans", line 227, in update
PL/Python function "internal_compute_kmeans"
SQL statement "SELECT  madlib.internal_compute_kmeans( '_madlib_kmeans_args', 
'_madlib_kmeans_state', textin(regclassout( $1 )),  $2 , textin(regprocout( $3 
)))"
PL/pgSQL function "kmeans" line 103 at assignment
SQL statement "SELECT  madlib.kmeans(  $1 ,  $2 , madlib.kmeans_random_seeding( 
$1 ,  $2 ,  $3 ), 'madlib.squared_dist_norm2', 'madlib.avg', 20, 0.001)"
PL/pgSQL function "kmeans_random" line 4 at assignment
 [SQL: "SELECT madlib.kmeans_random('customers_train',\n               
'array[creditamount, accountbalance]',\n               3\n             );"]
{code}

The workaround is to create a view:

{code}
CREATE VIEW cluster_params AS (SELECT *, array[creditamount, accountbalance] as 
p1 FROM customers_train);
SELECT madlib.kmeans_random('cluster_params',
               'p1',
               3
             );
{code} 



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to