This is an automated email from the ASF dual-hosted git repository.

jingyimei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
     new a16dffd  DL: Improve minibatch performance by using array_cat
a16dffd is described below

commit a16dffded5dbdb41cf45fcf34fb4cf7a06107e11
Author: Ekta Khanna <[email protected]>
AuthorDate: Thu May 9 12:23:03 2019 -0700

    DL: Improve minibatch performance by using array_cat
    
    JIRA: MADLIB-1334
    
    Prior to this commit, the DL-specific minibatch preprocessor used a
    custom array concat function. This commit replaces it with the Postgres
    built-in array_cat function, which performs better.
    
    Closes #390
    
    Co-authored-by: Orhan Kislal <[email protected]>
---
 .../modules/deep_learning/input_data_preprocessor.py_in |  8 ++++----
 .../deep_learning/input_data_preprocessor.sql_in        | 17 ++---------------
 .../modules/utilities/minibatch_preprocessing.py_in     |  4 ++--
 3 files changed, 8 insertions(+), 21 deletions(-)

diff --git 
a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in 
b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
index 4a2e7e4..1aa34f1 100644
--- a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
+++ b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
@@ -187,10 +187,10 @@ class InputDataPreprocessorDL(object):
             SELECT * FROM
             (
                 SELECT {self.schema_madlib}.agg_array_concat(
-                    ARRAY[{norm_tbl}.x_norm::REAL[]]) AS {x},
-                    {self.schema_madlib}.agg_array_concat(
-                    ARRAY[{norm_tbl}.y]) AS {y},
-                    ({norm_tbl}.row_id%{self.num_of_buffers})::smallint AS 
buffer_id
+                            ARRAY[{norm_tbl}.x_norm::REAL[]]) AS {x},
+                       {self.schema_madlib}.agg_array_concat(
+                            ARRAY[{norm_tbl}.y]) AS {y},
+                       ({norm_tbl}.row_id%{self.num_of_buffers})::smallint AS 
buffer_id
                 FROM {norm_tbl}
                 GROUP BY buffer_id
             ) b
diff --git 
a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.sql_in 
b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.sql_in
index b9443ff..f2d9591 100644
--- a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.sql_in
+++ b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.sql_in
@@ -826,22 +826,9 @@ $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
 
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.agg_array_concat_transition(anyarray, 
anyarray)
-  RETURNS anyarray
-   AS 'select $1 || $2'
-   LANGUAGE SQL
-   IMMUTABLE;
-
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.agg_array_concat_merge(anyarray, 
anyarray)
-  RETURNS anyarray
-   AS 'select $1 || $2'
-   LANGUAGE SQL
-   IMMUTABLE
-   RETURNS NULL ON NULL INPUT;
-
 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.agg_array_concat(anyarray);
 CREATE AGGREGATE MADLIB_SCHEMA.agg_array_concat(anyarray) (
-   SFUNC = MADLIB_SCHEMA.agg_array_concat_transition,
+   SFUNC = array_cat,
    STYPE = anyarray,
-   PREFUNC = MADLIB_SCHEMA.agg_array_concat_merge
+   PREFUNC = array_cat
    );
diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in 
b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
index 03c8214..e03bf44 100644
--- a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
+++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
@@ -286,8 +286,8 @@ class MiniBatchPreProcessor:
             FROM (
                 SELECT (row_number() OVER ({partition_by} ORDER BY random()) - 
1)
                         / {buffer_size}
-                            as {row_id}, * FROM
-                (
+                            as {row_id}, *
+                FROM (
                     {standardize_query}
                  ) sub_query_1
                  WHERE NOT 
{self.schema_madlib}.array_contains_null({dep_colname})

Reply via email to