This is an automated email from the ASF dual-hosted git repository.
jingyimei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git
The following commit(s) were added to refs/heads/master by this push:
new a16dffd DL: Improve minibatch performance by using array_cat
a16dffd is described below
commit a16dffded5dbdb41cf45fcf34fb4cf7a06107e11
Author: Ekta Khanna <[email protected]>
AuthorDate: Thu May 9 12:23:03 2019 -0700
DL: Improve minibatch performance by using array_cat
JIRA: MADLIB-1334
Prior to this commit, the DL-specific minibatch preprocessor used a
custom array concat function. This commit replaces it with the Postgres
built-in array_cat function because it performs better.
Closes #390
Co-authored-by: Orhan Kislal <[email protected]>
---
.../modules/deep_learning/input_data_preprocessor.py_in | 8 ++++----
.../deep_learning/input_data_preprocessor.sql_in | 17 ++---------------
.../modules/utilities/minibatch_preprocessing.py_in | 4 ++--
3 files changed, 8 insertions(+), 21 deletions(-)
diff --git
a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
index 4a2e7e4..1aa34f1 100644
--- a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
+++ b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
@@ -187,10 +187,10 @@ class InputDataPreprocessorDL(object):
SELECT * FROM
(
SELECT {self.schema_madlib}.agg_array_concat(
- ARRAY[{norm_tbl}.x_norm::REAL[]]) AS {x},
- {self.schema_madlib}.agg_array_concat(
- ARRAY[{norm_tbl}.y]) AS {y},
- ({norm_tbl}.row_id%{self.num_of_buffers})::smallint AS
buffer_id
+ ARRAY[{norm_tbl}.x_norm::REAL[]]) AS {x},
+ {self.schema_madlib}.agg_array_concat(
+ ARRAY[{norm_tbl}.y]) AS {y},
+ ({norm_tbl}.row_id%{self.num_of_buffers})::smallint AS
buffer_id
FROM {norm_tbl}
GROUP BY buffer_id
) b
diff --git
a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.sql_in
b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.sql_in
index b9443ff..f2d9591 100644
--- a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.sql_in
+++ b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.sql_in
@@ -826,22 +826,9 @@ $$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.agg_array_concat_transition(anyarray,
anyarray)
- RETURNS anyarray
- AS 'select $1 || $2'
- LANGUAGE SQL
- IMMUTABLE;
-
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.agg_array_concat_merge(anyarray,
anyarray)
- RETURNS anyarray
- AS 'select $1 || $2'
- LANGUAGE SQL
- IMMUTABLE
- RETURNS NULL ON NULL INPUT;
-
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.agg_array_concat(anyarray);
CREATE AGGREGATE MADLIB_SCHEMA.agg_array_concat(anyarray) (
- SFUNC = MADLIB_SCHEMA.agg_array_concat_transition,
+ SFUNC = array_cat,
STYPE = anyarray,
- PREFUNC = MADLIB_SCHEMA.agg_array_concat_merge
+ PREFUNC = array_cat
);
diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
index 03c8214..e03bf44 100644
--- a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
+++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
@@ -286,8 +286,8 @@ class MiniBatchPreProcessor:
FROM (
SELECT (row_number() OVER ({partition_by} ORDER BY random()) -
1)
/ {buffer_size}
- as {row_id}, * FROM
- (
+ as {row_id}, *
+ FROM (
{standardize_query}
) sub_query_1
WHERE NOT
{self.schema_madlib}.array_contains_null({dep_colname})