This is an automated email from the ASF dual-hosted git repository.
okislal pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git
The following commit(s) were added to refs/heads/master by this push:
new 141b4e4 Pivot: Fix array_agg + distinct scaling issue on gpdb
141b4e4 is described below
commit 141b4e4a00a2f64185db54f789384e0ad6cf6ae8
Author: Orhan Kislal <[email protected]>
AuthorDate: Thu May 9 15:38:10 2019 -0700
Pivot: Fix array_agg + distinct scaling issue on gpdb
JIRA: MADLIB-1361
With large datasets, pivot fails because of the array_agg(distinct)
query. This is because array_agg collects the values first and filters
the distinct values later. This causes the array_agg to go out of
memory.
This commit fixes the issue by separating distinct from array_agg.
We use a subquery to get the distinct values. Then we aggregate these
values using array_agg.
Closes #413
Co-authored-by: Ekta Khanna <[email protected]>
---
src/ports/postgres/modules/utilities/pivot.py_in | 36 +++++++++++++++---------
1 file changed, 22 insertions(+), 14 deletions(-)
diff --git a/src/ports/postgres/modules/utilities/pivot.py_in
b/src/ports/postgres/modules/utilities/pivot.py_in
index 6d0ebae..d3d4e91 100644
--- a/src/ports/postgres/modules/utilities/pivot.py_in
+++ b/src/ports/postgres/modules/utilities/pivot.py_in
@@ -135,20 +135,28 @@ def pivot(schema_madlib, source_table, out_table, index,
pivot_cols,
agg_dict = parse_aggregates(pvals, aggregate_func)
validate_output_types(source_table, agg_dict, is_array_output)
-
- # Find the distinct values of pivot_cols
- array_agg_str = ', '.join("array_agg(DISTINCT {pcol}) AS
{pcol}_values".
- format(pcol=pcol) for pcol in pcols)
- if keep_null:
- # Some platforms don't include NULL values as part of the
array_agg(DISTINCT ...)
- # Below clause checks explicitly for NULL values
- null_str = ", " + ', '.join(
- "bool_or(CASE WHEN {pcol} IS NULL THEN True ELSE False END)"
- "AS {pcol}_isnull".format(pcol=pcol) for pcol in pcols)
- else:
- null_str = ""
- distinct_values = plpy.execute("SELECT {0} {1} FROM {2}".
- format(array_agg_str, null_str,
source_table))[0]
+ distinct_values = {}
+ for pcol in pcols:
+ # Find the distinct values of pivot_cols
+ distinct_pcol_values_sql = "(SELECT DISTINCT {pcol} AS
{pcol}_values FROM {source_table})tmp_x".format(**locals())
+
+ array_agg_str = "array_agg({pcol}_values) AS
{pcol}_values".format(pcol=pcol)
+ pcol_name = "{pcol}_values".format(**locals())
+ distinct_values[pcol_name] = (
+ plpy.execute(""" SELECT {array_agg_str}
+ FROM {distinct_pcol_values_sql}
+ """.format(**locals())))[0][pcol_name]
+ if keep_null:
+ # Some platforms don't include NULL values as part of the
array_agg(DISTINCT ...)
+ # Below clause checks explicitly for NULL values
+ pcol_null_name = "{pcol}_isnull".format(**locals())
+ distinct_values[pcol_null_name] = (
+ plpy.execute("""
+ SELECT bool_or(CASE WHEN {pcol} IS NULL THEN True
+ ELSE False END)
+ AS {pcol}_isnull
+ FROM {source_table}
+ """.format(**locals())))[0][pcol_null_name]
# Collect the distinct values for every pivot column into a dictionary
pcol_distinct_values = {}