This is an automated email from the ASF dual-hosted git repository.

okislal pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
     new 141b4e4  Pivot: Fix array_agg + distinct scaling issue on gpdb
141b4e4 is described below

commit 141b4e4a00a2f64185db54f789384e0ad6cf6ae8
Author: Orhan Kislal <[email protected]>
AuthorDate: Thu May 9 15:38:10 2019 -0700

    Pivot: Fix array_agg + distinct scaling issue on gpdb
    
    JIRA: MADLIB-1361
    
    With large datasets, pivot fails because of the array_agg(distinct)
    query. This is because array_agg collects the values first and filters
    the distinct values later. This causes the array_agg to go out of
    memory.
    
    This commit fixes the issue by separating distinct from array_agg.
    We use a subquery to get the distinct values. Then we aggregate these
    values using array_agg.
    
    Closes #413
    
    Co-authored-by: Ekta Khanna <[email protected]>
---
 src/ports/postgres/modules/utilities/pivot.py_in | 36 +++++++++++++++---------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/src/ports/postgres/modules/utilities/pivot.py_in 
b/src/ports/postgres/modules/utilities/pivot.py_in
index 6d0ebae..d3d4e91 100644
--- a/src/ports/postgres/modules/utilities/pivot.py_in
+++ b/src/ports/postgres/modules/utilities/pivot.py_in
@@ -135,20 +135,28 @@ def pivot(schema_madlib, source_table, out_table, index, 
pivot_cols,
         agg_dict = parse_aggregates(pvals, aggregate_func)
 
         validate_output_types(source_table, agg_dict, is_array_output)
-
-        # Find the distinct values of pivot_cols
-        array_agg_str = ', '.join("array_agg(DISTINCT {pcol}) AS 
{pcol}_values".
-                                  format(pcol=pcol) for pcol in pcols)
-        if keep_null:
-            # Some platforms don't include NULL values as part of the 
array_agg(DISTINCT ...)
-            # Below clause checks explicitly for NULL values
-            null_str = ", " + ', '.join(
-                "bool_or(CASE WHEN {pcol} IS NULL THEN True ELSE False END)"
-                "AS {pcol}_isnull".format(pcol=pcol) for pcol in pcols)
-        else:
-            null_str = ""
-        distinct_values = plpy.execute("SELECT {0} {1} FROM {2}".
-                                       format(array_agg_str, null_str, 
source_table))[0]
+        distinct_values = {}
+        for pcol in pcols:
+            # Find the distinct values of pivot_cols
+            distinct_pcol_values_sql = "(SELECT DISTINCT {pcol} AS 
{pcol}_values FROM {source_table})tmp_x".format(**locals())
+
+            array_agg_str = "array_agg({pcol}_values) AS 
{pcol}_values".format(pcol=pcol)
+            pcol_name = "{pcol}_values".format(**locals())
+            distinct_values[pcol_name] = (
+                plpy.execute(""" SELECT {array_agg_str}
+                                 FROM {distinct_pcol_values_sql}
+                             """.format(**locals())))[0][pcol_name]
+            if keep_null:
+                # Some platforms don't include NULL values as part of the 
array_agg(DISTINCT ...)
+                # Below clause checks explicitly for NULL values
+                pcol_null_name = "{pcol}_isnull".format(**locals())
+                distinct_values[pcol_null_name] = (
+                    plpy.execute("""
+                        SELECT bool_or(CASE WHEN {pcol} IS NULL THEN True
+                                                                ELSE False END)
+                                AS {pcol}_isnull
+                        FROM {source_table}
+                    """.format(**locals())))[0][pcol_null_name]
 
         # Collect the distinct values for every pivot column into a dictionary
         pcol_distinct_values = {}

Reply via email to