Re: Memory-Bounded Hash Aggregation

Adam Lee Tue, 10 Dec 2019 13:35:37 -0800

On Wed, Dec 04, 2019 at 10:57:51PM -0800, Jeff Davis wrote:
> > About the `TODO: project needed attributes only` in your patch, when
> > would the input tuple contain columns not needed? It seems like
> > anything
> > you can project has to be in the group or aggregates.
> 
> If you have a table like:
> 
>    CREATE TABLE foo(i int, j int, x int, y int, z int);
> 
> And do:
> 
>    SELECT i, SUM(j) FROM foo GROUP BY i;
> 
> At least from a logical standpoint, you might expect that we project
> only the attributes we need from foo before feeding them into the
> HashAgg. But that's not quite how postgres works. Instead, it leaves
> the tuples intact (which, in this case, means they have 5 attributes)
> until after aggregation and lazily fetches whatever attributes are
> referenced. Tuples are spilled from the input, at which time they still
> have 5 attributes; so naively copying them is wasteful.
> 
> I'm not sure how often this laziness is really a win in practice,
> especially after the expression evaluation has changed so much in
> recent releases. So it might be better to just project all the
> attributes eagerly, and then none of this would be a problem. If we
> still wanted to be lazy about attribute fetching, that should still be
> possible even if we did a kind of "logical" projection of the tuple so
> that the useless attributes would not be relevant. Regardless, that's
> outside the scope of the patch I'm currently working on.
> 
> What I'd like to do is copy just the attributes needed into a new
> virtual slot, leave the unneeded ones NULL, and then write it out to
> the tuplestore as a MinimalTuple. I just need to be sure to get the
> right attributes.
> 
> Regards,
>       Jeff Davis


Melanie and I tried this, had a installcheck passed patch. The way how
we verify it is composing a wide table with long unnecessary text
columns, then check the size it writes on every iteration.

Please check out the attachment, it's based on your 1204 version.

-- 
Adam Lee

diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index f509c8e8f5..fe4a520305 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -1291,6 +1291,68 @@ project_aggregates(AggState *aggstate)
        return NULL;
 }
 
+static bool
+find_aggregated_cols_walker(Node *node, Bitmapset **colnos)
+{
+       if (node == NULL)
+               return false;
+
+       if (IsA(node, Var))
+       {
+               Var                *var = (Var *) node;
+
+               *colnos = bms_add_member(*colnos, var->varattno);
+
+               return false;
+       }
+       return expression_tree_walker(node, find_aggregated_cols_walker,
+                                                                 (void *) 
colnos);
+}
+
+/*
+ * find_aggregated_cols
+ *       Construct a bitmapset of the column numbers of aggregated Vars
+ *       appearing in our targetlist and qual (HAVING clause)
+ */
+static Bitmapset *
+find_aggregated_cols(AggState *aggstate)
+{
+       Agg                *node = (Agg *) aggstate->ss.ps.plan;
+       Bitmapset  *colnos = NULL;
+       ListCell   *temp;
+
+       /*
+        * We only want the columns used by aggregations in the targetlist or 
qual
+        */
+       if (node->plan.targetlist != NULL)
+       {
+               foreach(temp, (List *) node->plan.targetlist)
+               {
+                       if (IsA(lfirst(temp), TargetEntry))
+                       {
+                               Node *node = (Node *)((TargetEntry 
*)lfirst(temp))->expr;
+                               if (IsA(node, Aggref) || IsA(node, 
GroupingFunc))
+                                       find_aggregated_cols_walker(node, 
&colnos);
+                       }
+               }
+       }
+
+       if (node->plan.qual != NULL)
+       {
+               foreach(temp, (List *) node->plan.qual)
+               {
+                       if (IsA(lfirst(temp), TargetEntry))
+                       {
+                               Node *node = (Node *)((TargetEntry 
*)lfirst(temp))->expr;
+                               if (IsA(node, Aggref) || IsA(node, 
GroupingFunc))
+                                       find_aggregated_cols_walker(node, 
&colnos);
+                       }
+               }
+       }
+
+       return colnos;
+}
+
 /*
  * find_unaggregated_cols
  *       Construct a bitmapset of the column numbers of un-aggregated Vars
@@ -1520,6 +1582,23 @@ find_hash_columns(AggState *aggstate)
                for (i = 0; i < perhash->numCols; i++)
                        colnos = bms_add_member(colnos, grpColIdx[i]);
 
+               /*
+                * Find the columns used by aggregations
+                *
+                * This is shared by the entire aggregation.
+                */
+               if (aggstate->aggregated_columns == NULL)
+                       aggstate->aggregated_columns = 
find_aggregated_cols(aggstate);
+
+               /*
+                * The necessary columns to spill are either group keys or used 
by
+                * aggregations
+                *
+                * This is the convenient place to calculate the necessary 
columns to
+                * spill, because the group keys are different per hash.
+                */
+               perhash->necessarySpillCols = bms_union(colnos, 
aggstate->aggregated_columns);
+
                /*
                 * First build mapping for columns directly hashed. These are 
the
                 * first, because they'll be accessed when computing hash 
values and
@@ -1861,6 +1940,23 @@ lookup_hash_entries(AggState *aggstate)
                                hash_spill_init(spill, 0, 
perhash->aggnode->numGroups,
                                                                
aggstate->hashentrysize);
 
+                       AggStatePerHash perhash = 
&aggstate->perhash[aggstate->current_set];
+                       for (int ttsno = 0; ttsno < slot->tts_nvalid; ttsno++)
+                       {
+                               /*
+                                * null the column out if it's unnecessary, the 
following
+                                * forming functions will shrink it.
+                                *
+                                * it must be a virtual tuple here, this 
function is only used
+                                * by the first round, tuples are from other 
node but not the
+                                * spilled files.
+                                *
+                                * note: ttsno is zero indexed, cols are one 
indexed.
+                                */
+                               if (!bms_is_member(ttsno+1, 
perhash->necessarySpillCols))
+                                       slot->tts_isnull[ttsno] = true;
+                       }
+
                        aggstate->hash_disk_used += hash_spill_tuple(spill, 0, 
slot, hash);
                }
        }
@@ -2623,7 +2719,15 @@ hash_spill_tuple(HashAggSpill *spill, int input_bits, 
TupleTableSlot *slot,
 
        Assert(spill->partitions != NULL);
 
-       /*TODO: project needed attributes only */
+       /*
+        * heap_form_minimal_tuple() if it's a virtual tuple,
+        * tts_minimal_get_minimal_tuple() if it's a minimal tuple, which is
+        * exactly what we want.
+        *
+        * when we spill the tuples from input, they are virtual tuples with 
some
+        * columns nulled out, when we re-spill the tuples from spilling files,
+        * they are minimal tuples which was already nulled out before.
+        */
        tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);
 
        if (spill->partition_bits == 0)
@@ -3072,6 +3176,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
         */
        aggstate->phases = palloc0(numPhases * sizeof(AggStatePerPhaseData));
 
+       aggstate->aggregated_columns = NULL;
        aggstate->num_hashes = numHashes;
        if (numHashes)
        {
diff --git a/src/include/executor/nodeAgg.h b/src/include/executor/nodeAgg.h
index 68c9e5f540..3b61109b52 100644
--- a/src/include/executor/nodeAgg.h
+++ b/src/include/executor/nodeAgg.h
@@ -302,6 +302,7 @@ typedef struct AggStatePerHashData
        AttrNumber *hashGrpColIdxInput; /* hash col indices in input slot */
        AttrNumber *hashGrpColIdxHash;  /* indices in hash table tuples */
        Agg                *aggnode;            /* original Agg node, for 
numGroups etc. */
+       Bitmapset  *necessarySpillCols;  /* the necessary columns if spills */
 }                      AggStatePerHashData;
 
 
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index b9803a28bd..0c034b5f67 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -2084,6 +2084,7 @@ typedef struct AggState
        uint64          hash_disk_used; /* bytes of disk space used */
        int                     hash_batches_used;      /* batches used during 
entire execution */
        List       *hash_batches;       /* hash batches remaining to be 
processed */
+       Bitmapset  *aggregated_columns; /* the columns used by aggregations */
 
        AggStatePerHash perhash;        /* array of per-hashtable data */
        AggStatePerGroup *hash_pergroup;        /* grouping set indexed array of

Re: Memory-Bounded Hash Aggregation

Reply via email to