Hi. It seems to me that some of the code in partition.c would be better placed somewhere under the executor directory. There was even a suggestion recently [1] to introduce an execPartition.c to house some of the code around tuple-routing.
IMO, catalog/partition.c should present an interface for handling operations on a *single* partitioned table and avoid pretending to support any operations on the whole partition tree. For example, the PartitionDispatch structure embeds some knowledge about the partition tree it's part of, which is useful for tuple-routing because of the way it works now (lock and form ResultRelInfos of *all* leaf partitions before the first input row is processed). So, let's move that structure, along with the code that creates and manipulates it, out of partition.c/h and into execPartition.c/h. The attached patch attempts to do that. While doing so, I didn't move *all* of get_partition_for_tuple() out to execPartition.c; instead, I modified its signature as shown below: -extern int get_partition_for_tuple(PartitionDispatch *pd, - TupleTableSlot *slot, - EState *estate, - PartitionDispatchData **failed_at, - TupleTableSlot **failed_slot); +extern int get_partition_for_tuple(Relation relation, Datum *values, + bool *isnull); That way, we keep the core partition bound comparison logic inside partition.c and move the rest of the logic to its caller, ExecFindPartition(), including the navigation of the enclosing PartitionDispatch objects. Thoughts? PS: 0001 of the attached is the patch from [2], which needs to be applied on HEAD before applying the main patch (0002) itself. Thanks, Amit [1] https://www.postgresql.org/message-id/CA%2BTgmoafr%3DhUrM%3Dcbx-k%3DBDHOF2OfXaw95HQSNAK4mHBwmSjtw%40mail.gmail.com [2] https://www.postgresql.org/message-id/7fe0007b-7ad1-a593-df11-ad05364ebce4%40lab.ntt.co.jp
From cb16fb6c89f60a8cf6b8f713ca9c831fe3824b2d Mon Sep 17 00:00:00 2001 From: amit <amitlangot...@gmail.com> Date: Fri, 8 Sep 2017 17:35:10 +0900 Subject: [PATCH 1/2] Make RelationGetPartitionDispatch expansion order depth-first This is so as it matches what the planner is doing with partitioning inheritance expansion. Matching with planner order helps because it helps ease matching the executor's per-partition objects with planner-created per-partition nodes. --- src/backend/catalog/partition.c | 242 ++++++++++++++++------------------------ 1 file changed, 99 insertions(+), 143 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 73eff17202..ddb46a80cb 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -147,6 +147,8 @@ static int32 partition_bound_cmp(PartitionKey key, static int partition_bound_bsearch(PartitionKey key, PartitionBoundInfo boundinfo, void *probe, bool probe_is_bound, bool *is_equal); +static void get_partition_dispatch_recurse(Relation rel, Relation parent, + List **pds, List **leaf_part_oids); /* * RelationBuildPartitionDesc @@ -1192,21 +1194,6 @@ get_partition_qual_relid(Oid relid) } /* - * Append OIDs of rel's partitions to the list 'partoids' and for each OID, - * append pointer rel to the list 'parents'. 
- */ -#define APPEND_REL_PARTITION_OIDS(rel, partoids, parents) \ - do\ - {\ - int i;\ - for (i = 0; i < (rel)->rd_partdesc->nparts; i++)\ - {\ - (partoids) = lappend_oid((partoids), (rel)->rd_partdesc->oids[i]);\ - (parents) = lappend((parents), (rel));\ - }\ - } while(0) - -/* * RelationGetPartitionDispatchInfo * Returns information necessary to route tuples down a partition tree * @@ -1222,151 +1209,120 @@ PartitionDispatch * RelationGetPartitionDispatchInfo(Relation rel, int *num_parted, List **leaf_part_oids) { + List *pdlist; PartitionDispatchData **pd; - List *all_parts = NIL, - *all_parents = NIL, - *parted_rels, - *parted_rel_parents; - ListCell *lc1, - *lc2; - int i, - k, - offset; + ListCell *lc; + int i; - /* - * We rely on the relcache to traverse the partition tree to build both - * the leaf partition OIDs list and the array of PartitionDispatch objects - * for the partitioned tables in the tree. That means every partitioned - * table in the tree must be locked, which is fine since we require the - * caller to lock all the partitions anyway. - * - * For every partitioned table in the tree, starting with the root - * partitioned table, add its relcache entry to parted_rels, while also - * queuing its partitions (in the order in which they appear in the - * partition descriptor) to be looked at later in the same loop. This is - * a bit tricky but works because the foreach() macro doesn't fetch the - * next list element until the bottom of the loop. 
- */ - *num_parted = 1; - parted_rels = list_make1(rel); - /* Root partitioned table has no parent, so NULL for parent */ - parted_rel_parents = list_make1(NULL); - APPEND_REL_PARTITION_OIDS(rel, all_parts, all_parents); - forboth(lc1, all_parts, lc2, all_parents) + Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); + + *num_parted = 0; + *leaf_part_oids = NIL; + + get_partition_dispatch_recurse(rel, NULL, &pdlist, leaf_part_oids); + *num_parted = list_length(pdlist); + pd = (PartitionDispatchData **) palloc(*num_parted * + sizeof(PartitionDispatchData *)); + i = 0; + foreach (lc, pdlist) { - Oid partrelid = lfirst_oid(lc1); - Relation parent = lfirst(lc2); + pd[i++] = lfirst(lc); + } - if (get_rel_relkind(partrelid) == RELKIND_PARTITIONED_TABLE) - { - /* - * Already locked by the caller. Note that it is the - * responsibility of the caller to close the below relcache entry, - * once done using the information being collected here (for - * example, in ExecEndModifyTable). - */ - Relation partrel = heap_open(partrelid, NoLock); + return pd; +} - (*num_parted)++; - parted_rels = lappend(parted_rels, partrel); - parted_rel_parents = lappend(parted_rel_parents, parent); - APPEND_REL_PARTITION_OIDS(partrel, all_parts, all_parents); - } +/* + * get_partition_dispatch_recurse + * Recursively expand partition tree rooted at rel + * + * As the partition tree is expanded in a depth-first manner, we mantain two + * global lists: of PartitionDispatch objects corresponding to partitioned + * tables in *pds and of the leaf partition OIDs in *leaf_part_oids. + */ +static void +get_partition_dispatch_recurse(Relation rel, Relation parent, + List **pds, List **leaf_part_oids) +{ + TupleDesc tupdesc = RelationGetDescr(rel); + PartitionDesc partdesc = RelationGetPartitionDesc(rel); + PartitionKey partkey = RelationGetPartitionKey(rel); + PartitionDispatch pd; + int i; + + /* Build a PartitionDispatch for this table and add it to *pds. 
*/ + pd = (PartitionDispatch) palloc(sizeof(PartitionDispatchData)); + *pds = lappend(*pds, pd); + pd->reldesc = rel; + pd->key = partkey; + pd->keystate = NIL; + pd->partdesc = partdesc; + if (parent != NULL) + { + /* + * For every partitioned table other than root, we must store a + * tuple table slot initialized with its tuple descriptor and a + * tuple conversion map to convert a tuple from its parent's + * rowtype to its own. That is to make sure that we are looking at + * the correct row using the correct tuple descriptor when + * computing its partition key for tuple routing. + */ + pd->tupslot = MakeSingleTupleTableSlot(tupdesc); + pd->tupmap = convert_tuples_by_name(RelationGetDescr(parent), + tupdesc, + gettext_noop("could not convert row type")); + } + else + { + /* Not required for the root partitioned table */ + pd->tupslot = NULL; + pd->tupmap = NULL; } /* - * We want to create two arrays - one for leaf partitions and another for - * partitioned tables (including the root table and internal partitions). - * While we only create the latter here, leaf partition array of suitable - * objects (such as, ResultRelInfo) is created by the caller using the - * list of OIDs we return. Indexes into these arrays get assigned in a - * breadth-first manner, whereby partitions of any given level are placed - * consecutively in the respective arrays. + * Go look at each partition of this table. If it's a leaf partition, + * simply add its OID to *leaf_part_oids. If it's a partitioned table, + * recursively call get_partition_dispatch_recurse(), so that its + * partitions are processed as well and a corresponding PartitionDispatch + * object gets added to *pds. + * + * About the values in pd->indexes: for a leaf partition, it contains the + * leaf partition's position in the global list *leaf_part_oids minus 1, + * whereas for a partitioned table partition, it contains the partition's + * position in the global list *pds multiplied by -1. 
The latter is + * multiplied by -1 to distinguish partitioned tables from leaf partitions + * when going through the values in pd->indexes. So, for example, when + * using it during tuple-routing, encountering a value >= 0 means we found + * a leaf partition. It is immediately returned as the index in the array + * of ResultRelInfos of all the leaf partitions, using which we insert the + * tuple into that leaf partition. A negative value means we found a + * partitioned table. The value multiplied back by -1 is returned as the + * index in the array of PartitionDispatch objects of all partitioned + * tables in the tree, using which, search is continued further down the + * partition tree. */ - pd = (PartitionDispatchData **) palloc(*num_parted * - sizeof(PartitionDispatchData *)); - *leaf_part_oids = NIL; - i = k = offset = 0; - forboth(lc1, parted_rels, lc2, parted_rel_parents) + pd->indexes = (int *) palloc(partdesc->nparts * sizeof(int)); + for (i = 0; i < partdesc->nparts; i++) { - Relation partrel = lfirst(lc1); - Relation parent = lfirst(lc2); - PartitionKey partkey = RelationGetPartitionKey(partrel); - TupleDesc tupdesc = RelationGetDescr(partrel); - PartitionDesc partdesc = RelationGetPartitionDesc(partrel); - int j, - m; - - pd[i] = (PartitionDispatch) palloc(sizeof(PartitionDispatchData)); - pd[i]->reldesc = partrel; - pd[i]->key = partkey; - pd[i]->keystate = NIL; - pd[i]->partdesc = partdesc; - if (parent != NULL) + Oid partrelid = partdesc->oids[i]; + + if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE) { - /* - * For every partitioned table other than root, we must store a - * tuple table slot initialized with its tuple descriptor and a - * tuple conversion map to convert a tuple from its parent's - * rowtype to its own. That is to make sure that we are looking at - * the correct row using the correct tuple descriptor when - * computing its partition key for tuple routing. 
- */ - pd[i]->tupslot = MakeSingleTupleTableSlot(tupdesc); - pd[i]->tupmap = convert_tuples_by_name(RelationGetDescr(parent), - tupdesc, - gettext_noop("could not convert row type")); + *leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid); + pd->indexes[i] = list_length(*leaf_part_oids) - 1; } else { - /* Not required for the root partitioned table */ - pd[i]->tupslot = NULL; - pd[i]->tupmap = NULL; - } - pd[i]->indexes = (int *) palloc(partdesc->nparts * sizeof(int)); - - /* - * Indexes corresponding to the internal partitions are multiplied by - * -1 to distinguish them from those of leaf partitions. Encountering - * an index >= 0 means we found a leaf partition, which is immediately - * returned as the partition we are looking for. A negative index - * means we found a partitioned table, whose PartitionDispatch object - * is located at the above index multiplied back by -1. Using the - * PartitionDispatch object, search is continued further down the - * partition tree. - */ - m = 0; - for (j = 0; j < partdesc->nparts; j++) - { - Oid partrelid = partdesc->oids[j]; + /* + * We assume all tables in the partition tree were already + * locked by the caller. + */ + Relation partrel = heap_open(partrelid, NoLock); - if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE) - { - *leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid); - pd[i]->indexes[j] = k++; - } - else - { - /* - * offset denotes the number of partitioned tables of upper - * levels including those of the current level. Any partition - * of this table must belong to the next level and hence will - * be placed after the last partitioned table of this level. - */ - pd[i]->indexes[j] = -(1 + offset + m); - m++; - } + pd->indexes[i] = -list_length(*pds); + get_partition_dispatch_recurse(partrel, rel, pds, leaf_part_oids); } - i++; - - /* - * This counts the number of partitioned tables at upper levels - * including those of the current level. 
- */ - offset += m; } - - return pd; } /* Module-local functions */ -- 2.11.0
From a66ca8b852a8d06c56c4a90a3f654be88ec95a0b Mon Sep 17 00:00:00 2001 From: amit <amitlangot...@gmail.com> Date: Fri, 8 Sep 2017 19:07:38 +0900 Subject: [PATCH 2/2] Move certain partitioning code to the executor --- src/backend/catalog/partition.c | 428 +++++-------------------- src/backend/commands/copy.c | 1 + src/backend/executor/Makefile | 2 +- src/backend/executor/execMain.c | 265 +--------------- src/backend/executor/execPartition.c | 549 +++++++++++++++++++++++++++++++++ src/backend/executor/nodeModifyTable.c | 1 + src/include/catalog/partition.h | 48 +-- src/include/executor/execPartition.h | 65 ++++ src/include/executor/executor.h | 14 +- 9 files changed, 698 insertions(+), 675 deletions(-) create mode 100644 src/backend/executor/execPartition.c create mode 100644 src/include/executor/execPartition.h diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index ddb46a80cb..903c8c4def 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -147,8 +147,6 @@ static int32 partition_bound_cmp(PartitionKey key, static int partition_bound_bsearch(PartitionKey key, PartitionBoundInfo boundinfo, void *probe, bool probe_is_bound, bool *is_equal); -static void get_partition_dispatch_recurse(Relation rel, Relation parent, - List **pds, List **leaf_part_oids); /* * RelationBuildPartitionDesc @@ -1193,138 +1191,6 @@ get_partition_qual_relid(Oid relid) return result; } -/* - * RelationGetPartitionDispatchInfo - * Returns information necessary to route tuples down a partition tree - * - * The number of elements in the returned array (that is, the number of - * PartitionDispatch objects for the partitioned tables in the partition tree) - * is returned in *num_parted and a list of the OIDs of all the leaf - * partitions of rel is returned in *leaf_part_oids. - * - * All the relations in the partition tree (including 'rel') must have been - * locked (using at least the AccessShareLock) by the caller. 
- */ -PartitionDispatch * -RelationGetPartitionDispatchInfo(Relation rel, - int *num_parted, List **leaf_part_oids) -{ - List *pdlist; - PartitionDispatchData **pd; - ListCell *lc; - int i; - - Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); - - *num_parted = 0; - *leaf_part_oids = NIL; - - get_partition_dispatch_recurse(rel, NULL, &pdlist, leaf_part_oids); - *num_parted = list_length(pdlist); - pd = (PartitionDispatchData **) palloc(*num_parted * - sizeof(PartitionDispatchData *)); - i = 0; - foreach (lc, pdlist) - { - pd[i++] = lfirst(lc); - } - - return pd; -} - -/* - * get_partition_dispatch_recurse - * Recursively expand partition tree rooted at rel - * - * As the partition tree is expanded in a depth-first manner, we mantain two - * global lists: of PartitionDispatch objects corresponding to partitioned - * tables in *pds and of the leaf partition OIDs in *leaf_part_oids. - */ -static void -get_partition_dispatch_recurse(Relation rel, Relation parent, - List **pds, List **leaf_part_oids) -{ - TupleDesc tupdesc = RelationGetDescr(rel); - PartitionDesc partdesc = RelationGetPartitionDesc(rel); - PartitionKey partkey = RelationGetPartitionKey(rel); - PartitionDispatch pd; - int i; - - /* Build a PartitionDispatch for this table and add it to *pds. */ - pd = (PartitionDispatch) palloc(sizeof(PartitionDispatchData)); - *pds = lappend(*pds, pd); - pd->reldesc = rel; - pd->key = partkey; - pd->keystate = NIL; - pd->partdesc = partdesc; - if (parent != NULL) - { - /* - * For every partitioned table other than root, we must store a - * tuple table slot initialized with its tuple descriptor and a - * tuple conversion map to convert a tuple from its parent's - * rowtype to its own. That is to make sure that we are looking at - * the correct row using the correct tuple descriptor when - * computing its partition key for tuple routing. 
- */ - pd->tupslot = MakeSingleTupleTableSlot(tupdesc); - pd->tupmap = convert_tuples_by_name(RelationGetDescr(parent), - tupdesc, - gettext_noop("could not convert row type")); - } - else - { - /* Not required for the root partitioned table */ - pd->tupslot = NULL; - pd->tupmap = NULL; - } - - /* - * Go look at each partition of this table. If it's a leaf partition, - * simply add its OID to *leaf_part_oids. If it's a partitioned table, - * recursively call get_partition_dispatch_recurse(), so that its - * partitions are processed as well and a corresponding PartitionDispatch - * object gets added to *pds. - * - * About the values in pd->indexes: for a leaf partition, it contains the - * leaf partition's position in the global list *leaf_part_oids minus 1, - * whereas for a partitioned table partition, it contains the partition's - * position in the global list *pds multiplied by -1. The latter is - * multiplied by -1 to distinguish partitioned tables from leaf partitions - * when going through the values in pd->indexes. So, for example, when - * using it during tuple-routing, encountering a value >= 0 means we found - * a leaf partition. It is immediately returned as the index in the array - * of ResultRelInfos of all the leaf partitions, using which we insert the - * tuple into that leaf partition. A negative value means we found a - * partitioned table. The value multiplied back by -1 is returned as the - * index in the array of PartitionDispatch objects of all partitioned - * tables in the tree, using which, search is continued further down the - * partition tree. 
- */ - pd->indexes = (int *) palloc(partdesc->nparts * sizeof(int)); - for (i = 0; i < partdesc->nparts; i++) - { - Oid partrelid = partdesc->oids[i]; - - if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE) - { - *leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid); - pd->indexes[i] = list_length(*leaf_part_oids) - 1; - } - else - { - /* - * We assume all tables in the partition tree were already - * locked by the caller. - */ - Relation partrel = heap_open(partrelid, NoLock); - - pd->indexes[i] = -list_length(*pds); - get_partition_dispatch_recurse(partrel, rel, pds, leaf_part_oids); - } - } -} - /* Module-local functions */ /* @@ -2184,248 +2050,98 @@ generate_partition_qual(Relation rel) return result; } -/* ---------------- - * FormPartitionKeyDatum - * Construct values[] and isnull[] arrays for the partition key - * of a tuple. - * - * pd Partition dispatch object of the partitioned table - * slot Heap tuple from which to extract partition key - * estate executor state for evaluating any partition key - * expressions (must be non-NULL) - * values Array of partition key Datums (output area) - * isnull Array of is-null indicators (output area) - * - * the ecxt_scantuple slot of estate's per-tuple expr context must point to - * the heap tuple passed in. 
- * ---------------- - */ -void -FormPartitionKeyDatum(PartitionDispatch pd, - TupleTableSlot *slot, - EState *estate, - Datum *values, - bool *isnull) -{ - ListCell *partexpr_item; - int i; - - if (pd->key->partexprs != NIL && pd->keystate == NIL) - { - /* Check caller has set up context correctly */ - Assert(estate != NULL && - GetPerTupleExprContext(estate)->ecxt_scantuple == slot); - - /* First time through, set up expression evaluation state */ - pd->keystate = ExecPrepareExprList(pd->key->partexprs, estate); - } - - partexpr_item = list_head(pd->keystate); - for (i = 0; i < pd->key->partnatts; i++) - { - AttrNumber keycol = pd->key->partattrs[i]; - Datum datum; - bool isNull; - - if (keycol != 0) - { - /* Plain column; get the value directly from the heap tuple */ - datum = slot_getattr(slot, keycol, &isNull); - } - else - { - /* Expression; need to evaluate it */ - if (partexpr_item == NULL) - elog(ERROR, "wrong number of partition key expressions"); - datum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item), - GetPerTupleExprContext(estate), - &isNull); - partexpr_item = lnext(partexpr_item); - } - values[i] = datum; - isnull[i] = isNull; - } - - if (partexpr_item != NULL) - elog(ERROR, "wrong number of partition key expressions"); -} - /* * get_partition_for_tuple - * Finds a leaf partition for tuple contained in *slot + * Finds partition of relation which accepts the partition key specified + * in values and isnull * - * Returned value is the sequence number of the leaf partition thus found, - * or -1 if no leaf partition is found for the tuple. *failed_at is set - * to the OID of the partitioned table whose partition was not found in - * the latter case. + * Return value is index of the partition (>= 0 and < partdesc->nparts) if one + * found or -1 if none found. 
*/ int -get_partition_for_tuple(PartitionDispatch *pd, - TupleTableSlot *slot, - EState *estate, - PartitionDispatchData **failed_at, - TupleTableSlot **failed_slot) +get_partition_for_tuple(Relation relation, Datum *values, bool *isnull) { - PartitionDispatch parent; - Datum values[PARTITION_MAX_KEYS]; - bool isnull[PARTITION_MAX_KEYS]; - int result; - ExprContext *ecxt = GetPerTupleExprContext(estate); - TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple; - - /* start with the root partitioned table */ - parent = pd[0]; - while (true) - { - PartitionKey key = parent->key; - PartitionDesc partdesc = parent->partdesc; - TupleTableSlot *myslot = parent->tupslot; - TupleConversionMap *map = parent->tupmap; - int cur_index = -1; - - if (myslot != NULL && map != NULL) - { - HeapTuple tuple = ExecFetchSlotTuple(slot); + int bound_offset; + int part_index = -1; + PartitionKey key = RelationGetPartitionKey(relation); + PartitionDesc partdesc = RelationGetPartitionDesc(relation); - ExecClearTuple(myslot); - tuple = do_convert_tuple(tuple, map); - ExecStoreTuple(tuple, myslot, InvalidBuffer, true); - slot = myslot; - } - - /* Quick exit */ - if (partdesc->nparts == 0) - { - *failed_at = parent; - *failed_slot = slot; - result = -1; - goto error_exit; - } - - /* - * Extract partition key from tuple. Expression evaluation machinery - * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to - * point to the correct tuple slot. The slot might have changed from - * what was used for the parent table if the table of the current - * partitioning level has different tuple descriptor from the parent. - * So update ecxt_scantuple accordingly. - */ - ecxt->ecxt_scantuple = slot; - FormPartitionKeyDatum(parent, slot, estate, values, isnull); - - /* Route as appropriate based on partitioning strategy. */ - switch (key->strategy) - { - case PARTITION_STRATEGY_LIST: + /* Route as appropriate based on partitioning strategy. 
*/ + switch (key->strategy) + { + case PARTITION_STRATEGY_LIST: + if (isnull[0]) + { + if (partition_bound_accepts_nulls(partdesc->boundinfo)) + part_index = partdesc->boundinfo->null_index; + } + else + { + bool equal = false; + + bound_offset = partition_bound_bsearch(key, + partdesc->boundinfo, + values, + false, + &equal); + if (bound_offset >= 0 && equal) + part_index = partdesc->boundinfo->indexes[bound_offset]; + } + break; - if (isnull[0]) - { - if (partition_bound_accepts_nulls(partdesc->boundinfo)) - cur_index = partdesc->boundinfo->null_index; - } - else - { - bool equal = false; - int cur_offset; - - cur_offset = partition_bound_bsearch(key, - partdesc->boundinfo, - values, - false, - &equal); - if (cur_offset >= 0 && equal) - cur_index = partdesc->boundinfo->indexes[cur_offset]; - } - break; + case PARTITION_STRATEGY_RANGE: + { + bool equal = false, + range_partkey_has_null = false; + int i; - case PARTITION_STRATEGY_RANGE: + /* + * No range includes NULL, so this will be accepted by the + * default partition if there is one, and otherwise + * rejected. + */ + for (i = 0; i < key->partnatts; i++) { - bool equal = false, - range_partkey_has_null = false; - int cur_offset; - int i; - - /* - * No range includes NULL, so this will be accepted by the - * default partition if there is one, and otherwise - * rejected. - */ - for (i = 0; i < key->partnatts; i++) + if (isnull[i] && + partition_bound_has_default(partdesc->boundinfo)) { - if (isnull[i] && - partition_bound_has_default(partdesc->boundinfo)) - { - range_partkey_has_null = true; - break; - } - else if (isnull[i]) - { - *failed_at = parent; - *failed_slot = slot; - result = -1; - goto error_exit; - } + range_partkey_has_null = true; + part_index = partdesc->boundinfo->default_index; } + } - /* - * No need to search for partition, as the null key will - * be routed to the default partition. 
- */ - if (range_partkey_has_null) - break; - - cur_offset = partition_bound_bsearch(key, - partdesc->boundinfo, - values, - false, - &equal); + if (!range_partkey_has_null) + { + bound_offset = partition_bound_bsearch(key, + partdesc->boundinfo, + values, + false, + &equal); /* - * The offset returned is such that the bound at - * cur_offset is less than or equal to the tuple value, so - * the bound at offset+1 is the upper bound. + * The bound at bound_offset is less than or equal to the + * tuple value, so the bound at offset+1 is the upper + * bound of the partition we're looking for, if there + * actually exists one. */ - cur_index = partdesc->boundinfo->indexes[cur_offset + 1]; + part_index = partdesc->boundinfo->indexes[bound_offset + 1]; } - break; - - default: - elog(ERROR, "unexpected partition strategy: %d", - (int) key->strategy); - } - - /* - * cur_index < 0 means we failed to find a partition of this parent. - * Use the default partition, if there is one. - */ - if (cur_index < 0) - cur_index = partdesc->boundinfo->default_index; - - /* - * If cur_index is still less than 0 at this point, there's no - * partition for this tuple. Otherwise, we either found the leaf - * partition, or a child partitioned table through which we have to - * route the tuple. - */ - if (cur_index < 0) - { - result = -1; - *failed_at = parent; - *failed_slot = slot; - break; - } - else if (parent->indexes[cur_index] >= 0) - { - result = parent->indexes[cur_index]; + } break; - } - else - parent = pd[-parent->indexes[cur_index]]; + + default: + elog(ERROR, "unexpected partition strategy: %d", + (int) key->strategy); } -error_exit: - ecxt->ecxt_scantuple = ecxt_scantuple_old; - return result; + /* + * part_index < 0 means we failed to find a partition of this parent. + * Use the default partition, if there is one. 
+ */ + if (part_index < 0) + part_index = partdesc->boundinfo->default_index; + + return part_index; } /* diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index cfa3f059c2..7c331bd519 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -29,6 +29,7 @@ #include "commands/copy.h" #include "commands/defrem.h" #include "commands/trigger.h" +#include "executor/execPartition.h" #include "executor/executor.h" #include "libpq/libpq.h" #include "libpq/pqformat.h" diff --git a/src/backend/executor/Makefile b/src/backend/executor/Makefile index 083b20f3fe..cc09895fa5 100644 --- a/src/backend/executor/Makefile +++ b/src/backend/executor/Makefile @@ -14,7 +14,7 @@ include $(top_builddir)/src/Makefile.global OBJS = execAmi.o execCurrent.o execExpr.o execExprInterp.o \ execGrouping.o execIndexing.o execJunk.o \ - execMain.o execParallel.o execProcnode.o \ + execMain.o execParallel.o execPartition.o execProcnode.o \ execReplication.o execScan.o execSRF.o execTuples.o \ execUtils.o functions.o instrument.o nodeAppend.o nodeAgg.o \ nodeBitmapAnd.o nodeBitmapOr.o \ diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 4b594d489c..dc9f1c1322 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -43,7 +43,6 @@ #include "access/xact.h" #include "catalog/namespace.h" #include "catalog/partition.h" -#include "catalog/pg_inherits_fn.h" #include "catalog/pg_publication.h" #include "commands/matview.h" #include "commands/trigger.h" @@ -98,14 +97,8 @@ static char *ExecBuildSlotValueDescription(Oid reloid, TupleDesc tupdesc, Bitmapset *modifiedCols, int maxfieldlen); -static char *ExecBuildSlotPartitionKeyDescription(Relation rel, - Datum *values, - bool *isnull, - int maxfieldlen); static void EvalPlanQualStart(EPQState *epqstate, EState *parentestate, Plan *planTree); -static void ExecPartitionCheck(ResultRelInfo *resultRelInfo, - TupleTableSlot *slot, EState *estate); /* * Note that 
GetUpdatedColumns() also exists in commands/trigger.c. There does @@ -1849,8 +1842,10 @@ ExecRelCheck(ResultRelInfo *resultRelInfo, /* * ExecPartitionCheck --- check that tuple meets the partition constraint. + * + * Exported in executor.h for outside use. */ -static void +void ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate) { @@ -3237,257 +3232,3 @@ EvalPlanQualEnd(EPQState *epqstate) epqstate->planstate = NULL; epqstate->origslot = NULL; } - -/* - * ExecSetupPartitionTupleRouting - set up information needed during - * tuple routing for partitioned tables - * - * Output arguments: - * 'pd' receives an array of PartitionDispatch objects with one entry for - * every partitioned table in the partition tree - * 'partitions' receives an array of ResultRelInfo objects with one entry for - * every leaf partition in the partition tree - * 'tup_conv_maps' receives an array of TupleConversionMap objects with one - * entry for every leaf partition (required to convert input tuple based - * on the root table's rowtype to a leaf partition's rowtype after tuple - * routing is done) - * 'partition_tuple_slot' receives a standalone TupleTableSlot to be used - * to manipulate any given leaf partition's rowtype after that partition - * is chosen by tuple-routing. - * 'num_parted' receives the number of partitioned tables in the partition - * tree (= the number of entries in the 'pd' output array) - * 'num_partitions' receives the number of leaf partitions in the partition - * tree (= the number of entries in the 'partitions' and 'tup_conv_maps' - * output arrays - * - * Note that all the relations in the partition tree are locked using the - * RowExclusiveLock mode upon return from this function. 
- */ -void -ExecSetupPartitionTupleRouting(Relation rel, - Index resultRTindex, - EState *estate, - PartitionDispatch **pd, - ResultRelInfo **partitions, - TupleConversionMap ***tup_conv_maps, - TupleTableSlot **partition_tuple_slot, - int *num_parted, int *num_partitions) -{ - TupleDesc tupDesc = RelationGetDescr(rel); - List *leaf_parts; - ListCell *cell; - int i; - ResultRelInfo *leaf_part_rri; - - /* - * Get the information about the partition tree after locking all the - * partitions. - */ - (void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, NULL); - *pd = RelationGetPartitionDispatchInfo(rel, num_parted, &leaf_parts); - *num_partitions = list_length(leaf_parts); - *partitions = (ResultRelInfo *) palloc(*num_partitions * - sizeof(ResultRelInfo)); - *tup_conv_maps = (TupleConversionMap **) palloc0(*num_partitions * - sizeof(TupleConversionMap *)); - - /* - * Initialize an empty slot that will be used to manipulate tuples of any - * given partition's rowtype. It is attached to the caller-specified node - * (such as ModifyTableState) and released when the node finishes - * processing. - */ - *partition_tuple_slot = MakeTupleTableSlot(); - - leaf_part_rri = *partitions; - i = 0; - foreach(cell, leaf_parts) - { - Relation partrel; - TupleDesc part_tupdesc; - - /* - * We locked all the partitions above including the leaf partitions. - * Note that each of the relations in *partitions are eventually - * closed by the caller. - */ - partrel = heap_open(lfirst_oid(cell), NoLock); - part_tupdesc = RelationGetDescr(partrel); - - /* - * Save a tuple conversion map to convert a tuple routed to this - * partition from the parent's type to the partition's. - */ - (*tup_conv_maps)[i] = convert_tuples_by_name(tupDesc, part_tupdesc, - gettext_noop("could not convert row type")); - - InitResultRelInfo(leaf_part_rri, - partrel, - resultRTindex, - rel, - estate->es_instrument); - - /* - * Verify result relation is a valid target for INSERT. 
- */ - CheckValidResultRel(leaf_part_rri, CMD_INSERT); - - /* - * Open partition indices (remember we do not support ON CONFLICT in - * case of partitioned tables, so we do not need support information - * for speculative insertion) - */ - if (leaf_part_rri->ri_RelationDesc->rd_rel->relhasindex && - leaf_part_rri->ri_IndexRelationDescs == NULL) - ExecOpenIndices(leaf_part_rri, false); - - estate->es_leaf_result_relations = - lappend(estate->es_leaf_result_relations, leaf_part_rri); - - leaf_part_rri++; - i++; - } -} - -/* - * ExecFindPartition -- Find a leaf partition in the partition tree rooted - * at parent, for the heap tuple contained in *slot - * - * estate must be non-NULL; we'll need it to compute any expressions in the - * partition key(s) - * - * If no leaf partition is found, this routine errors out with the appropriate - * error message, else it returns the leaf partition sequence number returned - * by get_partition_for_tuple() unchanged. - */ -int -ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, - TupleTableSlot *slot, EState *estate) -{ - int result; - PartitionDispatchData *failed_at; - TupleTableSlot *failed_slot; - - /* - * First check the root table's partition constraint, if any. No point in - * routing the tuple if it doesn't belong in the root table itself. 
- */ - if (resultRelInfo->ri_PartitionCheck) - ExecPartitionCheck(resultRelInfo, slot, estate); - - result = get_partition_for_tuple(pd, slot, estate, - &failed_at, &failed_slot); - if (result < 0) - { - Relation failed_rel; - Datum key_values[PARTITION_MAX_KEYS]; - bool key_isnull[PARTITION_MAX_KEYS]; - char *val_desc; - ExprContext *ecxt = GetPerTupleExprContext(estate); - - failed_rel = failed_at->reldesc; - ecxt->ecxt_scantuple = failed_slot; - FormPartitionKeyDatum(failed_at, failed_slot, estate, - key_values, key_isnull); - val_desc = ExecBuildSlotPartitionKeyDescription(failed_rel, - key_values, - key_isnull, - 64); - Assert(OidIsValid(RelationGetRelid(failed_rel))); - ereport(ERROR, - (errcode(ERRCODE_CHECK_VIOLATION), - errmsg("no partition of relation \"%s\" found for row", - RelationGetRelationName(failed_rel)), - val_desc ? errdetail("Partition key of the failing row contains %s.", val_desc) : 0)); - } - - return result; -} - -/* - * BuildSlotPartitionKeyDescription - * - * This works very much like BuildIndexValueDescription() and is currently - * used for building error messages when ExecFindPartition() fails to find - * partition for a row. - */ -static char * -ExecBuildSlotPartitionKeyDescription(Relation rel, - Datum *values, - bool *isnull, - int maxfieldlen) -{ - StringInfoData buf; - PartitionKey key = RelationGetPartitionKey(rel); - int partnatts = get_partition_natts(key); - int i; - Oid relid = RelationGetRelid(rel); - AclResult aclresult; - - if (check_enable_rls(relid, InvalidOid, true) == RLS_ENABLED) - return NULL; - - /* If the user has table-level access, just go build the description. */ - aclresult = pg_class_aclcheck(relid, GetUserId(), ACL_SELECT); - if (aclresult != ACLCHECK_OK) - { - /* - * Step through the columns of the partition key and make sure the - * user has SELECT rights on all of them. 
- */ - for (i = 0; i < partnatts; i++) - { - AttrNumber attnum = get_partition_col_attnum(key, i); - - /* - * If this partition key column is an expression, we return no - * detail rather than try to figure out what column(s) the - * expression includes and if the user has SELECT rights on them. - */ - if (attnum == InvalidAttrNumber || - pg_attribute_aclcheck(relid, attnum, GetUserId(), - ACL_SELECT) != ACLCHECK_OK) - return NULL; - } - } - - initStringInfo(&buf); - appendStringInfo(&buf, "(%s) = (", - pg_get_partkeydef_columns(relid, true)); - - for (i = 0; i < partnatts; i++) - { - char *val; - int vallen; - - if (isnull[i]) - val = "null"; - else - { - Oid foutoid; - bool typisvarlena; - - getTypeOutputInfo(get_partition_col_typid(key, i), - &foutoid, &typisvarlena); - val = OidOutputFunctionCall(foutoid, values[i]); - } - - if (i > 0) - appendStringInfoString(&buf, ", "); - - /* truncate if needed */ - vallen = strlen(val); - if (vallen <= maxfieldlen) - appendStringInfoString(&buf, val); - else - { - vallen = pg_mbcliplen(val, vallen, maxfieldlen); - appendBinaryStringInfo(&buf, val, vallen); - appendStringInfoString(&buf, "..."); - } - } - - appendStringInfoChar(&buf, ')'); - - return buf.data; -} diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c new file mode 100644 index 0000000000..53a8a0f62c --- /dev/null +++ b/src/backend/executor/execPartition.c @@ -0,0 +1,549 @@ +/*------------------------------------------------------------------------- + * + * execPartition.c + * Support routines for partitioning. 
+ * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/execPartition.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "catalog/pg_inherits_fn.h" +#include "executor/execPartition.h" +#include "executor/executor.h" +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "utils/lsyscache.h" +#include "utils/rls.h" +#include "utils/ruleutils.h" + +static PartitionDispatch *RelationGetPartitionDispatchInfo(Relation rel, + int *num_parted, List **leaf_part_oids); +static void get_partition_dispatch_recurse(Relation rel, Relation parent, + List **pds, List **leaf_part_oids); +static void FormPartitionKeyDatum(PartitionDispatch pd, + TupleTableSlot *slot, + EState *estate, + Datum *values, + bool *isnull); +static char *ExecBuildSlotPartitionKeyDescription(Relation rel, + Datum *values, + bool *isnull, + int maxfieldlen); + +/* + * ExecSetupPartitionTupleRouting - set up information needed during + * tuple routing for partitioned tables + * + * Output arguments: + * 'pd' receives an array of PartitionDispatch objects with one entry for + * every partitioned table in the partition tree + * 'partitions' receives an array of ResultRelInfo objects with one entry for + * every leaf partition in the partition tree + * 'tup_conv_maps' receives an array of TupleConversionMap objects with one + * entry for every leaf partition (required to convert input tuple based + * on the root table's rowtype to a leaf partition's rowtype after tuple + * routing is done) + * 'partition_tuple_slot' receives a standalone TupleTableSlot to be used + * to manipulate any given leaf partition's rowtype after that partition + * is chosen by tuple-routing. 
+ * 'num_parted' receives the number of partitioned tables in the partition + * tree (= the number of entries in the 'pd' output array) + * 'num_partitions' receives the number of leaf partitions in the partition + * tree (= the number of entries in the 'partitions' and 'tup_conv_maps' + * output arrays + * + * Note that all the relations in the partition tree are locked using the + * RowExclusiveLock mode upon return from this function. + */ +void +ExecSetupPartitionTupleRouting(Relation rel, + Index resultRTindex, + EState *estate, + PartitionDispatch **pd, + ResultRelInfo **partitions, + TupleConversionMap ***tup_conv_maps, + TupleTableSlot **partition_tuple_slot, + int *num_parted, int *num_partitions) +{ + TupleDesc tupDesc = RelationGetDescr(rel); + List *leaf_parts; + ListCell *cell; + int i; + ResultRelInfo *leaf_part_rri; + + /* + * Get the information about the partition tree after locking all the + * partitions. + */ + (void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, NULL); + *pd = RelationGetPartitionDispatchInfo(rel, num_parted, &leaf_parts); + *num_partitions = list_length(leaf_parts); + *partitions = (ResultRelInfo *) palloc(*num_partitions * + sizeof(ResultRelInfo)); + *tup_conv_maps = (TupleConversionMap **) palloc0(*num_partitions * + sizeof(TupleConversionMap *)); + + /* + * Initialize an empty slot that will be used to manipulate tuples of any + * given partition's rowtype. It is attached to the caller-specified node + * (such as ModifyTableState) and released when the node finishes + * processing. + */ + *partition_tuple_slot = MakeTupleTableSlot(); + + leaf_part_rri = *partitions; + i = 0; + foreach(cell, leaf_parts) + { + Relation partrel; + TupleDesc part_tupdesc; + + /* + * We locked all the partitions above including the leaf partitions. + * Note that each of the relations in *partitions are eventually + * closed by the caller. 
+ */ + partrel = heap_open(lfirst_oid(cell), NoLock); + part_tupdesc = RelationGetDescr(partrel); + + /* + * Save a tuple conversion map to convert a tuple routed to this + * partition from the parent's type to the partition's. + */ + (*tup_conv_maps)[i] = convert_tuples_by_name(tupDesc, part_tupdesc, + gettext_noop("could not convert row type")); + + InitResultRelInfo(leaf_part_rri, + partrel, + resultRTindex, + rel, + estate->es_instrument); + + /* + * Verify result relation is a valid target for INSERT. + */ + CheckValidResultRel(leaf_part_rri, CMD_INSERT); + + /* + * Open partition indices (remember we do not support ON CONFLICT in + * case of partitioned tables, so we do not need support information + * for speculative insertion) + */ + if (leaf_part_rri->ri_RelationDesc->rd_rel->relhasindex && + leaf_part_rri->ri_IndexRelationDescs == NULL) + ExecOpenIndices(leaf_part_rri, false); + + estate->es_leaf_result_relations = + lappend(estate->es_leaf_result_relations, leaf_part_rri); + + leaf_part_rri++; + i++; + } +} + +/* + * ExecFindPartition -- Find a leaf partition in the partition tree rooted + * at parent, for the heap tuple contained in *slot + * + * estate must be non-NULL; we'll need it to compute any expressions in the + * partition key(s) + * + * If no leaf partition is found, this routine errors out with the appropriate + * error message, else it returns the leaf partition sequence number to be + * used as an index into the array of (ResultRelInfos of) all leaf partitions + * in the partition tree. + */ +int +ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, + TupleTableSlot *slot, EState *estate) +{ + int result; + Datum values[PARTITION_MAX_KEYS]; + bool isnull[PARTITION_MAX_KEYS]; + Relation rel; + PartitionDispatch parent; + ExprContext *ecxt = GetPerTupleExprContext(estate); + TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple; + + /* + * First check the root table's partition constraint, if any. 
No point in + * routing the tuple if it doesn't belong in the root table itself. + */ + if (resultRelInfo->ri_PartitionCheck) + ExecPartitionCheck(resultRelInfo, slot, estate); + + /* start with the root partitioned table */ + parent = pd[0]; + while (true) + { + PartitionDesc partdesc; + TupleTableSlot *myslot = parent->tupslot; + TupleConversionMap *map = parent->tupmap; + int cur_index = -1; + + rel = parent->reldesc; + partdesc = RelationGetPartitionDesc(rel); + + /* + * Convert the tuple to this parent's layout so that we can do certain + * things we do below. + */ + if (myslot != NULL && map != NULL) + { + HeapTuple tuple = ExecFetchSlotTuple(slot); + + ExecClearTuple(myslot); + tuple = do_convert_tuple(tuple, map); + ExecStoreTuple(tuple, myslot, InvalidBuffer, true); + slot = myslot; + } + + /* Quick exit */ + if (partdesc->nparts == 0) + { + result = -1; + break; + } + + /* + * Extract partition key from tuple. Expression evaluation machinery + * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to + * point to the correct tuple slot. The slot might have changed from + * what was used for the parent table if the table of the current + * partitioning level has different tuple descriptor from the parent. + * So update ecxt_scantuple accordingly. + */ + ecxt->ecxt_scantuple = slot; + FormPartitionKeyDatum(parent, slot, estate, values, isnull); + cur_index = get_partition_for_tuple(rel, values, isnull); + + /* + * cur_index < 0 means we failed to find a partition of this parent. + * cur_index >= 0 means we either found the leaf partition, or the + * next parent to find a partition of. + */ + if (cur_index < 0) + { + result = -1; + break; + } + else if (parent->indexes[cur_index] >= 0) + { + result = parent->indexes[cur_index]; + break; + } + else + parent = pd[-parent->indexes[cur_index]]; + } + + /* A partition was not found. 
*/ + if (result < 0) + { + char *val_desc; + + val_desc = ExecBuildSlotPartitionKeyDescription(rel, + values, isnull, 64); + Assert(OidIsValid(RelationGetRelid(rel))); + ereport(ERROR, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("no partition of relation \"%s\" found for row", + RelationGetRelationName(rel)), + val_desc ? errdetail("Partition key of the failing row contains %s.", val_desc) : 0)); + } + + ecxt->ecxt_scantuple = ecxt_scantuple_old; + return result; +} + +/* + * RelationGetPartitionDispatchInfo + * Returns information necessary to route tuples down a partition tree + * + * The number of elements in the returned array (that is, the number of + * PartitionDispatch objects for the partitioned tables in the partition tree) + * is returned in *num_parted and a list of the OIDs of all the leaf + * partitions of rel is returned in *leaf_part_oids. + * + * All the relations in the partition tree (including 'rel') must have been + * locked (using at least the AccessShareLock) by the caller. + */ +static PartitionDispatch * +RelationGetPartitionDispatchInfo(Relation rel, + int *num_parted, List **leaf_part_oids) +{ + List *pdlist; + PartitionDispatchData **pd; + ListCell *lc; + int i; + + Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); + + *num_parted = 0; + *leaf_part_oids = NIL; + + get_partition_dispatch_recurse(rel, NULL, &pdlist, leaf_part_oids); + *num_parted = list_length(pdlist); + pd = (PartitionDispatchData **) palloc(*num_parted * + sizeof(PartitionDispatchData *)); + i = 0; + foreach (lc, pdlist) + { + pd[i++] = lfirst(lc); + } + + return pd; +} + +/* + * get_partition_dispatch_recurse + * Recursively expand partition tree rooted at rel + * + * As the partition tree is expanded in a depth-first manner, we mantain two + * global lists: of PartitionDispatch objects corresponding to partitioned + * tables in *pds and of the leaf partition OIDs in *leaf_part_oids. 
+ */ +static void +get_partition_dispatch_recurse(Relation rel, Relation parent, + List **pds, List **leaf_part_oids) +{ + TupleDesc tupdesc = RelationGetDescr(rel); + PartitionDesc partdesc = RelationGetPartitionDesc(rel); + PartitionKey partkey = RelationGetPartitionKey(rel); + PartitionDispatch pd; + int i; + + /* Build a PartitionDispatch for this table and add it to *pds. */ + pd = (PartitionDispatch) palloc(sizeof(PartitionDispatchData)); + *pds = lappend(*pds, pd); + pd->reldesc = rel; + pd->key = partkey; + pd->keystate = NIL; + pd->partdesc = partdesc; + if (parent != NULL) + { + /* + * For every partitioned table other than root, we must store a + * tuple table slot initialized with its tuple descriptor and a + * tuple conversion map to convert a tuple from its parent's + * rowtype to its own. That is to make sure that we are looking at + * the correct row using the correct tuple descriptor when + * computing its partition key for tuple routing. + */ + pd->tupslot = MakeSingleTupleTableSlot(tupdesc); + pd->tupmap = convert_tuples_by_name(RelationGetDescr(parent), + tupdesc, + gettext_noop("could not convert row type")); + } + else + { + /* Not required for the root partitioned table */ + pd->tupslot = NULL; + pd->tupmap = NULL; + } + + /* + * Go look at each partition of this table. If it's a leaf partition, + * simply add its OID to *leaf_part_oids. If it's a partitioned table, + * recursively call get_partition_dispatch_recurse(), so that its + * partitions are processed as well and a corresponding PartitionDispatch + * object gets added to *pds. + * + * About the values in pd->indexes: for a leaf partition, it contains the + * leaf partition's position in the global list *leaf_part_oids minus 1, + * whereas for a partitioned table partition, it contains the partition's + * position in the global list *pds multiplied by -1. 
The latter is + * multiplied by -1 to distinguish partitioned tables from leaf partitions + * when going through the values in pd->indexes. So, for example, when + * using it during tuple-routing, encountering a value >= 0 means we found + * a leaf partition. It is immediately returned as the index in the array + * of ResultRelInfos of all the leaf partitions, using which we insert the + * tuple into that leaf partition. A negative value means we found a + * partitioned table. The value multiplied back by -1 is returned as the + * index in the array of PartitionDispatch objects of all partitioned + * tables in the tree, using which, search is continued further down the + * partition tree. + */ + pd->indexes = (int *) palloc(partdesc->nparts * sizeof(int)); + for (i = 0; i < partdesc->nparts; i++) + { + Oid partrelid = partdesc->oids[i]; + + if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE) + { + *leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid); + pd->indexes[i] = list_length(*leaf_part_oids) - 1; + } + else + { + /* + * We assume all tables in the partition tree were already + * locked by the caller. + */ + Relation partrel = heap_open(partrelid, NoLock); + + pd->indexes[i] = -list_length(*pds); + get_partition_dispatch_recurse(partrel, rel, pds, leaf_part_oids); + } + } +} + +/* ---------------- + * FormPartitionKeyDatum + * Construct values[] and isnull[] arrays for the partition key + * of a tuple. + * + * pd Partition dispatch object of the partitioned table + * slot Heap tuple from which to extract partition key + * estate executor state for evaluating any partition key + * expressions (must be non-NULL) + * values Array of partition key Datums (output area) + * isnull Array of is-null indicators (output area) + * + * the ecxt_scantuple slot of estate's per-tuple expr context must point to + * the heap tuple passed in. 
+ * ---------------- + */ +static void +FormPartitionKeyDatum(PartitionDispatch pd, + TupleTableSlot *slot, + EState *estate, + Datum *values, + bool *isnull) +{ + ListCell *partexpr_item; + int i; + + if (pd->key->partexprs != NIL && pd->keystate == NIL) + { + /* Check caller has set up context correctly */ + Assert(estate != NULL && + GetPerTupleExprContext(estate)->ecxt_scantuple == slot); + + /* First time through, set up expression evaluation state */ + pd->keystate = ExecPrepareExprList(pd->key->partexprs, estate); + } + + partexpr_item = list_head(pd->keystate); + for (i = 0; i < pd->key->partnatts; i++) + { + AttrNumber keycol = pd->key->partattrs[i]; + Datum datum; + bool isNull; + + if (keycol != 0) + { + /* Plain column; get the value directly from the heap tuple */ + datum = slot_getattr(slot, keycol, &isNull); + } + else + { + /* Expression; need to evaluate it */ + if (partexpr_item == NULL) + elog(ERROR, "wrong number of partition key expressions"); + datum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item), + GetPerTupleExprContext(estate), + &isNull); + partexpr_item = lnext(partexpr_item); + } + values[i] = datum; + isnull[i] = isNull; + } + + if (partexpr_item != NULL) + elog(ERROR, "wrong number of partition key expressions"); +} + +/* + * ExecBuildSlotPartitionKeyDescription + * + * This works very much like BuildIndexValueDescription() and is currently + * used for building error messages when ExecFindPartition() fails to find + * a partition for a row. + */ +static char * +ExecBuildSlotPartitionKeyDescription(Relation rel, + Datum *values, + bool *isnull, + int maxfieldlen) +{ + StringInfoData buf; + PartitionKey key = RelationGetPartitionKey(rel); + int partnatts = get_partition_natts(key); + int i; + Oid relid = RelationGetRelid(rel); + AclResult aclresult; + + if (check_enable_rls(relid, InvalidOid, true) == RLS_ENABLED) + return NULL; + + /* If the user has table-level access, just go build the description. 
*/ + aclresult = pg_class_aclcheck(relid, GetUserId(), ACL_SELECT); + if (aclresult != ACLCHECK_OK) + { + /* + * Step through the columns of the partition key and make sure the + * user has SELECT rights on all of them. + */ + for (i = 0; i < partnatts; i++) + { + AttrNumber attnum = get_partition_col_attnum(key, i); + + /* + * If this partition key column is an expression, we return no + * detail rather than try to figure out what column(s) the + * expression includes and if the user has SELECT rights on them. + */ + if (attnum == InvalidAttrNumber || + pg_attribute_aclcheck(relid, attnum, GetUserId(), + ACL_SELECT) != ACLCHECK_OK) + return NULL; + } + } + + initStringInfo(&buf); + appendStringInfo(&buf, "(%s) = (", + pg_get_partkeydef_columns(relid, true)); + + for (i = 0; i < partnatts; i++) + { + char *val; + int vallen; + + if (isnull[i]) + val = "null"; + else + { + Oid foutoid; + bool typisvarlena; + + getTypeOutputInfo(get_partition_col_typid(key, i), + &foutoid, &typisvarlena); + val = OidOutputFunctionCall(foutoid, values[i]); + } + + if (i > 0) + appendStringInfoString(&buf, ", "); + + /* truncate if needed */ + vallen = strlen(val); + if (vallen <= maxfieldlen) + appendStringInfoString(&buf, val); + else + { + vallen = pg_mbcliplen(val, vallen, maxfieldlen); + appendBinaryStringInfo(&buf, val, vallen); + appendStringInfoString(&buf, "..."); + } + } + + appendStringInfoChar(&buf, ')'); + + return buf.data; +} diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 49586a3c03..314c43916a 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -40,6 +40,7 @@ #include "access/htup_details.h" #include "access/xact.h" #include "commands/trigger.h" +#include "executor/execPartition.h" #include "executor/executor.h" #include "executor/nodeModifyTable.h" #include "foreign/fdwapi.h" diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h index 
454a940a23..154df89e54 100644 --- a/src/include/catalog/partition.h +++ b/src/include/catalog/partition.h @@ -39,37 +39,6 @@ typedef struct PartitionDescData typedef struct PartitionDescData *PartitionDesc; -/*----------------------- - * PartitionDispatch - information about one partitioned table in a partition - * hierarchy required to route a tuple to one of its partitions - * - * reldesc Relation descriptor of the table - * key Partition key information of the table - * keystate Execution state required for expressions in the partition key - * partdesc Partition descriptor of the table - * tupslot A standalone TupleTableSlot initialized with this table's tuple - * descriptor - * tupmap TupleConversionMap to convert from the parent's rowtype to - * this table's rowtype (when extracting the partition key of a - * tuple just before routing it through this table) - * indexes Array with partdesc->nparts members (for details on what - * individual members represent, see how they are set in - * RelationGetPartitionDispatchInfo()) - *----------------------- - */ -typedef struct PartitionDispatchData -{ - Relation reldesc; - PartitionKey key; - List *keystate; /* list of ExprState */ - PartitionDesc partdesc; - TupleTableSlot *tupslot; - TupleConversionMap *tupmap; - int *indexes; -} PartitionDispatchData; - -typedef struct PartitionDispatchData *PartitionDispatch; - extern void RelationBuildPartitionDesc(Relation relation); extern bool partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval, PartitionBoundInfo b1, @@ -86,19 +55,6 @@ extern List *map_partition_varattnos(List *expr, int target_varno, extern List *RelationGetPartitionQual(Relation rel); extern Expr *get_partition_qual_relid(Oid relid); -/* For tuple routing */ -extern PartitionDispatch *RelationGetPartitionDispatchInfo(Relation rel, - int *num_parted, List **leaf_part_oids); -extern void FormPartitionKeyDatum(PartitionDispatch pd, - TupleTableSlot *slot, - EState *estate, - Datum 
*values, - bool *isnull); -extern int get_partition_for_tuple(PartitionDispatch *pd, - TupleTableSlot *slot, - EState *estate, - PartitionDispatchData **failed_at, - TupleTableSlot **failed_slot); extern Oid get_default_oid_from_partdesc(PartitionDesc partdesc); extern Oid get_default_partition_oid(Oid parentId); extern void update_default_partition_oid(Oid parentId, Oid defaultPartId); @@ -106,4 +62,8 @@ extern void check_default_allows_bound(Relation parent, Relation defaultRel, PartitionBoundSpec *new_spec); extern List *get_proposed_default_constraint(List *new_part_constaints); +/* For tuple routing */ +extern int get_partition_for_tuple(Relation relation, Datum *values, + bool *isnull); + #endif /* PARTITION_H */ diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h new file mode 100644 index 0000000000..8c9ab2f1a9 --- /dev/null +++ b/src/include/executor/execPartition.h @@ -0,0 +1,65 @@ +/*-------------------------------------------------------------------- + * execPartition.h + * POSTGRES partitioning executor interface + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/include/executor/execPartition.h + *-------------------------------------------------------------------- + */ + +#ifndef EXECPARTITION_H +#define EXECPARTITION_H + +#include "catalog/partition.h" +#include "nodes/execnodes.h" +#include "nodes/parsenodes.h" +#include "nodes/plannodes.h" + +/*----------------------- + * PartitionDispatch - information about one partitioned table in a partition + * hierarchy required to route a tuple to one of its partitions + * + * reldesc Relation descriptor of the table + * key Partition key information of the table + * keystate Execution state required for expressions in the partition key + * partdesc Partition descriptor of the table + * tupslot A standalone TupleTableSlot initialized with 
this table's tuple + * descriptor + * tupmap TupleConversionMap to convert from the parent's rowtype to + * this table's rowtype (when extracting the partition key of a + * tuple just before routing it through this table) + * indexes Array with partdesc->nparts members (for details on what + * individual members represent, see how they are set in + * get_partition_dispatch_recurse()) + *----------------------- + */ +typedef struct PartitionDispatchData +{ + Relation reldesc; + PartitionKey key; + List *keystate; /* list of ExprState */ + PartitionDesc partdesc; + TupleTableSlot *tupslot; + TupleConversionMap *tupmap; + int *indexes; +} PartitionDispatchData; + +typedef struct PartitionDispatchData *PartitionDispatch; + +extern void ExecSetupPartitionTupleRouting(Relation rel, + Index resultRTindex, + EState *estate, + PartitionDispatch **pd, + ResultRelInfo **partitions, + TupleConversionMap ***tup_conv_maps, + TupleTableSlot **partition_tuple_slot, + int *num_parted, int *num_partitions); +extern int ExecFindPartition(ResultRelInfo *resultRelInfo, + PartitionDispatch *pd, + TupleTableSlot *slot, + EState *estate); + +#endif /* EXECPARTITION_H */ diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 770881849c..ee2ea156d1 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -188,6 +188,8 @@ extern void ExecCleanUpTriggerState(EState *estate); extern bool ExecContextForcesOids(PlanState *planstate, bool *hasoids); extern void ExecConstraints(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate); +extern void ExecPartitionCheck(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, EState *estate); extern void ExecWithCheckOptions(WCOKind kind, ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate); extern LockTupleMode ExecUpdateLockMode(EState *estate, ResultRelInfo *relinfo); @@ -206,18 +208,6 @@ extern void EvalPlanQualSetPlan(EPQState *epqstate, extern void 
EvalPlanQualSetTuple(EPQState *epqstate, Index rti, HeapTuple tuple); extern HeapTuple EvalPlanQualGetTuple(EPQState *epqstate, Index rti); -extern void ExecSetupPartitionTupleRouting(Relation rel, - Index resultRTindex, - EState *estate, - PartitionDispatch **pd, - ResultRelInfo **partitions, - TupleConversionMap ***tup_conv_maps, - TupleTableSlot **partition_tuple_slot, - int *num_parted, int *num_partitions); -extern int ExecFindPartition(ResultRelInfo *resultRelInfo, - PartitionDispatch *pd, - TupleTableSlot *slot, - EState *estate); #define EvalPlanQualSetSlot(epqstate, slot) ((epqstate)->origslot = (slot)) extern void EvalPlanQualFetchRowMarks(EPQState *epqstate); -- 2.11.0
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers