On 2018/11/15 4:27, Robert Haas wrote: > RelationBuildPartitionDesc doesn't lock the children > whose relpartbounds it is fetching (!), so unless we're guaranteed to > have already locked the children earlier for some other reason, we > could grab the partition bound at this point and then it could change > again before we get a lock on them.
Hmm, I think that RelationBuildPartitionDesc doesn't need to lock a partition before fetching its relpartbound, because the latter can't change if the caller is holding a lock on the parent, which it must be if we're in RelationBuildPartitionDesc for the parent at all. Am I missing something? As Michael pointed out, the first cleanup patch needs to be rebased due to a recent commit [1]. I did that to see if something we did in that commit made things worse for your patch, but it seems fine. I had to go and change things outside RelationBuildPartitionDesc as I rebased, due to the aforementioned commit, but they're simple changes such as changing List * arguments of some newly added functions to PartitionBoundSpec **. Please find the rebased patches attached to this email. Thanks, Amit [1] https://git.postgresql.org/gitweb/?p=postgresql.git;a=commit;h=b52b7dc2
From 3e7642c236f04bc65f3f39e7de98d7ff85166e03 Mon Sep 17 00:00:00 2001 From: Robert Haas <rh...@postgresql.org> Date: Wed, 14 Nov 2018 11:11:58 -0500 Subject: [PATCH 1/2] Reduce unnecessary list construction in RelationBuildPartitionDesc. The 'partoids' list which was constructed by the previous version of this code was necessarily identical to 'inhoids'. There's no point to duplicating the list, so avoid that. Instead, construct the array representation directly from the original 'inhoids' list. Also, use an array rather than a list for 'boundspecs'. We know exactly how many items we need to store, so there's really no reason to use a list. Using an array instead reduces the number of memory allocations we perform. --- src/backend/partitioning/partbounds.c | 66 +++++++++++++++-------------------- src/backend/utils/cache/partcache.c | 36 +++++++++++-------- src/include/partitioning/partbounds.h | 5 ++- 3 files changed, 51 insertions(+), 56 deletions(-) diff --git a/src/backend/partitioning/partbounds.c b/src/backend/partitioning/partbounds.c index 0b5e0dd89f..a8f4a1a685 100644 --- a/src/backend/partitioning/partbounds.c +++ b/src/backend/partitioning/partbounds.c @@ -70,15 +70,12 @@ static int32 qsort_partition_list_value_cmp(const void *a, const void *b, void *arg); static int32 qsort_partition_rbound_cmp(const void *a, const void *b, void *arg); -static PartitionBoundInfo create_hash_bounds(List *boundspecs, - PartitionKey key, - int **mapping); -static PartitionBoundInfo create_list_bounds(List *boundspecs, - PartitionKey key, - int **mapping); -static PartitionBoundInfo create_range_bounds(List *boundspecs, - PartitionKey key, - int **mapping); +static PartitionBoundInfo create_hash_bounds(PartitionBoundSpec **boundspecs, + int nparts, PartitionKey key, int **mapping); +static PartitionBoundInfo create_list_bounds(PartitionBoundSpec **boundspecs, + int nparts, PartitionKey key, int **mapping); +static PartitionBoundInfo create_range_bounds(PartitionBoundSpec 
**boundspecs, + int nparts, PartitionKey key, int **mapping); static PartitionRangeBound *make_one_partition_rbound(PartitionKey key, int index, List *datums, bool lower); static int32 partition_hbound_cmp(int modulus1, int remainder1, int modulus2, @@ -169,9 +166,9 @@ get_qual_from_partbound(Relation rel, Relation parent, * current memory context. */ PartitionBoundInfo -partition_bounds_create(List *boundspecs, PartitionKey key, int **mapping) +partition_bounds_create(PartitionBoundSpec **boundspecs, int nparts, + PartitionKey key, int **mapping) { - int nparts = list_length(boundspecs); int i; Assert(nparts > 0); @@ -199,13 +196,13 @@ partition_bounds_create(List *boundspecs, PartitionKey key, int **mapping) switch (key->strategy) { case PARTITION_STRATEGY_HASH: - return create_hash_bounds(boundspecs, key, mapping); + return create_hash_bounds(boundspecs, nparts, key, mapping); case PARTITION_STRATEGY_LIST: - return create_list_bounds(boundspecs, key, mapping); + return create_list_bounds(boundspecs, nparts, key, mapping); case PARTITION_STRATEGY_RANGE: - return create_range_bounds(boundspecs, key, mapping); + return create_range_bounds(boundspecs, nparts, key, mapping); default: elog(ERROR, "unexpected partition strategy: %d", @@ -222,13 +219,12 @@ partition_bounds_create(List *boundspecs, PartitionKey key, int **mapping) * Create a PartitionBoundInfo for a hash partitioned table */ static PartitionBoundInfo -create_hash_bounds(List *boundspecs, PartitionKey key, int **mapping) +create_hash_bounds(PartitionBoundSpec **boundspecs, int nparts, + PartitionKey key, int **mapping) { PartitionBoundInfo boundinfo; PartitionHashBound **hbounds = NULL; - ListCell *cell; - int i, - nparts = list_length(boundspecs); + int i; int ndatums = 0; int greatest_modulus; @@ -244,10 +240,9 @@ create_hash_bounds(List *boundspecs, PartitionKey key, int **mapping) palloc(nparts * sizeof(PartitionHashBound *)); /* Convert from node to the internal representation */ - i = 0; - 
foreach(cell, boundspecs) + for (i = 0; i < nparts; i++) { - PartitionBoundSpec *spec = castNode(PartitionBoundSpec, lfirst(cell)); + PartitionBoundSpec *spec = boundspecs[i]; if (spec->strategy != PARTITION_STRATEGY_HASH) elog(ERROR, "invalid strategy in partition bound spec"); @@ -256,7 +251,6 @@ create_hash_bounds(List *boundspecs, PartitionKey key, int **mapping) hbounds[i]->modulus = spec->modulus; hbounds[i]->remainder = spec->remainder; hbounds[i]->index = i; - i++; } /* Sort all the bounds in ascending order */ @@ -307,7 +301,8 @@ create_hash_bounds(List *boundspecs, PartitionKey key, int **mapping) * Create a PartitionBoundInfo for a list partitioned table */ static PartitionBoundInfo -create_list_bounds(List *boundspecs, PartitionKey key, int **mapping) +create_list_bounds(PartitionBoundSpec **boundspecs, int nparts, + PartitionKey key, int **mapping) { PartitionBoundInfo boundinfo; PartitionListValue **all_values = NULL; @@ -327,9 +322,9 @@ create_list_bounds(List *boundspecs, PartitionKey key, int **mapping) boundinfo->default_index = -1; /* Create a unified list of non-null values across all partitions. */ - foreach(cell, boundspecs) + for (i = 0; i < nparts; i++) { - PartitionBoundSpec *spec = castNode(PartitionBoundSpec, lfirst(cell)); + PartitionBoundSpec *spec = boundspecs[i]; ListCell *c; if (spec->strategy != PARTITION_STRATEGY_LIST) @@ -343,7 +338,6 @@ create_list_bounds(List *boundspecs, PartitionKey key, int **mapping) if (spec->is_default) { default_index = i; - i++; continue; } @@ -374,8 +368,6 @@ create_list_bounds(List *boundspecs, PartitionKey key, int **mapping) if (list_value) non_null_values = lappend(non_null_values, list_value); } - - i++; } ndatums = list_length(non_null_values); @@ -458,7 +450,7 @@ create_list_bounds(List *boundspecs, PartitionKey key, int **mapping) } /* All partition must now have been assigned canonical indexes. 
*/ - Assert(next_index == list_length(boundspecs)); + Assert(next_index == nparts); return boundinfo; } @@ -467,16 +459,15 @@ create_list_bounds(List *boundspecs, PartitionKey key, int **mapping) * Create a PartitionBoundInfo for a range partitioned table */ static PartitionBoundInfo -create_range_bounds(List *boundspecs, PartitionKey key, int **mapping) +create_range_bounds(PartitionBoundSpec **boundspecs, int nparts, + PartitionKey key, int **mapping) { PartitionBoundInfo boundinfo; PartitionRangeBound **rbounds = NULL; PartitionRangeBound **all_bounds, *prev; - ListCell *cell; int i, - k, - nparts = list_length(boundspecs); + k; int ndatums = 0; int default_index = -1; int next_index = 0; @@ -493,10 +484,10 @@ create_range_bounds(List *boundspecs, PartitionKey key, int **mapping) palloc0(2 * nparts * sizeof(PartitionRangeBound *)); /* Create a unified list of range bounds across all the partitions. */ - i = ndatums = 0; - foreach(cell, boundspecs) + ndatums = 0; + for (i = 0; i < nparts; i++) { - PartitionBoundSpec *spec = castNode(PartitionBoundSpec, lfirst(cell)); + PartitionBoundSpec *spec = boundspecs[i]; PartitionRangeBound *lower, *upper; @@ -510,7 +501,7 @@ create_range_bounds(List *boundspecs, PartitionKey key, int **mapping) */ if (spec->is_default) { - default_index = i++; + default_index = i; continue; } @@ -518,7 +509,6 @@ create_range_bounds(List *boundspecs, PartitionKey key, int **mapping) upper = make_one_partition_rbound(key, i, spec->upperdatums, false); all_bounds[ndatums++] = lower; all_bounds[ndatums++] = upper; - i++; } Assert(ndatums == nparts * 2 || diff --git a/src/backend/utils/cache/partcache.c b/src/backend/utils/cache/partcache.c index 07653f312b..0d732b4b84 100644 --- a/src/backend/utils/cache/partcache.c +++ b/src/backend/utils/cache/partcache.c @@ -257,26 +257,34 @@ RelationBuildPartitionDesc(Relation rel) PartitionDesc partdesc; PartitionBoundInfo boundinfo; List *inhoids; - List *boundspecs = NIL; + PartitionBoundSpec 
**boundspecs = NULL; + Oid *oids; ListCell *cell; int i, nparts; PartitionKey key = RelationGetPartitionKey(rel); MemoryContext oldcxt; - Oid *oids_orig; int *mapping; /* Get partition oids from pg_inherits */ inhoids = find_inheritance_children(RelationGetRelid(rel), NoLock); + nparts = list_length(inhoids); - /* Collect bound spec nodes in a list */ + if (nparts > 0) + { + oids = palloc(nparts * sizeof(Oid)); + boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *)); + } + + /* Collect bound spec nodes for each partition */ + i = 0; foreach(cell, inhoids) { Oid inhrelid = lfirst_oid(cell); HeapTuple tuple; Datum datum; bool isnull; - Node *boundspec; + PartitionBoundSpec *boundspec; tuple = SearchSysCache1(RELOID, inhrelid); if (!HeapTupleIsValid(tuple)) @@ -287,14 +295,16 @@ RelationBuildPartitionDesc(Relation rel) &isnull); if (isnull) elog(ERROR, "null relpartbound for relation %u", inhrelid); - boundspec = (Node *) stringToNode(TextDatumGetCString(datum)); + boundspec = stringToNode(TextDatumGetCString(datum)); + if (!IsA(boundspec, PartitionBoundSpec)) + elog(ERROR, "invalid relpartbound for relation %u", inhrelid); /* * Sanity check: If the PartitionBoundSpec says this is the default * partition, its OID should correspond to whatever's stored in * pg_partitioned_table.partdefid; if not, the catalog is corrupt. 
*/ - if (castNode(PartitionBoundSpec, boundspec)->is_default) + if (boundspec->is_default) { Oid partdefid; @@ -304,12 +314,12 @@ RelationBuildPartitionDesc(Relation rel) inhrelid, partdefid); } - boundspecs = lappend(boundspecs, boundspec); + oids[i] = inhrelid; + boundspecs[i] = boundspec; + ++i; ReleaseSysCache(tuple); } - nparts = list_length(boundspecs); - /* Now build the actual relcache partition descriptor */ rel->rd_pdcxt = AllocSetContextCreate(CacheMemoryContext, "partition descriptor", @@ -330,11 +340,7 @@ RelationBuildPartitionDesc(Relation rel) } /* First create PartitionBoundInfo */ - boundinfo = partition_bounds_create(boundspecs, key, &mapping); - oids_orig = (Oid *) palloc(sizeof(Oid) * partdesc->nparts); - i = 0; - foreach(cell, inhoids) - oids_orig[i++] = lfirst_oid(cell); + boundinfo = partition_bounds_create(boundspecs, nparts, key, &mapping); /* Now copy boundinfo and oids into partdesc. */ oldcxt = MemoryContextSwitchTo(rel->rd_pdcxt); @@ -348,7 +354,7 @@ RelationBuildPartitionDesc(Relation rel) * canonicalized representation of the partition bounds. 
*/ for (i = 0; i < partdesc->nparts; i++) - partdesc->oids[mapping[i]] = oids_orig[i]; + partdesc->oids[mapping[i]] = oids[i]; MemoryContextSwitchTo(oldcxt); rel->rd_partdesc = partdesc; diff --git a/src/include/partitioning/partbounds.h b/src/include/partitioning/partbounds.h index 7a697d1c0a..36fb584e23 100644 --- a/src/include/partitioning/partbounds.h +++ b/src/include/partitioning/partbounds.h @@ -80,9 +80,8 @@ extern uint64 compute_partition_hash_value(int partnatts, FmgrInfo *partsupfunc, Datum *values, bool *isnull); extern List *get_qual_from_partbound(Relation rel, Relation parent, PartitionBoundSpec *spec); -extern PartitionBoundInfo partition_bounds_create(List *boundspecs, - PartitionKey key, - int **mapping); +extern PartitionBoundInfo partition_bounds_create(PartitionBoundSpec **boundspecs, + int nparts, PartitionKey key, int **mapping); extern bool partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval, PartitionBoundInfo b1, PartitionBoundInfo b2); -- 2.11.0
From eb5682bde2b070bac09f69b817b2be4dc73e3332 Mon Sep 17 00:00:00 2001 From: Robert Haas <rh...@postgresql.org> Date: Wed, 14 Nov 2018 12:15:44 -0500 Subject: [PATCH 2/2] Ensure that RelationBuildPartitionDesc sees a consistent view. If partitions are added or removed concurrently, make sure that we nevertheless get a view of the partition list and the partition descriptor for each partition which is consistent with the system state at some single point in the commit history. To do this, reuse an idea first invented by Noah Misch back in commit 4240e429d0c2d889d0cda23c618f94e12c13ade7. --- src/backend/utils/cache/partcache.c | 135 ++++++++++++++++++++++++++---------- 1 file changed, 100 insertions(+), 35 deletions(-) diff --git a/src/backend/utils/cache/partcache.c b/src/backend/utils/cache/partcache.c index 0d732b4b84..623b156dd4 100644 --- a/src/backend/utils/cache/partcache.c +++ b/src/backend/utils/cache/partcache.c @@ -28,8 +28,10 @@ #include "optimizer/clauses.h" #include "optimizer/planner.h" #include "partitioning/partbounds.h" +#include "storage/sinval.h" #include "utils/builtins.h" #include "utils/datum.h" +#include "utils/inval.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/partcache.h" @@ -266,45 +268,113 @@ RelationBuildPartitionDesc(Relation rel) MemoryContext oldcxt; int *mapping; - /* Get partition oids from pg_inherits */ - inhoids = find_inheritance_children(RelationGetRelid(rel), NoLock); - nparts = list_length(inhoids); - - if (nparts > 0) + /* + * Fetch catalog information. Since we want to allow partitions to be + * added and removed without holding AccessExclusiveLock on the parent + * table, it's possible that the catalog contents could be changing under + * us. That means that by the time we fetch the partition bound for a + * partition returned by find_inheritance_children, it might no longer be + * a partition or might even be a partition of some other table. 
+ * + * To ensure that we get a consistent view of the catalog data, we first + * fetch everything we need and then call AcceptInvalidationMessages. If + * SharedInvalidMessageCounter advances between the time we start fetching + * information and the time AcceptInvalidationMessages() completes, that + * means something may have changed under us, so we start over and do it + * all again. + */ + for (;;) { - oids = palloc(nparts * sizeof(Oid)); - boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *)); + uint64 inval_count = SharedInvalidMessageCounter; + + /* Get partition oids from pg_inherits */ + inhoids = find_inheritance_children(RelationGetRelid(rel), NoLock); + nparts = list_length(inhoids); + + if (nparts > 0) + { + oids = palloc(nparts * sizeof(Oid)); + boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *)); + } + + /* Collect bound spec nodes for each partition */ + i = 0; + foreach(cell, inhoids) + { + Oid inhrelid = lfirst_oid(cell); + HeapTuple tuple; + PartitionBoundSpec *boundspec = NULL; + + /* + * Don't put any sanity checks here that might fail as a result of + * concurrent DDL, such as a check that relpartbound is not NULL. + * We could transiently see such states as a result of concurrent + * DDL. Such checks can be performed only after we're sure we got + * a consistent view of the underlying data. + */ + tuple = SearchSysCache1(RELOID, inhrelid); + if (HeapTupleIsValid(tuple)) + { + Datum datum; + bool isnull; + + datum = SysCacheGetAttr(RELOID, tuple, + Anum_pg_class_relpartbound, + &isnull); + if (!isnull) + boundspec = stringToNode(TextDatumGetCString(datum)); + ReleaseSysCache(tuple); + } + + oids[i] = inhrelid; + boundspecs[i] = boundspec; + ++i; + } + + /* + * If no relevant catalog changes have occurred (see comments at the + * top of this loop), then we got a consistent view of our partition + * list and can stop now. 
+ */ + AcceptInvalidationMessages(); + if (inval_count == SharedInvalidMessageCounter) + break; + + /* Something changed, so retry from the top. */ + if (oids != NULL) + { + pfree(oids); + oids = NULL; + } + if (boundspecs != NULL) + { + pfree(boundspecs); + boundspecs = NULL; + } + if (inhoids != NIL) + list_free(inhoids); } - /* Collect bound spec nodes for each partition */ - i = 0; - foreach(cell, inhoids) + /* + * At this point, we should have a consistent view of the data we got from + * pg_inherits and pg_class, so it's safe to perform some sanity checks. + */ + for (i = 0; i < nparts; ++i) { - Oid inhrelid = lfirst_oid(cell); - HeapTuple tuple; - Datum datum; - bool isnull; - PartitionBoundSpec *boundspec; + Oid inhrelid = oids[i]; + PartitionBoundSpec *spec = boundspecs[i]; - tuple = SearchSysCache1(RELOID, inhrelid); - if (!HeapTupleIsValid(tuple)) - elog(ERROR, "cache lookup failed for relation %u", inhrelid); - - datum = SysCacheGetAttr(RELOID, tuple, - Anum_pg_class_relpartbound, - &isnull); - if (isnull) - elog(ERROR, "null relpartbound for relation %u", inhrelid); - boundspec = stringToNode(TextDatumGetCString(datum)); - if (!IsA(boundspec, PartitionBoundSpec)) + if (!spec) + elog(ERROR, "missing relpartbound for relation %u", inhrelid); + if (!IsA(spec, PartitionBoundSpec)) elog(ERROR, "invalid relpartbound for relation %u", inhrelid); /* - * Sanity check: If the PartitionBoundSpec says this is the default - * partition, its OID should correspond to whatever's stored in - * pg_partitioned_table.partdefid; if not, the catalog is corrupt. + * If the PartitionBoundSpec says this is the default partition, its + * OID should match pg_partitioned_table.partdefid; if not, the + * catalog is corrupt. 
*/ - if (boundspec->is_default) + if (spec->is_default) { Oid partdefid; @@ -313,11 +383,6 @@ RelationBuildPartitionDesc(Relation rel) elog(ERROR, "expected partdefid %u, but got %u", inhrelid, partdefid); } - - oids[i] = inhrelid; - boundspecs[i] = boundspec; - ++i; - ReleaseSysCache(tuple); } /* Now build the actual relcache partition descriptor */ -- 2.11.0