On 2018/11/15 11:03, Amit Langote wrote: > As Michael pointed out, the first cleanup patch needs to be rebased due to > a recent commit [1]. I did that to see if something we did in that commit > made things worse for your patch, but seems fine. I had to go and change > things outside RelationBuildPartitionDesc as I rebased, due to the > aforementioned commit, but they're simple changes such as changing List * > arguments of some newly added functions to PartitionBoundSpec **. Please > find the rebased patches attached with this email. > > [1] https://git.postgresql.org/gitweb/?p=postgresql.git;a=commit;h=b52b7dc2
I noticed that the regression tests containing partitioned tables fail randomly with the rebased patches I posted, whereas they don't if I apply them to HEAD without [1]. It seems to be due to the slightly confused memory context handling in RelationBuildPartitionDesc after [1], which Alvaro had expressed some doubts about yesterday [2]. I've fixed 0001 again to re-order the code so that allocations happen in the correct context, and now tests pass with the rebased patches. By the way, I noticed that the oids array added by Robert's original 0001 patch wasn't initialized to NULL, which could lead to calling pfree on a garbage value of oids after the 2nd patch. Thanks, Amit [2] https://www.postgresql.org/message-id/20181113135915.v4r77tdthlajdlqq%40alvherre.pgsql
From bff9ca17e43d6538c5e01e5de7a95f6e426e0d55 Mon Sep 17 00:00:00 2001 From: Robert Haas <rh...@postgresql.org> Date: Wed, 14 Nov 2018 11:11:58 -0500 Subject: [PATCH 1/2] Reduce unnecessary list construction in RelationBuildPartitionDesc. The 'partoids' list which was constructed by the previous version of this code was necessarily identical to 'inhoids'. There's no point to duplicating the list, so avoid that. Instead, construct the array representation directly from the original 'inhoids' list. Also, use an array rather than a list for 'boundspecs'. We know exactly how many items we need to store, so there's really no reason to use a list. Using an array instead reduces the number of memory allocations we perform. --- src/backend/partitioning/partbounds.c | 66 ++++++++++++++----------------- src/backend/utils/cache/partcache.c | 73 ++++++++++++++++++----------------- src/include/partitioning/partbounds.h | 5 +-- 3 files changed, 67 insertions(+), 77 deletions(-) diff --git a/src/backend/partitioning/partbounds.c b/src/backend/partitioning/partbounds.c index 0b5e0dd89f..a8f4a1a685 100644 --- a/src/backend/partitioning/partbounds.c +++ b/src/backend/partitioning/partbounds.c @@ -70,15 +70,12 @@ static int32 qsort_partition_list_value_cmp(const void *a, const void *b, void *arg); static int32 qsort_partition_rbound_cmp(const void *a, const void *b, void *arg); -static PartitionBoundInfo create_hash_bounds(List *boundspecs, - PartitionKey key, - int **mapping); -static PartitionBoundInfo create_list_bounds(List *boundspecs, - PartitionKey key, - int **mapping); -static PartitionBoundInfo create_range_bounds(List *boundspecs, - PartitionKey key, - int **mapping); +static PartitionBoundInfo create_hash_bounds(PartitionBoundSpec **boundspecs, + int nparts, PartitionKey key, int **mapping); +static PartitionBoundInfo create_list_bounds(PartitionBoundSpec **boundspecs, + int nparts, PartitionKey key, int **mapping); +static PartitionBoundInfo 
create_range_bounds(PartitionBoundSpec **boundspecs, + int nparts, PartitionKey key, int **mapping); static PartitionRangeBound *make_one_partition_rbound(PartitionKey key, int index, List *datums, bool lower); static int32 partition_hbound_cmp(int modulus1, int remainder1, int modulus2, @@ -169,9 +166,9 @@ get_qual_from_partbound(Relation rel, Relation parent, * current memory context. */ PartitionBoundInfo -partition_bounds_create(List *boundspecs, PartitionKey key, int **mapping) +partition_bounds_create(PartitionBoundSpec **boundspecs, int nparts, + PartitionKey key, int **mapping) { - int nparts = list_length(boundspecs); int i; Assert(nparts > 0); @@ -199,13 +196,13 @@ partition_bounds_create(List *boundspecs, PartitionKey key, int **mapping) switch (key->strategy) { case PARTITION_STRATEGY_HASH: - return create_hash_bounds(boundspecs, key, mapping); + return create_hash_bounds(boundspecs, nparts, key, mapping); case PARTITION_STRATEGY_LIST: - return create_list_bounds(boundspecs, key, mapping); + return create_list_bounds(boundspecs, nparts, key, mapping); case PARTITION_STRATEGY_RANGE: - return create_range_bounds(boundspecs, key, mapping); + return create_range_bounds(boundspecs, nparts, key, mapping); default: elog(ERROR, "unexpected partition strategy: %d", @@ -222,13 +219,12 @@ partition_bounds_create(List *boundspecs, PartitionKey key, int **mapping) * Create a PartitionBoundInfo for a hash partitioned table */ static PartitionBoundInfo -create_hash_bounds(List *boundspecs, PartitionKey key, int **mapping) +create_hash_bounds(PartitionBoundSpec **boundspecs, int nparts, + PartitionKey key, int **mapping) { PartitionBoundInfo boundinfo; PartitionHashBound **hbounds = NULL; - ListCell *cell; - int i, - nparts = list_length(boundspecs); + int i; int ndatums = 0; int greatest_modulus; @@ -244,10 +240,9 @@ create_hash_bounds(List *boundspecs, PartitionKey key, int **mapping) palloc(nparts * sizeof(PartitionHashBound *)); /* Convert from node to the internal 
representation */ - i = 0; - foreach(cell, boundspecs) + for (i = 0; i < nparts; i++) { - PartitionBoundSpec *spec = castNode(PartitionBoundSpec, lfirst(cell)); + PartitionBoundSpec *spec = boundspecs[i]; if (spec->strategy != PARTITION_STRATEGY_HASH) elog(ERROR, "invalid strategy in partition bound spec"); @@ -256,7 +251,6 @@ create_hash_bounds(List *boundspecs, PartitionKey key, int **mapping) hbounds[i]->modulus = spec->modulus; hbounds[i]->remainder = spec->remainder; hbounds[i]->index = i; - i++; } /* Sort all the bounds in ascending order */ @@ -307,7 +301,8 @@ create_hash_bounds(List *boundspecs, PartitionKey key, int **mapping) * Create a PartitionBoundInfo for a list partitioned table */ static PartitionBoundInfo -create_list_bounds(List *boundspecs, PartitionKey key, int **mapping) +create_list_bounds(PartitionBoundSpec **boundspecs, int nparts, + PartitionKey key, int **mapping) { PartitionBoundInfo boundinfo; PartitionListValue **all_values = NULL; @@ -327,9 +322,9 @@ create_list_bounds(List *boundspecs, PartitionKey key, int **mapping) boundinfo->default_index = -1; /* Create a unified list of non-null values across all partitions. */ - foreach(cell, boundspecs) + for (i = 0; i < nparts; i++) { - PartitionBoundSpec *spec = castNode(PartitionBoundSpec, lfirst(cell)); + PartitionBoundSpec *spec = boundspecs[i]; ListCell *c; if (spec->strategy != PARTITION_STRATEGY_LIST) @@ -343,7 +338,6 @@ create_list_bounds(List *boundspecs, PartitionKey key, int **mapping) if (spec->is_default) { default_index = i; - i++; continue; } @@ -374,8 +368,6 @@ create_list_bounds(List *boundspecs, PartitionKey key, int **mapping) if (list_value) non_null_values = lappend(non_null_values, list_value); } - - i++; } ndatums = list_length(non_null_values); @@ -458,7 +450,7 @@ create_list_bounds(List *boundspecs, PartitionKey key, int **mapping) } /* All partition must now have been assigned canonical indexes. 
*/ - Assert(next_index == list_length(boundspecs)); + Assert(next_index == nparts); return boundinfo; } @@ -467,16 +459,15 @@ create_list_bounds(List *boundspecs, PartitionKey key, int **mapping) * Create a PartitionBoundInfo for a range partitioned table */ static PartitionBoundInfo -create_range_bounds(List *boundspecs, PartitionKey key, int **mapping) +create_range_bounds(PartitionBoundSpec **boundspecs, int nparts, + PartitionKey key, int **mapping) { PartitionBoundInfo boundinfo; PartitionRangeBound **rbounds = NULL; PartitionRangeBound **all_bounds, *prev; - ListCell *cell; int i, - k, - nparts = list_length(boundspecs); + k; int ndatums = 0; int default_index = -1; int next_index = 0; @@ -493,10 +484,10 @@ create_range_bounds(List *boundspecs, PartitionKey key, int **mapping) palloc0(2 * nparts * sizeof(PartitionRangeBound *)); /* Create a unified list of range bounds across all the partitions. */ - i = ndatums = 0; - foreach(cell, boundspecs) + ndatums = 0; + for (i = 0; i < nparts; i++) { - PartitionBoundSpec *spec = castNode(PartitionBoundSpec, lfirst(cell)); + PartitionBoundSpec *spec = boundspecs[i]; PartitionRangeBound *lower, *upper; @@ -510,7 +501,7 @@ create_range_bounds(List *boundspecs, PartitionKey key, int **mapping) */ if (spec->is_default) { - default_index = i++; + default_index = i; continue; } @@ -518,7 +509,6 @@ create_range_bounds(List *boundspecs, PartitionKey key, int **mapping) upper = make_one_partition_rbound(key, i, spec->upperdatums, false); all_bounds[ndatums++] = lower; all_bounds[ndatums++] = upper; - i++; } Assert(ndatums == nparts * 2 || diff --git a/src/backend/utils/cache/partcache.c b/src/backend/utils/cache/partcache.c index 07653f312b..729b887442 100644 --- a/src/backend/utils/cache/partcache.c +++ b/src/backend/utils/cache/partcache.c @@ -255,28 +255,36 @@ void RelationBuildPartitionDesc(Relation rel) { PartitionDesc partdesc; - PartitionBoundInfo boundinfo; + PartitionBoundInfo boundinfo = NULL; List *inhoids; - List 
*boundspecs = NIL; + PartitionBoundSpec **boundspecs = NULL; + Oid *oids = NULL; ListCell *cell; int i, nparts; PartitionKey key = RelationGetPartitionKey(rel); MemoryContext oldcxt; - Oid *oids_orig; int *mapping; /* Get partition oids from pg_inherits */ inhoids = find_inheritance_children(RelationGetRelid(rel), NoLock); + nparts = list_length(inhoids); - /* Collect bound spec nodes in a list */ + if (nparts > 0) + { + oids = palloc(nparts * sizeof(Oid)); + boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *)); + } + + /* Collect bound spec nodes for each partition */ + i = 0; foreach(cell, inhoids) { Oid inhrelid = lfirst_oid(cell); HeapTuple tuple; Datum datum; bool isnull; - Node *boundspec; + PartitionBoundSpec *boundspec; tuple = SearchSysCache1(RELOID, inhrelid); if (!HeapTupleIsValid(tuple)) @@ -287,14 +295,16 @@ RelationBuildPartitionDesc(Relation rel) &isnull); if (isnull) elog(ERROR, "null relpartbound for relation %u", inhrelid); - boundspec = (Node *) stringToNode(TextDatumGetCString(datum)); + boundspec = stringToNode(TextDatumGetCString(datum)); + if (!IsA(boundspec, PartitionBoundSpec)) + elog(ERROR, "invalid relpartbound for relation %u", inhrelid); /* * Sanity check: If the PartitionBoundSpec says this is the default * partition, its OID should correspond to whatever's stored in * pg_partitioned_table.partdefid; if not, the catalog is corrupt. 
*/ - if (castNode(PartitionBoundSpec, boundspec)->is_default) + if (boundspec->is_default) { Oid partdefid; @@ -304,11 +314,16 @@ RelationBuildPartitionDesc(Relation rel) inhrelid, partdefid); } - boundspecs = lappend(boundspecs, boundspec); + oids[i] = inhrelid; + boundspecs[i] = boundspec; + ++i; ReleaseSysCache(tuple); } - nparts = list_length(boundspecs); + /* First create PartitionBoundInfo */ + if (nparts > 0) + boundinfo = partition_bounds_create(boundspecs, nparts, key, + &mapping); /* Now build the actual relcache partition descriptor */ rel->rd_pdcxt = AllocSetContextCreate(CacheMemoryContext, @@ -316,39 +331,25 @@ RelationBuildPartitionDesc(Relation rel) ALLOCSET_DEFAULT_SIZES); MemoryContextCopyAndSetIdentifier(rel->rd_pdcxt, RelationGetRelationName(rel)); + /* Make a copy of oids and boundinfo in the cache context. */ oldcxt = MemoryContextSwitchTo(rel->rd_pdcxt); partdesc = (PartitionDescData *) palloc0(sizeof(PartitionDescData)); partdesc->nparts = nparts; - /* oids and boundinfo are allocated below. */ - - MemoryContextSwitchTo(oldcxt); - - if (nparts == 0) + if (nparts > 0) { - rel->rd_partdesc = partdesc; - return; + partdesc->boundinfo = partition_bounds_copy(boundinfo, key); + partdesc->oids = (Oid *) palloc(partdesc->nparts * sizeof(Oid)); + + /* + * Now assign OIDs from the original array into mapped indexes of the + * result array. Order of OIDs in the former is defined by the + * catalog scan that retrieved them, whereas that in the latter is + * defined by canonicalized representation of the partition bounds. + */ + for (i = 0; i < partdesc->nparts; i++) + partdesc->oids[mapping[i]] = oids[i]; } - /* First create PartitionBoundInfo */ - boundinfo = partition_bounds_create(boundspecs, key, &mapping); - oids_orig = (Oid *) palloc(sizeof(Oid) * partdesc->nparts); - i = 0; - foreach(cell, inhoids) - oids_orig[i++] = lfirst_oid(cell); - - /* Now copy boundinfo and oids into partdesc. 
*/ - oldcxt = MemoryContextSwitchTo(rel->rd_pdcxt); - partdesc->boundinfo = partition_bounds_copy(boundinfo, key); - partdesc->oids = (Oid *) palloc(partdesc->nparts * sizeof(Oid)); - - /* - * Now assign OIDs from the original array into mapped indexes of the - * result array. Order of OIDs in the former is defined by the catalog - * scan that retrieved them, whereas that in the latter is defined by - * canonicalized representation of the partition bounds. - */ - for (i = 0; i < partdesc->nparts; i++) - partdesc->oids[mapping[i]] = oids_orig[i]; MemoryContextSwitchTo(oldcxt); rel->rd_partdesc = partdesc; diff --git a/src/include/partitioning/partbounds.h b/src/include/partitioning/partbounds.h index 7a697d1c0a..36fb584e23 100644 --- a/src/include/partitioning/partbounds.h +++ b/src/include/partitioning/partbounds.h @@ -80,9 +80,8 @@ extern uint64 compute_partition_hash_value(int partnatts, FmgrInfo *partsupfunc, Datum *values, bool *isnull); extern List *get_qual_from_partbound(Relation rel, Relation parent, PartitionBoundSpec *spec); -extern PartitionBoundInfo partition_bounds_create(List *boundspecs, - PartitionKey key, - int **mapping); +extern PartitionBoundInfo partition_bounds_create(PartitionBoundSpec **boundspecs, + int nparts, PartitionKey key, int **mapping); extern bool partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval, PartitionBoundInfo b1, PartitionBoundInfo b2); -- 2.11.0
From fb8ac535b92b62454224356e5fa4892c5b16f327 Mon Sep 17 00:00:00 2001 From: Robert Haas <rh...@postgresql.org> Date: Wed, 14 Nov 2018 12:15:44 -0500 Subject: [PATCH 2/2] Ensure that RelationBuildPartitionDesc sees a consistent view. If partitions are added or removed concurrently, make sure that we nevertheless get a view of the partition list and the partition descriptor for each partition which is consistent with the system state at some single point in the commit history. To do this, reuse an idea first invented by Noah Misch back in commit 4240e429d0c2d889d0cda23c618f94e12c13ade7. --- src/backend/utils/cache/partcache.c | 135 ++++++++++++++++++++++++++---------- 1 file changed, 100 insertions(+), 35 deletions(-) diff --git a/src/backend/utils/cache/partcache.c b/src/backend/utils/cache/partcache.c index 729b887442..91d56d9dfa 100644 --- a/src/backend/utils/cache/partcache.c +++ b/src/backend/utils/cache/partcache.c @@ -28,8 +28,10 @@ #include "optimizer/clauses.h" #include "optimizer/planner.h" #include "partitioning/partbounds.h" +#include "storage/sinval.h" #include "utils/builtins.h" #include "utils/datum.h" +#include "utils/inval.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/partcache.h" @@ -266,45 +268,113 @@ RelationBuildPartitionDesc(Relation rel) MemoryContext oldcxt; int *mapping; - /* Get partition oids from pg_inherits */ - inhoids = find_inheritance_children(RelationGetRelid(rel), NoLock); - nparts = list_length(inhoids); - - if (nparts > 0) + /* + * Fetch catalog information. Since we want to allow partitions to be + * added and removed without holding AccessExclusiveLock on the parent + * table, it's possible that the catalog contents could be changing under + * us. That means that by the time we fetch the partition bound for a + * partition returned by find_inheritance_children, it might no longer be + * a partition or might even be a partition of some other table.
+ * + * To ensure that we get a consistent view of the catalog data, we first + * fetch everything we need and then call AcceptInvalidationMessages. If + * SharedInvalidMessageCounter advances between the time we start fetching + * information and the time AcceptInvalidationMessages() completes, that + * means something may have changed under us, so we start over and do it + * all again. + */ + for (;;) { - oids = palloc(nparts * sizeof(Oid)); - boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *)); + uint64 inval_count = SharedInvalidMessageCounter; + + /* Get partition oids from pg_inherits */ + inhoids = find_inheritance_children(RelationGetRelid(rel), NoLock); + nparts = list_length(inhoids); + + if (nparts > 0) + { + oids = palloc(nparts * sizeof(Oid)); + boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *)); + } + + /* Collect bound spec nodes for each partition */ + i = 0; + foreach(cell, inhoids) + { + Oid inhrelid = lfirst_oid(cell); + HeapTuple tuple; + PartitionBoundSpec *boundspec = NULL; + + /* + * Don't put any sanity checks here that might fail as a result of + * concurrent DDL, such as a check that relpartbound is not NULL. + * We could transiently see such states as a result of concurrent + * DDL. Such checks can be performed only after we're sure we got + * a consistent view of the underlying data. + */ + tuple = SearchSysCache1(RELOID, inhrelid); + if (HeapTupleIsValid(tuple)) + { + Datum datum; + bool isnull; + + datum = SysCacheGetAttr(RELOID, tuple, + Anum_pg_class_relpartbound, + &isnull); + if (!isnull) + boundspec = stringToNode(TextDatumGetCString(datum)); + ReleaseSysCache(tuple); + } + + oids[i] = inhrelid; + boundspecs[i] = boundspec; + ++i; + } + + /* + * If no relevant catalog changes have occurred (see comments at the + * top of this loop), then we got a consistent view of our partition + * list and can stop now.
+ */ + AcceptInvalidationMessages(); + if (inval_count == SharedInvalidMessageCounter) + break; + + /* Something changed, so retry from the top. */ + if (oids != NULL) + { + pfree(oids); + oids = NULL; + } + if (boundspecs != NULL) + { + pfree(boundspecs); + boundspecs = NULL; + } + if (inhoids != NIL) + list_free(inhoids); } - /* Collect bound spec nodes for each partition */ - i = 0; - foreach(cell, inhoids) + /* + * At this point, we should have a consistent view of the data we got from + * pg_inherits and pg_class, so it's safe to perform some sanity checks. + */ + for (i = 0; i < nparts; ++i) { - Oid inhrelid = lfirst_oid(cell); - HeapTuple tuple; - Datum datum; - bool isnull; - PartitionBoundSpec *boundspec; + Oid inhrelid = oids[i]; + PartitionBoundSpec *spec = boundspecs[i]; - tuple = SearchSysCache1(RELOID, inhrelid); - if (!HeapTupleIsValid(tuple)) - elog(ERROR, "cache lookup failed for relation %u", inhrelid); - - datum = SysCacheGetAttr(RELOID, tuple, - Anum_pg_class_relpartbound, - &isnull); - if (isnull) - elog(ERROR, "null relpartbound for relation %u", inhrelid); - boundspec = stringToNode(TextDatumGetCString(datum)); - if (!IsA(boundspec, PartitionBoundSpec)) + if (!spec) + elog(ERROR, "missing relpartbound for relation %u", inhrelid); + if (!IsA(spec, PartitionBoundSpec)) elog(ERROR, "invalid relpartbound for relation %u", inhrelid); /* - * Sanity check: If the PartitionBoundSpec says this is the default - * partition, its OID should correspond to whatever's stored in - * pg_partitioned_table.partdefid; if not, the catalog is corrupt. + * If the PartitionBoundSpec says this is the default partition, its + * OID should match pg_partitioned_table.partdefid; if not, the + * catalog is corrupt. 
*/ - if (boundspec->is_default) + if (spec->is_default) { Oid partdefid; @@ -313,11 +383,6 @@ RelationBuildPartitionDesc(Relation rel) elog(ERROR, "expected partdefid %u, but got %u", inhrelid, partdefid); } - - oids[i] = inhrelid; - boundspecs[i] = boundspec; - ++i; - ReleaseSysCache(tuple); } /* First create PartitionBoundInfo */ -- 2.11.0