On Mon, Mar 27, 2023 at 01:28:24PM +0300, Alexander Pyhalov wrote: > Justin Pryzby писал 2023-03-26 17:51: > > On Sun, Dec 04, 2022 at 01:09:35PM -0600, Justin Pryzby wrote: > > > This currently handles partitions with a loop around the whole CIC > > > implementation, which means that things like WaitForLockers() happen > > > once for each index, the same as REINDEX CONCURRENTLY on a partitioned > > > table. Contrast that with ReindexRelationConcurrently(), which handles > > > all the indexes on a table in one pass by looping around indexes within > > > each phase. > > > > Rebased over the progress reporting fix (27f5c712b). > > > > I added a list of (intermediate) partitioned tables, rather than looping > > over the list of inheritors again, to save calling rel_get_relkind(). > > > > I think this patch is done. > > Overall looks good to me. However, I think that using 'partitioned' as list > of partitioned index oids in DefineIndex() is a bit misleading - we've just > used it as boolean, specifying if we are dealing with a partitioned > relation.
Right. This is also rebased on 8c852ba9a4 (Allow some exclusion constraints on partitions). -- Justin
>From 3f60cbdd12b67115f86854ff60a4009028b8b99f Mon Sep 17 00:00:00 2001 From: Justin Pryzby <pryz...@telsasoft.com> Date: Sat, 6 Jun 2020 17:42:23 -0500 Subject: [PATCH] Allow CREATE INDEX CONCURRENTLY on partitioned table https://www.postgresql.org/message-id/flat/20201031063117.gf3...@telsasoft.com --- doc/src/sgml/ddl.sgml | 4 +- doc/src/sgml/ref/create_index.sgml | 14 +- src/backend/commands/indexcmds.c | 201 ++++++++++++++++++------- src/test/regress/expected/indexing.out | 127 +++++++++++++++- src/test/regress/sql/indexing.sql | 26 +++- 5 files changed, 297 insertions(+), 75 deletions(-) diff --git a/doc/src/sgml/ddl.sgml b/doc/src/sgml/ddl.sgml index 58aaa691c6a..afa982154a8 100644 --- a/doc/src/sgml/ddl.sgml +++ b/doc/src/sgml/ddl.sgml @@ -4161,9 +4161,7 @@ ALTER TABLE measurement ATTACH PARTITION measurement_y2008m02 so that they are applied automatically to the entire hierarchy. This is very convenient, as not only will the existing partitions become indexed, but - also any partitions that are created in the future will. One limitation is - that it's not possible to use the <literal>CONCURRENTLY</literal> - qualifier when creating such a partitioned index. To avoid long lock + also any partitions that are created in the future will. To avoid long lock times, it is possible to use <command>CREATE INDEX ON ONLY</command> the partitioned table; such an index is marked invalid, and the partitions do not get the index applied automatically. The indexes on partitions can diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml index 40986aa502f..b05102efdaf 100644 --- a/doc/src/sgml/ref/create_index.sgml +++ b/doc/src/sgml/ref/create_index.sgml @@ -645,7 +645,10 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] <replaceable class= <para> If a problem arises while scanning the table, such as a deadlock or a uniqueness violation in a unique index, the <command>CREATE INDEX</command> - command will fail but leave behind an <quote>invalid</quote> index. This index + command will fail but leave behind an <quote>invalid</quote> index. + If this happens while build an index concurrently on a partitioned + table, the command can also leave behind <quote>valid</quote> or + <quote>invalid</quote> indexes on table partitions. The invalid index will be ignored for querying purposes because it might be incomplete; however it will still consume update overhead. The <application>psql</application> <command>\d</command> command will report such an index as <literal>INVALID</literal>: @@ -692,15 +695,6 @@ Indexes: cannot. </para> - <para> - Concurrent builds for indexes on partitioned tables are currently not - supported. However, you may concurrently build the index on each - partition individually and then finally create the partitioned index - non-concurrently in order to reduce the time where writes to the - partitioned table will be locked out. In this case, building the - partitioned index is a metadata only operation. - </para> - </refsect2> </refsect1> diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index baf3e6e57a5..dfe64052b81 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -92,6 +92,11 @@ static char *ChooseIndexName(const char *tabname, Oid namespaceId, bool primary, bool isconstraint); static char *ChooseIndexNameAddition(List *colnames); static List *ChooseIndexColumnNames(List *indexElems); +static void DefineIndexConcurrentInternal(Oid relationId, + Oid indexRelationId, + IndexInfo *indexInfo, + LOCKTAG heaplocktag, + LockRelId heaprelid); static void ReindexIndex(RangeVar *indexRelation, ReindexParams *params, bool isTopLevel); static void RangeVarCallbackForReindexIndex(const RangeVar *relation, @@ -555,7 +560,6 @@ DefineIndex(Oid relationId, bool amissummarizing; amoptions_function amoptions; bool partitioned; - bool safe_index; Datum reloptions; int16 *coloptions; IndexInfo *indexInfo; @@ -563,12 +567,10 @@ DefineIndex(Oid relationId, bits16 constr_flags; int numberOfAttributes; int numberOfKeyAttributes; - TransactionId limitXmin; ObjectAddress address; LockRelId heaprelid; LOCKTAG heaplocktag; LOCKMODE lockmode; - Snapshot snapshot; Oid root_save_userid; int root_save_sec_context; int root_save_nestlevel; @@ -699,20 +701,6 @@ DefineIndex(Oid relationId, * partition. */ partitioned = rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE; - if (partitioned) - { - /* - * Note: we check 'stmt->concurrent' rather than 'concurrent', so that - * the error is thrown also for temporary tables. Seems better to be - * consistent, even though we could do it on temporary table because - * we're not actually doing it concurrently. - */ - if (stmt->concurrent) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot create index on partitioned table \"%s\" concurrently", - RelationGetRelationName(rel)))); - } /* * Don't try to CREATE INDEX on temp tables of other backends. @@ -1102,10 +1090,6 @@ DefineIndex(Oid relationId, } } - /* Is index safe for others to ignore? See set_indexsafe_procflags() */ - safe_index = indexInfo->ii_Expressions == NIL && - indexInfo->ii_Predicate == NIL; - /* * Report index creation if appropriate (delay this till after most of the * error checks) @@ -1170,6 +1154,11 @@ DefineIndex(Oid relationId, if (pd->nparts != 0) flags |= INDEX_CREATE_INVALID; } + else if (concurrent && OidIsValid(parentIndexId)) + { + /* If concurrent, initially build index partitions as "invalid" */ + flags |= INDEX_CREATE_INVALID; + } if (stmt->deferrable) constr_flags |= INDEX_CONSTR_CREATE_DEFERRABLE; @@ -1519,21 +1508,7 @@ DefineIndex(Oid relationId, */ if (invalidate_parent) { - Relation pg_index = table_open(IndexRelationId, RowExclusiveLock); - HeapTuple tup, - newtup; - - tup = SearchSysCache1(INDEXRELID, - ObjectIdGetDatum(indexRelationId)); - if (!HeapTupleIsValid(tup)) - elog(ERROR, "cache lookup failed for index %u", - indexRelationId); - newtup = heap_copytuple(tup); - ((Form_pg_index) GETSTRUCT(newtup))->indisvalid = false; - CatalogTupleUpdate(pg_index, &tup->t_self, newtup); - ReleaseSysCache(tup); - table_close(pg_index, RowExclusiveLock); - heap_freetuple(newtup); + index_set_state_flags(indexRelationId, INDEX_DROP_CLEAR_VALID); /* * CCI here to make this update visible, in case this recurses @@ -1545,37 +1520,49 @@ DefineIndex(Oid relationId, /* * Indexes on partitioned tables are not themselves built, so we're - * done here. + * done here in the non-concurrent case. */ - AtEOXact_GUC(false, root_save_nestlevel); - SetUserIdAndSecContext(root_save_userid, root_save_sec_context); - table_close(rel, NoLock); - if (!OidIsValid(parentIndexId)) - pgstat_progress_end_command(); - else + if (!concurrent) { - /* Update progress for an intermediate partitioned index itself */ - pgstat_progress_incr_param(PROGRESS_CREATEIDX_PARTITIONS_DONE, 1); - } + AtEOXact_GUC(false, root_save_nestlevel); + SetUserIdAndSecContext(root_save_userid, root_save_sec_context); + table_close(rel, NoLock); - return address; + if (!OidIsValid(parentIndexId)) + pgstat_progress_end_command(); + else + { + /* + * Update progress for an intermediate partitioned index + * itself + */ + pgstat_progress_incr_param(PROGRESS_CREATEIDX_PARTITIONS_DONE, 1); + } + + return address; + } } AtEOXact_GUC(false, root_save_nestlevel); SetUserIdAndSecContext(root_save_userid, root_save_sec_context); - if (!concurrent) + /* + * All done in the non-concurrent case, and when building catalog entries + * of partitions for CIC. + */ + if (!concurrent || OidIsValid(parentIndexId)) { - /* Close the heap and we're done, in the non-concurrent case */ table_close(rel, NoLock); /* * If this is the top-level index, the command is done overall; - * otherwise, increment progress to report one child index is done. + * otherwise (when being called recursively), increment progress to + * report that one child index is done. Except in the concurrent + * (catalog-only) case, which is handled later. */ if (!OidIsValid(parentIndexId)) pgstat_progress_end_command(); - else + else if (!concurrent) pgstat_progress_incr_param(PROGRESS_CREATEIDX_PARTITIONS_DONE, 1); return address; @@ -1586,6 +1573,114 @@ DefineIndex(Oid relationId, SET_LOCKTAG_RELATION(heaplocktag, heaprelid.dbId, heaprelid.relId); table_close(rel, NoLock); + if (!partitioned) + { + /* CREATE INDEX CONCURRENTLY on a nonpartitioned table */ + DefineIndexConcurrentInternal(relationId, indexRelationId, + indexInfo, heaplocktag, heaprelid); + pgstat_progress_end_command(); + return address; + } + else + { + /* + * For CIC on a partitioned table, finish by building indexes on + * partitions + */ + + ListCell *lc; + List *childs; + List *tosetvalid = NIL; + MemoryContext cic_context, + old_context; + + /* Create special memory context for cross-transaction storage */ + cic_context = AllocSetContextCreate(PortalContext, + "Create index concurrently", + ALLOCSET_DEFAULT_SIZES); + + old_context = MemoryContextSwitchTo(cic_context); + childs = find_all_inheritors(indexRelationId, ShareLock, NULL); + MemoryContextSwitchTo(old_context); + + foreach(lc, childs) + { + Oid indrelid = lfirst_oid(lc); + Oid tabrelid; + char relkind; + + /* + * Pre-existing partitions which were ATTACHED were already + * counted in the progress report. + */ + if (get_index_isvalid(indrelid)) + continue; + + /* + * Partitioned indexes are counted in the progress report, but + * don't need to be further processed. + */ + relkind = get_rel_relkind(indrelid); + if (!RELKIND_HAS_STORAGE(relkind)) + { + /* The toplevel index doesn't count towards "partitions done" */ + if (indrelid != indexRelationId) + pgstat_progress_incr_param(PROGRESS_CREATEIDX_PARTITIONS_DONE, 1); + + /* + * Build up a list of all the intermediate partitioned tables + * which will later need to be set valid. + */ + old_context = MemoryContextSwitchTo(cic_context); + tosetvalid = lappend_oid(tosetvalid, indrelid); + MemoryContextSwitchTo(old_context); + continue; + } + + rel = table_open(relationId, ShareUpdateExclusiveLock); + heaprelid = rel->rd_lockInfo.lockRelId; + table_close(rel, ShareUpdateExclusiveLock); + SET_LOCKTAG_RELATION(heaplocktag, heaprelid.dbId, heaprelid.relId); + + /* Process each partition in a separate transaction */ + tabrelid = IndexGetRelation(indrelid, false); + DefineIndexConcurrentInternal(tabrelid, indrelid, indexInfo, + heaplocktag, heaprelid); + + PushActiveSnapshot(GetTransactionSnapshot()); + pgstat_progress_incr_param(PROGRESS_CREATEIDX_PARTITIONS_DONE, 1); + } + + /* Set as valid all partitioned indexes, including the parent */ + foreach(lc, tosetvalid) + { + Oid indrelid = lfirst_oid(lc); + + index_set_state_flags(indrelid, INDEX_CREATE_SET_READY); + CommandCounterIncrement(); + index_set_state_flags(indrelid, INDEX_CREATE_SET_VALID); + } + + MemoryContextDelete(cic_context); + pgstat_progress_end_command(); + PopActiveSnapshot(); + return address; + } +} + + +static void +DefineIndexConcurrentInternal(Oid relationId, + Oid indexRelationId, IndexInfo *indexInfo, + LOCKTAG heaplocktag, LockRelId heaprelid) +{ + TransactionId limitXmin; + Snapshot snapshot; + + /* Is index safe for others to ignore? See set_indexsafe_procflags() */ + bool safe_index = indexInfo->ii_Expressions == NIL && + indexInfo->ii_Predicate == NIL; + /* * For a concurrent build, it's important to make the catalog entries * visible to other transactions before we start to build the index. That @@ -1781,10 +1876,6 @@ DefineIndex(Oid relationId, * Last thing to do is release the session-level lock on the parent table. */ UnlockRelationIdForSession(&heaprelid, ShareUpdateExclusiveLock); - - pgstat_progress_end_command(); - - return address; } diff --git a/src/test/regress/expected/indexing.out b/src/test/regress/expected/indexing.out index 2be8ffa7ec4..aefa203b14f 100644 --- a/src/test/regress/expected/indexing.out +++ b/src/test/regress/expected/indexing.out @@ -50,11 +50,130 @@ select relname, relkind, relhassubclass, inhparent::regclass (8 rows) drop table idxpart; --- Some unsupported features +-- CIC on partitioned table create table idxpart (a int, b int, c text) partition by range (a); -create table idxpart1 partition of idxpart for values from (0) to (10); -create index concurrently on idxpart (a); -ERROR: cannot create index on partitioned table "idxpart" concurrently +create table idxpart1 partition of idxpart for values from (0) to (10) partition by range(a); +create table idxpart11 partition of idxpart1 for values from (0) to (10) partition by range(a); +create table idxpart111 partition of idxpart11 default partition by range(a); +create table idxpart1111 partition of idxpart111 default partition by range(a); +create table idxpart2 partition of idxpart for values from (10) to (20); +create table idxpart3 partition of idxpart for values from (30) to (40) partition by range(a); +create table idxpart31 partition of idxpart3 default; +insert into idxpart2 values(10),(10); -- not unique +create index concurrently on idxpart11 (a); -- partitioned and partition, with no leaves +create index concurrently on idxpart1 (a); -- partitioned and partition +create index concurrently on idxpart2 (a); -- leaf +create index concurrently on idxpart (a); -- partitioned +create unique index concurrently on idxpart (a); -- partitioned, unique failure +ERROR: could not create unique index "idxpart2_a_idx1" +DETAIL: Key (a)=(10) is duplicated. +\d idxpart + Partitioned table "public.idxpart" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition key: RANGE (a) +Indexes: + "idxpart_a_idx" btree (a) + "idxpart_a_idx1" UNIQUE, btree (a) INVALID +Number of partitions: 3 (Use \d+ to list them.) + +\d idxpart1 + Partitioned table "public.idxpart1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition of: idxpart FOR VALUES FROM (0) TO (10) +Partition key: RANGE (a) +Indexes: + "idxpart1_a_idx" btree (a) + "idxpart1_a_idx1" UNIQUE, btree (a) INVALID +Number of partitions: 1 (Use \d+ to list them.) + +\d idxpart11 + Partitioned table "public.idxpart11" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition of: idxpart1 FOR VALUES FROM (0) TO (10) +Partition key: RANGE (a) +Indexes: + "idxpart11_a_idx" btree (a) + "idxpart11_a_idx1" UNIQUE, btree (a) INVALID +Number of partitions: 1 (Use \d+ to list them.) + +\d idxpart111 + Partitioned table "public.idxpart111" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition of: idxpart11 DEFAULT +Partition key: RANGE (a) +Indexes: + "idxpart111_a_idx" btree (a) + "idxpart111_a_idx1" UNIQUE, btree (a) INVALID +Number of partitions: 1 (Use \d+ to list them.) + +\d idxpart1111 + Partitioned table "public.idxpart1111" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition of: idxpart111 DEFAULT +Partition key: RANGE (a) +Indexes: + "idxpart1111_a_idx" btree (a) + "idxpart1111_a_idx1" UNIQUE, btree (a) INVALID +Number of partitions: 0 + +\d idxpart2 + Table "public.idxpart2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition of: idxpart FOR VALUES FROM (10) TO (20) +Indexes: + "idxpart2_a_idx" btree (a) + "idxpart2_a_idx1" UNIQUE, btree (a) INVALID + +\d idxpart3 + Partitioned table "public.idxpart3" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition of: idxpart FOR VALUES FROM (30) TO (40) +Partition key: RANGE (a) +Indexes: + "idxpart3_a_idx" btree (a) + "idxpart3_a_idx1" UNIQUE, btree (a) INVALID +Number of partitions: 1 (Use \d+ to list them.) + +\d idxpart31 + Table "public.idxpart31" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition of: idxpart3 DEFAULT +Indexes: + "idxpart31_a_idx" btree (a) + "idxpart31_a_idx1" UNIQUE, btree (a) INVALID + drop table idxpart; -- Verify bugfix with query on indexed partitioned table with no partitions -- https://postgr.es/m/20180124162006.pmapfiznhgngwtjf@alvherre.pgsql diff --git a/src/test/regress/sql/indexing.sql b/src/test/regress/sql/indexing.sql index b69c41832ca..5ddeaf1c613 100644 --- a/src/test/regress/sql/indexing.sql +++ b/src/test/regress/sql/indexing.sql @@ -29,10 +29,30 @@ select relname, relkind, relhassubclass, inhparent::regclass where relname like 'idxpart%' order by relname; drop table idxpart; --- Some unsupported features +-- CIC on partitioned table create table idxpart (a int, b int, c text) partition by range (a); -create table idxpart1 partition of idxpart for values from (0) to (10); -create index concurrently on idxpart (a); +create table idxpart1 partition of idxpart for values from (0) to (10) partition by range(a); +create table idxpart11 partition of idxpart1 for values from (0) to (10) partition by range(a); +create table idxpart111 partition of idxpart11 default partition by range(a); +create table idxpart1111 partition of idxpart111 default partition by range(a); +create table idxpart2 partition of idxpart for values from (10) to (20); +create table idxpart3 partition of idxpart for values from (30) to (40) partition by range(a); +create table idxpart31 partition of idxpart3 default; + +insert into idxpart2 values(10),(10); -- not unique +create index concurrently on idxpart11 (a); -- partitioned and partition, with no leaves +create index concurrently on idxpart1 (a); -- partitioned and partition +create index concurrently on idxpart2 (a); -- leaf +create index concurrently on idxpart (a); -- partitioned +create unique index concurrently on idxpart (a); -- partitioned, unique failure +\d idxpart +\d idxpart1 +\d idxpart11 +\d idxpart111 +\d idxpart1111 +\d idxpart2 +\d idxpart3 +\d idxpart31 drop table idxpart; -- Verify bugfix with query on indexed partitioned table with no partitions -- 2.34.1