Re: REINDEX INDEX results in a crash for an index of pg_class since 9.6

Andres Freund Thu, 25 Apr 2019 19:03:32 -0700

Hi,

On 2019-04-25 17:12:33 -0400, Tom Lane wrote:
> Andres Freund <and...@anarazel.de> writes:
> > On 2019-04-25 16:02:03 -0400, Tom Lane wrote:
> >> That could work.  The important API spec is then that the relcache entry
> >> reflects the *previous* state of the relation, and is not to be modified
> >> by the tableam call.
>
> > Right.
>
> > I was wondering if we should just pass in the pg_class tuple as an "out"
> > argument, instead of pointers to relfilenode/relfrozenxid/relminmxid.
>
> Yeah, possibly.  The whole business with xids is perhaps heapam specific,
> so decoupling this function's signature from them would be good.


I've left that out in the attached. Currently VACUUM FULL / CLUSTER also
needs to handle those, and the callback for transactional rewrite
(table_relation_copy_for_cluster()), also returns those as output
parameter.  I think I can see a way how we could clean up the relevant
cluster.c code, but until that's done, I don't see much point in a
different interface (I'll probably write a patch).

The attached patch fixes the problem for me, and passes all existing
tests. It contains a few changes that are not strictly necessary, but
imo clear improvements.

We probably could split the tableam changes and related refactoring from
the fix to make backpatching simpler. I've not done that yet, but I
think we should before committing.

Questions:
- Should we move the CommandCounterIncrement() from
  RelationSetNewRelfilenode() to the callers? That'd allow them to do
  other things to the new relation (e.g. fill it), before making the
  changes visible. Don't think it'd currently help, but it seems like it
  could make code more robust in the future.

- Should we introduce an assertion into CatalogIndexInsert()'s
  HeapTupleIsHeapOnly() path, that asserts that all the relevant indexes
  aren't ReindexIsProcessingIndex()? Otherwise it seems way too easy to
  re-introduce bugs like this one.  Dirty hack for that included.

- Wonder if we shouldn't introduce something akin to
  SetReindexProcessing() for table rewrites (e.g. VACUUM FULL), to
  prevent the related error of inserting/updating a catalog table that's
  currently being rewritten.

Taking this as a WIP, what do you think?

Greetings,

Andres Freund

diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 6584a9cb8da..4d179881f27 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -566,10 +566,14 @@ heapam_finish_bulk_insert(Relation relation, int options)
  */
 
 static void
-heapam_relation_set_new_filenode(Relation rel, char persistence,
+heapam_relation_set_new_filenode(Relation rel,
+								 const RelFileNode *newrnode,
+								 char persistence,
 								 TransactionId *freezeXid,
 								 MultiXactId *minmulti)
 {
+	SMgrRelation srel;
+
 	/*
 	 * Initialize to the minimum XID that could put tuples in the table. We
 	 * know that no xacts older than RecentXmin are still running, so that
@@ -587,7 +591,7 @@ heapam_relation_set_new_filenode(Relation rel, char persistence,
 	 */
 	*minmulti = GetOldestMultiXactId();
 
-	RelationCreateStorage(rel->rd_node, persistence);
+	srel = RelationCreateStorage(*newrnode, persistence);
 
 	/*
 	 * If required, set up an init fork for an unlogged table so that it can
@@ -598,16 +602,17 @@ heapam_relation_set_new_filenode(Relation rel, char persistence,
 	 * while replaying, for example, XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE
 	 * record. Therefore, logging is necessary even if wal_level=minimal.
 	 */
-	if (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED)
+	if (persistence == RELPERSISTENCE_UNLOGGED)
 	{
 		Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
 			   rel->rd_rel->relkind == RELKIND_MATVIEW ||
 			   rel->rd_rel->relkind == RELKIND_TOASTVALUE);
-		RelationOpenSmgr(rel);
-		smgrcreate(rel->rd_smgr, INIT_FORKNUM, false);
-		log_smgrcreate(&rel->rd_smgr->smgr_rnode.node, INIT_FORKNUM);
-		smgrimmedsync(rel->rd_smgr, INIT_FORKNUM);
+		smgrcreate(srel, INIT_FORKNUM, false);
+		log_smgrcreate(newrnode, INIT_FORKNUM);
+		smgrimmedsync(srel, INIT_FORKNUM);
 	}
+
+	smgrclose(srel);
 }
 
 static void
@@ -617,13 +622,21 @@ heapam_relation_nontransactional_truncate(Relation rel)
 }
 
 static void
-heapam_relation_copy_data(Relation rel, RelFileNode newrnode)
+heapam_relation_copy_data(Relation rel, const RelFileNode *newrnode)
 {
 	SMgrRelation dstrel;
 
-	dstrel = smgropen(newrnode, rel->rd_backend);
+	dstrel = smgropen(*newrnode, rel->rd_backend);
 	RelationOpenSmgr(rel);
 
+	/*
+	 * Since we copy the file directly without looking at the shared buffers,
+	 * we'd better first flush out any pages of the source relation that are
+	 * in shared buffers.  We assume no new changes will be made while we are
+	 * holding exclusive lock on the rel.
+	 */
+	FlushRelationBuffers(rel);
+
 	/*
 	 * Create and copy all forks of the relation, and schedule unlinking of
 	 * old physical files.
@@ -631,7 +644,7 @@ heapam_relation_copy_data(Relation rel, RelFileNode newrnode)
 	 * NOTE: any conflict in relfilenode value will be caught in
 	 * RelationCreateStorage().
 	 */
-	RelationCreateStorage(newrnode, rel->rd_rel->relpersistence);
+	RelationCreateStorage(*newrnode, rel->rd_rel->relpersistence);
 
 	/* copy main fork */
 	RelationCopyStorage(rel->rd_smgr, dstrel, MAIN_FORKNUM,
@@ -652,7 +665,7 @@ heapam_relation_copy_data(Relation rel, RelFileNode newrnode)
 			if (rel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT ||
 				(rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED &&
 				 forkNum == INIT_FORKNUM))
-				log_smgrcreate(&newrnode, forkNum);
+				log_smgrcreate(newrnode, forkNum);
 			RelationCopyStorage(rel->rd_smgr, dstrel, forkNum,
 								rel->rd_rel->relpersistence);
 		}
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index 6b77eff0af1..ee6b72e550a 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -435,8 +435,9 @@ heap_create(const char *relname,
 			case RELKIND_RELATION:
 			case RELKIND_TOASTVALUE:
 			case RELKIND_MATVIEW:
-				table_relation_set_new_filenode(rel, relpersistence,
-									   relfrozenxid, relminmxid);
+				table_relation_set_new_filenode(rel, &rel->rd_node,
+												relpersistence,
+												relfrozenxid, relminmxid);
 				break;
 		}
 	}
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index 331f90528cd..e81af36d969 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -3318,9 +3318,6 @@ reindex_index(Oid indexId, bool skip_constraint_checks, char persistence,
 
 	PG_TRY();
 	{
-		/* Suppress use of the target index while rebuilding it */
-		SetReindexProcessing(heapId, indexId);
-
 		/* Fetch info needed for index_build */
 		indexInfo = BuildIndexInfo(iRel);
 
@@ -3338,6 +3335,9 @@ reindex_index(Oid indexId, bool skip_constraint_checks, char persistence,
 		/* We'll build a new physical relation for the index */
 		RelationSetNewRelfilenode(iRel, persistence);
 
+		/* Suppress use of the target index while rebuilding it */
+		SetReindexProcessing(heapId, indexId);
+
 		/* Initialize the index and rebuild */
 		/* Note: we do not need to re-establish pkey setting */
 		index_build(heapRelation, iRel, indexInfo, true, true);
diff --git a/src/backend/catalog/indexing.c b/src/backend/catalog/indexing.c
index 0c994122d85..a28a6e64ef1 100644
--- a/src/backend/catalog/indexing.c
+++ b/src/backend/catalog/indexing.c
@@ -82,10 +82,6 @@ CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple)
 	Datum		values[INDEX_MAX_KEYS];
 	bool		isnull[INDEX_MAX_KEYS];
 
-	/* HOT update does not require index inserts */
-	if (HeapTupleIsHeapOnly(heapTuple))
-		return;
-
 	/*
 	 * Get information from the state structure.  Fall out if nothing to do.
 	 */
@@ -96,6 +92,26 @@ CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple)
 	indexInfoArray = indstate->ri_IndexRelationInfo;
 	heapRelation = indstate->ri_RelationDesc;
 
+
+	/* HOT update does not require index inserts */
+	if (HeapTupleIsHeapOnly(heapTuple))
+	{
+		// WIP
+#ifdef USE_ASSERT_CHECKING
+		for (i = 0; i < indstate->ri_NumIndices; i++)
+		{
+			IndexInfo  *indexInfo = indexInfoArray[i];
+
+			/* If the index is marked as read-only, ignore it */
+			if (!indexInfo->ii_ReadyForInserts)
+				continue;
+
+			Assert(!ReindexIsProcessingIndex(RelationGetRelid(relationDescs[i])));
+		}
+#endif /* USE_ASSERT_CHECKING */
+		return;
+	}
+
 	/* Need a slot to hold the tuple being examined */
 	slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRelation),
 									&TTSOpsHeapTuple);
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 72242b24761..fb41f223ada 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -75,7 +75,7 @@ static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
  * This function is transactional. The creation is WAL-logged, and if the
  * transaction aborts later on, the storage will be destroyed.
  */
-void
+SMgrRelation
 RelationCreateStorage(RelFileNode rnode, char relpersistence)
 {
 	PendingRelDelete *pending;
@@ -99,7 +99,7 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence)
 			break;
 		default:
 			elog(ERROR, "invalid relpersistence: %c", relpersistence);
-			return;				/* placate compiler */
+			return NULL;				/* placate compiler */
 	}
 
 	srel = smgropen(rnode, backend);
@@ -117,13 +117,15 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence)
 	pending->nestLevel = GetCurrentTransactionNestLevel();
 	pending->next = pendingDeletes;
 	pendingDeletes = pending;
+
+	return srel;
 }
 
 /*
  * Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL.
  */
 void
-log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum)
+log_smgrcreate(const RelFileNode *rnode, ForkNumber forkNum)
 {
 	xl_smgr_create xlrec;
 
@@ -294,6 +296,10 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
 
 /*
  * Copy a fork's data, block by block.
+ *
+ * Note that this requires that there is no dirty data in shared buffers. If
+ * it's possible that there are, callers need to flush those using
+ * e.g. FlushRelationBuffers(rel).
  */
 void
 RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index aa7328ea400..bfcf6255cc1 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -12236,14 +12236,6 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode)
 		elog(ERROR, "cache lookup failed for relation %u", tableOid);
 	rd_rel = (Form_pg_class) GETSTRUCT(tuple);
 
-	/*
-	 * Since we copy the file directly without looking at the shared buffers,
-	 * we'd better first flush out any pages of the source relation that are
-	 * in shared buffers.  We assume no new changes will be made while we are
-	 * holding exclusive lock on the rel.
-	 */
-	FlushRelationBuffers(rel);
-
 	/*
 	 * Relfilenodes are not unique in databases across tablespaces, so we need
 	 * to allocate a new one in the new tablespace.
@@ -12266,10 +12258,16 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode)
 		Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
 			   rel->rd_rel->relkind == RELKIND_MATVIEW ||
 			   rel->rd_rel->relkind == RELKIND_TOASTVALUE);
-		table_relation_copy_data(rel, newrnode);
+		table_relation_copy_data(rel, &newrnode);
 	}
 
-	/* update the pg_class row */
+	/*
+	 * Update the pg_class row.
+	 *
+	 * NB: This wouldn't work if ATExecSetTableSpace() were allowed to be
+	 * executed on pg_class or its indexes (the above copy wouldn't contain
+	 * the updated pg_class entry), but that's forbidden above.
+	 */
 	rd_rel->reltablespace = (newTableSpace == MyDatabaseTableSpace) ? InvalidOid : newTableSpace;
 	rd_rel->relfilenode = newrelfilenode;
 	CatalogTupleUpdate(pg_class, &tuple->t_self, tuple);
@@ -12537,6 +12535,14 @@ index_copy_data(Relation rel, RelFileNode newrnode)
 	dstrel = smgropen(newrnode, rel->rd_backend);
 	RelationOpenSmgr(rel);
 
+	/*
+	 * Since we copy the file directly without looking at the shared buffers,
+	 * we'd better first flush out any pages of the source relation that are
+	 * in shared buffers.  We assume no new changes will be made while we are
+	 * holding exclusive lock on the rel.
+	 */
+	FlushRelationBuffers(rel);
+
 	/*
 	 * Create and copy all forks of the relation, and schedule unlinking of
 	 * old physical files.
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index bab59f16e68..c1b63623af3 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -3440,10 +3440,7 @@ RelationSetNewRelfilenode(Relation relation, char persistence)
 	Form_pg_class classform;
 	MultiXactId minmulti = InvalidMultiXactId;
 	TransactionId freezeXid = InvalidTransactionId;
-
-	/* Allocate a new relfilenode */
-	newrelfilenode = GetNewRelFileNode(relation->rd_rel->reltablespace, NULL,
-									   persistence);
+	RelFileNode	newrnode;
 
 	/*
 	 * Get a writable copy of the pg_class tuple for the given relation.
@@ -3457,28 +3454,18 @@ RelationSetNewRelfilenode(Relation relation, char persistence)
 			 RelationGetRelid(relation));
 	classform = (Form_pg_class) GETSTRUCT(tuple);
 
-	/*
-	 * Schedule unlinking of the old storage at transaction commit.
-	 */
+	/* Schedule unlinking of the old storage at transaction commit. */
 	RelationDropStorage(relation);
 
-	/*
-	 * Now update the pg_class row.  However, if we're dealing with a mapped
-	 * index, pg_class.relfilenode doesn't change; instead we have to send the
-	 * update to the relation mapper.
-	 */
-	if (RelationIsMapped(relation))
-		RelationMapUpdateMap(RelationGetRelid(relation),
-							 newrelfilenode,
-							 relation->rd_rel->relisshared,
-							 true);
-	else
-	{
-		relation->rd_rel->relfilenode = newrelfilenode;
-		classform->relfilenode = newrelfilenode;
-	}
+	/* Allocate a new relfilenode */
+	newrelfilenode = GetNewRelFileNode(relation->rd_rel->reltablespace, NULL,
+									   persistence);
 
-	RelationInitPhysicalAddr(relation);
+	/* initialize new relfilenode from old relfilenode */
+	newrnode = relation->rd_node;
+
+	/* and overwrite new relfilenode */
+	newrnode.relNode = newrelfilenode;
 
 	/*
 	 * Create storage for the main fork of the new relfilenode. If it's
@@ -3505,14 +3492,25 @@ RelationSetNewRelfilenode(Relation relation, char persistence)
 
 		case RELKIND_INDEX:
 		case RELKIND_SEQUENCE:
-			RelationCreateStorage(relation->rd_node, persistence);
-			RelationOpenSmgr(relation);
-			break;
+			{
+				SMgrRelation srel;
+
+				/*
+				 * Create storage for the main fork of the new relfilenode.
+				 *
+				 * NOTE: any conflict in relfilenode value will be caught here, if
+				 * GetNewRelFileNode messes up for any reason.
+				 */
+				srel = RelationCreateStorage(newrnode, persistence);
+				smgrclose(srel);
+				break;
+			}
 
 		case RELKIND_RELATION:
 		case RELKIND_TOASTVALUE:
 		case RELKIND_MATVIEW:
-			table_relation_set_new_filenode(relation, persistence,
+			table_relation_set_new_filenode(relation, &newrnode,
+											persistence,
 											&freezeXid, &minmulti);
 			break;
 	}
@@ -3528,8 +3526,23 @@ RelationSetNewRelfilenode(Relation relation, char persistence)
 	classform->relminmxid = minmulti;
 	classform->relpersistence = persistence;
 
+	if (!RelationIsMapped(relation))
+		classform->relfilenode = newrelfilenode;
+
+	/* Now update the pg_class row. */
 	CatalogTupleUpdate(pg_class, &tuple->t_self, tuple);
 
+	/*
+	 * However, if we're dealing with a mapped index, pg_class.relfilenode
+	 * doesn't change; instead we have to send the update to the relation
+	 * mapper.
+	 */
+	if (RelationIsMapped(relation))
+		RelationMapUpdateMap(RelationGetRelid(relation),
+							 newrelfilenode,
+							 relation->rd_rel->relisshared,
+							 true);
+
 	heap_freetuple(tuple);
 
 	table_close(pg_class, RowExclusiveLock);
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index c018a44267a..ebfa0d51855 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -416,7 +416,12 @@ typedef struct TableAmRoutine
 	 * This callback needs to create a new relation filenode for `rel`, with
 	 * appropriate durability behaviour for `persistence`.
 	 *
-	 * On output *freezeXid, *minmulti must be set to the values appropriate
+	 * Note that only the subset of the relcache filled by
+	 * RelationBuildLocalRelation() can be relied upon and that the relation's
+	 * catalog entries will either not yet exist (new relation), or
+	 * will still reference the old relfilenode.
+	 *
+	 * As output *freezeXid, *minmulti must be set to the values appropriate
 	 * for pg_class.{relfrozenxid, relminmxid}. For AMs that don't need those
 	 * fields to be filled they can be set to InvalidTransactionId and
 	 * InvalidMultiXactId, respectively.
@@ -424,6 +429,7 @@ typedef struct TableAmRoutine
 	 * See also table_relation_set_new_filenode().
 	 */
 	void		(*relation_set_new_filenode) (Relation rel,
+											  const RelFileNode *newrnode,
 											  char persistence,
 											  TransactionId *freezeXid,
 											  MultiXactId *minmulti);
@@ -444,7 +450,8 @@ typedef struct TableAmRoutine
 	 * This can typically be implemented by directly copying the underlying
 	 * storage, unless it contains references to the tablespace internally.
 	 */
-	void		(*relation_copy_data) (Relation rel, RelFileNode newrnode);
+	void		(*relation_copy_data) (Relation rel,
+									   const RelFileNode *newrnode);
 
 	/* See table_relation_copy_for_cluster() */
 	void		(*relation_copy_for_cluster) (Relation NewHeap,
@@ -1251,21 +1258,25 @@ table_finish_bulk_insert(Relation rel, int options)
  */
 
 /*
- * Create a new relation filenode for `rel`, with persistence set to
+ * Create storage for `rel` in `newrnode`, with persistence set to
  * `persistence`.
  *
  * This is used both during relation creation and various DDL operations to
- * create a new relfilenode that can be filled from scratch.
+ * create a new relfilenode that can be filled from scratch.  When creating
+ * new storage for an existing relfilenode, this should be called before the
+ * relcache entry has been updated.
  *
  * *freezeXid, *minmulti are set to the xid / multixact horizon for the table
  * that pg_class.{relfrozenxid, relminmxid} have to be set to.
  */
 static inline void
-table_relation_set_new_filenode(Relation rel, char persistence,
+table_relation_set_new_filenode(Relation rel,
+								const RelFileNode *newrnode,
+								char persistence,
 								TransactionId *freezeXid,
 								MultiXactId *minmulti)
 {
-	rel->rd_tableam->relation_set_new_filenode(rel, persistence,
+	rel->rd_tableam->relation_set_new_filenode(rel, newrnode, persistence,
 											   freezeXid, minmulti);
 }
 
@@ -1288,7 +1299,7 @@ table_relation_nontransactional_truncate(Relation rel)
  * changing a relation's tablespace.
  */
 static inline void
-table_relation_copy_data(Relation rel, RelFileNode newrnode)
+table_relation_copy_data(Relation rel, const RelFileNode *newrnode)
 {
 	rel->rd_tableam->relation_copy_data(rel, newrnode);
 }
diff --git a/src/include/catalog/storage.h b/src/include/catalog/storage.h
index 882dc65c893..3579d3f3eb0 100644
--- a/src/include/catalog/storage.h
+++ b/src/include/catalog/storage.h
@@ -19,7 +19,7 @@
 #include "storage/smgr.h"
 #include "utils/relcache.h"
 
-extern void RelationCreateStorage(RelFileNode rnode, char relpersistence);
+extern SMgrRelation RelationCreateStorage(RelFileNode rnode, char relpersistence);
 extern void RelationDropStorage(Relation rel);
 extern void RelationPreserveStorage(RelFileNode rnode, bool atCommit);
 extern void RelationTruncate(Relation rel, BlockNumber nblocks);
diff --git a/src/include/catalog/storage_xlog.h b/src/include/catalog/storage_xlog.h
index dfca3611ea3..40419efd373 100644
--- a/src/include/catalog/storage_xlog.h
+++ b/src/include/catalog/storage_xlog.h
@@ -50,7 +50,7 @@ typedef struct xl_smgr_truncate
 	int			flags;
 } xl_smgr_truncate;
 
-extern void log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum);
+extern void log_smgrcreate(const RelFileNode *rnode, ForkNumber forkNum);
 
 extern void smgr_redo(XLogReaderState *record);
 extern void smgr_desc(StringInfo buf, XLogReaderState *record);

Re: REINDEX INDEX results in a crash for an index of pg_class since 9.6

Reply via email to