Re: [HACKERS] crash with assertions and WAL_DEBUG

Heikki Linnakangas Thu, 26 Jun 2014 06:23:52 -0700

On 06/24/2014 05:47 PM, Alvaro Herrera wrote:

diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 3c1c81a..4264373 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -219,6 +219,16 @@ mdinit(void)
                                                                          
&hash_ctl,
                                                                   HASH_ELEM | 
HASH_FUNCTION | HASH_CONTEXT);
                pendingUnlinks = NIL;
+
+               /*
+                * XXX: The checkpointer needs to add entries to the pending ops
+                * table when absorbing fsync requests. That is done within a 
critical
+                * section. It means that there's a theoretical possibility 
that you
+                * run out of memory while absorbing fsync requests, which 
leads to
+                * a PANIC. Fortunately the hash table is small so that's 
unlikely to
+                * happen in practice.
+                */
+               MemoryContextAllowInCriticalSection(MdCxt, true);
        }
  }


Isn't that allowing a bit too much? We e.g. shouldn't allow
_fdvec_alloc() within a crritical section. Might make sense to create a
child context for it.


I agree.


Ok. That leaves nothing but _fdvec_alloc() in MdCxt.

Rahila Syed wrote:

The patch on compilation gives following error,

mcxt.c: In function ‘MemoryContextAllowInCriticalSection’:
mcxt.c:322: error: ‘struct MemoryContextData’ has no member named
‘allowInCriticalSection’

The member in MemoryContextData is defined as 'allowInCritSection' while
the MemoryContextAllowInCriticalSection accesses the field as
'context->allowInCriticalSection'.


It appears Heikki did a search'n replace for "->allowInCritSection"
before submitting, which failed to match the struct declaration.

Uh, looks like I failed to test this at all with assertions enabled.Once you fix that error, you get a lot of assertion failures :-(.


Attached is a new attempt:

* walDebugCxt is now created at backend startup (in XLOGShmemInit()).This fixes the issue that Andres pointed out, that the context creationwould PANIC if the first XLogInsert in a backend happens within acritical section. LWLOCK_STATS had the same issue, it would fail if thefirst LWLock acquisition in a process happens within a critical section.I fixed that by adding an explicit InitLWLockAccess() function and movedthe initialization there.

* hash_create also creates a new memory context within the givencontext. That new context needs to inherit the allowInCritSection flag,otherwise allocating entries in the hash will still fail. BothLWLOCK_STATS and the pending ops hash table in the checkpointer had thisissue.

* Checkpointer does one palloc to allocate private space to copy therequests to, in addition to using the hash table. I moved that palloc tobefore entering the critical section.


- Heikki

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index abc5682..bef0f66 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -60,6 +60,7 @@
 #include "storage/spin.h"
 #include "utils/builtins.h"
 #include "utils/guc.h"
+#include "utils/memutils.h"
 #include "utils/ps_status.h"
 #include "utils/relmapper.h"
 #include "utils/snapmgr.h"
@@ -736,6 +737,10 @@ static bool bgwriterLaunched = false;
 static int	MyLockNo = 0;
 static bool holdingAllLocks = false;
 
+#ifdef WAL_DEBUG
+static MemoryContext walDebugCxt = NULL;
+#endif
+
 static void readRecoveryCommandFile(void);
 static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo);
 static bool recoveryStopsBefore(XLogRecord *record);
@@ -1258,6 +1263,7 @@ begin:;
 	if (XLOG_DEBUG)
 	{
 		StringInfoData buf;
+		MemoryContext oldCxt = MemoryContextSwitchTo(walDebugCxt);
 
 		initStringInfo(&buf);
 		appendStringInfo(&buf, "INSERT @ %X/%X: ",
@@ -1282,10 +1288,11 @@ begin:;
 
 			appendStringInfoString(&buf, " - ");
 			RmgrTable[rechdr->xl_rmid].rm_desc(&buf, (XLogRecord *) recordbuf.data);
-			pfree(recordbuf.data);
 		}
 		elog(LOG, "%s", buf.data);
-		pfree(buf.data);
+
+		MemoryContextSwitchTo(oldCxt);
+		MemoryContextReset(walDebugCxt);
 	}
 #endif
 
@@ -4807,6 +4814,24 @@ XLOGShmemInit(void)
 	char	   *allocptr;
 	int			i;
 
+#ifdef WAL_DEBUG
+	/*
+	 * Create a memory context for WAL debugging that's exempt from the
+	 * normal "no pallocs in critical section" rule. Yes, that can lead to a
+	 * PANIC if an allocation fails, but no-one is going to enable wal_debug
+	 * in production.
+	 */
+	if (walDebugCxt == NULL)
+	{
+		walDebugCxt = AllocSetContextCreate(TopMemoryContext,
+											"WAL Debug",
+											ALLOCSET_DEFAULT_MINSIZE,
+											ALLOCSET_DEFAULT_INITSIZE,
+											ALLOCSET_DEFAULT_MAXSIZE);
+		MemoryContextAllowInCriticalSection(walDebugCxt, true);
+	}
+#endif
+
 	ControlFile = (ControlFileData *)
 		ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
 	XLogCtl = (XLogCtlData *)
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index 2ac3061..6c814ba 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -1305,19 +1305,6 @@ AbsorbFsyncRequests(void)
 	if (!AmCheckpointerProcess())
 		return;
 
-	/*
-	 * We have to PANIC if we fail to absorb all the pending requests (eg,
-	 * because our hashtable runs out of memory).  This is because the system
-	 * cannot run safely if we are unable to fsync what we have been told to
-	 * fsync.  Fortunately, the hashtable is so small that the problem is
-	 * quite unlikely to arise in practice.
-	 */
-	START_CRIT_SECTION();
-
-	/*
-	 * We try to avoid holding the lock for a long time by copying the request
-	 * array.
-	 */
 	LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);
 
 	/* Transfer stats counts into pending pgstats message */
@@ -1327,12 +1314,25 @@ AbsorbFsyncRequests(void)
 	CheckpointerShmem->num_backend_writes = 0;
 	CheckpointerShmem->num_backend_fsync = 0;
 
+	/*
+	 * We try to avoid holding the lock for a long time by copying the request
+	 * array, and processing the requests after releasing the lock.
+	 *
+	 * Once we have cleared the requests from shared memory, we have to PANIC
+	 * if we then fail to absorb them (eg, because our hashtable runs out of
+	 * memory).  This is because the system cannot run safely if we are unable
+	 * to fsync what we have been told to fsync.  Fortunately, the hashtable
+	 * is so small that the problem is quite unlikely to arise in practice.
+	 */
 	n = CheckpointerShmem->num_requests;
 	if (n > 0)
 	{
 		requests = (CheckpointerRequest *) palloc(n * sizeof(CheckpointerRequest));
 		memcpy(requests, CheckpointerShmem->requests, n * sizeof(CheckpointerRequest));
 	}
+
+	START_CRIT_SECTION();
+
 	CheckpointerShmem->num_requests = 0;
 
 	LWLockRelease(CheckpointerCommLock);
@@ -1340,10 +1340,10 @@ AbsorbFsyncRequests(void)
 	for (request = requests; n > 0; request++, n--)
 		RememberFsyncRequest(request->rnode, request->forknum, request->segno);
 
+	END_CRIT_SECTION();
+
 	if (requests)
 		pfree(requests);
-
-	END_CRIT_SECTION();
 }
 
 /*
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index d23ac62..a62af27 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -104,8 +104,8 @@ typedef struct lwlock_stats
 	int			spin_delay_count;
 }	lwlock_stats;
 
-static int	counts_for_pid = 0;
 static HTAB *lwlock_stats_htab;
+static lwlock_stats lwlock_stats_dummy;
 #endif
 
 #ifdef LOCK_DEBUG
@@ -142,21 +142,39 @@ static void
 init_lwlock_stats(void)
 {
 	HASHCTL		ctl;
+	static MemoryContext lwlock_stats_cxt = NULL;
+	static bool exit_registered = false;
 
-	if (lwlock_stats_htab != NULL)
-	{
-		hash_destroy(lwlock_stats_htab);
-		lwlock_stats_htab = NULL;
-	}
+	if (lwlock_stats_cxt != NULL)
+		MemoryContextDelete(lwlock_stats_cxt);
+
+	/*
+	 * The LWLock stats will be updated within a critical section, which
+	 * requires allocating new hash entries. Allocations within a critical
+	 * section are normally not allowed because running out of memory would
+	 * lead to a PANIC, but LWLOCK_STATS is debugging code that's not normally
+	 * turned on in production, so that's an acceptable risk. The hash entries
+	 * are small, so the risk of running out of memory is minimal in practice.
+	 */
+	lwlock_stats_cxt = AllocSetContextCreate(TopMemoryContext,
+											 "LWLock stats",
+											 ALLOCSET_DEFAULT_MINSIZE,
+											 ALLOCSET_DEFAULT_INITSIZE,
+											 ALLOCSET_DEFAULT_MAXSIZE);
+	MemoryContextAllowInCriticalSection(lwlock_stats_cxt, true);
 
 	MemSet(&ctl, 0, sizeof(ctl));
 	ctl.keysize = sizeof(lwlock_stats_key);
 	ctl.entrysize = sizeof(lwlock_stats);
 	ctl.hash = tag_hash;
+	ctl.hcxt = lwlock_stats_cxt;
 	lwlock_stats_htab = hash_create("lwlock stats", 16384, &ctl,
-									HASH_ELEM | HASH_FUNCTION);
-	counts_for_pid = MyProcPid;
-	on_shmem_exit(print_lwlock_stats, 0);
+									HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+	if (!exit_registered)
+	{
+		on_shmem_exit(print_lwlock_stats, 0);
+		exit_registered = true;
+	}
 }
 
 static void
@@ -190,9 +208,13 @@ get_lwlock_stats_entry(LWLock *lock)
 	lwlock_stats *lwstats;
 	bool		found;
 
-	/* Set up local count state first time through in a given process */
-	if (counts_for_pid != MyProcPid)
-		init_lwlock_stats();
+	/*
+	 * During shared memory initialization, the hash table doesn't exist yet.
+	 * Stats of that phase aren't very interesting, so just collect operations
+	 * on all locks in a single dummy entry.
+	 */
+	if (lwlock_stats_htab == NULL)
+		return &lwlock_stats_dummy;
 
 	/* Fetch or create the entry. */
 	key.tranche = lock->tranche;
@@ -361,6 +383,16 @@ CreateLWLocks(void)
 	LWLockRegisterTranche(0, &MainLWLockTranche);
 }
 
+/*
+ * InitLWLockAccess - initialize backend-local state needed to hold LWLocks
+ */
+void
+InitLWLockAccess(void)
+{
+#ifdef LWLOCK_STATS
+	init_lwlock_stats();
+#endif
+}
 
 /*
  * LWLockAssign - assign a dynamically-allocated LWLock number
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index dfaf10e..ea88a24 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -411,8 +411,9 @@ InitProcess(void)
 
 	/*
 	 * Now that we have a PGPROC, we could try to acquire locks, so initialize
-	 * the deadlock checker.
+	 * local state needed for LWLocks, and the deadlock checker.
 	 */
+	InitLWLockAccess();
 	InitDeadLockChecking();
 }
 
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 3c1c81a..167d61c 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -115,7 +115,7 @@ typedef struct _MdfdVec
 	struct _MdfdVec *mdfd_chain;	/* next segment, or NULL */
 } MdfdVec;
 
-static MemoryContext MdCxt;		/* context for all md.c allocations */
+static MemoryContext MdCxt;		/* context for all MdfdVec objects */
 
 
 /*
@@ -157,6 +157,7 @@ typedef struct
 
 static HTAB *pendingOpsTable = NULL;
 static List *pendingUnlinks = NIL;
+static MemoryContext pendingOpsCxt;		/* context for the above  */
 
 static CycleCtr mdsync_cycle_ctr = 0;
 static CycleCtr mdckpt_cycle_ctr = 0;
@@ -209,11 +210,27 @@ mdinit(void)
 	{
 		HASHCTL		hash_ctl;
 
+		/*
+		 * XXX: The checkpointer needs to add entries to the pending ops table
+		 * when absorbing fsync requests.  That is done within a critical
+		 * section, which isn't usually allowed, but we make an exception.
+		 * It means that there's a theoretical possibility that you run out of
+		 * memory while absorbing fsync requests, which leads to a PANIC.
+		 * Fortunately the hash table is small so that's unlikely to happen in
+		 * practice.
+		 */
+		pendingOpsCxt = AllocSetContextCreate(MdCxt,
+											  "Pending Ops Context",
+											  ALLOCSET_DEFAULT_MINSIZE,
+											  ALLOCSET_DEFAULT_INITSIZE,
+											  ALLOCSET_DEFAULT_MAXSIZE);
+		MemoryContextAllowInCriticalSection(pendingOpsCxt, true);
+
 		MemSet(&hash_ctl, 0, sizeof(hash_ctl));
 		hash_ctl.keysize = sizeof(RelFileNode);
 		hash_ctl.entrysize = sizeof(PendingOperationEntry);
 		hash_ctl.hash = tag_hash;
-		hash_ctl.hcxt = MdCxt;
+		hash_ctl.hcxt = pendingOpsCxt;
 		pendingOpsTable = hash_create("Pending Ops Table",
 									  100L,
 									  &hash_ctl,
@@ -1516,7 +1533,7 @@ RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
 	else if (segno == UNLINK_RELATION_REQUEST)
 	{
 		/* Unlink request: put it in the linked list */
-		MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
+		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
 		PendingUnlinkEntry *entry;
 
 		/* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */
@@ -1533,7 +1550,7 @@ RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
 	else
 	{
 		/* Normal case: enter a request to fsync this segment */
-		MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
+		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
 		PendingOperationEntry *entry;
 		bool		found;
 
diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c
index e83e76d..4185a03 100644
--- a/src/backend/utils/mmgr/mcxt.c
+++ b/src/backend/utils/mmgr/mcxt.c
@@ -60,15 +60,9 @@ static void MemoryContextStatsInternal(MemoryContext context, int level);
  * You should not do memory allocations within a critical section, because
  * an out-of-memory error will be escalated to a PANIC. To enforce that
  * rule, the allocation functions Assert that.
- *
- * There are a two exceptions: 1) error recovery uses ErrorContext, which
- * has some memory set aside so that you don't run out. And 2) checkpointer
- * currently just hopes for the best, which is wrong and ought to be fixed,
- * but it's a known issue so let's not complain about in the meanwhile.
  */
 #define AssertNotInCriticalSection(context) \
-	Assert(CritSectionCount == 0 || (context) == ErrorContext || \
-		   AmCheckpointerProcess())
+	Assert(CritSectionCount == 0 || (context)->allowInCritSection)
 
 /*****************************************************************************
  *	  EXPORTED ROUTINES														 *
@@ -120,7 +114,10 @@ MemoryContextInit(void)
 	 * require it to contain at least 8K at all times. This is the only case
 	 * where retained memory in a context is *essential* --- we want to be
 	 * sure ErrorContext still has some memory even if we've run out
-	 * elsewhere!
+	 * elsewhere! Also, allow allocations in ErrorContext within a critical
+	 * section. Otherwise a PANIC will cause an assertion failure in the
+	 * error reporting code, before printing out the real cause of the
+	 * failure.
 	 *
 	 * This should be the last step in this function, as elog.c assumes memory
 	 * management works once ErrorContext is non-null.
@@ -130,6 +127,7 @@ MemoryContextInit(void)
 										 8 * 1024,
 										 8 * 1024,
 										 8 * 1024);
+	MemoryContextAllowInCriticalSection(ErrorContext, true);
 }
 
 /*
@@ -306,6 +304,26 @@ MemoryContextSetParent(MemoryContext context, MemoryContext new_parent)
 }
 
 /*
+ * MemoryContextAllowInCriticalSection
+ *		Allow/disallow allocations in this memory context within a critical
+ *		section.
+ *
+ * Normally, memory allocations are not allowed within a critical section,
+ * because a failure would lead to PANIC.  There are a few exceptions to
+ * that, like allocations related to debugging code that is not supposed to
+ * be enabled in production.  This function can be used to exempt specific
+ * memory contexts from the assertion in palloc().
+ */
+void
+MemoryContextAllowInCriticalSection(MemoryContext context, bool allow)
+{
+	AssertArg(MemoryContextIsValid(context));
+#ifdef USE_ASSERT_CHECKING
+	context->allowInCritSection = allow;
+#endif
+}
+
+/*
  * GetMemoryChunkSpace
  *		Given a currently-allocated chunk, determine the total space
  *		it occupies (including all memory-allocation overhead).
@@ -533,6 +551,7 @@ MemoryContextCreate(NodeTag tag, Size size,
 	MemoryContext node;
 	Size		needed = size + strlen(name) + 1;
 
+	/* creating new memory contexts is not allowed in a critical section */
 	Assert(CritSectionCount == 0);
 
 	/* Get space for node and name */
@@ -570,6 +589,11 @@ MemoryContextCreate(NodeTag tag, Size size,
 		node->parent = parent;
 		node->nextchild = parent->firstchild;
 		parent->firstchild = node;
+
+#ifdef USE_ASSERT_CHECKING
+		/* inherit allowInCritSection flag from parent */
+		node->allowInCritSection = parent->allowInCritSection;
+#endif
 	}
 
 	VALGRIND_CREATE_MEMPOOL(node, 0, false);
diff --git a/src/include/nodes/memnodes.h b/src/include/nodes/memnodes.h
index f79ebd4..ad77509 100644
--- a/src/include/nodes/memnodes.h
+++ b/src/include/nodes/memnodes.h
@@ -60,6 +60,9 @@ typedef struct MemoryContextData
 	MemoryContext nextchild;	/* next child of same parent */
 	char	   *name;			/* context name (just for debugging) */
 	bool		isReset;		/* T = no space alloced since last reset */
+#ifdef USE_ASSERT_CHECKING
+	bool		allowInCritSection;	/* allow palloc in critical section */
+#endif
 } MemoryContextData;
 
 /* utils/palloc.h contains typedef struct MemoryContextData *MemoryContext */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index d588b14..1d90b9f 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -182,6 +182,7 @@ extern void LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 value);
 
 extern Size LWLockShmemSize(void);
 extern void CreateLWLocks(void);
+extern void InitLWLockAccess(void);
 
 /*
  * The traditional method for obtaining an lwlock for use by an extension is
diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h
index 59d0aec..2fede86 100644
--- a/src/include/utils/memutils.h
+++ b/src/include/utils/memutils.h
@@ -101,6 +101,8 @@ extern MemoryContext GetMemoryChunkContext(void *pointer);
 extern MemoryContext MemoryContextGetParent(MemoryContext context);
 extern bool MemoryContextIsEmpty(MemoryContext context);
 extern void MemoryContextStats(MemoryContext context);
+extern void MemoryContextAllowInCriticalSection(MemoryContext context,
+									bool allow);
 
 #ifdef MEMORY_CONTEXT_CHECKING
 extern void MemoryContextCheck(MemoryContext context);

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] crash with assertions and WAL_DEBUG

Reply via email to