Re: [HACKERS] shm_mq fix for non-blocking mode

2015-11-03 Thread Robert Haas
On Thu, Oct 22, 2015 at 10:00 PM, Robert Haas  wrote:
>> ...and so I've committed it and back-patched to 9.4.
>
> Sigh.  This was buggy; I have no idea how it survived my earlier testing.
>
> I will go fix it.  Sorry.

Gah!  That, too, turned out to be buggy, although in a considerably
more subtle way.  I've pushed another fix with a detailed comment and
an explanatory commit message that hopefully squashes this problem for
good.  Combined with the fix at
http://www.postgresql.org/message-id/ca+tgmozzv3u9trsvcao+-otxbsz_u+a5q8x-_b+vzcehhtz...@mail.gmail.com
this seems to squash occasional complaints about workers "dying
unexpectedly" when they really had done no such thing.

The test code I used to find these problems is attached.  I compiled
and installed the parallel_dummy extension, did pgbench -i -s 100, and
then ran this:

while psql -c "select parallel_count('pgbench_accounts', 4)"; do sleep 1; done

Without these fixes, this can hang or error out, but with these fixes,
it works fine.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
commit 84fa9b17b8427721af23e38094a6cbfec8c8bb00
Author: Robert Haas 
Date:   Thu Oct 15 19:21:38 2015 -0400

parallel_dummy

diff --git a/contrib/parallel_dummy/Makefile b/contrib/parallel_dummy/Makefile
new file mode 100644
index 000..de00f50
--- /dev/null
+++ b/contrib/parallel_dummy/Makefile
@@ -0,0 +1,19 @@
+MODULE_big = parallel_dummy
+OBJS = parallel_dummy.o $(WIN32RES)
+PGFILEDESC = "parallel_dummy - dummy use of parallel infrastructure"
+
+EXTENSION = parallel_dummy
+DATA = parallel_dummy--1.0.sql
+
+REGRESS = parallel_dummy
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/parallel_dummy
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/parallel_dummy/parallel_dummy--1.0.sql b/contrib/parallel_dummy/parallel_dummy--1.0.sql
new file mode 100644
index 000..d49bd0f
--- /dev/null
+++ b/contrib/parallel_dummy/parallel_dummy--1.0.sql
@@ -0,0 +1,7 @@
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION parallel_dummy" to load this file. \quit
+
+CREATE FUNCTION parallel_count(rel pg_catalog.regclass,
+			  nworkers pg_catalog.int4)
+RETURNS pg_catalog.int8 STRICT
+	AS 'MODULE_PATHNAME' LANGUAGE C;
diff --git a/contrib/parallel_dummy/parallel_dummy.c b/contrib/parallel_dummy/parallel_dummy.c
new file mode 100644
index 000..f63b63e
--- /dev/null
+++ b/contrib/parallel_dummy/parallel_dummy.c
@@ -0,0 +1,152 @@
+/*--
+ *
+ * parallel_dummy.c
+ *		Test harness code for parallel mode code.
+ *
+ * Copyright (C) 2013-2014, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *		contrib/parallel_dummy/parallel_dummy.c
+ *
+ * -
+ */
+
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "access/parallel.h"
+#include "access/relscan.h"
+#include "access/xact.h"
+#include "fmgr.h"
+#include "miscadmin.h"
+#include "storage/spin.h"
+#include "utils/builtins.h"
+#include "utils/guc.h"
+#include "utils/snapmgr.h"
+
+PG_MODULE_MAGIC;
+
+PG_FUNCTION_INFO_V1(parallel_count);
+
+#define		TOC_SCAN_KEY			1
+#define		TOC_RESULT_KEY			2
+
+void		_PG_init(void);
+void		count_worker_main(dsm_segment *seg, shm_toc *toc);
+
+static void count_helper(HeapScanDesc pscan, int64 *resultp);
+
+Datum
+parallel_count(PG_FUNCTION_ARGS)
+{
+	Oid			relid = PG_GETARG_OID(0);
+	int32		nworkers = PG_GETARG_INT32(1);
+	int64	   *resultp;
+	int64		result;
+	ParallelContext *pcxt;
+	ParallelHeapScanDesc pscan;
+	HeapScanDesc	scan;
+	Relation	rel;
+	Size		sz;
+
+	if (nworkers < 0)
+		ereport(ERROR,
+(errmsg("number of parallel workers must be non-negative")));
+
+	rel = relation_open(relid, AccessShareLock);
+
+	EnterParallelMode();
+
+	pcxt = CreateParallelContextForExternalFunction("parallel_dummy",
+			 "count_worker_main",
+			 nworkers);
+	sz = heap_parallelscan_estimate(GetActiveSnapshot());
+	shm_toc_estimate_chunk(>estimator, sz);
+	shm_toc_estimate_chunk(>estimator, sizeof(int64));
+	shm_toc_estimate_keys(>estimator, 2);
+	InitializeParallelDSM(pcxt);
+	pscan = shm_toc_allocate(pcxt->toc, sz);
+	heap_parallelscan_initialize(pscan, rel, GetActiveSnapshot());
+	shm_toc_insert(pcxt->toc, TOC_SCAN_KEY, pscan);
+	resultp = shm_toc_allocate(pcxt->toc, sizeof(int64));
+	shm_toc_insert(pcxt->toc, TOC_RESULT_KEY, resultp);
+
+	LaunchParallelWorkers(pcxt);
+
+	scan = heap_beginscan_parallel(rel, pscan);
+
+	/* here's where we do the "real work" ... */
+	count_helper(scan, resultp);
+
+	WaitForParallelWorkersToFinish(pcxt);
+
+	heap_rescan(scan, NULL);
+	*resultp = 0;
+
+	ReinitializeParallelDSM(pcxt);
+	

Re: [HACKERS] shm_mq fix for non-blocking mode

2015-10-22 Thread Robert Haas
On Fri, Oct 16, 2015 at 5:08 PM, Robert Haas  wrote:
> The shm_mq code handles blocking mode and non-blocking mode
> asymmetrically in a couple of places, with the unfortunate result that
> if you are using non-blocking mode, and your counterparty dies before
> attaching the queue, operations on the queue continue to return
> SHM_MQ_WOULD_BLOCK instead of, as they should, returning
> SHM_MQ_DETACHED.  The attached patch fixes the problem.  Thanks to my
> colleague Rushabh Lathia for helping track this down.
>
> (There's are some further bugs in this area outside the shm_mq code
> ... but I'm still trying to figure out exactly what they are and what
> we should do about them.  This much, however, seems clear-cut.)

...and so I've committed it and back-patched to 9.4.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] shm_mq fix for non-blocking mode

2015-10-22 Thread Robert Haas
On Thu, Oct 22, 2015 at 4:45 PM, Robert Haas  wrote:
> On Fri, Oct 16, 2015 at 5:08 PM, Robert Haas  wrote:
>> The shm_mq code handles blocking mode and non-blocking mode
>> asymmetrically in a couple of places, with the unfortunate result that
>> if you are using non-blocking mode, and your counterparty dies before
>> attaching the queue, operations on the queue continue to return
>> SHM_MQ_WOULD_BLOCK instead of, as they should, returning
>> SHM_MQ_DETACHED.  The attached patch fixes the problem.  Thanks to my
>> colleague Rushabh Lathia for helping track this down.
>>
>> (There's are some further bugs in this area outside the shm_mq code
>> ... but I'm still trying to figure out exactly what they are and what
>> we should do about them.  This much, however, seems clear-cut.)
>
> ...and so I've committed it and back-patched to 9.4.

Sigh.  This was buggy; I have no idea how it survived my earlier testing.

I will go fix it.  Sorry.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


[HACKERS] shm_mq fix for non-blocking mode

2015-10-16 Thread Robert Haas
The shm_mq code handles blocking mode and non-blocking mode
asymmetrically in a couple of places, with the unfortunate result that
if you are using non-blocking mode, and your counterparty dies before
attaching the queue, operations on the queue continue to return
SHM_MQ_WOULD_BLOCK instead of, as they should, returning
SHM_MQ_DETACHED.  The attached patch fixes the problem.  Thanks to my
colleague Rushabh Lathia for helping track this down.

(There's are some further bugs in this area outside the shm_mq code
... but I'm still trying to figure out exactly what they are and what
we should do about them.  This much, however, seems clear-cut.)

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
diff --git a/src/backend/storage/ipc/shm_mq.c b/src/backend/storage/ipc/shm_mq.c
index 80956ce..61f9298 100644
--- a/src/backend/storage/ipc/shm_mq.c
+++ b/src/backend/storage/ipc/shm_mq.c
@@ -142,6 +142,8 @@ static shm_mq_result shm_mq_send_bytes(shm_mq_handle *mq, Size nbytes,
   const void *data, bool nowait, Size *bytes_written);
 static shm_mq_result shm_mq_receive_bytes(shm_mq *mq, Size bytes_needed,
 	 bool nowait, Size *nbytesp, void **datap);
+static bool shm_mq_counterparty_gone(volatile shm_mq *mq,
+		 BackgroundWorkerHandle *handle);
 static bool shm_mq_wait_internal(volatile shm_mq *mq, PGPROC *volatile * ptr,
 	 BackgroundWorkerHandle *handle);
 static uint64 shm_mq_get_bytes_read(volatile shm_mq *mq, bool *detached);
@@ -499,6 +501,8 @@ shm_mq_receive(shm_mq_handle *mqh, Size *nbytesp, void **datap, bool nowait)
 	{
 		if (nowait)
 		{
+			if (shm_mq_counterparty_gone(mq, mqh->mqh_handle))
+return SHM_MQ_DETACHED;
 			if (shm_mq_get_sender(mq) == NULL)
 return SHM_MQ_WOULD_BLOCK;
 		}
@@ -794,6 +798,11 @@ shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes, const void *data,
 			 */
 			if (nowait)
 			{
+if (shm_mq_counterparty_gone(mq, mqh->mqh_handle))
+{
+	*bytes_written = sent;
+	return SHM_MQ_DETACHED;
+}
 if (shm_mq_get_receiver(mq) == NULL)
 {
 	*bytes_written = sent;
@@ -948,6 +957,45 @@ shm_mq_receive_bytes(shm_mq *mq, Size bytes_needed, bool nowait,
 }
 
 /*
+ * Test whether a counterparty who may not even be alive yet is definitely gone.
+ */
+static bool
+shm_mq_counterparty_gone(volatile shm_mq *mq, BackgroundWorkerHandle *handle)
+{
+	bool	detached;
+	pid_t	pid;
+
+	/* Acquire the lock just long enough to check the pointer. */
+	SpinLockAcquire(>mq_mutex);
+	detached = mq->mq_detached;
+	SpinLockRelease(>mq_mutex);
+
+	/* If the queue has been detached, counterparty is definitely gone. */
+	if (detached)
+		return true;
+
+	/* If there's a handle, check worker status. */
+	if (handle != NULL)
+	{
+		BgwHandleStatus status;
+
+		/* Check for unexpected worker death. */
+		status = GetBackgroundWorkerPid(handle, );
+		if (status != BGWH_STARTED && status != BGWH_NOT_YET_STARTED)
+		{
+			/* Mark it detached, just to make it official. */
+			SpinLockAcquire(>mq_mutex);
+			mq->mq_detached = true;
+			SpinLockRelease(>mq_mutex);
+			return true;
+		}
+	}
+
+	/* Counterparty is not definitively gone. */
+	return false;
+}
+
+/*
  * This is used when a process is waiting for its counterpart to attach to the
  * queue.  We exit when the other process attaches as expected, or, if
  * handle != NULL, when the referenced background process or the postmaster

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers