Re: [HACKERS] Tracking wait event for latches

2016-08-03 Thread Michael Paquier
On Tue, Aug 2, 2016 at 10:31 PM, Michael Paquier
 wrote:
> Attached is an updated patch.

Updated version for 2 minor issues:
1) s/stram/stream/
2) Docs used incorrect number
-- 
Michael
diff --git a/contrib/postgres_fdw/connection.c b/contrib/postgres_fdw/connection.c
index 8ca1c1c..954a166 100644
--- a/contrib/postgres_fdw/connection.c
+++ b/contrib/postgres_fdw/connection.c
@@ -496,7 +496,7 @@ pgfdw_get_result(PGconn *conn, const char *query)
 			wc = WaitLatchOrSocket(MyLatch,
    WL_LATCH_SET | WL_SOCKET_READABLE,
    PQsocket(conn),
-   -1L);
+   -1L, EVENT_EXTENSION);
 			ResetLatch(MyLatch);
 
 			CHECK_FOR_INTERRUPTS();
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 0776428..a14abe4 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -679,6 +679,16 @@ postgres   27093  0.0  0.0  30096  2752 ?Ss   11:34   0:00 postgres: ser
   buffer in question.
  
 
+
+ 
+  EventSet: The server process is waiting on a socket
+  or a timer. This wait happens in a latch, an inter-process facility
+  using a boolean variable that lets a process sleep until it is set.
+  Latches support several types of operations, like postmaster death
+  handling, timeout and socket activity lookup, and are a useful
+  replacement for sleep where signal handling matters.
+ 
+

   
  
@@ -1085,6 +1095,139 @@ postgres   27093  0.0  0.0  30096  2752 ?Ss   11:34   0:00 postgres: ser
  BufferPin
  Waiting to acquire a pin on a buffer.
 
+
+ EventSet
+ ArchiverMain
+ Waiting in main loop of the archiver process.
+
+
+ AutoVacuumMain
+ Waiting in main loop of autovacuum launcher process.
+
+
+ BaseBackupThrottle
+ Waiting during base backup when throttling activity.
+
+
+ BgWorkerStartup
+ Waiting for background worker to start up.
+
+
+ BgWorkerShutdown
+ Waiting for background worker to shut down.
+
+
+ BgWriterMain
+ Waiting in main loop of background writer process background worker.
+
+
+ BgWriterHibernate
+ Waiting in background writer process, hibernating.
+
+
+ CheckpointerMain
+ Waiting in main loop of checkpointer process.
+
+
+ ExecuteGather
+ Waiting for activity from child process when executing Gather node.
+
+
+ Extension
+ Waiting in the code path of an extension; meant to be used by custom plugins and modules.
+
+
+ MessageQueuePutMessage
+ Waiting to put new message in shared message queue.
+
+
+ MessageQueueSend
+ Waiting to send bytes in shared message queue.
+
+
+ MessageQueueReceive
+ Waiting to receive bytes in shared message queue.
+
+
+ MessageQueueInternal
+ Waiting for other process to be attached in shared message queue.
+
+
+ ParallelFinish
+ Waiting for parallel workers to finish computing.
+
+
+ PgSleep
+ Waiting in process that called pg_sleep.
+
+
+ PgStatMain
+ Waiting in main loop of the statistics collector process.
+
+
+ ProcSleep
+ Waiting for a specific lock.
+
+
+ ProcSignal
+ Waiting for signal from another backend.
+
+
+ RecoveryApplyDelay
+ Waiting to apply WAL at recovery because it is delayed.
+
+
+ RecoveryWalStream
+ Waiting for WAL from a stream at recovery.
+
+
+ RecoveryWalAll
+ Waiting for WAL from any kind of source (local, archive or stream) at recovery.
+
+
+ SecureRead
+ Waiting to read data from a secure connection.
+
+
+ SecureWrite
+ Waiting to write data to a secure connection.
+
+
+ SSLOpenServer
+ Waiting for SSL while attempting connection.
+
+
+ SyncRep
+ Waiting for WAL commit during synchronous replication.
+
+
+ SysLoggerMain
+ Waiting in main loop of syslogger process.
+
+
+ WalWriterMain
+ Waiting in main loop of WAL writer process.
+
+
+ WalReceiverWaitStart
+ Waiting for startup process to send initial data for streaming replication.
+
+
+ WalReceiverMain
+ Waiting in main loop of WAL receiver process.
+
+
+ WalSenderMain
+ 

[HACKERS] Optimization for lazy_scan_heap

2016-08-03 Thread Masahiko Sawada
Hi all,

While reviewing the freeze map code, Andres pointed out[1] that
lazy_scan_heap could access the visibility map twice and that its logic
seems a bit tricky.
As discussed before, that's not nice, especially when a large relation is
entirely frozen.

I posted a patch for this before, but since it is an optimization,
not a bug fix, I'd like to propose it again.
Please give me feedback.

[1] 
https://www.postgresql.org/message-id/20160505000840.epatoq6d2e556446%40alap3.anarazel.de

Regards,

--
Masahiko Sawada
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index 231e92d..e7cdd2c 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -471,6 +471,8 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
 		PROGRESS_VACUUM_MAX_DEAD_TUPLES
 	};
 	int64		initprog_val[3];
+	BlockNumber	n_skipped;
+	BlockNumber n_all_frozen;
 
 	pg_rusage_init();
 
@@ -501,6 +503,9 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
 	initprog_val[2] = vacrelstats->max_dead_tuples;
 	pgstat_progress_update_multi_param(3, initprog_index, initprog_val);
 
+	n_skipped = 0;
+	n_all_frozen = 0;
+
 	/*
 	 * Except when aggressive is set, we want to skip pages that are
 	 * all-visible according to the visibility map, but only when we can skip
@@ -558,14 +563,20 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
 			{
 if ((vmstatus & VISIBILITYMAP_ALL_FROZEN) == 0)
 	break;
+n_all_frozen++;
 			}
 			else
 			{
 if ((vmstatus & VISIBILITYMAP_ALL_VISIBLE) == 0)
 	break;
+
+/* Count the number of all-frozen pages */
+if (vmstatus & VISIBILITYMAP_ALL_FROZEN)
+	n_all_frozen++;
 			}
 			vacuum_delay_point();
 			next_unskippable_block++;
+			n_skipped++;
 		}
 	}
 
@@ -574,7 +585,8 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
 	else
 		skipping_blocks = false;
 
-	for (blkno = 0; blkno < nblocks; blkno++)
+	blkno = 0;
+	while (blkno < nblocks)
 	{
 		Buffer		buf;
 		Page		page;
@@ -592,18 +604,22 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
 		TransactionId visibility_cutoff_xid = InvalidTransactionId;
 
 		/* see note above about forcing scanning of last page */
-#define FORCE_CHECK_PAGE() \
-		(blkno == nblocks - 1 && should_attempt_truncation(vacrelstats))
+#define FORCE_CHECK_PAGE(blk) \
+		((blk) == nblocks - 1 && should_attempt_truncation(vacrelstats))
 
 		pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
 
 		if (blkno == next_unskippable_block)
 		{
+			n_skipped = 0;
+			n_all_frozen = 0;
+
 			/* Time to advance next_unskippable_block */
 			next_unskippable_block++;
 			if ((options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
 			{
-while (next_unskippable_block < nblocks)
+while (next_unskippable_block < nblocks &&
+	   !FORCE_CHECK_PAGE(next_unskippable_block))
 {
 	uint8		vmskipflags;
 
@@ -614,14 +630,22 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
 	{
 		if ((vmskipflags & VISIBILITYMAP_ALL_FROZEN) == 0)
 			break;
+
+		/* Count the number of all-frozen pages */
+		n_all_frozen++;
 	}
 	else
 	{
 		if ((vmskipflags & VISIBILITYMAP_ALL_VISIBLE) == 0)
 			break;
+
+		/* Count the number of all-frozen pages */
+		if (vmskipflags & VISIBILITYMAP_ALL_FROZEN)
+			n_all_frozen++;
 	}
 	vacuum_delay_point();
 	next_unskippable_block++;
+	n_skipped++;
 }
 			}
 
@@ -646,26 +670,23 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
 		else
 		{
 			/*
-			 * The current block is potentially skippable; if we've seen a
-			 * long enough run of skippable blocks to justify skipping it, and
-			 * we're not forced to check it, then go ahead and skip.
-			 * Otherwise, the page must be at least all-visible if not
-			 * all-frozen, so we can set all_visible_according_to_vm = true.
+			 * The current block is the first of a run of consecutive skippable
+			 * blocks, and we know how many blocks we can skip. If we've seen a
+			 * long enough run of skippable blocks to justify skipping them,
+			 * then go ahead and skip. Otherwise, the page must be at least all-visible
+			 * if not all-frozen, so we can set all_visible_according_to_vm = true.
 			 */
-			if (skipping_blocks && !FORCE_CHECK_PAGE())
+			if (skipping_blocks)
 			{
 /*
- * Tricky, tricky.  If this is in aggressive vacuum, the page
- * must have been all-frozen at the time we checked whether it
- * was skippable, but it might not be any more.  We must be
- * careful to count it as a skipped all-frozen page in that
- * case, or else we'll think we can't update relfrozenxid and
- * relminmxid.  If it's not an aggressive vacuum, we don't
- * know whether it was all-frozen, so we have to recheck; but
- * in this case an approximate answer is OK.
+ * We know that there 

Re: [HACKERS] max_parallel_degree > 0 for 9.6 beta

2016-08-03 Thread Tom Lane
Noah Misch  writes:
> On Wed, Apr 20, 2016 at 02:28:15PM -0400, Tom Lane wrote:
>> +1, but let's put an entry on the 9.6 open-items page to remind us to
>> make that decision at the right time.

> It's that time.  Do we restore the max_parallel_workers_per_gather=0 default,
> or is enabling this by default the right thing after all?

At this point I'd have to vote against enabling by default in 9.6.  The
fact that in the past week we've found bugs as bad as e1a93dd6a does not
give me a warm fuzzy feeling about the parallel-query code being ready
for prime time.

Of course the question is how do we ever get to that point if we chicken
out with enabling it by default now.  Maybe we could keep it turned on
in HEAD.

regards, tom lane


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] New version numbering practices

2016-08-03 Thread Tom Lane
Craig Ringer  writes:
> On 4 August 2016 at 02:15, Tom Lane  wrote:
>> So it seems like fixing libpq's parsing of server_version_num is
>> something we definitely want to fix ASAP in all back branches.

> Well, this seems like a good time to make server_version_num GUC_REPORT as
> well...

To what end?  Existing versions of libpq wouldn't know about it, and new
versions of libpq couldn't rely on it to get reported by older servers,
so it'd still be the path of least resistance to examine server_version.

regards, tom lane


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Quorum commit for multiple synchronous replication.

2016-08-03 Thread Masahiko Sawada
On Wed, Aug 3, 2016 at 3:05 PM, Michael Paquier
 wrote:
> On Wed, Aug 3, 2016 at 2:52 PM, Masahiko Sawada  wrote:
>> I was thinking that the syntax for quorum method would use '[ ... ]'
>> but it will be confused with '( ... )' priority method used.
>> 001 patch adds 'Any N ( ... )' style syntax but I know that we still
>> might need to discuss about better syntax, discussion is very welcome.
>> Attached draft patch, please give me feedback.
>
> I am +1 for using either "{}" or "[]" to define a quorum set, and -1
> for the addition of a keyword in front of the integer defining how
> many nodes the server needs to wait for.

Thank you for reply.
"{}" or "[]" are not bad but because these are not intuitive, I
thought that it will be hard for uses to use different method for each
purpose.

> -foreach(cell, sync_standbys)
> +foreach (cell, sync_standbys)
>  {
> -WalSnd   *walsnd = &WalSndCtl->walsnds[lfirst_int(cell)];
> +WalSnd *walsnd = &WalSndCtl->walsnds[lfirst_int(cell)];
> This patch has some noise.

Will fix.

-- 
Regards,

--
Masahiko Sawada


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] max_parallel_degree > 0 for 9.6 beta

2016-08-03 Thread Noah Misch
On Wed, Apr 20, 2016 at 02:28:15PM -0400, Tom Lane wrote:
> Andres Freund  writes:
> > max_parallel_degree currently defaults to 0.  I think we should enable
> > it by default for at least the beta period. Otherwise we're primarily
> > going to get reports back after the release.
> 
> > Then, at the end of beta, we can decide what the default should be.
> 
> +1, but let's put an entry on the 9.6 open-items page to remind us to
> make that decision at the right time.

It's that time.  Do we restore the max_parallel_workers_per_gather=0 default,
or is enabling this by default the right thing after all?


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] New version numbering practices

2016-08-03 Thread Craig Ringer
On 4 August 2016 at 02:15, Tom Lane  wrote:

> Robert Haas  writes:
> > On Wed, Aug 3, 2016 at 12:12 PM, Peter Eisentraut
> >  wrote:
> >> One hiccup I found is that server_version_num is not sent to clients.
> >> Instead, libpq assembles the numeric version number itself from the
> >> string version, and it will fail if it sees only one number (e.g.,
> >> 10devel).  It will then set the version number to 0 for "unknown".
>
> Ugh.
>
> > This pretty much sucks.  I suppose we could at least alleviate the
> > problem by back-patching some intelligence about the new scheme into
> > back-branches, but of course that will only help people if they
> > install newer minor releases.
>
> Yeah.  I doubt there is much reason to assume that people would be
> using, say, a 9.5.5 psql and a 9.5.3 libpq or vice versa.  Whatever
> the current client behavior is is what people will see.
>
> Having said that, this sort of problem is one reason we wanted to give
> ourselves a full year to implement the new scheme.  If we put some
> appropriate fix into the back branches *now*, there would be a fair
> amount of daylight for that to spread into the field before any users
> would be seeing v10 servers in practice.
>
> So it seems like fixing libpq's parsing of server_version_num is
> something we definitely want to fix ASAP in all back branches.
> Is there anything else that's particularly bad?
>
>
Well, this seems like a good time to make server_version_num GUC_REPORT as
well...

-- 
 Craig Ringer   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services


[HACKERS] Re: [sqlsmith] FailedAssertion("!(k == indices_count)", File: "tsvector_op.c", Line: 511)

2016-08-03 Thread Noah Misch
On Wed, Aug 03, 2016 at 05:52:44PM -0400, Tom Lane wrote:
> I wrote:
> > I'm thinking there are two distinct bugs here.
> 
> Actually, make that three bugs.  I was so focused on the crashing
> that I failed to notice that ts_delete wasn't producing sane answers
> even when it didn't crash:

[Action required within 72 hours.  This is a generic notification.]

The above-described topic is currently a PostgreSQL 9.6 open item.  Teodor,
since you committed the patch believed to have created it, you own this open
item.  If some other commit is more relevant or if this does not belong as a
9.6 open item, please let us know.  Otherwise, please observe the policy on
open item ownership[1] and send a status update within 72 hours of this
message.  Include a date for your subsequent status update.  Testers may
discover new open items at any time, and I want to plan to get them all fixed
in advance of shipping 9.6rc1 next week.  Consequently, I will appreciate your
efforts toward speedy resolution.  Thanks.

[1] 
http://www.postgresql.org/message-id/20160527025039.ga447...@tornado.leadboat.com


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Lossy Index Tuple Enhancement (LITE)

2016-08-03 Thread Claudio Freire
On Wed, Aug 3, 2016 at 4:37 PM, Claudio Freire  wrote:
> On Wed, Aug 3, 2016 at 4:20 AM, Simon Riggs  wrote:
>> == IndexScan ==
>>
>> Note that the executor code for IndexScan appears identical between
>> the two optimizations. The difference between duplicate and range LITE
>> tuples is needed only at INSERT time (or UPDATE indexed column to a
>> new value).
>>
>> When we do an IndexScan if we see a LITE tuple we do a scan of the
>> linepointer ranges covered by this index tuple (which might eventually
>> go to a full block scan). For example with bit1 set we would scan 16
>> linepointers (on an 8192 byte block) and with 2 bits set we would scan
>> 32 linepointers etc.., not necessarily consecutive ranges. So the
>> IndexScan can return potentially many heap rows per index entry, which
>> need to be re-checked and may also need to be sorted prior to
>> returning them. If no rows are returned we can kill the index pointer,
>> just as we do now for btrees, so they will be removed eventually from
>> the index without the need for VACUUM. A BitmapIndexScan that sees a
>> lossy pointer can also use the lossy TID concept, so we can have
>> partially lossy bitmaps.
>
> Wouldn't this risk scanning rows more than once unless it's part of a
> bitmap scan?

I think an alternative that would not have such a problem, and would
have a smaller impact on both performance and code, would be to simply
not add new index tuples when the update happens on the same page,
which is easily checked by comparing the old tuple's t_ctid against the
new one's.  Index scans would have to follow same-page update chains,
and perhaps vacuum would need some fixing too, but that would probably
be less of a performance hit than the proposed lossy item pointers.
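
A minimal sketch of that same-page check (the helper name is mine, not from
any patch; comparing the block number of the old version's location with the
new one's amounts to the same test as following the old tuple's t_ctid):

#include "postgres.h"
#include "access/htup.h"
#include "storage/itemptr.h"

/* true if the old and new tuple versions live on the same heap page */
static bool
update_is_same_page(HeapTuple oldtup, HeapTuple newtup)
{
	return ItemPointerGetBlockNumber(&oldtup->t_self) ==
		   ItemPointerGetBlockNumber(&newtup->t_self);
}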


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Lossy Index Tuple Enhancement (LITE)

2016-08-03 Thread Bruce Momjian
On Wed, Aug  3, 2016 at 08:34:02PM -0400, Bruce Momjian wrote:
> On Thu, Aug  4, 2016 at 01:16:20AM +0100, Simon Riggs wrote:
> > > Would you only add a LITE index entry when there isn't an
> > > existing index entry for the same values and heap page?  That seems
> > > quite complicated.
> > 
> > The insertion algorithm is described. Doesn't seem complicated to me.
> 
> Ah, I see it now:
> 
>   As UPDATEs occur we request inserts into the index. If a lossy index
>   pointer already exists that covers the new linepointer then we skip
>   the index insert, optimizing writes to the index.
> 
> I thought you meant "already exists" just means it matches the existing
> range.  I was unclear that was the entire page.

How do you plan to clear the bitmask so it, over time, doesn't end up being
all-set?  I can see how a non-LITE index tuple would become a LITE tuple
when an update chain happens on the same page, but then new matching
index values would also set the bitmap, update-chain or not.  Then, when
the update chain is gone and only non-update-chain tuples exist, you
will need to remove the bitmap and create new index entries by scanning
the heap page.

Also, what happens when you have two non-LITE index tuples in a heap
page, and then you create an update chain in the heap page --- do you
remove one of the two index entries and create a bitmap out of the
remaining one?  Do you put them back when the heap chain is gone?

Also, why not use this bitmap for all indexes, not just update chains? 
Because it would break index-only scans?

I think there are two use-cases for LITE --- first, it would shrink
indexes for tables with many duplicate indexed values on the same page. 
In the case Simon mentioned, it would help with update chain churn, but
the problem is it seems to cause a lot of overhead to create and remove
the bitmap in the index.  This is why I was thinking of per-update-chain
LITE entries.

-- 
  Bruce Momjian  http://momjian.us
  EnterpriseDB http://enterprisedb.com

+ As you are, so once was I. As I am, so you will be. +
+ Ancient Roman grave inscription +


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Implementing full UTF-8 support (aka supporting 0x00)

2016-08-03 Thread Bruce Momjian
On Thu, Aug  4, 2016 at 08:22:25AM +0800, Craig Ringer wrote:
> Yep, it does. But we've made little to no progress on integration of ICU
> support and AFAIK nobody's working on it right now. 

Uh, this email from July says Peter Eisentraut will submit it in
September  :-)


https://www.postgresql.org/message-id/2b833706-1133-1e11-39d9-4fa228892...@2ndquadrant.com

-- 
  Bruce Momjian  http://momjian.us
  EnterpriseDB http://enterprisedb.com

+ As you are, so once was I. As I am, so you will be. +
+ Ancient Roman grave inscription +


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Way to access LSN (for each transaction) by directly talking to postgres?

2016-08-03 Thread Michael Paquier
On Thu, Aug 4, 2016 at 3:02 AM, Joshua Bay  wrote:
> Could you please tell me how I can get LSN of each transaction at decoder
> plugin?

Craig already gave you hints, but here are more. You will need to hack
your own plugin. You could just use the one in contrib/test_decoding,
remove most of its code, and use the commit callback to issue the LSN
you are interested in. Note as well that when using
pg_logical_slot_peek_changes or pg_logical_slot_get_changes, you can
get an LSN location. Using test_decoding as a base, that's not a
complicated effort.
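
For illustration, a rough sketch of such a plugin, modeled on
contrib/test_decoding (the module name and output format are made up; note
that the begin and change callbacks are required by the output plugin API,
so they are stubbed out even though only the commit LSN is emitted):

#include "postgres.h"
#include "fmgr.h"
#include "lib/stringinfo.h"
#include "replication/logical.h"
#include "replication/output_plugin.h"
#include "utils/rel.h"

PG_MODULE_MAGIC;

extern void _PG_output_plugin_init(OutputPluginCallbacks *cb);

static void
lsn_only_begin(LogicalDecodingContext *ctx, ReorderBufferTXN *txn)
{
	/* nothing to emit at BEGIN */
}

static void
lsn_only_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
				Relation relation, ReorderBufferChange *change)
{
	/* ignore row changes; only the commit ordering matters here */
}

static void
lsn_only_commit(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
				XLogRecPtr commit_lsn)
{
	OutputPluginPrepareWrite(ctx, true);
	appendStringInfo(ctx->out, "commit lsn %X/%X",
					 (uint32) (commit_lsn >> 32), (uint32) commit_lsn);
	OutputPluginWrite(ctx, true);
}

void
_PG_output_plugin_init(OutputPluginCallbacks *cb)
{
	cb->begin_cb = lsn_only_begin;
	cb->change_cb = lsn_only_change;
	cb->commit_cb = lsn_only_commit;
}
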
-- 
Michael


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Lossy Index Tuple Enhancement (LITE)

2016-08-03 Thread Bruce Momjian
On Thu, Aug  4, 2016 at 01:16:20AM +0100, Simon Riggs wrote:
> > Would you only add a LITE index entry when there isn't an
> > existing index entry for the same values and heap page?  That seems
> > quite complicated.
> 
> The insertion algorithm is described. Doesn't seem complicated to me.

Ah, I see it now:

As UPDATEs occur we request inserts into the index. If a lossy index
pointer already exists that covers the new linepointer then we skip
the index insert, optimizing writes to the index.

I thought you meant "already exists" just means it matches the existing
range.  I was unclear that was the entire page.

-- 
  Bruce Momjian  http://momjian.us
  EnterpriseDB http://enterprisedb.com

+ As you are, so once was I. As I am, so you will be. +
+ Ancient Roman grave inscription +


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Lossy Index Tuple Enhancement (LITE)

2016-08-03 Thread Bruce Momjian
On Thu, Aug  4, 2016 at 01:16:20AM +0100, Simon Riggs wrote:
> On 4 August 2016 at 00:56, Bruce Momjian  wrote:
> > On Wed, Aug  3, 2016 at 07:28:52PM -0400, Bruce Momjian wrote:
> >> With LITE, you can avoid the creation of duplicate-value index entries
> >> for indexes without changed column values by using a bitmap in place of
> >> the tid item number (16 bits).  It can't remove dead tids.
> >
> > How would you handle the case where there are two LITE index entries
> > pointing to two different update chains on the same page?
> > When you
> > search the page for the first heap chain, could the second index entry
> > find the same chain.  How would you know which index entry is which
> > chain?
> 
> It's easiest to understand this is by imagining each LITE pointer
> pointing to a whole page. The chains aren't followed during the scan,
> individual heap tuple versions are treated as they would be by a seq
> scan.
> 
> That is more expensive than we might like, so the bitmap/linepointer
> thing is just an extra tweak to avoid scanning the whole block. The
> bitmap refers to ranges of linepointers, not chains. i.e. 0-15, 16-31,
> 32-47 etc

Well, there is no way to know how many linepointers there are on a page,
so doing "mod 16" automatically hashes the line pointers into the 16-bit
field.

> > Would you only add a LITE index entry when there isn't an
> > existing index entry for the same values and heap page?  That seems
> > quite complicated.
> 
> The insertion algorithm is described. Doesn't seem complicated to me.

OK.

-- 
  Bruce Momjian  http://momjian.us
  EnterpriseDB http://enterprisedb.com

+ As you are, so once was I. As I am, so you will be. +
+ Ancient Roman grave inscription +


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Way to access LSN (for each transaction) by directly talking to postgres?

2016-08-03 Thread Craig Ringer
On 4 August 2016 at 01:35, Joshua Bay  wrote:

> Thanks for responses!
>
> The problem I wanted to solve was to find the (global) order of commits
> across the postgres cluster. So, my attempt was to use the LSN.
>

Have a look at how logical decoding does it. Check out ReorderBufferCommit
in src/backend/replication/logical/reorderbuffer.c  .

Or just write a trivial a logical decoding plugin that only implements the
commit callback and only emits the LSN.

-- 
 Craig Ringer   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services


Re: [HACKERS] PostgreSQL 10 kick-off

2016-08-03 Thread Michael Paquier
On Thu, Aug 4, 2016 at 12:09 AM, Fabrízio de Royes Mello
 wrote:
> If there are no complaints about my lack of experience in this field I would
> like to become the next CFM (am I the first Brazilian??)

That would be great. I am not sending more patches for the 1st CF, and
as long as there are no issues for 9.6 to deal with, I am switching
to patch review mode pretty soon, so I'll help out with things.
-- 
Michael


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Implementing full UTF-8 support (aka supporting 0x00)

2016-08-03 Thread Craig Ringer
On 4 August 2016 at 05:00, Thomas Munro 
wrote:

> On Thu, Aug 4, 2016 at 5:16 AM, Craig Ringer 
> wrote:
> > On 3 August 2016 at 22:54, Álvaro Hernández Tortosa 
> wrote:
> >> What would it take to support it? Isn't the varlena header
> propagated
> >> everywhere, which could help infer the real length of the string? Any
> >> pointers or suggestions would be welcome.
> >
> >
> > One of the bigger pain points is that our interaction with C library
> > collation routines for sorting uses NULL-terminated C strings.  strcoll,
> > strxfrm, etc.
>
> That particular bit of the problem would go away if this ever happened:
>
> https://wiki.postgresql.org/wiki/Todo:ICU
>
> ucol_strcoll takes explicit lengths (though optionally accepts -1 for
> null terminated mode).
>
>
> http://userguide.icu-project.org/strings#TOC-Using-C-Strings:-NUL-Terminated-vs.-Length-Parameters
>

Yep, it does. But we've made little to no progress on integration of ICU
support and AFAIK nobody's working on it right now.

I wonder how MySQL implements their collation and encoding support?

-- 
 Craig Ringer   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services


Re: [HACKERS] Lossy Index Tuple Enhancement (LITE)

2016-08-03 Thread Simon Riggs
On 4 August 2016 at 00:56, Bruce Momjian  wrote:
> On Wed, Aug  3, 2016 at 07:28:52PM -0400, Bruce Momjian wrote:
>> With LITE, you can avoid the creation of duplicate-value index entries
>> for indexes without changed column values by using a bitmap in place of
>> the tid item number (16 bits).  It can't remove dead tids.
>
> How would you handle the case where there are two LITE index entries
> pointing to two different update chains on the same page?
> When you
> search the page for the first heap chain, could the second index entry
> find the same chain.  How would you know which index entry is which
> chain?

It's easiest to understand this is by imagining each LITE pointer
pointing to a whole page. The chains aren't followed during the scan,
individual heap tuple versions are treated as they would be by a seq
scan.

That is more expensive than we might like, so the bitmap/linepointer
thing is just an extra tweak to avoid scanning the whole block. The
bitmap refers to ranges of linepointers, not chains. i.e. 0-15, 16-31,
32-47 etc
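
To make the range encoding concrete, an illustrative sketch (the names and
the clamping of high offsets are mine, not part of the proposal; an 8 kB heap
page can hold more than 256 line pointers, so the last bit has to cover the
remainder):

#include "postgres.h"
#include "storage/off.h"

#define LITE_LP_RANGE	16		/* line pointers covered by each bitmap bit */

/* Map a 1-based line pointer offset to its bit in a 16-bit LITE bitmap. */
static inline uint16
lite_bit_for_offset(OffsetNumber offnum)
{
	int			bucket = (offnum - 1) / LITE_LP_RANGE;	/* ranges 0-15, 16-31, ... */

	if (bucket > 15)
		bucket = 15;
	return (uint16) (1 << bucket);
}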

> Would you only add a LITE index entry when there isn't an
> existing index entry for the same values and heap page?  That seems
> quite complicated.

The insertion algorithm is described. Doesn't seem complicated to me.

-- 
Simon Riggshttp://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Possible duplicate release of buffer lock.

2016-08-03 Thread Kyotaro HORIGUCHI
Sorry. That's partially wrong.

At Wed, 03 Aug 2016 17:31:16 +0900 (Tokyo Standard Time), Kyotaro HORIGUCHI 
 wrote in 
<20160803.173116.111915228.horiguchi.kyot...@lab.ntt.co.jp>
> I had an inquiry about the following log messages.
> 
> 2016-07-20 10:16:58.294 JST,,,3240,,578ed102.ca8,1,,2016-07-20 10:16:50 
> JST,30/75,0,LOG,0,"no left sibling (concurrent deletion?) in 
> ""some_index_rel""""_bt_unlink_halfdead_page, nbtpage.c:1643",""
> 2016-07-20 10:16:58.294 JST,,,3240,,578ed102.ca8,2,,2016-07-20 10:16:50 
> JST,30/75,0,ERROR,XX000,"lock main 13879 is not held","automatic vacuum 
> of table ""db.nsp.tbl""",,,"LWLockRelease, lwlock.c:1137","" 
> 
> These are gotten after pg_upgrade from 9.1.13 to 9.4.
> 
> The first line is emitted for simultaneous deletion of an index
> page, which is impossible by design in a consistent state, so the
> situation complained about should be the result of index corruption
> that predates the upgrade, specifically, inconsistent sibling
> pointers around a deleted page.
> 
> I noticed the following part in nbtpage.c related to this. It is
> the same still in the master.
> 
> nbtpage.c:1635@9.4.8:
> 
> >  while (P_ISDELETED(opaque) || opaque->btpo_next != target)
> >  {
> >  /* step right one page */
> >  leftsib = opaque->btpo_next;
> >  _bt_relbuf(rel, lbuf);
> >  if (leftsib == P_NONE)
> >  {
> >  elog(LOG, "no left sibling (concurrent deletion?) in 
> > \"%s\"",
> >   RelationGetRelationName(rel));
> >  return false;
> 
> With the condition for the while loop, if the immediate left sibling
> of the target is (mistakenly, of course) in a deleted state (and the
> target is somehow pointing to the deleted page as its left sibling),
> lbuf finally ends up beyond the right side of the target. This seems
> to result in unintentionally releasing the lock on the target and
> the second log message.
> 
> 
> My point here is that if concurrent deletion can't be performed by
> the current implementation, this while loop could be removed so that
> we immediately error out or log a message,
> 
> 
> > if (P_ISDELETED(opaque) || opaque->btpo_next != target)
> > {
> >elog(ERROR, "no left sibling of page %d (concurrent deletion?) in 
> > \"%s\"",..

The above is the result of forgetting the main object of this
loop. Please forget it.

Still it seems right to stop before target.

> or, the while loop at least should stop before overshooting the
> target.
> 
> > while (P_ISDELETED(opaque) || opaque->btpo_next != target)
> > {
> >/* step right one page */
> >leftsib = opaque->btpo_next;
> >_bt_relbuf(rel, lbuf);
> >if (leftsib == target || leftsib == P_NONE)
> >{
> >  elog(ERROR, "no left sibling of page %d (concurrent deletion?) in 
> > \"%s\"",..

regards,

-- 
Kyotaro Horiguchi
NTT Open Source Software Center




-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Lossy Index Tuple Enhancement (LITE)

2016-08-03 Thread Bruce Momjian
On Wed, Aug  3, 2016 at 07:28:52PM -0400, Bruce Momjian wrote:
> With LITE, you can avoid the creation of duplicate-value index entries
> for indexes without changed column values by using a bitmap in place of
> the tid item number (16 bits).  It can't remove dead tids.

How would you handle the case where there are two LITE index entries
pointing to two different update chains on the same page?  When you
search the page for the first heap chain, could the second index entry
find the same chain?  How would you know which index entry is which
chain?  Would you only add a LITE index entry when there isn't an
existing index entry for the same values and heap page?  That seems
quite complicated.

-- 
  Bruce Momjian  http://momjian.us
  EnterpriseDB http://enterprisedb.com

+ As you are, so once was I. As I am, so you will be. +
+ Ancient Roman grave inscription +


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] regression test for extended query protocol

2016-08-03 Thread Michael Paquier
On Thu, Aug 4, 2016 at 12:14 AM, Dave Cramer  wrote:
> We currently run tests every time a PR is created on github, but I don't
> think there are any animals running the JDBC test suite
>
> We can add tests; what exactly do we want to test? Then setting up an animal
> to run the tests would be fairly straightforward.

It may be interesting to implement that as a module in the buildfarm
client that any animal could include at will. Just a thought.
-- 
Michael


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Slowness of extended protocol

2016-08-03 Thread Bruce Momjian
On Wed, Aug  3, 2016 at 10:02:39AM -0400, Tom Lane wrote:
> Bruce Momjian  writes:
> > On Sun, Jul 31, 2016 at 05:57:12PM -0400, Tom Lane wrote:
> >> In hindsight it seems clear that what a lot of apps want out of extended
> >> protocol is only the ability to send parameter values out-of-line instead
> >> of having to quote/escape them into SQL literals.  Maybe an idea for the
> >> fabled V4 protocol update is some compromise query type that corresponds
> >> precisely to PQexecParams's feature set: you can send parameter values
> >> out-of-line, and you can specify text or binary results, but there's no
> >> notion of any persistent state being created and no feedback about
> >> parameter data types.
> 
> > Do you want this on the TODO list?
> 
> I didn't hear anyone say it was a silly idea, so sure.

Done:

Create a more efficient way to handle out-of-line parameters 

-- 
  Bruce Momjian  http://momjian.us
  EnterpriseDB http://enterprisedb.com

+ As you are, so once was I. As I am, so you will be. +
+ Ancient Roman grave inscription +


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Lossy Index Tuple Enhancement (LITE)

2016-08-03 Thread Bruce Momjian
On Wed, Aug  3, 2016 at 08:20:49AM +0100, Simon Riggs wrote:
> == Update Duplicate Removal ==
> 
> We want an optimization that reduces the effects of multiple UPDATEs
> on the same block that have duplicate values caused because another
> index column has been updated and a non-HOT index insert has been
> generated. This optimizes the case where someone has 12 indexes yet
> updates just one of them, so we optimize the 11 unnecessary updates,
> if possible.
> 
> As UPDATEs occur we request inserts into the index. If a lossy index
> pointer already exists that covers the new linepointer then we skip
> the index insert, optimizing writes to the index. If a lossy index
> pointer already exists for that value but doesn't yet cover our
> linepointer then we update the bitmask. If a non-lossy index pointer
> already exists we set the lossy bit and update the bitmask to reflect
> which linepointers the index entry covers. If no index pointer exists
> we insert one. The first update on a row will not be optimized, but
> the 2nd - 16th could be. These actions optimize away most of the
> writes and WAL activity to the index data block since only 1 in every
> 16 updates would cause changes to the block (actually about 1 in 10 on
> average). Most importantly, updates will cause far fewer index block
> splits and reindexing will be less needed.
> 
> The same situation can occur if we have many INSERTs with same values
> on the same block. This could lead to a reduction in size of the btree
> for indexes with many duplicate values.

Let me see if I understand this.  Right now, if no indexed values are
changed and the old and new update rows are in the same page, we don't
create a new index entry for the new row, but rather use redirect tid
pointers.  We can recycle tuple data and tid pointers for frequent
updates because there is only one index entry for the entire update
chain.

This doesn't work if we changed an indexed value.  Your solution is not
to use redirect tid/line pointers at all.  Instead, you create normal
index pointers for the indexes with changed column values.  For the
indexes without changed column values, you create this new LITE index
entry which can point to all the update-chain tuples in the same page.

In Postgres currently, we can remove tuple data when it is expired, and
tids if they are part of redirect chains (no index changes), and need
vacuum to remove non-redirect tids and old index entries.

With LITE, you can avoid the creation of duplicate-value index entries
for indexes without changed column values by using a bitmap in place of
the tid item number (16 bits).  It can't remove dead tids.

I had some ideas of how to handle the bitmap, like doing "mod 16" on the
tid item value, meaning we would have 16 equal-sized buckets.  Assuming
tuples are 100 bytes, that makes each bucket hold five rows on average.

However, I am now wondering why bother with the bitmap at all.  Can't
the LITE item pointer just point to the head of the update chain, and we
can walk the chain to find the tuple that matches our snapshot?  (The
tuple has a pointer to the next entry in the chain.)

Basically, right now with an update that changes values in some indexes
and not others, and the tuples are all on the same page, we are creating
multiple index pointers to point to different tuples on the same page. 
I am suggesting we don't do that, and instead just leave the index tuple
pointing to the head of the chain.

I think the big problem with this is that removing the tuple data will
also remove the row's update-chain pointer.  Maybe we can modify tid
redirect pointers to handle this.

> == Clustered Index ==

I read about clustered indexes and I am not sure how they would help in
most cases.  Are there enough index entries with identical or
sequentially-grouped values to be useful?  It kind of feels like a
single-page BRIN index.

-- 
  Bruce Momjian  http://momjian.us
  EnterpriseDB http://enterprisedb.com

+ As you are, so once was I. As I am, so you will be. +
+ Ancient Roman grave inscription +


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Small fix: avoid passing null pointers to memcpy()

2016-08-03 Thread Peter Eisentraut
On 4/16/16 8:48 AM, Piotr Stefaniak wrote:
> On 2016-04-03 09:24, Piotr Stefaniak wrote:
>> > from running the regression test suite (including TAP tests) and also
>> > sqlsmith, I've got a couple of places where UBSan reported calls to
>> > memcpy() with null pointer passed as either source or destination.
>> >
>> > Patch attached.
> Patch updated.
> 
> Since this time the patch includes fixes for other standard library 
> function calls (memset and bsearch), I'm renaming the patch file to be 
> more generic.

Most of these changes appear to address the case where the size argument
of memcpy() is zero.  I'm not sure why those changes are necessary.  At
least in some cases that I sporadically checked, a zero size does not
lead to a null pointer argument.  We'll need some more details here.

-- 
Peter Eisentraut  http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Implementing full UTF-8 support (aka supporting 0x00)

2016-08-03 Thread Álvaro Hernández Tortosa



On 03/08/16 21:42, Geoff Winkless wrote:

On 3 August 2016 at 20:36, Álvaro Hernández Tortosa  wrote:

 Isn't the correct syntax something like:

select E'\uc080', U&'\c080';

?

 It is a single character, 16 bit unicode sequence (see
https://www.postgresql.org/docs/current/static/sql-syntax-lexical.html).

No, what you've done there is created the three-byte utf8 sequence \xec8280

# select U&'\c080'::bytea;
   bytea
--
  \xec8280

It's not a UCS2 c080, it's utf8 c080.

Geoff


Yes, you're absolutely right ^_^

Álvaro


--

Álvaro Hernández Tortosa


---
8Kdata



--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Cache Hash Index meta page.

2016-08-03 Thread Tom Lane
Jeff Janes  writes:
> On Fri, Jul 22, 2016 at 3:02 AM, Mithun Cy  wrote:
>> I have created a patch to cache the meta page of Hash index in
>> backend-private memory. This is to save reading the meta page buffer every
>> time when we want to find the bucket page. In “_hash_first” call, we try 
>> to
>> read meta page buffer twice just to make sure bucket is not split after we
>> found bucket page. With this patch meta page buffer read is not done, if the
>> bucket is not split after caching the meta page.

Is this really safe?  The metapage caching in btree is all right because
the algorithm is guaranteed to work even if it starts with a stale idea of
where the root page is.  I do not think the hash code is equally robust
about stale data in its metapage.

>> Idea is to cache the Meta page data in rd_amcache and store maxbucket number
>> in hasho_prevblkno of bucket primary page (which will always be NULL other
>> wise, so reusing it here for this cause!!!).

> If it is otherwise unused, shouldn't we rename the field to reflect
> what it is now used for?

No, because on other pages that still means what it used to.  Nonetheless,
I agree that's a particularly ugly wart, and probably a dangerous one.

> What happens on a system which has gone through pg_upgrade?

That being one reason why.  It might be okay if we add another hasho_flag
bit saying that hasho_prevblkno really contains a maxbucket number, and
then add tests for that bit everyplace that hasho_prevblkno is referenced.
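
A sketch of what that might look like (the flag name and bit value are purely
hypothetical, just to illustrate the idea):

#include "postgres.h"
#include "access/hash.h"

/* hypothetical flag bit: hasho_prevblkno caches a maxbucket number */
#define LH_CACHED_MAXBUCKET		(1 << 4)

/* true if the cached metapage data is still valid for this bucket page */
static bool
cached_maxbucket_is_current(HashPageOpaque opaque, HashMetaPage metap)
{
	return (opaque->hasho_flag & LH_CACHED_MAXBUCKET) != 0 &&
		   opaque->hasho_prevblkno <= metap->hashm_maxbucket;
}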

regards, tom lane


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Increasing timeout of poll_query_until for TAP tests

2016-08-03 Thread Michael Paquier
On Thu, Aug 4, 2016 at 2:34 AM, Alvaro Herrera  wrote:
> Michael Paquier wrote:
>> On Wed, Aug 3, 2016 at 7:21 AM, Alvaro Herrera  
>> wrote:
>
>> > Why not capture both items in a single select, such as in the attached
>> > patch?
>>
>> Let me test this
>> [... A while after ...]
>> This looks to work properly. 12 runs in a row have passed.
>
> Okay, applied that way.
>
> BTW, one-line long queries look awful in that perl code.  I don't
> propose to change anything now, but I propose that long queries are
> split using here-docs in new code,
>
> $node->safe_psql(<<EQ);
>   SELECT foo
>   FROM bar
> EQ

Yep, that would be a good idea. I didn't know this grammar existed. Or
use qq() directly.
-- 
Michael


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [sqlsmith] FailedAssertion("!(k == indices_count)", File: "tsvector_op.c", Line: 511)

2016-08-03 Thread Thomas Munro
On Thu, Aug 4, 2016 at 9:39 AM, Tom Lane  wrote:
> Andreas Seltenreich  writes:
>> the following statement triggers an assertion in tsearch:
>
>> select ts_delete(array_to_tsvector('{smith,smith,smith}'::text[]),  
>> '{smith,smith}'::text[]);
>> -- TRAP: FailedAssertion("!(k == indices_count)", File: "tsvector_op.c", 
>> Line: 511)
>
> Confirmed here.  I notice that the output of array_to_tsvector() is
> already fishy in this example:
>
> regression=# select array_to_tsvector('{smith,smith,smith}'::text[]);
> array_to_tsvector
> -
>  'smith' 'smith' 'smith'
> (1 row)
>
> Shouldn't those have been merged together?  You certainly don't get
> results like that from other tsvector-producing operations:
>
> regression=# select to_tsvector('smith smith smith');
>   to_tsvector
> ---
>  'smith':1,2,3
> (1 row)
> regression=# select 'smith smith smith'::tsvector;
>  tsvector
> --
>  'smith'
> (1 row)
>
> However, that does not seem to be the proximate cause of the crash
> in ts_delete, because this non-duplicated case still crashes:
>
> select ts_delete(array_to_tsvector('{smith,smithx,smithy}'::text[]),  
> '{smith,smith}'::text[]);
>
> It kinda looks like you need more than one deletion request for
> the first entry in the sorted tsvector, because for example
> {smith,foo,toolbox} works but not {smith,too,toolbox}.
>
> I'm thinking there are two distinct bugs here.

The assertion in tsvector_delete_by_indices fails because its counting
algorithm doesn't expect indices_to_delete to contain multiple
references to the same index.  Maybe that could be fixed by
uniquifying in tsvector_delete_arr before calling it, but since
tsvector_delete_by_indices already qsorts its input, it should be able
to handle duplicates cheaply.  I was thinking something like this:

for (i = j = k = 0; i < tsv->size; i++)
{
+   bool drop_lexeme = false;
+
/*
 * Here we should check whether current i is present in
 * indices_to_delete or not. Since indices_to_delete is already sorted
-* we can advance it index only when we have match.
+* we can advance it index only when we have match.  We do this
+* repeatedly, in case indices_to_delete contains duplicate references
+* to the same index.
 */
-   if (k < indices_count && i == indices_to_delete[k])
+   while (k < indices_count && i == indices_to_delete[k])
{
+   drop_lexeme = true;
k++;
-   continue;
}
+   if (drop_lexeme)
+   continue;

But that doesn't seem to be enough; there is something else wrong here
resulting in garbage output, maybe related to the failure to merge the
tsvector...

-- 
Thomas Munro
http://www.enterprisedb.com


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [sqlsmith] FailedAssertion("!(k == indices_count)", File: "tsvector_op.c", Line: 511)

2016-08-03 Thread Tom Lane
I wrote:
> I'm thinking there are two distinct bugs here.

Actually, make that three bugs.  I was so focused on the crashing
that I failed to notice that ts_delete wasn't producing sane answers
even when it didn't crash:

regression=# select ts_delete(array_to_tsvector('{smith,foo,bar}'::text[]),  
'{smith,smith}'::text[]);
  ts_delete  
-
 'smith' 'foo' 'bar'
(1 row)

regression=# select ts_delete(array_to_tsvector('{smith,foo,bar}'::text[]),  
'{smith,foo}'::text[]);
   ts_delete   
---
 'smith' 'bar'
(1 row)

regression=# select ts_delete(array_to_tsvector('{smith,foo,bar}'::text[]),  
'{bar,smith}'::text[]);
  ts_delete  
-
 'smith' 'foo' 'bar'
(1 row)

The non-array version is no better:

regression=# select ts_delete(array_to_tsvector('{smith,foo,bar}'::text[]),  
'smith'::text);
  ts_delete  
-
 'smith' 'foo' 'bar'
(1 row)

regression=# select ts_delete(array_to_tsvector('{smith,foo,bar}'::text[]),  
'foo'::text);
   ts_delete   
---
 'smith' 'bar'
(1 row)

regression=# select ts_delete(array_to_tsvector('{smith,foo,bar}'::text[]),  
'bar'::text);
  ts_delete  
-
 'smith' 'foo' 'bar'
(1 row)

I'm not sure if ts_delete takes its second argument as verbatim lexemes
or normalizes them first, but none of these words are changed by
to_tsvector, so either way it seems to fail to delete stuff it should.

regards, tom lane


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] [sqlsmith] FailedAssertion("!(k == indices_count)", File: "tsvector_op.c", Line: 511)

2016-08-03 Thread Tom Lane
Andreas Seltenreich  writes:
> the following statement triggers an assertion in tsearch:

> select ts_delete(array_to_tsvector('{smith,smith,smith}'::text[]),  
> '{smith,smith}'::text[]);
> -- TRAP: FailedAssertion("!(k == indices_count)", File: "tsvector_op.c", 
> Line: 511)

Confirmed here.  I notice that the output of array_to_tsvector() is
already fishy in this example:

regression=# select array_to_tsvector('{smith,smith,smith}'::text[]);
array_to_tsvector
-
 'smith' 'smith' 'smith'
(1 row)

Shouldn't those have been merged together?  You certainly don't get
results like that from other tsvector-producing operations:

regression=# select to_tsvector('smith smith smith');
  to_tsvector  
---
 'smith':1,2,3
(1 row)
regression=# select 'smith smith smith'::tsvector;
 tsvector 
--
 'smith'
(1 row)

However, that does not seem to be the proximate cause of the crash
in ts_delete, because this non-duplicated case still crashes:

select ts_delete(array_to_tsvector('{smith,smithx,smithy}'::text[]),  
'{smith,smith}'::text[]);

It kinda looks like you need more than one deletion request for
the first entry in the sorted tsvector, because for example
{smith,foo,toolbox} works but not {smith,too,toolbox}.

I'm thinking there are two distinct bugs here.

regards, tom lane


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Cache Hash Index meta page.

2016-08-03 Thread Jeff Janes
On Fri, Jul 22, 2016 at 3:02 AM, Mithun Cy  wrote:
> I have created a patch to cache the meta page of Hash index in
> backend-private memory. This is to save reading the meta page buffer every
> time when we want to find the bucket page. In “_hash_first” call, we try to
> read meta page buffer twice just to make sure bucket is not split after we
> found bucket page. With this patch meta page buffer read is not done, if the
> bucket is not split after caching the meta page.
>
> Idea is to cache the Meta page data in rd_amcache and store maxbucket number
> in hasho_prevblkno of bucket primary page (which will always be NULL other
> wise, so reusing it here for this cause!!!).

If it is otherwise unused, shouldn't we rename the field to reflect
what it is now used for?

What happens on a system which has gone through pg_upgrade?  Are we
sure that those on-disk representations will always have
InvalidBlockNumber in that fields?  If not, then it seems we can't
support pg_upgrade at all.  If so, I don't see a provision for
properly dealing with pages which still have InvalidBlockNumber in
them.  Unless I am missing something, the code below will always think
it found the right bucket in such cases, won't it?

if (opaque->hasho_prevblkno <=  metap->hashm_maxbucket)

Cheers,

Jeff


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Parallel tuplesort (for parallel B-Tree index creation)

2016-08-03 Thread Peter Geoghegan
On Wed, Aug 3, 2016 at 11:42 AM, Robert Haas  wrote:
> I'm not going to say it's bad to be able to do things 2-2.5x faster,
> but linear scalability this ain't - particularly because your 2.58x
> faster case is using up to 7 or 8 times as much memory.  The
> single-process case would be faster in that case, too: you could
> quicksort.

Certainly, there are cases where a parallel version could benefit from
having more memory more so than actually parallelizing the underlying
task. However, this case was pointedly chosen to *not* be such a case.
When maintenance_work_mem exceeds about 5GB, I've observed that since
9.6 increasing it is just as likely to hurt as to help by about +/-5%
(unless and until it's all in memory, which still doesn't help much).
In general, there isn't all that much point in doing a very large sort
like this in memory. You just don't get that much of a benefit for the
memory you use, because linearithmic CPU costs eventually really
dominate linear sequential I/O costs.

I think you're focusing on the fact that there is a large absolute
disparity in memory used in this one benchmark, but that isn't
something that the gains shown particularly hinge upon. There isn't
that much difference when workers must merge their own runs, for
example. It saves the serial leader merge some work, and in particular
makes it more cache efficient (by having fewer runs/tapes).

Finally, while about 8x as much memory is used, the memory used over
and above the serial case is almost all freed when the final merge
begins (the final merges are therefore very similar in both cases,
including in terms of memory use). So, for as long as you use 8x as
much memory for 8 active processes, you get a 5.67x speed-up of that
part alone. You still keep a few extra KiBs of memory for worker tapes
and things like that during the leader's merge, but that's a close to
negligible amount.

> I feel like for sorting, in particular, we probably ought
> to be setting the total memory budget, not the per-process memory
> budget.  Or if not, then any CREATE INDEX benchmarking had better
> compare using scaled values for maintenance_work_mem; otherwise,
> you're measuring the impact of using more memory as much as anything
> else.

As I said, the benchmark was chosen to avoid that (and to be simple
and reproducible). I am currently neutral on the question of whether
or not maintenance_work_mem should be doled out per process or per
sort operation. I do think that making it a per-process allowance is
far closer to what we do for hash joins today, and is simpler.

What's nice about the idea of making the workMem/maintenance_work_mem
budget per sort is that that leaves the leader process with license to
greatly increase the amount of memory it can use for the merge.
Increasing the amount of memory used for the merge will improve things
for longer than it will for workers. I've simulated it already.

> I also think that Amdahl's law is going to pinch pretty severely here.

Doesn't that almost always happen, though? Isn't that what you
generally see with queries that show off the parallel join capability?

> If the final merge phase is a significant percentage of the total
> runtime, picking an algorithm that can't parallelize the final merge
> is going to limit the speedups to small multiples.  That's an OK place
> to be as a result of not having done all the work yet, but you don't
> want to get locked into it.  If we're going to have a substantial
> portion of the work that can never be parallelized, maybe we've picked
> the wrong algorithm.

I suggest that this work be compared to something with similar
constraints. I used Google to try to get some indication of how much
of a difference parallel CREATE INDEX makes in other major database
systems. This is all I could find:

https://www.mssqltips.com/sqlservertip/3100/reduce-time-for-sql-server-index-rebuilds-and-update-statistics/

It seems like the degree of parallelism used for SQL Server tends to
affect index build time in a way that is strikingly similar with what
I've come up with (which may be a coincidence; I don't know anything
about SQL Server). So, I suspect that the performance of this is
fairly good in an apples-to-apples comparison.

Parallelizing merging can hurt or help, because there is a cost in
memory bandwidth (if not I/O) for the extra passes that are used to
keep more CPUs busy, which is kind of analogous to the situation with
polyphase merge. I'm not saying that we shouldn't do that even still,
but I believe that there are sharply diminishing returns. Merging
tuple comparisons are much more expensive than quicksort tuple
comparisons, which tend to benefit from abbreviated keys a lot.

As I've said, there is probably a good argument to be made for
partitioning to increase parallelism. But, that involves risks around
the partitioning being driven by statistics or a cost model, and I
don't think you'd be too on board with the idea of every 

Re: [HACKERS] Implementing full UTF-8 support (aka supporting 0x00)

2016-08-03 Thread Thomas Munro
On Thu, Aug 4, 2016 at 5:16 AM, Craig Ringer  wrote:
> On 3 August 2016 at 22:54, Álvaro Hernández Tortosa  wrote:
>> What would it take to support it? Isn't the varlena header propagated
>> everywhere, which could help infer the real length of the string? Any
>> pointers or suggestions would be welcome.
>
>
> One of the bigger pain points is that our interaction with C library
> collation routines for sorting uses NULL-terminated C strings.  strcoll,
> strxfrm, etc.

That particular bit of the problem would go away if this ever happened:

https://wiki.postgresql.org/wiki/Todo:ICU

ucol_strcoll takes explicit lengths (though it optionally accepts -1 for
null-terminated mode).

http://userguide.icu-project.org/strings#TOC-Using-C-Strings:-NUL-Terminated-vs.-Length-Parameters
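
For illustration, a minimal C sketch of the length-based call (collator
setup, UTF-16 conversion, and error handling are omitted; this is an
assumed example, not code from any existing patch):

#include <unicode/ucol.h>

/* Compare two UTF-16 buffers with explicit lengths, so embedded NULs are
 * harmless.  Returns <0, 0 or >0, strcoll-style. */
static int
compare_with_icu(const UCollator *coll,
                 const UChar *a, int32_t alen,
                 const UChar *b, int32_t blen)
{
    /* Passing -1 as a length would mean "nul-terminated"; explicit lengths
     * sidestep that entirely. */
    UCollationResult r = ucol_strcoll(coll, a, alen, b, blen);

    return (r == UCOL_LESS) ? -1 : (r == UCOL_GREATER) ? 1 : 0;
}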

-- 
Thomas Munro
http://www.enterprisedb.com




Re: [HACKERS] Why we lost Uber as a user

2016-08-03 Thread Kevin Grittner
On Wed, Aug 3, 2016 at 2:15 PM, Tom Lane  wrote:
> "Joshua D. Drake"  writes:
>> On 08/03/2016 11:23 AM, Tom Lane wrote:
>>> I think the realistic answer if you suffer replication-induced corruption
>>> is usually going to be "re-clone that slave", and logical rep doesn't
>>> really offer much gain in that.
>
>> Yes, it actually does. The ability to unsubscribe a set of tables,
>> truncate them and then resubscribe them is vastly superior to having to
>> take a base backup.
>
> True, *if* you can circumscribe the corruption to a relatively small
> part of your database, logical rep might provide more support for a
> partial re-clone.

When I worked with Wisconsin Courts to migrate their databases to
PostgreSQL, we had a DBMS-agnostic logical replication system, and
we had a compare program that could be run off-hours as well as
having that be a background activity for the replication software
to work on during idle time.  Either way, a range of rows based on
primary key was read on each side and hashed, the hashes compared,
and if they didn't match there was a column-by-column compare for
each row in the range, with differences listed.  This is how we
discovered issues like the non-standard handling of backslash
mangling our data.

Personally, I can't imagine running logical replication of
supposedly matching sets of data without something equivalent.

Certainly, the courts had source documents to use for resolving any
question of the correct value on a mismatch, and I would imagine
that many environments would.  If you have a meaningful primary key
(like a court case number, by which the file folder is physically
located), seeing the different values for a specific column in a
specific row makes fixes pretty straightforward.

--
Kevin Grittner
EDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company




Re: [HACKERS] Optimizing numeric SUM() aggregate

2016-08-03 Thread Andres Freund
On 2016-08-03 17:47:20 +, Andrey Borodin wrote:
> I suppose this proves performance impact of a patch. I don't think
> that sum calculation was a common bottleneck, but certainly patch will
> slightly improve performance of a very huge number of queries.

They commonly are in OLAP style workloads.




Re: [HACKERS] Implementing full UTF-8 support (aka supporting 0x00)

2016-08-03 Thread Geoff Winkless
On 3 August 2016 at 20:36, Álvaro Hernández Tortosa  wrote:
> Isn't the correct syntax something like:
>
> select E'\uc080', U&'\c080';
>
> ?
>
> It is a single character, 16 bit unicode sequence (see
> https://www.postgresql.org/docs/current/static/sql-syntax-lexical.html).

No, what you've done there is created the three-byte utf8 sequence \xec8280

# select U&'\c080'::bytea;
  bytea
--
 \xec8280

It's not a UCS2 c080, it's utf8 c080.
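
To make the distinction concrete, here is a small hand-rolled C sketch
(illustration only, not Postgres code) that encodes the code point U+C080
in UTF-8 and produces exactly those three bytes:

#include <stdio.h>

/* Encode a single Unicode code point in UTF-8. */
static int
utf8_encode(unsigned cp, unsigned char *out)
{
    if (cp < 0x80)
    {
        out[0] = (unsigned char) cp;
        return 1;
    }
    if (cp < 0x800)
    {
        out[0] = 0xC0 | (cp >> 6);
        out[1] = 0x80 | (cp & 0x3F);
        return 2;
    }
    if (cp < 0x10000)
    {
        out[0] = 0xE0 | (cp >> 12);
        out[1] = 0x80 | ((cp >> 6) & 0x3F);
        out[2] = 0x80 | (cp & 0x3F);
        return 3;
    }
    out[0] = 0xF0 | (cp >> 18);
    out[1] = 0x80 | ((cp >> 12) & 0x3F);
    out[2] = 0x80 | ((cp >> 6) & 0x3F);
    out[3] = 0x80 | (cp & 0x3F);
    return 4;
}

int
main(void)
{
    unsigned char buf[4];
    int n = utf8_encode(0xC080, buf);   /* the code point named by U&'\c080' */

    for (int i = 0; i < n; i++)
        printf("%02X ", buf[i]);        /* prints: EC 82 80 */
    printf("\n");
    return 0;
}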

Geoff




Re: [HACKERS] Implementing full UTF-8 support (aka supporting 0x00)

2016-08-03 Thread Tom Lane
=?UTF-8?Q?=c3=81lvaro_Hern=c3=a1ndez_Tortosa?=  writes:
>  According to https://en.wikipedia.org/wiki/UTF-8#Codepage_layout 
> the encoding used in Modified UTF-8 is an (otherwise) invalid UTF-8 code 
> point. In short, the \u00 nul is represented (overlong encoding) by the 
> two-byte, 1 character sequence \uc080. These two bytes are invalid UTF-8 
> so should not appear in an otherwise valid UTF-8 string. Yet they are 
> accepted by Postgres (as if Postgres supported Modified UTF-8
> intentionally).

Really?  It sure looks to me like pg_utf8_islegal() would reject this.

We could hack it to allow the case, no doubt, but I concur with Peter's
concern that we'd have trouble with OS-level code that is strict about
what UTF8 allows.  glibc, for example, is known to do very strange things
with strings that it thinks are invalid in the active encoding.

regards, tom lane




Re: [HACKERS] Lossy Index Tuple Enhancement (LITE)

2016-08-03 Thread Claudio Freire
On Wed, Aug 3, 2016 at 4:20 AM, Simon Riggs  wrote:
> == IndexScan ==
>
> Note that the executor code for IndexScan appears identical between
> the two optimizations. The difference between duplicate and range LITE
> tuples is needed only at INSERT time (or UPDATE indexed column to a
> new value).
>
> When we do an IndexScan if we see a LITE tuple we do a scan of the
> linepointer ranges covered by this index tuple (which might eventually
> go to a full block scan). For example with bit1 set we would scan 16
> linepointers (on an 8192 byte block) and with 2 bits set we would scan
> 32 linepointers etc.., not necessarily consecutive ranges. So the
> IndexScan can return potentially many heap rows per index entry, which
> need to be re-checked and may also need to be sorted prior to
> returning them. If no rows are returned we can kill the index pointer,
> just as we do now for btrees, so they will be removed eventually from
> the index without the need for VACUUM. A BitmapIndexScan that sees a
> lossy pointer can also use the lossy TID concept, so we can have
> partially lossy bitmaps.

Wouldn't this risk scanning rows more than once unless it's part of a
bitmap scan?




Re: [HACKERS] Implementing full UTF-8 support (aka supporting 0x00)

2016-08-03 Thread Álvaro Hernández Tortosa



On 03/08/16 21:31, Geoff Winkless wrote:

On 3 August 2016 at 20:13, Álvaro Hernández Tortosa  wrote:

Yet they are accepted by Postgres
(as if Postgres supported Modified UTF-8 intentionally). The character
in psql does not render as a nul but as this symbol: "삀".

Not accepted as valid utf8:

# select E'\xc0\x80';
ERROR:  invalid byte sequence for encoding "UTF8": 0xc0 0x80

You would need a "modified utf8" encoding, I think.

Geoff


Isn't the correct syntax something like:

select E'\uc080', U&'\c080';

?

It is a single character, 16 bit unicode sequence (see 
https://www.postgresql.org/docs/current/static/sql-syntax-lexical.html).



Álvaro

--

Álvaro Hernández Tortosa


---
8Kdata





Re: [HACKERS] Implementing full UTF-8 support (aka supporting 0x00)

2016-08-03 Thread Geoff Winkless
On 3 August 2016 at 20:13, Álvaro Hernández Tortosa  wrote:
> Yet they are accepted by Postgres
> (as if Postgres supported Modified UTF-8 intentionally). The character
> in psql does not render as a nul but as this symbol: "삀".

Not accepted as valid utf8:

# select E'\xc0\x80';
ERROR:  invalid byte sequence for encoding "UTF8": 0xc0 0x80

You would need a "modified utf8" encoding, I think.

Geoff




Re: [HACKERS] Why we lost Uber as a user

2016-08-03 Thread Tom Lane
"Joshua D. Drake"  writes:
> On 08/03/2016 11:23 AM, Tom Lane wrote:
>> I think the realistic answer if you suffer replication-induced corruption
>> is usually going to be "re-clone that slave", and logical rep doesn't
>> really offer much gain in that.

> Yes, it actually does. The ability to unsubscribe a set of tables, 
> truncate them and then resubscribe them is vastly superior to having to 
> take a base backup.

True, *if* you can circumscribe the corruption to a relatively small
part of your database, logical rep might provide more support for a
partial re-clone.

regards, tom lane




Re: [HACKERS] Implementing full UTF-8 support (aka supporting 0x00)

2016-08-03 Thread Álvaro Hernández Tortosa



On 03/08/16 20:14, Álvaro Hernández Tortosa wrote:



On 03/08/16 17:47, Kevin Grittner wrote:
On Wed, Aug 3, 2016 at 9:54 AM, Álvaro Hernández Tortosa 
 wrote:



 What would it take to support it?

Would it be of any value to support "Modified UTF-8"?

https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8



That's nice, but I don't think so.

The problem is that you cannot predict how people would send you 
data, like when importing from other databases. I guess it may work if 
Postgres would implement such UTF-8 variant and also the drivers, but 
that would still require an encoding conversion (i.e., parsing every 
string) to change the 0x00, which seems like a serious performance hit.


It could be worse than nothing, though!

Thanks,

Álvaro



It may indeed work.

According to https://en.wikipedia.org/wiki/UTF-8#Codepage_layout 
the encoding used in Modified UTF-8 is an (otherwise) invalid UTF-8 code 
point. In short, the \u00 nul is represented (overlong encoding) by the 
two-byte, 1 character sequence \uc080. These two bytes are invalid UTF-8 
so should not appear in an otherwise valid UTF-8 string. Yet they are 
accepted by Postgres (as if Postgres supported Modified UTF-8
intentionally). The character in psql does not render as a nul but as
this symbol: "삀".


Given that this works, the process would look like this:

- Parse all input data looking for bytes with hex value 0x00. If they 
appear in the string, they are the null byte.

- Replace that byte with the two bytes 0xc080.
- Reverse the operation when reading.

This is OK but of course a performance hit (searching for 0x00 and 
then augmenting the byte[] or whatever data structure to account for the 
extra byte). A little bit of a PITA, but I guess better than fixing it 
all :)
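
A minimal C sketch of the translation just described (buffer handling
simplified; the function names are invented for illustration):

#include <stddef.h>

/* Replace each 0x00 byte with the overlong pair 0xC0 0x80.
 * 'out' must have room for up to 2 * len bytes; returns the encoded length. */
static size_t
encode_nuls(const unsigned char *in, size_t len, unsigned char *out)
{
    size_t o = 0;

    for (size_t i = 0; i < len; i++)
    {
        if (in[i] == 0x00)
        {
            out[o++] = 0xC0;
            out[o++] = 0x80;
        }
        else
            out[o++] = in[i];
    }
    return o;
}

/* Reverse operation: collapse each 0xC0 0x80 pair back into a single NUL. */
static size_t
decode_nuls(const unsigned char *in, size_t len, unsigned char *out)
{
    size_t o = 0;

    for (size_t i = 0; i < len; i++)
    {
        if (in[i] == 0xC0 && i + 1 < len && in[i + 1] == 0x80)
        {
            out[o++] = 0x00;
            i++;
        }
        else
            out[o++] = in[i];
    }
    return o;
}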



Álvaro


--

Álvaro Hernández Tortosa


---
8Kdata





Re: [HACKERS] Why we lost Uber as a user

2016-08-03 Thread Robert Haas
On Wed, Aug 3, 2016 at 2:23 PM, Tom Lane  wrote:
> Robert Haas  writes:
>> I don't think they are saying that logical replication is more
>> reliable than physical replication, nor do I believe that to be true.
>> I think they are saying that if logical corruption happens, you can
>> fix it by typing SQL statements to UPDATE, INSERT, or DELETE the
>> affected rows, whereas if physical corruption happens, there's no
>> equally clear path to recovery.
>
> Well, that's not an entirely unreasonable point, but I dispute the
> implication that it makes recovery from corruption an easy thing to do.
> How are you going to know what SQL statements to issue?  If the master
> database is changing 24x7, how are you going to keep up with that?

I think in many cases people fix their data using business logic.  For
example, suppose your database goes down and you have to run
pg_resetxlog to get it back up.  You dump-and-restore, as one does,
and find that you can't rebuild one of your unique indexes because
there are now two records with that same PK.  Well, what you do is you
look at them and judge which one has the correct data, often the one
that looks more complete or the one with the newer timestamp.  Or,
maybe you need to merge them somehow.  In my experience helping users
through problems of this type, once you explain the problem to the
user and tell them they have to square it on their end, the support
call ends.  The user may not always be entirely thrilled about having
to, say, validate a problematic record against external sources of
truth, but they usually know how to do it.  Database bugs aren't the
only way that databases become inaccurate.  If the database that they
use to keep track of land ownership in the jurisdiction where I live
says that two different people own the same piece of property,
somewhere there is a paper deed in a filing cabinet.  Fishing that out
to understand what happened may not be fun, but a DBA can explain that
problem to other people in the organization and those people can get
it fixed.  It's a problem, but it's fixable.

On the other hand, if a heap tuple contains invalid infomask bits that
cause an error every time you read the page (this actually happened to
an EnterpriseDB customer!), the DBA can't tell other people how to fix
it and can't fix it personally either.  Instead, the DBA calls me.
While I try to figure out what happened and solve the problem, every
sequential scan on that table fails, so the customer is basically
down.  In contrast, in the logical corruption scenario, one record
might be wrong, but basically everything is still working.  So it's a
difference between a problem that the DBA can work with coworkers to
fix while the system is up, and a problem that the DBA can't fix and
the system is meanwhile down.  That's a big difference.

> I think the realistic answer if you suffer replication-induced corruption
> is usually going to be "re-clone that slave", and logical rep doesn't
> really offer much gain in that.

If you're using multi-master replication, the notion of what's a slave
gets a bit fuzzy, but, apart from that, yes, this is often the
solution.  However, even here, logical replication can be better.
Given the right tools, I can fix up the slave incrementally, comparing
it to the master row by row and updating anything that's wrong.  If I
have to rebuild a physical master, I'm offline.  The difference
doesn't matter if the slave is so badly corrupted that it's unusable,
but it's very common for corruption to involve only a handful of
records, and many users not unreasonably prefer a database with a
couple of corrupted records to one which is totally down.  "Hey, the
payroll record for that Tom Lane guy is messed up, don't cut his
paycheck until we get that straightened out."  "OK, no problem."

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company




Re: [HACKERS] Why we lost Uber as a user

2016-08-03 Thread Joshua D. Drake

On 08/03/2016 11:23 AM, Tom Lane wrote:

Robert Haas  writes:

I don't think they are saying that logical replication is more
reliable than physical replication, nor do I believe that to be true.
I think they are saying that if logical corruption happens, you can
fix it by typing SQL statements to UPDATE, INSERT, or DELETE the
affected rows, whereas if physical corruption happens, there's no
equally clear path to recovery.


Well, that's not an entirely unreasonable point, but I dispute the
implication that it makes recovery from corruption an easy thing to do.
How are you going to know what SQL statements to issue?  If the master
database is changing 24x7, how are you going to keep up with that?

I think the realistic answer if you suffer replication-induced corruption
is usually going to be "re-clone that slave", and logical rep doesn't
really offer much gain in that.


Yes, it actually does. The ability to unsubscribe a set of tables, 
truncate them and then resubscribe them is vastly superior to having to 
take a base backup.


JD



regards, tom lane





--
Command Prompt, Inc.  http://the.postgres.company/
+1-503-667-4564
PostgreSQL Centered full stack support, consulting and development.
Everyone appreciates your honesty, until you are honest with them.
Unless otherwise stated, opinions are my own.




Re: [HACKERS] Parallel tuplesort (for parallel B-Tree index creation)

2016-08-03 Thread Robert Haas
On Mon, Aug 1, 2016 at 6:18 PM, Peter Geoghegan  wrote:
> As some of you know, I've been working on parallel sort. I think I've
> gone as long as I can without feedback on the design (and I see that
> we're accepting stuff for September CF now), so I'd like to share what
> I came up with. This project is something that I've worked on
> inconsistently since late last year. It can be thought of as the
> Postgres 10 follow-up to the 9.6 work on external sorting.

I am glad that you are working on this.

Just a first thought after reading the email:

> As you can see, the parallel case is 2.58x faster (while using more
> memory, though it's not the case that a higher maintenance_work_mem
> setting speeds up the serial/baseline index build). 8 workers are a
> bit faster than 4, but not by much (not shown). 16 are a bit slower,
> but not by much (not shown).
...
> I've seen cases where a CREATE INDEX ended up more than 3x faster,
> though. I benchmarked this case in the interest of simplicity (the
> serial case is intended to be comparable, making the test fair).
> Encouragingly, as you can see from the trace_sort output, the 8
> parallel workers are 5.67x faster at getting to the final merge (a
> merge that even it performs serially). Note that the final merge for
> each CREATE INDEX is comparable (7 runs vs. 8 runs from each of 8
> workers). Not bad!

I'm not going to say it's bad to be able to do things 2-2.5x faster,
but linear scalability this ain't - particularly because your 2.58x
faster case is using up to 7 or 8 times as much memory.  The
single-process case would be faster in that case, too: you could
quicksort.  I feel like for sorting, in particular, we probably ought
to be setting the total memory budget, not the per-process memory
budget.  Or if not, then any CREATE INDEX benchmarking had better
compare using scaled values for maintenance_work_mem; otherwise,
you're measuring the impact of using more memory as much as anything
else.

I also think that Amdahl's law is going to pinch pretty severely here.
If the final merge phase is a significant percentage of the total
runtime, picking an algorithm that can't parallelize the final merge
is going to limit the speedups to small multiples.  That's an OK place
to be as a result of not having done all the work yet, but you don't
want to get locked into it.  If we're going to have a substantial
portion of the work that can never be parallelized, maybe we've picked
the wrong algorithm.

The work on making the logtape infrastructure parallel-aware seems
very interesting and potentially useful for other things.  Sadly, I
don't have time to look at it right now.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company




Re: [HACKERS] Why we lost Uber as a user

2016-08-03 Thread Tom Lane
Robert Haas  writes:
> I don't think they are saying that logical replication is more
> reliable than physical replication, nor do I believe that to be true.
> I think they are saying that if logical corruption happens, you can
> fix it by typing SQL statements to UPDATE, INSERT, or DELETE the
> affected rows, whereas if physical corruption happens, there's no
> equally clear path to recovery.

Well, that's not an entirely unreasonable point, but I dispute the
implication that it makes recovery from corruption an easy thing to do.
How are you going to know what SQL statements to issue?  If the master
database is changing 24x7, how are you going to keep up with that?

I think the realistic answer if you suffer replication-induced corruption
is usually going to be "re-clone that slave", and logical rep doesn't
really offer much gain in that.

regards, tom lane




Re: [HACKERS] Optimizing numeric SUM() aggregate

2016-08-03 Thread Andrey Borodin

> On 3 Aug 2016, at 22:47, Andrey Borodin  wrote:
> 
> make installcheck-world:  tested, failed
> Implements feature:   tested, failed
> Spec compliant:   tested, failed
> Documentation:tested, failed
> 
> This is a review of a patch "Optimizing numeric SUM() aggregate" by Heikki 
> Linnakangas
Oh, I failed to use those checkboxes in the web app. In my review, please read
"tested, failed" as "tested, passed". I'm sorry.

Best regards, Andrey Borodin.



Re: [HACKERS] Implementing full UTF-8 support (aka supporting 0x00)

2016-08-03 Thread Álvaro Hernández Tortosa



On 03/08/16 18:35, Geoff Winkless wrote:

On 3 August 2016 at 15:54, Álvaro Hernández Tortosa  wrote:

 Given that 0x00 is a perfectly legal UTF-8 character, I conclude we're
strictly non-compliant.

It's perhaps worth mentioning that 0x00 is valid ASCII too, and
PostgreSQL has never stored that either.


Then yes, it could also be a problem. But as of today I believe 
solving the problem for UTF-8 would solve the great majority of these
embedded NUL problems.


Álvaro


--

Álvaro Hernández Tortosa


---
8Kdata





Re: [HACKERS] New version numbering practices

2016-08-03 Thread Tom Lane
Robert Haas  writes:
> On Wed, Aug 3, 2016 at 12:12 PM, Peter Eisentraut
>  wrote:
>> One hiccup I found is that server_version_num is not sent to clients.
>> Instead, libpq assembles the numeric version number itself from the
>> string version, and it will fail if it sees only one number (e.g.,
>> 10devel).  It will then set the version number to 0 for "unknown".

Ugh.

> This pretty much sucks.  I suppose we could at least alleviate the
> problem by back-patching some intelligence about the new scheme into
> back-branches, but of course that will only help people if they
> install newer minor releases.

Yeah.  I doubt there is much reason to assume that people would be
using, say, a 9.5.5 psql and a 9.5.3 libpq or vice versa.  Whatever
the current client behavior is is what people will see.

Having said that, this sort of problem is one reason we wanted to give
ourselves a full year to implement the new scheme.  If we put some
appropriate fix into the back branches *now*, there would be a fair
amount of daylight for that to spread into the field before any users
would be seeing v10 servers in practice.

So it seems like libpq's parsing of server_version_num is
something we definitely want to fix ASAP in all back branches.
Is there anything else that's particularly bad?

regards, tom lane




Re: [HACKERS] Implementing full UTF-8 support (aka supporting 0x00)

2016-08-03 Thread Álvaro Hernández Tortosa



On 03/08/16 17:47, Kevin Grittner wrote:

On Wed, Aug 3, 2016 at 9:54 AM, Álvaro Hernández Tortosa  
wrote:


 What would it take to support it?

Would it be of any value to support "Modified UTF-8"?

https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8



That's nice, but I don't think so.

The problem is that you cannot predict how people would send you 
data, like when importing from other databases. I guess it may work if 
Postgres would implement such UTF-8 variant and also the drivers, but 
that would still require an encoding conversion (i.e., parsing every 
string) to change the 0x00, which seems like a serious performance hit.


It could be worse than nothing, though!

Thanks,

Álvaro

--

Álvaro Hernández Tortosa


---
8Kdata





Re: [HACKERS] Why we lost Uber as a user

2016-08-03 Thread Robert Haas
On Tue, Aug 2, 2016 at 5:14 PM, Tom Lane  wrote:
> Stephen Frost  writes:
>> With physical replication, there is the concern that a bug in *just* the
>> physical (WAL) side of things could cause corruption.
>
> Right.  But with logical replication, there's the same risk that the
> master's state could be fine but a replication bug creates corruption on
> the slave.
>
> Assuming that the logical replication works by issuing valid SQL commands
> to the slave, one could hope that this sort of "corruption" only extends
> to having valid data on the slave that fails to match the master.
> But that's still not a good state to be in.  And to the extent that
> performance concerns lead the implementation to bypass some levels of the
> SQL engine, you can easily lose that guarantee too.
>
> In short, I think Uber's position that logical replication is somehow more
> reliable than physical is just wishful thinking.  If anything, my money
> would be on the other way around: there's a lot less mechanism that can go
> wrong in physical replication.  Which is not to say there aren't good
> reasons to use logical replication; I just do not believe that one.

I don't think they are saying that logical replication is more
reliable than physical replication, nor do I believe that to be true.
I think they are saying that if logical corruption happens, you can
fix it by typing SQL statements to UPDATE, INSERT, or DELETE the
affected rows, whereas if physical corruption happens, there's no
equally clear path to recovery.  If an index is damaged, you can
recreate it; if a heap page is damaged such that you can no longer
scan the table, you're going to need expert assistance.

And I think there's some point to that.  I agree with the general
sentiment that they could have gotten further and been more successful
with PostgreSQL if they had some expert advice, but I think it's
indisputable that recovering a physically corrupted database is
generally a lot more painful than one where you only have to fix up
some damaged data.  Whether we really have data-corrupting WAL-replay
bugs sufficiently frequently to make this an ongoing issue rather than
a one-time event is also debatable, but nonetheless I don't think
their point is completely invalid.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company




Re: [HACKERS] Implementing full UTF-8 support (aka supporting 0x00)

2016-08-03 Thread Álvaro Hernández Tortosa



On 03/08/16 17:23, Tom Lane wrote:

=?UTF-8?Q?=c3=81lvaro_Hern=c3=a1ndez_Tortosa?=  writes:

  As has been previously discussed (see
https://www.postgresql.org/message-id/BAY7-F17FFE0E324AB3B642C547E96890%40phx.gbl
for instance) varlena fields cannot accept the literal 0x00 value.

Yup.


  What would it take to support it?

One key reason why that's hard is that datatype input and output
functions use nul-terminated C strings as the representation of the
text form of any datatype.  We can't readily change that API without
breaking huge amounts of code, much of it not under the core project's
control.

There may be other places where nul-terminated strings would be a hazard
(mumble fgets mumble), but offhand that API seems like the major problem
so far as the backend is concerned.

There would be a slew of client-side problems as well.  For example this
would assuredly break psql and pg_dump, along with every other client that
supposes that it can treat PQgetvalue() as returning a nul-terminated
string.  This end of it would possibly be even worse than fixing the
backend, because so little of the affected code is under our control.

In short, the problem is not with having an embedded nul in a stored
text value.  The problem is the reams of code that suppose that the
text representation of any data value is a nul-terminated C string.

regards, tom lane


Wow. That seems like a daunting task.

I guess, then, that even implementing a new datatype based on bytea
but using the text IO functions to show up as text (not
send/recv) would not work either, right?


Thanks for the input,

Álvaro


--

Álvaro Hernández Tortosa


---
8Kdata





Re: [HACKERS] PostmasterContext survives into parallel workers!?

2016-08-03 Thread Tom Lane
Alvaro Herrera  writes:
> Tom Lane wrote:
>> The attached patch passes "make check-world" for me.  Can you check it
>> against BDR?

> Just checked.  It works fine.

Thanks!  I'll push it shortly.

regards, tom lane




Re: [HACKERS] New version numbering practices

2016-08-03 Thread Robert Haas
On Wed, Aug 3, 2016 at 12:12 PM, Peter Eisentraut
 wrote:
> One hiccup I found is that server_version_num is not sent to clients.
> Instead, libpq assembles the numeric version number itself from the
> string version, and it will fail if it sees only one number (e.g.,
> 10devel).  It will then set the version number to 0 for "unknown".
> Client code such as psql and pg_dump is coded so that it will then fall
> back to code for the oldest server version it happens to support (less
> than 8.1 at times).  So in other words, old psql plus new server up
> until production release will have many \d commands failing.  Once the
> release becomes 10.0, it will work again.  (It will still think in terms
> of three-component versions, but it won't make a difference in practice.)

This pretty much sucks.  I suppose we could at least alleviate the
problem by back-patching some intelligence about the new scheme into
back-branches, but of course that will only help people if they
install newer minor releases.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company




Re: [HACKERS] Way to access LSN (for each transaction) by directly talking to postgres?

2016-08-03 Thread Joshua Bay
Thanks Michael,

Could you please tell me how I can get the LSN of each transaction in the
decoder plugin?


On Wed, Aug 3, 2016 at 2:08 AM, Michael Paquier 
wrote:

> On Wed, Aug 3, 2016 at 3:00 PM, Michael Paquier
>  wrote:
> > On Wed, Aug 3, 2016 at 12:37 PM, Joshua Bay 
> wrote:
> >> Could you please let me know if there is a way to get LSN of each
> >> transaction by directly communicating with Postgres server and NOT by
> >> accessing logs.
> >
> > Logical decoding is one way.
>
> And I just saw your other message... What I just meant here is that if
> you use a decoder plugin that just emits information at transaction
> begin/commit you can directly get this information. There is no need
> to directly look at the WAL logs, the server does it for you. And it
> offers a good cover regarding the information that has already been
> consumed or not.
>
> (Btw, avoid sending emails across multiple mailing lists, particularly
> pgsql-committers which is not aimed for that).
> --
> Michael
>


Re: [HACKERS] New version numbering practices

2016-08-03 Thread David G. Johnston
On Wed, Aug 3, 2016 at 1:20 PM, Tom Lane  wrote:

> Greg Stark  writes:
> > Right now git-describe --tags on a random revision between 9.4
> > and 9.5 will print something like REL9_4_BETA1-1973-g85c25fd or
> > something like REL9_5_BETA2-33-g55a2cc8 if it happens to be after a
> > beta. It's really hard to tell what release the revision you're on is
> > actually between from that.
>
> That command is kinda useless AFAICT :-(
>

​Mostly as a function of a lack of definition as to what it wants to show.
It would be good to at least ensure that the shared commit between master and a
release branch is tagged on master.

git describe --tags REL9_6_BETA1~1 should show REL9_5_0 (or, e.g.,
REL9_5_GOLIVE if we cannot reasonably put the 9.5.0 tag on master) and not
REL9_5_ALPHA1-*

David J.


Re: [HACKERS] PostmasterContext survives into parallel workers!?

2016-08-03 Thread Alvaro Herrera
Tom Lane wrote:
> Alvaro Herrera  writes:
> > Tom Lane wrote:
> >> It looks to me like the reason for it is simply not having bothered to
> >> copy the rw->rw_worker data to somewhere that would survive deletion
> >> of the PostmasterContext.  I wonder though if anyone remembers a more
> >> fundamental reason?  Surely the bgworker is not supposed to touch any
> >> of the rest of the BackgroundWorkerList?
> 
> > I just checked BDR, which is the more complex code using workers I know
> > of, and I don't see any reason why this cannot be changed.
> 
> The attached patch passes "make check-world" for me.  Can you check it
> against BDR?

Just checked.  It works fine.

-- 
Álvaro Herrerahttp://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services




Re: [HACKERS] Optimizing numeric SUM() aggregate

2016-08-03 Thread Andrey Borodin
The following review has been posted through the commitfest application:
make installcheck-world:  tested, failed
Implements feature:   tested, failed
Spec compliant:   tested, failed
Documentation:tested, failed

This is a review of a patch "Optimizing numeric SUM() aggregate" by Heikki 
Linnakangas
https://www.postgresql.org/message-id/flat/c0545351-a467-5b76-6d46-4840d1ea8...@iki.fi#c0545351-a467-5b76-6d46-4840d1ea8...@iki.fi
This review contains summarization of all my posts regarding this patch and a 
little bit of additional suggestions.

Contents & Purpose
==
This patch improves the performance of aggregate computation by delaying
carry propagation between NBASE digits in the arbitrary-length arithmetic.
This is possible because a 32-bit int is used to store each NBASE digit,
where NBASE is 10000.
The patch changes only numeric.c. The directory containing numeric.c has no
README, so all documentation of the patch is in the comments. I consider the
documentation sufficient.
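
A minimal C sketch of the delayed-carry idea (an illustration of the
technique only, with assumed names and digit layout, not the patch's
actual numeric.c changes):

#include <stdint.h>

#define NBASE 10000

/* Add 'addend' into 'acc' digit-wise without carrying.  Each digit is kept
 * in a 32-bit int, so a bounded number of such additions fits before any
 * digit can overflow; only then do carries have to be propagated. */
static void
add_digits(int32_t *acc, const int32_t *addend, int ndigits)
{
    for (int i = 0; i < ndigits; i++)
        acc[i] += addend[i];
}

/* Propagate accumulated carries so every digit is again < NBASE.
 * (A real implementation must also handle carry out of the top digit.) */
static void
propagate_carries(int32_t *digits, int ndigits)
{
    int32_t carry = 0;

    for (int i = ndigits - 1; i >= 0; i--)
    {
        int32_t v = digits[i] + carry;

        carry = v / NBASE;
        digits[i] = v % NBASE;
    }
}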

Initial Run
===
The patch applies cleanly to current HEAD. Regression tests pass before and
after patch application.

Performance
===
I've tested patch with this query
CREATE TABLE avg_test AS SELECT (random() * 999)::decimal(5,2) as d FROM 
generate_series(1, 100) s;
SELECT avg(d) a , avg(d+0) s0 , avg(d+1) s1 , avg(d+2) s2, avg(d+3) s3 , 
avg(d+4) s4 , avg(d+5) s5, avg(d+6) s6 , avg(d+7) s7, avg(d+8) s8 , avg(d+9) s9 
FROM avg_test;

Test specs were: Ubuntu 14 LTS VM, dynamic RAM, hypervisor Windows
Server 2016, SSD disk, core i5-2500. Configuration: out of the box master make.

Over 10 test runs, timing of the SELECT statement: AVG 3739.8 ms, STD 435.4193.
With the patch applied (as is): AVG 3017.8 ms, STD 319.893.

I suppose this demonstrates the performance impact of the patch. I don't think
that sum calculation was a common bottleneck, but the patch will certainly
slightly improve the performance of a very large number of queries.

Suggestions
===
1. Currently overflow is carried every  addition. I suggest that it is
possible to do it only every (INT32_MAX - INT32_MAX / NBASE) / (NBASE - 1)
additions. Please note that with this overflow count it will be necessary to
check whether two finishing carry passes are required.
2. The aggregate and numeric regression tests do not contain any test case
covering overflows. I recommend adding a sum of 1 000 000 values of 99 999 999
each. Maybe you will come up with more clever overflow testing.

Conclusion
==
This patch is important and thorough work, but I'd suggest considering some
polishing on the two points listed above.

The new status of this patch is: Waiting on Author



Re: [HACKERS] Way to access LSN (for each transaction) by directly talking to postgres?

2016-08-03 Thread Joshua Bay
Thanks for responses!

The problem I wanted to solve was to find the (global) order of commits
across the postgres cluster. So, my attempt was to use the LSN.



On Wed, Aug 3, 2016 at 9:47 AM, Craig Ringer  wrote:

>
>
> On 3 August 2016 at 11:37, Joshua Bay  wrote:
>
>> Hi,
>>
>> Could you please let me know if there is a way to get LSN of each
>> transaction by directly communicating with Postgres server and NOT by
>> accessing logs.
>>
>
>
> To what end? What problem are you trying to solve?
>
> What LSN, exactly? The LSN of the first write and xid allocation? The LSN
> of the commit record? What if it's a complex commit like with prepared
> xacts?
>
>
> --
>  Craig Ringer   http://www.2ndQuadrant.com/
>  PostgreSQL Development, 24x7 Support, Training & Services
>


Re: [HACKERS] Increasing timeout of poll_query_until for TAP tests

2016-08-03 Thread Alvaro Herrera
Michael Paquier wrote:
> On Wed, Aug 3, 2016 at 7:21 AM, Alvaro Herrera  
> wrote:

> > Why not capture both items in a single select, such as in the attached
> > patch?
> 
> Let me test this
> [... A while after ...]
> This looks to work properly. 12 runs in a row have passed.

Okay, applied that way.

BTW, one-line long queries look awful in that perl code.  I don't
propose to change anything now, but I propose that long queries be
split using here-docs in new code,

$node->safe_psql(<

[HACKERS] [sqlsmith] FailedAssertion("!(k == indices_count)", File: "tsvector_op.c", Line: 511)

2016-08-03 Thread Andreas Seltenreich
Hi,

the following statement triggers an assertion in tsearch:

select ts_delete(array_to_tsvector('{smith,smith,smith}'::text[]),  
'{smith,smith}'::text[]);
-- TRAP: FailedAssertion("!(k == indices_count)", File: "tsvector_op.c", Line: 
511)

regards,
Andreas




Re: [HACKERS] New version numbering practices

2016-08-03 Thread Tom Lane
Greg Stark  writes:
> It would also be nice to have a tag or branch name for the development
> branch.

Uh, the branch name is "master".  I doubt we want to change that.
And you can't really have a tag on a branch, AFAIK --- a tag names
a specific commit and can't ever be moved.  I think the fundamental
difference between tags and branches is exactly that a branch is a
movable commit-pointer while a tag is fixed.

> Right now git-describe --tags on a random revision between 9.4
> and 9.5 will print something like REL9_4_BETA1-1973-g85c25fd or
> something like REL9_5_BETA2-33-g55a2cc8 if it happens to be after a
> beta. It's really hard to tell what release the revision you're on is
> actually between from that.

That command is kinda useless AFAICT :-(

> There's also a git feature I just noticed called "annotations". Tags
> can be "annotated" with a commit message and then git describe will
> only use those and skip other tags that are not annotated. Currently
> we appear to have no annotated tags. (which confuses me because there
> are clearly commit messages for the tags with logs like "Stamp
> 9.5alpha" c.f. f78329d594c2fe893f9174d5b3da7d3fbc6dd8b6)

Those commit messages are for the stamping commits, ie the ones that
put a new version number into configure.in and so on.  Tags get applied
to those commits after the fact, once we've verified that tarballs made
from those commits are publishable.  That is, the tags are meant to mark
which tree state the published tarballs were actually made from.  It has
happened that we've fixed something and re-wrapped after the stamping
commit, in which case the tag would be placed on a later commit.

I was just wondering whether it would be worth starting to use "git tag -a".
It doesn't seem to buy much, and if it would cause git describe to ignore
all non-annotated tags, I should think that that's a misfeature.

> It would be nice if git describe on a random revision between 9.4 and
> 9.5 always printed something like REL_09_05_DEV-* which indicated it
> was some revision committed after 9.4 was branched and before 9.5.0
> was released. (That would sort incorrectly so maybe some further tweak
> is needed?)

I think you need to file a feature request with the git folk.

regards, tom lane




[HACKERS] [sqlsmith] FailedAssertion("!(XLogCtl->Insert.exclusiveBackup)", File: "xlog.c", Line: 10200)

2016-08-03 Thread Andreas Seltenreich
Hi,

testing with sqlsmith shows that the following assertion doesn't hold:

FailedAssertion("!(XLogCtl->Insert.exclusiveBackup)", File: "xlog.c", Line: 
10200)

The triggering statements always contain a call to pg_start_backup with
the third argument 'true', i.e. it's trying to start an exlusive backup.

I didn't manage to put together a stand-alone testcase yet.

regards,
Andreas




Re: [HACKERS] Implementing full UTF-8 support (aka supporting 0x00)

2016-08-03 Thread Craig Ringer
On 3 August 2016 at 22:54, Álvaro Hernández Tortosa  wrote:

>
> Hi list.
>
> As has been previously discussed (see
> https://www.postgresql.org/message-id/BAY7-F17FFE0E324AB3B642C547E96890%40phx.gbl
> for instance) varlena fields cannot accept the literal 0x00 value. Sure,
> you can use bytea, but this is hardly a good solution. The problem seems to be
> hitting some use cases, like:
>
> - People migrating data from other databases (apart from PostgreSQL, I
> don't know of any other database which suffers the same problem).
> - People using drivers which use UTF-8 or equivalent encodings by default
> (Java for example)
>
> Given that 0x00 is a perfectly legal UTF-8 character, I conclude we're
> strictly non-compliant. And given the general Postgres policy regarding
> standards compliance and the people being hit by this, I think it should be
> addressed. Especially since all the usual fixes are a real PITA (re-parsing,
> re-generating strings, which is very expensive, or dropping data).
>
> What would it take to support it? Isn't the varlena header propagated
> everywhere, which could help infer the real length of the string? Any
> pointers or suggestions would be welcome.


One of the bigger pain points is that our interaction with C library
collation routines for sorting uses NULL-terminated C strings.  strcoll,
strxfrm, etc.
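
A tiny C illustration of that pain point (a made-up example, not Postgres
code): two values that differ only after an embedded NUL look identical to
the C-string APIs:

#include <stdio.h>
#include <string.h>

int
main(void)
{
    const char a[] = {'a', '\0', 'x', '\0'};
    const char b[] = {'a', '\0', 'y', '\0'};

    /* strcoll()/strcmp() see only "a"; the bytes after the NUL are ignored.
     * (setlocale() omitted, so this runs in the default "C" locale.) */
    printf("%d\n", strcoll(a, b));   /* prints 0: "equal" */
    return 0;
}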

-- 
 Craig Ringer   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services


Re: [HACKERS] New version numbering practices

2016-08-03 Thread Greg Stark
On Wed, Aug 3, 2016 at 5:27 PM, Tom Lane  wrote:
> Well, the rule would be that "REL_xx" is a branch, "REL_xx_yy" is a
> release tag.  Neither of these is confusable with old-style
> branch or tag names.  The alternative seems to be saying that
> "REL_xx_STABLE" is a branch while "REL_xx_yy" is a release tag.
> That works but it doesn't seem to have all that much to recommend it;
> unless there is code in the buildfarm or elsewhere that really wants
> to see _STABLE in the branch names.

It would also be nice to have a tag or branch name for the development
branch. Right now git-describe --tags on a random revision between 9.4
and 9.5 will print something like REL9_4_BETA1-1973-g85c25fd or
something like REL9_5_BETA2-33-g55a2cc8 if it happens to be after a
beta. It's really hard to tell what release the revision you're on is
actually between from that.

There's also a git feature I just noticed called "annotations". Tags
can be "annotated" with a commit message and then git describe will
only use those and skip other tags that are not annotated. Currently
we appear to have no annotated tags. (which confuses me because there
are clearly commit messages for the tags with logs like "Stamp
9.5alpha" c.f. f78329d594c2fe893f9174d5b3da7d3fbc6dd8b6)

It would be nice if git describe on a random revision between 9.4 and
9.5 always printed something like REL_09_05_DEV-* which indicated it
was some revision committed after 9.4 was branched and before 9.5.0
was released. (That would sort incorrectly so maybe some further tweak
is needed?)

-- 
greg




Re: [HACKERS] Detecting skipped data from logical slots (data silently skipped)

2016-08-03 Thread Craig Ringer
On 3 August 2016 at 21:55, Greg Stark  wrote:

> I didn't follow all of that but I wonder if it isn't just that when you
> restore from backup you should be creating a new slot?
>

Yes, you should.

But when you restore a Pg instance from a disk snapshot or similar it can't
tell it isn't the original of itself. It has no way to know "I shouldn't
connect to this slot and consume data from it". Right now the upstream has
no way to tell it "that data's gone, sorry" - it just ignores the
downstream's requested start position and starts from wherever it thinks
the downstream is up to.

With physical replication we'll detect such a problem. With logical
replication we'll silently continue with an invisible gap in the history.

-- 
 Craig Ringer   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services


Re: [HACKERS] New version numbering practices

2016-08-03 Thread David G. Johnston
On Wed, Aug 3, 2016 at 12:51 PM, Greg Stark  wrote:

> On Tue, Aug 2, 2016 at 2:57 PM, Tom Lane  wrote:
> > I thought we'd pretty much done that cleanup during the cvs->git
> > conversion?
>
> I guess I'm talking about tags. I'm not clear on the distinction
> between tags and branches names in git.
>

Ignoring git for a moment, our setup is that each major version (9.6) gets
a branch when it is finally released.  For each minor release the last
commit on each branch that is included in the release is tagged with both
the branch/major-version AND the patch/minor-version.

I'm sure the internet can provide a better overview of the differences,
within git, between tags and branches.  One way to look at it, though, is
that tags are explicit labels pointing to commits, whereas branches are
implicit labels.

When you do:  git checkout branch you are asking for whatever HEAD - for
that branch - points to.  Committing to a branch causes a new commit to be
created and then HEAD - for that branch - to be moved.  So you are, by
default, dealing with the implicit HEAD label within the branch "namespace".

David J.


Re: [HACKERS] New version numbering practices

2016-08-03 Thread Tom Lane
Greg Stark  writes:
> On Tue, Aug 2, 2016 at 2:57 PM, Tom Lane  wrote:
>> I thought we'd pretty much done that cleanup during the cvs->git
>> conversion?

> I guess I'm talking about tags. I'm not clear on the distinction
> between tags and branches names in git.

> Prior to 8.0.0 we seem to have tagged the first release with a tag
> that doesn't include _0 for the minor release and afterwards with one
> that does:

> REL2_0
> REL6_1
> REL6_2
> REL6_4
> REL7_0
> REL7_1
> REL7_2
> REL7_3
> REL7_4
> REL8_0_0
> REL8_1_0

Ah.  Well, that's a reflection of what those releases were actually
called at the time: we did not start using ".0" on major releases
until the 8.0 branch.  Compare tarball names in
https://www.postgresql.org/ftp/source/v7.4/
https://www.postgresql.org/ftp/source/v8.0/

So we could go back and add tags like REL7_4_0 but it would be historical
revisionism.  Is there a particular reason to do it?  It seems like stuff
that far back is only of historical interest, so I'm kind of -1 on
corrupting the historical record with retroactive labels.

BTW, there are some missing tags back there, for instance no REL7_0_1.
I believe this is because we couldn't exactly identify which commit
corresponded to the published tarballs.  I'd be for filling in those gaps
if anyone can figure it out.

regards, tom lane




Re: [HACKERS] New version numbering practices

2016-08-03 Thread Greg Stark
On Tue, Aug 2, 2016 at 2:57 PM, Tom Lane  wrote:
> I thought we'd pretty much done that cleanup during the cvs->git
> conversion?

I guess I'm talking about tags. I'm not clear on the distinction
between tags and branches names in git.

Prior to 8.0.0 we seem to have tagged the first release with a tag
that doesn't include _0 for the minor release and afterwards with one
that does:

REL2_0
REL6_1
REL6_2
REL6_4
REL7_0
REL7_1
REL7_2
REL7_3
REL7_4
REL8_0_0
REL8_1_0
...



-- 
greg




Re: [HACKERS] OldSnapshotTimemapLock information is missing in monitoring.sgml file

2016-08-03 Thread Ashutosh Sharma
> Pushed with minor fixes (white space and bumping the count for the tranche).
>
Thanks. I just forgot to increase the LWLock count by one.

With Regards,
Ashutosh Sharma
EnterpriseDB:http://www.enterprisedb.com




Re: [HACKERS] Implementing full UTF-8 support (aka supporting 0x00)

2016-08-03 Thread Geoff Winkless
On 3 August 2016 at 15:54, Álvaro Hernández Tortosa  wrote:
> Given that 0x00 is a perfectly legal UTF-8 character, I conclude we're
> strictly non-compliant.

It's perhaps worth mentioning that 0x00 is valid ASCII too, and
PostgreSQL has never stored that either.

If you want to start quoting standards, there is in fact specific
mention in the ANSI spec of null terminators in passing strings to
host languages, so if postgresql stored NULs in that way we would end
up with parameters that we couldn't pass to UDFs in a
standards-compliant way.

Geoff




Re: [HACKERS] New version numbering practices

2016-08-03 Thread Tom Lane
Peter Eisentraut  writes:
> On 8/1/16 9:10 PM, Alvaro Herrera wrote:
>> I don't see any value to the _STABLE
>> suffix, given the way we treat branches.

> It would be nice to be able to tell easily from convention whether
> something is a branch or a tag.

Well, the rule would be that "REL_xx" is a branch, "REL_xx_yy" is a
release tag.  Neither of these is confusable with old-style
branch or tag names.  The alternative seems to be saying that
"REL_xx_STABLE" is a branch while "REL_xx_yy" is a release tag.
That works but it doesn't seem to have all that much to recommend it;
unless there is code in the buildfarm or elsewhere that really wants
to see _STABLE in the branch names.

> Anyway, this is a question for many months from now.

True, but we might as well make the decisions now.

regards, tom lane




Re: [HACKERS] Slowness of extended protocol

2016-08-03 Thread Vladimir Sitnikov
Tatsuo Ishii :

> Doesn't this patch break an existing behavior of unnamed statements?
> That is, an unnamed statement shall exist until next parse message
> using unnamed statement received. It is possible to use the same
> unnamed statement multiple times in a transaction.
>

>Doesn't this patch break an existing behavior of unnamed statements?

As expected, the behavior for unnamed statements is broken (some
tests from make check-world fail with segmentation fault).
So some more sophisticated patch is required.

For those who are interested, I've created a Github-Travis mirror that
automatically runs several regression suites for the given postgresql
patch: https://github.com/vlsi/postgres
I think it will simplify running regression tests for postgresql patches
against multiple suites.

Current tests include: make check, make check-world, and pgjdbc test suite
(except XA and SSL).

For instance, here's the link to my patch
https://github.com/vlsi/postgres/pull/1
Feel free to file PRs for travis branch of https://github.com/vlsi/postgres so
the patch gets tested.

Vladimir


Re: [HACKERS] New version numbering practices

2016-08-03 Thread Peter Eisentraut
On 8/1/16 11:49 AM, Tom Lane wrote:
> Somebody needs to come up with a patch implementing this changeover.

Here is such a patch.  It does not yet implement:

> (External code will need some cue as
> to how to format displays from PG_VERSION_NUM, so we should have a hard
> and fast rule that major >= 10 means new style.)

e.g., in psql, but that's a UI issue that can be sorted out.

One hiccup I found is that server_version_num is not sent to clients.
Instead, libpq assembles the numeric version number itself from the
string version, and it will fail if it sees only one number (e.g.,
10devel).  It will then set the version number to 0 for "unknown".
Client code such as psql and pg_dump is coded so that it will then fall
back to code for the oldest server version it happens to support (less
than 8.1 at times).  So in other words, old psql plus new server up
until production release will have many \d commands failing.  Once the
release becomes 10.0, it will work again.  (It will still think in terms
of three-component versions, but it won't make a difference in practice.)
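
For illustration, a hedged C sketch (not libpq's actual code) of how a
client-side parser could accept both the old two-or-three-number style and
the new single-number style:

#include <stdio.h>

/* Map a server_version string to a PG_VERSION_NUM-style integer.
 * A parser that insists on two dot-separated numbers turns "10devel"
 * into 0 ("unknown"); this variant special-cases the one-number form. */
static int
parse_server_version(const char *vs)
{
    int major = 0, minor = 0, rev = 0;
    int cnt = sscanf(vs, "%d.%d.%d", &major, &minor, &rev);

    if (cnt >= 1 && major >= 10)   /* new style: 10devel -> 100000, 10.1 -> 100001 */
        return major * 10000 + (cnt >= 2 ? minor : 0);
    if (cnt >= 2)                  /* old style: 9.6.3 -> 90603 */
        return major * 10000 + minor * 100 + rev;
    return 0;                      /* unknown */
}

int
main(void)
{
    printf("%d %d %d\n",
           parse_server_version("9.6.3"),    /* 90603 */
           parse_server_version("10devel"),  /* 100000 */
           parse_server_version("10.1"));    /* 100001 */
    return 0;
}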

Some possibilities to make this slightly better:

- Report server_version_num to clients, and use that.

- If libpq can't parse the version number, it should set it to its own
version number instead of 0.  Alternatively,

- If psql sees a server version number of 0, it should assume its own
version number.

- Similarly for pg_dump, although old pg_dump with new server is not
really supported anyway.


Apart from some UI issues, the attached patch passes check-world.

-- 
Peter Eisentraut  http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

From 9d9513ea7293be262ea691ad8057ad81836ef8c4 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut 
Date: Wed, 3 Aug 2016 11:52:04 -0400
Subject: [PATCH] Stamp version 10devel

WIP
---
 configure | 20 ++--
 configure.in  |  4 ++--
 doc/bug.template  |  2 +-
 doc/src/sgml/runtime.sgml | 31 ---
 src/backend/catalog/genbki.pl |  4 ++--
 src/backend/utils/init/miscinit.c | 18 +++---
 src/bin/pg_upgrade/check.c|  6 +++---
 src/bin/pg_upgrade/server.c   |  2 +-
 src/include/pg_config.h.win32 |  8 
 src/interfaces/libpq/fe-exec.c| 17 +++--
 src/interfaces/libpq/libpq.rc.in  |  8 
 src/port/win32ver.rc  |  4 ++--
 src/tools/version_stamp.pl| 14 ++
 13 files changed, 73 insertions(+), 65 deletions(-)

diff --git a/configure b/configure
index b49cc11..45c8eef 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for PostgreSQL 9.6beta3.
+# Generated by GNU Autoconf 2.69 for PostgreSQL 10devel.
 #
 # Report bugs to .
 #
@@ -582,8 +582,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='PostgreSQL'
 PACKAGE_TARNAME='postgresql'
-PACKAGE_VERSION='9.6beta3'
-PACKAGE_STRING='PostgreSQL 9.6beta3'
+PACKAGE_VERSION='10devel'
+PACKAGE_STRING='PostgreSQL 10devel'
 PACKAGE_BUGREPORT='pgsql-b...@postgresql.org'
 PACKAGE_URL=''
 
@@ -1398,7 +1398,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures PostgreSQL 9.6beta3 to adapt to many kinds of systems.
+\`configure' configures PostgreSQL 10devel to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1463,7 +1463,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
- short | recursive ) echo "Configuration of PostgreSQL 9.6beta3:";;
+ short | recursive ) echo "Configuration of PostgreSQL 10devel:";;
esac
   cat <<\_ACEOF
 
@@ -1615,7 +1615,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-PostgreSQL configure 9.6beta3
+PostgreSQL configure 10devel
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2326,7 +2326,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by PostgreSQL $as_me 9.6beta3, which was
+It was created by PostgreSQL $as_me 10devel, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -2711,7 +2711,7 @@ ac_configure="$SHELL $ac_aux_dir/configure"  # Please don't use this var.
 configure_args=$ac_configure_args
 
 
-PG_MAJORVERSION=`expr "$PACKAGE_VERSION" : '\([0-9][0-9]*\.[0-9][0-9]*\)'`
+PG_MAJORVERSION=`expr "$PACKAGE_VERSION" : '\([0-9][0-9]*\)'`
 
 
 cat >>confdefs.h <<_ACEOF
@@ -16433,7 +16433,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input 

Re: [HACKERS] Implementing full UTF-8 support (aka supporting 0x00)

2016-08-03 Thread Peter Eisentraut
On 8/3/16 11:47 AM, Kevin Grittner wrote:
> On Wed, Aug 3, 2016 at 9:54 AM, Álvaro Hernández Tortosa  
> wrote:
> 
>> What would it take to support it?
> 
> Would it be of any value to support "Modified UTF-8"?
> 
> https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8

Will this work with OS libraries?


-- 
Peter Eisentraut  http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services




Re: [HACKERS] New version numbering practices

2016-08-03 Thread Peter Eisentraut
On 8/1/16 9:10 PM, Alvaro Herrera wrote:
> I don't see any value to the _STABLE
> suffix, given the way we treat branches.

It would be nice to be able to tell easily from convention whether
something is a branch or a tag.

Anyway, this is a question for many months from now.

-- 
Peter Eisentraut  http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services




Re: [HACKERS] Possible duplicate release of buffer lock.

2016-08-03 Thread Tom Lane
Kyotaro HORIGUCHI  writes:
> My point here is that if concurrent deletion can't be performed by
> the current implementation, this while loop could be removed and
> immediately error out or log a message,

>> if (P_ISDELETED(opaque) || opaque->btpo_next != target)
>> {
>> elog(ERROR, "no left sibling of page %d (concurrent deletion?) in \"%s\"",..

That would certainly break things: there are valid cases for the
loop to need to iterate, specifically where the left sibling got
split before we could acquire lock on it.

> or, the while loop at least should stop before overshooting the
> target.

>> while (P_ISDELETED(opaque) || opaque->btpo_next != target)
>> {
>> /* step right one page */
>> leftsib = opaque->btpo_next;
>> _bt_relbuf(rel, lbuf);
>> if (leftsib == target || leftsib == P_NONE)
>> {
>> elog(ERROR, "no left sibling of page %d (concurrent deletion?) in \"%s\"",..

Huh?  Surely that added test condition could never be true because
of the second part of the while() test.

regards, tom lane




Re: [HACKERS] Implementing full UTF-8 support (aka supporting 0x00)

2016-08-03 Thread Kevin Grittner
On Wed, Aug 3, 2016 at 9:54 AM, Álvaro Hernández Tortosa  
wrote:

> What would it take to support it?

Would it be of any value to support "Modified UTF-8"?

https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8
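
(For context, and as a rough standalone sketch rather than anything
PostgreSQL-specific: Modified UTF-8, as used by Java's JNI, keeps zero
bytes out of the encoded stream by writing U+0000 as the overlong pair
0xC0 0x80, roughly like this:)

#include <stdio.h>
#include <stddef.h>

/* Illustrative only: copy "len" bytes, replacing each 0x00 byte with the
 * overlong pair 0xC0 0x80 so the result contains no literal zero byte and
 * survives nul-terminated string handling.  "dst" needs up to 2*len bytes. */
static size_t to_modified_utf8(const unsigned char *src, size_t len,
                               unsigned char *dst)
{
    size_t out = 0;

    for (size_t i = 0; i < len; i++)
    {
        if (src[i] == 0x00)
        {
            dst[out++] = 0xC0;
            dst[out++] = 0x80;
        }
        else
            dst[out++] = src[i];
    }
    return out;
}

int main(void)
{
    const unsigned char in[] = {'a', 'b', 0x00, 'c', 'd'};
    unsigned char out[10];
    size_t n = to_modified_utf8(in, sizeof(in), out);

    printf("%zu bytes in, %zu bytes out, no embedded zero byte\n",
           sizeof(in), n);      /* 5 in, 6 out */
    return 0;
}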

-- 
Kevin Grittner
EDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company




Re: [HACKERS] Wanting to learn about pgsql design decision

2016-08-03 Thread Kevin Grittner
On Wed, Aug 3, 2016 at 2:52 AM, Tal Walter  wrote:

> I'd appreciate it if you could help me understand how I can research the
> answers to these types of questions by myself.

> Could I perhaps search the commit
> comments somehow? Or perhaps a different approach to suggest?

In addition to Tom's suggestions about how to review commit
comments -- if you look in the source code directories for README
files, you will find they often contain discussions of such
matters.  If you are up for it, the C code for implementing
features also often discusses alternatives and why they were not
chosen.

--
Kevin Grittner
EDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company




Re: [HACKERS] Implementing full UTF-8 support (aka supporting 0x00)

2016-08-03 Thread Tom Lane
Álvaro Hernández Tortosa  writes:
>  As has been previously discussed (see 
> https://www.postgresql.org/message-id/BAY7-F17FFE0E324AB3B642C547E96890%40phx.gbl
>  
> for instance) varlena fields cannot accept the literal 0x00 value.

Yup.

>  What would it take to support it?

One key reason why that's hard is that datatype input and output
functions use nul-terminated C strings as the representation of the
text form of any datatype.  We can't readily change that API without
breaking huge amounts of code, much of it not under the core project's
control.

There may be other places where nul-terminated strings would be a hazard
(mumble fgets mumble), but offhand that API seems like the major problem
so far as the backend is concerned.

There would be a slew of client-side problems as well.  For example this
would assuredly break psql and pg_dump, along with every other client that
supposes that it can treat PQgetvalue() as returning a nul-terminated
string.  This end of it would possibly be even worse than fixing the
backend, because so little of the affected code is under our control.

In short, the problem is not with having an embedded nul in a stored
text value.  The problem is the reams of code that suppose that the
text representation of any data value is a nul-terminated C string.
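
(A trivial standalone illustration of that last point, not PostgreSQL code:
any interface that passes the value around as a C string silently loses
everything after the first zero byte.)

#include <stdio.h>
#include <string.h>

int main(void)
{
    /* A five-byte value with an embedded NUL in the middle. */
    const char value[] = {'a', 'b', '\0', 'c', 'd'};

    /* Any nul-terminated-string API sees only the first two bytes. */
    printf("strlen() reports %zu of 5 bytes\n", strlen(value));
    return 0;
}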

regards, tom lane




Re: [HACKERS] regression test for extended query protocol

2016-08-03 Thread Dave Cramer
On 3 August 2016 at 10:53, Tom Lane  wrote:

> Tatsuo Ishii  writes:
> >>> In my understanding, we don't have any regression test for protocol
> >>> level prepared query (we do have SQL level prepared query tests,
> >>> though). Shouldn't we add those tests to the regression test suites?
>
> >> I thought that ECPG was covering a portion of that. Right?
>
> > In my understanding, ECPG calls libpq, thus the test cases are limited
> > to the cases which are only possible with libpq (or it may be even
> > limited to the cases that ECPG deals with.)
>
> I do not think it's very practical for the core tests to try to cover
> any behavior that's not reachable via libpq.  Even testing stuff that's
> not reached through ecpg would require a whole slew of new infrastructure
> with no use except testing.
>
> I think realistically a better approach to this would be to get some
> buildfarm members running the JDBC regression tests (I assume there
> are some ...)
>
> regards, tom lane
>
We currently run tests every time a PR is created on github, but I don't
think there are any animals running the JDBC test suite.

We can add tests; what exactly do we want to test? Then setting up an
animal to run the tests would be fairly straightforward.

Dave Cramer

da...@postgresintl.com
www.postgresintl.com


Re: [HACKERS] PostgreSQL 10 kick-off

2016-08-03 Thread Fabrízio de Royes Mello
On Tue, Aug 2, 2016 at 5:44 PM, Alvaro Herrera 
wrote:
>
> Peter Eisentraut wrote:
> > On 8/1/16 1:08 PM, Fabrízio de Royes Mello wrote:
> > > What knowledge is expected for a CFM? I'm really would like to help
and
> > > also learn more about our development.
> >
> > If you search the wiki for "commitfest" you will find several pages of
> > varying outdatedness that describe the process.
>
> I have cleaned up the wiki a bit, so the authoritative info should be at
> https://wiki.postgresql.org/wiki/CommitFest_Checklist
> Note that what that page describes is not exactly what we do nowadays;
> if someone wants to edit that page to better reflect reality, be my
> guest.  Also see the old (2009) page where an older workflow was
> described:
>
> https://wiki.postgresql.org/index.php?title=Running_a_CommitFest&oldid=9369
> Perhaps it'd be good to resurrect some of that contents, in modernized
> form, to the current page.
>

Thank you both for the information.

If there are no complaints about my lack of experience in this field, I
would like to become the next CFM (am I the first Brazilian??)

Regards,

--
Fabrízio de Royes Mello
Consultoria/Coaching PostgreSQL
>> Timbira: http://www.timbira.com.br
>> Blog: http://fabriziomello.github.io
>> Linkedin: http://br.linkedin.com/in/fabriziomello
>> Twitter: http://twitter.com/fabriziomello
>> Github: http://github.com/fabriziomello


Re: [HACKERS] OldSnapshotTimemapLock information is missing in monitoring.sgml file

2016-08-03 Thread Kevin Grittner
On Wed, Aug 3, 2016 at 4:52 AM, Ashutosh Sharma  wrote:

> The monitoring.sgml file lists all the wait event names that are
> monitored by pg_stat_activity, but as the LWLock named
> 'OldSnapshotTimemapLock' was added recently as part of git commit
> '848ef42b' --> "Add the "snapshot too old" feature", this lwlock
> information is currently missing from the monitoring.sgml file. I have
> added the necessary information about it to this file, and the patch
> is attached. Thanks.

Pushed with minor fixes (white space and bumping the count for the tranche).

Thanks!

--
Kevin Grittner
EDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company




[HACKERS] Implementing full UTF-8 support (aka supporting 0x00)

2016-08-03 Thread Álvaro Hernández Tortosa


Hi list.

As has been previously discussed (see 
https://www.postgresql.org/message-id/BAY7-F17FFE0E324AB3B642C547E96890%40phx.gbl 
for instance) varlena fields cannot accept the literal 0x00 value. Sure, 
you can use bytea, but this is hardly a good solution. The problem seems to 
be hitting some use cases, like:


- People migrating data from other databases (apart from PostgreSQL, I 
don't know of any other database which suffers the same problem).
- People using drivers which use UTF-8 or equivalent encodings by 
default (Java for example)


Given that 0x00 is a perfectly legal UTF-8 character, I conclude 
we're strictly non-compliant. And given the general Postgres policy 
regarding standards compliance and the people being hit by this, I think 
it should be addressed, especially since all the usual fixes are a real 
PITA (re-parsing, re-generating strings, which is very expensive, or 
dropping data).


What would it take to support it? Isn't the varlena header 
propagated everywhere, which could help infer the real length of the 
string? Any pointers or suggestions would be welcome.


Thanks,

Álvaro


--

Álvaro Hernández Tortosa


---
8Kdata





Re: [HACKERS] Wanting to learn about pgsql design decision

2016-08-03 Thread Tom Lane
Tal Walter  writes:
> The example questions I gave are just some of the questions I've tried to
> find answers to, using google and searching this mailing list
> specifically, but I came up with nothing. Could I perhaps search the commit
> comments somehow? Or perhaps a different approach to suggest?

Well, the git history is not hard to come by: pull down a copy of our
git repo and see 'git log'.  Also the src/tools/git_changelog script
in the repo produces a nicely formatted cross-branch history.

https://wiki.postgresql.org/wiki/Working_with_Git

regards, tom lane




Re: [HACKERS] regression test for extended query protocol

2016-08-03 Thread Tom Lane
Tatsuo Ishii  writes:
>>> In my understanding, we don't have any regression test for protocol
>>> level prepared query (we do have SQL level prepared query tests,
>>> though). Shouldn't we add those tests to the regression test suites?

>> I thought that ECPG was covering a portion of that. Right?

> In my understanding, ECPG calls libpq, thus the test cases are limited
> to the cases which are only possible with libpq (or it may be even
> limited to the cases that ECPG deals with.)

I do not think it's very practical for the core tests to try to cover
any behavior that's not reachable via libpq.  Even testing stuff that's
not reached through ecpg would require a whole slew of new infrastructure
with no use except testing.

I think realistically a better approach to this would be to get some
buildfarm members running the JDBC regression tests (I assume there
are some ...)

regards, tom lane




Re: [HACKERS] Slowness of extended protocol

2016-08-03 Thread Vladimir Sitnikov
Tom Lane :

> Bruce Momjian  writes:
> > On Sun, Jul 31, 2016 at 05:57:12PM -0400, Tom Lane wrote:
> >> In hindsight it seems clear that what a lot of apps want out of extended
> >> protocol is only the ability to send parameter values out-of-line
> instead
> >> of having to quote/escape them into SQL literals.  Maybe an idea for the
> >> fabled V4 protocol update is some compromise query type that corresponds
> >> precisely to PQexecParams's feature set: you can send parameter values
> >> out-of-line, and you can specify text or binary results, but there's no
> >> notion of any persistent state being created and no feedback about
> >> parameter data types.
>
> > Do you want this on the TODO list?
>
> I didn't hear anyone say it was a silly idea, so sure.
>

Frankly speaking, it is not clear what this change buys.

Are you sure v3 cannot be tuned to reach comparable performance?

I do not like very much having a variety of "query modes".
For instance, when working with logical replication, extended queries are
not supported over the wire, which complicates the client.
This particular issue delays merging logical replication support into the
JDBC driver:
https://github.com/pgjdbc/pgjdbc/pull/550#issuecomment-236418614



Adding one more "execute flavor" would only make things worse, not
better.

Reusing parse state does indeed improve performance in real-life
applications, so I wonder whether we can make the current "extended" query
mode faster rather than implementing yet another protocol.

So while the request itself would definitely make sense if we had no
v2/v3 protocols at all, given that we do have v2 and v3, adding
"PQexecParams's feature set" does not look that important.

Just in case, here are "protocol wanted features" as seen by client
applications (e.g. JDBC client):
https://github.com/pgjdbc/pgjdbc/blob/master/backend_protocol_v4_wanted_features.md



Vladimir


Re: [HACKERS] Comment typo in tuplesort.c

2016-08-03 Thread Bruce Momjian
On Tue, Aug  2, 2016 at 01:56:27PM +0900, Amit Langote wrote:
> Attached patch fixes a minor typo in tuplesort.c
> 
> s/child content/child context/g

Thanks, patch applied.

-- 
  Bruce Momjian  http://momjian.us
  EnterpriseDB http://enterprisedb.com

+ As you are, so once was I. As I am, so you will be. +
+ Ancient Roman grave inscription +




Re: [HACKERS] Why we lost Uber as a user

2016-08-03 Thread Geoff Winkless
On 3 August 2016 at 15:04, Kevin Grittner  wrote:
> My initial experience with PostgreSQL would have been entirely
> different had I not found the community lists and benefited from
> the assistance and collective wisdom found on them.

The top non-sponsored link on google for "postgres support" takes you
straight to a page with a link to the mailing lists. I'm not sure that
not being able to find them was a problem.

I can well imagine that uber wouldn't have wanted to publicise their
problems (and so wouldn't have used a mailing list anyway); obviously
I've no way of knowing if they contacted any of the support companies
in the professional services page - I assume that professional
courtesy (and/or NDAs!) would preclude anyone from posting such here
anyway.

The problem with the professional services page is that the list of
companies is very dry, but it might be difficult to improve: as a
community it might be considered unreasonable to promote one over the
other; however if I had to go searching for professional support (and
hadn't seen the level of interaction that some of those companies'
employees provide on the mailing lists) I would have no clear idea
where to start.

Perhaps listing those companies that provide employment for some of
the core developers at the top (and explaining so) might be
acceptable? (or maybe not just core? you get the idea though). Maybe a
separate section for support companies versus hosts?

Geoff




Re: [HACKERS] Why we lost Uber as a user

2016-08-03 Thread Kevin Grittner
On Wed, Aug 3, 2016 at 8:58 AM, Alfred Perlstein  wrote:
> On Aug 3, 2016, at 3:29 AM, Greg Stark  wrote:
>
>> Honestly the take-away I see in the Uber story is that they apparently
>> had nobody on staff that was on -hackers or apparently even -general
>> and tried to go it alone rather than involve experts from outside
>> their company. As a result they misdiagnosed their problems based on
>> prejudices seeing what they expected to see rather than what the real
>> problem was.
>
> +1 very true.
>
> At the same time there are some lessons to be learned. At the
> very least putting in big bold letters where to come for help is
> one.

+1

My initial experience with PostgreSQL would have been entirely
different had I not found the community lists and benefited from
the assistance and collective wisdom found on them.

--
Kevin Grittner
EDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company




Re: [HACKERS] Slowness of extended protocol

2016-08-03 Thread Tom Lane
Bruce Momjian  writes:
> On Sun, Jul 31, 2016 at 05:57:12PM -0400, Tom Lane wrote:
>> In hindsight it seems clear that what a lot of apps want out of extended
>> protocol is only the ability to send parameter values out-of-line instead
>> of having to quote/escape them into SQL literals.  Maybe an idea for the
>> fabled V4 protocol update is some compromise query type that corresponds
>> precisely to PQexecParams's feature set: you can send parameter values
>> out-of-line, and you can specify text or binary results, but there's no
>> notion of any persistent state being created and no feedback about
>> parameter data types.

> Do you want this on the TODO list?

I didn't hear anyone say it was a silly idea, so sure.

regards, tom lane




Re: [HACKERS] Why we lost Uber as a user

2016-08-03 Thread Alfred Perlstein


> On Aug 3, 2016, at 3:29 AM, Greg Stark  wrote:
> 
>> 
> 
> Honestly the take-away I see in the Uber story is that they apparently
> had nobody on staff that was on -hackers or apparently even -general
> and tried to go it alone rather than involve experts from outside
> their company. As a result they misdiagnosed their problems based on
> prejudices seeing what they expected to see rather than what the real
> problem was.
> 

+1 very true. 

At the same time there are some lessons to be learned. At the very least 
putting in big bold letters where to come for help is one. 







Re: [HACKERS] Detecting skipped data from logical slots (data silently skipped)

2016-08-03 Thread Greg Stark
I didn't follow all of that but I wonder if it isn't just that when you
restore from backup you should be creating a new slot?

On 3 Aug 2016 14:39, "Craig Ringer"  wrote:

> Hi all
>
> I think we have a bit of a problem with the behaviour specified for
> logical slots, one that makes it hard for an outdated snapshot or
> backup of a logical-slot-using downstream to know it's missing a chunk
> of data that's been consumed from a slot. That's not great since slots are
> supposed to ensure a continuous, gapless data stream.
>
> If the downstream requests that logical decoding restarts at an LSN older
> than the slot's confirmed_flush_lsn, we silently ignore the client's
> request and start replay at the confirmed_flush_lsn. That's by design and
> fine normally, since we know the gap LSNs contained no transactions of
> interest to the downstream.
>
> But it's *bad* if the downstream is actually a copy of the original
> downstream that's been started from a restored backup/snapshot. In that
> case the downstream won't know that some other client, probably a newer
> instance of itself, consumed rows it should've seen. It'll merrily
> continue replaying and not know it isn't consistent.
>
> The cause is an optimisation intended to allow the downstream to avoid
> having to do local writes and flushes when the upstream's activity isn't of
> interest to it and doesn't result in replicated rows. When the upstream
> does a bunch of writes to another database or otherwise produces WAL not of
> interest to the downstream we send the downstream keepalive messages that
> include the upstream's current xlog position and the client replies to
> acknowledge it's seen the new LSN. But, so that we can avoid disk flushes
> on the downstream, we permit it to skip advancing its replication origin in
> response to those keepalives. We continue to advance the
> confirmed_flush_lsn and restart_lsn in the replication slot on the upstream
> so we can free WAL that's not needed and move the catalog_xmin up. The
> replication origin on the downstream falls behind the confirmed_flush_lsn
> on the upstream.
>
> This means that if the downstream exits/crashes before receiving some new
> row, its replication origin will tell it that it last replayed some LSN
> older than what it really did, and older than what the server retains.
> Logical decoding doesn't allow the client to "jump backwards" and replay
> anything older than the confirmed_lsn. Since we "know" that the gap cannot
> contain rows of interest, otherwise we'd have updated the replication
> origin, we just skip and start replay at the confirmed_flush_lsn.
>
> That means that if the downstream is restored from a backup it has no way
> of knowing it can't rejoin and become consistent because it can't tell the
> difference between "everything's fine, replication origin intentionally
> behind confirmed_flush_lsn due to activity not of interest" and "we've
> missed data consumed from this slot by some other peer and should refuse to
> continue replay".
>
> The simplest fix would be to require downstreams to flush their
> replication origin when they get a hot standby feedback message, before
> they send a reply with confirmation. That could be somewhat painful for
> performance, but can be alleviated somewhat by waiting for the downstream
> postgres to get around to doing a flush anyway and only forcing it if we're
> getting close to the walsender timeout. That's pretty much what BDR and
> pglogical do when applying transactions to avoid having to do a disk flush
> for each and every applied xact. Then we change START_REPLICATION ...
> LOGICAL so it ERRORs if you ask for a too-old LSN rather than silently
> ignoring it.
>
> This problem can also bite you if you restore a copy of a downstream (say,
> to look at since-deleted data) while the original happens to be
> disconnected for some reason. The copy connects to the upstream and
> consumes some data from the slot. Then when the original comes back on line
> it has no idea there's a gap in its time stream.
>
> This came up when investigating issues with people using snapshot-based
> BDR and pglogical backup/restore. It's a real-world problem that can result
> in silent data inconsistency.
>
> Thoughts on the proposed fix? Any ideas for lower-impact fixes that'd
> still allow a downstream to find out if it's missed data?
>
> --
>  Craig Ringer   http://www.2ndQuadrant.com/
>  PostgreSQL Development, 24x7 Support, Training & Services
>


Re: [HACKERS] Way to access LSN (for each transaction) by directly talking to postgres?

2016-08-03 Thread Craig Ringer
On 3 August 2016 at 11:37, Joshua Bay  wrote:

> Hi,
>
> Could you please let me know if there is a way to get LSN of each
> transaction by directly communicating with Postgres server and NOT by
> accessing logs.
>


To what end? What problem are you trying to solve?

What LSN, exactly? The LSN of the first write and xid allocation? The LSN
of the commit record? What if it's a complex commit like with prepared
xacts?


-- 
 Craig Ringer   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services


Re: [HACKERS] Why we lost Uber as a user

2016-08-03 Thread Craig Ringer
On 3 August 2016 at 05:14, Tom Lane  wrote:


>
> In short, I think Uber's position that logical replication is somehow more
> reliable than physical is just wishful thinking.  If anything, my money
> would be on the other way around: there's a lot less mechanism that can go
> wrong in physical replication.


Particularly since they aren't using row-based logical replication, but -
it seems - statement based replication. We all know the problems there.


 Craig Ringer   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services


[HACKERS] Detecting skipped data from logical slots (data silently skipped)

2016-08-03 Thread Craig Ringer
Hi all

I think we have a bit of a problem with the behaviour specified for logical
slots, one that makes it hard for an outdated snapshot or backup of a
logical-slot-using downstream to know it's missing a chunk of data
that's been consumed from a slot. That's not great since slots are supposed
to ensure a continuous, gapless data stream.

If the downstream requests that logical decoding restarts at an LSN older
than the slot's confirmed_flush_lsn, we silently ignore the client's
request and start replay at the confirmed_flush_lsn. That's by design and
fine normally, since we know the gap LSNs contained no transactions of
interest to the downstream.

But it's *bad* if the downstream is actually a copy of the original
downstream that's been started from a restored backup/snapshot. In that
case the downstream won't know that some other client, probably a newer
instance of itself, consumed rows it should've seen. It'll merrily
continue replaying and not know it isn't consistent.

The cause is an optimisation intended to allow the downstream to avoid
having to do local writes and flushes when the upstream's activity isn't of
interest to it and doesn't result in replicated rows. When the upstream
does a bunch of writes to another database or otherwise produces WAL not of
interest to the downstream we send the downstream keepalive messages that
include the upstream's current xlog position and the client replies to
acknowledge it's seen the new LSN. But, so that we can avoid disk flushes
on the downstream, we permit it to skip advancing its replication origin in
response to those keepalives. We continue to advance the
confirmed_flush_lsn and restart_lsn in the replication slot on the upstream
so we can free WAL that's not needed and move the catalog_xmin up. The
replication origin on the downstream falls behind the confirmed_flush_lsn
on the upstream.
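
(For reference, the upstream-side slot state described here can be
inspected with a plain query against pg_replication_slots, whose
confirmed_flush_lsn column exists as of 9.6. A minimal libpq sketch,
purely for inspection; note that, as explained above, the downstream's
origin legitimately lagging this value is exactly what makes the gap hard
to interpret:)

#include <stdio.h>
#include <libpq-fe.h>

int main(void)
{
    PGconn     *conn = PQconnectdb("");     /* settings taken from environment */
    PGresult   *res = PQexec(conn,
        "SELECT slot_name, confirmed_flush_lsn, restart_lsn "
        "FROM pg_replication_slots WHERE slot_type = 'logical'");

    if (PQresultStatus(res) == PGRES_TUPLES_OK)
    {
        for (int i = 0; i < PQntuples(res); i++)
            printf("%s: confirmed_flush=%s restart=%s\n",
                   PQgetvalue(res, i, 0),
                   PQgetvalue(res, i, 1),
                   PQgetvalue(res, i, 2));
    }

    PQclear(res);
    PQfinish(conn);
    return 0;
}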

This means that if the downstream exits/crashes before receiving some new
row, its replication origin will tell it that it last replayed some LSN
older than what it really did, and older than what the server retains.
Logical decoding doesn't allow the client to "jump backwards" and replay
anything older than the confirmed_lsn. Since we "know" that the gap cannot
contain rows of interest, otherwise we'd have updated the replication
origin, we just skip and start replay at the confirmed_flush_lsn.

That means that if the downstream is restored from a backup it has no way
of knowing it can't rejoin and become consistent because it can't tell the
difference between "everything's fine, replication origin intentionally
behind confirmed_flush_lsn due to activity not of interest" and "we've
missed data consumed from this slot by some other peer and should refuse to
continue replay".

The simplest fix would be to require downstreams to flush their replication
origin when they get a hot standby feedback message, before they send a
reply with confirmation. That could be somewhat painful for performance,
but can be alleviated somewhat by waiting for the downstream postgres to
get around to doing a flush anyway and only forcing it if we're getting
close to the walsender timeout. That's pretty much what BDR and pglogical
do when applying transactions to avoid having to do a disk flush for each
and every applied xact. Then we change START_REPLICATION ... LOGICAL so it
ERRORs if you ask for a too-old LSN rather than silently ignoring it.

This problem can also bite you if you restore a copy of a downstream (say,
to look at since-deleted data) while the original happens to be
disconnected for some reason. The copy connects to the upstream and
consumes some data from the slot. Then when the original comes back on line
it has no idea there's a gap in its time stream.

This came up when investigating issues with people using snapshot-based BDR
and pglogical backup/restore. It's a real-world problem that can result in
silent data inconsistency.

Thoughts on the proposed fix? Any ideas for lower-impact fixes that'd still
allow a downstream to find out if it's missed data?

-- 
 Craig Ringer   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services


Re: [HACKERS] Why we lost Uber as a user

2016-08-03 Thread Bruce Momjian
On Tue, Aug  2, 2016 at 10:33:15PM -0400, Bruce Momjian wrote:
> I saw from the Uber article that they weren't going to per-row logical
> replication but to _statement_ replication, which is very hard to do
> because typical SQL doesn't record what concurrent transactions
> committed before a new statement's transaction snapshot is taken, and
> doesn't record lock order for row updates blocked by concurrent activity
> --- both of which affect the final result from the query.
> 
> So, for statement replication, it is not a question of whether the code
> has bugs, but that the replay is not 100% possible in all cases, unless
> you switch to some statement-row-lock hybrid ability.

Oh, and one more problem with statement-level replication is that the
overhead of statement replay is high, as high as it was on the master. 
That leaves minimal server resources to handle read-only workloads
on the slave.

-- 
  Bruce Momjian  http://momjian.us
  EnterpriseDB http://enterprisedb.com

+ As you are, so once was I. As I am, so you will be. +
+ Ancient Roman grave inscription +




[HACKERS] Way to access LSN (for each transaction) by directly talking to postgres?

2016-08-03 Thread Joshua Bay
Hi,

Could you please let me know if there is a way to get LSN of each
transaction by directly communicating with Postgres server and NOT by
accessing logs.

Thanks!
Joshua


Re: [COMMITTERS] [HACKERS] Logical decoding

2016-08-03 Thread Joshua Bay
Thanks a lot for your help!!

On Tue, Jul 12, 2016 at 4:42 PM, Fabrízio de Royes Mello <
fabriziome...@gmail.com> wrote:

>
>
> On Mon, Jul 11, 2016 at 2:13 AM, Michael Paquier <
> michael.paqu...@gmail.com> wrote:
> >
> > On Mon, Jul 11, 2016 at 2:03 PM, Craig Ringer 
> wrote:
> > > On 9 July 2016 at 01:57, Joshua Bay  wrote:
> > >> and where are this code is in the codebase?
> > >
> > > src/backend/replication/logical/*
> > > src/backend/replication/walsender.c
> > > src/backend/access/transam/xlogreader.c
> > > src/include/access/xlogreader.h
> > > src/include/replication/output_plugin.h
> > > contrib/test_decoding/
> > > doc/src/sgml/logicaldecoding.sgml
> >
> > Some other references:
> > https://wiki.postgresql.org/images/7/77/Fosdem-2015-logical-decoding.pdf
> >
> > And some other examples of plugins:
> > https://github.com/leptonix/decoding-json
> > https://github.com/xstevens/decoderbufs
> > https://github.com/confluentinc/bottledwater-pg (for kafka)
> > https://github.com/michaelpq/pg_plugins/tree/master/decoder_raw (wrote
> this one)
> >
>
> Nice, also we have wal2json [1].
>
> Regards,
>
>
> [1] https://github.com/eulerto/wal2json
>
> --
> Fabrízio de Royes Mello
> Consultoria/Coaching PostgreSQL
> >> Timbira: http://www.timbira.com.br
> >> Blog: http://fabriziomello.github.io
> >> Linkedin: http://br.linkedin.com/in/fabriziomello
> >> Twitter: http://twitter.com/fabriziomello
> >> Github: http://github.com/fabriziomello
>


Re: [HACKERS] pg_ctl promote wait

2016-08-03 Thread Peter Eisentraut
Updated patch set for pg_ctl promote -w, incorporating AFAICT all of the
previous reviews, in particular Michael Paquier's changes to the
StartupXLOG() ordering, and rebasing on top of
src/common/controldata_utils.c.

-- 
Peter Eisentraut  http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
>From 371ca61ab87e792864ac2f76f3e0dad7912c247e Mon Sep 17 00:00:00 2001
From: Peter Eisentraut 
Date: Tue, 26 Jul 2016 11:39:43 -0400
Subject: [PATCH 1/5] Make command_like output more compact

Consistently print the test name, not the full command, which can be
quite lengthy and include temporary directory names and other
distracting details.
---
 src/test/perl/TestLib.pm | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/test/perl/TestLib.pm b/src/test/perl/TestLib.pm
index 649fd82..c7b3262 100644
--- a/src/test/perl/TestLib.pm
+++ b/src/test/perl/TestLib.pm
@@ -276,8 +276,8 @@ sub command_like
 	my ($stdout, $stderr);
 	print("# Running: " . join(" ", @{$cmd}) . "\n");
 	my $result = IPC::Run::run $cmd, '>', \$stdout, '2>', \$stderr;
-	ok($result, "@$cmd exit code 0");
-	is($stderr, '', "@$cmd no stderr");
+	ok($result, "$test_name: exit code 0");
+	is($stderr, '', "$test_name: no stderr");
 	like($stdout, $expected_stdout, "$test_name: matches");
 }
 
-- 
2.9.2

>From eabd099bf09d9a4815c29c730d568161c87a67ba Mon Sep 17 00:00:00 2001
From: Peter Eisentraut 
Date: Tue, 26 Jul 2016 10:48:43 -0400
Subject: [PATCH 2/5] pg_ctl: Add tests for promote action

---
 src/bin/pg_ctl/t/003_promote.pl | 41 +
 src/test/perl/TestLib.pm| 11 +++
 2 files changed, 52 insertions(+)
 create mode 100644 src/bin/pg_ctl/t/003_promote.pl

diff --git a/src/bin/pg_ctl/t/003_promote.pl b/src/bin/pg_ctl/t/003_promote.pl
new file mode 100644
index 000..d7e7b99
--- /dev/null
+++ b/src/bin/pg_ctl/t/003_promote.pl
@@ -0,0 +1,41 @@
+use strict;
+use warnings;
+
+use PostgresNode;
+use TestLib;
+use Test::More tests => 9;
+
+my $tempdir = TestLib::tempdir;
+
+command_fails_like([ 'pg_ctl', 'promote', '-D', "$tempdir/nonexistent" ],
+   qr/directory .* does not exist/,
+   'pg_ctl promote with nonexistent directory');
+
+my $node_primary = get_new_node('primary');
+$node_primary->init(allows_streaming => 1);
+
+command_fails_like([ 'pg_ctl', 'promote', '-D', $node_primary->data_dir ],
+   qr/PID file .* does not exist/,
+   'pg_ctl promote of not running instance fails');
+
+$node_primary->start;
+
+command_fails_like([ 'pg_ctl', 'promote', '-D', $node_primary->data_dir ],
+   qr/not in standby mode/,
+   'pg_ctl promote of primary instance fails');
+
+my $node_standby = get_new_node('standby');
+$node_primary->backup('my_backup');
+$node_standby->init_from_backup($node_primary, 'my_backup', has_streaming => 1);
+$node_standby->start;
+
+is($node_standby->safe_psql('postgres', 'SELECT pg_is_in_recovery()'),
+   't', 'standby is in recovery');
+
+command_ok([ 'pg_ctl', 'promote', '-D', $node_standby->data_dir ],
+		   'pg_ctl promote of standby runs');
+
+$node_standby->poll_query_until('postgres', 'SELECT NOT pg_is_in_recovery()');
+
+is($node_standby->safe_psql('postgres', 'SELECT pg_is_in_recovery()'),
+   'f', 'promoted standby is not in recovery');
diff --git a/src/test/perl/TestLib.pm b/src/test/perl/TestLib.pm
index c7b3262..51b533e 100644
--- a/src/test/perl/TestLib.pm
+++ b/src/test/perl/TestLib.pm
@@ -34,6 +34,7 @@ our @EXPORT = qw(
   program_version_ok
   program_options_handling_ok
   command_like
+  command_fails_like
 
   $windows_os
 );
@@ -281,4 +282,14 @@ sub command_like
 	like($stdout, $expected_stdout, "$test_name: matches");
 }
 
+sub command_fails_like
+{
+	my ($cmd, $expected_stderr, $test_name) = @_;
+	my ($stdout, $stderr);
+	print("# Running: " . join(" ", @{$cmd}) . "\n");
+	my $result = IPC::Run::run $cmd, '>', \$stdout, '2>', \$stderr;
+	ok(!$result, "$test_name: exit code not 0");
+	like($stderr, $expected_stderr, "$test_name: matches");
+}
+
 1;
-- 
2.9.2

>From b5bc405a85c6a84c7916f5cc2c3225438df70315 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut 
Date: Tue, 26 Jul 2016 11:23:43 -0400
Subject: [PATCH 3/5] pg_ctl: Detect current standby state from pg_control

pg_ctl used to determine whether a server was in standby mode by looking
for a recovery.conf file.  With this change, it instead looks into
pg_control, which is potentially more accurate.  There are also
occasional discussions about removing recovery.conf, so this removes one
dependency.

Add a wait_seconds argument to get_controlfile() so that pg_ctl can wait
a bit for a consistent pg_control file.  Otherwise, pg_ctl operations
might fail on rare occasions when the server is updating the file at the
same time.
---
 src/backend/utils/misc/pg_controldata.c |  8 ++--
 src/bin/pg_controldata/pg_controldata.c |  2 +-
 src/bin/pg_ctl/pg_ctl.c 
