Re: refactoring basebackup.c

2022-01-26 Thread Dipesh Pandit
Hi,

> It only needed trivial rebasing; I have committed it after doing that.

I have updated the patches to support server compression (gzip) for
plain format backup. Please find attached v4 patches.

Thanks,
Dipesh
From 4d0c84d6fac841aafb757535cc0e48334a251581 Mon Sep 17 00:00:00 2001
From: Dipesh Pandit 
Date: Mon, 24 Jan 2022 15:28:48 +0530
Subject: [PATCH 1/2] Support for extracting gzip compressed archive

pg_basebackup can support server side compression using gzip. In
order to support plain format backup with option '-Fp' we need to
add support for decompressing the compressed blocks at client. This
patch addresses the extraction of gzip compressed blocks at client.
---
 doc/src/sgml/ref/pg_basebackup.sgml |   8 +-
 src/bin/pg_basebackup/Makefile  |   1 +
 src/bin/pg_basebackup/bbstreamer.h  |   1 +
 src/bin/pg_basebackup/bbstreamer_file.c | 182 
 src/bin/pg_basebackup/bbstreamer_gzip.c | 376 
 src/bin/pg_basebackup/pg_basebackup.c   |  19 +-
 6 files changed, 401 insertions(+), 186 deletions(-)
 create mode 100644 src/bin/pg_basebackup/bbstreamer_gzip.c

diff --git a/doc/src/sgml/ref/pg_basebackup.sgml b/doc/src/sgml/ref/pg_basebackup.sgml
index 1d0df34..19849be 100644
--- a/doc/src/sgml/ref/pg_basebackup.sgml
+++ b/doc/src/sgml/ref/pg_basebackup.sgml
@@ -428,8 +428,12 @@ PostgreSQL documentation


 When the tar format is used, the suffix .gz will
-automatically be added to all tar filenames. Compression is not
-available in plain format.
+automatically be added to all tar filenames.
+   
+   
+Server compression can be specified with plain format backup. It
+enables compression of the archive at server and extract the
+compressed archive at client.

   
  
diff --git a/src/bin/pg_basebackup/Makefile b/src/bin/pg_basebackup/Makefile
index 5b18851..78d96c6 100644
--- a/src/bin/pg_basebackup/Makefile
+++ b/src/bin/pg_basebackup/Makefile
@@ -38,6 +38,7 @@ OBJS = \
 BBOBJS = \
 	pg_basebackup.o \
 	bbstreamer_file.o \
+	bbstreamer_gzip.o \
 	bbstreamer_inject.o \
 	bbstreamer_tar.o
 
diff --git a/src/bin/pg_basebackup/bbstreamer.h b/src/bin/pg_basebackup/bbstreamer.h
index fc88b50..270b0df 100644
--- a/src/bin/pg_basebackup/bbstreamer.h
+++ b/src/bin/pg_basebackup/bbstreamer.h
@@ -205,6 +205,7 @@ extern bbstreamer *bbstreamer_extractor_new(const char *basepath,
 			const char *(*link_map) (const char *),
 			void (*report_output_file) (const char *));
 
+extern bbstreamer *bbstreamer_gzip_extractor_new(bbstreamer *next);
 extern bbstreamer *bbstreamer_tar_parser_new(bbstreamer *next);
 extern bbstreamer *bbstreamer_tar_terminator_new(bbstreamer *next);
 extern bbstreamer *bbstreamer_tar_archiver_new(bbstreamer *next);
diff --git a/src/bin/pg_basebackup/bbstreamer_file.c b/src/bin/pg_basebackup/bbstreamer_file.c
index 77ca222..d721f87 100644
--- a/src/bin/pg_basebackup/bbstreamer_file.c
+++ b/src/bin/pg_basebackup/bbstreamer_file.c
@@ -11,10 +11,6 @@
 
 #include "postgres_fe.h"
 
-#ifdef HAVE_LIBZ
-#include 
-#endif
-
 #include 
 
 #include "bbstreamer.h"
@@ -30,15 +26,6 @@ typedef struct bbstreamer_plain_writer
 	bool		should_close_file;
 } bbstreamer_plain_writer;
 
-#ifdef HAVE_LIBZ
-typedef struct bbstreamer_gzip_writer
-{
-	bbstreamer	base;
-	char	   *pathname;
-	gzFile		gzfile;
-}			bbstreamer_gzip_writer;
-#endif
-
 typedef struct bbstreamer_extractor
 {
 	bbstreamer	base;
@@ -62,22 +49,6 @@ const bbstreamer_ops bbstreamer_plain_writer_ops = {
 	.free = bbstreamer_plain_writer_free
 };
 
-#ifdef HAVE_LIBZ
-static void bbstreamer_gzip_writer_content(bbstreamer *streamer,
-		   bbstreamer_member *member,
-		   const char *data, int len,
-		   bbstreamer_archive_context context);
-static void bbstreamer_gzip_writer_finalize(bbstreamer *streamer);
-static void bbstreamer_gzip_writer_free(bbstreamer *streamer);
-static const char *get_gz_error(gzFile gzf);
-
-const bbstreamer_ops bbstreamer_gzip_writer_ops = {
-	.content = bbstreamer_gzip_writer_content,
-	.finalize = bbstreamer_gzip_writer_finalize,
-	.free = bbstreamer_gzip_writer_free
-};
-#endif
-
 static void bbstreamer_extractor_content(bbstreamer *streamer,
 		 bbstreamer_member *member,
 		 const char *data, int len,
@@ -196,159 +167,6 @@ bbstreamer_plain_writer_free(bbstreamer *streamer)
 }
 
 /*
- * Create a bbstreamer that just compresses data using gzip, and then writes
- * it to a file.
- *
- * As in the case of bbstreamer_plain_writer_new, pathname is always used
- * for error reporting purposes; if file is NULL, it is also the opened and
- * closed so that the data may be written there.
- */
-bbstreamer *
-bbstreamer_gzip_writer_new(char *pathname, FILE *file, int compresslevel)
-{
-#ifdef HAVE_LIBZ
-	bbstreamer_gzip_writer *streamer;
-
-	streamer = palloc0(sizeof(bbstreamer_gzip_writer));
-	*((const bbstreamer_ops **) 

Re: Fix uninitialized variable access (src/backend/utils/mmgr/freepage.c)

2022-01-26 Thread Michael Paquier
On Fri, Oct 01, 2021 at 05:03:04PM -0300, Ranier Vilela wrote:
> For me the assertion remains valid and usable.

Well, I was looking at this thread again, and I still don't see what
we benefit from this change.  One thing that could also be done is to
initialize "result" at {0} at the top of FreePageManagerGetInternal()
and FreePageManagerPutInternal(), but that's in the same category as
the other suggestions.  I'll go drop the patch if there are no
objections.
--
Michael




Re: row filtering for logical replication

2022-01-26 Thread Peter Smith
Here are some review comments for v71-0001

~~~

1. Commit Message - database

"...that don't satisfy this WHERE clause will be filtered out. This allows a
database or set of tables to be partially replicated. The row filter is
per table. A new row filter can be added simply by specifying a WHERE..."

I don't know what extra information is conveyed by saying "a
database". Isn't it sufficient to just say "This allows a set of
tables to be partially replicated." ?

~~~

2. Commit message - OR'ed

The row filters are applied before publishing the changes. If the
subscription has several publications in which the same table has been
published with different filters, those expressions get OR'ed together so
that rows satisfying any of the expressions will be replicated.

Shouldn't that say:
"with different filters," --> "with different filters (for the same
publish operation),"
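
For illustration, a minimal sketch of the OR'ed behaviour described
above, with made-up table and publication names:

CREATE PUBLICATION pub_a FOR TABLE t WHERE (a > 10);
CREATE PUBLICATION pub_b FOR TABLE t WHERE (b = 5);
-- a subscription that includes both pub_a and pub_b replicates the
-- rows of t that satisfy (a > 10) OR (b = 5)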

~~~

3. Commit message - typo

This means all the other filters become redundant if (a) one of the
publications have no filter at all, (b) one of the publications was
created using FOR ALL TABLES, (c) one of the publications was created
using FOR ALL TABLES IN SCHEMA and the table belongs to that same schema.

Typo:
"have no filter" --> "has no filter"

~~~

4. Commit message - psql \d+

"Psql commands \dRp+ and \d+ will display any row filters."

Actually, just "\d" (without +) will also display row filters. You do
not need to say "\d+"

~~~

5. src/backend/executor/execReplication.c - CheckCmdReplicaIdentity

+ RelationBuildPublicationDesc(rel, &pubdesc);
+ if (!pubdesc.rf_valid_for_update && cmd == CMD_UPDATE)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+ errmsg("cannot update table \"%s\"",
+ RelationGetRelationName(rel)),
+ errdetail("Column \"%s\" used in the publication WHERE expression is
not part of the replica identity.",
+get_attname(RelationGetRelid(rel),
+pubdesc.invalid_rfcol_update,
+false))));
+ else if (!pubdesc.rf_valid_for_delete && cmd == CMD_DELETE)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+ errmsg("cannot delete from table \"%s\"",
+ RelationGetRelationName(rel)),
+ errdetail("Column \"%s\" used in the publication WHERE expression is
not part of the replica identity.",
+get_attname(RelationGetRelid(rel),
+pubdesc.invalid_rfcol_delete,
+false))));


IMO those conditions should be reversed because (a) it's more optimal
to test the other way around, and (b) for consistency with other code
in this function.

BEFORE
+ if (!pubdesc.rf_valid_for_update && cmd == CMD_UPDATE)
...
+ else if (!pubdesc.rf_valid_for_delete && cmd == CMD_DELETE)
AFTER
+ if (cmd == CMD_UPDATE && !pubdesc.rf_valid_for_update)
...
+ else if (cmd == CMD_DELETE && !pubdesc.rf_valid_for_delete)

~~~

6. src/backend/replication/pgoutput/pgoutput.c - pgoutput_row_filter

+ /*
+ * Unchanged toasted replica identity columns are only logged in the
+ * old tuple, copy this over to the new tuple. The changed (or WAL
+ * Logged) toast values are always assembled in memory and set as
+ * VARTAG_INDIRECT. See ReorderBufferToastReplace.
+ */

Something seems not quite right with the comma in that first sentence.
Maybe a period is better?

BEFORE
Unchanged toasted replica identity columns are only logged in the old
tuple, copy this over to the new tuple.
AFTER
Unchanged toasted replica identity columns are only logged in the old
tuple. Copy this over to the new tuple.

~~~

7. src/test/subscription/t/028_row_filter.pl - COPYRIGHT

This TAP file should have a copyright comment that is consistent with
all the other TAP files.

--
Kind Regards,
Peter Smith.
Fujitsu Australia




Re: TAP test to cover "EndOfLogTLI != replayTLI" case

2022-01-26 Thread Kyotaro Horiguchi
At Thu, 27 Jan 2022 15:31:37 +0900, Michael Paquier  wrote 
in 
> On Thu, Jan 20, 2022 at 12:13:08PM +0530, Amul Sul wrote:
> > I found the cause of the test failing on Windows -- it is due to the
> > custom archive command setting, which wasn't setting the correct Windows
> > archive directory path.
> 
> After reading this patch and this thread, I have noticed that you are
> testing the same thing as Heikki here, patch 0001:
> https://www.postgresql.org/message-id/52bc9ccd-8591-431b-0086-15d9acf25...@iki.fi
> 
> The patch sent on the other thread has a better description and shape,
> so perhaps we'd better drop what is posted here in favor of the other
> version.  Thoughts?

pg_switch_wal() is preferable to filling in a large number of
records as the means of advancing segments, because it is stable against
changes of wal_segment_size.

As you said, using has_restoring when initializing node_ptr is simpler
than explicitly setting archive_dir from that of another node.

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center




Re: Proposal: More structured logging

2022-01-26 Thread Michael Paquier
On Tue, Jan 18, 2022 at 06:46:03AM +0100, Ronan Dunklau wrote:
> Hum, there was a missing import in csvlog.c from the fix above. Sorry about 
> that. 

+
 

You are also forgetting that the table listing all the jsonlog fields
needs a refresh with this new key called "tags", and that it has a
JSON object underneath.

If you want to test all the formats supported by log_destination,
wouldn't this stuff be better tested through TAP with
log_destination set to csvlog, jsonlog and stderr?  This is
lacking coverage for csvlog.c and jsonlog.c, paths touched by your
patch.
--
Michael




Re: PITR: enhance getRecordTimestamp()

2022-01-26 Thread Michael Paquier
On Wed, Nov 03, 2021 at 04:59:04PM +, Simon Riggs wrote:
> Thanks. Fixed and rebased.

+   if (xact_info == XLOG_XACT_PREPARE)
+   {
+   if (recoveryTargetUseOriginTime)
+   {
+   xl_xact_prepare *xlrec = (xl_xact_prepare *) 
XLogRecGetData(record);
+   xl_xact_parsed_prepare parsed;
+
+   ParsePrepareRecord(XLogRecGetInfo(record),
+ xlrec,
+ &parsed);
+   *recordXtime = parsed.origin_timestamp;
+   }
+   else
+   *recordXtime = ((xl_xact_prepare *) 
XLogRecGetData(record))->prepared_at;

As I learnt recently with ece8c76, there are cases where an origin
timestamp may not be set in such a WAL record, depending on the setup
done on the origin cluster.  Isn't this code going to end up returning
true in some cases when recovery_target_use_origin_time is enabled,
even if recordXtime is 0?  So it seems to me that this is lacking some
sanity checks for the case where recordXtime is 0.

Could you add some tests for this proposal?  This adds various PITR
scenarios that are currently uncovered, and TAP should be able to cover
that.
--
Michael




Re: row filtering for logical replication

2022-01-26 Thread Greg Nancarrow
On Thu, Jan 27, 2022 at 4:59 PM Peter Smith  wrote:
>
> On Thu, Jan 27, 2022 at 9:40 AM Greg Nancarrow 
wrote:
> >
> > On Wed, Jan 26, 2022 at 2:08 PM houzj.f...@fujitsu.com
> >  wrote:
> > >
> > > There was a miss in the posted patch which didn't initialize the
parameter in
> > > RelationBuildPublicationDesc, sorry for that. Attach the correct
patch this time.
> > >
> >
> > A few comments for the v71-0001 patch:
> ...
> > (2) check_simple_rowfilter_expr_walker
> >
> > In the function header:
> > (i) "etc" should be "etc."
> > (ii)
> > Is
> >
> > + * - (Var Op Const) Bool (Var Op Const)
> >
> >meant to be:
> >
> > + * - (Var Op Const) Logical-Op (Var Op Const)
> >
> > ?
> >
> > It's not clear what "Bool" means here.
>
> The comment is only intended as a generic example of the kinds of
> acceptable expression format.
>
> The names in the comment used are roughly equivalent to the Node* tag
names.
>
> This particular example is for an expression with AND/OR/NOT, which is
> handled by a BoolExpr.
>
> There is no such animal as LogicalOp, so rather than changing it as you
> suggest, I feel that if this comment is going to change then it would be
> better to change it to "boolop" (because the BoolExpr struct has a
> boolop member). e.g.
>
> BEFORE
> + * - (Var Op Const) Bool (Var Op Const)
> AFTER
> + * - (Var Op Const) boolop (Var Op Const)
>

My use of "LogicalOp" was just indicating that the use of "Bool" in that
line was probably meant to mean "Logical Operator", and these are
documented in "9.1 Logical Operators" here:
https://www.postgresql.org/docs/14/functions-logical.html
(PostgreSQL docs don't refer to AND/OR etc. as boolean operators)

Perhaps, to make it clear, the change for the example compound expression
could simply be:

+ * - (Var Op Const) AND/OR (Var Op Const)

or at least say something like "- where boolop is AND/OR".

Regards,
Greg Nancarrow
Fujitsu Australia


Re: Split xlog.c

2022-01-26 Thread Michael Paquier
On Tue, Jan 25, 2022 at 12:12:40PM +0200, Heikki Linnakangas wrote:
> In last round of review, I spotted one bug: I had mixed up the meaning of
> EndOfLogTLI. It is the TLI in the *filename* of the WAL segment that we read
> the last record from, which can be different from the TLI that the last
> record is actually on. All existing tests were passing with that bug, so I
> added a test case to cover that case.

FYI, this overlaps with a different patch sent recently, as of this
thread:
https://www.postgresql.org/message-id/CAAJ_b94Vjt5cXGza_1MkjLQWciNdEemsmiWuQj0d=m7jfja...@mail.gmail.com
--
Michael




Re: TAP test to cover "EndOfLogTLI != replayTLI" case

2022-01-26 Thread Michael Paquier
On Thu, Jan 20, 2022 at 12:13:08PM +0530, Amul Sul wrote:
> I found the cause of the test failing on Windows -- it is due to the
> custom archive command setting, which wasn't setting the correct Windows
> archive directory path.

After reading this patch and this thread, I have noticed that you are
testing the same thing as Heikki here, patch 0001:
https://www.postgresql.org/message-id/52bc9ccd-8591-431b-0086-15d9acf25...@iki.fi

The patch sent on the other thread has a better description and shape,
so perhaps we'd better drop what is posted here in favor of the other
version.  Thoughts?
--
Michael




Re: Output clause for Upsert aka INSERT...ON CONFLICT

2022-01-26 Thread Justin Pryzby
On Thu, Jan 27, 2022 at 10:24:14AM +0530, Anand Sowmithiran wrote:
> The INSERT...ON CONFLICT is used for doing upserts in one of our app.
> Our app works with both MS SQL and Postgresql, based on customer needs.
> 
> Unlike the MS SQL MERGE command's OUTPUT clause that gives the $action
> 
> [INSERT / UPDATE  /DELETE] that was done during the upsert, the RETURNING
> clause of the pgsql does not return the action done.
> We need this so that the application can use that for auditing and UI
> purposes.
> Is there any workaround to get this info ?

Thomas already answered about the xmax hack, but I dug these up in the
meantime.

https://www.postgresql.org/message-id/flat/CAA-aLv4d%3DzHnx%2BzFKqoszT8xRFpdeRNph1Z2uhEYA33bzmgtaA%40mail.gmail.com#899e15b8b357c6b29c51d94a0767a601
https://www.postgresql.org/message-id/flat/1565486215.7551.0%40finefun.com.au
https://www.postgresql.org/message-id/flat/20190724232439.lpxzjw2jg3ukgcqn%40alap3.anarazel.de
https://www.postgresql.org/message-id/flat/DE57F14C-DB96-4F17-9254-AD0AABB3F81F%40mackerron.co.uk
https://www.postgresql.org/message-id/CAM3SWZRmkVqmRCs34YtZPOCn%2BHmHqtcdEmo6%3D%3Dnqz1kNA43DVw%40mail.gmail.com

https://stackoverflow.com/questions/39058213/postgresql-upsert-differentiate-inserted-and-updated-rows-using-system-columns-x/39204667
https://stackoverflow.com/questions/40878027/detect-if-the-row-was-updated-or-inserted/40880200#40880200




Re: row filtering for logical replication

2022-01-26 Thread Peter Smith
On Thu, Jan 27, 2022 at 9:40 AM Greg Nancarrow  wrote:
>
> On Wed, Jan 26, 2022 at 2:08 PM houzj.f...@fujitsu.com
>  wrote:
> >
> > There was a miss in the posted patch which didn't initialize the parameter 
> > in
> > RelationBuildPublicationDesc, sorry for that. Attach the correct patch this 
> > time.
> >
>
> A few comments for the v71-0001 patch:
...
> (2) check_simple_rowfilter_expr_walker
>
> In the function header:
> (i) "etc" should be "etc."
> (ii)
> Is
>
> + * - (Var Op Const) Bool (Var Op Const)
>
>meant to be:
>
> + * - (Var Op Const) Logical-Op (Var Op Const)
>
> ?
>
> It's not clear what "Bool" means here.

The comment is only intended as a generic example of the kinds of
acceptable expression format.

The names in the comment used are roughly equivalent to the Node* tag names.

This particular example is for an expression with AND/OR/NOT, which is
handled by a BoolExpr.

There is no such animal as LogicalOp, so rather than changing it as you
suggest, I feel that if this comment is going to change then it would be
better to change it to "boolop" (because the BoolExpr struct has a
boolop member). e.g.

BEFORE
+ * - (Var Op Const) Bool (Var Op Const)
AFTER
+ * - (Var Op Const) boolop (Var Op Const)

--
Kind Regards,
Peter Smith.
Fujitsu Australia.




Re: Output clause for Upsert aka INSERT...ON CONFLICT

2022-01-26 Thread Thomas Munro
On Thu, Jan 27, 2022 at 5:54 PM Anand Sowmithiran  wrote:
> Is there any workaround to get this info ?

There's an undocumented trick:

https://stackoverflow.com/questions/34762732/how-to-find-out-if-an-upsert-was-an-update-with-postgresql-9-5-upsert
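
For reference, the trick described there boils down to checking xmax in
the RETURNING list -- a sketch with a made-up table, relying on the
undocumented detail that a row updated by ON CONFLICT carries a nonzero
xmax while a freshly inserted row has xmax = 0:

CREATE TABLE items (id int PRIMARY KEY, val text);

INSERT INTO items (id, val) VALUES (1, 'x')
ON CONFLICT (id) DO UPDATE SET val = EXCLUDED.val
RETURNING id, (xmax = 0) AS inserted;  -- true = inserted, false = updated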




Re: Skipping logical replication transactions on subscriber side

2022-01-26 Thread Masahiko Sawada
On Wed, Jan 26, 2022 at 8:02 PM Amit Kapila  wrote:
>
> On Wed, Jan 26, 2022 at 12:51 PM Masahiko Sawada  
> wrote:
> >
> > On Wed, Jan 26, 2022 at 1:43 PM David G. Johnston
> >  wrote:
> > >
> > > We probably should just provide an option for the user to specify 
> > > "subrelid".  If null, only the main apply worker will skip the given xid, 
> > > otherwise only the worker tasked with syncing that particular table will 
> > > do so.  It might take a sequence of ALTER SUBSCRIPTION SET commands to 
> > > get a broken initial table synchronization to load completely but at 
> > > least there will not be any surprises as to which tables had transactions 
> > > skipped and which did not.
> >
> > That would work but I’m concerned that the users can specify it
> > properly. Also, we would need to change the errcontext message
> > generated by apply_error_callback() so the user can know that the
> > error occurred in either apply worker or tablesync worker.
> >
> > Or, as another idea, since an error during table synchronization is
> > not common and could be resolved by truncating the table and
> > restarting the synchronization in practice, there might be no need
> > this much and we can support it only for apply worker errors.
> >
>
> Yes, that is what I have also in mind. We can always extend this
> feature for tablesync process because it can not only fail for the
> specified skip_xid but also for many other reasons during the initial
> copy.

I'll update the patch accordingly to test and verify this approach.

In the meantime, I'd like to discuss possible ideas for storing the
error XID somewhere the worker can see it even after a restart. It has
been proposed that the worker update the catalog when an error
occurs, which was criticized because updating the catalog in such a
situation is not a good idea.

The next idea I considered was to store the error XID somewhere in
shmem (e.g., ReplicationState). But it requires at least as many
entries as there are subscriptions in principle, not
max_logical_replication_workers. Since we don't know that number at
startup time, we would need to use DSM or a cache with a fixed number
of entries. It seems overkill to me.

The third idea, which seems slightly better than the others, is to have
the launcher process, not the worker process, update the catalog: when
an error occurs, the apply worker stores the error XID (and maybe its
subscription OID) in its LogicalRepWorker entry, and the launcher
updates the corresponding entry of the pg_subscription catalog before
launching workers. After the worker restarts, it clears the error XID
in the catalog if it successfully applied the transaction with the
error XID. The user can enable the transaction-skipping behavior with a
query, say ALTER SUBSCRIPTION SKIP ENABLED. The user cannot enable the
skipping behavior if the error XID is not set. If the skipping
behavior is enabled and the error XID is a valid value, the worker
skips the transaction and then clears both the error XID and the flag
for the skipping behavior in the catalog.

With this idea, we don't need a complex mechanism to store the error
XID for each subscription and can ensure that only the transaction in
question is skipped. But my concern is that the launcher updates the
catalog. Since it doesn't connect to any database, it probably cannot
open the catalog indexes (because that requires a lookup of pg_class).
Therefore, we have to use in-place updates here. Through quick tests,
I've confirmed that using heap_inplace_update() to update the error XID
on pg_subscription tuples seems to work, but I'm not sure that using an
in-place update here is a legitimate approach.

What do you think and any ideas?


Regards,

--
Masahiko Sawada
EDB:  https://www.enterprisedb.com/




Re: Design of pg_stat_subscription_workers vs pgstats

2022-01-26 Thread Andres Freund
Hi,

I didn't quite get to responding in depth, but I wanted to at least respond to
one point today.

On 2022-01-25 20:27:07 +0900, Masahiko Sawada wrote:
> > The pgstat entries are quite wide (292 bytes), because of the error message
> > stored. That's nearly twice the size of PgStat_StatTabEntry. And as far as I
> > can tell, once there was an error, we'll just keep the stats entry around
> > until the subscription is dropped.
> 
> We can drop the particular statistics by
> pg_stat_reset_subscription_worker() function.

Only if either the user wants to drop all stats, or somehow knows the oids of
already dropped tables...



> > Why isn't this just storing data in pg_subscription_rel?
> 
> These need to be updated on error which means for a failed xact and we
> don't want to update the system catalog in that state.

Rightly so! In fact, I'm concerned with sending a pgstats message in that
state as well. But: You don't need to. Just abort the current transaction,
start a new one, and update the state.


> There will be some challenges in a case where updating pg_subscription_rel
> also failed too (what to report to the user, etc.). And moreover, we don't
> want to consume space for temporary information in the system catalog.

You're consuming resources in a *WAY* worse way right now. The stats file gets
constantly written out, and quite often read back by backends. In contrast to
parts of pg_subscription_rel or such that data can't be removed from
shared_buffers under pressure.

Greetings,

Andres Freund




Re: Printing backtrace of postgres processes

2022-01-26 Thread vignesh C
On Wed, Jan 26, 2022 at 11:07 AM Bharath Rupireddy
 wrote:
>
> On Tue, Jan 25, 2022 at 12:00 PM vignesh C  wrote:
> > Thanks for the comments, attached v17 patch has the fix for the same.
>
> Have a minor comment on v17:
>
> can we modify the elog(LOG, to new style ereport(LOG, ?
> + elog(LOG_SERVER_ONLY, "current backtrace:%s", errtrace.data);
>
> /*--
>  * New-style error reporting API: to be used in this way:
>  *  ereport(ERROR,
>  *  errcode(ERRCODE_UNDEFINED_CURSOR),
>  *  errmsg("portal \"%s\" not found", stmt->portalname),
>  *  ... other errxxx() fields as needed ...);
>  *

Attached v18 patch has the changes for the same.

Regards,
Vignesh
From 3599e6940e16eb164737a8428af4b8ba5ef6bf59 Mon Sep 17 00:00:00 2001
From: Vigneshwaran C 
Date: Tue, 25 Jan 2022 08:21:22 +0530
Subject: [PATCH v18] Add function to log the backtrace of the specified
 postgres process.

This commit adds pg_log_backtrace() function that requests to log
the backtrace of the specified backend or auxiliary process except
logger and statistic collector.

Only superusers are allowed to request to log the backtrace
because allowing any users to issue this request at an unbounded rate
would cause lots of log messages and which can lead to denial of service.

On receipt of the request, at the next CHECK_FOR_INTERRUPTS(),
the target backend logs its backtrace at LOG_SERVER_ONLY level,
so that the backtrace will appear in the server log but not
be sent to the client.

Bump catalog version.

Authors: Vignesh C, Bharath Rupireddy
---
 doc/src/sgml/func.sgml| 62 +++
 src/backend/catalog/system_functions.sql  |  2 +
 src/backend/postmaster/autovacuum.c   |  4 ++
 src/backend/postmaster/checkpointer.c |  4 ++
 src/backend/postmaster/interrupt.c|  4 ++
 src/backend/postmaster/pgarch.c   | 10 +++-
 src/backend/postmaster/startup.c  |  4 ++
 src/backend/postmaster/walwriter.c|  4 ++
 src/backend/storage/ipc/procarray.c   | 56 +
 src/backend/storage/ipc/procsignal.c  | 42 +
 src/backend/storage/ipc/signalfuncs.c | 73 ---
 src/backend/tcop/postgres.c   |  4 ++
 src/backend/utils/adt/mcxtfuncs.c | 40 +++--
 src/backend/utils/error/elog.c| 19 --
 src/backend/utils/init/globals.c  |  1 +
 src/include/catalog/pg_proc.dat   |  5 ++
 src/include/miscadmin.h   |  1 +
 src/include/storage/procarray.h   |  2 +
 src/include/storage/procsignal.h  |  3 +-
 src/include/utils/elog.h  |  2 +
 src/test/regress/expected/backtrace.out   | 49 +++
 src/test/regress/expected/backtrace_1.out | 55 +
 src/test/regress/parallel_schedule|  2 +-
 src/test/regress/sql/backtrace.sql| 33 ++
 24 files changed, 416 insertions(+), 65 deletions(-)
 create mode 100644 src/test/regress/expected/backtrace.out
 create mode 100644 src/test/regress/expected/backtrace_1.out
 create mode 100644 src/test/regress/sql/backtrace.sql

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 0ee6974f1c..855ccc8902 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -25318,6 +25318,30 @@ SELECT collation for ('foo' COLLATE "de_DE");

   
 
+  
+   
+
+ pg_log_backtrace
+
+pg_log_backtrace ( pid integer )
+boolean
+   
+   
+Requests to log the backtrace of the backend with the
+specified process ID.  This function can send the request to
+backends and auxiliary processes except logger and statistics
+collector.  These backtraces will be logged at LOG
+message level. They will appear in the server log based on the log
+configuration set (See  for
+more information), but will not be sent to the client regardless of
+. A backtrace identifies
+which function a process is currently executing and it may be useful
+for developers to diagnose stuck processes and other problems. This
+function is supported only if PostgreSQL was built with the ability to
+capture backtraces, otherwise it will emit a warning.
+   
+  
+
   

 
@@ -25537,6 +25561,44 @@ LOG:  Grand total: 1651920 bytes in 201 blocks; 622360 free (88 chunks); 1029560
 because it may generate a large number of log messages.

 
+   
+pg_log_backtrace can be used to log the backtrace of
+a backend process. For example:
+
+postgres=# select pg_log_backtrace(pg_backend_pid());
+ pg_log_backtrace
+--
+ t
+(1 row)
+
+The backtrace will be logged as specified by the logging configuration.
+For example:
+
+2021-01-27 11:33:50.247 IST [111735] LOG:  current backtrace:
+postgres: postgresdba postgres [local] SELECT(set_backtrace+0x38) [0xae06c5]
+

Re: Output clause for Upsert aka INSERT...ON CONFLICT

2022-01-26 Thread David G. Johnston
On Wednesday, January 26, 2022, Anand Sowmithiran 
wrote:

> The INSERT...ON CONFLICT is used for doing upserts in one of our app.
> Our app works with both MS SQL and Postgresql, based on customer needs.
>
> Unlike the MS SQL MERGE command's OUTPUT clause that gives the $action
> 
> [INSERT / UPDATE  /DELETE] that was done during the upsert, the RETURNING
> clause of the pgsql does not return the action done.
> We need this so that the application can use that for auditing and UI
> purposes.
>


> Is there any workaround to get this info ?
>

There is not.  But I’d presume the correct trigger is fired for whichever
DML is ultimately applied so maybe you have a way through that.
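
A rough sketch of that trigger-based approach -- purely illustrative,
with made-up table and trigger names; TG_OP records which DML actually
fired:

CREATE TABLE upsert_audit (tbl text, action text, logged_at timestamptz DEFAULT now());

CREATE FUNCTION record_upsert_action() RETURNS trigger
LANGUAGE plpgsql AS $$
BEGIN
    INSERT INTO upsert_audit (tbl, action) VALUES (TG_TABLE_NAME, TG_OP);
    RETURN NULL;  -- the result of an AFTER ROW trigger is ignored
END;
$$;

-- attach to the upsert target table (hypothetically named items here)
CREATE TRIGGER note_upsert_action AFTER INSERT OR UPDATE ON items
    FOR EACH ROW EXECUTE FUNCTION record_upsert_action();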


> Or is there a way this enhancement can be requested in future PG versions ?
>
>
You just did.  There is nothing formal.  But presently there isn’t anyone
championing improvements to this feature (just my unresearched impression,
searching our public mailing lists and commitfest would let you form a
researched impression).

David J.


Re: Is it correct to update db state in control file as "shutting down" during end-of-recovery checkpoint?

2022-01-26 Thread Michael Paquier
On Wed, Jan 26, 2022 at 09:57:59AM +0900, Michael Paquier wrote:
> Yeah, that sounds like a good compromise.  At least, I find the whole
> a bit easier to follow.

So, I have been checking this idea in details, and spotted what looks
like one issue in CreateRestartPoint(), as of:
/*
 * Update pg_control, using current time.  Check that it still shows
 * DB_IN_ARCHIVE_RECOVERY state and an older checkpoint, else do 
nothing;
 * this is a quick hack to make sure nothing really bad happens if 
somehow
 * we get here after the end-of-recovery checkpoint.
 */
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
ControlFile->checkPointCopy.redo < lastCheckPoint.redo)

This change widens the window in which this code path is reachable if an
end-of-recovery checkpoint is triggered but not yet finished at the end of
recovery (possible of course at the end of crash recovery, but
DB_IN_ARCHIVE_RECOVERY is also possible when !IsUnderPostmaster with a
promotion request), before ControlFile->checkPointCopy is updated at the
end of the checkpoint, because the state could still be
DB_IN_ARCHIVE_RECOVERY.  The window is wider the longer the
end-of-recovery checkpoint takes.  And this would be the case for an
instance restarted while a restart point is created.

> Heikki was planning to commit a large refactoring of xlog.c, so we'd
> better wait for that to happen before concluding here.

I have checked that as well, and both are independent.
--
Michael




Output clause for Upsert aka INSERT...ON CONFLICT

2022-01-26 Thread Anand Sowmithiran
The INSERT...ON CONFLICT is used for doing upserts in one of our app.
Our app works with both MS SQL and Postgresql, based on customer needs.

Unlike the MS SQL MERGE command's OUTPUT clause that gives the $action

[INSERT / UPDATE  /DELETE] that was done during the upsert, the RETURNING
clause of the pgsql does not return the action done.
We need this so that the application can use that for auditing and UI
purposes.
Is there any workaround to get this info ?
Or is there a way this enhancement can be requested in future PG versions ?

thanks,
Anand.


Re: Two noncritical bugs of pg_waldump

2022-01-26 Thread Kyotaro Horiguchi
(this is off-topic)

At Thu, 27 Jan 2022 13:23:06 +0900 (JST), Kyotaro Horiguchi 
 wrote in 
> At Wed, 26 Jan 2022 17:25:14 -0800, Nathan Bossart  
> wrote in 
> > On Thu, Jan 27, 2022 at 10:07:38AM +0900, Kyotaro Horiguchi wrote:
> > > pg_waldump complains at the end in any case.  I noticed that the LSN
> > > it shows in the finish message is incorrect.  (I faintly thought that
> > > I posted about this but I didn't find it..)
> > 
> > Is this thread [0] what you are remembering?
> > 
> > [0] https://postgr.es/m/2B4510B2-3D70-4990-BFE3-0FE64041C08A%40amazon.com
> 
> Maybe exactly.  I remembered the discussion.
> 
> So the issue there is that neither EndRecPtr nor ReadRecPtr always points
> to the current read LSN. The first proposal from Nathen was to use

Mmm. Sorry for my fat finger, Nathan.

> currRecPtr, but it was a private member.  But after discussion, it
> seems to me it is (at least now) exactly what we need here, so if we
> officially exposed the member the problem would go away.  Do you
> want to continue that?
> 
> Otherwise, I think we need to add a comment there about the known
> issue.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center




Re: snapper and skink and fairywren (oh my!)

2022-01-26 Thread Noah Misch
On Wed, Jan 26, 2022 at 04:17:19PM -0500, Robert Haas wrote:
> 1. snapper failed 4 out of the last 5 runs in recoveryCheck. The
> latest run as of this writing shows this:
> 
> [19:09:50] t/026_overwrite_contrecord.pl  ok43136 ms
> # poll_query_until timed out executing this query:
> # SELECT '0/1415E310' <= replay_lsn AND state = 'streaming' FROM
> pg_catalog.pg_stat_replication WHERE application_name = 'standby_1';
> # expecting this output:
> # t
> # last actual query output:
> #

I expect commit ce6d793 will end the failures on snapper, kittiwake, and
tadarida.  I apologize for letting the damage last months.




Re: Two noncritical bugs of pg_waldump

2022-01-26 Thread Kyotaro Horiguchi
At Wed, 26 Jan 2022 17:25:14 -0800, Nathan Bossart  
wrote in 
> On Thu, Jan 27, 2022 at 10:07:38AM +0900, Kyotaro Horiguchi wrote:
> > pg_waldump complains at the end in any case.  I noticed that the LSN
> > it shows in the finish message is incorrect.  (I faintly thought that
> > I posted about this but I didn't find it..)
> 
> Is this thread [0] what you are remembering?
> 
> [0] https://postgr.es/m/2B4510B2-3D70-4990-BFE3-0FE64041C08A%40amazon.com

Maybe exactly.  I remembered the discussion.

So the issue there is that neither EndRecPtr nor ReadRecPtr always points
to the current read LSN. The first proposal from Nathen was to use
currRecPtr, but it was a private member.  But after discussion, it
seems to me it is (at least now) exactly what we need here, so if we
officially exposed the member the problem would go away.  Do you
want to continue that?

Otherwise, I think we need to add a comment there about the known
issue.

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center




Re: snapper and skink and fairywren (oh my!)

2022-01-26 Thread Tom Lane
Kyotaro Horiguchi  writes:
> I'm not sure why walsender of the standby continued running not knowing the 
> primary has been once dead for such a long time.

Isn't this precisely the problem that made us revert the "graceful
disconnection" patch [1]?  Munro seems to have a theory about
why that broke things, but in any case it's not Haas' fault ;-)

regards, tom lane

[1] 
https://www.postgresql.org/message-id/flat/CA%2BhUKG%2BOeoETZQ%3DQw5Ub5h3tmwQhBmDA%3DnuNO3KG%3DzWfUypFAw%40mail.gmail.com




Re: pg_log_backend_memory_contexts() and log level

2022-01-26 Thread Fujii Masao




On 2022/01/26 14:28, torikoshia wrote:

On 2022-01-26 13:17, Fujii Masao wrote:

Hi,

pg_log_backend_memory_contexts() should be designed not to send the
messages about the memory contexts to the client regardless of
client_min_messages. But I found that the message "logging memory
contexts of PID %d" can be sent to the client because it's ereport()'d
with LOG level instead of LOG_SERVER_ONLY. Is this a bug, and
shouldn't we use LOG_SERVER_ONLY level to log that message? Patch
attached.

Regards,


Thanks! I think it's a bug.

There are two clients: "the client that executed pg_log_backend_memory_contexts()" and 
"the client that issued the query that was the target of the memory context logging", but 
I was only concerned about the former when I wrote that part of the code.
The latter is not expected.

It seems better to use LOG_SERVER_ONLY as the attached patch.

The patch was successfully applied and there was no regression test error on my 
environment.


Thanks for the review! So barring any objection, I will commit the patch and 
backport it to v14 where pg_log_backend_memory_contexts() is added.

Regards,

--
Fujii Masao
Advanced Computing Technology Center
Research and Development Headquarters
NTT DATA CORPORATION




Re: pg_log_backend_memory_contexts() and log level

2022-01-26 Thread Fujii Masao




On 2022/01/26 14:27, Bharath Rupireddy wrote:

On Wed, Jan 26, 2022 at 10:46 AM Bharath Rupireddy
 wrote:


On Wed, Jan 26, 2022 at 9:48 AM Fujii Masao  wrote:


Hi,

pg_log_backend_memory_contexts() should be designed not to send the messages about the 
memory contexts to the client regardless of client_min_messages. But I found that the 
message "logging memory contexts of PID %d" can be sent to the client because 
it's ereport()'d with LOG level instead of LOG_SERVER_ONLY. Is this a bug, and shouldn't 
we use LOG_SERVER_ONLY level to log that message? Patch attached.


+1. The patch LGTM.


Thanks for the review!



While we are here, I think it's enough to have the
has_function_privilege, not needed to set role regress_log_memory and
then execute the function as we already have code covering the
function above (SELECT
pg_log_backend_memory_contexts(pg_backend_pid());). Thought?

SELECT has_function_privilege('regress_log_memory',
   'pg_log_backend_memory_contexts(integer)', 'EXECUTE'); -- yes

SET ROLE regress_log_memory;
SELECT pg_log_backend_memory_contexts(pg_backend_pid());
RESET ROLE;


Or is it enough to set the role and execute the function? Either of those or
has_function_privilege() could be removed from the test, but I don't have a strong
reason to do that for now...

Regards,

--
Fujii Masao
Advanced Computing Technology Center
Research and Development Headquarters
NTT DATA CORPORATION




Re: snapper and skink and fairywren (oh my!)

2022-01-26 Thread Kyotaro Horiguchi
At Wed, 26 Jan 2022 18:45:53 -0500, Andrew Dunstan  wrote 
in 
> It's very unlikely any of this is your fault. In any case, intermittent
> failures are very hard to nail down.

The primary started at 2022-01-26 16:30:06.613, then accepted a connection
from the standby at 2022-01-26 16:30:09.911.

P: 2022-01-26 16:30:06.613 UTC [74668:1] LOG:  starting PostgreSQL 15devel on 
x86_64-w64-mingw32, compiled by gcc.exe (Rev2, Built by MSYS2 project) 10.3.0, 
64-bit
S: 2022-01-26 16:30:09.637 UTC [72728:1] LOG:  starting PostgreSQL 15devel on 
x86_64-w64-mingw32, compiled by gcc.exe (Rev2, Built by MSYS2 project) 10.3.0, 
64-bit
P: 2022-01-26 16:30:09.911 UTC [74096:3] [unknown] LOG:  replication connection 
authorized: user=pgrunner application_name=standby
S: 2022-01-26 16:30:09.912 UTC [73932:1] LOG:  started streaming WAL from 
primary at 0/300 on timeline 1

After this the primary restarts.

P: 2022-01-26 16:30:11.832 UTC [74668:7] LOG:  database system is shut down
P: 2022-01-26 16:30:12.010 UTC [72140:1] LOG:  starting PostgreSQL 15devel on 
x86_64-w64-mingw32, compiled by gcc.exe (Rev2, Built by MSYS2 project) 10.3.0, 
64-bit

But the standby doesn't notice the disconnection and continues with the
poll_query_until() on the stale connection. Still, the query should
finally have been executed after reconnection.  The following log lines are
not thinned out.

S: 2022-01-26 16:30:09.912 UTC [73932:1] LOG:  started streaming WAL from 
primary at 0/300 on timeline 1
S: 2022-01-26 16:30:12.825 UTC [72760:1] [unknown] LOG:  connection received: 
host=127.0.0.1 port=60769
S: 2022-01-26 16:30:12.830 UTC [72760:2] [unknown] LOG:  connection 
authenticated: identity="EC2AMAZ-P7KGG90\\pgrunner" method=sspi 
(C:/tools/msys64/home/pgrunner/bf/root/HEAD/pgsql.build/src/test/modules/commit_ts/tmp_check/t_003_standby_2_standby_data/pgdata/pg_hba.conf:2)
S: 2022-01-26 16:30:12.830 UTC [72760:3] [unknown] LOG:  connection authorized: 
user=pgrunner database=postgres application_name=003_standby_2.pl
S: 2022-01-26 16:30:12.838 UTC [72760:4] 003_standby_2.pl LOG:  statement: 
SELECT '0/303C7D0'::pg_lsn <= pg_last_wal_replay_lsn()

Since the standby doesn't receive WAL from the restarted server, the
poll_query_until() doesn't return.

I'm not sure why the walsender of the standby continued running for such
a long time without knowing that the primary had once died.

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center




Re: Question on partition pruning involving stable operator: timestamptz_ge_date

2022-01-26 Thread Amit Langote
Hi,

On Thu, Jan 27, 2022 at 10:28 AM TAO TANG  wrote:
> the plan shows all the partitions are pruned, but gdb tracing shows that
> the pruning happens in ExecInitAppend, and during the planning stage pg does not
> prune any partitions. This is because the function
> match_clause_to_partition_key
> does not handle the case of a STABLE operator:
>
> if (op_volatile(opno) != PROVOLATILE_IMMUTABLE)
> {
> context->has_mutable_op = true;
>
> /*
> * When pruning in the planner, we cannot prune with mutable
> * operators.
> */
> if (context->target == PARTTARGET_PLANNER)
> return PARTCLAUSE_UNSUPPORTED;
> }
>
> the procs for timestamptz compare with date are STABLE:
>
>proname| provolatile
> --+-
>  timestamptz_lt_date  | s
>  timestamptz_le_date  | s
>  timestamptz_eq_date  | s
>  timestamptz_gt_date  | s
>  timestamptz_ge_date  | s
>  timestamptz_ne_date  | s
>  timestamptz_cmp_date | s
> (7 rows)
>
> but ExecInitAppend calls perform_pruning_base_step, which does not consider
> the STABLE
> property of the cmpfn.
>
> So I have several questions:
> 1) why is the function volatility considered in planning but not in
> ExecInitAppend;

The value of a STABLE expression can change based on runtime
parameters, so while it is guaranteed to remain the same during a
particular execution of a plan in which it is contained, it can change
across multiple executions of that plan (if it is cached, for
example).  So the planner cannot assume a particular value of such
expressions when choosing partitions to add to the plan, because each
execution of the plan (each may run in a separate transaction) can
produce different values.  ExecInitAppend(), on the other hand, can
assume a particular value when choosing partitions to initialize,
because the value is fixed for a particular execution during which it
runs.

> 2) why is timestamptz_xxx_date STABLE, not IMMUTABLE;

Because calculations involving timestamptz values can produce
different results depending on volatile settings like timezone,
datestyle, etc.
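
For instance, the date operand of these operators is converted to a
timestamptz at local midnight, so the same comparison can give
different answers under different TimeZone settings (values here are
chosen arbitrarily for illustration):

SET timezone TO 'UTC';
SELECT '2021-08-02 03:00+00'::timestamptz >= '2021-08-02'::date;  -- true
SET timezone TO 'America/Los_Angeles';
SELECT '2021-08-02 03:00+00'::timestamptz >= '2021-08-02'::date;  -- false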

-- 
Amit Langote
EDB: http://www.enterprisedb.com




Re: Make mesage at end-of-recovery less scary.

2022-01-26 Thread Kyotaro Horiguchi
At Tue, 25 Jan 2022 17:34:56 +0900 (JST), Kyotaro Horiguchi 
 wrote in 
> This v8 is changed in...
> 
> - Added tests to 011_crash_recovery.pl
> 
> - Fixed a bug that server emits "end-of-wal" messages even if it have
>   emitted an error message for the same LSN.
> 
> - Changed XLogReaderValidatePageHeader() so that it recognizes an
>   empty page as end-of-WAL.
> 
> - Made pg_waldump conscious of end-of-wal.
> 
> While doing the last item, I noticed that pg_waldump shows the wrong
> LSN as the error position.  Concretely it emits the LSN of the last
> sound WAL record as the error position.  I will post a bug-fix patch
> for the issue after confirmation.

I noticed that I added a useless error message "garbage record
header", but it is a kind of invalid record length.  So I removed the
message. That change makes the logic for EOW in ValidXLogRecordHeader
and XLogReaderValidatePageHeader share the same flow.

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
>From 57cb251f7cacbb96066ead4543b9f12f5b3c7062 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi 
Date: Fri, 28 Feb 2020 15:52:58 +0900
Subject: [PATCH v9] Make End-Of-Recovery error less scary

When recovery in any type ends, we see a bit scary error message like
"invalid record length" that suggests something serious is
happening. Actually if recovery meets a record with length = 0, that
usually means it finished applying all available WAL records.

Make this message less scary as "reached end of WAL". Instead, raise
the error level for other kind of WAL failure to WARNING.
---
 src/backend/access/transam/xlog.c |  91 +-
 src/backend/access/transam/xlogreader.c   |  61 
 src/backend/replication/walreceiver.c |   3 +-
 src/bin/pg_waldump/pg_waldump.c   |  13 ++-
 src/include/access/xlogreader.h   |   1 +
 src/test/recovery/t/011_crash_recovery.pl | 110 +-
 6 files changed, 251 insertions(+), 28 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index dfe2a0bcce..5727e0939f 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -4480,6 +4480,7 @@ ReadRecord(XLogReaderState *xlogreader, int emode,
 	for (;;)
 	{
 		char	   *errormsg;
+		XLogRecPtr	ErrRecPtr = InvalidXLogRecPtr;
 
		record = XLogReadRecord(xlogreader, &errormsg);
 		if (record == NULL)
@@ -4495,6 +4496,18 @@ ReadRecord(XLogReaderState *xlogreader, int emode,
 			{
 abortedRecPtr = xlogreader->abortedRecPtr;
 missingContrecPtr = xlogreader->missingContrecPtr;
+ErrRecPtr = abortedRecPtr;
+			}
+			else
+			{
+/*
+ * NULL ReadRecPtr means we could not read a record at the
+ * beginning. In that case EndRecPtr is storing the LSN of the
+ * record we tried to read.
+ */
+ErrRecPtr =
+	xlogreader->ReadRecPtr ?
+	xlogreader->ReadRecPtr : xlogreader->EndRecPtr;
 			}
 
 			if (readFile >= 0)
@@ -4504,13 +4517,12 @@ ReadRecord(XLogReaderState *xlogreader, int emode,
 			}
 
 			/*
-			 * We only end up here without a message when XLogPageRead()
-			 * failed - in that case we already logged something. In
-			 * StandbyMode that only happens if we have been triggered, so we
-			 * shouldn't loop anymore in that case.
+			 * If we get here for other than end-of-wal, emit the error message
+			 * right now. Otherwise the message if any is shown as a part of
+			 * the end-of-WAL message below.
 			 */
-			if (errormsg)
-ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
+			if (!xlogreader->EndOfWAL && errormsg)
+ereport(emode_for_corrupt_record(emode, ErrRecPtr),
 		(errmsg_internal("%s", errormsg) /* already translated */ ));
 		}
 
@@ -4541,11 +4553,12 @@ ReadRecord(XLogReaderState *xlogreader, int emode,
 			/* Great, got a record */
 			return record;
 		}
-		else
+
+		/* No valid record available from this source */
+		lastSourceFailed = true;
+
+		if (!fetching_ckpt)
 		{
-			/* No valid record available from this source */
-			lastSourceFailed = true;
-
 			/*
 			 * If archive recovery was requested, but we were still doing
 			 * crash recovery, switch to archive recovery and retry using the
@@ -4558,11 +4571,17 @@ ReadRecord(XLogReaderState *xlogreader, int emode,
 			 * we'd have no idea how far we'd have to replay to reach
 			 * consistency.  So err on the safe side and give up.
 			 */
-			if (!InArchiveRecovery && ArchiveRecoveryRequested &&
-!fetching_ckpt)
+			if (!InArchiveRecovery && ArchiveRecoveryRequested)
 			{
+/*
+ * We don't report this as LOG, since we don't stop recovery
+ * here
+ */
 ereport(DEBUG1,
-		(errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
+		(errmsg_internal("reached end of WAL at %X/%X on timeline %u in %s during crash recovery, entering archive recovery",
+		 LSN_FORMAT_ARGS(ErrRecPtr),
+		 replayTLI,
+		 xlogSourceNames[currentSource])));
 		

Question on partition pruning involving stable operator: timestamptz_ge_date

2022-01-26 Thread TAO TANG
Hi,

I tested the following case in PostgreSQL master:58e2e6

the partition table created:

create table tbl_dts (dts timestamp with time zone not null) partition
by range(dts);
create table tbl_dts_1 partition of tbl_dts for values from
('2021-07-02') to ('2021-08-01');
create table tbl_dts_2 partition of tbl_dts for values from
('2021-08-02') to ('2021-09-01');
create table tbl_dts_3 partition of tbl_dts for values from
('2021-09-02') to ('2021-10-01');
create table tbl_dts_4 partition of tbl_dts for values from
('2021-10-02') to ('2021-11-01');

and the query:

explain select * from tbl_dts where dts between '2022-01-20'::date and
'2022-01-26'::date;

 QUERY PLAN
-
 Append  (cost=0.00..175.82 rows=44 width=8)
   Subplans Removed: 4
(2 rows)

the plan shows all the partitions are pruned, but gdb tracing shows that
the pruning happens in ExecInitAppend, and during the planning stage pg does not
prune any partitions. This is because the function
match_clause_to_partition_key
does not handle the case of a STABLE operator:

if (op_volatile(opno) != PROVOLATILE_IMMUTABLE)
{
context->has_mutable_op = true;

/*
* When pruning in the planner, we cannot prune with mutable
* operators.
*/
if (context->target == PARTTARGET_PLANNER)
return PARTCLAUSE_UNSUPPORTED;
}

the procs for timestamptz compare with date are STABLE:

   proname| provolatile
--+-
 timestamptz_lt_date  | s
 timestamptz_le_date  | s
 timestamptz_eq_date  | s
 timestamptz_gt_date  | s
 timestamptz_ge_date  | s
 timestamptz_ne_date  | s
 timestamptz_cmp_date | s
(7 rows)

but ExecInitAppend calls perform_pruning_base_step, which does not consider
the STABLE
property of the cmpfn.

So I have several questions:
1) why is the function volatility considered in planning but not in
ExecInitAppend;
2) why is timestamptz_xxx_date STABLE, not IMMUTABLE;

thanks.


Re: Two noncritical bugs of pg_waldump

2022-01-26 Thread Nathan Bossart
On Thu, Jan 27, 2022 at 10:07:38AM +0900, Kyotaro Horiguchi wrote:
> pg_waldump complains at the end in any case.  I noticed that the LSN
> it shows in the finish message is incorrect.  (I faintly thought that
> I posted about this but I didn't find it..)

Is this thread [0] what you are remembering?

[0] https://postgr.es/m/2B4510B2-3D70-4990-BFE3-0FE64041C08A%40amazon.com

-- 
Nathan Bossart
Amazon Web Services: https://aws.amazon.com/




Re: row filtering for logical replication

2022-01-26 Thread Greg Nancarrow
On Wed, Jan 26, 2022 at 2:08 PM houzj.f...@fujitsu.com
 wrote:
>
> There was a miss in the posted patch which didn't initialize the parameter in
> RelationBuildPublicationDesc, sorry for that. Attach the correct patch this 
> time.
>

I have some additional doc update suggestions for the v71-0001 patch:


(1) Patch commit comment

BEFORE:
row filter evaluates to NULL, it returns false. The WHERE clause only
AFTER:
row filter evaluates to NULL, it is regarded as "false". The WHERE clause only


doc/src/sgml/catalogs.sgml

(2) ALTER PUBLICATION

BEFORE:
+  expression returns
false or null will
AFTER:
+  expression
evaluates to false or null will


doc/src/sgml/ref/alter_subscription.sgml

(3) ALTER SUBSCRIPTION

BEFORE:
+  filter WHERE clause had been modified.
AFTER:
+  filter WHERE clause has since been modified.


doc/src/sgml/ref/create_publication.sgml

(4) CREATE PUBLICATION

BEFORE:
+  which the expression returns
+  false or null will not be published. Note that parentheses are required
AFTER:
+  which the expression evaluates
+  to false or null will not be published. Note that parentheses
are required


doc/src/sgml/ref/create_subscription.sgml

(5) CREATE SUBSCRIPTION

BEFORE:
+   returns false or null will not be published. If the subscription has several
AFTER:
+   evaluates to false or null will not be published. If the
subscription has several


Regards,
Greg Nancarrow
Fujitsu Australia




Two noncritical bugs of pg_waldump

2022-01-26 Thread Kyotaro Horiguchi
Hello.

pg_waldump complains at the end in any case.  I noticed that the LSN
it shows in the finish message is incorrect.  (I faintly thought that
I posted about this but I didn't find it..)

> pg_waldump: fatal: error in WAL record at 0/15073F8: invalid record length at 
> 0/1507470: wanted 24, got 0

xlogreader found the error at the record that begins at 1507470, but
pg_waldump says that the error happened at 15073F8, which is actually the
beginning of the last sound record.


If I give an empty file to the tool it complains as the follows.

> pg_waldump: fatal: could not read file "hoge": No such file or directory

No, the file exists.  The cause is that it reads an uninitialized errno to
detect errors from the system call.  read(2) is defined to always set
errno when it returns -1 and not otherwise.  Thus it seems to me
that it is better to check that the return value is less than zero
than to clear errno before the call to read().

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
>From 773f14f01c4a5cfd334d0a80778aa819c55c1cb7 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi 
Date: Thu, 27 Jan 2022 09:47:10 +0900
Subject: [PATCH] Fix erroneous messages of pg_waldump

Fix pg_waldump to give the LSN where read error happens instead of the
LSN of the last record successfully read. And fix to give the correct
error message for zero-sized blocks instead of a bogus system-call
error message.
---
 src/bin/pg_waldump/pg_waldump.c | 15 ++-
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c
index a6251e1a96..58dc4044b5 100644
--- a/src/bin/pg_waldump/pg_waldump.c
+++ b/src/bin/pg_waldump/pg_waldump.c
@@ -222,15 +222,12 @@ search_directory(const char *directory, const char *fname)
 	 WalSegSz),
 			fname, WalSegSz);
 		}
+		else if (r < 0)
+			fatal_error("could not read file \"%s\": %m",
+		fname);
 		else
-		{
-			if (errno != 0)
-fatal_error("could not read file \"%s\": %m",
-			fname);
-			else
-fatal_error("could not read file \"%s\": read %d of %d",
-			fname, r, XLOG_BLCKSZ);
-		}
+			fatal_error("could not read file \"%s\": read %d of %d",
+		fname, r, XLOG_BLCKSZ);
 		close(fd);
 		return true;
 	}
@@ -1177,7 +1174,7 @@ main(int argc, char **argv)
 
 	if (errormsg)
 		fatal_error("error in WAL record at %X/%X: %s",
-	LSN_FORMAT_ARGS(xlogreader_state->ReadRecPtr),
+	LSN_FORMAT_ARGS(xlogreader_state->EndRecPtr),
 	errormsg);
 
 	XLogReaderFree(xlogreader_state);
-- 
2.27.0



Re: Avoid erroring out when unable to remove or parse logical rewrite files to save checkpoint work

2022-01-26 Thread Nathan Bossart
On Fri, Jan 21, 2022 at 11:49:56AM -0800, Andres Freund wrote:
> On 2022-01-20 20:41:16 +, Bossart, Nathan wrote:
>> Here's this part.
> 
> And pushed to all branches. Thanks.

Thanks!

I spent some time thinking about the right way to proceed here, and I came
up with the attached patches.  The first patch just adds error checking for
various lstat() calls in the replication code.  If lstat() fails, then it
probably doesn't make sense to try to continue processing the file.

The second patch changes some nearby calls to ereport() to ERROR.  If these
failures are truly unexpected, and we don't intend to support use-cases
like concurrent manual deletion, then failing might be the right way to go.
I think it's a shame that such failures could cause checkpointing to
continually fail, but that topic is already being discussed elsewhere [0].

[0] https://postgr.es/m/C1EE64B0-D4DB-40F3-98C8-0CED324D34CB%40amazon.com

-- 
Nathan Bossart
Amazon Web Services: https://aws.amazon.com/
>From df7a07d27c2858ec3ac383335af3e55e5428e3b1 Mon Sep 17 00:00:00 2001
From: Nathan Bossart 
Date: Wed, 26 Jan 2022 14:03:11 -0800
Subject: [PATCH v4 1/2] add error checking for calls to lstat() in replication
 code

---
 src/backend/access/heap/rewriteheap.c   | 6 +-
 src/backend/replication/logical/reorderbuffer.c | 6 +-
 src/backend/replication/logical/snapbuild.c | 6 +-
 src/backend/replication/slot.c  | 6 +-
 4 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 2a53826736..81319e0c78 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -1227,7 +1227,11 @@ CheckPointLogicalRewriteHeap(void)
 			continue;
 
 		snprintf(path, sizeof(path), "pg_logical/mappings/%s", mapping_de->d_name);
-		if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
+		if (lstat(path, &statbuf) != 0)
+			ereport(ERROR,
+	(errcode_for_file_access(),
+	 errmsg("could not stat file \"%s\": %m", path)));
+		else if (!S_ISREG(statbuf.st_mode))
 			continue;
 
 		/* Skip over files that cannot be ours. */
diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index 19b2ba2000..d8d784a42f 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -4407,7 +4407,11 @@ ReorderBufferCleanupSerializedTXNs(const char *slotname)
 	sprintf(path, "pg_replslot/%s", slotname);
 
 	/* we're only handling directories here, skip if it's not ours */
-	if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
+	if (lstat(path, &statbuf) != 0)
+		ereport(ERROR,
+(errcode_for_file_access(),
+ errmsg("could not stat file \"%s\": %m", path)));
+	else if (!S_ISDIR(statbuf.st_mode))
 		return;
 
 	spill_dir = AllocateDir(path);
diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
index 83fca8a77d..57f5a5e81f 100644
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -1955,7 +1955,11 @@ CheckPointSnapBuild(void)
 
 		snprintf(path, sizeof(path), "pg_logical/snapshots/%s", snap_de->d_name);
 
-		if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
+		if (lstat(path, &statbuf) != 0)
+			ereport(ERROR,
+	(errcode_for_file_access(),
+	 errmsg("could not stat file \"%s\": %m", path)));
+		else if (!S_ISREG(statbuf.st_mode))
 		{
 			elog(DEBUG1, "only regular files expected: %s", path);
 			continue;
diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index e5e0cf8768..21fb7536e2 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -1413,7 +1413,11 @@ StartupReplicationSlots(void)
 		snprintf(path, sizeof(path), "pg_replslot/%s", replication_de->d_name);
 
 		/* we're only creating directories here, skip if it's not our's */
-		if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
+		if (lstat(path, &statbuf) != 0)
+			ereport(ERROR,
+	(errcode_for_file_access(),
+	 errmsg("could not stat file \"%s\": %m", path)));
+		else if (!S_ISDIR(statbuf.st_mode))
 			continue;
 
 		/* we crashed while a slot was being setup or deleted, clean up */
-- 
2.25.1

>From 28a06a2cb84102a92da3fca0f0055dd67902d7de Mon Sep 17 00:00:00 2001
From: Nathan Bossart 
Date: Wed, 26 Jan 2022 16:24:04 -0800
Subject: [PATCH v4 2/2] minor improvements to replication code

---
 src/backend/replication/logical/snapbuild.c | 19 ++-
 src/backend/replication/slot.c  |  5 +
 2 files changed, 3 insertions(+), 21 deletions(-)

diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
index 57f5a5e81f..ce6cb85c1a 100644
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -1970,16 +1970,10 @@ CheckPointSnapBuild(void)
 		 * everything but are postfixed by 

Re: make MaxBackends available in _PG_init

2022-01-26 Thread Michael Paquier
On Tue, Jan 25, 2022 at 07:30:33PM +, Bossart, Nathan wrote:
> I think the patch is in decent shape.  There may be a few remaining
> places where GetMaxBackends() is called repeatedly in the same
> function, but IIRC v4 already clears up the obvious ones.  I don't
> know if this is worth worrying about too much, but I can create a new
> version if you think it is important.

There are such cases in FindLockCycleRecurse(), GetLockConflicts(),
GetLockStatusData() and InitProcGlobal(), as far as I can see.

Hmm.  I have been looking at this patch, and the lack of a centralized
solution that could be used for other GUCs worries me, as it does Fujii-san,
even if this approach would prevent an incorrect use of MaxBackends in
contexts where it should not be used because it is not initialized yet.  I
don't think it is a good idea in the long term to apply this as-is.
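
For readers skimming the thread, the accessor pattern being debated looks
roughly like the sketch below.  The flag name and the error wording are
illustrative assumptions, not code taken from the patch:

static int	max_backends = 0;
static bool max_backends_initialized = false;

void
SetMaxBackends(int value)
{
	max_backends = value;
	max_backends_initialized = true;
}

int
GetMaxBackends(void)
{
	/* catch callers (e.g. in _PG_init) that run before initialization */
	if (!max_backends_initialized)
		elog(ERROR, "MaxBackends is not yet initialized");
	return max_backends;
}

The worry expressed above is that this guards exactly one value; any other
GUC with the same "not meaningful until after shared_preload_libraries"
hazard would need its own copy of the pattern.
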
--
Michael


signature.asc
Description: PGP signature


Re: Support for NSS as a libpq TLS backend

2022-01-26 Thread Jacob Champion
On Wed, 2022-01-26 at 15:59 -0800, Andres Freund wrote:
> > > Do we have a testcase for embedded NULLs in common names?
> > 
> > We don't, neither for OpenSSL or NSS.  AFAICR Jacob spent days trying to 
> > get a
> > certificate generation to include an embedded NULL byte but in the end gave 
> > up.
> > We would have to write our own tools for generating certificates to add that
> > (which may or may not be a bad idea, but it hasn't been done).
> 
> Hah, that's interesting.

Yeah, OpenSSL just refused to do it, with any method I could find at
least. My personal test suite is using pyca/cryptography and psycopg2
to cover that case.

--Jacob


Re: Support for NSS as a libpq TLS backend

2022-01-26 Thread Andres Freund
Hi,

On 2022-01-26 21:39:16 +0100, Daniel Gustafsson wrote:
> > What about
> > reconfiguring (note: add --enable-depend) the linux tasks to build against
> > nss, and then run the relevant subset of tests with it?  Most tests don't 
> > use
> > tcp / SSL anyway, so rerunning a small subset of tests should be feasible?
> 
> That's an interesting idea, I think that could work and be reasonably readable
> at the same time (and won't require in-depth knowledge of Cirrus).  As it's 
> the
> same task it does spend more time towards the max runtime per task, but that's
> not a problem for now.  It's worth keeping in mind though if we deem this to 
> be
> a way forward with testing multiple settings.

I think it's a way for a limited number of settings, that each only require a
limited amount of tests... Rerunning all tests etc is a different story.



> > Is it a great idea to have common/nss.h when there's a library header nss.h?
> > Perhaps we should have a pg_ssl_{nss,openssl}.h or such?
> 
> That's a good point, I modelled it after common/openssl.h but I agree it's
> better to differentiate the filenames.  I've renamed it to common/pg_nss.h and
> we should IMO rename common/openssl.h regardless of what happens to this 
> patch.

+1


> > Does this make some things notably more expensive? Presumably it does 
> > remove a
> > bunch of COW opportunities, but likely that's not a huge factor compared to
> > assymetric crypto negotiation...
> 
> Right, in the context of setting up crypto across a network connection it's
> highly likely to drown out the costs.

If you start to need to run a helper to decrypt an encrypted private key, and
do all the initialization, I'm not so sure that holds true anymore... Have
you done any connection speed tests? pgbench -C is helpful for that.


> > Maybe soem of this commentary should migrate to the file header or such?
> 
> Maybe, or perhaps README.ssl?  Not sure where it would be most reasonable to
> keep it such that it's also kept up to date.

Either would work for me.


> >> This introduce
> >> + * differences with the OpenSSL support where some errors are only 
> >> reported
> >> + * at runtime with NSS where they are reported at startup with OpenSSL.
> > 
> > Found this sentence hard to parse somehow.
> > 
> > It seems pretty unfriendly to only have minimal error checking at postmaster
> > startup time. Seems at least the presence and usability of keys should be 
> > done
> > *also* at that time?
> 
> I'll look at adding some setup, and subsequent teardown, of NSS at startup
> during which we could do checking to be more on par with how the OpenSSL
> backend will report errors.

Cool.


> >> +/*
> >> + * raw_subject_common_name
> >> + *
> >> + * Returns the Subject Common Name for the given certificate as a raw char
> >> + * buffer (that is, without any form of escaping for unprintable 
> >> characters or
> >> + * embedded nulls), with the length of the buffer returned in the len 
> >> param.
> >> + * The buffer is allocated in the TopMemoryContext and is given a NULL
> >> + * terminator so that callers are safe to call strlen() on it.
> >> + *
> >> + * This is used instead of CERT_GetCommonName(), which always performs 
> >> quoting
> >> + * and/or escaping. NSS doesn't appear to give us a way to easily 
> >> unescape the
> >> + * result, and we need to store the raw CN into port->peer_cn for 
> >> compatibility
> >> + * with the OpenSSL implementation.
> >> + */
> > 
> > Do we have a testcase for embedded NULLs in common names?
> 
> We don't, neither for OpenSSL or NSS.  AFAICR Jacob spent days trying to get a
> certificate generation to include an embedded NULL byte but in the end gave 
> up.
> We would have to write our own tools for generating certificates to add that
> (which may or may not be a bad idea, but it hasn't been done).

Hah, that's interesting.


> >> +/*
> >> + * pg_SSLShutdownFunc
> >> + *Callback for NSS shutdown
> >> + *
> >> + * If NSS is terminated from the outside when the connection is still in 
> >> use
> > 
> > What does "NSS is terminated from the outside when the connection" really
> > mean? Does this mean the client initiating something?
> 
> If an extension, or other server-loaded code, interfered with NSS and managed
> to close contexts in order to interfere with connections this would ensure us
> closing it down cleanly.
> 
> That being said, I was now unable to get my old testcase working so I've for
> now removed this callback from the patch until I can work out if we can make
> proper use of it.  AFAICS other mature NSS implementations aren't using it
> (OpenLDAP did in the past but have since removed it, will look at how/why).

I think that'd be elog(FATAL) time if we want to do anything (after changing
state so that no data is sent to client).


> >> +  /*
> >> +   * The error cases for PR_Recv are not 
> >> documented, but can be
> >> +   * 

Re: autovacuum prioritization

2022-01-26 Thread Greg Stark
On Wed, 26 Jan 2022 at 18:46, Greg Stark  wrote:
>
> On Thu, 20 Jan 2022 at 14:31, Robert Haas  wrote:
> >
> > In my view, previous efforts in this area have been too simplistic.
> >
>
> One thing I've been wanting to do something about is I think
> autovacuum needs to be a little cleverer about when *not* to vacuum a
> table because it won't do any good.
>
> I've seen a lot of cases where autovacuum kicks off a vacuum of a
> table even though the globalxmin hasn't really advanced significantly
> over the oldest frozen xid. When it's a large table this really hurts
> because it could be hours or days before it finishes and at that point
> there's quite a bit of bloat.


Another case I would like to see autovacuum get clever about is when
there is a wide disparity in the size of tables. If you have a few
large tables and a few small tables there could be enough bandwidth
for everyone but you can get in trouble if the workers are all tied up
vacuuming the large tables.

This is a case where autovacuum scheduling can create a problem where
there shouldn't be one. It often happens when you have a set of large
tables that were all loaded with data around the same time and you
have your busy tables that are well designed small tables receiving
lots of updates. They can happily be getting vacuumed every 15-30min
and finishing promptly maintaining a nice steady state until one day
all the large tables suddenly hit the freeze threshold and suddenly
all your workers are busy vacuuming huge tables that take hours or
days to vacuum and your small tables bloat by orders of magnitude.

I was thinking of dividing the eligible tables up into ntiles based on
size and then making sure one worker was responsible for each ntile.
I'm not sure that would actually be quite right though.
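
For illustration only, one naive way to carve the candidates into size tiers
(the struct, the helper names, and the even split are assumptions, and this
ignores the rebalancing question raised above):

typedef struct CandidateTable
{
	Oid			relid;
	BlockNumber relpages;
} CandidateTable;

static int
cmp_candidate_size(const void *a, const void *b)
{
	const CandidateTable *ta = (const CandidateTable *) a;
	const CandidateTable *tb = (const CandidateTable *) b;

	if (ta->relpages < tb->relpages)
		return -1;
	if (ta->relpages > tb->relpages)
		return 1;
	return 0;
}

/* sort by size, then give each of nworkers workers its own contiguous tier */
static void
assign_tiers(CandidateTable *tables, int ntables, int nworkers, int *tier)
{
	qsort(tables, ntables, sizeof(CandidateTable), cmp_candidate_size);

	for (int i = 0; i < ntables; i++)
		tier[i] = (int) (((int64) i * nworkers) / ntables);
}

Whether a strict one-worker-per-tier rule is right is exactly the open
question: a tier can be empty or overloaded, so some fallback would be needed.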


-- 
greg




Re: autovacuum prioritization

2022-01-26 Thread Peter Geoghegan
On Wed, Jan 26, 2022 at 3:46 PM Greg Stark  wrote:
> One thing I've been wanting to do something about is I think
> autovacuum needs to be a little cleverer about when *not* to vacuum a
> table because it won't do any good.

There was a thread about this exact thing not too long ago:

https://postgr.es/m/CAH2-Wzmx6+PrfpmmFw8JZbxD+kkwhQWPOhE5RUBy6S4_Jwty=q...@mail.gmail.com

If everything goes according to plan, then Postgres 15 will have my
work on freezing and dynamically advancing relfrozenxid. Meaning that
you'll be able to see (in autovacuum log output and in VACUUM VERBOSE
output) how much relfrozenxid has been advanced by, if at all. You'll
also directly see how far behind the VACUUM operation's OldestXmin
that is (and how far behind the OldestXmin is at the end of the VACUUM
operation).

It seems as if this offers you exactly what you need. You'll be able
to notice the inherent futility of an anti-wraparound VACUUM that runs
against a table whose relfrozenxid is already exactly equal to the
VACUUM's OldestXmin (say because of a leaked replication slot --
anything that makes vacuuming fundamentally unable to advance
relfrozenxid, really).

-- 
Peter Geoghegan




Re: autovacuum prioritization

2022-01-26 Thread Greg Stark
On Thu, 20 Jan 2022 at 14:31, Robert Haas  wrote:
>
> In my view, previous efforts in this area have been too simplistic.
>

One thing I've been wanting to do something about is I think
autovacuum needs to be a little cleverer about when *not* to vacuum a
table because it won't do any good.

I've seen a lot of cases where autovacuum kicks off a vacuum of a
table even though the globalxmin hasn't really advanced significantly
over the oldest frozen xid. When it's a large table this really hurts
because it could be hours or days before it finishes and at that point
there's quite a bit of bloat.

This isn't a common occurrence; it happens when the system is broken in
some way, either because there's an idle-in-transaction session or because
something else is keeping the global xmin held back.

What it does though is make things *much* worse and *much* harder for
a non-expert to hit on the right remediation. It's easy enough to tell
them to look for these idle-in-transaction sessions or set timeouts.
It's much harder to determine whether it's a good idea for them to go
and kill the vacuum that's been running for days. And it's not a great
thing for people to be getting in the habit of doing either.

I want to be able to stop telling people to kill vacuums kicked off by
autovacuum. I feel like it's a bad thing for someone to ever have to
do and I know some fraction of the time I'm telling them to do it
it'll have been a terrible thing to have done (but we'll never know
which times those were). Determining whether a running vacuum is
actually doing any good is pretty hard and on older versions probably
impossible.

I was thinking of just putting a check in before kicking off a vacuum
and if the globalxmin is a significant fraction of the distance to the
relfrozenxid then instead log a warning. Basically it means "we can't
keep the bloat below the threshold due to the idle transactions et al,
not because there's insufficient i/o bandwidth".
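
A rough sketch of that pre-check, with illustrative names and an arbitrary
20% threshold (neither is a proposal, just to make the shape concrete):

static bool
vacuum_would_be_futile(TransactionId relfrozenxid,
					   TransactionId global_xmin,
					   TransactionId next_xid)
{
	int32		freezable = (int32) (global_xmin - relfrozenxid);
	int32		total = (int32) (next_xid - relfrozenxid);

	/* if only a small slice of the XID range could actually be frozen, punt */
	if (total > 0 && freezable < total / 5)
	{
		ereport(WARNING,
				(errmsg("skipping vacuum: global xmin %u is held back near relfrozenxid %u",
						global_xmin, relfrozenxid)));
		return true;
	}

	return false;
}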

At the same time it would be nice if autovacuum could recognize when
the i/o bandwidth is insufficient. If it finishes a vacuum it could
recheck whether the table is eligible for vacuuming and log that it's
unable to keep up with the vacuuming requirements -- but right now
that would be a lie much of the time when it's not a lack of bandwidth
preventing it from keeping up.


-- 
greg




Re: snapper and skink and fairywren (oh my!)

2022-01-26 Thread Andrew Dunstan


On 1/26/22 16:17, Robert Haas wrote:
> 3. fairywren failed the last run in module-commit_tsCheck. It's unhappy 
> because:
>
[16:30:02] t/002_standby.pl  ok    13354 ms ( 0.06 usr  0.00 sys +
>  1.11 cusr  3.20 csys =  4.37 CPU)
> # poll_query_until timed out executing this query:
> # SELECT '0/303C7D0'::pg_lsn <= pg_last_wal_replay_lsn()
> # expecting this output:
> # t
> # last actual query output:
> # f
>
> I don't know what is causing any of these failures, and I don't know
> if there's already some discussion elsewhere that I've missed, but
> maybe this email will be helpful to someone. I also noticed that 2 out
> of the 3 failures report 2dbb7b9b2279d064f66ce9008869fd0e2b794534 "Fix
> pg_hba_file_rules for authentication method cert" as the only new
> commit since the last run, and it's hardly believable that that commit
> would have broken this. Nor do I see any other recent changes that
> look like likely culprits. Apologies in advance if any of this is my
> fault.
>

Intermittent failures give a false positive against the latest set of
commits. These failures started happening regularly about 49 days ago.
So we need to look at what exactly happened around then. See


It's very unlikely any of this is your fault. In any case, intermittent
failures are very hard to nail down.

I should add that both drongo and fairywren run on a single
AWS/EC2/Windows2019 instance. I haven't updated the OS in a while, so
I'm doing that to see if matters improve. FWIW, I just reproduced the
error on a similar instance where I'm testing some stuff for Andres'
project to make TAP tests more portable.


cheers


andrew


--
Andrew Dunstan
EDB: https://www.enterprisedb.com





Re: Bug in ProcArrayApplyRecoveryInfo for snapshots crossing 4B, breaking replicas

2022-01-26 Thread Michael Paquier
On Wed, Jan 26, 2022 at 07:31:00PM +0100, Tomas Vondra wrote:
> I actually tried doing that, but I was not very happy with the result. The
> test has to call pg_resetwal, but then it also has to fake pg_xact data and
> so on, which seemed a bit ugly so I did not include the test in the patch.

Indeed, the dependency to /dev/zero is not good either.  The patch
logic looks good to me.
--
Michael


signature.asc
Description: PGP signature


Re: row filtering for logical replication

2022-01-26 Thread Greg Nancarrow
On Wed, Jan 26, 2022 at 2:08 PM houzj.f...@fujitsu.com
 wrote:
>
> There was a miss in the posted patch which didn't initialize the parameter in
> RelationBuildPublicationDesc, sorry for that. Attach the correct patch this 
> time.
>

A few comments for the v71-0001 patch:

doc/src/sgml/catalogs.sgml

(1)

+
+ 
+  
+  prqual pg_node_tree
+  
+  Expression tree (in nodeToString()
+  representation) for the relation's qualifying condition. Null if
+  there is no qualifying condition.
+ 

"qualifying condition" sounds a bit vague here.
Wouldn't it be better to say "publication qualifying condition"?


src/backend/commands/publicationcmds.c

(2) check_simple_rowfilter_expr_walker

In the function header:
(i) "etc" should be "etc."
(ii)
Is

+ * - (Var Op Const) Bool (Var Op Const)

   meant to be:

+ * - (Var Op Const) Logical-Op (Var Op Const)

?

It's not clear what "Bool" means here.

(3) check_simple_rowfilter_expr_walker
We should say "Built-in functions" instead of "System-functions":

+   * User-defined functions are not allowed. System-functions that are

Regards,
Greg Nancarrow
Fujitsu Australia




Re: row filtering for logical replication

2022-01-26 Thread Peter Smith
On Tue, Jan 25, 2022 at 2:18 PM houzj.f...@fujitsu.com
 wrote:
>
> On Monday, January 24, 2022 4:38 PM Peter Smith 
> >
...
> > 5. src/include/catalog/pg_publication.h - typedef struct PublicationDesc
> >
> > +typedef struct PublicationDesc
> > +{
> > + /*
> > + * true if the columns referenced in row filters which are used for
> > +UPDATE
> > + * or DELETE are part of the replica identity, or the publication
> > +actions
> > + * do not include UPDATE or DELETE.
> > + */
> > + bool rf_valid_for_update;
> > + bool rf_valid_for_delete;
> > +
> > + AttrNumber invalid_rfcol_update;
> > + AttrNumber invalid_rfcol_delete;
> > +
> > + PublicationActions pubactions;
> > +} PublicationDesc;
> > +
> >
> > I did not see any point really for the pairs of booleans and AttNumbers.
> > AFAIK both of them shared exactly the same validation logic so I think you 
> > can
> > get by using fewer members here.
>
> the pairs of booleans are intended to fix the problem[2] reported earlier.
> [2] 
> https://www.postgresql.org/message-id/OS0PR01MB611367BB85115707CDB2F40CFB5A9%40OS0PR01MB6113.jpnprd01.prod.outlook.com
> >

OK. Thanks for the info.

--
Kind Regards,
Peter Smith.
Fujitsu Australia




Re: autovacuum prioritization

2022-01-26 Thread Peter Geoghegan
On Wed, Jan 26, 2022 at 10:55 AM Robert Haas  wrote:
> On Tue, Jan 25, 2022 at 3:32 PM Peter Geoghegan  wrote:
> > For example, a
> > page that has 5 dead heap-only tuples is vastly different to a similar
> > page that has 5 LP_DEAD items instead -- and yet our current approach
> > makes no distinction. Chances are very high that if the only dead
> > tuples are heap-only tuples, then things are going just fine on that
> > page -- opportunistic pruning is actually keeping up.
>
> Hmm, I think that's a really good insight. Perhaps we ought to forget
> about counting updates and deletes and instead count dead line
> pointers. Or maybe we can estimate the number of dead line pointers by
> knowing how many updates and deletes there were, as long as we can
> distinguish hot updates from non-HOT updates, which I think we can.

All that we have to go on is a bunch of observations in any case,
though -- the map is not the territory. And so it seems to me that the
sensible thing to do is just embrace that we won't ever really exactly
know what's going on in a given database, at any given time.
Fortunately, we don't really have to know. We should be able to get
away with only having roughly the right idea, by focussing on the few
things that we are sure of -- things like the difference between
LP_DEAD items and dead heap-only tuples, which are invariant to
workload characteristics.

I recently said (on the ANALYZE related thread) that we should be
thinking probabilistically here [1]. Our understanding of the amount
of bloat could very usefully be framed that way. Maybe the model we
use is a probability density function (maybe not formally, not sure).
A PDF has an exact expectation, which for us might be the most
probable number of dead tuples in total in a given table right now
(let's just assume it's still dead tuples, ignoring the problems with
that metric for now).

This is a useful basis for making better decisions by weighing
competing considerations -- which might themselves be another PDF.
Example: For a given table that is approaching the point where the
model says "time to VACUUM", we may very well spend hours, days, or
even weeks approaching the crossover point. The exact expectation
isn't truly special here -- there is actually zero practical reason to
have special reverence for that precise point (with a good model,
within certain reasonable bounds). If our model says that there is
only a noise-level difference between doing a VACUUM on a given table
today, tomorrow, or next week, why not take advantage? For example,
why not do the VACUUM when the system appears to not be busy at all
(typically in the dead of night), just because it'll definitely be
both cheaper in absolute terms (FPIs can be avoided by spreading
things out over multiple checkpoints), and less disruptive?

There are many opportunities like that, I believe. It's hard for me to
suppress the urge to blurt out 17 more ideas like that. What are the
chances that you won't have at least a few real winners among all of
the ideas that everybody will come up with, in the end?

> > if successive ANALYZE operations notice
> > a consistent pattern where pages that had a non-zero number of LP_DEAD
> > items last time now have a significantly higher number, then it's a
> > good idea to err in the direction of more aggressive vacuuming.
> > *Growing* concentrations of LP_DEAD items signal chaos. I think that
> > placing a particular emphasis on pages with non-zero LP_DEAD items as
> > a qualitatively distinct category of page might well make sense --
> > relatively few blocks with a growing number of LP_DEAD items seems
> > like it should be enough to make autovacuum run aggressively.
>
> I think measuring the change over time here might be fraught with
> peril.

I'd say that that depends on how you define the problem we're trying
to solve. If you define the problem as coming up with a significantly
improved statistical model that determines (say) how many dead tuples
there are in the table right now, given a set of observations made by
ANALYZE in the past, then yes, it's fraught with peril. But why would
you define it that way? It seems far easier to improve things by
putting model error and *actual* exposure to real known issues (e.g.
line pointer bloat) front and center.

It doesn't necessarily matter if we're *usually* wrong with a good
model. But with a bad model we may need to consistently get the
correct answer. And so the model that is the most accurate
quantitatively is probably *not* the best available model, all things
considered. Most of the time we shouldn't VACUUM right this second,
and so a model that consists of "return false" is very frequently
correct. But that doesn't mean it's a good model. You get the idea.

> If vacuum makes a single pass over the indexes, it can retire
> as many dead line pointers as we have, or as will fit in memory, and
> the effort doesn't really depend too much on exactly how many dead
> line pointers we're 

snapper and skink and fairywren (oh my!)

2022-01-26 Thread Robert Haas
I was thinking about whether it made sense to try to commit anything
and decided it would be a good idea to check how the buildfarm looks
first. It doesn't look great.

1. snapper failed 4 out of the last 5 runs in recoveryCheck. The
latest run as of this writing shows this:

[19:09:50] t/026_overwrite_contrecord.pl  ok    43136 ms
# poll_query_until timed out executing this query:
# SELECT '0/1415E310' <= replay_lsn AND state = 'streaming' FROM
pg_catalog.pg_stat_replication WHERE application_name = 'standby_1';
# expecting this output:
# t
# last actual query output:
#

2. skink failed the last run in this MiscCheck phase. I had no idea
what this phase was, because the name isn't very descriptive. It seems
that it runs some stuff in contrib and some stuff in src/test/modules,
which seems a bit confusing. Anyway, the failure here is:

test oldest_xmin  ... FAILED 5533 ms

To find the failure, the regression test suite suggests looking at
file contrib/test_decoding/output_iso/regression.diffs or
regression.out. But neither file is in the buildfarm results so far as
I can see. It does have the postmaster log but I can't tell what's
gone wrong from looking at that. In fact I'm not really sure it's the
correct log file, because oldest_xmin.spec uses a slot called
"isolation_slot" and test_decoding/log/postmaster.log refers only to
"regression_slot", so it seems like this postmaster.log file might
cover only the pg_regress tests and not the results from the isolation
tester.

3. fairywren failed the last run in module-commit_tsCheck. It's unhappy because:

[16:30:02] t/002_standby.pl  ok    13354 ms ( 0.06 usr  0.00 sys +
 1.11 cusr  3.20 csys =  4.37 CPU)
# poll_query_until timed out executing this query:
# SELECT '0/303C7D0'::pg_lsn <= pg_last_wal_replay_lsn()
# expecting this output:
# t
# last actual query output:
# f

I don't know what is causing any of these failures, and I don't know
if there's already some discussion elsewhere that I've missed, but
maybe this email will be helpful to someone. I also noticed that 2 out
of the 3 failures report 2dbb7b9b2279d064f66ce9008869fd0e2b794534 "Fix
pg_hba_file_rules for authentication method cert" as the only new
commit since the last run, and it's hardly believable that that commit
would have broken this. Nor do I see any other recent changes that
look like likely culprits. Apologies in advance if any of this is my
fault.

-- 
Robert Haas
EDB: http://www.enterprisedb.com




Re: refactoring basebackup.c

2022-01-26 Thread Robert Haas
On Tue, Jan 25, 2022 at 11:22 AM tushar  wrote:
> A)Getting syntax error if -z is used along with -t
>
> [edb@centos7tushar bin]$ ./pg_basebackup -t server:/tmp/data902 -z -Xfetch
> pg_basebackup: error: could not initiate base backup: ERROR:  syntax error

Oops. The attached patch should fix this.

> B)No information of "client-gzip" or "server-gzip" added under
> "--compress" option/method of ./pg_basebackup --help.

Already fixed by e1f860f13459e186479319aa9f65ef184277805f.

> C) -R option is silently ignoring

The attached patch should fix this, too.

Thanks for finding these issues.

-- 
Robert Haas
EDB: http://www.enterprisedb.com


fix-mistakes-found-by-tushar.patch
Description: Binary data


Re: Refactoring of compression options in pg_basebackup

2022-01-26 Thread Robert Haas
On Tue, Jan 25, 2022 at 8:15 PM Michael Paquier  wrote:
> Sure, but I don't think that it is a good idea to unify that yet, at
> least not until pg_dump is able to handle LZ4 as an option, as the
> main benefit that we'd gain here is to be able to change the code to a
> switch/case without defaults where we would detect code paths that
> require a refresh once adding support for a new option.

I think those places could just throw a "lz4 compression is not
supported" elog() and then you could just grep for everyplace where
that string appears. But I am not of a mind to fight about it. I was
just pointing out the duplication.

-- 
Robert Haas
EDB: http://www.enterprisedb.com




Re: autovacuum prioritization

2022-01-26 Thread Robert Haas
On Tue, Jan 25, 2022 at 3:34 PM John Naylor
 wrote:
> I was thinking along the same lines as Dilip: If the anti-wraparound
> risk is really far in the future, there might not be much eligible
> freezing work to do. Dead tuples can be removed as soon as visibility
> rules allow it. With a separate bloat queue, there might always be
> some work to do.

Isn't the same thing true of bloat, though? If the XID threshold
hasn't advanced that much, then there may be nothing that's worth
doing about XID wraparound in the short term. If there aren't many
dead tuples in any of the tables, then there may be nothing that's
worth doing about bloat. Then we should just do nothing. On the other
hand, we may have a relatively urgent problem in one of those areas
but not the other. Then we should work on that one. Or we may have
problems in both areas, and then we need to somehow decide which one
is more urgent -- that's the situation in which I feel like we need to
unify the prioritization or ordering in some way.

It is an interesting point that we could have low priority work with a
low cost limit and high priority work with a higher cost limit, or as
I think Peter suggested, just use a single process for low-priority
stuff but allow multiple processes when there's high-priority stuff.

-- 
Robert Haas
EDB: http://www.enterprisedb.com




Re: autovacuum prioritization

2022-01-26 Thread Robert Haas
On Tue, Jan 25, 2022 at 3:32 PM Peter Geoghegan  wrote:
> For example, a
> page that has 5 dead heap-only tuples is vastly different to a similar
> page that has 5 LP_DEAD items instead -- and yet our current approach
> makes no distinction. Chances are very high that if the only dead
> tuples are heap-only tuples, then things are going just fine on that
> page -- opportunistic pruning is actually keeping up.

Hmm, I think that's a really good insight. Perhaps we ought to forget
about counting updates and deletes and instead count dead line
pointers. Or maybe we can estimate the number of dead line pointers by
knowing how many updates and deletes there were, as long as we can
distinguish hot updates from non-HOT updates, which I think we can.
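
As a back-of-the-envelope version of that estimate (the helper is
hypothetical; the inputs correspond to per-table counters the statistics
collector already keeps, such as n_tup_del, n_tup_upd and n_tup_hot_upd):

static int64
estimate_dead_line_pointers(int64 n_tup_del, int64 n_tup_upd, int64 n_tup_hot_upd)
{
	/*
	 * Deletes and non-HOT updates leave behind line pointers that only an
	 * index-vacuum pass can retire; HOT updates can be cleaned up by
	 * opportunistic pruning without touching the indexes.
	 */
	return n_tup_del + (n_tup_upd - n_tup_hot_upd);
}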

> if successive ANALYZE operations notice
> a consistent pattern where pages that had a non-zero number of LP_DEAD
> items last time now have a significantly higher number, then it's a
> good idea to err in the direction of more aggressive vacuuming.
> *Growing* concentrations of LP_DEAD items signal chaos. I think that
> placing a particular emphasis on pages with non-zero LP_DEAD items as
> a qualitatively distinct category of page might well make sense --
> relatively few blocks with a growing number of LP_DEAD items seems
> like it should be enough to make autovacuum run aggressively.

I think measuring the change over time here might be fraught with
peril. If vacuum makes a single pass over the indexes, it can retire
as many dead line pointers as we have, or as will fit in memory, and
the effort doesn't really depend too much on exactly how many dead
line pointers we're trying to find. (I hear that it does depend more
than you'd think ... but I still don't think that should be the
dominant consideration here.) So to be efficient, we want to do that
pass over the indexes when we have a suitably large batch of dead line
pointers. I don't think it really depends on how long it took the
batch to get to that size. I don't want to vacuum a terabyte of
indexes with a much-smaller-than-normal batch of dead TIDs just
because the number of dead TIDs seems to be increasing quickly at the
moment: it's hard to imagine that the results will be worth the
resources I'll have to expend to get there. On the other hand I also
don't think I want to postpone vacuuming the indexes because the
number is really big but not growing that fast.

I feel like my threshold for the number of dead TIDs that ought to
trigger a vacuum grows as the table gets bigger, capped by how much
memory I've got. But I don't feel like the rate at which it's changing
necessarily matters. Like if I create a million dead line pointers
really quickly, wait a month, and then create another million dead
line pointers, I feel like I want the system to respond just as
aggressively as if the month-long delay were omitted.
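
That intuition could be sketched as a threshold that scales with the table
but is capped by memory; the 10% factor and the use of maintenance_work_mem
as the cap are assumptions for illustration, not a proposal:

static int64
dead_tid_trigger_threshold(BlockNumber heap_pages, int maintenance_work_mem_kb)
{
	/* grow with the table: some fraction of the tuples it could hold */
	int64		by_size = (int64) heap_pages * MaxHeapTuplesPerPage / 10;

	/* but never more dead TIDs than one index pass can hold in memory */
	int64		by_memory = (int64) maintenance_work_mem_kb * 1024 / sizeof(ItemPointerData);

	return Min(by_size, by_memory);
}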

Maybe my feelings are wrong here. I'm just saying that, to me, it
doesn't feel like the rate of change is all that relevant.

> Even busy production DBs should usually only be vacuuming one large
> table at a time. Also might make sense to strategically align the work
> with the beginning of a new checkpoint.

I'm not sure that either of those statements are correct. But on the
other hand, I am also not sure that either of those statements are
incorrect.

--
Robert Haas
EDB: http://www.enterprisedb.com




Re: Bug in ProcArrayApplyRecoveryInfo for snapshots crossing 4B, breaking replicas

2022-01-26 Thread Tomas Vondra



On 1/25/22 04:25, Michael Paquier wrote:

On Mon, Jan 24, 2022 at 10:45:48PM +0100, Tomas Vondra wrote:

On 1/24/22 22:28, Bossart, Nathan wrote:

Attached patch is fixing this by just sorting the XIDs logically. The
xidComparator is meant for places that can't do logical ordering. But
these XIDs come from RUNNING_XACTS, so they actually come from the same
wraparound epoch (so sorting logically seems perfectly fine).


The patch looks reasonable to me.


Thanks!


Could it be possible to add a TAP test?  One idea would be to rely on
pg_resetwal -x and -e close to the 4B limit to set up a node before
stressing the scenario of this bug, so that would be rather cheap.


I actually tried doing that, but I was not very happy with the result. 
The test has to call pg_resetwal, but then it also has to fake pg_xact 
data and so on, which seemed a bit ugly so I did not include the test in 
the patch.


But maybe there's a better way to do this, so here it is. I've kept it 
separately, so that it's possible to apply it without the fix, to verify 
it actually triggers the issue.



regards

--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL CompanyFrom efb0e79d752102dd0fa066959cea517288cc6085 Mon Sep 17 00:00:00 2001
From: Tomas Vondra 
Date: Wed, 26 Jan 2022 19:24:21 +0100
Subject: [PATCH 1/2] Add TAP test

---
 src/test/recovery/t/028_wraparound_restart.pl | 172 ++
 1 file changed, 172 insertions(+)
 create mode 100644 src/test/recovery/t/028_wraparound_restart.pl

diff --git a/src/test/recovery/t/028_wraparound_restart.pl b/src/test/recovery/t/028_wraparound_restart.pl
new file mode 100644
index 000..2be5537b6b3
--- /dev/null
+++ b/src/test/recovery/t/028_wraparound_restart.pl
@@ -0,0 +1,172 @@
+# Run the standard regression tests with streaming replication
+use strict;
+use warnings;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More tests => 36;
+use File::Basename;
+
+# Initialize primary node
+my $node_primary = PostgreSQL::Test::Cluster->new('primary');
+$node_primary->init(allows_streaming => 1);
+$node_primary->adjust_conf('postgresql.conf', 'max_connections', '25', 1);
+$node_primary->append_conf('postgresql.conf', 'max_prepared_transactions = 10');
+
+# needed so that vacuumdb freezes the template too, and we don't get warnings
+# about possible data loss due to wraparound
+$node_primary->start;
+$node_primary->psql(
+'postgres',
+qq[UPDATE pg_database SET datallowconn = 't']);
+$node_primary->stop;
+
+# reset WAL close to max XID (4294967295), in 4 smaller steps (~1B each)
+command_checks_all(
	[ 'pg_resetwal', '-x', 1000000000, $node_primary->data_dir ],
+	0,
+	[qr''],
+	[qr''],
+	'reset WAL to 1B XID');
+
+command_checks_all(
+	[ 'dd', 'if=/dev/zero', 'of=' . $node_primary->data_dir . '/pg_xact/03B9', 'bs=1K', 'count=256'],
+	0,
+	[qr''],
+	[qr''],
+	'reset WAL to 1B XID');
+
+$node_primary->start;
+$node_primary->issues_sql_like(
+	[ 'vacuumdb', '--all', '--freeze', '--echo' ],
+	qr/statement: VACUUM.*statement: VACUUM/s,
+	'vacuum all databases');
+$node_primary->stop;
+
+command_checks_all(
	[ 'pg_resetwal', '-x', 2000000000, $node_primary->data_dir ],
+	0,
+	[qr''],
+	[qr''],
+	'reset WAL to 2B XID');
+
+command_checks_all(
+	[ 'dd', 'if=/dev/zero', 'of=' . $node_primary->data_dir . '/pg_xact/0773', 'bs=1K', 'count=256'],
+	0,
+	[qr''],
+	[qr''],
+	'reset WAL to 2B XID');
+
+$node_primary->start;
+$node_primary->issues_sql_like(
+	[ 'vacuumdb', '--all', '--freeze', '--echo' ],
+	qr/statement: VACUUM.*statement: VACUUM/s,
+	'vacuum all databases');
+$node_primary->stop;
+
+command_checks_all(
	[ 'pg_resetwal', '-x', 3000000000, $node_primary->data_dir ],
+	0,
+	[qr''],
+	[qr''],
+	'reset WAL to 3B XID');
+
+command_checks_all(
+	[ 'dd', 'if=/dev/zero', 'of=' . $node_primary->data_dir . '/pg_xact/0B2D', 'bs=1K', 'count=256'],
+	0,
+	[qr''],
+	[qr''],
+	'reset WAL to 3B XID');
+
+$node_primary->start;
+
+is( $node_primary->psql(
+'postgres',
+qq[SELECT datname
+, age(datfrozenxid)
+, current_setting('autovacuum_freeze_max_age') 
+FROM pg_database 
+ORDER BY 2 DESC]),
+0,
+'physical slot created on primary');
+
+$node_primary->issues_sql_like(
+	[ 'vacuumdb', '--all', '--freeze', '--echo' ],
+	qr/statement: VACUUM.*statement: VACUUM/s,
+	'vacuum all databases');
+$node_primary->stop;
+
+# the max is 4294967295, so this is within 100 transactions of wraparound
+command_checks_all(
+	[ 'pg_resetwal', '-x', 4294967200, $node_primary->data_dir ],
+	0,
+	[qr''],
+	[qr''],
+	'reset WAL to 4B XID');
+
+command_checks_all(
+	[ 'dd', 'if=/dev/zero', 'of=' . $node_primary->data_dir . '/pg_xact/0FFF', 'bs=1K', 'count=256'],
+	0,
+	[qr''],
+	[qr''],
+	'reset WAL to 4B XID');
+
+$node_primary->start;
+$node_primary->issues_sql_like(
+	[ 'vacuumdb', '--all', '--freeze', '--echo' ],
+	qr/statement: VACUUM.*statement: VACUUM/s,
+	'vacuum all databases');
+
+
+# now, start 

Re: Support for NSS as a libpq TLS backend

2022-01-26 Thread Jacob Champion
On Tue, 2022-01-25 at 22:26 +, Jacob Champion wrote:
> On Wed, 2022-01-19 at 10:01 +0100, Daniel Gustafsson wrote:
> > > On 18 Jan 2022, at 17:37, Jacob Champion  wrote:
> > > 
> > > On Wed, 2021-12-15 at 23:10 +0100, Daniel Gustafsson wrote:
> > > > I've attached a v50 which fixes the issues found by Joshua upthread, as 
> > > > well as
> > > > rebases on top of all the recent SSL and pgcrypto changes.
> > > 
> > > I'm currently tracking down a slot leak. When opening and closing large
> > > numbers of NSS databases, at some point we appear to run out of slots
> > > and then NSS starts misbehaving, even though we've closed all of our
> > > context handles.
> > 
> > Interesting, are you able to share a reproducer for this so I can assist in
> > debugging it?
> 
> (This was in my spam folder, sorry for the delay...) Let me see if I
> can minimize my current reproduction case and get it ported out of
> Python.

Here's my attempt at a Bash port. It has races but reliably reproduces
on my machine after 98 connections (there's a hardcoded slot limit of
100, so that makes sense when factoring in the internal NSS slots).

--Jacob


nss-repro.tar.gz
Description: nss-repro.tar.gz


Re: BRIN summarization vs. WAL logging

2022-01-26 Thread Alvaro Herrera
On 2022-Jan-26, Robert Haas wrote:

> On Tue, Jan 25, 2022 at 10:12 PM Tomas Vondra
>  wrote:

> > 2) brin_summarize_range()
> >
> > Now, the issue I think is more serious, more likely to happen, and
> > harder to fix. When summarizing a range, we write two WAL records:
> >
> > INSERT heapBlk 2 pagesPerRange 2 offnum 2, blkref #0: rel 1663/63 ...
> > SAMEPAGE_UPDATE offnum 2, blkref #0: rel 1663/63341/73957 blk 2
> >
> > So, what happens if we lost the second WAL record, e.g. due to a crash?
> 
> Ouch. As you say, XLogFlush() won't fix that. The WAL logging scheme
> needs to be redesigned.

I'm not sure what a good fix is.  I was thinking that maybe if a
placeholder tuple is found during index scan, and the corresponding
process is no longer running, then the index scanner would remove the
placeholder tuple, making the range unsummarized again.  However, how
would we know that the process is gone?

Another idea is to use WAL's rm_cleanup: during replay, remember that a
placeholder tuple was seen, then remove the info if we see an update
from the originating process that replaces the placeholder tuple with a
real one; at cleanup time, if the list of remembered placeholder tuples
is nonempty, remove them.

(I vaguely recall we used the WAL rm_cleanup mechanism for something
like this, but we no longer do AFAICS.)

... Oh, but if there is a promotion involved, we may end up with a
placeholder insertion before the promotion and the update afterwards.
That would probably not be handled well.


One thing not completely clear to me is whether this only affects
placeholder tuples.  Is it possible to have this problem with regular
BRIN tuples?  I think it isn't.

-- 
Álvaro Herrera  Valdivia, Chile  —  https://www.EnterpriseDB.com/




Re: slowest tap tests - split or accelerate?

2022-01-26 Thread Andres Freund
Hi,

On 2022-01-19 18:18:59 -0800, Andres Freund wrote:
> >  robocopy /e /NFL /NDL tmp_install\initdb_template t\
> >  575ms
> > 
> > So I guess we could use robocopy? That's shipped as part of windows 
> > starting in
> > windows 10... xcopy has been there for longer, so I might just default to 
> > that.
> 
> It's part of the OS back to at least windows 2016. I've found some random
> links on the webs saying that it's included "This command is available in
> Vista and Windows 7 by default. For Windows XP and Server 2003 this tool can
> be downloaded as part of Server 2003 Windows Resource Kit tools. ".
> 
> Given that our oldest supported msvc version only runs on Windows 7 upwards
> [2], I think we should be good?

One thing I'm not sure about is where to perform the creation of the
"template" for the msvc scripts. The prototype upthread created it
unconditionally in Install.pm, but that's clearly not right.

The buildfarm currently creates the temporary installation using a generic
perl install.pl "$installdir" and then uses NO_TEMP_INSTALL.

I don't really have a better idea than to introduce a dedicated vcregress.pl
command to create the temporary installation? :(

Greetings,

Andres Freund




Re: BRIN summarization vs. WAL logging

2022-01-26 Thread Robert Haas
On Tue, Jan 25, 2022 at 10:12 PM Tomas Vondra
 wrote:
> 1) brin_desummarize_range()
>
> But if the cluster/VM/... crashes right after you ran the function (and
> it completed just fine, possibly even in an explicit transaciton), that
> change will get lost. Not really a serious data corruption/loss, and you
> can simply run it again, but IMHO rather surprising.

I don't have a big problem with inserting an XLogFlush() here, but I
also don't think there's a hard and fast rule that maintenance
operations have to have full transactional behavior. Because that's
violated in various different ways by various different DDL commands.
CREATE INDEX CONCURRENTLY can leave detritus around if it fails. At
least one variant of ALTER TABLE does a non-MVCC-aware rewrite of the
table contents. COPY FREEZE violates MVCC. Concurrent partition attach
and detach aren't fully serializable with concurrent transactions.
Years ago TRUNCATE didn't have MVCC semantics. We often make small
compromises in these areas for implementation simplicity or other
benefits.

> 2) brin_summarize_range()
>
> Now, the issue I think is more serious, more likely to happen, and
> harder to fix. When summarizing a range, we write two WAL records:
>
> INSERT heapBlk 2 pagesPerRange 2 offnum 2, blkref #0: rel 1663/63 ...
> SAMEPAGE_UPDATE offnum 2, blkref #0: rel 1663/63341/73957 blk 2
>
> So, what happens if we lost the second WAL record, e.g. due to a crash?

Ouch. As you say, XLogFlush() won't fix that. The WAL logging scheme
needs to be redesigned.

-- 
Robert Haas
EDB: http://www.enterprisedb.com




Re: refactoring basebackup.c

2022-01-26 Thread Robert Haas
On Tue, Jan 25, 2022 at 8:23 PM Michael Paquier  wrote:
> On Tue, Jan 25, 2022 at 03:54:52AM +, Shinoda, Noriyoshi (PN Japan FSIP) 
> wrote:
> > Michael, I am proposing to that we remove this message as part of
> > this commit:
> >
> > -pg_log_info("no value specified for compression
> > level, switching to default");
> >
> > I think most people won't want to specify a compression level, so
> > emitting a message when they don't seems too verbose.
>
> (Just noticed this message as I am not in CC.)
> Removing this message is fine by me, thanks!

Oh, I thought I'd CC'd you. I know I meant to do so.

-- 
Robert Haas
EDB: http://www.enterprisedb.com




Re: pg14 psql broke \d datname.nspname.relname

2022-01-26 Thread Mark Dilger


> On Jan 17, 2022, at 1:54 PM, Robert Haas  wrote:
> 
> + * dotcnt: how many separators were parsed from the pattern, by reference.
> + * Can be NULL.
> 
> But then:
> 
> +Assert(dotcnt != NULL);

Removed the "Can be NULL" part, as that use case doesn't make sense.  The 
caller should always care whether the number of dots was greater than they are 
prepared to handle.

> On a related note, it's unclear why you've added three new arguments
> to processSQLNamePattern() but only one of them gets a mention in the
> function header comment.

Updated the header comments to include all parameters.

> It's also pretty clear that the behavior of patternToSQLRegex() is
> changing, but the function header comments are not.

Updated the header comments for this, too.

Also, rebased as necessary:



v5-0001-Reject-patterns-with-too-many-parts-or-wrong-db.patch
Description: Binary data


—
Mark Dilger
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company





Re: Non-decimal integer literals

2022-01-26 Thread Andrew Dunstan


On 1/25/22 13:43, Alvaro Herrera wrote:
> On 2022-Jan-24, Peter Eisentraut wrote:
>
>> +decinteger  {decdigit}(_?{decdigit})*
>> +hexinteger  0[xX](_?{hexdigit})+
>> +octinteger  0[oO](_?{octdigit})+
>> +bininteger  0[bB](_?{bindigit})+
> I think there should be test cases for literals that these seemingly
> strange expressions reject, which are a number with trailing _ (0x123_),
> and one with consecutive underscores __ (0x12__34).
>
> I like the idea of these literals.  I would have found them useful on
> many occassions.


+1. I can't remember the number of times I have miscounted a long string
of digits in a literal.


cheers


andrew


--
Andrew Dunstan
EDB: https://www.enterprisedb.com





Re: Non-decimal integer literals

2022-01-26 Thread Peter Eisentraut

On 26.01.22 01:02, Tom Lane wrote:

Robert Haas  writes:

On Tue, Jan 25, 2022 at 5:34 AM Peter Eisentraut
 wrote:

Which part exactly?  There are several different changes proposed here.



I was just going based on the description of the feature in your
original post. If someone is hoping that int4in() will accept only
^\d+$ then they will be disappointed by this patch.


Maybe I misunderstood, but I thought this was about what you could
write as a SQL literal, not about the I/O behavior of the integer
types.  I'd be -0.1 on changing the latter.


I think it would be strange if I/O routines would accept a different 
syntax from the literals.  Also, the behavior of a cast from string/text 
to a numeric type is usually defined in terms of what the literal syntax 
is, so they need to be aligned.





Re: JSONB docs patch

2022-01-26 Thread Andrew Dunstan


On 1/25/22 18:08, David G. Johnston wrote:
> On Tue, Jan 25, 2022 at 3:38 PM Mikhail Dobrinin
>  wrote:
>
> Hello,
>
> I have come across some missing documentation that I think could
> benefit the community.
>
> Several functions like `jsonb_exists`, `jsonb_exists_any`,
> `jsonb_exists_all` have existed for many PG versions but were not
> documented. They are equivalent to `?`, `?|`, and `?&` operators.
> But some JDBC drivers have issues with native queries containing
> these operators (see
> 
> https://stackoverflow.com/questions/38370972/how-do-i-use-postgresql-jsonb-operators-containing-a-question-mark-via-jdb),
> so it is useful for users of PG to know the function equivalents
> of these operators.
>
> I have attached the patch as an attachment to this email. The
> documentation builds correctly without any lint errors after
> applying the patch locally. This is my first time contributing, so
> let me know if there is anything else I should do (add to
> commitfest etc).
>
>
> I'm doubtful that encouraging use of these functions for JDBC-users is
> better than them learning to write queries using the proper operator. 
> The reality is that the usage of indexes depends upon operators being
> used in query text, not function names (unless you define a functional
> index, which doesn't happen).  Your SO post says as much and does
> mention that ?? is indeed the coding that is required.
>
> What I think we should do in light of this reality, though, is indeed
> prohibit "??" as (or within) an operator in PostgreSQL.  Since core is
> not presently using that operator its prohibition should be reasonably
> simple - though maybe extension authors got too creative?
>
> -1 to this patch on the above grounds.  As for the patch itself:
> The parentheticals you wrote might be appropriate for a commit message
> but do not belong in the documentation.  Mentioning JDBC is simply a
> no-no; and we don't document "why" we decided to document something.
> We also don't go around pointing out what functions and operators
> perform the same behavior (mostly because we generally just don't do
> that, see above).
>
> I didn't actually review the material parts of the table.  Nothing
> seems obviously incorrect there though.
>
>

Yeah. The omission from the docs is not accidental - we generally don't
document the functions underlying operators.

These jsonb operators are not the only ones with '?' in the name -
there's a whole heap of them for geometric types. Are we supposed to
document all those too?

I feel the pain that JDBC users have here. It's a pity there are no
alternative placeholder mechanisms in JDBC. Perl's DBD::Pg, which also
has '?' placeholders, provides a couple of quite workable ways around
the issue.


cheers


andrew



--
Andrew Dunstan
EDB: https://www.enterprisedb.com





Re: decoupling table and index vacuum

2022-01-26 Thread Dilip Kumar
On Mon, Sep 27, 2021 at 8:48 AM Masahiko Sawada  wrote:
>
Hi,

Here is the first WIP patch for the decoupling table and index vacuum.
The first mail of the thread has already explained the complete
background of why we want to do this so instead of describing that I
will directly jump into explaining what these patches do.

Currently, the table vacuum and index vacuum are executed as a single
operation.  Basically, we vacuum the table by performing hot pruning and
remembering dead items in a cache, then we perform the index vacuum, and
finally we perform the second pass of the heap vacuum, in which we mark
those items unused.

In this patch, we make these multiple vacuum passes as independent
operations.  So the idea is that we provide multiple vacuum options
under that, the user can perform the independent operations i.e.
"VACUUM (heap_hot_prune) tbl_name" for performing just the hot prune
(first vacuum) pass, "VACUUM (heap_vacuum) tbl_name" for the second
heap pass to set dead item unused for which index vacuum is done. And
additionally, we are now allowing users to just perform the index
vacuum i.e. "VACUUM idx_name".

So under the heap_hot_prune pass, we will generate the dead tids and
instead of directly performing the index vacuum we will flush those
dead tids into the conveyor belt using Deadtidstore interfaces.  Then
in the index pass, we will read the data from the conveyor belt and
perform the index vacuum and at last, in the heap_vacuum pass, we will
read the data from the conveyor belt and mark all dead items unused.
However, in the second pass, we can only mark those items unused which
are dead, and for which all the indexes for the table are already
vacuumed.  To identify that, the pg_class entry for each index stores the
conveyor belt pageno up to which the index vacuum has already been done, and
the pg_class entry for the table stores the pageno up to which the
heap_vacuum pass has already been done.  Additionally, while doing the
hot_prune pass, if we find an item that is already dead and whose index
vacuum is already done, we directly set it unused; for this, too, we use
Deadtidstore interfaces.

Deadtidstore provides interfaces over the conveyor belt for storing
and retrieving dead tids into the conveyor belt.  This module
maintains a DeadTidState which keeps track of the current insertion
progress, i.e. the first and the last conveyor belt page for the current
vacuum run. And on the completion of the vacuum run, this takes care
of setting the complete vacuum run bound by storing the last conveyor
belt pageno of the current vacuum run into the special space of the
first conveyor belt page for this run.  This also provides the
infrastructure to avoid adding duplicate tids into the conveyor belt.
Basically, if we perform the first vacuum pass multiple times without
executing the second vacuum pass, it is possible that we encounter the same
dead tids in the conveyor belt again, so this module maintains a cache over
the conveyor belt.  It only loads into the cache the data for the block the
vacuum is currently processing, so we don't need to maintain a huge cache.

Test example:

CREATE TABLE t (a int);
CREATE INDEX idx on t(a);
INSERT INTO t VALUES (generate_series(1,100));
DELETE FROM t where a > 300;
VACUUM (heap_hot_prune) t;
VACUUM idx;
"VACUUM (heap_vacuum) t;

TODO:
- This is just a POC patch to discuss the design idea and needs a lot
of improvement and testing.
- We are using a slightly different format for storing the dead tids
into the conveyor belt which is explained in the patch but the
traditional multi-pass vacuum is still using the same format (array of
ItemPointeData), so we need to unify that format.
- Performance testing.
- Cleaner interfaces so that we can easily be integrated with auto
vacuum, currently, this is not provided for the manual vacuum.
- Add test cases.

Patches can be applied on the latest conveyor belt patches[1]

[1] 
https://www.postgresql.org/message-id/CAFiTN-sQUddO9JPiH3tz%2BvbNqRqi_pgndecy8k2yXAnO3ymqZA%40mail.gmail.com

-- 
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com
From 7fb07fcb8aca82e3629dcf90113c1bbaf511c343 Mon Sep 17 00:00:00 2001
From: Dilip Kumar 
Date: Sat, 23 Oct 2021 15:33:13 +0530
Subject: [PATCH v1 1/3] Add DEADTID_FORK

---
 src/common/relpath.c | 3 ++-
 src/include/common/relpath.h | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/common/relpath.c b/src/common/relpath.c
index 636c96e..617b720 100644
--- a/src/common/relpath.c
+++ b/src/common/relpath.c
@@ -34,7 +34,8 @@ const char *const forkNames[] = {
 	"main",		/* MAIN_FORKNUM */
 	"fsm",		/* FSM_FORKNUM */
 	"vm",		/* VISIBILITYMAP_FORKNUM */
-	"init"		/* INIT_FORKNUM */
+	"init",		/* INIT_FORKNUM */
+	"tid"		/* DEADTID_FORKNUM */
 };
 
 StaticAssertDecl(lengthof(forkNames) == (MAX_FORKNUM + 1),
diff --git a/src/include/common/relpath.h b/src/include/common/relpath.h
index a4b5dc8..b110d82 100644
--- 

Re: Schema variables - new implementation for Postgres 15

2022-01-26 Thread Julien Rouhaud
Hi,

On Wed, Jan 26, 2022 at 02:43:54PM +0100, Pavel Stehule wrote:
> 
> I think this is still necessary. The lock protects the variable against
> drop from the second session, but not for reverted deletion from the
> current session.
> 
> This implementation is due to Tomas's request for
> 
> CREATE VARIABLE xx AS int;
> LET xx = 100;
> BEGIN;
> DROP VARIABLE xx;
> ROLLBACK;
> SELECT xx; --> 100
> 
> and the variable still holds the last value before DROP

I thought about this case, but assumed that the session itself wouldn't process
the inval until commit. Agreed then, although the comment should clarify the
transactional behavior and why it's still necessary.

> Personally, this is a corner case (for me, and I think for users too it is
> not very interesting or important), and this behavior is not necessary;
> originally I implemented just a RESET of the variable in this case. On the
> other hand, this is a nice feature, and there is an analogy with TRUNCATE
> behavior.
> 
> Moreover, I promised, as a second step, an implementation of optional
> transactional behavior for session variables. The related code is necessary
> for that, so I prefer to keep the related code unchanged.

That's another good reason, so fine by me!




Re: libpq async duplicate error results

2022-01-26 Thread Fabien COELHO




command = SELECT pg_terminate_backend(pg_backend_pid());
result 1 status = PGRES_FATAL_ERROR
error message = "FATAL:  terminating connection due to administrator command
"
result 2 status = PGRES_FATAL_ERROR
error message = "FATAL:  terminating connection due to administrator command
server closed the connection unexpectedly
This probably means the server terminated abnormally
before or while processing the request.
"



Also, why are there multiple results being generated in the first place?


My interpretation is that the first message is a close message issued by
the server before actually severing the connection, and the second message
is generated by libpq when it notices that the connection has been closed.
So there is some sense in having two results to report these two
consecutive errors, and the question might rather be about when the buffer
should be reset.


--
Fabien.




Re: Schema variables - new implementation for Postgres 15

2022-01-26 Thread Pavel Stehule
> sessionvariable.c:
>
> + * Although session variables are not transactional, we don't
> + * want to (and cannot) run the cleanup immediately (when we
> + * get the sinval message). The value of a session variable can
> + * still be used, or the operation that triggers the cleanup can
> + * be reverted. Unfortunately, this check can be done only
> + * when the transaction is committed (the check against the
> + * system catalog requires a transaction state).
>
> This was the original idea, but since there's now locking to make all DDL
> safe, the metadata should be considered fully transactional and no session
> should still be able to use a concurrently dropped variable.  Also, the
> invalidation messages are not sent until the transaction is committed.  So
> is that approach still needed (at least for things outside ON COMMIT DROP /
> ON TRANSACTION END RESET)?
>

I think this is still necessary. The lock protects the variable against a
drop from a second session, but not against a reverted deletion in the
current session.

This implementation is due to Tomas's request for

CREATE VARIABLE xx AS int;
LET xx = 100;
BEGIN;
DROP VARIABLE xx;
ROLLBACK;
SELECT xx; --> 100

and the variable still holds the last value before DROP

Personally, this is a corner case (for me, and I think for users too it is
not very interesting or important), and this behavior is not necessary;
originally I implemented just a RESET of the variable in this case. On the
other hand, this is a nice feature, and there is an analogy with TRUNCATE
behavior.

Moreover, I promised, as a second step, an implementation of optional
transactional behavior for session variables. The related code is necessary
for that, so I prefer to keep the related code unchanged.

Regards

Pavel


Re: [PATCH] Tracking statements entry timestamp in pg_stat_statements

2022-01-26 Thread Andrei Zubkov
Hi Julien,

On Tue, 2022-01-25 at 20:22 +0800, Julien Rouhaud wrote:
> To be honest I don't see how any monitoring solution could make any
> use of
> those fields as-is.  For that reason in PoWA we unfortunately have to
> entirely
> ignore them.  There was a previous attempt to provide a way to reset
> those
> counters only (see [1]), but it was returned with feedback due to
> lack of TLC
> from the author.

Thank you, I've just looked at the thread in [1]; it was of course a weak
attempt and I think it can be done better.
> 
> 
> I don't think it's a problem, as once you have a solution on top of
> pg_stat_statements, you get information an order of magnitude better
> from that solution than from pg_stat_statements.

Agreed. I'm worried about having different solutions running
simultaneously. All of them may want accurate min/max values, but if they
all reset the min/max statistics, they will interfere with each other.
Making min/max resets configurable in each sampler seems like a partial
solution to that problem. At least every sampling solution can now make a
decision based on the reset timestamps provided by pg_stat_statements.
> 
> 
> If you're worried about some external table having a NOT NULL
> constraint for
> those fields, how about returning NaN instead?  That's a valid value
> for a
> double precision data type.

I don't know for sure what could go wrong here. My preference is to use
NULLs, as they seem more suitable here, but only if we are not expecting
significant side effects.
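
As a concrete illustration of the two options being weighed (a hedged
sketch, not the actual pg_stat_statements code), a row-producing C function
would either mark the column as NULL via the usual nulls[] array or emit
NaN, which get_float8_nan() from utils/float.h returns as a perfectly valid
float8 value.  The names "have_minmax" and "min_exec_time" are illustrative
only.

#include "postgres.h"
#include "utils/float.h"		/* get_float8_nan() */

static void
fill_min_time(Datum *values, bool *nulls, int i,
			  bool have_minmax, double min_exec_time)
{
	if (have_minmax)
		values[i] = Float8GetDatum(min_exec_time);
	else
	{
		/* Option 1: report SQL NULL */
		nulls[i] = true;

		/* Option 2 (alternative): report NaN, a valid double precision value */
		/* values[i] = Float8GetDatum(get_float8_nan()); */
	}
}
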
-- 
Andrei Zubkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company






RE: logical replication empty transactions

2022-01-26 Thread osumi.takami...@fujitsu.com
On Tuesday, January 11, 2022 6:43 PM Ajin Cherian  wrote:
> Minor update to rebase the patch so that it applies clean on HEAD.
Hi, let me share some additional comments on v16.


(1) comment of pgoutput_change

@@ -630,11 +688,15 @@ pgoutput_change(LogicalDecodingContext *ctx, 
ReorderBufferTXN *txn,
Relation relation, ReorderBufferChange *change)
 {
PGOutputData *data = (PGOutputData *) ctx->output_plugin_private;
+   PGOutputTxnData *txndata = (PGOutputTxnData *) 
txn->output_plugin_private;
MemoryContext old;
RelationSyncEntry *relentry;
TransactionId xid = InvalidTransactionId;
Relation    ancestor = NULL;

+   /* If not streaming, should have setup txndata as part of BEGIN/BEGIN 
PREPARE */
+   Assert(in_streaming || txndata);
+

In my humble opinion, the comment should not mention BEGIN PREPARE,
because this patch's scope doesn't include two-phase commit.
(We could add that in another patch that extends the scope after this one is committed?)

The same applies to the comment in pgoutput_truncate.

(2) "keep alive" should be "keepalive" in WalSndUpdateProgress

/*
+* When skipping empty transactions in synchronous replication, we need
+* to send a keep alive to keep the MyWalSnd locations updated.
+*/
+   force_keepalive_syncrep = send_keepalive && SyncRepEnabled();
+

Also, this applies to the comment for force_keepalive_syncrep.

(3) The second sentence should end with a period in the comment of
pgoutput_message.

@@ -845,6 +923,19 @@ pgoutput_message(LogicalDecodingContext *ctx, 
ReorderBufferTXN *txn,
if (in_streaming)
xid = txn->xid;

+   /*
+* Output BEGIN if we haven't yet.
+* Avoid for streaming and non-transactional messages

(4) "begin" can be changed to "BEGIN" in the comment of the PGOutputTxnData
definition.

Throughout the patch, the BEGIN message is written in capital letters
("BEGIN") except in this one place. We can apply the same convention here
as well.

+typedef struct PGOutputTxnData
+{
+   bool sent_begin_txn;/* flag indicating whether begin has been sent 
*/
+} PGOutputTxnData;
+

(5) Inconsistent formatting of Assert statements with respect to blank lines.

In the case below, it would be better to insert one blank line
after the Assert().

+static void
+pgoutput_begin(LogicalDecodingContext *ctx, ReorderBufferTXN *txn)
+{
bool        send_replication_origin = txn->origin_id != 
InvalidRepOriginId;
+   PGOutputTxnData *txndata = (PGOutputTxnData *) 
txn->output_plugin_private;

+   Assert(txndata);
OutputPluginPrepareWrite(ctx, !send_replication_origin);


(6) The new code in pgoutput_commit_txn looks slightly messy.

@@ -419,7 +455,25 @@ static void
 pgoutput_commit_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
XLogRecPtr commit_lsn)
 {
-   OutputPluginUpdateProgress(ctx);
+   PGOutputTxnData *txndata = (PGOutputTxnData *) 
txn->output_plugin_private;
+   bool        skip;
+
+   Assert(txndata);
+
+   /*
+* If a BEGIN message was not yet sent, then it means there were no 
relevant
+* changes encountered, so we can skip the COMMIT message too.
+*/
+   skip = !txndata->sent_begin_txn;
+   pfree(txndata);
+   txn->output_plugin_private = NULL;
+   OutputPluginUpdateProgress(ctx, skip);

Could we refactor this new part?
IMO, freeing the data structure at the top of the function seems odd.

One idea is to extract that part into a new function,
something like below.

static bool
txn_sent_begin(ReorderBufferTXN *txn)
{
PGOutputTxnData *txndata = (PGOutputTxnData *) txn->output_plugin_private;
bool        needs_skip;

Assert(txndata);

needs_skip = !txndata->sent_begin_txn;

pfree(txndata);
txn->output_plugin_private = NULL;

return needs_skip;
}
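
If that helper were adopted, the call site in pgoutput_commit_txn could
then shrink to something like the sketch below.  This is only an
illustration of the suggestion, reusing the names from the hunks quoted
above (including the OutputPluginUpdateProgress() signature the patch
introduces); it is not the actual patch code.

static void
pgoutput_commit_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
					XLogRecPtr commit_lsn)
{
	/* Per the sketch above, the helper reports whether COMMIT can be skipped. */
	bool		skip = txn_sent_begin(txn);

	OutputPluginUpdateProgress(ctx, skip);
	if (skip)
		return;

	OutputPluginPrepareWrite(ctx, true);
	logicalrep_write_commit(ctx->out, txn, commit_lsn);
	OutputPluginWrite(ctx, true);
}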

FYI, I had a look at
v12-0002-Skip-empty-prepared-transactions-for-logical-rep.patch
for reference regarding pgoutput_rollback_prepared_txn and
pgoutput_commit_prepared_txn.
It looks like this kind of function might work for future extensions as well.
What do you think?

Best Regards,
Takamichi Osumi



Re: logical decoding and replication of sequences

2022-01-26 Thread Petr Jelinek
I would not remove it altogether; there are plenty of consumers of this
extension's output in the wild (even if I think that's unfortunate) that might
not be interested in sequences, but changing it to off by default certainly
makes sense.

--
Petr Jelinek

> On 26. 1. 2022, at 11:09, Peter Eisentraut 
>  wrote:
> 
> On 26.01.22 03:16, Tomas Vondra wrote:
>> Here's a rebased version of the patch series. I decided to go back to the 
>> version from 2021/12/14, which does not include the changes to WAL logging.
> 
> I notice that test_decoding still has skip-sequences enabled by default, "for 
> backward compatibility".  I think we had concluded in previous discussions 
> that we don't need that.  I would remove the option altogether.
> 





Re: Replace uses of deprecated Python module distutils.sysconfig

2022-01-26 Thread Julien Rouhaud
Hi,

Sorry I somehow missed that email.

On Mon, Jan 24, 2022 at 7:53 AM Tom Lane  wrote:
>
> Now, most of those are BSD machines --- but lapwing isn't.
> It says
>
> checking for python... (cached) /usr/bin/python
> configure: using python 3.6.9 (default, Jan 14 2022, 06:45:55)
> checking for Python distutils module... yes
> checking Python configuration directory... 
> /usr/local/lib/python3.6/config-3.6m-i386-linux-gnu
> checking Python include directories... -I/usr/local/include/python3.6m
> checking how to link an embedded Python application... -L/usr/local/lib 
> -lpython3.6m -lpthread -ldl  -lutil -lrt -lm
>
> Not sure what to make of that --- maybe that's a handmade build?

Yes it is.  Lapwing is a vanilla Debian 7, and the backports only
offer Python 3.2.  So in anticipation of the switch to meson I
compiled the latest Python 3.6, as it's supposed to be the oldest
supported version.




Re: Skipping logical replication transactions on subscriber side

2022-01-26 Thread Amit Kapila
On Wed, Jan 26, 2022 at 12:51 PM Masahiko Sawada  wrote:
>
> On Wed, Jan 26, 2022 at 1:43 PM David G. Johnston
>  wrote:
> >
> > We probably should just provide an option for the user to specify 
> > "subrelid".  If null, only the main apply worker will skip the given xid, 
> > otherwise only the worker tasked with syncing that particular table will do 
> > so.  It might take a sequence of ALTER SUBSCRIPTION SET commands to get a 
> > broken initial table synchronization to load completely but at least there 
> > will not be any surprises as to which tables had transactions skipped and 
> > which did not.
>
> That would work, but I’m concerned about whether users can specify it
> properly. Also, we would need to change the errcontext message
> generated by apply_error_callback() so the user can know whether the
> error occurred in the apply worker or in a tablesync worker.
>
> Or, as another idea, since an error during table synchronization is
> not common and in practice could be resolved by truncating the table and
> restarting the synchronization, there might be no need to go that far,
> and we can support it only for apply worker errors.
>

Yes, that is what I also have in mind. We can always extend this
feature to the tablesync process later, because it can fail not only for
the specified skip_xid but also for many other reasons during the initial
copy.

-- 
With Regards,
Amit Kapila.




Re: row filtering for logical replication

2022-01-26 Thread Amit Kapila
On Wed, Jan 26, 2022 at 8:37 AM houzj.f...@fujitsu.com
 wrote:
>
> On Monday, January 24, 2022 4:38 PM Peter Smith  wrote:
> >
> >
> > 3. src/backend/utils/cache/relcache.c - RelationBuildPublicationDesc
> >
> > +RelationBuildPublicationDesc(Relation relation)
> >  {
> >   List*puboids;
> >   ListCell   *lc;
> >   MemoryContext oldcxt;
> >   Oid schemaid;
> > - PublicationActions *pubactions = palloc0(sizeof(PublicationActions));
> > + List       *ancestors = NIL;
> > + Oid         relid = RelationGetRelid(relation);
> > + AttrNumber  invalid_rfcolnum = InvalidAttrNumber;
> > + PublicationDesc *pubdesc = palloc0(sizeof(PublicationDesc));
> > + PublicationActions *pubactions = &pubdesc->pubactions;
> > +
> > + pubdesc->rf_valid_for_update = true;
> > + pubdesc->rf_valid_for_delete = true;
> >
> > IMO it would be better to change the "sense" of those variables.
> > e.g.
> >
> > "rf_valid_for_update" --> "rf_invalid_for_update"
> > "rf_valid_for_delete" --> "rf_invalid_for_delete"
> >
> > That way they have the same 'sense' as the AttrNumbers so it all reads 
> > better to
> > me.
> >
> > Also, it means no special assignment is needed because the palloc0 will set
> > them correctly
>
> Thinking about it again, I am not sure it's better to have an invalid_... flag.
> It seems more natural to have a valid_... flag.
>

Can't we do without these valid_ flags? AFAICS, if we check the
"invalid_" attributes it should serve our purpose, because those can
hold a valid attribute number only when the row filter contains some
column that is not part of the replica identity. A few possible
optimizations in RelationBuildPublicationDesc:

a. It calls contain_invalid_rfcolumn with the pubid and then does a cache
lookup to again find a publication that its only caller already has access
to; can't we pass that along instead?
b. In RelationBuildPublicationDesc(), we call
GetRelationPublications() to get the list of publications and then
process those publications. I think if none of the publications has a
row filter and the relation has a replica identity, then we don't need to
build the descriptor at all. If we do this optimization inside
RelationBuildPublicationDesc, we may want to rename the function to
CheckAndBuildRelationPublicationDesc or something like that.

-- 
With Regards,
Amit Kapila.




Re: logical decoding and replication of sequences

2022-01-26 Thread Peter Eisentraut

On 26.01.22 03:16, Tomas Vondra wrote:
Here's a rebased version of the patch series. I decided to go back to 
the version from 2021/12/14, which does not include the changes to WAL 
logging.


I notice that test_decoding still has skip-sequences enabled by default, 
"for backward compatibility".  I think we had concluded in previous 
discussions that we don't need that.  I would remove the option altogether.






RE: logical replication empty transactions

2022-01-26 Thread osumi.takami...@fujitsu.com
On Tuesday, January 11, 2022 6:43 PM Ajin Cherian  wrote:
> Minor update to rebase the patch so that it applies clean on HEAD.
Hi, thanks for your rebase.

Several comments.

(1) the commit message

"
transactions, keepalive messages are sent to keep the LSN locations updated
on the standby.
This patch does not skip empty transactions that are "streaming" or "two-phase".
"

I suggest that one blank line might be needed before the last paragraph.

(2) Could you please remove the pair of curly brackets around the single statement below?

@@ -1546,10 +1557,13 @@ WalSndWaitForWal(XLogRecPtr loc)
 * otherwise idle, this keepalive will trigger a reply. 
Processing the
 * reply will update these MyWalSnd locations.
 */
-   if (MyWalSnd->flush < sentPtr &&
+   if (force_keepalive_syncrep ||
+   (MyWalSnd->flush < sentPtr &&
MyWalSnd->write < sentPtr &&
-   !waiting_for_ping_response)
+   !waiting_for_ping_response))
+   {
WalSndKeepalive(false);
+   }
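
In other words, the hunk would end up looking roughly like this, with the
braces around the single statement dropped (a sketch of the requested
change only, not the actual patch):

	if (force_keepalive_syncrep ||
		(MyWalSnd->flush < sentPtr &&
		 MyWalSnd->write < sentPtr &&
		 !waiting_for_ping_response))
		WalSndKeepalive(false);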


(3) Is it this patch's responsibility to initialize the data in
pgoutput_begin_prepare_txn?

@@ -433,6 +487,8 @@ static void
 pgoutput_begin_prepare_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn)
 {
bool        send_replication_origin = txn->origin_id != 
InvalidRepOriginId;
+   PGOutputTxnData *txndata = MemoryContextAllocZero(ctx->context,
+                                                     sizeof(PGOutputTxnData));

OutputPluginPrepareWrite(ctx, !send_replication_origin);
logicalrep_write_begin_prepare(ctx->out, txn);


Even if we need this initialization for the non-streaming case
or the non-two_phase case, there can be another issue:
we don't free the allocated memory for this data, right?
There's only one place in the entire patch where it is freed,
which is in pgoutput_commit_txn(). So a corresponding free of the
memory looks necessary in the two-phase commit functions.

(4) Better placement of SyncRepEnabled.

IIUC, SyncRepEnabled is called not only by the walsender but also by other
backends via CommitTransaction -> RecordTransactionCommit -> SyncRepWaitForLSN.
So the place where the prototype for SyncRepEnabled is added does not seem
appropriate, strictly speaking, or it at least requires a comment like
/* called by wal sender or other backends */.

@@ -90,6 +90,7 @@ extern void SyncRepCleanupAtProcExit(void);
 /* called by wal sender */
 extern void SyncRepInitConfig(void);
 extern void SyncRepReleaseWaiters(void);
+extern bool SyncRepEnabled(void);

Even if we intend it to be used only by the walsender, the current placement
of SyncRepEnabled in syncrep.c might not be ideal.
That file appears to have a section for functions used by wal sender
processes, and the function is not placed there.

In src/backend/replication/syncrep.c, see the comment below:
/*
 * ===
 * Synchronous Replication functions for wal sender processes
 * ===
 */

(5) Minor consistency issue in how a couple of messages are written.

@@ -777,6 +846,9 @@ pgoutput_truncate(LogicalDecodingContext *ctx, 
ReorderBufferTXN *txn,
Oid        *relids;
TransactionId xid = InvalidTransactionId;

+   /* If not streaming, should have setup txndata as part of BEGIN/BEGIN 
PREPARE */
+   Assert(in_streaming || txndata);


In the commit message, you write it as below:
...
skip BEGIN / COMMIT messages for transactions that are empty. The patch
...

Here "BEGIN / COMMIT" is written with spaces around the slash.
I suggest unifying all of these occurrences for consistency.

Best Regards,
Takamichi Osumi



Re: Corruption during WAL replay

2022-01-26 Thread Kyotaro Horiguchi
At Mon, 24 Jan 2022 23:33:20 +0300, Daniel Shelepanov  
wrote in 
> Hi. This is my first attempt to review a patch so feel free to tell me
> if I missed something.

Welcome!

> As of today's state of REL_14_STABLE
> (ef9706bbc8ce917a366e4640df8c603c9605817a), the problem is
> reproducible using the script provided by Daniel Wood in this
> (1335373813.287510.1573611814...@connect.xfinity.com) message. Also,
> the latest patch does not seem to apply anymore and requires some minor
> tweaks.

Thanks for the info.  The reason for my failure is that checksums were
enabled.  Disabling both fpw and checksums (and wal_log_hints)
allows me to reproduce the issue.  And what I found is:

v3 patch:
 >  vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE);
!>  if (0 && nvxids > 0)
 >  {

Ugggh!  It looks like a debugging tweak but it prevents everything
from working.

The attached is the fixed version and it surely works with the repro.

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
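
The pattern the patch introduces around relation truncation mirrors the
existing DELAY_CHKPT_START usage visible in the multixact.c hunk below; a
simplified sketch of what the new DELAY_CHKPT_COMPLETE flag is meant to
bracket (the flag names are taken from the patch, but this fragment is
illustrative only and elides the WAL logging, buffer dropping and
truncation calls themselves):

	/* Block any in-progress checkpoint from completing until we are done. */
	Assert((MyProc->delayChkpt & DELAY_CHKPT_COMPLETE) == 0);
	MyProc->delayChkpt |= DELAY_CHKPT_COMPLETE;

	/*
	 * ... WAL-log the truncation, drop the relation's buffers, and truncate
	 * the underlying files here; a checkpoint completing in this window
	 * could otherwise leave an inconsistent file behind after a crash ...
	 */

	/* All critical work is done; checkpoints may complete again. */
	MyProc->delayChkpt &= ~DELAY_CHKPT_COMPLETE;
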
>From e4155f4e777f31c019cbbd00f41c7b5eb2320ef7 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi 
Date: Wed, 26 Jan 2022 15:07:58 +0900
Subject: [PATCH v4] Delay checkpoint completion until truncation completes

When a relation is being truncated, the buffers for the relation are
dropped without being written to the underlying relation file, so from
that moment the content of the to-be-truncated file is not guaranteed
to be consistent. This is fine in the normal case where the file is
eventually truncated. However, if a checkpoint completes after the
buffers are dropped but the file truncation never happens due to a
server crash or filesystem failure, the file left behind is
inconsistent with the WAL.

This commit delays checkpoint completion until all ongoing file
truncations have finished, to prevent such inconsistency.
---
 src/backend/access/transam/multixact.c  |  6 +++---
 src/backend/access/transam/twophase.c   | 12 +++-
 src/backend/access/transam/xact.c   |  5 +++--
 src/backend/access/transam/xlog.c   | 16 +--
 src/backend/access/transam/xloginsert.c |  2 +-
 src/backend/catalog/storage.c   | 13 +
 src/backend/storage/buffer/bufmgr.c |  6 --
 src/backend/storage/ipc/procarray.c | 26 ++---
 src/backend/storage/lmgr/proc.c |  4 ++--
 src/include/storage/proc.h  |  7 ++-
 src/include/storage/procarray.h |  7 ---
 11 files changed, 76 insertions(+), 28 deletions(-)

diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 806f2e43ba..70593238ab 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -3075,8 +3075,8 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
 	 * crash/basebackup, even though the state of the data directory would
 	 * require it.
 	 */
-	Assert(!MyProc->delayChkpt);
-	MyProc->delayChkpt = true;
+	Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
+	MyProc->delayChkpt |= DELAY_CHKPT_START;
 
 	/* WAL log truncation */
 	WriteMTruncateXlogRec(newOldestMultiDB,
@@ -3102,7 +3102,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
 	/* Then offsets */
 	PerformOffsetsTruncation(oldestMulti, newOldestMulti);
 
-	MyProc->delayChkpt = false;
+	MyProc->delayChkpt &= ~DELAY_CHKPT_START;
 
 	END_CRIT_SECTION();
 	LWLockRelease(MultiXactTruncationLock);
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 271a3146db..2b1a25e723 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -474,7 +474,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid,
 	}
 	proc->xid = xid;
 	Assert(proc->xmin == InvalidTransactionId);
-	proc->delayChkpt = false;
+	proc->delayChkpt = 0;
 	proc->statusFlags = 0;
 	proc->pid = 0;
 	proc->databaseId = databaseid;
@@ -1165,7 +1165,8 @@ EndPrepare(GlobalTransaction gxact)
 
 	START_CRIT_SECTION();
 
-	MyProc->delayChkpt = true;
+	Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
+	MyProc->delayChkpt |= DELAY_CHKPT_START;
 
 	XLogBeginInsert();
 	for (record = records.head; record != NULL; record = record->next)
@@ -1208,7 +1209,7 @@ EndPrepare(GlobalTransaction gxact)
 	 * checkpoint starting after this will certainly see the gxact as a
 	 * candidate for fsyncing.
 	 */
-	MyProc->delayChkpt = false;
+	MyProc->delayChkpt &= ~DELAY_CHKPT_START;
 
 	/*
 	 * Remember that we have this GlobalTransaction entry locked for us.  If
@@ -2267,7 +2268,8 @@ RecordTransactionCommitPrepared(TransactionId xid,
 	START_CRIT_SECTION();
 
 	/* See notes in RecordTransactionCommit */
-	MyProc->delayChkpt = true;
+	Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
+	MyProc->delayChkpt |= DELAY_CHKPT_START;
 
 	/*
 	 * Emit the XLOG commit record. Note that we mark 2PC commits as
@@ -2315,7 +2317,7 @@