Re: pg_walfile_name_offset can return inconsistent values

2023-11-24 Thread Bruce Momjian
On Mon, Nov 13, 2023 at 02:12:12PM -0500, Bruce Momjian wrote:
> On Mon, Nov 13, 2023 at 09:49:53AM -0800, Andres Freund wrote:
> > Hi,
> > 
> > On 2023-11-13 12:14:57 -0500, Bruce Momjian wrote:
> > > +SELECT *
> > > +FROM (values ('0/16ff'), ('0/1700'), ('0/1701')) as t(lsn),
> > > + LATERAL pg_walfile_name_offset(lsn::pg_lsn),
> > > + LATERAL pg_walfile_name(lsn::pg_lsn);
> > > +lsn |file_name | file_offset | 
> > > pg_walfile_name  
> > > ++--+-+--
> > > + 0/16ff | 00010016 |16777215 | 
> > > 00010016
> > > + 0/1700 | 00010017 |   0 | 
> > > 00010017
> > > + 0/1701 | 00010017 |   1 | 
> > > 00010017
> > > +(3 rows)
> > 
> > These would break when testing with a different segment size. Today that's 
> > not
> > the case...
> 
> Okay, test removed in the updated patch.

Patch applied to master.

-- 
  Bruce Momjian  https://momjian.us
  EDB  https://enterprisedb.com

  Only you can decide what is important to you.




Re: pg_walfile_name_offset can return inconsistent values

2023-11-13 Thread Bruce Momjian
On Mon, Nov 13, 2023 at 09:49:53AM -0800, Andres Freund wrote:
> Hi,
> 
> On 2023-11-13 12:14:57 -0500, Bruce Momjian wrote:
> > +SELECT *
> > +FROM (values ('0/16ff'), ('0/1700'), ('0/1701')) as t(lsn),
> > + LATERAL pg_walfile_name_offset(lsn::pg_lsn),
> > + LATERAL pg_walfile_name(lsn::pg_lsn);
> > +lsn |file_name | file_offset | pg_walfile_name 
> >  
> > ++--+-+--
> > + 0/16ff | 00010016 |16777215 | 
> > 00010016
> > + 0/1700 | 00010017 |   0 | 
> > 00010017
> > + 0/1701 | 00010017 |   1 | 
> > 00010017
> > +(3 rows)
> 
> These would break when testing with a different segment size. Today that's not
> the case...

Okay, test removed in the updated patch.

-- 
  Bruce Momjian  https://momjian.us
  EDB  https://enterprisedb.com

  Only you can decide what is important to you.
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index d963f0a0a0..7c22420839 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -27044,11 +27044,6 @@ postgres=# SELECT * FROM pg_walfile_name_offset((pg_backup_stop()).lsn);
 (1 row)
 
 Similarly, pg_walfile_name extracts just the write-ahead log file name.
-When the given write-ahead log location is exactly at a write-ahead log file boundary, both
-these functions return the name of the preceding write-ahead log file.
-This is usually the desired behavior for managing write-ahead log archiving
-behavior, since the preceding file is the last one that currently
-needs to be archived.

 

diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c
index 45a70668b1..45452d937c 100644
--- a/src/backend/access/transam/xlogfuncs.c
+++ b/src/backend/access/transam/xlogfuncs.c
@@ -374,10 +374,6 @@ pg_last_wal_replay_lsn(PG_FUNCTION_ARGS)
 /*
  * Compute an xlog file name and decimal byte offset given a WAL location,
  * such as is returned by pg_backup_stop() or pg_switch_wal().
- *
- * Note that a location exactly at a segment boundary is taken to be in
- * the previous segment.  This is usually the right thing, since the
- * expected usage is to determine which xlog file(s) are ready to archive.
  */
 Datum
 pg_walfile_name_offset(PG_FUNCTION_ARGS)
@@ -414,7 +410,7 @@ pg_walfile_name_offset(PG_FUNCTION_ARGS)
 	/*
 	 * xlogfilename
 	 */
-	XLByteToPrevSeg(locationpoint, xlogsegno, wal_segment_size);
+	XLByteToSeg(locationpoint, xlogsegno, wal_segment_size);
 	XLogFileName(xlogfilename, GetWALInsertionTimeLine(), xlogsegno,
  wal_segment_size);
 
@@ -457,7 +453,7 @@ pg_walfile_name(PG_FUNCTION_ARGS)
  errhint("%s cannot be executed during recovery.",
 		 "pg_walfile_name()")));
 
-	XLByteToPrevSeg(locationpoint, xlogsegno, wal_segment_size);
+	XLByteToSeg(locationpoint, xlogsegno, wal_segment_size);
 	XLogFileName(xlogfilename, GetWALInsertionTimeLine(), xlogsegno,
  wal_segment_size);
 
diff --git a/src/test/regress/expected/misc_functions.out b/src/test/regress/expected/misc_functions.out
index c669948370..9302134077 100644
--- a/src/test/regress/expected/misc_functions.out
+++ b/src/test/regress/expected/misc_functions.out
@@ -619,7 +619,7 @@ SELECT count(*) > 0 AS ok FROM pg_control_system();
  t
 (1 row)
 
--- pg_split_walfile_name
+-- pg_split_walfile_name, pg_walfile_name & pg_walfile_name_offset
 SELECT * FROM pg_split_walfile_name(NULL);
  segment_number | timeline_id 
 +-
@@ -642,3 +642,31 @@ SELECT segment_number > 0 AS ok_segment_number, timeline_id
  t |  4294967295
 (1 row)
 
+SELECT setting::int8 AS segment_size
+FROM pg_settings
+WHERE name = 'wal_segment_size'
+\gset
+SELECT segment_number, file_offset
+FROM pg_walfile_name_offset('0/0'::pg_lsn + :segment_size),
+ pg_split_walfile_name(file_name);
+ segment_number | file_offset 
++-
+  1 |   0
+(1 row)
+
+SELECT segment_number, file_offset
+FROM pg_walfile_name_offset('0/0'::pg_lsn + :segment_size + 1),
+ pg_split_walfile_name(file_name);
+ segment_number | file_offset 
++-
+  1 |   1
+(1 row)
+
+SELECT segment_number, file_offset = :segment_size - 1
+FROM pg_walfile_name_offset('0/0'::pg_lsn + :segment_size - 1),
+ pg_split_walfile_name(file_name);
+ segment_number | ?column? 
++--
+  0 | t
+(1 row)
+
diff --git a/src/test/regress/sql/misc_functions.sql b/src/test/regress/sql/misc_functions.sql
index b57f01f3e9..d3dc591173 100644
--- a/src/test/regress/sql/misc_functions.sql
+++ b/src/test/regress/sql/misc_functions.sql
@@ -230,10 +230,23 @@ SELECT count(*) > 0 AS ok FROM pg_control_init();
 SELECT count(*) > 0 AS ok FROM 

Re: pg_walfile_name_offset can return inconsistent values

2023-11-13 Thread Andres Freund
Hi,

On 2023-11-13 12:14:57 -0500, Bruce Momjian wrote:
> +SELECT *
> +FROM (values ('0/16ff'), ('0/1700'), ('0/1701')) as t(lsn),
> + LATERAL pg_walfile_name_offset(lsn::pg_lsn),
> + LATERAL pg_walfile_name(lsn::pg_lsn);
> +lsn |file_name | file_offset | pg_walfile_name   
>
> ++--+-+--
> + 0/16ff | 00010016 |16777215 | 
> 00010016
> + 0/1700 | 00010017 |   0 | 
> 00010017
> + 0/1701 | 00010017 |   1 | 
> 00010017
> +(3 rows)

These would break when testing with a different segment size. Today that's not
the case...

Greetings,

Andres Freund




Re: pg_walfile_name_offset can return inconsistent values

2023-11-13 Thread Bruce Momjian
On Fri, Nov 10, 2023 at 07:59:43PM -0800, Andres Freund wrote:
> Hi,
> 
> On 2023-11-09 16:14:07 -0500, Bruce Momjian wrote:
> > On Thu, Nov  9, 2023 at 09:49:48PM +0100, Matthias van de Meent wrote:
> > > Either way, I think fix #1 is most correct (as was attached in
> > > offset2.diff, and quoted verbatim here), because that has no chance of
> > > having surprising underflowing behaviour when you use '0/0'::lsn as
> > > input.
> > 
> > Attached is the full patch that changes pg_walfile_name_offset() and
> > pg_walfile_name().  There is no need for doc changes.
> 
> I think this needs to add tests "documenting" the changed behaviour and
> perhaps also for a few other edge cases.  You could e.g. test
>   SELECT * FROM pg_walfile_name_offset('0/0')
> which today returns bogus values, and which is independent of the wal segment
> size.
> 
> And with
> SELECT setting::int8 AS segment_size FROM pg_settings WHERE name = 
> 'wal_segment_size' \gset
> you can test real things too, e.g.:
> SELECT segment_number, file_offset FROM pg_walfile_name_offset('0/0'::pg_lsn 
> + :segment_size), pg_split_walfile_name(file_name);
> SELECT segment_number, file_offset FROM pg_walfile_name_offset('0/0'::pg_lsn 
> + :segment_size + 1), pg_split_walfile_name(file_name);
> SELECT segment_number, file_offset = :segment_size - 1 FROM 
> pg_walfile_name_offset('0/0'::pg_lsn + :segment_size - 1), 
> pg_split_walfile_name(file_name);

Sure, I have added these tests.

FYI, pg_walfile_name_offset() has this C comment, which I have removed
in this patch;

* Note that a location exactly at a segment boundary is taken to be in
* the previous segment.  This is usually the right thing, since the
* expected usage is to determine which xlog file(s) are ready to 
archive.

There is also a documentation mention of this behavior:

When the given write-ahead log location is exactly at a write-ahead log 
file boundary, both
these functions return the name of the preceding write-ahead log file.
This is usually the desired behavior for managing write-ahead log archiving
behavior, since the preceding file is the last one that currently
needs to be archived.

After seeing the doc mention, I started digging into the history of this
feature.  It was originally called pg_current_xlogfile_offset() and
proposed in this email thread, which started on 2006-07-31:


https://www.postgresql.org/message-id/flat/1154384790.3226.21.camel%40localhost.localdomain

In the initial patch by Simon Riggs, there was no "previous segment
file" behavior, just a simple filename/offset calculation.

This was applied on 2006-08-06 with this commit:

commit 704ddaaa09
Author: Tom Lane 
Date:   Sun Aug 6 03:53:44 2006 +

Add support for forcing a switch to a new xlog file; cause such a 
switch
to happen automatically during pg_stop_backup().  Add some 
functions for
interrogating the current xlog insertion point and for easily 
extracting
WAL filenames from the hex WAL locations displayed by pg_stop_backup
and friends.  Simon Riggs with some editorialization by Tom Lane.

and the email of the commit said:

https://www.postgresql.org/message-id/18457.1154836638%40sss.pgh.pa.us

I also made the new user-level functions a bit more orthogonal, so
that filenames could be extracted from the existing functions like
pg_stop_backup.

There is later talk about returning last write pointer vs. the current
insert pointer, and having it match what is returned by pg_stop_backup():


https://www.postgresql.org/message-id/1155124565.2368.95.camel%40localhost.localdomain

Methinks it should be the Write pointer all of the time, since I can't
think of a valid reason for wanting to know where the Insert pointer is
*before* we've written to the xlog file. Having it be the Insert pointer
could lead to some errors.

and I suspect that it was the desire to return the last write pointer
that caused the function to return the previous segment on a boundary
offset. This was intended to simplify log shipping implementations, I
think.

The function eventually was renamed in the xlog-to-wal renaming and moved
from xlog.c to xlogfuncs.c.  This thread in 2022 mentioned the
inconsistency for 0/0, but didn't seem to talk about the inconsistency
of offset vs file name:


https://www.postgresql.org/message-id/flat/20220204225057.GA1535307%40nathanxps13#d964275c9540d8395e138efc0a75f7e8

and it concluded with:

Yes, its the deliberate choice of design, or a kind of
questionable-but-unoverturnable decision.  I think there are many
external tools conscious of this behavior.

However, with the report about the inconsistency, the attached patch
fixes the behavior and removes the documentation about the odd behavior.
This will need to be mentioned as an 

Re: pg_walfile_name_offset can return inconsistent values

2023-11-12 Thread Michael Paquier
On Fri, Nov 10, 2023 at 07:59:43PM -0800, Andres Freund wrote:
> I think this needs to add tests "documenting" the changed behaviour and
> perhaps also for a few other edge cases.  You could e.g. test
>   SELECT * FROM pg_walfile_name_offset('0/0')
> which today returns bogus values, and which is independent of the wal segment
> size.

+1.
--
Michael


signature.asc
Description: PGP signature


Re: pg_walfile_name_offset can return inconsistent values

2023-11-10 Thread Andres Freund
Hi,

On 2023-11-09 16:14:07 -0500, Bruce Momjian wrote:
> On Thu, Nov  9, 2023 at 09:49:48PM +0100, Matthias van de Meent wrote:
> > Either way, I think fix #1 is most correct (as was attached in
> > offset2.diff, and quoted verbatim here), because that has no chance of
> > having surprising underflowing behaviour when you use '0/0'::lsn as
> > input.
> 
> Attached is the full patch that changes pg_walfile_name_offset() and
> pg_walfile_name().  There is no need for doc changes.

I think this needs to add tests "documenting" the changed behaviour and
perhaps also for a few other edge cases.  You could e.g. test
  SELECT * FROM pg_walfile_name_offset('0/0')
which today returns bogus values, and which is independent of the wal segment
size.

And with
SELECT setting::int8 AS segment_size FROM pg_settings WHERE name = 
'wal_segment_size' \gset
you can test real things too, e.g.:
SELECT segment_number, file_offset FROM pg_walfile_name_offset('0/0'::pg_lsn + 
:segment_size), pg_split_walfile_name(file_name);
SELECT segment_number, file_offset FROM pg_walfile_name_offset('0/0'::pg_lsn + 
:segment_size + 1), pg_split_walfile_name(file_name);
SELECT segment_number, file_offset = :segment_size - 1 FROM 
pg_walfile_name_offset('0/0'::pg_lsn + :segment_size - 1), 
pg_split_walfile_name(file_name);

Greetings,

Andres Freund




Re: pg_walfile_name_offset can return inconsistent values

2023-11-09 Thread Bruce Momjian
On Fri, Nov 10, 2023 at 08:25:35AM +0900, Michael Paquier wrote:
> On Thu, Nov 09, 2023 at 04:14:07PM -0500, Bruce Momjian wrote:
> > Attached is the full patch that changes pg_walfile_name_offset() and
> > pg_walfile_name().  There is no need for doc changes.  We need to
> > document this as incompatible in case users are realying on the old
> > behavior for WAL archiving purposes.  If they want the old behavior they
> > need to check for an offset of zero and subtract one from the file name.
> 
> FWIW, I am not really convinced that there is a strong need to
> backpatch any of that.  There's a risk that some queries relying on
> the old behavior suddenly break after a minor release, and that's
> always annoying.  A HEAD-only change seems like a safer bet to me.

Yes, this cannot be backpatched, clearly.

> > Can someone check that all other calls to XLByteToPrevSeg() are correct?
> 
> On a quick check, all the other calls use that for end record LSNs, so
> that looks fine.

Thank you.

-- 
  Bruce Momjian  https://momjian.us
  EDB  https://enterprisedb.com

  Only you can decide what is important to you.




Re: pg_walfile_name_offset can return inconsistent values

2023-11-09 Thread Michael Paquier
On Thu, Nov 09, 2023 at 04:14:07PM -0500, Bruce Momjian wrote:
> Attached is the full patch that changes pg_walfile_name_offset() and
> pg_walfile_name().  There is no need for doc changes.  We need to
> document this as incompatible in case users are realying on the old
> behavior for WAL archiving purposes.  If they want the old behavior they
> need to check for an offset of zero and subtract one from the file name.

FWIW, I am not really convinced that there is a strong need to
backpatch any of that.  There's a risk that some queries relying on
the old behavior suddenly break after a minor release, and that's
always annoying.  A HEAD-only change seems like a safer bet to me.

> Can someone check that all other calls to XLByteToPrevSeg() are correct?

On a quick check, all the other calls use that for end record LSNs, so
that looks fine.
--
Michael


signature.asc
Description: PGP signature


Re: pg_walfile_name_offset can return inconsistent values

2023-11-09 Thread Bruce Momjian
On Thu, Nov  9, 2023 at 09:49:48PM +0100, Matthias van de Meent wrote:
> > I have attached fix #1 as offset1.diff and fix #2 as offset2.diff.
> 
> I believe you got the references wrong; fix #1 looks like the output
> of offset2's changes, and fix #2 looks like the result of offset1's
> changes.

Sorry, I swaped them around when I realized the order I was posting them
in the email, and got it wrong.

> Either way, I think fix #1 is most correct (as was attached in
> offset2.diff, and quoted verbatim here), because that has no chance of
> having surprising underflowing behaviour when you use '0/0'::lsn as
> input.

Attached is the full patch that changes pg_walfile_name_offset() and
pg_walfile_name().  There is no need for doc changes.  We need to
document this as incompatible in case users are realying on the old
behavior for WAL archiving purposes.  If they want the old behavior they
need to check for an offset of zero and subtract one from the file name.

Can someone check that all other calls to XLByteToPrevSeg() are correct?

-- 
  Bruce Momjian  https://momjian.us
  EDB  https://enterprisedb.com

  Only you can decide what is important to you.
diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c
index 45a70668b1..d8b300638b 100644
--- a/src/backend/access/transam/xlogfuncs.c
+++ b/src/backend/access/transam/xlogfuncs.c
@@ -414,7 +414,7 @@ pg_walfile_name_offset(PG_FUNCTION_ARGS)
 	/*
 	 * xlogfilename
 	 */
-	XLByteToPrevSeg(locationpoint, xlogsegno, wal_segment_size);
+	XLByteToSeg(locationpoint, xlogsegno, wal_segment_size);
 	XLogFileName(xlogfilename, GetWALInsertionTimeLine(), xlogsegno,
  wal_segment_size);
 
@@ -457,7 +457,7 @@ pg_walfile_name(PG_FUNCTION_ARGS)
  errhint("%s cannot be executed during recovery.",
 		 "pg_walfile_name()")));
 
-	XLByteToPrevSeg(locationpoint, xlogsegno, wal_segment_size);
+	XLByteToSeg(locationpoint, xlogsegno, wal_segment_size);
 	XLogFileName(xlogfilename, GetWALInsertionTimeLine(), xlogsegno,
  wal_segment_size);
 


Re: pg_walfile_name_offset can return inconsistent values

2023-11-09 Thread Matthias van de Meent
On Thu, 9 Nov 2023 at 20:22, Bruce Momjian  wrote:
> I know this bug report is four years old, but it is still a
> pg_walfile_name_offset() bug.  Here is the bug:
>
> SELECT *
> FROM (VALUES ('0/16ff'), ('0/1700'), ('0/1701')) AS 
> t(lsn),
>  LATERAL pg_walfile_name_offset(lsn::pg_lsn);
>
> lsn |file_name | file_offset
> +--+-
>  0/16ff | 00010016 |16777215
> -->  0/1700 | 00010016 |   0
>  0/1701 | 00010017 |   1
>
> The bug is in the indicated line --- it shows the filename as 00016 but
> offset as zero, when clearly the LSN is pointing to 17/0.  The bug is
> essentially that the code for pg_walfile_name_offset() uses the exact
> offset from the LSN, but uses the file name from the previous byte of
> the LSN.

Yes, that's definitely not a correct result.

> The fix involves deciding what the description or purpose of
> pg_walfile_name_offset() means, and adjusting it to be clearer.  The
> current documentation says:
>
> Converts a write-ahead log location to a WAL file name and byte
> offset within that file.
>
> Fix #1:  If we assume write-ahead log location means LSN, it is saying
> show the file/offset of the LSN, and that is most clearly:
>
> lsn |file_name | file_offset
> +--+-
>  0/16ff | 00010016 |16777215
>  0/1700 | 00010017 |   0
>  0/1701 | 00010017 |   1
>
> Fix #2:  Now, there are some who have said they want the output to be
> the last written WAL byte (the byte before the LSN), not the current
> LSN, for archiving purposes.  However, if we do that, we have to update
> the docs to clarify it.  Its output would be:
>
> lsn |file_name | file_offset
> +--+-
>  0/16ff | 00010016 |16777214
>  0/1700 | 00010016 |16777215
>  0/1701 | 00010017 |   0
>
> I have attached fix #1 as offset1.diff and fix #2 as offset2.diff.

I believe you got the references wrong; fix #1 looks like the output
of offset2's changes, and fix #2 looks like the result of offset1's
changes.

Either way, I think fix #1 is most correct (as was attached in
offset2.diff, and quoted verbatim here), because that has no chance of
having surprising underflowing behaviour when you use '0/0'::lsn as
input.

> diff --git a/src/backend/access/transam/xlogfuncs.c 
> b/src/backend/access/transam/xlogfuncs.c
> index 45a70668b1..e65502d51e 100644
> --- a/src/backend/access/transam/xlogfuncs.c
> +++ b/src/backend/access/transam/xlogfuncs.c
> @@ -414,7 +414,7 @@ pg_walfile_name_offset(PG_FUNCTION_ARGS)
> /*
>  * xlogfilename
>  */
> -XLByteToPrevSeg(locationpoint, xlogsegno, wal_segment_size);
> +XLByteToSeg(locationpoint, xlogsegno, wal_segment_size);
> XLogFileName(xlogfilename, GetWALInsertionTimeLine(), xlogsegno,
>  wal_segment_size);

Kind regards,

Matthias van de Meent




Re: pg_walfile_name_offset can return inconsistent values

2023-11-09 Thread Bruce Momjian
On Fri, Jul 26, 2019 at 11:30:19AM +0200, Jehan-Guillaume de Rorthais wrote:
> On Fri, 26 Jul 2019 17:21:20 +0900 (Tokyo Standard Time)
> Kyotaro Horiguchi  wrote:
> 
> > Hello.
> > 
> > While looking [1], I noticed that pg_walfile_name_offset behaves
> > somewhat oddly at segment boundary.
> > 
> > select * from (values ('0/16ff'), ('0/1700'), ('0/1701')) as
> > t(lsn), lateral pg_walfile_name_offset(lsn::pg_lsn);
> > lsn |file_name | file_offset
> > +--+-
> >  0/16ff | 00020016 |16777215
> >  0/1700 | 00020016 |   0
> >  0/1701 | 00020017 |   1
> > 
> > 
> > The file names are right as defined, but the return value of the
> > second line wrong, or at least misleading.
> 
> +1
> I noticed it as well and put this report on hold while working on my patch.
> Thanks for reporting this!
> 
> > It should be (16, 100) or (16, FF). The former is out-of-domain so 
> > we
> > would have no way than choosing the latter. I'm not sure the purpose of
> > the second output parameter, thus the former might be right
> > decision.
> >
> > The function returns the following result after this patch is
> > applied.
> > 
> > select * from (values ('0/16ff'), ('0/1700'), ('0/1701')) as
> > t(lsn), lateral pg_walfile_name_offset(lsn::pg_lsn);
> > lsn |file_name | file_offset
> > +--+-
> >  0/16ff | 00020016 |16777214
> >  0/1700 | 00020016 |16777215
> >  0/1701 | 00020017 |   0
> 
> So you shift the file offset for all LSN by one byte? This could lead to
> regression in various tools relying on this function.
> 
> Moreover, it looks weird as the LSN doesn't reflect the given offset anymore
> (FF <> 16777214, 01 <> 0, etc).
> 
> Another solution might be to return the same result when for both 0/16ff 
> and
> 0/1700, but it doesn't feel right either.
> 
> So in fact, returning 0x100 seems to be the cleaner result to me.

I know this bug report is four years old, but it is still a
pg_walfile_name_offset() bug.  Here is the bug:

SELECT *
FROM (VALUES ('0/16ff'), ('0/1700'), ('0/1701')) AS t(lsn), 
 LATERAL pg_walfile_name_offset(lsn::pg_lsn);

lsn |file_name | file_offset
+--+-
 0/16ff | 00010016 |16777215
-->  0/1700 | 00010016 |   0
 0/1701 | 00010017 |   1

The bug is in the indicated line --- it shows the filename as 00016 but
offset as zero, when clearly the LSN is pointing to 17/0.  The bug is
essentially that the code for pg_walfile_name_offset() uses the exact
offset from the LSN, but uses the file name from the previous byte of
the LSN.

The fix involves deciding what the description or purpose of
pg_walfile_name_offset() means, and adjusting it to be clearer.  The
current documentation says:

Converts a write-ahead log location to a WAL file name and byte
offset within that file.

Fix #1:  If we assume write-ahead log location means LSN, it is saying
show the file/offset of the LSN, and that is most clearly:

lsn |file_name | file_offset
+--+-
 0/16ff | 00010016 |16777215
 0/1700 | 00010017 |   0
 0/1701 | 00010017 |   1

Fix #2:  Now, there are some who have said they want the output to be
the last written WAL byte (the byte before the LSN), not the current
LSN, for archiving purposes.  However, if we do that, we have to update
the docs to clarify it.  Its output would be:

lsn |file_name | file_offset
+--+-
 0/16ff | 00010016 |16777214
 0/1700 | 00010016 |16777215
 0/1701 | 00010017 |   0

The email thread also considered having the second row offset be 16777216
(2^24), which is not a valid offset for a file if we assume a zero-based
offset.

Looking further, pg_walfile_name() also returns the filename for the
previous byte:

SELECT *
FROM (values ('0/16ff'), ('0/1700'), ('0/1701')) as t(lsn),
 LATERAL pg_walfile_name_offset(lsn::pg_lsn), LATERAL 
pg_walfile_name(lsn::pg_lsn);
lsn |file_name | file_offset | 
pg_walfile_name

+--+-+--
 0/16ff | 00010016 |16777215 | 

Re: pg_walfile_name_offset can return inconsistent values

2019-07-26 Thread Jehan-Guillaume de Rorthais
On Fri, 26 Jul 2019 17:21:20 +0900 (Tokyo Standard Time)
Kyotaro Horiguchi  wrote:

> Hello.
> 
> While looking [1], I noticed that pg_walfile_name_offset behaves
> somewhat oddly at segment boundary.
> 
> select * from (values ('0/16ff'), ('0/1700'), ('0/1701')) as
> t(lsn), lateral pg_walfile_name_offset(lsn::pg_lsn);
> lsn |file_name | file_offset
> +--+-
>  0/16ff | 00020016 |16777215
>  0/1700 | 00020016 |   0
>  0/1701 | 00020017 |   1
> 
> 
> The file names are right as defined, but the return value of the
> second line wrong, or at least misleading.

+1
I noticed it as well and put this report on hold while working on my patch.
Thanks for reporting this!

> It should be (16, 100) or (16, FF). The former is out-of-domain so we
> would have no way than choosing the latter. I'm not sure the purpose of
> the second output parameter, thus the former might be right
> decision.
>
> The function returns the following result after this patch is
> applied.
> 
> select * from (values ('0/16ff'), ('0/1700'), ('0/1701')) as
> t(lsn), lateral pg_walfile_name_offset(lsn::pg_lsn);
> lsn |file_name | file_offset
> +--+-
>  0/16ff | 00020016 |16777214
>  0/1700 | 00020016 |16777215
>  0/1701 | 00020017 |   0

So you shift the file offset for all LSN by one byte? This could lead to
regression in various tools relying on this function.

Moreover, it looks weird as the LSN doesn't reflect the given offset anymore
(FF <> 16777214, 01 <> 0, etc).

Another solution might be to return the same result when for both 0/16ff and
0/1700, but it doesn't feel right either.

So in fact, returning 0x100 seems to be the cleaner result to me.

Regards,




pg_walfile_name_offset can return inconsistent values

2019-07-26 Thread Kyotaro Horiguchi
Hello.

While looking [1], I noticed that pg_walfile_name_offset behaves
somewhat oddly at segment boundary.

select * from (values ('0/16ff'), ('0/1700'), ('0/1701')) as 
t(lsn), lateral pg_walfile_name_offset(lsn::pg_lsn);
lsn |file_name | file_offset 
+--+-
 0/16ff | 00020016 |16777215
 0/1700 | 00020016 |   0
 0/1701 | 00020017 |   1


The file names are right as defined, but the return value of the
second line wrong, or at least misleading. It should be (16,
100) or (16, FF). The former is out-of-domain so we would
have no way than choosing the latter. I'm not sure the purpose of
the second output parameter, thus the former might be right
decision.


The function returns the following result after this patch is
applied.

select * from (values ('0/16ff'), ('0/1700'), ('0/1701')) as 
t(lsn), lateral pg_walfile_name_offset(lsn::pg_lsn);
lsn |file_name | file_offset 
+--+-
 0/16ff | 00020016 |16777214
 0/1700 | 00020016 |16777215
 0/1701 | 00020017 |   0


regards.

[1]: https://www.postgresql.org/message-id/20190725193808.1648ddc8@firost

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
>From ca9e174f53ac4d513163cbe8201746c8d3d2eb62 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi 
Date: Fri, 26 Jul 2019 17:12:24 +0900
Subject: [PATCH] Fix offset of pg_walfile_name_offset.

The function returns the name of the previous segment with the offset
of the given location at segment boundaries. For example, it returns
("...16", 0) returned for '0/1700'. That is inconsistent, or at
least confusing. This patch changes the function to return the given
LSN - 1 as offset.
---
 src/backend/access/transam/xlogfuncs.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c
index b35043bf71..79ea0495b4 100644
--- a/src/backend/access/transam/xlogfuncs.c
+++ b/src/backend/access/transam/xlogfuncs.c
@@ -447,6 +447,8 @@ pg_last_wal_replay_lsn(PG_FUNCTION_ARGS)
  * Note that a location exactly at a segment boundary is taken to be in
  * the previous segment.  This is usually the right thing, since the
  * expected usage is to determine which xlog file(s) are ready to archive.
+ * To be consistent to filename, returns the offset one byte before the given
+ * location as offset.
  */
 Datum
 pg_walfile_name_offset(PG_FUNCTION_ARGS)
@@ -480,10 +482,16 @@ pg_walfile_name_offset(PG_FUNCTION_ARGS)
 
 	resultTupleDesc = BlessTupleDesc(resultTupleDesc);
 
+	/*
+	 * We assume the given location point as one-byte after the location
+	 * really wanted.
+	 */
+	locationpoint--;
+
 	/*
 	 * xlogfilename
 	 */
-	XLByteToPrevSeg(locationpoint, xlogsegno, wal_segment_size);
+	XLByteToSeg(locationpoint, xlogsegno, wal_segment_size);
 	XLogFileName(xlogfilename, ThisTimeLineID, xlogsegno, wal_segment_size);
 
 	values[0] = CStringGetTextDatum(xlogfilename);
-- 
2.16.3