Re: [HACKERS] pg_filedump 9.3: checksums (and a few other fixes)

2013-06-14 Thread Jeff Davis
On Thu, 2013-06-13 at 20:09 -0400, Tom Lane wrote:
 What I propose we do about this is reduce backend/storage/page/checksum.c
 to something like
 
 #include "postgres.h"
 #include "storage/checksum.h"
 #include "storage/checksum_impl.h"
 
 moving all the code currently in the file into a new .h file.  Then,
 any external programs such as pg_filedump can use the checksum code
 by including checksum_impl.h.  This is essentially the same thing we
 did with the CRC support functionality some time ago.

Thank you for taking care of that. After seeing that it needed to be in
a header file, I was going to try doing it all as macros.

I have a question about the commit though: shouldn't both functions be
static if they are in a .h file? Otherwise, it could lead to naming
conflicts. I suppose it's wrong to include the implementation file
twice, but it still might be confusing if someone tries. Two ideas that
come to mind (sketched below) are:
  * make both static and then have a trivial wrapper in checksum.c
  * export one or both functions, but use #ifndef CHECKSUM_IMPL_H to
prevent redefinition
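
A rough sketch of how those two ideas could fit together (hypothetical
names and a stand-in body, not the committed code):

  /* checksum_impl.h: guarded and static, so double inclusion is harmless
   * and no linker-visible symbols can clash */
  #ifndef CHECKSUM_IMPL_H
  #define CHECKSUM_IMPL_H

  static uint32
  checksum_block(char *data, uint32 size)
  {
      uint32      sum = 0;    /* stand-in body, not the real algorithm */

      while (size-- > 0)
          sum = (sum ^ (unsigned char) data[size]) * 16777619;
      return sum;
  }

  #endif                      /* CHECKSUM_IMPL_H */

  /* checksum.c: a trivial exported wrapper (hypothetical name) so the
   * backend still links exactly one public symbol */
  #include "postgres.h"
  #include "storage/checksum.h"
  #include "storage/checksum_impl.h"

  uint32
  DataPageChecksum(char *data, uint32 size)
  {
      return checksum_block(data, size);
  }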

 Also, we have the cut-point between checksum.c and bufpage.c at the
 wrong place.  IMO we should move PageCalcChecksum16 in toto into
 checksum.c (or really now into checksum_impl.h), because that and not
 just checksum_block() is the functionality that is wanted.

Agreed.

Regards,
Jeff Davis






Re: [HACKERS] Patch for fail-back without fresh backup

2013-06-14 Thread Jeff Davis
On Fri, 2013-06-14 at 16:10 +0200, Andres Freund wrote:
 Jeff Davis has a patch pending
 (1365493015.7580.3240.camel@sussancws0025) that passes the buffer_std
 flag down to MarkBufferDirtyHint() for exactly that reason. I thought we
 were on track committing that, but rereading the thread it doesn't look
 that way.
 
 Jeff, care to update that patch?

Rebased and attached. Changed so all callers use buffer_std=true except
those in freespace.c and fsmpage.c.

Simon, did you (or anyone else) have an objection to this patch? If not,
I'll go ahead and commit it tomorrow morning.

Regards,
Jeff Davis

*** a/src/backend/access/hash/hash.c
--- b/src/backend/access/hash/hash.c
***
*** 287,293  hashgettuple(PG_FUNCTION_ARGS)
  			/*
  			 * Since this can be redone later if needed, mark as a hint.
  			 */
! 			MarkBufferDirtyHint(buf);
  		}
  
  		/*
--- 287,293 
  			/*
  			 * Since this can be redone later if needed, mark as a hint.
  			 */
! 			MarkBufferDirtyHint(buf, true);
  		}
  
  		/*
*** a/src/backend/access/heap/pruneheap.c
--- b/src/backend/access/heap/pruneheap.c
***
*** 262,268  heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
  		{
  			((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
  			PageClearFull(page);
! 			MarkBufferDirtyHint(buffer);
  		}
  	}
  
--- 262,268 
  		{
  			((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
  			PageClearFull(page);
! 			MarkBufferDirtyHint(buffer, true);
  		}
  	}
  
*** a/src/backend/access/nbtree/nbtinsert.c
--- b/src/backend/access/nbtree/nbtinsert.c
***
*** 413,421  _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
  	 * crucial. Be sure to mark the proper buffer dirty.
  	 */
  	if (nbuf != InvalidBuffer)
! 		MarkBufferDirtyHint(nbuf);
  	else
! 		MarkBufferDirtyHint(buf);
  }
  			}
  		}
--- 413,421 
  	 * crucial. Be sure to mark the proper buffer dirty.
  	 */
  	if (nbuf != InvalidBuffer)
! 		MarkBufferDirtyHint(nbuf, true);
  	else
! 		MarkBufferDirtyHint(buf, true);
  }
  			}
  		}
*** a/src/backend/access/nbtree/nbtree.c
--- b/src/backend/access/nbtree/nbtree.c
***
*** 1052,1058  restart:
  opaque->btpo_cycleid == vstate->cycleid)
  			{
  opaque->btpo_cycleid = 0;
! MarkBufferDirtyHint(buf);
  			}
  		}
  
--- 1052,1058 
  opaque->btpo_cycleid == vstate->cycleid)
  			{
  opaque->btpo_cycleid = 0;
! MarkBufferDirtyHint(buf, true);
  			}
  		}
  
*** a/src/backend/access/nbtree/nbtutils.c
--- b/src/backend/access/nbtree/nbtutils.c
***
*** 1789,1795  _bt_killitems(IndexScanDesc scan, bool haveLock)
  	if (killedsomething)
  	{
  		opaque->btpo_flags |= BTP_HAS_GARBAGE;
! 		MarkBufferDirtyHint(so->currPos.buf);
  	}
  
  	if (!haveLock)
--- 1789,1795 
  	if (killedsomething)
  	{
  		opaque->btpo_flags |= BTP_HAS_GARBAGE;
! 		MarkBufferDirtyHint(so->currPos.buf, true);
  	}
  
  	if (!haveLock)
*** a/src/backend/access/transam/xlog.c
--- b/src/backend/access/transam/xlog.c
***
*** 7681,7692  XLogRestorePoint(const char *rpName)
   * records. In that case, multiple copies of the same block would be recorded
   * in separate WAL records by different backends, though that is still OK from
   * a correctness perspective.
-  *
-  * Note that this only works for buffers that fit the standard page model,
-  * i.e. those for which buffer_std == true
   */
  XLogRecPtr
! XLogSaveBufferForHint(Buffer buffer)
  {
  	XLogRecPtr	recptr = InvalidXLogRecPtr;
  	XLogRecPtr	lsn;
--- 7681,7689 
   * records. In that case, multiple copies of the same block would be recorded
   * in separate WAL records by different backends, though that is still OK from
   * a correctness perspective.
   */
  XLogRecPtr
! XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
  {
  	XLogRecPtr	recptr = InvalidXLogRecPtr;
  	XLogRecPtr	lsn;
***
*** 7708,7714  XLogSaveBufferForHint(Buffer buffer)
  	 * and reset rdata for any actual WAL record insert.
  	 */
  	rdata[0].buffer = buffer;
! 	rdata[0].buffer_std = true;
  
  	/*
  	 * Check buffer while not holding an exclusive lock.
--- 7705,7711 
  	 * and reset rdata for any actual WAL record insert.
  	 */
  	rdata[0].buffer = buffer;
! 	rdata[0].buffer_std = buffer_std;
  
  	/*
  	 * Check buffer while not holding an exclusive lock.
*** a/src/backend/commands/sequence.c
--- b/src/backend/commands/sequence.c
***
*** 1118,1124  read_seq_tuple(SeqTable elm, Relation rel, Buffer *buf, HeapTuple seqtuple)
  		HeapTupleHeaderSetXmax(seqtuple->t_data, InvalidTransactionId);
  		seqtuple->t_data->t_infomask &= ~HEAP_XMAX_COMMITTED;
  		seqtuple->t_data->t_infomask |= HEAP_XMAX_INVALID;
! 		MarkBufferDirtyHint(*buf);
  	}
  
  	seq = (Form_pg_sequence) GETSTRUCT(seqtuple);
--- 1118,1124 
  		HeapTupleHeaderSetXmax(seqtuple->t_data

Re: [HACKERS] fallocate / posix_fallocate for new WAL file creation (etc...)

2013-06-14 Thread Jeff Davis
On Sat, 2013-05-25 at 13:55 -0500, Jon Nelson wrote:
 Ack.  I've revised the patch to always have the GUC (for now), default
 to false, and if configure can't find posix_fallocate (or the user
 disables it by way of pg_config_manual.h) then it remains a GUC that
 simply can't be changed.

Why have a GUC here at all? Perhaps this was already discussed, and I
missed it? Is it just for testing purposes, or did you intend for it to
be in the final version?

If it's supported, it seems like we always want it. I doubt there are
cases where it hurts performance; but if there are, it's pretty hard for
a DBA to know what those cases are, anyway.

Also (a rough sketch addressing both points follows this list):

* The other code assumes that no errno means ENOSPC. We should be
consistent about that assumption, and do the same thing in the fallocate
case.

* You check for the presence of posix_fallocate at configure time, but
don't #ifdef the call site. It looks like you removed this from the v2
patch, was there a reason for that? Won't that cause build errors for
platforms without it?
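
To make both points concrete, here is a rough standalone sketch (not the
patch; the names, sizes, and error handling are illustrative only, and
HAVE_POSIX_FALLOCATE is assumed to come from configure):

  #include <errno.h>
  #include <fcntl.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>

  #define SEG_SIZE (16 * 1024 * 1024)     /* stand-in for XLogSegSize */

  static void
  allocate_segment(int fd)
  {
  #ifdef HAVE_POSIX_FALLOCATE
      /* posix_fallocate reports the error directly instead of via errno */
      int     rc = posix_fallocate(fd, 0, SEG_SIZE);

      if (rc != 0)
      {
          errno = rc;
          perror("posix_fallocate");
          exit(1);
      }
  #else
      /* fall back to writing zeros; a short write that leaves errno unset
       * is treated as out of disk space, as elsewhere */
      char    zbuf[8192] = {0};
      long    written = 0;

      while (written < SEG_SIZE)
      {
          errno = 0;
          if (write(fd, zbuf, sizeof(zbuf)) != (ssize_t) sizeof(zbuf))
          {
              if (errno == 0)
                  errno = ENOSPC;
              perror("write");
              exit(1);
          }
          written += sizeof(zbuf);
      }
  #endif
  }

  int
  main(void)
  {
      int     fd = open("segfile", O_RDWR | O_CREAT | O_EXCL, 0600);

      if (fd < 0)
      {
          perror("open");
          return 1;
      }
      allocate_segment(fd);
      close(fd);
      return 0;
  }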

I started looking at this patch and it looks like we are getting a
consensus that it's the right approach. Microbenchmarks appear to show a
benefit, and (thanks to Noah's comment) it seems like the change is
safe. Are there any remaining questions or objections?

Regards,
Jeff Davis





Re: [HACKERS] fallocate / posix_fallocate for new WAL file creation (etc...)

2013-06-14 Thread Jeff Davis
On Tue, 2013-06-11 at 12:58 -0400, Stephen Frost wrote:
 My main question is really- would this be useful for extending
 *relations*?  Apologies if it's already been discussed; I do plan to go
 back and read the threads about this more fully, but I wanted to voice
 my support for using posix_fallocate, when available, in general.

+1, though separate from this patch.

Andres also pointed out that we can try to track a point in the file
that is below any place where a zero page might still exist. That will
allow us to call zero pages invalid unless they are related to a recent
extension, which is a weakness in the current checksums code.

Regards,
Jeff Davis






Re: [HACKERS] fallocate / posix_fallocate for new WAL file creation (etc...)

2013-06-14 Thread Jeff Davis
On Fri, 2013-06-14 at 13:21 -0400, Greg Smith wrote:
 I'm planning to duplicate Jon's test program on a few machines here, and 
 then see if that turns into a useful latency improvement for clients. 
 I'm trying to get this pgbench rate limit stuff working first though, 
 because one of the tests I had in mind for WAL creation overhead would 
 benefit from it.

Great, I'll wait on those results.

Regards,
Jeff Davis






Re: [HACKERS] pg_filedump 9.3: checksums (and a few other fixes)

2013-06-10 Thread Jeff Davis
On Mon, 2013-06-10 at 01:28 -0400, Alvaro Herrera wrote:
 Hm, note that XMAX_SHR_LOCK is two bits, so when that flag is present
 you will get the three lock modes displayed with the above code, which is
 probably going to be misleading.  htup_details.h does this:
 
 /*
  * Use these to test whether a particular lock is applied to a tuple
  */
 #define HEAP_XMAX_IS_SHR_LOCKED(infomask) \
   (((infomask) & HEAP_LOCK_MASK) == HEAP_XMAX_SHR_LOCK)
 #define HEAP_XMAX_IS_EXCL_LOCKED(infomask) \
   (((infomask) & HEAP_LOCK_MASK) == HEAP_XMAX_EXCL_LOCK)
 #define HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) \
   (((infomask) & HEAP_LOCK_MASK) == HEAP_XMAX_KEYSHR_LOCK)
 
 Presumably it'd be better to do something similar.

I was hesitant to do too much interpretation of the bits. Do you think
it would be better to just remove the test for XMAX_SHR_LOCK?

Regards,
Jeff Davis






Re: [HACKERS] pg_filedump 9.3: checksums (and a few other fixes)

2013-06-10 Thread Jeff Davis
On Mon, 2013-06-10 at 11:38 -0400, Tom Lane wrote:
 The thing I'm not too happy about is having to copy the checksum code
 into pg_filedump.  We just got rid of the need to do that for the CRC
 code, and here it is coming back again.  Can't we rearrange the core
 checksum code similarly to what we did for the CRC stuff recently,
 so that you only need access to a .h file for it?

The CRC implementation is entirely in header files. Do you think we need
to go that far, or is it fine to just put it in libpgport and link that
to pg_filedump?

Regards,
Jeff Davis






Re: [HACKERS] Placing hints in line pointers

2013-06-09 Thread Jeff Davis
On Sat, 2013-06-01 at 15:45 +0100, Simon Riggs wrote:
 Doing this would have two purposes:
 
 * We wouldn't need to follow the pointer if the row is marked aborted.
 This would save a random memory access for that tuple

That's quite similar to LP_DEAD, right? You could potentially set this
new hint sooner, which may be an advantage.

 We wouldn't need to do a FPW when a hint changes, we would only need
 to take a copy of the ItemId array, which is much smaller. And it
 could be protected by its own checksum.

I like the direction of this idea.

I don't see exactly how you want to make this safe, though. During
replay, we can't just replace the ItemId array, because the other
contents on the page might be newer (e.g. some tuples may have been
cleaned out, leaving the item pointers pointing to the wrong place).

 (In addition, if we wanted, this could be used to extend block size to
 64kB if we used 8-byte alignment for tuples)

I think that supporting 64KB blocks has merit.

Interesting observation about the extra bits available. That would be an
on-disk format change, so perhaps this is something to add to the list of
things waiting for a format change?

Regards,
Jeff Davis






[HACKERS] pg_filedump 9.3: checksums (and a few other fixes)

2013-06-09 Thread Jeff Davis
Attached is a first draft of an update to pg_filedump for 9.3. I know
pg_filedump is a pgfoundry project, but that seems like it's just there
to host the download; so please excuse the slightly off-topic post here
on -hackers.

I made a few changes to support 9.3, which were mostly fixes related to two
things:

 * new htup_details.h and changes related to FK concurrency improvements
 * XLogRecPtr is now a uint64

And, of course, I added support for checksums. They are always displayed
and calculated, but it only throws an error if you pass -k. Only the
user knows whether checksums are enabled, because we removed page-level
bits indicating the presence of a checksum.

The patch is a bit ugly: I had to copy some code, and copy the entire
checksum.c file (minus some Asserts, which don't work in an external
program). Suggestions welcome.

Regards,
Jeff Davis

diff -Nc pg_filedump-9.2.0/checksum.c pg_filedump-9.3.0j/checksum.c
*** pg_filedump-9.2.0/checksum.c	1969-12-31 16:00:00.0 -0800
--- pg_filedump-9.3.0j/checksum.c	2013-06-09 21:20:34.036176831 -0700
***
*** 0 
--- 1,157 
+ /*-
+  *
+  * checksum.c
+  *	  Checksum implementation for data pages.
+  *
+  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1994, Regents of the University of California
+  *
+  *
+  * IDENTIFICATION
+  *	  src/backend/storage/page/checksum.c
+  *
+  *-
+  *
+  * Checksum algorithm
+  *
+  * The algorithm used to checksum pages is chosen for very fast calculation.
+  * Workloads where the database working set fits into OS file cache but not
+  * into shared buffers can read in pages at a very fast pace and the checksum
+  * algorithm itself can become the largest bottleneck.
+  *
+  * The checksum algorithm itself is based on the FNV-1a hash (FNV is shorthand
+  * for Fowler/Noll/Vo). The primitive of a plain FNV-1a hash folds in data 1
+  * byte at a time according to the formula:
+  *
+  * hash = (hash ^ value) * FNV_PRIME
+  *
+  * FNV-1a algorithm is described at http://www.isthe.com/chongo/tech/comp/fnv/
+  *
+  * PostgreSQL doesn't use FNV-1a hash directly because it has bad mixing of
+  * high bits - high order bits in input data only affect high order bits in
+  * output data. To resolve this we xor in the value prior to multiplication
+  * shifted right by 17 bits. The number 17 was chosen because it doesn't
+  * have common denominator with set bit positions in FNV_PRIME and empirically
+  * provides the fastest mixing for high order bits of final iterations quickly
+  * avalanche into lower positions. For performance reasons we choose to combine
+  * 4 bytes at a time. The actual hash formula used as the basis is:
+  *
+  * hash = (hash ^ value) * FNV_PRIME ^ ((hash ^ value) >> 17)
+  *
+  * The main bottleneck in this calculation is the multiplication latency. To
+  * hide the latency and to make use of SIMD parallelism multiple hash values
+  * are calculated in parallel. The page is treated as a 32 column two
+  * dimensional array of 32 bit values. Each column is aggregated separately
+  * into a partial checksum. Each partial checksum uses a different initial
+  * value (offset basis in FNV terminology). The initial values actually used
+  * were chosen randomly, as the values themselves don't matter as much as that
+  * they are different and don't match anything in real data. After initializing
+  * partial checksums each value in the column is aggregated according to the
+  * above formula. Finally two more iterations of the formula are performed with
+  * value 0 to mix the bits of the last value added.
+  *
+  * The partial checksums are then folded together using xor to form a single
+  * 32-bit checksum. The caller can safely reduce the value to 16 bits
+  * using modulo 2^16-1. That will cause a very slight bias towards lower
+  * values but this is not significant for the performance of the
+  * checksum.
+  *
+  * The algorithm choice was based on what instructions are available in SIMD
+  * instruction sets. This meant that a fast and good algorithm needed to use
+  * multiplication as the main mixing operator. The simplest multiplication
+  * based checksum primitive is the one used by FNV. The prime used is chosen
+  * for good dispersion of values. It has no known simple patterns that result
+  * in collisions. Test of 5-bit differentials of the primitive over 64bit keys
+  * reveals no differentials with 3 or more values out of 10 random keys
+  * colliding. Avalanche test shows that only high order bits of the last word
+  * have a bias. Tests of 1-4 uncorrelated bit errors, stray 0 and 0xFF bytes,
+  * overwriting page from random position to end with 0 bytes, and overwriting
+  * random segments of page with 0x00, 0xFF and random data all show optimal
+  * 2e-16
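
For what it's worth, a compact standalone sketch of the algorithm that
comment describes (toy code with placeholder offset bases and simplified
structure, not the checksum.c being copied):

  #include <stdint.h>
  #include <stdio.h>

  #define BLCKSZ     8192
  #define N_SUMS     32
  #define FNV_PRIME  16777619

  /* hash = (hash ^ value) * FNV_PRIME ^ ((hash ^ value) >> 17) */
  #define CHECKSUM_COMP(checksum, value) do { \
      uint32_t __tmp = (checksum) ^ (value); \
      (checksum) = __tmp * FNV_PRIME ^ (__tmp >> 17); \
  } while (0)

  static uint32_t
  checksum_block(const uint32_t *page)
  {
      uint32_t    sums[N_SUMS];
      uint32_t    result = 0;
      int         i, j;

      /* one partial checksum per 32-bit "column", each with its own offset
       * basis; these bases are placeholders, not the values in the patch */
      for (j = 0; j < N_SUMS; j++)
          sums[j] = 0x5ca1ab1e + j;

      /* fold the page in as a (BLCKSZ / 4 / N_SUMS) x N_SUMS array */
      for (i = 0; i < BLCKSZ / 4 / N_SUMS; i++)
          for (j = 0; j < N_SUMS; j++)
              CHECKSUM_COMP(sums[j], page[i * N_SUMS + j]);

      /* two extra rounds with value 0 to mix the bits of the last words */
      for (i = 0; i < 2; i++)
          for (j = 0; j < N_SUMS; j++)
              CHECKSUM_COMP(sums[j], 0);

      /* xor the partials into one 32-bit value; a caller can then reduce
       * it to 16 bits with mod 2^16-1 as described above */
      for (j = 0; j < N_SUMS; j++)
          result ^= sums[j];

      return result;
  }

  int
  main(void)
  {
      static uint32_t page[BLCKSZ / 4];   /* an all-zero "page" */

      printf("checksum = %u\n", (unsigned) checksum_block(page));
      return 0;
  }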

Re: [HACKERS] Eliminating PD_ALL_VISIBLE, take 2

2013-05-30 Thread Jeff Davis
On Thu, 2013-05-30 at 11:32 +0300, Heikki Linnakangas wrote:
  That could cause a torn page and checksum failure if checksums are
  enabled.

Thank you, I missed that in the rebase; it should be
MarkBufferDirtyHint().

 Come to think of it, even without the torn page & checksum issue, do we 
 really want to actively clear the all-visible flags after upgrade? For 
 tables that haven't been changed, and thus have the all-visible bits 
 set, that amounts to a complete rewrite on the first vacuum after 
 upgrade. That's expensive.

I expected that question and intended to raise it for discussion when
and if we get past performance concerns (I believe Robert is still not
convinced that the patch is viable).

We have a few options: We can ignore the bit entirely, or we can
aggressively unset it, or we can opportunistically unset it if the page
is already dirty. I don't think we're in a hurry to reuse that bit for
something else, so maybe it's best to just ignore it entirely.

Regards,
Jeff Davis






Re: [HACKERS] visibilitymap_set and checksums

2013-05-29 Thread Jeff Davis
On Fri, 2013-05-24 at 22:16 +0100, Simon Riggs wrote: 
 I think its perfectly understandable. Robert, Jeff and I discussed
 that for a while before we passed it. I'm still not happy with it, and
 think its a pretty confusing section of code with multiple paths
 through it, but I just can't see a better way.

Agreed on all counts. Comment patches are welcome, of course.

I'll add that if we remove PD_ALL_VISIBLE, this complexity disappears.

Regards,
Jeff Davis







Re: [HACKERS] getting rid of freezing

2013-05-29 Thread Jeff Davis
On Tue, 2013-05-28 at 19:51 -0400, Robert Haas wrote:
  If we just wanted to reduce read cost, why not just take a simpler
  approach and give the visibility map a isfrozen bit?  Then we'd know
  which pages didn't need rescanning without nearly as much complexity.
 
 That would break pg_upgrade, which would have to remove visibility map
 forks when upgrading.  More importantly, it would require another
 round of complex changes to the write-ahead logging in this area.
 It's not obvious to me that we'd end up ahead of where we are today,
 although perhaps I am a pessimist.

If we removed PD_ALL_VISIBLE, then this would be very simple, right? We
would just follow normal logging rules for setting the visible or frozen
bit.

Regards,
Jeff Davis






Re: [HACKERS] getting rid of freezing

2013-05-29 Thread Jeff Davis
On Tue, 2013-05-28 at 09:29 -0700, Josh Berkus wrote:
 - it would prevent us from getting rid of allvisible, which has a
 documented and known write overhead

It would? I don't think these proposals are necessarily in conflict.
It's not entirely clear to me how they fit together in detail, but it
seems like it may be possible -- it may even simplify things.

Regards,
Jeff Davis






Re: [HACKERS] removing PD_ALL_VISIBLE

2013-05-29 Thread Jeff Davis
On Wed, 2013-05-29 at 22:46 -0400, Robert Haas wrote:
 Again independently of Josh's proposal, we could eliminate
 PD_ALL_VISIBLE.  This would require either surrendering the
 optimization whereby sequential scans can skip visibility checks on
 individual tuples within the page, or referring to the visibility map
 to get the bit that way.  I know you tested this and couldn't measure
 an impact, but with all respect I find that result hard to accept.
 Contention around buffer locks and pins is very real; why should it
 matter on other workloads and not matter on this one?

The number of pins required during a sequential scan without my patch
is:

   PAGES_IN_HEAP

The number of pins required during a sequential scan with my patch is:

   PAGES_IN_HEAP + ceil(PAGES_IN_HEAP/HEAP_PAGES_PER_VM_PAGE)

Because HEAP_PAGES_PER_VM_PAGE is huge (each VM page covers on the order of
65,000 heap pages at the default 8 kB block size), the second term only
matters when PAGES_IN_HEAP is very small and the ceil is significant. So, I
simply elected not to use the VM if the table is less than 32 pages in size.
For such small tables, the benefit of using a page-at-a-time visibility check
was not apparent in my tests anyway.

 AFAICS, the main benefit of eliminating PD_ALL_VISIBLE is that we
 eliminate one write cycle; that is, we won't dirty the page once to
 hint it and then again to mark it all-visible.  But as of 9.3, that
 should really only be a problem in the insert-only case.  And in that
 case, my proposal to consider all-visible pages as frozen would be a
 huge win, because you'd only need to emit XLOG_HEAP_VISIBLE for every
 page in the heap, rather than XLOG_HEAP_FREEZE.

Agreed.

Regards,
Jeff Davis






Re: [HACKERS] Planning incompatibilities for Postgres 10.0

2013-05-25 Thread Jeff Davis
On Sat, 2013-05-25 at 10:39 +0100, Simon Riggs wrote:
 The constraint on such changes is that we've decided that we must have
 an upgrade path from release to release.

Is this proposal only relaxing the binary upgrade requirement, or would
it also relax other compatibility requirements, such as language and API
compatibility?

We need a couple major drivers of the incompatibility that really show
users some value for going through the upgrade pain. Preferably, at
least one would be a serious performance boost, because the users that
encounter the most logical upgrade pain are also the ones that need a
performance boost the most.

Before we set a specific schedule, I think it would be a good idea to
start prototyping some performance improvements that involve breaking
the data format. Then, depending on how achievable it is, we can plan
for however many more 9.X releases we think we need. That being said, I
agree with you that planning in advance is important here, so that
everyone knows when they need to get format-breaking changes in by.

Regards,
Jeff Davis 







Re: [HACKERS] corrupt pages detected by enabling checksums

2013-05-10 Thread Jeff Davis
On Fri, 2013-05-10 at 18:32 +0100, Simon Riggs wrote:
 We don't write() WAL except with an immediate sync(), so the chances
 of what you say happening are very low to impossible.

Are you sure? An XLogwrtRqst contains a write and a flush pointer, so I
assume they can be different.

I agree that it sounds unlikely that blocks 100 and 102 would be
written, but not 101. But perhaps that's more likely in systems like ZFS
where the physical blocks might be in very different places.

Regards,
Jeff Davis






Re: [HACKERS] corrupt pages detected by enabling checksums

2013-05-09 Thread Jeff Davis
On Thu, 2013-05-09 at 14:28 -0500, Jim Nasby wrote:
 What about moving some critical data from the beginning of the WAL
 record to the end? That would make it easier to detect that we don't
 have a complete record. It wouldn't necessarily replace the CRC
 though, so maybe that's not good enough.
 
 Actually, what if we actually *duplicated* some of the same WAL header
 info at the end of the record? Given a reasonable amount of data that
 would damn-near ensure that a torn record was detected, because the
 odds of having the exact same sequence of random bytes would be so
 low. Potentially even just duplicating the LSN would suffice.

I think both of these ideas have some false positives and false
negatives.

If the corruption happens at the record boundary, and wipes out the
special information at the end of the record, then you might think it
was not fully flushed, and we're in the same position as today.

If the WAL record is large, and somehow the beginning and the end get
written to disk but not the middle, then it will look like corruption;
but really the WAL was just not completely flushed. This seems pretty
unlikely, but not impossible.

That being said, I like the idea of introducing some extra checks if a
perfect solution is not possible.

 On the separate write idea, if that could be controlled by a GUC I
 think it'd be worth doing. Anyone that needs to worry about this
 corner case probably has hardware that would support that.

It sounds pretty easy to do that naively. I'm just worried that the
performance will be so bad for so many users that it's not a very
reasonable choice.

Today, it would probably make more sense to just use sync rep. If the
master's WAL is corrupt, and it starts up too early, then that should be
obvious when you try to reconnect streaming replication. I haven't tried
it, but I'm assuming that it gives a useful error message.

Regards,
Jeff Davis






Re: [HACKERS] corrupt pages detected by enabling checksums

2013-05-09 Thread Jeff Davis
On Thu, 2013-05-09 at 23:13 +0100, Greg Stark wrote:
 However it is possible to reduce the window...

Sounds reasonable.

It's fairly limited though -- the window is already a checkpoint
(typically 5-30 minutes), and we'd bring that down an order of magnitude
(10s). I speculate that, if it got corrupted within 30 minutes, it
probably got corrupted at the time of being written (as happened in Jeff
Janes's case, due to a bug).

So, the question is: if the WAL is corrupted on write, does reducing the
window significantly increase the chances that the wal writer will hang
around long enough before a crash to flush this other file?

On the other hand, checkpoint hides any corrupt WAL records by not
replaying them, whereas your scheme would identify that there is a
problem.

I don't think this would have helped Jeff Janes's case because I think
the crashes were happening too quickly. But that is artificial, so it
may help in real cases.

I just had a thought: we don't necessarily need to flush the auxiliary
file each time; merely writing it to the kernel buffers would help a
lot. Maybe an extra write() of the auxiliary file during a WAL flush
isn't so bad; and combined with periodic fsync()s of the auxiliary file,
should offer a lot of coverage against problems.

Regards,
Jeff Davis






Re: [HACKERS] corrupt pages detected by enabling checksums

2013-05-08 Thread Jeff Davis
On Wed, 2013-05-08 at 17:56 -0500, Jim Nasby wrote:
 Apologies if this is a stupid question, but is this mostly an issue
 due to torn pages? IOW, if we had a way to ensure we never see torn
 pages, would that mean an invalid CRC on a WAL page indicated there
 really was corruption on that page?
 
 Maybe it's worth putting (yet more) thought into the torn page
 issue... :/

Sort of. For data, a page is the logically-atomic unit that is expected
to be intact. For WAL, a record is the logically-atomic unit that is
expected to be intact.

So it might be better to say that the issue for the WAL is torn
records. A record might be larger than a page (it can hold up to three
full-page images in one record), but is often much smaller.

We use a CRC to validate that the WAL record is fully intact. The
concern is that, if it fails the CRC check, we *assume* that it's
because it wasn't completely flushed yet (i.e. a torn record). Based
on that assumption, neither that record nor any later record contains
committed transactions, so we can safely consider that the end of the
WAL (as of the crash) and bring the system up.

The problem is that the assumption is not always true: a CRC failure
could also indicate real corruption of WAL records that have been
previously flushed successfully, and may contain committed transactions.
That can mean we bring the system up way too early, corrupting the
database.

Unfortunately, it seems that doing any kind of validation to determine
that we have a valid end-of-the-WAL inherently requires some kind of
separate durable write somewhere. It would be a tiny amount of data (an
LSN and maybe some extra crosscheck information), so I could imagine
that would be just fine given the right hardware; but if we just write
to disk that would be pretty bad. Ideas welcome.

Regards,
Jeff Davis






Re: [HACKERS] corrupt pages detected by enabling checksums

2013-05-07 Thread Jeff Davis
On Tue, 2013-05-07 at 13:20 -0400, Robert Haas wrote:
 Hmm.  Rereading your last email, I see your point: since we now have
 HEAP_XLOG_VISIBLE, this is much less of an issue than it would have
 been before.  I'm still not convinced that simplifying that code is a
 good idea, but maybe it doesn't really hurt us much in practice.

Given that there's not a big impact one way or another, I don't mind
whether this particular patch is committed or not. Whichever you think
is more understandable and safer at this late hour. Also, to be clear,
the fact that I posted a patch was not meant to advocate either way;
merely to present the options.

Not sure exactly what you mean about the code simplification, but I
agree that anything more substantial than this patch should be left for
9.4.

Regards,
Jeff Davis






Re: [HACKERS] corrupt pages detected by enabling checksums

2013-05-06 Thread Jeff Davis
On Mon, 2013-05-06 at 15:31 -0400, Robert Haas wrote:
 On Wed, May 1, 2013 at 3:04 PM, Jeff Davis pg...@j-davis.com wrote:
  Regardless, you have a reasonable claim that my patch had effects that
  were not necessary. I have attached a draft patch to remedy that. Only
  rudimentary testing was done.
 
 This looks reasonable to me.

Can you please explain the scenario that loses many VM bits at once
during a crash, and results in a bunch of already-all-visible heap pages
being dirtied for no reason?

Regards,
Jeff Davis






Re: [HACKERS] corrupt pages detected by enabling checksums

2013-05-03 Thread Jeff Davis
On Fri, 2013-05-03 at 19:52 +0100, Simon Riggs wrote:
 On 1 May 2013 20:40, Jeff Davis pg...@j-davis.com wrote:
 
  Looks easy. There is no additional logic for checksums, so there's no
  third complexity.
 
  So we either have
  * cleanup info with vismap setting info
  * cleanup info only
 
  which is the same number of WAL records as we have now, just that we
  never emit 2 records when one will do.
 
  What if we only set PD_ALL_VISIBLE and the VM bit, and make no other
  changes? Right now, that generates no WAL for the heap page (unless
  checksums are enabled, of course), which means no full page images for
  the heap pages.
 
 The only place I see that code path is when we clear up a heap page
 that is empty.

Or if there are no other changes to make to the heap page. For example,
if you do a fresh data load, then set the hint bits, and then do a
VACUUM. The only changes needed during VACUUM are setting PD_ALL_VISIBLE
and the VM bit.

Right now, that does not cause a WAL record to be written for the heap
page, so there are no full-page images for the heap page.

At this point, I don't think more changes are required. Robert seemed
concerned about dirtying extra pages after a crash, but I didn't
entirely understand his reasoning. I am inclined toward simplifying this
part of the code as you suggest, but maybe that's better left for 9.4
(which would give me a chance to reignite the discussion about whether
PD_ALL_VISIBLE is needed at all) unless there is an actual problem.

Regards,
Jeff Davis






Re: [HACKERS] corrupt pages detected by enabling checksums

2013-05-01 Thread Jeff Davis
On Wed, 2013-05-01 at 11:33 -0400, Robert Haas wrote:
  The only time the VM and the data page are out of sync during vacuum is
  after a crash, right? If that's the case, I didn't think it was a big
  deal to dirty one extra page (should be extremely rare). Am I missing
  something?
 
  The reason I removed that special case was just code
  complexity/readability. I tried preserving the previous behavior, and
  it's not so bad, but it seemed unnecessarily ugly for the benefit of a
  rare case.
 
  Well, I don't find it that ugly, but then again
 
 ...I've done a fair amount of hacking on this code.  The scenario that
 I find problematic here is that you could easily lose a couple of
 visibility map pages in a crash.  Each one you lose, with this patch,
 potentially involves half a gigabyte of unnecessary disk writes, if
 the relation is being vacuumed at the time.  For the few extra lines
 of code it takes to protect against that scenario, I think we should
 protect against it.

I'm still unclear on how we'd lose so many VM bits at once, because they
are logged. I see how we can lose one if the heap page makes it to disk
before the VM bit's WAL is flushed, but that seems to be bounded to me.

Regardless, you have a reasonable claim that my patch had effects that
were not necessary. I have attached a draft patch to remedy that. Only
rudimentary testing was done.

Regards,
Jeff Davis

*** a/src/backend/access/heap/heapam.c
--- b/src/backend/access/heap/heapam.c
***
*** 5755,5762  log_heap_freeze(Relation reln, Buffer buffer,
   * corresponding visibility map block.	Both should have already been modified
   * and dirtied.
   *
!  * If checksums are enabled, we also add the heap_buffer to the chain to
!  * protect it from being torn.
   */
  XLogRecPtr
  log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer,
--- 5755,5762 
   * corresponding visibility map block.	Both should have already been modified
   * and dirtied.
   *
!  * If checksums are enabled and the heap buffer was changed (PD_ALL_VISIBLE
!  * set), we add it to the chain to protect it from being torn.
   */
  XLogRecPtr
  log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer,
***
*** 5784,5790  log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer,
  	rdata[1].buffer_std = false;
  	rdata[1].next = NULL;
  
! 	if (DataChecksumsEnabled())
  	{
  		rdata[1].next = &(rdata[2]);
  
--- 5784,5790 
  	rdata[1].buffer_std = false;
  	rdata[1].next = NULL;
  
! 	if (DataChecksumsEnabled() && BufferIsValid(heap_buffer))
  	{
  		rdata[1].next = &(rdata[2]);
  
*** a/src/backend/access/heap/visibilitymap.c
--- b/src/backend/access/heap/visibilitymap.c
***
*** 233,242  visibilitymap_pin_ok(BlockNumber heapBlk, Buffer buf)
   * marked all-visible; it is needed for Hot Standby, and can be
   * InvalidTransactionId if the page contains no tuples.
   *
!  * Caller is expected to set the heap page's PD_ALL_VISIBLE bit before calling
!  * this function. Except in recovery, caller should also pass the heap
!  * buffer. When checksums are enabled and we're not in recovery, we must add
!  * the heap buffer to the WAL chain to protect it from being torn.
   *
   * You must pass a buffer containing the correct map page to this function.
   * Call visibilitymap_pin first to pin the right one. This function doesn't do
--- 233,243 
   * marked all-visible; it is needed for Hot Standby, and can be
   * InvalidTransactionId if the page contains no tuples.
   *
!  * The caller should supply heapBuf if and only if it set PD_ALL_VISIBLE on the
!  * heap page, and we're not in recovery.  In that case, the caller should have
!  * already set PD_ALL_VISIBLE and marked the page dirty.  When checksums are
!  * enabled and heapBuf is valid, this function will add heapBuf to the WAL
!  * chain to protect it from being torn.
   *
   * You must pass a buffer containing the correct map page to this function.
   * Call visibilitymap_pin first to pin the right one. This function doesn't do
***
*** 257,263  visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
  #endif
  
  	Assert(InRecovery || XLogRecPtrIsInvalid(recptr));
- 	Assert(InRecovery || BufferIsValid(heapBuf));
  
  	/* Check that we have the right heap page pinned, if present */
  	if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk)
--- 258,263 
***
*** 290,296  visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
   * If data checksums are enabled, we need to protect the heap
   * page from being torn.
   */
! if (DataChecksumsEnabled())
  {
  	Page heapPage = BufferGetPage(heapBuf);
  
--- 290,296 
   * If data checksums are enabled, we need to protect the heap
   * page from being torn.
   */
! if (DataChecksumsEnabled() && BufferIsValid(heapBuf))
  {
  	Page heapPage

Re: [HACKERS] corrupt pages detected by enabling checksums

2013-05-01 Thread Jeff Davis
On Wed, 2013-05-01 at 14:16 -0400, Robert Haas wrote:
 Now that I'm looking at this, I'm a bit confused by the new logic in
 visibilitymap_set().  When checksums are enabled, we set the page LSN,
 which is described like this: we need to protect the heap page from
 being torn.  But how does setting the page LSN do that?  Don't we
 need to mark the buffer dirty or something like that?  Sorry if this
 is a dumb question.

The caller is supposed to dirty the heap page when setting
PD_ALL_VISIBLE. I thought I said that explicitly in the comments
somewhere, but now I don't see it.

It is slightly awkward to put the comment about the page being torn just
before setting the LSN. Feel free to move/remove/reword if you can think
of something better.

Because setting PD_ALL_VISIBLE does not write WAL by itself, my design
was to have it piggyback on the VM WAL record if checksums are turned
on. It would be nice to just use MarkBufferDirtyHint, but that does not
guarantee that the page will actually be marked dirty (see comments);
and we really need it to be properly marked dirty (otherwise we risk the
VM bit making it and the heap page not).

I also explored the idea of passing more options to MarkBufferDirty so
that we could force it to do what we want in each case, but decided
against it:

http://www.postgresql.org/message-id/1357798000.5162.38.camel@jdavis-laptop

Regards,
Jeff Davis






Re: [HACKERS] corrupt pages detected by enabling checksums

2013-05-01 Thread Jeff Davis
On Wed, 2013-05-01 at 20:06 +0100, Simon Riggs wrote:
  Why aren't we writing just one WAL record for this action?

...

 
  I thought about that, too.  It certainly seems like more than we want
  to try to do for 9.3 at this point.  The other complication is that
  there's a lot of conditional logic here.

...

 Looks easy. There is no additional logic for checksums, so there's no
 third complexity.
 
 So we either have
 * cleanup info with vismap setting info
 * cleanup info only
 
 which is the same number of WAL records as we have now, just that we
 never emit 2 records when one will do.

What if we only set PD_ALL_VISIBLE and the VM bit, and make no other
changes? Right now, that generates no WAL for the heap page (unless
checksums are enabled, of course), which means no full page images for
the heap pages.

I think we have to special-case that somehow, and I believe that's the
conditional logic that Robert is referring to.

That being said, there may be a simpler way to achieve the same result,
and that may be what you have in mind.

Regards,
Jeff Davis






Re: [HACKERS] corrupt pages detected by enabling checksums

2013-04-30 Thread Jeff Davis
On Tue, 2013-04-30 at 08:34 -0400, Robert Haas wrote:
 Uh, wait a minute.  I think this is completely wrong.  The buffer is
 LOCKED for this entire sequence of operations.  For a checkpoint to
 happen, it's got to write every buffer, which it will not be able to
 do for so long as the buffer is locked.

I went back and forth on this, so you could be right, but here was my
reasoning:

I was worried because SyncOneBuffer checks whether it needs writing
without taking a content lock, so the exclusive lock doesn't help. That
makes sense, because you don't want a checkpoint to have to get a
content lock on every buffer in the buffer pool. But it also means we
need to follow the rules laid out in transam/README and dirty the pages
before writing WAL.

 The effect of the change to lazy_scan_heap is to force the buffer to
 be written even if we're only updating the visibility map page.
 That's a bad idea and should be reverted.

The only time the VM and the data page are out of sync during vacuum is
after a crash, right? If that's the case, I didn't think it was a big
deal to dirty one extra page (should be extremely rare). Am I missing
something?

The reason I removed that special case was just code
complexity/readability. I tried preserving the previous behavior, and
it's not so bad, but it seemed unnecessarily ugly for the benefit of a
rare case.

Regards,
Jeff Davis






Re: [HACKERS] Substituting Checksum Algorithm (was: Enabling Checksums)

2013-04-26 Thread Jeff Davis
On Fri, Apr 26, 2013 at 7:09 AM, Simon Riggs si...@2ndquadrant.com wrote:
 I'm expecting to spend some time on this over the weekend, once I've
 re-read the thread and patches to see if there is something to commit.

 That's my last time window, so this looks like the last chance to make
 changes before beta.

I updated the patch and split it into two parts (attached).

The first patch is the checksum algorithm itself. I have done
some documentation updates and moved it into the C file (rather
than the README), but further explanation of the shift right 3
modification will need to be by Ants or Florian.

The second patch adds the configure-time check for the right
compilation flags, and uses them when compiling checksum.c. I
called the new variable CFLAGS_EXTRA, for lack of a better idea,
so feel free to come up with a new name. It doesn't check for, or
use, -msse4.1, but that can be specified by the user by
configuring with CFLAGS_EXTRA=-msse4.1.

I don't know of any more required changes, aside from
documentation improvements.

Regards,
 Jeff Davis


fnv-jeff-20130426.patch
Description: Binary data


fnv-jeff-20130426-cflags-extra.patch
Description: Binary data



Re: [HACKERS] Substituting Checksum Algorithm (was: Enabling Checksums)

2013-04-26 Thread Jeff Davis
On Fri, 2013-04-26 at 16:40 -0400, Greg Smith wrote:
 I think I need to do two baselines:  master without checksums, and 
 master with extra optimizations but still without checksums.  It may be 
 the case that using better compile time optimizations gives a general 
 speedup that's worth considering regardless.  The optimizations seem to 
 have a very significant impact on the checksum feature, but I'd like to 
 quantify how they change the code a little bit before even getting into 
 that.

The patch only affects optimization flags used when compiling
checksum.c, so it should have no effect on other areas of the code.

If you want to compile the whole source with those flags, then just do:

  CFLAGS="-msse4.1 -funroll-loops -ftree-vectorize" ./configure

Changing the optimization flags for existing code will have a larger
impact and should be considered separately from checksums.

Regards,
Jeff Davis






Re: [HACKERS] Substituting Checksum Algorithm (was: Enabling Checksums)

2013-04-26 Thread Jeff Davis
On Sat, 2013-04-27 at 00:20 +0200, Andres Freund wrote:
 CFLAGS_VECTORIZATION? EXTRA sounds to generic to me.

I went with CFLAGS_VECTOR to be a little shorter while still keeping
some meaning.

 I think it would be better to have a PGAC_PROG_CC_VAR_OPT or so which
 assigns the flag to some passed variable name. Then we can reuse it for
 different vars and I have the feeling those will come. And having a
 CFLAGS_VECTOR_OPT would just be stupid ;)

Good suggestion; done.

Thank you for the review. New renamed patch attached for the build
options only (the other patch for the FNV checksum algorithm is
unchanged).

Regards,
Jeff Davis
*** a/config/c-compiler.m4
--- b/config/c-compiler.m4
***
*** 242,247  undefine([Ac_cachevar])dnl
--- 242,272 
  
  
  
+ # PGAC_PROG_CC_VAR_OPT
+ # ---
+ # Given a variable name and a string, check if the compiler supports
+ # the string as a command-line option. If it does, add the string to
+ # the given variable.
+ AC_DEFUN([PGAC_PROG_CC_VAR_OPT],
+ [define([Ac_cachevar], [AS_TR_SH([pgac_cv_prog_cc_cflags_$2])])dnl
+ AC_CACHE_CHECK([whether $CC supports $2], [Ac_cachevar],
+ [pgac_save_CFLAGS=$CFLAGS
+ CFLAGS="$pgac_save_CFLAGS $2"
+ ac_save_c_werror_flag=$ac_c_werror_flag
+ ac_c_werror_flag=yes
+ _AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],
+[Ac_cachevar=yes],
+[Ac_cachevar=no])
+ ac_c_werror_flag=$ac_save_c_werror_flag
+ CFLAGS=$pgac_save_CFLAGS])
+ if test x"$Ac_cachevar" = x"yes"; then
+   $1="${$1} $2"
+ fi
+ undefine([Ac_cachevar])dnl
+ ])# PGAC_PROG_CC_CFLAGS_OPT
+ 
+ 
+ 
  # PGAC_PROG_CC_LDFLAGS_OPT
  # 
  # Given a string, check if the compiler supports the string as a
*** a/configure
--- b/configure
***
*** 731,736  autodepend
--- 731,737 
  TAS
  GCC
  CPP
+ CFLAGS_VECTOR
  SUN_STUDIO_CC
  OBJEXT
  EXEEXT
***
*** 3944,3949  else
--- 3945,3955 
fi
  fi
  
+ # set CFLAGS_VECTOR from the environment, if available
+ if test $ac_env_CFLAGS_VECTOR_set = set; then
+   CFLAGS_VECTOR=$ac_env_CFLAGS_VECTOR_value
+ fi
+ 
  # Some versions of GCC support some additional useful warning flags.
  # Check whether they are supported, and add them to CFLAGS if so.
  # ICC pretends to be GCC but it's lying; it doesn't support these flags,
***
*** 4376,4381  if test x$pgac_cv_prog_cc_cflags__fexcess_precision_standard = xyes; then
--- 4382,4508 
CFLAGS=$CFLAGS -fexcess-precision=standard
  fi
  
+   # Optimization flags for specific files that benefit from vectorization
+   { $as_echo $as_me:$LINENO: checking whether $CC supports -funroll-loops 5
+ $as_echo_n checking whether $CC supports -funroll-loops...  6; }
+ if test ${pgac_cv_prog_cc_cflags__funroll_loops+set} = set; then
+   $as_echo_n (cached)  6
+ else
+   pgac_save_CFLAGS=$CFLAGS
+ CFLAGS=$pgac_save_CFLAGS -funroll-loops
+ ac_save_c_werror_flag=$ac_c_werror_flag
+ ac_c_werror_flag=yes
+ cat conftest.$ac_ext _ACEOF
+ /* confdefs.h.  */
+ _ACEOF
+ cat confdefs.h conftest.$ac_ext
+ cat conftest.$ac_ext _ACEOF
+ /* end confdefs.h.  */
+ 
+ int
+ main ()
+ {
+ 
+   ;
+   return 0;
+ }
+ _ACEOF
+ rm -f conftest.$ac_objext
+ if { (ac_try=$ac_compile
+ case (($ac_try in
+   *\* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+   *) ac_try_echo=$ac_try;;
+ esac
+ eval ac_try_echo=\\$as_me:$LINENO: $ac_try_echo\
+ $as_echo $ac_try_echo) 5
+   (eval $ac_compile) 2conftest.er1
+   ac_status=$?
+   grep -v '^ *+' conftest.er1 conftest.err
+   rm -f conftest.er1
+   cat conftest.err 5
+   $as_echo $as_me:$LINENO: \$? = $ac_status 5
+   (exit $ac_status); }  {
+ 	 test -z $ac_c_werror_flag ||
+ 	 test ! -s conftest.err
+}  test -s conftest.$ac_objext; then
+   pgac_cv_prog_cc_cflags__funroll_loops=yes
+ else
+   $as_echo $as_me: failed program was: 5
+ sed 's/^/| /' conftest.$ac_ext 5
+ 
+ 	pgac_cv_prog_cc_cflags__funroll_loops=no
+ fi
+ 
+ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ ac_c_werror_flag=$ac_save_c_werror_flag
+ CFLAGS=$pgac_save_CFLAGS
+ fi
+ { $as_echo $as_me:$LINENO: result: $pgac_cv_prog_cc_cflags__funroll_loops 5
+ $as_echo $pgac_cv_prog_cc_cflags__funroll_loops 6; }
+ if test x$pgac_cv_prog_cc_cflags__funroll_loops = xyes; then
+   CFLAGS_VECTOR=${CFLAGS_VECTOR} -funroll-loops
+ fi
+ 
+   { $as_echo $as_me:$LINENO: checking whether $CC supports -ftree-vectorize 5
+ $as_echo_n checking whether $CC supports -ftree-vectorize...  6; }
+ if test ${pgac_cv_prog_cc_cflags__ftree_vectorize+set} = set; then
+   $as_echo_n (cached)  6
+ else
+   pgac_save_CFLAGS=$CFLAGS
+ CFLAGS=$pgac_save_CFLAGS -ftree-vectorize
+ ac_save_c_werror_flag=$ac_c_werror_flag
+ ac_c_werror_flag=yes
+ cat conftest.$ac_ext _ACEOF
+ /* confdefs.h.  */
+ _ACEOF
+ cat confdefs.h conftest.$ac_ext
+ cat conftest.$ac_ext _ACEOF
+ /* end confdefs.h.  */
+ 
+ int
+ main ()
+ {
+ 
+   ;
+   return 0;
+ }
+ _ACEOF
+ rm -f conftest.$ac_objext
+ if { (ac_try=$ac_compile
+ case

Re: [HACKERS] Re: [COMMITTERS] pgsql: Fix collation assignment for aggregates with ORDER BY.

2013-04-26 Thread Jeff Davis
On Fri, 2013-04-26 at 16:46 -0700, David Fetter wrote:
 On Fri, Apr 26, 2013 at 07:49:47PM +, Tom Lane wrote:
  Given this risk and the lack of field complaints about the issue, it
  doesn't seem prudent to back-patch.

...

 This needs back-patching to 9.1, where the bug was introduced.

It was explained in the commit message why it was not backported.

Regards,
Jeff Davis






Re: [HACKERS] Substituting Checksum Algorithm (was: Enabling Checksums)

2013-04-25 Thread Jeff Davis
On Tue, 2013-04-23 at 11:44 +0300, Ants Aasma wrote:
 I will try to reword.

Did you have a chance to clarify this, as well as some of the other
documentation issues Simon mentioned here?

http://www.postgresql.org/message-id/CA+U5nMKVEu8UDXQe
+Nk=d7nqm4ypfszaef0esak4j31lyqc...@mail.gmail.com

I'm not sure if others are waiting on me for a new patch or not. I can
give the documentation issues a try, but I was hesitant to do so because
you've done the research.

The problems that I can correct are fairly trivial.

Regards,
Jeff Davis





Re: [HACKERS] Enabling Checksums

2013-04-24 Thread Jeff Davis
On Wed, 2013-04-24 at 08:20 +0100, Simon Riggs wrote:
 On 24 April 2013 01:10, Jeff Davis pg...@j-davis.com wrote:
  I'd prefer that it was some kind of a checksum ID code -- e.g. 0 for no
  checksum, 1 for FNV-1a-SR3, etc. That would allow us to release 9.4 with
  a new algorithm without forcing existing users to change.
 
 That's exactly what the patch does.

The word "version" indicates an order to it though, like N+1 is always
preferable to N. This is user-facing (through pg_controldata output),
otherwise I wouldn't mind.

  initdb would have to take the code as an option, probably in string
  form.
 
 When/if we have multiple options we can add that. The main thing was
 to make sure the control file recorded things in a common way.

The main strange thing to me is that we're still using the
enabled/disabled for the output of pg_controldata as well as the
version.

When we do have multiple options, it seems like we'd just have one field
output:

  Data page checksums: none|crc32c|pg-fnv

What goal are you trying to accomplish with this patch? pg_control
doesn't need to be compatible between releases, so can't we just add
this later when we really do have multiple options?

Regards,
Jeff Davis





-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Enabling Checksums

2013-04-24 Thread Jeff Davis
On Wed, 2013-04-24 at 21:09 +0100, Simon Riggs wrote:
 On 24 April 2013 21:06, Jeff Davis pg...@j-davis.com wrote:
 
  What goal are you trying to accomplish with this patch?
 
 That we might need to patch the checksum version on a production release.

Oh, I see.

I don't think we need two output fields from pg_controldata though. It's
a little redundant, and confused me when I was looking at the impact on
pg_upgrade. And it means nothing to the user until we actually have
multiple algorithms available, at which time we are better off with a
text representation.

Other than that, I think your patch is fine to accomplish the
aforementioned goal. Essentially, it just changes the bool to a uint32,
which I favor.

Regards,
Jeff Davis



-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


[HACKERS] Substituting Checksum Algorithm (was: Enabling Checksums)

2013-04-23 Thread Jeff Davis
Starting a new thread to more narrowly address the topic.

Attached is my reorganization of Ants's patch here:

http://www.postgresql.org/message-id/CA
+CSw_vinyf-w45i=M1m__MpJZY=e8s4nt_knnpebtwjtoa...@mail.gmail.com

My changes:

* wrest the core FNV algorithm from the specific concerns of a data page
   - PageCalcChecksum16 mixes the block number, reduces to 16 bits,
     and avoids the pd_checksum field (rough sketch below)
   - the checksum algorithm is just a pure block checksum with a 32-bit
 result
* moved the FNV checksum into a separate file, checksum.c
* added Ants's suggested compilation flags for better optimization
* slight update to the paragraph in the README that discusses concerns
specific to a data page
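
Roughly, the page-level wrapper described in the first bullet is shaped
like this (a paraphrased sketch with simplified types and a trivial
stand-in for the block checksum, not the patch text itself):

  #include <stdint.h>

  #define BLCKSZ 8192

  /* simplified stand-in for the page header; only pd_checksum matters here */
  typedef struct PageHeaderData
  {
      uint64_t    pd_lsn;
      uint16_t    pd_checksum;
      /* ... rest of the real header omitted ... */
  } PageHeaderData;

  /* the pure block checksum; trivial body so the sketch compiles alone */
  static uint32_t
  checksum_block(const char *data, uint32_t size)
  {
      uint32_t    hash = 0;
      uint32_t    i;

      for (i = 0; i < size; i++)
          hash = (hash ^ (unsigned char) data[i]) * 16777619;
      return hash;
  }

  static uint16_t
  page_calc_checksum16(char *page, uint32_t blkno)
  {
      PageHeaderData *phdr = (PageHeaderData *) page;
      uint16_t    save = phdr->pd_checksum;
      uint32_t    checksum;

      /* checksum the page as though pd_checksum itself were zero */
      phdr->pd_checksum = 0;
      checksum = checksum_block(page, BLCKSZ);
      phdr->pd_checksum = save;

      /* mix in the block number so transposed pages are caught */
      checksum ^= blkno;

      /* reduce to 16 bits; the offset of one avoids storing a zero */
      return (uint16_t) ((checksum % 65535) + 1);
  }

  int
  main(void)
  {
      static union
      {
          char            bytes[BLCKSZ];
          PageHeaderData  hdr;
      } page;                     /* union keeps the overlay aligned */

      return page_calc_checksum16(page.bytes, 0) != 0 ? 0 : 1;
  }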

I do have a couple questions/concerns about Ants's patch though:

* The README mentions a slight bias; does that come from the mod
(2^16-1)? That's how I updated the README, so I wanted to make sure.
* How was the FNV_PRIME chosen?
* I can't match the individual algorithm step as described in the README
to the actual code. And the comments in the README don't make it clear
enough which one is right (or maybe they both are, and I'm just tired).

The README says:

   hash = (hash ^ value) * ((hash ^ value) >> 3)

(which is obviously missing the FNV_PRIME factor) and the code says:

   +#define CHECKSUM_COMP(checksum, value) do {\
   +   uint32 __tmp = (checksum) ^ (value);\
   +   (checksum) = __tmp * FNV_PRIME ^ (__tmp >> 3);\
   +} while (0)
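
For illustration only, a tiny standalone comparison of the two formulas
on one arbitrary input word (the starting value and input are made up;
only the shape of the computation matters):

   #include <stdio.h>
   #include <stdint.h>

   #define FNV_PRIME 16777619

   int main(void)
   {
       uint32_t hash  = 0x811C9DC5;   /* arbitrary running hash value */
       uint32_t value = 0xDEADBEEF;   /* arbitrary input word         */
       uint32_t tmp   = hash ^ value;

       /* step as written in the README (no FNV_PRIME factor) */
       uint32_t readme_step = tmp * (tmp >> 3);

       /* step as implemented by CHECKSUM_COMP */
       uint32_t code_step = tmp * FNV_PRIME ^ (tmp >> 3);

       printf("README: %08X  code: %08X\n", readme_step, code_step);
       return 0;
   }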

I'm somewhat on the fence about the shift right. It was discussed in
this part of the thread:

http://www.postgresql.org/message-id/99343716-5f5a-45c8-b2f6-74b9ba357...@phlo.org

I think we should be able to state with a little more clarity in the
README why there is a problem with plain FNV-1a, and why this
modification is both effective and safe.

I'd lean toward simplicity and closer adherence to the published version
of the algorithm rather than detecting a few more obscure error
patterns. It looks like the modification slows down the algorithm, too.

Regards,
Jeff Davis

*** a/src/backend/storage/page/Makefile
--- b/src/backend/storage/page/Makefile
***
*** 12,17  subdir = src/backend/storage/page
  top_builddir = ../../../..
  include $(top_builddir)/src/Makefile.global
  
! OBJS =  bufpage.o itemptr.o
  
  include $(top_srcdir)/src/backend/common.mk
--- 12,22 
  top_builddir = ../../../..
  include $(top_builddir)/src/Makefile.global
  
! OBJS =  bufpage.o checksum.o itemptr.o
  
  include $(top_srcdir)/src/backend/common.mk
+ 
+ # important optimization flags for checksum.c
+ ifeq ($(GCC),yes)
+ checksum.o: CFLAGS += -msse4.1 -funroll-loops -ftree-vectorize
+ endif
*** a/src/backend/storage/page/README
--- b/src/backend/storage/page/README
***
*** 61,63  checksums are enabled.  Systems in Hot-Standby mode may benefit from hint bits
--- 61,109 
  being set, but with checksums enabled, a page cannot be dirtied after setting a
  hint bit (due to the torn page risk). So, it must wait for full-page images
  containing the hint bit updates to arrive from the master.
+ 
+ Checksum algorithm
+ --
+ 
+ The algorithm used to checksum pages is chosen for very fast calculation.
+ Workloads where the database working set fits into OS file cache but not into
+ shared buffers can read in pages at a very fast pace and the checksum
+ algorithm itself can become the largest bottleneck.
+ 
+ The checksum algorithm itself is based on the FNV-1a hash (FNV is shorthand for
+ Fowler/Noll/Vo) The primitive of a plain FNV-1a hash folds in data 4 bytes at
+ a time according to the formula:
+ 
+ hash = (hash ^ value) * FNV_PRIME(16777619)
+ 
+ PostgreSQL doesn't use FNV-1a hash directly because it has bad mixing of high
+ bits - high order bits in input data only affect high order bits in output
+ data. To resolve this we xor in the value prior to multiplication shifted
+ right by 3 bits. The number 3 was chosen as it is a small odd, prime, and
+ experimentally provides enough mixing for the high order bits to avalanche
+ into lower positions. The actual hash formula used as the basis is:
+ 
+ hash = (hash ^ value) * ((hash ^ value) >> 3)
+ 
+ The main bottleneck in this calculation is the multiplication latency. To hide
+ the latency and to make use of SIMD parallelism multiple hash values are
+ calculated in parallel. Each hash function uses a different initial value
+ (offset basis in FNV terminology). The initial values actually used were
+ chosen randomly, as the values themselves don't matter as much as that they
+ are different and don't match anything in real data. The page is then treated
+ as 32 wide array of 32bit values and each column is aggregated according to
+ the above formula. Finally one more iteration of the formula is performed with
+ value 0 to mix the bits of the last value added.
+ 
+ The partial checksums are then aggregated together using xor to form a
+ 32-bit checksum. The caller can
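
For readers following along, here is a condensed scalar-C sketch of the
loop structure that README text describes (the offset bases here are
made up; the real patch uses its own fixed constants and relies on the
compiler flags above to vectorize the inner loop):

   #include <stdint.h>

   #define BLCKSZ    8192
   #define N_SUMS    32
   #define FNV_PRIME 16777619

   /* one round of the modified FNV-1a primitive */
   #define CHECKSUM_COMP(checksum, value) do {       \
       uint32_t tmp_ = (checksum) ^ (value);         \
       (checksum) = tmp_ * FNV_PRIME ^ (tmp_ >> 3);  \
   } while (0)

   /* data must point to BLCKSZ bytes, 4-byte aligned */
   static uint32_t
   block_checksum(const char *data)
   {
       const uint32_t (*arr)[N_SUMS] = (const uint32_t (*)[N_SUMS]) data;
       uint32_t    sums[N_SUMS];
       uint32_t    result = 0;
       int         i, j;
       int         nrows = BLCKSZ / (N_SUMS * sizeof(uint32_t));

       /* distinct starting values (offset bases); illustrative only */
       for (j = 0; j < N_SUMS; j++)
           sums[j] = 0x811C9DC5u + j;

       /* fold the block column by column, N_SUMS partial sums in parallel */
       for (i = 0; i < nrows; i++)
           for (j = 0; j < N_SUMS; j++)
               CHECKSUM_COMP(sums[j], arr[i][j]);

       /* one extra round of zeroes to mix the last values added */
       for (j = 0; j < N_SUMS; j++)
           CHECKSUM_COMP(sums[j], 0);

       /* aggregate the partial checksums with xor */
       for (j = 0; j < N_SUMS; j++)
           result ^= sums[j];

       return result;
   }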

Re: [HACKERS] Enabling Checksums

2013-04-23 Thread Jeff Davis
On Tue, 2013-04-23 at 16:28 +0100, Simon Riggs wrote:
 * make the pg_control.data_checksums field into a version number, for
 future flexibility...
 patch attached

Commenting on this separately because it's a separate issue.

I'd prefer that it was some kind of a checksum ID code -- e.g. 0 for no
checksum, 1 for FNV-1a-SR3, etc. That would allow us to release 9.4 with
a new algorithm without forcing existing users to change.

initdb would have to take the code as an option, probably in string
form.

What do you think?

Regards,
Jeff Davis





Re: [HACKERS] Enabling Checksums

2013-04-22 Thread Jeff Davis
On Mon, 2013-04-22 at 20:04 +0200, Florian Pflug wrote:
 The one downside of the fnv1+shift approach is that it's built around
 the assumption that processing 64-bytes at once is the sweet spot. That
 might be true for x86 and x86_64 today, but it won't stay that way for
 long, and quite surely isn't true for other architectures. That doesn't
 necessarily rule it out, but it certainly weakens the argument that
 slipping it into 9.3 avoids having the change the algorithm later...

I think you are setting the bar way too high. Right now, we have a slow
algorithm. According to Ants's tests, FNV-1a is much, much faster. Do
you think that it will still be such a bottleneck that we will want to
change it again later for purely performance reasons?

The only time this is likely to matter is in the situation Greg Smith
describes, where shared buffers is much smaller than memory, and the
working set of buffers is near the size of memory (in other words, a lot
of buffers moving to and from shared memory, but not much to or from
disk). And it's already significantly faster than the algorithm in the
original tests (Fletcher), so it's not clear that it's still even a
serious problem.

(Also remember that checksum users already accept a WAL penalty.)

The biggest problem now is getting one of these faster algorithms (FNV
or even a faster CRC) into shape that is acceptable to
reviewers/committers. If we don't do that, we will be missing out on a
lot of potential checksum users for whom the existing CRC algorithm is
just too slow.

Regards,
Jeff Davis






Re: [HACKERS] Enabling Checksums

2013-04-22 Thread Jeff Davis
On Mon, 2013-04-22 at 19:25 +0300, Ants Aasma wrote:
 I was just now writing up a generic C based patch based on the
 parallel FNV-1a + shift that we discussed with Florian with an added
 round of mixing. Testing the performance in isolation indicates that:
 1) it is about an order of magnitude faster than the Sarwate CRC
 method used in Postgresql.
 2) it is about 2x faster than fastest software based CRC method.
 3) by using -msse4.1 -funroll-loops -ftree-vectorize compilation
 options the performance improves 5x. (within 20% of handcoded ASM)

That's great news!

This means that we can have a simple C implementation in a separate
file, and pass a few build flags when compiling just that file (so it
doesn't affect other code). That should make reviewers/committers happy
(including me).

FWIW, that was my last real concern about FNV (reviewability). I'm not
worried about the performance based on your analysis; nor am I worried
about the error detection rate.

Regards,
Jeff Davis






Re: [HACKERS] Enabling Checksums

2013-04-22 Thread Jeff Davis
On Tue, 2013-04-23 at 01:08 +0300, Ants Aasma wrote:
 A slight delay, but here it is. I didn't lift the checksum part into a
 separate file as I didn't have a great idea what I would call it. The
 code is reasonably compact so I don't see a great need for this right
 now. It would be more worth the effort when/if we add non-generic
 variants. I'm not particularly attached to the method I used to mask
 out pd_checksum field, this could be improved if someone has a better
 idea how to structure the code.

Thank you. A few initial comments:

I have attached (for illustration purposes only) a patch on top of yours
that divides the responsibilities a little more cleanly.

* easier to move into a separate file, and use your recommended compiler
flags without affecting other routines in bufpage.c
* makes the checksum algorithm itself simpler
* leaves the data-page-specific aspects (mixing in the page number,
ignoring pd_checksum, reducing to 16 bits) to PageCalcChecksum16
* overall easier to review and understand

I'm not sure what we should call the separate file or where we should
put it, though. How about src/backend/utils/checksum/checksum_fnv.c? Is
there a clean way to override the compiler flags for a single file so we
don't need to put it in its own directory?

Regards,
Jeff Davis

*** a/src/backend/storage/page/bufpage.c
--- b/src/backend/storage/page/bufpage.c
***
*** 23,28  static char pageCopyData[BLCKSZ];	/* for checksum calculation */
--- 23,29 
  static Page pageCopy = pageCopyData;
  
  static uint16 PageCalcChecksum16(Page page, BlockNumber blkno);
+ static uint32 checksum_fnv(char *data, uint32 size);
  
  /* 
   *		Page support functions
***
*** 986,1017  static const uint32 checksumBaseOffsets[N_SUMS] = {
  static uint16
  PageCalcChecksum16(Page page, BlockNumber blkno)
  {
! 	uint32 sums[N_SUMS];
! 	uint32 (*pageArr)[N_SUMS] = (uint32 (*)[N_SUMS]) page;
! 	uint32 result = blkno;
! 	int i, j;
! 	int pd_checksum_word = offsetof(PageHeaderData, pd_checksum)/sizeof(uint32);
! 	int pd_checksum_half = (offsetof(PageHeaderData, pd_checksum) % sizeof(uint32)) / sizeof(uint16);
  
  	/* only calculate the checksum for properly-initialized pages */
  	Assert(!PageIsNew(page));
  
  	/* initialize partial checksums to their corresponding offsets */
  	memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets));
  
! 	/* first iteration needs to mask out pd_checksum field itself with zero */
! 	for (j = 0; j < N_SUMS; j++)
! 	{
! 		uint32 value = pageArr[0][j];
! 		if (j == pd_checksum_word)
! 			((uint16*) &value)[pd_checksum_half] = 0;
! 		CHECKSUM_COMP(sums[j], value);
! 	}
! 
! 	/* now add in the rest of the page */
! 	for (i = 1; i < BLCKSZ/sizeof(uint32)/N_SUMS; i++)
  		for (j = 0; j < N_SUMS; j++)
! 			CHECKSUM_COMP(sums[j], pageArr[i][j]);
  
  	/* finally add in one round of zeroes for one more layer of mixing */
  	for (j = 0; j  N_SUMS; j++)
--- 987,1035 
  static uint16
  PageCalcChecksum16(Page page, BlockNumber blkno)
  {
! 	PageHeader	phdr   = (PageHeader) page;
! 	uint16		save_checksum;
! 	uint32		fnv_checksum;
  
  	/* only calculate the checksum for properly-initialized pages */
  	Assert(!PageIsNew(page));
  
+ 	/*
+ 	 * Save pd_checksum and set it to zero, so that the checksum calculation
+ 	 * isn't affected by the checksum stored on the page.
+ 	 */
+ 	save_checksum = phdr->pd_checksum;
+ 	phdr->pd_checksum = 0;
+ 	fnv_checksum = checksum_fnv(page, BLCKSZ);
+ 	phdr->pd_checksum = save_checksum;
+ 
+ 	/* mix in the block number to detect transposed pages */
+ 	fnv_checksum ^= blkno;
+ 
+ 	/*
+ 	 * Reduce to a uint16 (to fit in the pd_checksum field) with an offset of
+ 	 * one. That avoids checksums of zero, which seems like a good idea.
+ 	 */
+ 	return (fnv_checksum % 65535) + 1;
+ }
+ 
+ static uint32
+ checksum_fnv(char *data, uint32 size)
+ {
+ 	uint32 sums[N_SUMS];
+ 	uint32 (*dataArr)[N_SUMS] = (uint32 (*)[N_SUMS]) data;
+ 	uint32 result;
+ 	int i, j;
+ 
+ 	Assert((size % (sizeof(uint32)*N_SUMS)) == 0);
+ 
  	/* initialize partial checksums to their corresponding offsets */
  	memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets));
  
! 	/* main checksum calculation */
! 	for (i = 0; i < size/sizeof(uint32)/N_SUMS; i++)
  		for (j = 0; j < N_SUMS; j++)
! 			CHECKSUM_COMP(sums[j], dataArr[i][j]);
  
  	/* finally add in one round of zeroes for one more layer of mixing */
  	for (j = 0; j  N_SUMS; j++)
***
*** 1021,1026  PageCalcChecksum16(Page page, BlockNumber blkno)
  	for (i = 0; i < N_SUMS; i++)
  		result ^= sums[i];
  
! 	/* use mod mapping to map the value into 16 bits and offset from zero */
! 	return (result % 65535) + 1;
  }
--- 1039,1043 
  	for (i = 0; i < N_SUMS; i++)
  		result ^= sums[i];
  
! 	return result;
  }


Re: [HACKERS] Enabling Checksums

2013-04-18 Thread Jeff Davis
On Wed, 2013-04-17 at 20:21 -0400, Greg Smith wrote:
 -Original checksum feature used Fletcher checksums.  Its main problems, 
 to quote wikipedia, include that it cannot distinguish between blocks 
 of all 0 bits and blocks of all 1 bits.

That is fairly easy to fix by using a different modulus: 251 vs 255.

Regards,
Jeff Davis





Re: [HACKERS] Enabling Checksums

2013-04-18 Thread Jeff Davis
On Thu, 2013-04-18 at 19:05 +0200, Florian Pflug wrote:
 On Apr18, 2013, at 19:04 , Jeff Davis pg...@j-davis.com wrote:
  On Wed, 2013-04-17 at 20:21 -0400, Greg Smith wrote:
  -Original checksum feature used Fletcher checksums.  Its main problems, 
  to quote wikipedia, include that it cannot distinguish between blocks 
  of all 0 bits and blocks of all 1 bits.
  
  That is fairly easy to fix by using a different modulus: 251 vs 255.
 
 At the expense of a drastic performance hit though, no? Modulus operations
 aren't exactly cheap.

Modulo is only necessary when there's a possibility of overflow, or at
the very end of the calculation. If we accumulate 32-bit integers into
64-bit sums, then it turns out that it can't overflow given the largest
input we support (32K page).

32K page = 8192 32-bit integers

1*(2^32-1) + 2*(2^32-1) + 3*(2^32-1) ... 8192*(2^32-1)
= (2^32-1) * (8192^2 - 8192)/2
= 144097595856261120 ( < 2^64-1 )

So, we only need to do the modulo at the end.
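
A quick way to sanity-check that bound (the same arithmetic, done in
64-bit integers):

   #include <stdio.h>
   #include <stdint.h>

   int main(void)
   {
       uint64_t max_word = UINT32_MAX;                   /* 2^32 - 1          */
       uint64_t coeffs   = (8192ULL * 8192 - 8192) / 2;  /* (8192^2 - 8192)/2 */
       uint64_t worst    = max_word * coeffs;            /* worst-case sum    */

       printf("%llu < %llu\n",
              (unsigned long long) worst,
              (unsigned long long) UINT64_MAX);
       return 0;
   }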

Regards,
Jeff Davis





Re: [HACKERS] Enabling Checksums

2013-04-17 Thread Jeff Davis
On Wed, 2013-04-17 at 12:42 -0400, Bruce Momjian wrote:
  AFAIK, there's currently no per-page checksum flag. Still, being only
  able to go from checksummed to not-checksummed probably is for all
  practical purposes the same as not being able to pg_upgrade at all.
  Otherwise, why would people have enabled checksums in the first place?
 
 Good point, but it is _an_ option, at least.
 
 I would like to know the answer of how an upgrade from checksum to
 no-checksum would behave so I can modify pg_upgrade to allow it.

Why? 9.3 pg_upgrade certainly doesn't need it. When we get to 9.4, if
someone has checksums enabled and wants to disable it, why is pg_upgrade
the right time to do that? Wouldn't it make more sense to allow them to
do that at any time?

Regards,
Jeff Davis






Re: [HACKERS] Enabling Checksums

2013-04-17 Thread Jeff Davis
On Wed, 2013-04-17 at 16:58 +0100, Greg Stark wrote:
 On Wed, Apr 17, 2013 at 4:28 PM, Florian Pflug f...@phlo.org wrote:
  Is there any way we can change the checksum algorithm in 9.4
  *without* breaking pg_upgrade?
 
 Personally I think we're going to need a solution for page format
 changes someday eventually
 
 What advantages are we postponing now to avoid it?
 
 * 32-bit checksums?
 * Being able to enable/disable checksums?
 
 Anything else?

I'm not sure that changing the page format is the most difficult part of
enabling/disabling checksums. It's easy enough to have page header bits
if the current information is not enough (and those bits were there, but
Heikki requested their removal and I couldn't think of a concrete reason
to keep them).

Eventually, it would be nice to be able to break the page format and
have more space for things like checksums (and probably a few other
things, maybe some visibility-related optimizations). But that's a few
years off and we don't have any real plan for that.

What I wanted to accomplish with this patch is the simplest checksum
mechanism that we could get that would be fast enough that many people
would be able to use it. I expect it to be useful until we do decide to
break the page format.

Regards,
Jeff Davis






Re: [HACKERS] Enabling Checksums

2013-04-12 Thread Jeff Davis
On Thu, 2013-04-11 at 20:12 +0100, Simon Riggs wrote:

 So, if we apply a patch like the one attached, we then end up with the
 WAL checksum using the page checksum as an integral part of its
 calculation. (There is no increase in code inside WALInsertLock,
 nothing at all touched in that area).
 
 
 Then all we need to do is make PageSetChecksumInplace() use Ants' algo
 and we're done.
 
 
 Only point worth discussing is that this change would make backup
 blocks be covered by a 16-bit checksum, not the CRC-32 it is now. i.e.
 the record header is covered by a CRC32 but the backup blocks only by
 16-bit. 

FWIW, that's fine with me. 

 (Attached patch is discussion only. Checking checksum in recovery
 isn't coded at all.)

I like it.

A few points:

* Given that setting the checksum is unconditional in a backup block, do
we want to zero the checksum field when the backup block is restored if
checksums are disabled? Otherwise we would have a strange situation
where some blocks have a checksum on disk even when checksums are
disabled.

* When we do PageSetChecksumInplace(), we need to be 100% sure that the
hole is empty; otherwise the checksum will fail when we re-expand it. It
might be worth a memset beforehand just to be sure.
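
Something along these lines, assuming a standard page where
pd_lower/pd_upper bound the hole (sketch only):

   PageHeader phdr = (PageHeader) page;

   /* defensively zero the hole so re-expanding it can't change the checksum */
   if (phdr->pd_lower < phdr->pd_upper)
       memset((char *) page + phdr->pd_lower, 0,
              phdr->pd_upper - phdr->pd_lower);
   PageSetChecksumInplace(page, blkno);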

Regards,
Jeff Davis






Re: [HACKERS] Enabling Checksums

2013-04-12 Thread Jeff Davis
On Fri, 2013-04-12 at 15:21 -0400, Bruce Momjian wrote:
  * When we do PageSetChecksumInplace(), we need to be 100% sure that the
  hole is empty; otherwise the checksum will fail when we re-expand it. It
  might be worth a memset beforehand just to be sure.
 
 Do we write the page holes to the WAL for full-page writes?  I hope we
 don't.

No, but the page hole is included in the checksum.

Let's say that the page hole contains some non-zero value, and we
calculate a checksum. When we eliminate the page hole, and then
reconstitute the page using zeros for the page hole later, then the page
will not match the checksum any more.

So, we need to be sure the original page hole is all-zero when we
calculate the checksum.

Regards,
Jeff Davis






Re: [HACKERS] Enabling Checksums

2013-04-12 Thread Jeff Davis
On Fri, 2013-04-12 at 21:28 +0200, Andres Freund wrote:
 That means we will have to do the verification for this in
 ValidXLogRecord() *not* in RestoreBkpBlock or somesuch. Otherwise we
 won't always recognize the end of WAL correctly.
 And I am a bit wary of reducing the likelihood of noticing the proper
 end-of-recovery by reducing the crc width.

Good point.

 Why again are we doing this now? Just to reduce the overhead of CRC
 computation for full page writes? Or are we forseeing issues with the
 page checksums being wrong because of non-zero data in the hole being
 zero after the restore from bkp blocks?

That shouldn't be a problem, because the block is not expected to have a
proper checksum in WAL, and it will be recalculated before being
written. So I see these changes as mostly independent.

The reason we're discussing this now is that, when choosing the
checksum algorithm, I was hoping it might be usable in the future
for WAL backup blocks. I'm convinced that they can be; the primary
question now seems to be whether they should be, which does not need to
be settled right now in my opinion.

Anyway, I would be perfectly happy if we just got the SIMD algorithm in
for data pages. The support for changing the WAL checksums seems
lukewarm, and there might be quite a few alternatives (e.g. optimizing
the CRC for backup blocks as Heikki suggested) to achieve that
performance goal.

Regards,
Jeff Davis






Re: [HACKERS] Enabling Checksums

2013-04-12 Thread Jeff Davis
On Fri, 2013-04-12 at 23:03 +0300, Heikki Linnakangas wrote:
 I think this is a bad idea. It complicates the WAL format significantly. 
 Simon's patch didn't include the changes to recovery to validate the 
 checksum, but I suspect it would be complicated. And it reduces the 
 error-detection capability of WAL recovery. Keep in mind that unlike 
 page checksums, which are never expected to fail, so even if we miss a 
 few errors it's still better than nothing, the WAL checksum is used to
 detect end-of-WAL. There is expected to be a failure every time we do
 crash recovery. Thus far, we've considered the probability of one in
 2^32 small enough for that purpose, but IMHO one in 2^16 is much too weak.

One thing that just occurred to me is that we could make the SIMD
checksum a 32-bit checksum, and reduce it down to 16 bits for the data
pages. That might give us more flexibility to later use it for WAL
without compromising on the error detection nearly as much (though
obviously that wouldn't work with Simon's current proposal which uses
the same data page checksum in a WAL backup block).
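
In code terms that split is tiny; something like this, reusing the
reduction step from the reorganized patch upthread (variable names are
illustrative):

   uint32 ck32 = checksum_fnv(page, BLCKSZ);               /* full 32-bit value, usable for WAL */
   uint16 ck16 = (uint16) (((ck32 ^ blkno) % 65535) + 1);  /* reduced form for pd_checksum      */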

In general, we have more flexibility with WAL because there is no
upgrade issue. It would be nice to share code with the data page
checksum algorithm; but really we should just use whatever offers the
best trade-off in terms of complexity, performance, and error detection
rate.

I don't think we need to decide all of this right now. Personally, I'm
satisfied having SIMD checksums on data pages now and leaving WAL
optimization until later.

Regards,
Jeff Davis







Re: [HACKERS] Enabling Checksums

2013-04-10 Thread Jeff Davis
On Wed, 2013-04-10 at 11:01 +0300, Ants Aasma wrote:
 I think we should first deal with using it for page checksums and if
 future versions want to reuse some of the code for WAL checksums then
 we can rearrange the code.

Sounds good to me, although I expect we at least want any assembly to be
in a separate file (if the specialization makes it in 9.3).

Regards,
Jeff Davis






Re: [HACKERS] Enabling Checksums

2013-04-10 Thread Jeff Davis
On Wed, 2013-04-10 at 20:17 +0100, Simon Riggs wrote:

 OK, so we have a single combined calculate a checksum for a block
 function. That uses Jeff's zeroing trick and Ants' bulk-oriented
 performance optimization.
 
 
 For buffer checksums we simply calculate for the block.

Sounds good.

 For WAL full page writes, we first set the checksums for all defined
 buffers, then calculate the checksum of remaining data plus the
 pd_checksum field from each block using the normal WAL CRC32.
 
 Seems good to me. One set of fast code. And it avoids the weirdness
 that the checksum stored on the full page is actually wrong.

Oh, that's a nice benefit.

 It also means that the WAL checksum calculation includes the hole, yet
 we do not include the data for the hole. So we have to do an extra
  copy when restoring the backup block. 

I like this, but it sounds like there is some room for discussion on
some of these points. I assume changes to the WAL checksums are 9.4
material?

I'm satisfied with SIMD data checksums in 9.3 and that we have a plan
for using SIMD for WAL checksums later.

Regards,
Jeff Davis






Re: [HACKERS] corrupt pages detected by enabling checksums

2013-04-09 Thread Jeff Davis
On Sat, 2013-04-06 at 16:44 +0200, Andres Freund wrote:
 I think we can just make up the rule that changing full page writes also
 requires SpinLockAcquire(xlogctl->info_lck);. Then it's easy enough. And
 it can hardly be a performance bottleneck given how infrequently its
 modified.

That seems like a good idea to me. As it stands, checksums basically
force full page writes to be on; so we should either fix that or
document it.

 In retrospect I think making up the rule that full_page_writes changes
 imply a checkpoint would have made things easier performance and
 codewise.

I don't even see why we allow changing that while the server is on.
Either the I/O system requires it for safety or not, right?

Regards,
Jeff Davis






Re: [HACKERS] corrupt pages detected by enabling checksums

2013-04-09 Thread Jeff Davis
On Mon, 2013-04-08 at 09:19 +0100, Simon Riggs wrote:

 Applied, with this as the only code change.
 
 
 Thanks everybody for good research and coding and fast testing.
 
 
 We're in good shape now.

Thank you.

I have attached two more patches:

1. I believe that the issue I brought up at the end of this email:

http://www.postgresql.org/message-id/1365035537.7580.380.camel@sussancws0025

is a real issue. In lazy_vacuum_page(), the following sequence can
happen when checksums are on:

   a. PageSetAllVisible
   b. Pass heap page to visibilitymap_set
   c. visibilitymap_set logs the heap page and sets the LSN
   d. MarkBufferDirty

If a checkpoint happens between (c) and (d), then we have a problem. The
fix is easy: just mark the heap buffer dirty first. There's another call
site that looks like a potential problem, but I don't think it is. I
simplified the code there to make it (hopefully) more clearly correct.
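
In code form the fix is just a reordering (argument lists elided, sketch
only):

   PageSetAllVisible(page);
   MarkBufferDirty(buf);          /* dirty the heap page first ...             */
   visibilitymap_set(...);        /* ... then WAL-log it and stamp the new LSN */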

2. A cleanup patch to pass the buffer_std flag down through
MarkBufferDirtyHint. This is a matter of preference and purely cosmetic,
so it might not be wanted. The reason I thought it was useful is that a
future caller that sets a hint on a non-standard buffer might easily
miss the assumption that we have a standard buffer.

Regards,
Jeff Davis
*** a/src/backend/access/hash/hash.c
--- b/src/backend/access/hash/hash.c
***
*** 287,293  hashgettuple(PG_FUNCTION_ARGS)
  			/*
  			 * Since this can be redone later if needed, mark as a hint.
  			 */
! 			MarkBufferDirtyHint(buf);
  		}
  
  		/*
--- 287,293 
  			/*
  			 * Since this can be redone later if needed, mark as a hint.
  			 */
! 			MarkBufferDirtyHint(buf, true);
  		}
  
  		/*
*** a/src/backend/access/heap/pruneheap.c
--- b/src/backend/access/heap/pruneheap.c
***
*** 262,268  heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
  		{
  			((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
  			PageClearFull(page);
! 			MarkBufferDirtyHint(buffer);
  		}
  	}
  
--- 262,268 
  		{
  			((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
  			PageClearFull(page);
! 			MarkBufferDirtyHint(buffer, true);
  		}
  	}
  
*** a/src/backend/access/nbtree/nbtinsert.c
--- b/src/backend/access/nbtree/nbtinsert.c
***
*** 413,421  _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
  	 * crucial. Be sure to mark the proper buffer dirty.
  	 */
  	if (nbuf != InvalidBuffer)
! 		MarkBufferDirtyHint(nbuf);
  	else
! 		MarkBufferDirtyHint(buf);
  }
  			}
  		}
--- 413,421 
  	 * crucial. Be sure to mark the proper buffer dirty.
  	 */
  	if (nbuf != InvalidBuffer)
! 		MarkBufferDirtyHint(nbuf, true);
  	else
! 		MarkBufferDirtyHint(buf, true);
  }
  			}
  		}
*** a/src/backend/access/nbtree/nbtree.c
--- b/src/backend/access/nbtree/nbtree.c
***
*** 1052,1058  restart:
  opaque->btpo_cycleid == vstate->cycleid)
  			{
  opaque->btpo_cycleid = 0;
! MarkBufferDirtyHint(buf);
  			}
  		}
  
--- 1052,1058 
  opaque->btpo_cycleid == vstate->cycleid)
  			{
  opaque->btpo_cycleid = 0;
! MarkBufferDirtyHint(buf, true);
  			}
  		}
  
*** a/src/backend/access/nbtree/nbtutils.c
--- b/src/backend/access/nbtree/nbtutils.c
***
*** 1789,1795  _bt_killitems(IndexScanDesc scan, bool haveLock)
  	if (killedsomething)
  	{
  		opaque->btpo_flags |= BTP_HAS_GARBAGE;
! 		MarkBufferDirtyHint(so->currPos.buf);
  	}
  
  	if (!haveLock)
--- 1789,1795 
  	if (killedsomething)
  	{
  		opaque->btpo_flags |= BTP_HAS_GARBAGE;
! 		MarkBufferDirtyHint(so->currPos.buf, true);
  	}
  
  	if (!haveLock)
*** a/src/backend/access/transam/xlog.c
--- b/src/backend/access/transam/xlog.c
***
*** 7661,7667  XLogRestorePoint(const char *rpName)
   * i.e. those for which buffer_std == true
   */
  XLogRecPtr
! XLogSaveBufferForHint(Buffer buffer)
  {
  	XLogRecPtr recptr = InvalidXLogRecPtr;
  	XLogRecPtr lsn;
--- 7661,7667 
   * i.e. those for which buffer_std == true
   */
  XLogRecPtr
! XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
  {
  	XLogRecPtr recptr = InvalidXLogRecPtr;
  	XLogRecPtr lsn;
***
*** 7683,7689  XLogSaveBufferForHint(Buffer buffer)
  	 * We reuse and reset rdata for any actual WAL record insert.
  	 */
  	rdata[0].buffer = buffer;
! 	rdata[0].buffer_std = true;
  
  	/*
  	 * Check buffer while not holding an exclusive lock.
--- 7683,7689 
  	 * We reuse and reset rdata for any actual WAL record insert.
  	 */
  	rdata[0].buffer = buffer;
! 	rdata[0].buffer_std = buffer_std;
  
  	/*
  	 * Check buffer while not holding an exclusive lock.
*** a/src/backend/commands/sequence.c
--- b/src/backend/commands/sequence.c
***
*** 1118,1124  read_seq_tuple(SeqTable elm, Relation rel, Buffer *buf, HeapTuple seqtuple)
  		HeapTupleHeaderSetXmax(seqtuple->t_data, InvalidTransactionId

Re: [HACKERS] Enabling Checksums

2013-04-09 Thread Jeff Davis
On Tue, 2013-04-09 at 05:35 +0300, Ants Aasma wrote:
 And here you go. I decided to be verbose with the comments as it's
 easier to delete a comment than to write one. I also left in a huge jumble
 of macros to calculate the contents of a helper var during compile
 time. This can easily be replaced with the calculated values once we
 settle on specific parameters.

Great, thank you.

Is it possible to put an interface over it that somewhat resembles the
CRC checksum (INIT/COMP/FIN)? It looks a little challenging because of
the nature of the algorithm, but it would make it easier to extend to
other places (e.g. WAL). It doesn't have to match the INIT/COMP/FIN
pattern exactly.
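
Purely as a sketch of what such an interface might look like (none of
these names exist in any patch, and COMP would have to consume N_SUMS
words per call because the partial sums advance in lockstep):

   typedef struct FNVChecksumState
   {
       uint32  sums[N_SUMS];
   } FNVChecksumState;

   /* hypothetical helpers, not from any patch */
   #define FNV_INIT(state) \
       memcpy((state).sums, checksumBaseOffsets, sizeof((state).sums))
   #define FNV_COMP(state, chunk) \
       fnv_comp_chunk(&(state), (chunk))   /* chunk: N_SUMS uint32 values */
   #define FNV_FIN(state) \
       fnv_fold(&(state))                  /* zero round, then xor-fold   */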

Regardless, we should have some kind of fairly generic interface and
move the code to its own file (e.g. checksum.c).

To make the interface more generic, would it make sense to require the
caller to save the page's stored checksum and zero it before
calculating? That would avoid the awkwardness of having to skip over
the pd_checksum field inside the loop. For example (code for
illustration only):

   PageCalcChecksum16(Page page, BlockNumber blkno)
   {
  PageHeader phdr = (PageHeader)page;
  uint16 stored_checksum = phdr->pd_checksum;
  uint16 calc_checksum;

  phdr->pd_checksum = 0;
  calc_checksum = SIMD_CHECKSUM(page, BLCKSZ);
  phdr->pd_checksum = stored_checksum;
  return calc_checksum;
   }

That would make it possible to use a different word size -- is uint16
optimal or would a larger word be more efficient?

It looks like the block size needs to be an even multiple of
sizeof(uint16)*NSUMS. And it also looks like it's hard to combine
different regions of memory into the same calculation (unless we want to
just calculate them separately and XOR them or something). Does that
mean that this is not suitable for WAL at all?

Using SIMD for WAL is not a requirement at all; I just thought it might
be a nice benefit for non-checksum-enabled users in some later release.

Regards,
Jeff Davis






Re: [HACKERS] Enabling Checksums

2013-04-05 Thread Jeff Davis
On Tue, 2013-03-26 at 03:34 +0200, Ants Aasma wrote:
 The main thing to look out for is that we don't
 have any blind spots for conceivable systemic errors. If we decide to
 go with the SIMD variant then I intend to figure out what the blind
 spots are and show that they don't matter.

Are you still looking into SIMD? Right now, it's using the existing CRC
implementation. Obviously we can't change it after it ships. Or is it
too late to change it already?

Regards,
Jeff Davis





Re: [HACKERS] Enabling Checksums

2013-04-05 Thread Jeff Davis
On Fri, 2013-04-05 at 21:39 +0300, Ants Aasma wrote:
 Yes, I just managed to get myself some time so I can look at it some
 more. I was hoping that someone would weigh in on what their
 preferences are on the performance/effectiveness trade-off and the
 fact that we need to use assembler to make it fly so I knew how to go
 forward.

My opinion is that we don't need to be perfect as long as we catch 99%
of random errors and we don't have any major blind spots. Also, the
first version doesn't necessarily need to perform well; we can leave
optimization as future work. Requiring assembly to achieve those
optimizations is a drawback in terms of maintainability, but it seems
isolated so I don't think it's a major problem.

Ideally, the algorithm would also be suitable for WAL checksums, and we
could eventually use it for that as well.

 The worst blind spot that I could come up with was an even number of
 single bit errors that are all on the least significant bit of 16bit
 word. This type of error can occur in memory chips when row lines go
 bad, usually stuck at zero or one.

We're not really trying to catch memory errors anyway. Of course it
would be nice, but I would rather have more people using a slightly
flawed algorithm than fewer using it because it has too great a
performance impact.

 Unless somebody tells me not to waste my time I'll go ahead and come
 up with a workable patch by Monday.

Sounds great to me, thank you.

Regards,
Jeff Davis





Re: [HACKERS] corrupt pages detected by enabling checksums

2013-04-05 Thread Jeff Davis
On Fri, 2013-04-05 at 15:09 +0200, Andres Freund wrote:
 How does the attached version look? I verified that it survives
 recovery, but not more.

Comments:

* Regarding full page writes, we can:
  - always write full pages (as in your current patch), regardless of
the current settings
  - take WALInsertLock briefly to get the current settings from XLogCtl
  - always calculate the full page record, but in XLogInsert, if it
happens to be a HINT record, and full page writes are not necessary,
then discard it (right now I somewhat favor this option but I haven't
looked at the details)

* typo in Backup blocks are not used in **xlog xlog** records

* To get the appropriate setting for buffer_std, we should pass it down
through MarkBufferDirty as an extra flag, I think.

* I'm a bit worried that we'll need a cleanup lock when restoring an FSM
page. What do you think?

* In xlog_redo, it seemed slightly awkward to call XLogRecGetData twice.
Merely a matter of preference but I thought I would mention it.

 Jeff, any chance you can run this for a round with your suite?

Yes. I don't have a rigorous test suite, but I'll do some manual tests
and walk through it with gdb.

Regards,
Jeff Davis





Re: [HACKERS] corrupt pages detected by enabling checksums

2013-04-05 Thread Jeff Davis
On Fri, 2013-04-05 at 10:34 +0200, Florian Pflug wrote:
 Maybe we could scan forward to check whether a corrupted WAL record is
 followed by one or more valid ones with sensible LSNs. If it is,
 chances are high that we haven't actually hit the end of the WAL. In
 that case, we could either log a warning, or (better, probably) abort
 crash recovery.

+1.

 Corruption of fields which we require to scan past the record would
 cause false negatives, i.e. not trigger an error even though we do
 abort recovery mid-way through. There's a risk of false positives too,
 but they require quite specific orderings of writes and thus seem
 rather unlikely. (AFAICS, the OS would have to write some parts of
 record N followed by the whole of record N+1 and then crash to cause a
 false positive).

Does the xlp_pageaddr help solve this?

Also, we'd need to be a little careful when written-but-not-flushed WAL
data makes it to disk, which could cause a false positive and may be a
fairly common case.

Regards,
Jeff Davis






Re: [HACKERS] corrupt pages detected by enabling checksums

2013-04-05 Thread Jeff Davis
On Fri, 2013-04-05 at 19:22 -0500, Jaime Casanova wrote:
 On Fri, Apr 5, 2013 at 8:09 AM, Andres Freund and...@2ndquadrant.com wrote:
 
  How does the attached version look? I verified that it survives
  recovery, but not more.
 
 
 I still got errors when executing make installcheck in a just compiled
 9.3devel + this_patch, this is when setting wal_level higher than
 minimal.
 Attached regression.diffs

Did you also apply the other patch here:

http://www.postgresql.org/message-id/1364340203.21411.143.camel@sussancws0025

?

I think that is a completely separate issue. Simon, should that patch be
applied or are you waiting on the issue Jeff Janes raised to be
resolved, as well?

Regards,
Jeff Davis






Re: [HACKERS] corrupt pages detected by enabling checksums

2013-04-04 Thread Jeff Davis
Andres,

Thank you for diagnosing this problem!

On Thu, 2013-04-04 at 16:53 +0200, Andres Freund wrote:
 I think the route you quickly sketched is more realistic. That would
 remove all knowledge about XLOG_HINT from generic code which is a very
 good thing. I spent like 15 minutes yesterday wondering whether the early
 return in there might be the cause of the bug...

I like this approach. It may have some performance impact though,
because there are a couple extra spinlocks taken, and an extra memcpy.
The code looks good to me except that we should be consistent about the
page hole -- XLogCheckBuffer is calculating it, but then we copy the
entire page. I don't think anything can change the size of the page hole
while we have a shared lock on the buffer, so it seems OK to skip the
page hole during the copy.
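
For reference, skipping the hole would look roughly like what
backup-block assembly already does (sketch; copied_page is a scratch
buffer, lower/upper come from the page header):

   uint16 lower = ((PageHeader) page)->pd_lower;
   uint16 upper = ((PageHeader) page)->pd_upper;

   memcpy(copied_page, page, lower);                    /* header + line pointers */
   memcpy(copied_page + lower, (char *) page + upper,   /* tuple data + special   */
          BLCKSZ - upper);
   /* hole_offset = lower, hole_length = upper - lower travel with the record */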

Another possible approach is to drop the lock on the buffer and
re-acquire it in exclusive mode after we find out we'll need to do
XLogInsert. It means that MarkBufferDirtyHint may end up blocking for
longer, but would avoid the memcpy. I haven't really thought through the
details.

Regards,
Jeff Davis




-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] corrupt pages detected by enabling checksums

2013-04-04 Thread Jeff Davis
On Thu, 2013-04-04 at 14:21 -0700, Jeff Janes wrote:

 This brings up a pretty frightening possibility to me, unrelated to
 data checksums.  If a bit gets twiddled in the WAL file due to a
 hardware issue or a cosmic ray, and then a crash happens, automatic
 recovery will stop early with the failed WAL checksum with
 an innocuous looking message.  The system will start up but will be
 invisibly inconsistent, and will proceed to overwrite that portion of
 the WAL file which contains the old data (real data that would have
 been necessary to reconstruct, once the corruption is finally realized
 ) with an end-of-recovery checkpoint record and continue to chew up
 real data from there.

I've been worried about that for a while, and I may have even seen
something like this happen before. We could perhaps do some checks, but
in general it seems hard to solve without writing some data to
two different places. For example, you could flush WAL, and then update
an LSN stored somewhere else indicating how far the WAL has been
written. Recovery could complain if it gets an error in the WAL before
that point.

But obviously, that makes WAL flushes expensive (in many cases, about
twice as expensive).

Maybe it's not out of the question to offer that as an option if nobody
has a better idea. Performance-conscious users could place the extra LSN
on an SSD or NVRAM or something; or maybe use commit_delay or async
commits. It would only need to store a few bytes.

Streaming replication mitigates the problem somewhat, by being a second
place to write WAL.

Regards,
Jeff Davis






Re: [HACKERS] corrupt pages detected by enabling checksums

2013-04-04 Thread Jeff Davis
On Thu, 2013-04-04 at 22:39 +0200, Andres Freund wrote:
 I don't think it's really slower. Earlier the code took WALInsertLock
 every time, even if we ended up not logging anything. That's far more
 expensive than a single spinlock. And the copy should also only be taken
 in the case we need to log. So I think we end up ahead of the current
 state.

Good point.

  The code looks good to me except that we should be consistent about the
  page hole -- XLogCheckBuffer is calculating it, but then we copy the
  entire page. I don't think anything can change the size of the page hole
  while we have a shared lock on the buffer, so it seems OK to skip the
  page hole during the copy.
 
 I don't think it can change either, but I doubt that there's a
 performance advantage by not copying the hole. I'd guess the simpler
 code ends up faster.

I was thinking more about the WAL size, but I don't have a strong
opinion.

Regards,
Jeff Davis





Re: [HACKERS] corrupt pages detected by enabling checksums

2013-04-04 Thread Jeff Davis
On Thu, 2013-04-04 at 21:06 -0400, Tom Lane wrote:
 I can't escape the feeling that we'd just be reinventing software RAID.
 There's no reason to think that we can deal with this class of problems
 better than the storage system can.

The goal would be to reliably detect a situation where WAL that has been
flushed successfully was later corrupted; not necessarily to recover
when we find it. Unless it's something like ZFS, I don't think most
software RAID will reliably detect corruption.

A side benefit is that it would also help catch bugs like this in the
future.

I'm not advocating for this particular solution, but I do concur with
Jeff Janes that it's a little bit scary and that we can entertain some
ideas about how to mitigate it.

Regards,
Jeff Davis






Re: [HACKERS] regression test failed when enabling checksum

2013-04-03 Thread Jeff Davis
On Wed, 2013-04-03 at 09:48 -0700, Jeff Janes wrote:

  And why did those uninitialized pages trigger warnings when they were
 autovacced, but not when they were seq scanned in a query?
 
A scan won't trigger that warning. Empty pages are sometimes the
expected result of a crash when the file is extended but the page is not
written yet. So empty pages aren't necessarily an indication of
corruption (though I'd like to fix that eventually, because sometimes
zeroing is corruption).

Regards,
Jeff Davis






Re: [HACKERS] regression test failed when enabling checksum

2013-04-03 Thread Jeff Davis
On Mon, 2013-04-01 at 19:51 -0700, Jeff Janes wrote:

 I've reproduced the problem, this time in block 74 of relation
 base/16384/4931589, and a tarball of the data directory is here:
 
 
 https://docs.google.com/file/d/0Bzqrh1SO9FcELS1majlFcTZsR0k/edit?usp=sharing
 
 
 
 (the table is in database jjanes under role jjanes, the binary is
 commit 9ad27c215362df436f8c)
 
 
 What I would probably really want is the data as it existed after the
 crash but before recovery started, but since the postmaster
 immediately starts recovery after the crash, I don't know of a good
 way to capture this.

Can you just turn off the restart_after_crash GUC? I had a chance to
look at this, and seeing the block before and after recovery would be
nice. I didn't see a log file in the data directory, but it didn't go
through recovery, so I assume it already did that.

The block is corrupt as far as I can tell. The first third is written,
and the remainder is all zeros. The header looks like this:

(These numbers are mostly from pageinspect. The checksum value that
comes from pageinspect needed to be cast back to an unsigned short to
match the error message above -- should that be changed in
pageinspect?).

lsn: 7/252E4080
checksum: 34212
flags: 1
lower: 1188
upper: 5952
special: 8192
pagesize: 8192
version: 4
prune_xid: 156833911

So the header looks good, but most of the page data is missing. I tried
with pg_filedump (the 9.2.0 version, which should be fine), and it
agrees with me that the page is corrupt.

Interestingly, that doesn't result in a user-visible error when
ignore_checksum_failure=on. That's because the item pointers appear to
all be either not normal or they point to the zeroed region. Testing the
visibility of a zeroed tuple header always returns false, so no problem.

You'd still think this would cause incorrect results, but I think what's
happening is that this is a new page (otherwise it would have been
written with something other than zeroes before). So, the tuples that
are supposed to be there may be uncommitted anyway.

So, the page may be corrupt without checksums as well, but it just
happens to be hidden for the same reason. Can you try to reproduce it
without -k? And on the checkin right before checksums were added?
Without checksums, you'll need to use pg_filedump (or similar) to find
whether an error has happened.

To start speculating about the root cause: something is violating the
WAL-before-data rule, or not writing an FPI when it's supposed to, or not
properly restoring the FPI during recovery, or something sets the wrong
LSN. This could still be caused by the checksums patch, but it seems a
little less likely now. The reason I say that is because it's a new page
with tuples on it, so that means something in the insert/update path
ended up not writing the FPI before writing the page.

Regards,
Jeff Davis





Re: [HACKERS] corrupt pages detected by enabling checksums

2013-04-03 Thread Jeff Davis
 On Wed, 2013-04-03 at 15:57 -0700, Jeff Janes wrote:



 You don't know that the cluster is in the bad state until after it
 goes through recovery because most crashes recover perfectly fine.  So
 it would have to make a side-copy of the cluster after the crash, then
 recover the original and see how things go, then either retain or
 delete the side-copy.  Unfortunately my testing harness can't do this
 at the moment, because the perl script storing the consistency info
 needs to survive over the crash and recovery.   It might take
 me awhile to figure out how to make it do this.

Hmm... you're right, that is difficult to integrate into a test.

Another approach is to expand PageHeaderIsValid in a pre-checksums
version to do some extra checks (perhaps the same checks as pg_filedump
does). It might be able to at least detect simple cases, like all zeros
between pd_upper and pd_special (when pd_upper < pd_special). Still not
easy, but maybe more practical than saving the directory like that. We
may even want a GUC for that to aid future debugging (obviously it would
have a performance impact).
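
A rough sketch of such a check (debugging aid only; a page whose tuple
area is entirely zero even though pd_upper < pd_special is suspicious):

   static bool
   page_tuple_area_is_zero(Page page)
   {
       PageHeader  phdr = (PageHeader) page;
       char       *p    = (char *) page + phdr->pd_upper;
       char       *end  = (char *) page + phdr->pd_special;

       if (phdr->pd_upper >= phdr->pd_special)
           return false;              /* nothing to check */

       for (; p < end; p++)
       {
           if (*p != 0)
               return false;          /* real tuple bytes present */
       }
       return true;                   /* all zeros: possible lost write */
   }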

 The other 3 files in it constitute the harness.  It is a quite a mess,
 with some hard-coded paths.  The just-posted fix for wal_keep_segments
 will also have to be applied.

I'm still trying to reproduce the problem myself. I keep trying simpler
versions of your test and I don't see the problem. It's a little awkward
to try to run the full version of your test (and I'm jumping between
code inspection and various other ideas).

 I'll work on it, but it will take awhile to figure out exactly how to
 use pg_filedump to do that, and how to automate that process and
 incorporate it into the harness.

It might be more productive to try to expand PageHeaderIsValid as I
suggested above.

 In the meantime, I tested the checksum commit itself (96ef3b8ff1c) and
 the problem occurs there, so if the problem is not the checksums
 themselves (and I agree it probably isn't) it must have been
 introduced before that rather than after.

Thank you for all of your work on this. I have also attached another
test patch that disables most of the complex areas of the checksums
patch (VM bits, hint bits, etc.). If you apply different portions of the
patch (or the whole thing), it will help eliminate possibilities. In
particular, eliminating the calls to PageSetAllVisible+visibilitymap_set
could tell us a lot.

Also, the first data directory you posted had a problem with a page that
appeared to be a new page (otherwise I would have expected either old or
new data at the end). Do you think this might be a problem only for new
pages? Or do you think your test just makes it likely that many writes
will happen on new pages?

I did find one potential problem in the checksums code (aside from my
previous patch involving wal_level=archive): visibilitymap_set is called
sometimes before the heap page is marked dirty. I will examine a little
more closely, but I expect that to require another patch. Not sure if
that could explain this problem or not.

Regards,
Jeff Davis

*** a/src/backend/access/heap/pruneheap.c
--- b/src/backend/access/heap/pruneheap.c
***
*** 257,264  heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
  		 * point in repeating the prune/defrag process until something else
  		 * happens to the page.
  		 */
! 		if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid ||
! 			PageIsFull(page))
  		{
  			((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
  			PageClearFull(page);
--- 257,264 
  		 * point in repeating the prune/defrag process until something else
  		 * happens to the page.
  		 */
! 		if (false && (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid ||
! 	  PageIsFull(page)))
  		{
  			((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
  			PageClearFull(page);
*** a/src/backend/commands/vacuumlazy.c
--- b/src/backend/commands/vacuumlazy.c
***
*** 668,674  lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
  			freespace = PageGetHeapFreeSpace(page);
  
  			/* empty pages are always all-visible */
! 			if (!PageIsAllVisible(page))
  			{
  PageSetAllVisible(page);
  MarkBufferDirty(buf);
--- 668,674 
  			freespace = PageGetHeapFreeSpace(page);
  
  			/* empty pages are always all-visible */
! 			if (false && !PageIsAllVisible(page))
  			{
  PageSetAllVisible(page);
  MarkBufferDirty(buf);
***
*** 901,907  lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
  		freespace = PageGetHeapFreeSpace(page);
  
  		/* mark page all-visible, if appropriate */
! 		if (all_visible)
  		{
  			if (!PageIsAllVisible(page))
  			{
--- 901,907 
  		freespace = PageGetHeapFreeSpace(page);
  
  		/* mark page all-visible, if appropriate */
! 		if (false && all_visible)
  		{
  			if (!PageIsAllVisible(page))
  			{
***
*** 1149,1155  lazy_vacuum_page(Relation onerel

Re: [HACKERS] Hash Join cost estimates

2013-04-01 Thread Jeff Davis
On Sun, 2013-03-31 at 15:45 -0400, Tom Lane wrote:
 Really, when we're traipsing down a bucket
 list, skipping over bucket entries with the wrong hash code is just
 about free, or at least it's a whole lot cheaper than applying ExecQual.
 
 Perhaps what we should do is charge the hash_qual_cost only for some
 small multiple of the number of tuples that we expect will *pass* the
 hash quals, which is a number we have to compute anyway.  The multiple
 would represent the rate of hash-code collisions we expect.

+1.

 I'd still be inclined to charge something per bucket entry, but it
 should be really small, perhaps on the order of 0.01 times
 cpu_operator_cost.

 Or we could just drop that term entirely. 

FWIW, either of those is fine with me, based on my limited experience.

 Maybe what we should be doing with the bucketsize numbers is estimating
 peak memory consumption to gate whether we'll accept the plan at all,
 rather than adding terms to the cost estimate.

Sounds reasonable.

Ideally, we'd have a way to continue executing even in that case; but
that's a project by itself, and would make it even more difficult to
cost accurately.

Regards,
Jeff Davis





Re: [HACKERS] regression test failed when enabling checksum

2013-04-01 Thread Jeff Davis
On Mon, 2013-04-01 at 10:37 -0700, Jeff Janes wrote:

 Over 10,000 cycles of crash and recovery, I encountered two cases of
 checksum failures after recovery, example:
 
 
 14264 SELECT 2013-03-28 13:08:38.980 PDT:WARNING:  page verification
 failed, calculated checksum 7017 but expected 1098
 14264 SELECT 2013-03-28 13:08:38.980 PDT:ERROR:  invalid page in block
 77 of relation base/16384/2088965
 
 14264 SELECT 2013-03-28 13:08:38.980 PDT:STATEMENT:  select sum(count)
 from foo

It would be nice to know whether that's an index or a heap page.

 
 In both cases, the bad block (77 in this case) is the same block that
 was intentionally partially-written during the crash.  However, that
 block should have been restored from the WAL FPW, so its fragmented
 nature should not have been present in order to be detected.  Any idea
 what is going on?  

Not right now. My primary suspect is what's going on in
visibilitymap_set() and heap_xlog_visible(), which is more complex than
some of the other code. That would require some VACUUM activity, which
isn't in your workload -- do you think autovacuum may kick in sometimes?

Thank you for testing! I will try to reproduce it, as well.

Regards,
Jeff Davis





Re: [HACKERS] Hash Join cost estimates

2013-03-29 Thread Jeff Davis
On Thu, 2013-03-28 at 19:56 -0400, Stephen Frost wrote:
   41K hashed, seqscan 4M: 115030.10 + 1229.46 = 116259.56
   4M hashed, seqscan 41K: 1229.46 + 211156.20 = 212385.66

I think those are backwards -- typo?

   In the end, I think the problem here is that we are charging far too
   much for these bucket costs (particularly when we're getting them so
   far wrong) and not nearly enough for the cost of building the hash
   table in the first place.
 
   Thoughts?  Ideas about how we can 'fix' this?  Have others run into
   similar issues?

Yes, I have run into this issue (or something very similar). I don't
understand why the bucketsize even matters much -- assuming few hash
collisions, we are not actually evaluating the quals any more times than
necessary. So why all of the hashjoin-specific logic in determining the
number of qual evaluations? The only reason I can think of is to model
the cost of comparing the hashes themselves.

Also, searching the archives turns up at least one other, but I think
I've seen more:

http://www.postgresql.org/message-id/a82128a6-4e3b-43bd-858d-21b129f7b...@richrelevance.com

Regards,
Jeff Davis



-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Hash Join cost estimates

2013-03-29 Thread Jeff Davis
On Fri, 2013-03-29 at 16:37 -0400, Tom Lane wrote: 
 Jeff Davis pg...@j-davis.com writes:
  Yes, I have run into this issue (or something very similar). I don't
  understand why the bucketsize even matters much -- assuming few hash
  collisions, we are not actually evaluating the quals any more times than
  necessary. So why all of the hashjoin-specific logic in determining the
  number of qual evaluations? The only reason I can think of is to model
  the cost of comparing the hashes themselves.
 
 I think the point is that there may *not* be few hash collisions ...

In Stephen's case the table was only 41KB, so something still seems off.
Maybe we should model the likelihood of a collision based on the
cardinalities (assuming a reasonably good hash function)?

Also, I think I found an important assumption that seems dubious (in
comment for estimate_hash_bucketsize()):

If the other relation in the join has a key distribution similar to
this one's, then the most-loaded buckets are exactly those that will be
probed most often.  Therefore, the average bucket size for costing
purposes should really be taken as something close to the worst case
bucket size.  We try to estimate this by adjusting the fraction if there
are too few distinct data values, and then scaling up by the ratio of
the most common value's frequency to the average frequency.

But the key distribution is not necessarily similar at all... the large
table might have many more distinct values.
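As a rough sketch of the adjustment that comment describes (a hypothetical
paraphrase, not the real estimate_hash_bucketsize() code), the scaling is
roughly:

/*
 * Start from the ideal fraction of inner rows per bucket, clamp it when
 * there are too few distinct values, then scale up by the ratio of the
 * most common value's frequency to the average frequency to approximate
 * the most-loaded bucket.
 */
double
sketch_bucketsize_fraction(double ndistinct, double nbuckets, double mcv_freq)
{
	double avg_freq = 1.0 / ndistinct;
	double frac = 1.0 / nbuckets;

	if (ndistinct < nbuckets)
		frac = 1.0 / ndistinct;		/* too few distinct values to fill buckets */

	if (mcv_freq > avg_freq)
		frac *= mcv_freq / avg_freq;	/* worst case: probes hit the hottest bucket */

	return frac;
}

The questionable step for this discussion is the last one: it only makes sense
if the outer relation's key distribution really does mirror the inner's.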

Stephen, do you think this could explain your problem?

Regards,
Jeff Davis





-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] regression test failed when enabling checksum

2013-03-26 Thread Jeff Davis
On Tue, 2013-03-26 at 02:50 +0900, Fujii Masao wrote:
 Hi,
 
 I found that the regression test failed when I created the database
 cluster with the checksum and set wal_level to archive. I think that
 there are some bugs around checksum feature. Attached is the regression.diff.

Thank you for the report. This was a significant oversight, but simple
to diagnose and fix.

There were several places that were doing something like:

   PageSetChecksumInplace
   if (use_wal)
       log_newpage
   smgrextend

Which is obviously wrong, because log_newpage set the LSN of the page,
invalidating the checksum. We need to set the checksum after
log_newpage.

Also, I noticed that copy_relation_data was doing smgrread without
validating the checksum (or page header, for that matter), so I also
fixed that.

Patch attached. Only brief testing done, so I might have missed
something. I will look more closely later.

Regards,
Jeff Davis
*** a/src/backend/access/heap/rewriteheap.c
--- b/src/backend/access/heap/rewriteheap.c
***
*** 273,286  end_heap_rewrite(RewriteState state)
  	/* Write the last page, if any */
  	if (state->rs_buffer_valid)
  	{
- 		PageSetChecksumInplace(state->rs_buffer, state->rs_blockno);
- 
  		if (state->rs_use_wal)
  			log_newpage(state->rs_new_rel->rd_node,
  		MAIN_FORKNUM,
  		state->rs_blockno,
  		state->rs_buffer);
  		RelationOpenSmgr(state->rs_new_rel);
  		smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM, state->rs_blockno,
     (char *) state->rs_buffer, true);
  	}
--- 273,287 
  	/* Write the last page, if any */
  	if (state->rs_buffer_valid)
  	{
  		if (state->rs_use_wal)
  			log_newpage(state->rs_new_rel->rd_node,
  		MAIN_FORKNUM,
  		state->rs_blockno,
  		state->rs_buffer);
  		RelationOpenSmgr(state->rs_new_rel);
+ 
+ 		PageSetChecksumInplace(state->rs_buffer, state->rs_blockno);
+ 
  		smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM, state->rs_blockno,
     (char *) state->rs_buffer, true);
  	}
***
*** 616,623  raw_heap_insert(RewriteState state, HeapTuple tup)
  		{
  			/* Doesn't fit, so write out the existing page */
  
- 			PageSetChecksumInplace(page, state->rs_blockno);
- 
  			/* XLOG stuff */
  			if (state->rs_use_wal)
  log_newpage(state->rs_new_rel->rd_node,
--- 617,622 
***
*** 632,637  raw_heap_insert(RewriteState state, HeapTuple tup)
--- 631,639 
  			 * end_heap_rewrite.
  			 */
  			RelationOpenSmgr(state->rs_new_rel);
+ 
+ 			PageSetChecksumInplace(page, state->rs_blockno);
+ 
  			smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM,
  	   state->rs_blockno, (char *) page, true);
  
*** a/src/backend/commands/tablecmds.c
--- b/src/backend/commands/tablecmds.c
***
*** 51,56 
--- 51,57 
  #include "commands/tablespace.h"
  #include "commands/trigger.h"
  #include "commands/typecmds.h"
+ #include "common/relpath.h"
  #include "executor/executor.h"
  #include "foreign/foreign.h"
  #include "miscadmin.h"
***
*** 8902,8913  copy_relation_data(SMgrRelation src, SMgrRelation dst,
  
  		smgrread(src, forkNum, blkno, buf);
  
! 		PageSetChecksumInplace(page, blkno);
  
  		/* XLOG stuff */
  		if (use_wal)
  			log_newpage(dst->smgr_rnode.node, forkNum, blkno, page);
  
  		/*
  		 * Now write the page.	We say isTemp = true even if it's not a temp
  		 * rel, because there's no need for smgr to schedule an fsync for this
--- 8903,8923 
  
  		smgrread(src, forkNum, blkno, buf);
  
! 		if (!PageIsVerified(page, blkno))
! 			ereport(ERROR,
! 	(errcode(ERRCODE_DATA_CORRUPTED),
! 	 errmsg("invalid page in block %u of relation %s",
! 			blkno,
! 			relpathbackend(src->smgr_rnode.node,
! 		   src->smgr_rnode.backend,
! 		   forkNum))));
  
  		/* XLOG stuff */
  		if (use_wal)
  			log_newpage(dst->smgr_rnode.node, forkNum, blkno, page);
  
+ 		PageSetChecksumInplace(page, blkno);
+ 
  		/*
  		 * Now write the page.	We say isTemp = true even if it's not a temp
  		 * rel, because there's no need for smgr to schedule an fsync for this

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Limiting setting of hint bits by read-only queries; vacuum_delay

2013-03-25 Thread Jeff Davis
On Mon, 2013-03-25 at 12:21 +, Greg Stark wrote:
 On Mon, Mar 25, 2013 at 2:50 AM, Greg Smith g...@2ndquadrant.com wrote:
  The idea I was thinking about is refactoring the background writer's role in
  hint bit maintenance
 
 A good first step might be to separate the dirty bit into two bits.
 mandatory dirty and optional dirty. (Or maybe hard dirty and
 soft dirty). Hint bit updates could set the latter and then some
 later policy decision could be made about whether to bother writing
 out the buffer if it's only optionally dirty.

I like this idea, but I think there's a problem. When checksums are
enabled, we may need to write WAL at the time the page is dirtied. It
would seem a waste if a full-page image were written but the bgwriter
never wrote the page out because the page was only soft-dirty.

One solution is to make it work as you say, unless checksums are enabled
and it needs to write WAL (which is only for the first modification
after a checkpoint). Unfortunately, it basically means that users of
checksums will still see a big impact from doing a SELECT on a large,
freshly loaded table. So that doesn't really answer Simon's use case.

Maybe it can still be made to work. Let's say that the bgwriter can't
write soft-dirty pages out, it can only ignore them or drop them on the
floor. But VACUUM would promote pages from soft-dirty to dirty at a
controlled rate, and would write out WAL if necessary for checksums.

That should still keep some distance between the WAL writing and the WAL
flushing, but still offer better control than writing it at SELECT time.
Also, it would solve a complaint I have about Simon's proposal: that we
lose the information that we have a changed buffer in shared memory.
Maybe that's not a practical problem, but it seems like we should keep
track of that and have a way to make sure a page gets cleaned if we want
to.

The downside is that it doesn't allow anyone to get the current
behavior, unless we provide an additional GUC to say whether SELECT
causes a page to be marked soft-dirty or dirty. At least that would be a
boolean though, and the finer-grained control can be over VACUUM.
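To make the two-flag idea concrete, here is a toy sketch; the flag names and
the promotion function are hypothetical, not the real buffer manager API:

#include <stdbool.h>
#include <stdint.h>

#define TOY_HARD_DIRTY 0x01	/* must eventually be written out */
#define TOY_SOFT_DIRTY 0x02	/* hint-only change; may be dropped */

typedef struct ToyBuffer
{
	uint16_t	flags;
	bool		wal_logged_this_cycle;	/* FPI emitted since last checkpoint? */
} ToyBuffer;

/* A hint-bit change only soft-dirties the buffer. */
void
toy_mark_dirty_hint(ToyBuffer *buf)
{
	buf->flags |= TOY_SOFT_DIRTY;
}

/*
 * VACUUM (or another throttled process) promotes soft-dirty to dirty at a
 * controlled rate, emitting the WAL needed for checksums if this is the
 * first modification since the last checkpoint.
 */
void
toy_promote_soft_dirty(ToyBuffer *buf, bool checksums_enabled)
{
	if ((buf->flags & TOY_SOFT_DIRTY) == 0)
		return;
	if (checksums_enabled && !buf->wal_logged_this_cycle)
		buf->wal_logged_this_cycle = true;	/* stand-in for writing an FPI */
	buf->flags = (uint16_t) ((buf->flags & ~TOY_SOFT_DIRTY) | TOY_HARD_DIRTY);
}

/* The bgwriter would only consider hard-dirty buffers for writeback. */
bool
toy_bgwriter_should_write(const ToyBuffer *buf)
{
	return (buf->flags & TOY_HARD_DIRTY) != 0;
}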

Regards,
Jeff Davis




-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Enabling Checksums

2013-03-22 Thread Jeff Davis
On Fri, 2013-03-22 at 17:09 +0200, Ants Aasma wrote:
 For performance the K8 results gave me confidence that we have a
 reasonably good overview what the performance is like for the class of
 CPU's that PostgreSQL is likely to run on. I don't think there is
 anything left to optimize there, all algorithms are pretty close to
 maximum theoretical performance.

Great work!

 The worst case workload is set up using
 CREATE TABLE sparse (id serial primary key, v text) WITH (fillfactor=10);
 INSERT INTO sparse (v) SELECT REPEAT('x', 1000) FROM 
 generate_series(1,10);
 VACUUM ANALYZE sparse;
 
 The test query itself is a simple SELECT count(v) FROM sparse;
 
 Results for the worst case workload:
 No checksums:   tps = 14.710519
 Fletcher checksums: tps = 10.825564 (1.359x slowdown)
 CRC checksums:  tps =  5.844995 (2.517x slowdown)
 SIMD checksums: tps = 14.062388 (1.046x slowdown)

I assume this is in the bad region identified by Greg, where there is
no disk activity, but shared_buffers is small, leading to a lot of
movement between the OS cache and shared buffers?

What do you mean by TPS exactly? If the select query is writing hint
bits, then you wouldn't be able to repeat it because they are already
set. So are you repeating the creation/loading of the table, as well?

 Results for pgbench scale 100:
 No checksums:   tps = 56623.819783
 Fletcher checksums: tps = 55282.222687 (1.024x slowdown)
 CRC Checksums:  tps = 50571.324795 (1.120x slowdown)
 SIMD Checksums: tps = 56608.888985 (1.000x slowdown)
 
 So to conclude, the 3 approaches:

Great analysis. Still a tough choice.

One thing that might be interesting is to look at doing SIMD for both
data and WAL. I wonder if that would be a noticeable speedup for WAL
full-page writes? That would give greater justification for the extra
work it will take (intrinsics/ASM), and it would be a nice win for
non-checksum users.

I also notice that
http://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
explicitly mentions adapting FNV to a smaller size. That gives me a
little more confidence. Do you have other links we should read about
this approach, or possible weaknesses?
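For reference, the kind of adaptation that page describes looks like this in
standalone form: compute 32-bit FNV-1a and XOR-fold it down to 16 bits. This
is only meant to illustrate the folding idea, not to propose the exact
page-checksum algorithm:

#include <stddef.h>
#include <stdint.h>

/* Standard 32-bit FNV-1a over a byte buffer. */
uint32_t
fnv1a_32(const unsigned char *buf, size_t len)
{
	uint32_t	hash = 2166136261u;		/* FNV offset basis */
	size_t		i;

	for (i = 0; i < len; i++)
	{
		hash ^= buf[i];
		hash *= 16777619u;				/* FNV prime */
	}
	return hash;
}

/* XOR-fold the 32-bit result down to 16 bits, as suggested for smaller sizes. */
uint16_t
fnv1a_16_folded(const unsigned char *buf, size_t len)
{
	uint32_t	h = fnv1a_32(buf, len);

	return (uint16_t) ((h >> 16) ^ (h & 0xffff));
}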

Regards,
Jeff Davis



-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Enabling Checksums

2013-03-22 Thread Jeff Davis
On Fri, 2013-03-22 at 17:09 +0200, Ants Aasma wrote:
 So to conclude, the 3 approaches:

One other question: assuming that the algorithms use the full 16-bit
space, is there a good way to avoid zero without skewing the result? Can
we do something like un-finalize (after we figure out that it's zero),
compute in an extra salt value, and then re-finalize? That might work
for Fletcher; but I don't think that works for CRC or Fowler-Noll-Vo
because the final value is the same as the state.

I'm still slightly concerned about differentiating checksummed pages in
the future if we want to offer a transition path, since we no longer use
header bits. Avoiding zero might help us there. Hopefully not necessary,
but something we might find useful. Also, it would help us identify
situations where the checksum is never set.
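For what it's worth, one simple post-hoc option (sketch only, not a decided
approach) is to fold the finished value into the range 1..65535, which
reserves zero at the cost of a tiny skew:

#include <stdint.h>

/*
 * Map a raw 16-bit checksum into 1..65535 so that zero can be reserved
 * (e.g. to mean "checksum never set").  Raw values 0 and 65535 both land
 * on 1, so the skew is a single extra collision out of 65536.
 */
uint16_t
reserve_zero(uint16_t raw)
{
	return (uint16_t) ((raw % 65535) + 1);
}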

Regards,
Jeff Davis



-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Default connection parameters for postgres_fdw and dblink

2013-03-22 Thread Jeff Davis
On Fri, 2013-03-22 at 12:19 -0400, Tom Lane wrote:
 Is there a better way to handle all this?  It may be too late to rethink
 dblink's behavior anyhow, but perhaps it's not too late to change
 postgres_fdw.  I think though that once we let 9.3 out the door, it
 *will* be too late to make any major changes, because postgres_fdw's
 usage is going to go through the roof now that it can do remote updates.

The first thing that occurs to me is to have postgres_fdw install some
GUCs with reasonable defaults.

Perhaps the default could be a magic value that is replaced by the
current user or something (similar to search_path).

Regards,
Jeff Davis




-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Enabling Checksums

2013-03-21 Thread Jeff Davis
On Wed, 2013-03-20 at 02:11 +0200, Ants Aasma wrote:
 Fletcher is also still a strong contender, we just need to replace the
 255 modulus with something less prone to common errors, maybe use
 65521 as the modulus. I'd have to think how to best combine the values
 in that case. I believe we can lose the property that neither byte can
 be zero, just avoiding both being zero seems good enough to me.

Agreed on all points.

I've been following your analysis and testing, and it looks like there
are still at least three viable approaches:

1. Some variant of Fletcher
2. Some variant of CRC32
3. Some SIMD-based checksum

Each of those has some open implementation questions, as well. If we
settle on one of those approaches, we don't necessarily need the fastest
implementation right away. I might even argue that the first patch to be
committed should be a simple implementation of whatever algorithm we
choose, and then optimization should be done in a separate patch (if it
is tricky to get right).

Of course, it's hard to settle on the general algorithm to use without
knowing the final performance numbers. So right now I'm in somewhat of a
holding pattern until we settle on something.

Regards,
Jeff Davis




-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Enabling Checksums

2013-03-19 Thread Jeff Davis
On Sat, 2013-03-16 at 20:41 -0400, Tom Lane wrote:
 Simon Riggs si...@2ndquadrant.com writes:
  On 15 March 2013 13:08, Andres Freund and...@2ndquadrant.com wrote:
  I commented on this before, I personally think this property makes 
  fletcher a
  not so good fit for this. Its not uncommon for parts of a block being 
  all-zero
  and many disk corruptions actually change whole runs of bytes.

[ referring to Ants's comment that the existing algorithm doesn't
distinguish between 0x00 and 0xFF ]

 Meh.  I don't think that argument holds a lot of water.  The point of
 having checksums is not so much to notice corruption as to be able to
 point the finger at flaky hardware.  If we have an 8K page with only
 1K of data in it, and we fail to notice that the hardware dropped a lot
 of bits in the other 7K, we're not doing our job; and that's not really
 something to write off, because it would be a lot better if we complain
 *before* the hardware manages to corrupt something valuable.

I will move back to verifying the page hole, as well.

There are a few approaches:

1. Verify that the page hole is zero before write and after read.
2. Include it in the calculation (if we think there are some corner
cases where the hole might not be all zero).
3. Zero the page hole before write, and verify that it's zero on read.
This can be done during the memcpy at no performance penalty in
PageSetChecksumOnCopy(), but that won't work for
PageSetChecksumInplace().

With option #2 or #3, we might also verify that the hole is all-zero if
asserts are enabled.
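A minimal sketch of what option #1 amounts to, with a hypothetical helper
name (the real page layout is defined in bufpage.h):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/*
 * Sketch only: given a page image and its pd_lower/pd_upper offsets, check
 * that the unused hole in the middle is entirely zero.  This is not part
 * of bufpage.c; it just illustrates the verification step.
 */
bool
page_hole_is_zero(const char *page, uint16_t pd_lower, uint16_t pd_upper)
{
	size_t		i;

	if (pd_upper < pd_lower)
		return false;			/* corrupt header; leave that to other checks */

	for (i = pd_lower; i < pd_upper; i++)
	{
		if (page[i] != 0)
			return false;
	}
	return true;
}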

 So I think we'd be best off to pick an algorithm whose failure modes
 don't line up so nicely with probable hardware failure modes.  It's
 worth noting that one of the reasons that CRCs are so popular is
 precisely that they were designed to detect burst errors with high
 probability.

Another option is to use a different modulus. The page
http://en.wikipedia.org/wiki/Fletcher%27s_checksum suggests that a prime
number can be a good modulus for Fletcher-32. Perhaps we could use 251
instead of 255? That would make it less likely to miss a common form of
hardware failure, although it would also reduce the number of possible
checksums slightly (about 4% fewer than 2^16).

I'm leaning toward this option now, or a CRC of some kind if the
performance is reasonable.
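As a standalone illustration of that variant, a byte-wise Fletcher-16 with
the modulus changed to the prime 251 would look like this (the patch itself
accumulates 32-bit additions in a tight loop, so this is only meant to show
the modulus change):

#include <stddef.h>
#include <stdint.h>

/* Fletcher-16-style checksum with a prime modulus of 251 instead of 255. */
uint16_t
fletcher16_mod251(const unsigned char *buf, size_t len)
{
	uint32_t	sum1 = 0;
	uint32_t	sum2 = 0;
	size_t		i;

	for (i = 0; i < len; i++)
	{
		sum1 = (sum1 + buf[i]) % 251;
		sum2 = (sum2 + sum1) % 251;
	}
	return (uint16_t) ((sum2 << 8) | sum1);
}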

Regards,
Jeff Davis



-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Enabling Checksums

2013-03-19 Thread Jeff Davis
On Fri, 2013-03-15 at 14:32 +0200, Ants Aasma wrote:
 The most obvious case here is that you
 can swap any number of bytes from 0x00 to 0xFF or back without
 affecting the hash.

That's a good point. Someone (Simon?) had brought that up before, but
you and Tom convinced me that it's a problem. As I said in my reply to
Tom, one option is to change the modulus.

 I took a look at how the fletcher-64 compiles.

Great analysis, thank you.

 I'm not really sure if parallel checksums would be worth doing or not.
 On one hand, enabling data parallelism would make it more future
 proof, on the other hand, the unvectorized variant is slower than
 Fletcher-64.

Looks like we still have several options being discussed. I think the
checksum with modulo 255 is out, but perhaps a different modulus is
still on the table. And if we can get a CRC to be fast enough, then we'd
all be happy with that option.

Another thing to consider is that, right now, the page is copied and
then checksummed. If we can calculate the checksum during the copy, that
might save us a small amount of effort. My feeling is that it would only
really help if the checksum is very cheap and works on large word sizes,
but I'm not sure.
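A rough standalone sketch of the copy-and-checksum idea, with a placeholder
sum standing in for whichever algorithm is chosen; the point is only that the
copy and the accumulation can share one pass over the page:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Copy 'len' bytes (assumed to be a multiple of 8) from src to dst while
 * folding the data into a simple additive sum.  Illustrative only; the
 * current patch copies first and checksums the copy afterwards.
 */
uint64_t
copy_and_sum(char *dst, const char *src, size_t len)
{
	uint64_t	sum = 0;
	size_t		i;

	for (i = 0; i < len; i += sizeof(uint64_t))
	{
		uint64_t	word;

		memcpy(&word, src + i, sizeof(word));	/* sidesteps alignment issues */
		memcpy(dst + i, &word, sizeof(word));
		sum += word;
	}
	return sum;
}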

 On another note, I think I found a bug with the current latest patch.

Ugh. Great catch, thank you!

Regards,
Jeff Davis




-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Enabling Checksums

2013-03-18 Thread Jeff Davis
On Mon, 2013-03-18 at 13:52 -0400, Bruce Momjian wrote:
 In fact, this feature is going to need
 pg_upgrade changes to detect from pg_controldata that the old/new
 clusters have the same checksum setting.

I believe that has been addressed in the existing patch. Let me know if
you see any problems.

Regards,
Jeff Davis




-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Enabling Checksums

2013-03-18 Thread Jeff Davis
On Sun, 2013-03-17 at 22:26 -0700, Daniel Farina wrote:
 as long as I am able to turn them off easily 

To be clear: you don't get the performance back by doing
ignore_checksum_failure = on. You only get around the error itself,
which allows you to dump/reload the good data.

Regards,
Jeff Davis




-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Enabling Checksums

2013-03-13 Thread Jeff Davis
On Thu, 2013-03-07 at 13:45 -0800, Jeff Davis wrote:
 I need to do another self-review after these changes and some more
 extensive testing, so I might have missed a couple things.

New patch attached.

Aside from rebasing, I also found a problem with temp tables. At first I
was going to fix it by continuing to exclude temp tables from checksums
entirely. But then I re-thought it and decided to just checksum temp
tables, too.

Excluding temp tables from checksums means more special cases in the
code, and more documentation. After thinking about it, there is no huge
benefit to excluding temp tables:
  * small temp tables will be in memory only, and never checksummed
  * no WAL for temp tables, so the biggest cost of checksums is
non-existent
  * there are good reasons to want to checksum temp tables, because they
can be used to stage data for permanent tables

However, I'm willing to be convinced to exclude temp tables again.

Regards,
Jeff Davis



checksums-20130312.patch.gz
Description: GNU Zip compressed data


replace-tli-with-checksums-20130312.patch.gz
Description: GNU Zip compressed data

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Enabling Checksums

2013-03-07 Thread Jeff Davis
On Tue, 2013-03-05 at 11:35 +0200, Heikki Linnakangas wrote:
 If you enable checksums, the free space map never gets updated in a 
 standby. It will slowly drift to be completely out of sync with reality, 
 which could lead to significant slowdown and bloat after failover.

One of the design points of this patch is that those operations that use
MarkBufferDirtyHint(), including tuple hint bits, the FSM, index dead
markers, etc., do not directly go to the standby. That's because the
standby can't write WAL, so it can't protect itself against a torn page
breaking the checksum.

However, these do make it through by riding along with a full-page image
in the WAL. The fact that checksums are enabled means that these full
page images will be written once per modified page per checkpoint, and
then replayed on the standby. FSM should get the updates the same way,
even though no other WAL is written for the FSM.

If full_page_writes are disabled, then the updates will never arrive.
But in that case, I think we can just go ahead and dirty the page during
recovery, because there isn't a real problem. I was hesitant to make
this change in my patch because:
1. I wanted to see if someone saw a flaw in this reasoning; and
2. I noticed that full_page_writes can be changed with a SIGHUP, which
could add complexity (I don't see any reason we allow this... shouldn't
we just force a restart for that change?).

I added a README file, moved some of the explanatory material there, and
tried to clarify this situation.

Let me know if you see a problem that I'm missing. I verified that at
least some FSM changes do make it through with checksums on, but I
didn't dig much deeper than that.

 Since the checksums are an all-or-nothing cluster-wide setting, the 
 three extra flags in the page header, PD_CHECKSUMS1, PD_CHECKSUM2 and 
 PD_HEADERCHECK, are not needed. Let's leave them out. That keeps the 
 code simpler, and leaves the bits free for future use. If we want to 
 enable such per-page setting in the future, we can add it later. For a 
 per-relation scheme, they're not needed.

Removed header bits.

 XLogCheckBuffer() and XLogCheckBufferNeedsBackup() read the page LSN 
 without a lock. That's not atomic, so it could incorrectly determine 
 that a page doesn't need to be backed up. We used to always hold an 
 exclusive lock on the buffer when it's called, which prevents 
 modifications to the LSN, but that's no longer the case.

Fixed. I added a new exported function, BufferGetLSNAtomic().

There was another similar omission in gistget.c.

By the way, I can not find any trace of XLogCheckBufferNeedsBackup(),
was that a typo?

 Shouldn't SetBufferCommitInfoNeedsSave() check the BM_PERMANENT flag? I 
 think it will generate WAL records for unlogged tables as it is.

Fixed.

I also rebased and added a GUC to control whether the checksum failure
causes an error or not.

I need to do another self-review after these changes and some more
extensive testing, so I might have missed a couple things.

Regards,
Jeff Davis


checksums-20130307.patch.gz
Description: GNU Zip compressed data


replace-tli-with-checksums-20130307.patch.gz
Description: GNU Zip compressed data

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Enabling Checksums

2013-03-05 Thread Jeff Davis

Thank you for the review.

On Tue, 2013-03-05 at 11:35 +0200, Heikki Linnakangas wrote:
 If you enable checksums, the free space map never gets updated in a 
 standby. It will slowly drift to be completely out of sync with reality, 
 which could lead to significant slowdown and bloat after failover.

Will investigate.

 Since the checksums are an all-or-nothing cluster-wide setting, the 
 three extra flags in the page header, PD_CHECKSUMS1, PD_CHECKSUM2 and 
 PD_HEADERCHECK, are not needed. Let's leave them out. That keeps the 
 code simpler, and leaves the bits free for future use. If we want to 
 enable such per-page setting in the future, we can add it later. For a 
 per-relation scheme, they're not needed.

They don't really need to be there, I just put them there because it
seemed wise if we ever want to allow online enabling/disabling of
checksums. But I will remove them.

 How does the error detection rate of this compare with e.g CRC-16? Is 
 there any ill effect from truncating the Fletcher sums like this?

I don't recall if I published these results or not, but I loaded a
table, and used pageinspect to get the checksums of the pages. I then
did some various GROUP BY queries to see if I could find any clustering
or stepping of the checksum values, and I could not. The distribution
seemed very uniform across the 255^2 space.

I tried to think of other problems, like missing errors in the high or
low bits of a word or a page (similar to the issue with mod 256
described below), but I couldn't find any. I'm not enough of an expert
to say more than that about the error detection rate.

Fletcher is probably significantly faster than CRC-16, because I'm just
doing int32 addition in a tight loop.

Simon originally chose Fletcher, so perhaps he has more to say.

 That's a bit odd. We don't avoid zero in the WAL crc, and I don't recall 
 seeing that in other checksum implementations either. 16-bits is not 
 very wide for a checksum, and this eats about 1% of the space of valid 
 values.
 
 I can see that it might be a handy debugging aid to avoid 0. But there's 
 probably no need to avoid 0 in both bytes, it seems enough to avoid a 
 completely zero return value.

http://en.wikipedia.org/wiki/Fletcher%27s_checksum

If you look at the section on Fletcher-16, it discusses the choice of
the modulus. If we used 256, then an error anywhere except the lowest
byte of a 4-byte word read from the page would be missed.

Considering that I was using only 255 values anyway, I thought I might
as well shift the values away from zero.

We could get slightly better by using all combinations. I also
considered chopping the 64-bit ints into 16-bit chunks and XORing them
together. But when I saw the fact that we avoided zero with the other
approach, I kind of liked it, and kept it.
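The XOR alternative mentioned above is tiny; folding a 64-bit accumulator
into 16 bits looks like this (sketch only):

#include <stdint.h>

/* Fold a 64-bit value into 16 bits by XORing its four 16-bit chunks. */
uint16_t
fold64_to_16(uint64_t v)
{
	return (uint16_t) ((v >> 48) ^ (v >> 32) ^ (v >> 16) ^ v);
}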

 XLogCheckBuffer() and XLogCheckBufferNeedsBackup() read the page LSN 
 without a lock. That's not atomic, so it could incorrectly determine 
 that a page doesn't need to be backed up. We used to always hold an 
 exclusive lock on the buffer when it's called, which prevents 
 modifications to the LSN, but that's no longer the case.

Will investigate, but it sounds like a buffer header lock will fix it.

 Shouldn't SetBufferCommitInfoNeedsSave() check the BM_PERMANENT flag? I 
 think it will generate WAL records for unlogged tables as it is.

Yes, thank you.

Also, in FlushBuffer(), this patch moves the clearing of the
BM_JUST_DIRTIED bit to before the WAL flush. That seems to expand the
window during which a change to a page will prevent it from being marked
clean. Do you see any performance problem with that?

The alternative is to take the buffer header lock twice: once to get the
LSN, then WAL flush, then another header lock to clear BM_JUST_DIRTIED.
Not sure if that's better or worse. This goes back to Simon's patch, so
he may have a comment here, as well.

I'll post a new patch with these comments addressed, probably tomorrow
so that I have some time to self-review and do some basic testing.

Regards,
Jeff Davis



-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Enabling Checksums

2013-03-04 Thread Jeff Davis
On Mon, 2013-03-04 at 10:36 +0200, Heikki Linnakangas wrote:
 On 04.03.2013 09:11, Simon Riggs wrote:
  Are there objectors?
 
 FWIW, I still think that checksumming belongs in the filesystem, not 
 PostgreSQL.

Doing checksums in the filesystem has some downsides. One is that you
need to use a copy-on-write filesystem like btrfs or zfs, which (by
design) will fragment the heap on random writes. If we're going to start
pushing people toward those systems, we will probably need to spend some
effort to mitigate this problem (aside: my patch to remove
PD_ALL_VISIBLE might get some new wind behind it).

There are also other issues, like what fraction of our users can freely
move to btrfs, and when. If it doesn't happen to be already there, you
need root to get it there, which has never been a requirement before.

I don't fundamentally disagree. We probably need to perform reasonably
well on btrfs in COW mode[1] regardless, because a lot of people will be
using it a few years from now. But there are a lot of unknowns here, and
I'm concerned about tying checksums to a series of things that will be
resolved a few years from now, if ever.

[1] Interestingly, you can turn off COW mode on btrfs, but you lose
checksums if you do.

  If you go ahead with this anyway, at the very least I'd like 
 to see some sort of a comparison with e.g btrfs. How do performance, 
 error-detection rate, and behavior on error compare? Any other metrics 
 that are relevant here?

I suspect it will be hard to get an apples-to-apples comparison here
because of the heap fragmentation, which means that a sequential scan is
not so sequential. That may be acceptable for some workloads but not for
others, so it would get tricky to compare. And any performance numbers
from an experimental filesystem are somewhat suspect anyway.

Also, it's a little more challenging to test corruption on a filesystem,
because you need to find the location of the file you want to corrupt,
and corrupt it out from underneath the filesystem.

Greg may have more comments on this matter.

Regards,
Jeff Davis



-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Enabling Checksums

2013-03-04 Thread Jeff Davis
On Mon, 2013-02-25 at 01:30 -0500, Greg Smith wrote:
 Attached is some bit rot updates to the checksums patches.  The 
 replace-tli one still works fine.  I fixed a number of conflicts in the 
 larger patch.  The one I've attached here isn't 100% to project 
 standards--I don't have all the context diff tools setup yet for 
 example.  I expect to revise this more now that I've got the whole week 
 cleared to work on CF submissions.

Thank you for the rebase. I redid the rebase myself and came up with
essentially the same result, but there was an additional problem that
needed fixing after the materialized view patch. 

I will post a new version tonight that includes those fixes as well as
something to address these recent comments (probably just another GUC).
Further comment in another reply.

Regards,
Jeff Davis



-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Enabling Checksums

2013-03-04 Thread Jeff Davis
On Mon, 2013-03-04 at 11:52 +0800, Craig Ringer wrote:
 I also suspect that at least in the first release it might be desirable
 to have an option that essentially says something's gone horribly wrong
 and we no longer want to check or write checksums, we want a
 non-checksummed DB that can still read our data from before we turned
 checksumming off. Essentially, a way for someone who's trying
 checksumming in production after their staging tests worked out OK to
 abort and go back to the non-checksummed case without having to do a
 full dump and reload.

A recovery option to extract data sounds like a good idea, but I don't
want to go as far as you are suggesting here.

An option to ignore checksum failures (while still printing the
warnings) sounds like all we need here. I think Greg's point that the
page might be written out again (hiding the corruption) is a very good
one, but the same is true for zero_damaged_pages. So we can just still
allow the writes to proceed (including setting the checksum on write),
and the system should be as available as it would be without checksums.

Regards,
Jeff Davis



-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Enabling Checksums

2013-03-04 Thread Jeff Davis
On Mon, 2013-03-04 at 22:13 +0200, Heikki Linnakangas wrote:
 On 04.03.2013 20:58, Greg Smith wrote:
  There
  is no such thing as a stable release of btrfs, and no timetable for when
  there will be one. I could do some benchmarks of that but I didn't think
  they were very relevant. Who cares how fast something might run when it
  may not work correctly? btrfs might as well be /dev/null to me right
  now--sure it's fast, but maybe the data won't be there at all.
 
 This PostgreSQL patch hasn't seen any production use, either. In fact, 
 I'd consider btrfs to be more mature than this patch. Unless you think 
 that there will be some major changes to the worse in performance in 
 btrfs, it's perfectly valid and useful to compare the two.
 
 A comparison with ZFS would be nice too. That's mature, and has checksums.

Is there any reason why we can't have both postgres and filesystem
checksums? The same user might not want both (or might, if neither are
entirely trustworthy yet), but I think it's too early to declare one as
the right solution and the other not. Even with btrfs stable, I
pointed out a number of reasons users might not want it, and reasons
that the project should not depend on it.

Numbers are always nice, but it takes a lot of effort to come up with
them. What kind of numbers are you looking for, and how *specifically*
will those numbers affect the decision?

If btrfs with checksums is 10% slower than ext4 with postgres checksums,
does that mean we should commit the postgres checksums?

On the other side of the coin, if btrfs with checksums is exactly the
same speed as ext4 with no postgres checksums (i.e. checksums are free
if we use btrfs), does that mean postgres checksums should be rejected?

Regards,
Jeff Davis




-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Enabling Checksums

2013-03-04 Thread Jeff Davis
On Sun, 2013-03-03 at 22:18 -0500, Greg Smith wrote:
 As for a design of a GUC that might be useful here, the option itself 
 strikes me as being like archive_mode in its general use.  There is an 
 element of parameters like wal_sync_method or enable_cassert though, 
 where the options available vary depending on how you built the cluster. 
   Maybe name it checksum_level with options like this:
 
 off:  only valid option if you didn't enable checksums with initdb
 enforcing:  full checksum behavior as written right now.
 unvalidated:  broken checksums on reads are ignored.

I think GUCs should be orthogonal to initdb settings. If nothing else,
it's extra effort to get initdb to write the right postgresql.conf.

A single new GUC that prevents checksum failures from causing an error
seems sufficient to address the concerns you, Dan, and Craig raised.

We would still calculate the checksum and print the warning; and then
pass it through the rest of the header checks. If the header checks
pass, then it proceeds. If the header checks fail, and if
zero_damaged_pages is off, then it would still generate an error (as
today).

So: ignore_checksum_failures = on|off ?
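In terms of control flow, the proposal amounts to something like the
following sketch (hypothetical names; the real checks live in the buffer
manager and bufpage code):

#include <stdbool.h>

typedef enum
{
	PAGE_OK,			/* proceed normally */
	PAGE_WARN,			/* warn but proceed (page may be zeroed) */
	PAGE_ERROR			/* reject the page */
} PageVerdict;

/*
 * Sketch of the proposed semantics: a checksum failure becomes a warning
 * when ignore_checksum_failure is on, after which the page still has to
 * pass the usual header sanity checks, subject to zero_damaged_pages,
 * exactly as it would without checksums.
 */
PageVerdict
verify_page_sketch(bool checksum_ok, bool header_ok,
				   bool ignore_checksum_failure, bool zero_damaged_pages)
{
	if (!checksum_ok && !ignore_checksum_failure)
		return PAGE_ERROR;

	if (!header_ok)
		return zero_damaged_pages ? PAGE_WARN : PAGE_ERROR;

	return checksum_ok ? PAGE_OK : PAGE_WARN;
}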

 The main tricky case I see in that is where you read in a page with a 
 busted checksum using unvalidated.  Ideally you wouldn't write such a 
 page back out again, because it's going to hide that it's corrupted in 
 some way already.  How to enforce that though?  Perhaps unvalidated 
 only be allowed in a read-only transaction?

That's a good point. But we already have zero_damaged_pages, which does
something similar. And it's supposed to be a recovery option to get the
data out rather than something to run in online mode. It will still
print the warning, so it won't completely hide the corruption.

Regards,
Jeff Davis



-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Enabling Checksums

2013-03-04 Thread Jeff Davis
On Mon, 2013-03-04 at 13:58 -0500, Greg Smith wrote:
 On 3/4/13 2:11 AM, Simon Riggs wrote:
  It's crunch time. Do you and Jeff believe this patch should be
  committed to Postgres core?
 
 I want to see a GUC to allow turning this off, to avoid the problem I 
 saw where a non-critical header corruption problem can cause an entire 
 page to be unreadable.  A variation on that capable of turning this off 
 altogether, as Craig suggested, is a good idea too.

Based on your comments as well those of Dan and Craig, I am leaning
toward a GUC that causes a checksum failure to be ignored. It will still
emit the checksum failure warning, but proceed.

That will then fall through to the normal header checks we've always
had, and the same zero_damaged_pages option. So, to get past a really
corrupt page, you'd need to set ignore_checksum_failure and
zero_damaged_pages.

 I'll write up a long discussion of filesystem trends and why I think 
 this is more relevant than ever if that's the main objection now.  There 
 is no such thing as a stable release of btrfs, and no timetable for when 
 there will be one.  I could do some benchmarks of that but I didn't 
 think they were very relevant.  Who cares how fast something might run 
 when it may not work correctly?  btrfs might as well be /dev/null to me 
 right now--sure it's fast, but maybe the data won't be there at all. 
 How long has it taken the Linux kernel to reach the point it handles 
 write barriers and fsync correctly?  It does not give me a lot of 
 confidence that now is the time they'll suddenly start executing on 
 database filesystem mechanics perfectly.

I have a similar viewpoint here. It will take significant effort to come
up with anything, and I'm not sure how meaningful the numbers would be.
Even if btrfs is great, this feature is not mutually exclusive with
btrfs:
  * users might not have easy access to run the filesystem
  * they might not trust it
  * they might get poor performance numbers
  * postgres checksums might provide a good test of btrfs checksums, and
vice-versa, until both are stable

Additionally, I don't like the idea of depending so heavily on what
linux is doing. If there are performance problems that affect postgres,
will they fix them? Will they introduce new ones? Are there a zillion
tuneable options that a new user has to get right in order to run
postgres efficiently, and will poor settings mean a bunch of postgres
is slow blog posts?

Regards,
Jeff Davis



-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Enabling Checksums

2013-03-04 Thread Jeff Davis
On Mon, 2013-03-04 at 22:27 +0200, Heikki Linnakangas wrote:
 Yeah, fragmentation will certainly hurt some workloads. But how badly, 
 and which workloads, and how does that compare with the work that 
 PostgreSQL has to do to maintain the checksums? I'd like to see some 
 data on those things.

I think we all would. Btrfs will be a major filesystem in a few years,
and we should be ready to support it.

Unfortunately, it's easier said than done. What you're talking about
seems like a significant benchmark report that encompasses a lot of
workloads. And there's a concern that a lot of it will be invalidated if
they are still improving the performance of btrfs.

 If you're serious enough about your data that you want checksums, you 
 should be able to choose your filesystem.

I simply disagree. I am targeting my feature at casual users. They may
not have a lot of data or a dedicated DBA, but the data they do have
might be very important transactional data.

And right now, if they take a backup of their data, it will contain all
of the corruption from the original. And since corruption is silent
today, then they would probably think the backup is fine, and may delete
the previous good backups.

 An apples-to-apples comparison is to run the benchmark and see what 
 happens. If it gets fragmented as hell on btrfs, and performance tanks 
 because of that, then that's your result. If avoiding fragmentation is 
 critical to the workload, then with btrfs you'll want to run the 
 defragmenter in the background to keep it in order, and factor that into 
 the test case.

Again, easier said than done. To get real fragmentation problems, the
data set needs to be huge, and we need to reach a steady state of this
background defrag process, and a million other things.

 I realize that performance testing is laborious. But we can't skip it 
 and assume that the patch performs fine, because it's hard to benchmark.

You aren't asking me to benchmark the patch in question. You are asking
me to benchmark a filesystem that very few people actually run postgres
on in production. I don't think that's a reasonable requirement.

Regards,
Jeff Davis



-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Enabling Checksums

2013-03-04 Thread Jeff Davis
On Mon, 2013-03-04 at 14:57 -0600, Jim Nasby wrote:
 I suggest we paint that GUC along the lines of
 checksum_failure_log_level, defaulting to ERROR. That way if someone
 wanted completely bury the elogs to like DEBUG they could.

The reason I didn't want to do that is because it's essentially a
recovery feature. A boolean seems more appropriate than a slider.

That's a good point about burying the messages with DEBUG, but I think
it might be slightly over-engineering it. I am willing to change it if
others want it, though.

Regards,
Jeff Davis



-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Enabling Checksums

2013-03-04 Thread Jeff Davis
On Mon, 2013-03-04 at 23:22 +0200, Heikki Linnakangas wrote:
 On 04.03.2013 23:00, Jeff Davis wrote:
  On Mon, 2013-03-04 at 22:27 +0200, Heikki Linnakangas wrote:
  Yeah, fragmentation will certainly hurt some workloads. But how badly,
  and which workloads, and how does that compare with the work that
  PostgreSQL has to do to maintain the checksums? I'd like to see some
  data on those things.
 
  I think we all would. Btrfs will be a major filesystem in a few years,
  and we should be ready to support it.
 
 Perhaps we should just wait a few years? If we suspect that this becomes 
 obsolete in a few years

I do not expect it to be obsolete, even if btrfs is stable and fast
today.

Consider this hypothetical scenario: what if btrfs performs acceptably
well today, but they tune it away from our needs later and it tanks
performance? Then, when we complain, the btrfs people say for DB
workloads, you should turn off COW, or use ext4 or XFS. And then we say
but we want checksums. And then they tell us that real databases do
their own checksums.

Then what?

I don't think that scenario is very outlandish. Postgres is essentially
a COW system (for tuples), and stacking COW on top of COW does not seem
like a good idea (neither for filesystems nor actual cows). So it may be
within reason for the filesystem folks to say we're doing the wrong
thing, and then checksums are our problem again. Additionally, I don't
have a lot of faith that linux will address all of our btrfs complaints
(even legitimate ones) in a reasonable amount of time, if ever.

Regards,
Jeff Davis



-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Enabling Checksums

2013-03-04 Thread Jeff Davis
On Mon, 2013-03-04 at 23:11 +0200, Heikki Linnakangas wrote:
 Of course not. But if we can get away without checksums in Postgres, 
 that's better, because then we don't need to maintain that feature in 
 Postgres. If the patch gets committed, it's not mission accomplished. 
 There will be discussion and need for further development on things like 
 what to do if you get a checksum failure, patches to extend the 
 checksums to cover things like the clog and other non-data files and so 
 forth. And it's an extra complication that will need to be taken into 
 account when developing other new features; in particular, hint bit 
 updates need to write a WAL record. Even if you have all the current 
 hint bits covered, it's an extra hurdle for future patches that might 
 want to have hint bits in e.g new index access methods.

The example you chose of adding a hint bit is a little overstated -- as
far as I can tell, setting a hint bit follows pretty much the same
pattern as before, except that I renamed the function to
MarkBufferDirtyHint().

But I agree in general. If complexity can be removed or avoided, that is
a very good thing. But right now, we have no answer to a real problem
that other databases do have an answer for. To me, the benefit is worth
the cost.

We aren't going down an irreversible path by adding checksums. If every
platform has a good checksumming filesystem and there is no demand for
the postgres code any more, we can deprecate it and remove it. But at
least users would have something between now and then.

 The PostgreSQL project would not be depending on it, any more than the 
 project depends on filesystem snapshots for backup purposes, or the OS 
 memory manager for caching.

I don't understand your analogies at all. We have WAL-protected base
backups so that users can get a consistent snapshot without filesystem
snapshots. To follow the analogy, we want postgres checksums so that the
user can be protected without filesystem checksums.

I would agree with you if we could point users somewhere and actually
recommend something and say "what you're doing now is wrong, do X
instead" (though if there is only one such X, we are dependent on it).
But even if we fast forward to three years from now: if someone shows up
saying that XFS gives him the best performance, but wants checksums,
will we really be able to say "you are wrong to be using XFS; use
Btrfs"?

One of the things I like about postgres is that we don't push a lot of
hard trade-offs on users. Several people (including you) put in effort
recently to support unlogged gist indexes. Are there some huge number of
users there that can't live without unlogged gist indexes? Probably not.
But that is one less thing that potential users have to trade away, and
one less thing to be confused or frustrated about.

I want to get to the point where checksums are the default, and only
advanced users would disable them. If that point comes in the form of
checksumming filesystems that are fast enough and enabled by default on
most of the platforms we support, that's fine with me. But I'm not very
sure that it will happen that way ever, and certainly not soon.

  If btrfs with checksums is 10% slower than ext4 with postgres checksums,
  does that mean we should commit the postgres checksums?
 
 In my opinion, a 10% gain would not be worth it, and we should not 
 commit in that case.
 
  On the other side of the coin, if btrfs with checksums is exactly the
  same speed as ext4 with no postgres checksums (i.e. checksums are free
  if we use btrfs), does that mean postgres checksums should be rejected?
 
 Yes, I think so. I'm sure at least some others will disagree; Greg 
 already made it quite clear that he doesn't care how the performance of 
 this compares with btrfs.

If all paths lead to rejection, what are these tests supposed to
accomplish, exactly?

Regards,
Jeff Davis



-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Enabling Checksums

2013-03-04 Thread Jeff Davis
On Sun, 2013-03-03 at 18:05 -0500, Greg Smith wrote:
 = Test 1 - find worst-case overhead for the checksum calculation on write =
 
 This can hit 25% of runtime when you isolate it out.  I'm not sure if 
 how I'm running this multiple times makes sense yet.  This one is so 
 much slower on my Mac that I can't barely see a change at all.
 
 = Test 2 - worst-case overhead for calculating checksum while reading data =
 
 Jeff saw an 18% slowdown, I get 24 to 32%.  This one bothers me because 
 the hit is going to happen during the very common situation where data 
 is shuffling a lot between a larger OS cache and shared_buffers taking a 
 relatively small fraction.  If that issue were cracked, such that 
 shared_buffers could be 50% of RAM, I think the typical real-world 
 impact of this would be easier to take.

I believe that test 1 and test 2 can be improved a little, if there is a
need. Right now we copy the page and then calculate the checksum on the
copy. If we instead calculate as we're copying, I believe it will make
it significantly faster.

I decided against doing that, because it decreased the readability, and
we can always do that later as an optimization. That should mitigate the
case you have in mind, which is a very legitimate concern. I'll wait for
someone to ask for it, though.

 = Test 3 - worst-case WAL overhead =
 
 This is the really nasty one.  The 10,000,000 rows touched by the SELECT 
 statement here create no WAL in a non-checksum environment.  When 
 checksums are on, 368,513,656 bytes of WAL are written, so about 37 
 bytes per row.

Yeah, nothing we can do about this.

 Right now the whole hint bit mechanism and its overhead are treated as 
 an internal detail that isn't in the regular documentation.  I think 
 committing this sort of checksum patch will require exposing some of the 
 implementation to the user in the documentation, so people can 
 understand what the trouble cases are--either in advance or when trying 
 to puzzle out why they're hitting one of them.

Any particular sections that you think would be good to update?

Thank you for the test results.

Regards,
Jeff Davis




-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Enabling Checksums

2013-01-27 Thread Jeff Davis
On Sat, 2013-01-26 at 23:23 -0500, Robert Haas wrote:
  If we were to try to defer writing the WAL until the page was being
  written, the most it would possibly save is the small XLOG_HINT WAL
  record; it would not save any FPIs.
 
 How is the XLOG_HINT_WAL record kept small and why does it not itself
 require an FPI?

There's a maximum of one FPI per page per cycle, and we need the FPI for
any modified page in this design regardless.

So, deferring the XLOG_HINT WAL record doesn't change the total number
of FPIs emitted. The only savings would be on the trivial XLOG_HINT wal
record itself, because we might notice that it's not necessary in the
case where some other WAL action happened to the page.
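A toy model of that bookkeeping, just to show why deferral saves only the
small record and never an FPI (hypothetical, not the real XLogInsert
machinery):

#include <stdbool.h>
#include <stdint.h>

typedef struct ToyPage
{
	uint64_t	lsn;		/* LSN of the last WAL record touching this page */
} ToyPage;

/* An FPI is needed only if the page hasn't been WAL-logged this cycle. */
bool
hint_needs_fpi(const ToyPage *page, uint64_t redo_ptr)
{
	return page->lsn <= redo_ptr;
}

/*
 * Once any record (hint or otherwise) bumps the page LSN past the redo
 * pointer, later hint-bit sets in the same checkpoint cycle emit nothing,
 * whether or not they are deferred to write-out time.
 */
void
set_hint_bit(ToyPage *page, uint64_t redo_ptr, uint64_t *next_lsn, int *fpi_count)
{
	if (hint_needs_fpi(page, redo_ptr))
	{
		(*fpi_count)++;				/* XLOG_HINT record carrying the FPI */
		page->lsn = (*next_lsn)++;
	}
}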

  At first glance, it seems sound as long as the WAL FPI makes it to disk
  before the data. But to meet that requirement, it seems like we'd need
  to write an FPI and then immediately flush WAL before cleaning a page,
  and that doesn't seem like a win. Do you (or Simon) see an opportunity
  here that I'm missing?
 
 I am not sure that isn't a win.  After all, we can need to flush WAL
 before flushing a buffer anyway, so this is just adding another case -

Right, but if we get the WAL record in earlier, there is a greater
chance that it goes out with some unrelated WAL flush, and we don't need
to flush the WAL to clean the buffer at all. Separating WAL insertions
from WAL flushes seems like a fairly important goal, so I'm a little
skeptical of a proposal to narrow that gap so drastically.

It's hard to analyze without a specific proposal on the table. But if
cleaning pages requires a WAL record followed immediately by a flush, it
seems like that would increase the number of actual WAL flushes we need
to do by a lot.

 and the payoff is that the initial access to a page, setting hint
 bits, is quickly followed by a write operation, we avoid the need for
 any extra WAL to cover the hint bit change.  I bet that's common,
 because if updating you'll usually need to look at the tuples on the
 page and decide whether they are visible to your scan before, say,
 updating one of them

That's a good point, I'm just not sure how avoid that problem without a
lot of complexity or a big cost. It seems like we want to defer the
XLOG_HINT WAL record for a short time; but not wait so long that we need
to clean the buffer or miss a chance to piggyback on another WAL flush.

  By the way, the approach I took was to add the heap buffer to the WAL
  chain of the XLOG_HEAP2_VISIBLE wal record when doing log_heap_visible.
  It seemed simpler to understand than trying to add a bunch of options to
  MarkBufferDirty.
 
 Unless I am mistaken, that's going to heavy penalize the case where
 the user vacuums an insert-only table.  It will emit much more WAL
 than currently.

Yes, that's true, but I think that's pretty fundamental to this
checksums design (and of course it only applies if checksums are
enabled). We need to make sure an FPI is written and the LSN bumped
before we write a page.

That's why I was pushing a little on various proposals to either remove
or mitigate the impact of hint bits (load path, remove PD_ALL_VISIBLE,
cut down on the less-important hint bits, etc.). Maybe those aren't
viable, but that's why I spent time on them.

There are some other options, but I cringe a little bit thinking about
them. One is to simply exclude the PD_ALL_VISIBLE bit from the checksum
calculation, so that a torn page doesn't cause a problem (though
obviously that one bit would be vulnerable to corruption). Another is to
use a double-write buffer, but that didn't seem to go very far. Or, we
could abandon the whole thing and tell people to use ZFS/btrfs/NAS/SAN.

Regards,
Jeff Davis



-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] Enabling Checksums

2013-01-25 Thread Jeff Davis
On Fri, 2013-01-25 at 15:29 -0500, Robert Haas wrote:
 I thought Simon had the idea, at some stage, of writing a WAL record
 to cover hint-bit changes only at the time we *write* the buffer and
 only if no FPI had already been emitted that checkpoint cycle.  I'm
 not sure whether that approach was sound, but if so it seems more
 efficient than this approach.

My patch is based on his original idea; although I've made quite a lot
of changes, I believe that I have stuck to his same basic design w.r.t.
WAL.

This patch does not cause a new FPI to be emitted if one has already
been emitted this cycle. It also does not emit a WAL record at all if an
FPI has already been emitted.

If we were to try to defer writing the WAL until the page was being
written, the most it would possibly save is the small XLOG_HINT WAL
record; it would not save any FPIs.

At first glance, it seems sound as long as the WAL FPI makes it to disk
before the data. But to meet that requirement, it seems like we'd need
to write an FPI and then immediately flush WAL before cleaning a page,
and that doesn't seem like a win. Do you (or Simon) see an opportunity
here that I'm missing?

By the way, the approach I took was to add the heap buffer to the WAL
chain of the XLOG_HEAP2_VISIBLE wal record when doing log_heap_visible.
It seemed simpler to understand than trying to add a bunch of options to
MarkBufferDirty.

Regards,
Jeff Davis






Re: [HACKERS] Enabling Checksums

2013-01-24 Thread Jeff Davis
On Wed, 2013-01-16 at 17:38 -0800, Jeff Davis wrote:
 New version of checksums patch.

And another new version of both patches.

Changes:
 * Rebased.
 * Rename SetBufferCommitInfoNeedsSave to MarkBufferDirtyHint. Now that
it's being used in more places, it makes sense to give it a more generic
name.
 * My colleague, Yingjie He, noticed that the FSM doesn't write any WAL,
and therefore we must protect those operations against torn pages. That
seems simple enough: just use MarkBufferDirtyHint (formerly
SetBufferCommitInfoNeedsSave) instead of MarkBufferDirty (see the sketch
below). The FSM changes are not critical, so the fact that we may lose
the dirty bit is OK.
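
A minimal sketch of that FSM change (assuming the one-argument
MarkBufferDirtyHint() of this patch; fsm_update_sketch() is a made-up
stand-in for the real slot update):

/* Sketch: FSM updates carry no WAL, so mark the buffer dirty the hint-bit way. */
static void
fsm_set_and_mark_sketch(Buffer buf, uint16 slot, uint8 value)
{
    Page    page = BufferGetPage(buf);

    fsm_update_sketch(page, slot, value);   /* made-up stand-in */
    MarkBufferDirtyHint(buf);               /* was: MarkBufferDirty(buf) */
}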

Regards,
Jeff Davis


replace-tli-with-checksums-20130124.patch.gz
Description: GNU Zip compressed data


checksums-20130124.patch.gz
Description: GNU Zip compressed data



Re: [HACKERS] Removing PD_ALL_VISIBLE

2013-01-22 Thread Jeff Davis
On Mon, 2013-01-21 at 12:00 +0200, Heikki Linnakangas wrote:
 On 21.01.2013 11:10, Jeff Davis wrote:
  That confuses me. The testing was to show it didn't hurt other workloads
  (like scans or inserts/updates/deletes); so the best possible result is
  that they don't show signs either way.
 
 I went back to look at the initial test results that demonstrated that 
 keeping the pin on the VM buffer mitigated the overhead of pinning the 
 vm page. The obvious next question is, what is the impact when that's 
 inefficient, ie. when you update pages across different 512 MB sections, 
 so that the vm pin has to be dropped and reacquired repeatedly.
 
 I tried to construct a worst case scenario for that:

I confirmed this result in a single connection (no concurrency). I used
a shared_buffers of 2GB so that the whole table would fit.

Also, I fixed a bug that I noticed along the way, which was an
uninitialized variable. New version attached.

FWIW, I'm considering this patch to be rejected; I just didn't want to
leave a patch with a bug in it.

Regards,
Jeff Davis


rm-pd-all-visible-20130122.patch.gz
Description: GNU Zip compressed data



Re: [HACKERS] Removing PD_ALL_VISIBLE

2013-01-21 Thread Jeff Davis
On Mon, 2013-01-21 at 12:49 +0530, Pavan Deolasee wrote:

 At the minimum your patch will need to have one additional buffer
 pinned for every K < 8192 * 8 heap pages.

I assume it's the same K I referred to when responding to Robert: the
max number of heap buffers we read before we unpin and repin the VM
buffer. Right now it's unlimited, because there is currently no need to
have it at all -- I was just offering the solution in case we did want
to do VM page cleanup in the future or something.

  For most cases, the value of K will be large enough to ignore the
 overhead, but for small values of K, I'm not sure if we can say that
 with confidence.

It's a constant, and we can easily set it high enough that it wouldn't
matter. There's no reason at all to choose a small value for K. Consider
that an index root page pin is held for the entire scan, and we don't
have a problem with that.
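
To make the cost concrete, the repinning amounts to something like the
sketch below (hypothetical function; the divisor ignores the VM page
header): with one visibility bit per heap page, a single pinned VM page
covers roughly BLCKSZ * 8 heap pages, about 512 MB of heap, and the pin
is only swapped when a scan crosses one of those boundaries.

/* Sketch: keep one VM page pinned; swap the pin only at ~512 MB boundaries. */
static void
vm_pin_for_block_sketch(Relation rel, BlockNumber heap_blk, Buffer *vmbuf)
{
    BlockNumber vm_blk = heap_blk / (BLCKSZ * 8);   /* roughly; ignores header */

    if (BufferIsValid(*vmbuf) && BufferGetBlockNumber(*vmbuf) == vm_blk)
        return;                   /* still covered by the page we already hold */

    if (BufferIsValid(*vmbuf))
        ReleaseBuffer(*vmbuf);    /* crossed a boundary: drop and reacquire */
    *vmbuf = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, vm_blk,
                                RBM_NORMAL, NULL);
}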

 Of course, for very small tables the real contention might be
 somewhere else and so this change may not show up anything on the
 benchmarks. Doesn't your own tests for 33 page tables confirm that
 there is indeed contention for this case ? I agree its not huge, but I
 don't know if there are workloads where it will matter.

That appears to happen because the fraction of pinned pages gets too
high (as an aside, it was fairly challenging to construct a test where
that happened). I believe it was mostly solved by adding a threshold for
using the VM, and I can probably solve it completely by doing some more
experiments and finding a better threshold value.
 
 I understand that my patch is probably not going in, 
 
 Sorry about that. I know how that feels. But we need some more reasons
 to justify this change, especially because the performance numbers
 themselves are not showing any signs either way.

That confuses me. The testing was to show it didn't hurt other workloads
(like scans or inserts/updates/deletes); so the best possible result is
that they don't show signs either way.

But yes, I see that others are not interested in the benefits offered by
the patch, which is why I'm giving up on it. If people are concerned
about the costs, then I can fix those; but there's nothing I can do to
increase the benefits.

Regards,
Jeff Davis






Re: [HACKERS] Removing PD_ALL_VISIBLE

2013-01-20 Thread Jeff Davis
On Sun, 2013-01-20 at 22:19 -0500, Tom Lane wrote:
 Robert Haas robertmh...@gmail.com writes:
  On Fri, Jan 18, 2013 at 3:31 AM, Jeff Davis pg...@j-davis.com wrote:
  So, I attached a new version of the patch that doesn't look at the VM
  for tables with fewer than 32 pages. That's the only change.
 
  That certainly seems worthwhile, but I still don't want to get rid of
  this code.  I'm just not seeing a reason why that's something that
  desperately needs to be done.
 
 Yeah, I'm having the same problem.  Despite Jeff's test results, I can't
 help thinking that lack of PD_ALL_VISIBLE *will* hurt us under some
 workloads, and it's not obvious to me what benefit we get from dropping
 it.

OK. I respectfully disagree with the arguments I've seen so far, but we
can all move on.

I already had some more test results (again, thanks to Nathan Boley), so
I finished them up and attached them to the bottom of this email for the
archives (there's always hope, right?).

Regards,
Jeff Davis


Test goal: is 32 an appropriate threshold for using the VM after we
remove PD_ALL_VISIBLE?

Test setup: 500 31-page tables and 500 33-page tables. Test a recent
master build against the patched version, with varying shared_buffers.
The first test is 500 connections each doing 20 iterations of COUNT(*)
against the 500 31-page tables. The next test is the same, except
against the 33-page tables.

Test results:

  (There were a few outliers I discarded as being too fast. It always
happened in the first run, and I strongly suspect the connections began
unevenly, leading to lower concurrency. They didn't seem to favor one
build over another.)

master, 31-page (first column is shared_buffers, second is range of
seconds):
    32MB:  5.8 -  6.1
    64MB:  3.1 -  3.7
    96MB:  1.7 -  2.2
   112MB:  0.6 -  1.1
   128MB:  0.4 -  0.4
   256MB:  0.4 -  0.4

patch, 31-page (doesn't use VM because 31 is below threshold):
    32MB:  5.1 -  5.9
    64MB:  3.4 -  3.8
    96MB:  1.7 -  2.0
   112MB:  0.7 -  1.1
   128MB:  0.4 -  0.5
   256MB:  0.4 -  0.5

master, 33-page:
    32MB:  5.9 -  7.0
    64MB:  4.2 -  4.7
    96MB:  2.4 -  2.8
   112MB:  1.2 -  1.6
   128MB:  0.5 -  0.5
   256MB:  0.4 -  0.5

patch, 33-page (does use VM because 33 is above threshold):
    32MB:  6.2 -  7.2
    64MB:  4.4 -  4.7
    96MB:  2.8 -  3.0
   112MB:  1.7 -  1.8
   128MB:  0.5 -  1.0
   256MB:  0.4 -  0.5

Conclusion:

32 pages is a low enough threshold that skipping the VM for tables below
it has no measurable cost.

When the 500 tables are 33 pages, and it does use the VM, we do see a
measurable cost under cache pressure. The penalty is fairly small (10%
ballpark), and this is a worst-case, so I don't think it's a problem.
From the last test results, we know it gets back to even by the time the
tables are 128 pages (1MB). So it could be that there is a slightly
higher threshold (between 32 and 128) that would be slightly better. But
given how specific this case is and how small the penalty is, I think 32
is a fine threshold.
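
Stated as code, the heuristic is nothing more than this (constant and
function names are made up for illustration):

#define VM_USE_THRESHOLD    32      /* heap pages; below this, skip the VM */

/* Sketch: consult the visibility map only for tables at or above the threshold. */
static bool
scan_uses_vm_sketch(BlockNumber nblocks)
{
    return nblocks >= VM_USE_THRESHOLD;
}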

Also, to reiterate, I focused my tests almost entirely on scans, though
some of the concern was around inserts/updates/deletes. I tried, but was
unable to show any difference on those tests. I suspect that the
bottleneck is elsewhere.






Re: [HACKERS] Removing PD_ALL_VISIBLE

2013-01-20 Thread Jeff Davis
On Mon, 2013-01-21 at 11:27 +0530, Pavan Deolasee wrote:
 I tend to agree. When I looked at the patch, I thought since its
 removing a WAL record (and associated redo logic), it has some
 additional value. But that was kind of broken (sorry, I haven't looked
 at the latest patch if Jeff fixed it without requiring to reinstate
 the WAL logging). I also thought that bug invalidates the performance
 numbers reported.

I revalidated the performance numbers after reinstating the WAL logging.
No difference (which is unsurprising, since my tests were read-only).

  Of course, there is an argument that this patch will
 simplify the code, but I'm not sure if its enough to justify the
 additional contention which may or may not show up in the benchmarks
 we are running, but we know its there.

What additional contention? How do you know it's there?

The design is to keep the VM page pinned, and to read it without
requiring locks (like index-only scans already do). So I don't see any
obvious additional contention there, unless you're talking about the
work the CPU does for cache coherency (which did not show up in any of
my tests).

I understand that my patch is probably not going in, but I would like to
be clear about what is a known practical problem, what is a theoretical
problem that has eluded my testing capabilities, and what is no problem
at all.

Regards,
Jeff Davis






Re: [HACKERS] gistchoose vs. bloat

2013-01-20 Thread Jeff Davis
On Mon, 2013-01-21 at 00:48 -0500, Tom Lane wrote:
 I looked at this patch.  ISTM we should not have the option at all but
 just do it always.  I cannot believe that always-go-left is ever a
 preferable strategy in the long run; the resulting imbalance in the
 index will surely kill any possible benefit.  Even if there are some
 cases where it miraculously fails to lose, how many users are going to
 realize that applies to their case and make use of the option?

Sounds good to me.

If I remember correctly, there was also an argument that the option may
be useful for repeatable test results. That's a little questionable for
performance testing (except in those cases where few penalties are
identical anyway), but it could plausibly be useful for reproducing a
crash report or something.

Regards,
Jeff Davis





Re: [HACKERS] Removing PD_ALL_VISIBLE

2013-01-18 Thread Jeff Davis
On Thu, 2013-01-17 at 14:53 -0800, Jeff Davis wrote:
 Test plan:
 
   1. Take current patch (without skip VM check for small tables
 optimization mentioned above).
   2. Create 500 tables each about 1MB.
   3. VACUUM them all.
   4. Start 500 connections (one for each table)
   5. Time the running of a loop that executes a COUNT(*) on that
 connection's table 100 times.

Done, with a few extra variables. Again, thanks to Nathan Boley for
lending me the 64-core box. Test program attached.

I did both 1MB tables and 1-tuple tables, but I ended up throwing out
the 1-tuple table results. First of all, as I said, that's a pretty easy
problem to solve, so not really what I want to test. Second, I had to do
so many iterations that I don't think I was testing anything useful. I
did see what might have been a couple of differences, but I would need
to explore them in more detail and I don't think it's worth it, so I'm
just reporting on the 1MB tables.

For each test, each of 500 connections runs 10 iterations of a COUNT(*)
on its own 1MB table (which is vacuumed and has the VM bit set). The
query is prepared once. The table has only an int column.

The variable is shared_buffers, going from 32MB (near exhaustion for 500
connections) to 2048MB (everything fits).

The last column is the time range in seconds. I included the range this
time because there was more variance in the runs, but I still think they
are good test results.

master:
    32MB: 16.4 - 18.9
    64MB: 16.9 - 17.3
   128MB: 17.5 - 17.9
   256MB: 14.7 - 15.8
   384MB:  8.1 -  9.3
   448MB:  4.3 -  9.2
   512MB:  1.7 -  2.2
   576MB:  0.6 -  0.6
  1024MB:  0.6 -  0.6
  2048MB:  0.6 -  0.6

patch:
    32MB: 16.8 - 17.6
    64MB: 17.1 - 17.5
   128MB: 17.2 - 18.0
   256MB: 14.8 - 16.2
   384MB:  8.0 - 10.1
   448MB:  4.6 -  7.2
   512MB:  2.0 -  2.6
   576MB:  0.6 -  0.6
  1024MB:  0.6 -  0.6
  2048MB:  0.6 -  0.6

Conclusion:

I see about what I expect: a precipitous drop in runtime after
everything fits in shared_buffers (500 1MB tables means the inflection
point around 512MB makes a lot of sense). There does seem to be a
measurable difference right around that inflection point, but it's not
much. Considering that this is the worst case that I could devise, I am
not too concerned about this.

However, it is interesting to see that there really is a lot of
maintenance work being done when we need to move pages in and out of
shared buffers. I'm not sure that it's related to the freelists though.

For the extra pins to really be a problem, I think a much higher
percentage of the buffers would need to be pinned. Since the case we are
worried about involves scans (if it involved indexes, that would already
be using more than one pin per scan), the only way to get to a high
percentage of pinned buffers is to have very small tables. But we don't
really need to use the VM when scanning very small tables (the overhead
would be elsewhere), so I think we're OK.

So, I attached a new version of the patch that doesn't look at the VM
for tables with fewer than 32 pages. That's the only change.

Regards,
Jeff Davis

#include <libpq-fe.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#define QSIZE 256

/* Each child connects once, prepares its query, and runs it niter times. */
void
test(char *query, int procnum, int niter)
{
  PGconn    *conn;
  PGresult  *result;
  int        i;

  conn = PQconnectdb("host=/tmp dbname=postgres");
  if (PQstatus(conn) != CONNECTION_OK)
    {
      fprintf(stderr, "connection failed!\n");
      exit(1);
    }

  result = PQprepare(conn, "q", query, 0, NULL);
  if (PQresultStatus(result) != PGRES_COMMAND_OK)
    {
      fprintf(stderr, "PREPARE failed: %s", PQerrorMessage(conn));
      PQclear(result);
      exit(1);
    }
  PQclear(result);

  for (i = 0; i < niter; i++)
    {
      result = PQexecPrepared(conn, "q", 0, NULL, NULL, NULL, 0);
      if (PQresultStatus(result) != PGRES_TUPLES_OK)
        {
          fprintf(stderr, "EXECUTE PREPARED failed: %s\n", PQerrorMessage(conn));
          PQclear(result);
          exit(1);
        }
      PQclear(result);
    }

  PQfinish(conn);
}

int
main(int argc, char *argv[])
{
  int    niter;
  int    nprocs;
  char   query[QSIZE];
  int    i;
  pid_t *procs;
  struct timeval tv1, tv2;

  if (argc != 3)
    {
      fprintf(stderr, "expected 3 arguments, got %d\n", argc);
      exit(1);
    }

  nprocs = atoi(argv[1]);
  niter = atoi(argv[2]);

  procs = malloc(sizeof(pid_t) * nprocs);

  gettimeofday(&tv1, NULL);

  /* Fork one child per table; child i runs COUNT(*) against table mb_i. */
  for (i = 0; i < nprocs; i++)
    {
      pid_t pid = fork();
      if (pid == 0)
        {
          snprintf(query, QSIZE, "SELECT COUNT(*) FROM mb_%d;", i);
          test(query, i, niter);
          exit(0);
        }
      else
        {
          procs[i] = pid;
        }
    }

  /* Wait for every child and bail out if any of them failed. */
  for (i = 0; i < nprocs; i++)
    {
      int status;
      waitpid(procs[i], &status, 0);
      if (!WIFEXITED(status))
        {
          fprintf(stderr, "child did not exit!\n");
          exit(1);
        }
      if (WEXITSTATUS(status) != 0)
        {
          fprintf(stderr, "child exited with status %d\n", WEXITSTATUS(status));
          exit(1);
        }
    }

  gettimeofday(&tv2, NULL);

  free(procs);

  /* Ending reconstructed (truncated in the archive): report elapsed time. */
  printf("elapsed: %.3f s\n",
         (tv2.tv_sec - tv1.tv_sec) + (tv2.tv_usec - tv1.tv_usec) / 1000000.0);

  return 0;
}
