Folks,

What:

    Please find attached a patch for 9.2-to-be which implements page
    checksums.  It changes the page format, so it's an initdb-forcing
    change.

How: 
    In order to ensure that the checksum actually matches the hint
    bits, this makes a copy of the page, calculates the checksum, then
    sends the checksum and copy to the kernel, which handles sending
    it the rest of the way to persistent storage.

Why:
    My employer, VMware, thinks it's a good thing, and has dedicated
    engineering resources to it.  Lots of people's data is already in
    cosmic ray territory, and many others' data will be soon.  And
    it's a TODO :)

If this introduces new failure modes, please detail, and preferably
demonstrate, just what those new modes are.  As far as we've been able
to determine so far, it could expose on-disk corruption that wasn't
exposed before, but we see this as dealing with a previously
un-dealt-with failure rather than causing one.

Questions, comments and bug fixes are, of course, welcome.

Let the flames begin!

Cheers,
David.
-- 
David Fetter <da...@fetter.org> http://fetter.org/
Phone: +1 415 235 3778  AIM: dfetter666  Yahoo!: dfetter
Skype: davidfetter      XMPP: david.fet...@gmail.com
iCal: webcal://www.tripit.com/feed/ical/people/david74/tripit.ics

Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 0cc3296..a5c20b3 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1685,6 +1685,7 @@ SET ENABLE_SEQSCAN TO OFF;
         data corruption, after a system failure. The risks are similar to 
turning off
         <varname>fsync</varname>, though smaller, and it should be turned off
         only based on the same circumstances recommended for that parameter.
+        This parameter must be on when <varname>page_cksum</varname> is on.
        </para>
 
        <para>
@@ -1701,6 +1702,20 @@ SET ENABLE_SEQSCAN TO OFF;
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-page-cksum" xreflabel="page_cksum">
+      <term><varname>page_cksum</varname> (<type>boolean</type>)</term>
+      <indexterm>
+       <primary><varname>page_cksum</varname>configuration parameter</primary>
+       </indexterm>
+       <listitem>
+        <para>
+         When this parameter is on, the
+         <productname>PostgreSQL</productname> server writes and
+         checks checksums for each page written to persistent storage.
+        </para>
+       </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-wal-buffers" xreflabel="wal_buffers">
       <term><varname>wal_buffers</varname> (<type>integer</type>)</term>
       <indexterm>
diff --git a/src/backend/postmaster/postmaster.c 
b/src/backend/postmaster/postmaster.c
index 963189d..0fa5f68 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -314,6 +314,9 @@ extern char *optarg;
 extern int     optind,
                        opterr;
 
+extern int page_checksum;
+extern bool fullPageWrites;
+
 #ifdef HAVE_INT_OPTRESET
 extern int     optreset;                       /* might not be declared by 
system headers */
 #endif
@@ -766,6 +769,29 @@ PostmasterMain(int argc, char *argv[])
                                (errmsg("WAL streaming (max_wal_senders > 0) 
requires wal_level \"archive\" or \"hot_standby\"")));
 
        /*
+        * The idea here is that there will be checksum matches if there
+        * are partial writes to pages during hardware crashes.  The user
+        * should have full_page_writes enabled if page_checksum is
+        * enabled so that these pages are automatically fixed, otherwise
+        * PostgreSQL may get checksum errors after crashes on pages that
+        * are in fact partially written and hence corrupted.  With
+        * full_page_writes enabled, PostgreSQL will replace each page
+        * without ever looking at the partially-written page and seeing
+        * an incorrect checksum.  Hence, checksums will detect only real
+        * disk corruptions, i.e. places where the disk reported a
+        * successful write but the data was still corrupted at some
+        * point.
+        *
+        * Alternatively, we may want to leave this check out.  This would
+        * be for sophisticated users who have some other guarantee
+        * (hardware and/or software) against ever producing a partial
+        * write during crashes.
+        */
+       if (page_checksum && !fullPageWrites)
+               ereport(ERROR,
+                               (errmsg("full_page_writes must be enabled if 
page_checksum is enabled.")));
+
+       /*
         * Other one-time internal sanity checks can go here, if they are fast.
         * (Put any slow processing further down, after postmaster.pid 
creation.)
         */
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 4f607cd..1756d62 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -17,6 +17,7 @@
  */
 #include "postgres.h"
 
+#include "catalog/catalog.h"
 #include "commands/tablespace.h"
 #include "storage/bufmgr.h"
 #include "storage/ipc.h"
@@ -79,6 +80,12 @@ static const int NSmgr = lengthof(smgrsw);
  */
 static HTAB *SMgrRelationHash = NULL;
 
+/* Page checksumming. */
+static uint64 tempbuf[BLCKSZ/sizeof(uint64)];
+extern bool page_checksum;
+
+#define INVALID_CKSUM 0x1b0af034
+
 /* local function prototypes */
 static void smgrshutdown(int code, Datum arg);
 
@@ -381,6 +388,59 @@ smgrdounlink(SMgrRelation reln, ForkNumber forknum, bool 
isRedo)
 }
 
 /*
+ * The initial value when computing the checksum for a data page.
+ */
+static inline uint64
+ChecksumInit(SMgrRelation reln, ForkNumber f, BlockNumber b)
+{
+       return b + f;
+}
+
+/*
+ * Compute a checksum of a buffer (with length len), using initial
+ * value cksum.  We use a relatively simple checksum calculation to
+ * avoid overhead, but could replace with some kind of CRC
+ * calculation.
+ */
+static inline uint32
+ComputeChecksum(uint64 *buffer, uint32 len, uint64 cksum)
+{
+       int i;
+
+       for (i = 0; i < len/sizeof(uint64); i += 4) {
+               cksum += (cksum << 5) + *buffer;
+               cksum += (cksum << 5) + *(buffer+1);
+               cksum += (cksum << 5) + *(buffer+2);
+               cksum += (cksum << 5) + *(buffer+3);
+               buffer += 4;
+       }
+       cksum = (cksum & 0xFFFFFFFF) + (cksum >> 32);
+       return cksum;
+}
+
+/*
+ * Copy buffer to dst and compute the checksum during the copy (so
+ * that the checksum is correct for the final contents fo dst).
+ */
+static inline uint32
+CopyAndComputeChecksum(uint64 *dst, volatile uint64 *buffer,
+                                          uint32 len, uint64 cksum)
+{
+       int i;
+
+       for (i = 0; i < len/sizeof(uint64); i += 4) {
+               cksum += (cksum << 5) + (*dst = *buffer);
+               cksum += (cksum << 5) + (*(dst+1) = *(buffer+1));
+               cksum += (cksum << 5) + (*(dst+2) = *(buffer+2));
+               cksum += (cksum << 5) + (*(dst+3) = *(buffer+3));
+               dst += 4;
+               buffer += 4;
+       }
+       cksum = (cksum & 0xFFFFFFFF) + (cksum >> 32);
+       return cksum;
+}
+
+/*
  *     smgrextend() -- Add a new block to a file.
  *
  *             The semantics are nearly the same as smgrwrite(): write at the
@@ -393,8 +453,25 @@ void
 smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
                   char *buffer, bool skipFsync)
 {
+       PageHeader p;
+       Assert(PageGetPageLayoutVersion(((PageHeader)buffer)) == 
PG_PAGE_LAYOUT_VERSION ||
+                       PageIsNew(buffer));
+       if (page_checksum) {
+               p = (PageHeader)tempbuf;
+               ((PageHeader)buffer)->cksum = 0;
+               /*
+                * We copy and compute the checksum, and then write out the
+                * data from the copy to avoid any problem with hint bits
+                * changing after we compute the checksum.
+                */
+               p->cksum = CopyAndComputeChecksum(tempbuf, (uint64 *)buffer, 
BLCKSZ,
+                                                                               
  ChecksumInit(reln, forknum, blocknum));
+       } else {
+               p = (PageHeader)buffer;
+               p->cksum = INVALID_CKSUM;
+       }
        (*(smgrsw[reln->smgr_which].smgr_extend)) (reln, forknum, blocknum,
-                                                                               
           buffer, skipFsync);
+                                                                               
           (char *)p, skipFsync);
 }
 
 /*
@@ -418,7 +495,29 @@ void
 smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
                 char *buffer)
 {
+       PageHeader p = (PageHeader) buffer;
        (*(smgrsw[reln->smgr_which].smgr_read)) (reln, forknum, blocknum, 
buffer);
+       Assert(PageIsNew(p) || PageGetPageLayoutVersion(p) == 
PG_PAGE_LAYOUT_VERSION);
+       if (page_checksum && p->cksum != INVALID_CKSUM) {
+               const uint32 diskCksum = p->cksum;
+               uint32 cksum;
+
+               p->cksum = 0;
+               cksum = ComputeChecksum((uint64 *)buffer, BLCKSZ,
+                                                               
ChecksumInit(reln, forknum, blocknum));
+               
+               if (cksum != diskCksum) {
+                       ereport(PANIC, (0, errmsg("checksum mismatch: disk has 
%#x, should be %#x\n"
+                                                                         
"filename %s, BlockNum %u, block specifier %d/%d/%d/%d/%u",
+                                                                         
diskCksum, (uint32)cksum,
+                                                                         
relpath(reln->smgr_rnode, forknum),
+                                                                         
blocknum,
+                                                                         
reln->smgr_rnode.node.spcNode,
+                                                                         
reln->smgr_rnode.node.dbNode,
+                                                                         
reln->smgr_rnode.node.relNode,
+                                                                         
forknum, blocknum)));
+               }
+       }
 }
 
 /*
@@ -440,8 +539,25 @@ void
 smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
                  char *buffer, bool skipFsync)
 {
+       PageHeader p;
+
+       if (page_checksum) {
+               p = (PageHeader)tempbuf;
+               ((PageHeader)buffer)->cksum = 0;
+               /*
+                * We copy and compute the checksum, then write out the data
+                * from the copy so that we avoid any problem with hint bits
+                * changing after we compute the checksum.
+                */
+               p->cksum = CopyAndComputeChecksum(tempbuf, (uint64 *)buffer, 
BLCKSZ,
+                                                                               
  ChecksumInit(reln, forknum, blocknum));
+       } else {
+               p = (PageHeader)buffer;
+               p->cksum = INVALID_CKSUM;
+       }
+       Assert(PageGetPageLayoutVersion(p) == PG_PAGE_LAYOUT_VERSION);
        (*(smgrsw[reln->smgr_which].smgr_write)) (reln, forknum, blocknum,
-                                                                               
          buffer, skipFsync);
+                                                                               
          (char *)p, skipFsync);
 }
 
 /*
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index da7b6d4..332b960 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -419,6 +419,7 @@ bool                default_with_oids = false;
 bool           SQL_inheritance = true;
 
 bool           Password_encryption = true;
+bool           page_checksum = true;
 
 int                    log_min_error_statement = ERROR;
 int                    log_min_messages = WARNING;
@@ -1438,6 +1439,14 @@ static struct config_bool ConfigureNamesBool[] =
                NULL, NULL, NULL
        },
 
+       {
+               {"page_checksum", PGC_POSTMASTER, CUSTOM_OPTIONS,
+                       gettext_noop("Enable disk page checksums."),
+                       NULL,
+               },
+               &page_checksum, true, NULL, NULL
+       },
+
        /* End-of-list marker */
        {
                {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample 
b/src/backend/utils/misc/postgresql.conf.sample
index 315db46..6a107b9 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -167,6 +167,8 @@
                                        #   fsync_writethrough
                                        #   open_sync
 #full_page_writes = on                 # recover from partial page writes
+#page_cksum = on            # checksum disk pages
+
 #wal_buffers = -1                      # min 32kB, -1 sets based on 
shared_buffers
                                        # (change requires restart)
 #wal_writer_delay = 200ms              # 1-10000 milliseconds
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index 14e177d..05ae537 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
  */
 
 /*                                                     yyyymmddN */
-#define CATALOG_VERSION_NO     201112071
+#define CATALOG_VERSION_NO     201112141
 
 #endif
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index 42d6b10..847f157 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -132,6 +132,7 @@ typedef struct PageHeaderData
        LocationIndex pd_special;       /* offset to start of special space */
        uint16          pd_pagesize_version;
        TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */
+       uint32          cksum;                  /* page checksum */
        ItemIdData      pd_linp[1];             /* beginning of line pointer 
array */
 } PageHeaderData;
 
@@ -165,8 +166,9 @@ typedef PageHeaderData *PageHeader;
  * Release 8.3 uses 4; it changed the HeapTupleHeader layout again, and
  *             added the pd_flags field (by stealing some bits from pd_tli),
  *             as well as adding the pd_prune_xid field (which enlarges the 
header).
+ *     Release 9.2 uses 5; we added checksums to heap, index and fsm files.
  */
-#define PG_PAGE_LAYOUT_VERSION         4
+#define PG_PAGE_LAYOUT_VERSION         5
 
 
 /* ----------------------------------------------------------------
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to