On Mon, Mar 4, 2019 at 11:31 AM Julien Rouhaud <rjuju...@gmail.com> wrote:

> On Fri, Feb 22, 2019 at 3:01 PM Magnus Hagander <mag...@hagander.net>
> wrote:
> >
> > It tracks things that happen in the general backends. Possibly we should
> also consider counting the errors actually found when running base backups?
> OTOH, that part of the code doesn't really track things like databases (as
> it operates just on the raw data directory underneath), so that
> implementation would definitely not be as clean...
>
> Sorry I just realized that I totally forgot this part of the thread.
>
> While it's true that we operate on raw directory, I see that sendDir()
> already setup a isDbDir var, and if this is true lastDir should
> contain the oid of the underlying database.  Wouldn't it be enough to
> call sendFile() using this, something like (untested):
>
> if (!sizeonly)
> - sent = sendFile(pathbuf, pathbuf + basepathlen + 1, &statbuf, true);
> + sent = sendFile(pathbuf, pathbuf + basepathlen + 1, &statbuf, true,
> isDbDir ? pg_atoi(lastDir+1, 4) : InvalidOid);
>
> and accordingly report any checksum error from sendFile()?
>

That seems it was easy enough. PFA an updated patch that does this, and
also rebased so it doesn't conflict on oid.

(yes, while moving this from draft to publish after lunch, I realized that
you put a patch togerher for about the same. So let's merge it)

-- 
 Magnus Hagander
 Me: https://www.hagander.net/ <http://www.hagander.net/>
 Work: https://www.redpill-linpro.com/ <http://www.redpill-linpro.com/>
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 0e73cdcdda..e2630fd368 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -2509,6 +2509,11 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
      <entry>Number of deadlocks detected in this database</entry>
     </row>
     <row>
+     <entry><structfield>checksum_failures</structfield></entry>
+     <entry><type>bigint</type></entry>
+     <entry>Number of data page checksum failures detected in this database</entry>
+    </row>
+    <row>
      <entry><structfield>blk_read_time</structfield></entry>
      <entry><type>double precision</type></entry>
      <entry>Time spent reading data file blocks by backends in this database,
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 3e229c693c..7723f01327 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -823,6 +823,7 @@ CREATE VIEW pg_stat_database AS
             pg_stat_get_db_temp_files(D.oid) AS temp_files,
             pg_stat_get_db_temp_bytes(D.oid) AS temp_bytes,
             pg_stat_get_db_deadlocks(D.oid) AS deadlocks,
+            pg_stat_get_db_checksum_failures(D.oid) AS checksum_failures,
             pg_stat_get_db_blk_read_time(D.oid) AS blk_read_time,
             pg_stat_get_db_blk_write_time(D.oid) AS blk_write_time,
             pg_stat_get_db_stat_reset_time(D.oid) AS stats_reset
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 81c6499251..43ec33834b 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -334,6 +334,7 @@ static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
 static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
 static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len);
 static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len);
+static void pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len);
 static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len);
 
 /* ------------------------------------------------------------
@@ -1518,6 +1519,40 @@ pgstat_report_deadlock(void)
 	pgstat_send(&msg, sizeof(msg));
 }
 
+
+
+/* --------
+ * pgstat_report_checksum_failures_in_db(dboid, failure_count) -
+ *
+ *	Tell the collector about one or more checksum failures.
+ * --------
+ */
+void
+pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
+{
+	PgStat_MsgChecksumFailure msg;
+
+	if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+		return;
+
+	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_CHECKSUMFAILURE);
+	msg.m_databaseid = dboid;
+	msg.m_failurecount = failurecount;
+	pgstat_send(&msg, sizeof(msg));
+}
+
+/* --------
+ * pgstat_report_checksum_failure() -
+ *
+ *	Tell the collector about a checksum failure.
+ * --------
+ */
+void
+pgstat_report_checksum_failure(void)
+{
+	pgstat_report_checksum_failures_in_db(MyDatabaseId, 1);
+}
+
 /* --------
  * pgstat_report_tempfile() -
  *
@@ -4455,6 +4490,10 @@ PgstatCollectorMain(int argc, char *argv[])
 					pgstat_recv_tempfile((PgStat_MsgTempFile *) &msg, len);
 					break;
 
+				case PGSTAT_MTYPE_CHECKSUMFAILURE:
+					pgstat_recv_checksum_failure((PgStat_MsgChecksumFailure *) &msg, len);
+					break;
+
 				default:
 					break;
 			}
@@ -4554,6 +4593,7 @@ reset_dbentry_counters(PgStat_StatDBEntry *dbentry)
 	dbentry->n_temp_files = 0;
 	dbentry->n_temp_bytes = 0;
 	dbentry->n_deadlocks = 0;
+	dbentry->n_checksum_failures = 0;
 	dbentry->n_block_read_time = 0;
 	dbentry->n_block_write_time = 0;
 
@@ -6197,6 +6237,22 @@ pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len)
 }
 
 /* ----------
+ * pgstat_recv_checksum_failure() -
+ *
+ *	Process a CHECKSUMFAILURE message.
+ * ----------
+ */
+static void
+pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len)
+{
+	PgStat_StatDBEntry *dbentry;
+
+	dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
+
+	dbentry->n_checksum_failures += msg->m_failurecount;
+}
+
+/* ----------
  * pgstat_recv_tempfile() -
  *
  *	Process a TEMPFILE message.
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index def6c03dd0..6c324a6661 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -58,7 +58,7 @@ typedef struct
 static int64 sendDir(const char *path, int basepathlen, bool sizeonly,
 		List *tablespaces, bool sendtblspclinks);
 static bool sendFile(const char *readfilename, const char *tarfilename,
-		 struct stat *statbuf, bool missing_ok);
+		 struct stat *statbuf, bool missing_ok, Oid dboid);
 static void sendFileWithContent(const char *filename, const char *content);
 static int64 _tarWriteHeader(const char *filename, const char *linktarget,
 				struct stat *statbuf, bool sizeonly);
@@ -342,7 +342,7 @@ perform_base_backup(basebackup_options *opt)
 							(errcode_for_file_access(),
 							 errmsg("could not stat file \"%s\": %m",
 									XLOG_CONTROL_FILE)));
-				sendFile(XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf, false);
+				sendFile(XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf, false, InvalidOid);
 			}
 			else
 				sendTablespace(ti->path, false);
@@ -592,7 +592,7 @@ perform_base_backup(basebackup_options *opt)
 						(errcode_for_file_access(),
 						 errmsg("could not stat file \"%s\": %m", pathbuf)));
 
-			sendFile(pathbuf, pathbuf, &statbuf, false);
+			sendFile(pathbuf, pathbuf, &statbuf, false, InvalidOid);
 
 			/* unconditionally mark file as archived */
 			StatusFilePath(pathbuf, fname, ".done");
@@ -1302,7 +1302,7 @@ sendDir(const char *path, int basepathlen, bool sizeonly, List *tablespaces,
 
 			if (!sizeonly)
 				sent = sendFile(pathbuf, pathbuf + basepathlen + 1, &statbuf,
-								true);
+								true, isDbDir ? pg_atoi(lastDir + 1, sizeof(Oid), 0) : InvalidOid);
 
 			if (sent || sizeonly)
 			{
@@ -1358,12 +1358,15 @@ is_checksummed_file(const char *fullpath, const char *filename)
  *
  * If 'missing_ok' is true, will not throw an error if the file is not found.
  *
+ * If dboid is anything other than InvalidOid then any checksum failures detected
+ * will get reported to the stats collector.
+ *
  * Returns true if the file was successfully sent, false if 'missing_ok',
  * and the file did not exist.
  */
 static bool
 sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf,
-		 bool missing_ok)
+		 bool missing_ok, Oid dboid)
 {
 	FILE	   *fp;
 	BlockNumber blkno = 0;
@@ -1580,6 +1583,9 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf
 		ereport(WARNING,
 				(errmsg("file \"%s\" has a total of %d checksum verification "
 						"failures", readfilename, checksum_failures)));
+
+		if (dboid != InvalidOid)
+			pgstat_report_checksum_failures_in_db(dboid, checksum_failures);
 	}
 	total_checksum_failures += checksum_failures;
 
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 517220bc33..14bc61b8ad 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -17,6 +17,7 @@
 #include "access/htup_details.h"
 #include "access/itup.h"
 #include "access/xlog.h"
+#include "pgstat.h"
 #include "storage/checksum.h"
 #include "utils/memdebug.h"
 #include "utils/memutils.h"
@@ -151,6 +152,8 @@ PageIsVerified(Page page, BlockNumber blkno)
 				 errmsg("page verification failed, calculated checksum %u but expected %u",
 						checksum, p->pd_checksum)));
 
+		pgstat_report_checksum_failure();
+
 		if (header_sane && ignore_checksum_failure)
 			return true;
 	}
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 69f7265779..da1d685c08 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -1498,6 +1498,21 @@ pg_stat_get_db_deadlocks(PG_FUNCTION_ARGS)
 }
 
 Datum
+pg_stat_get_db_checksum_failures(PG_FUNCTION_ARGS)
+{
+	Oid			dbid = PG_GETARG_OID(0);
+	int64		result;
+	PgStat_StatDBEntry *dbentry;
+
+	if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL)
+		result = 0;
+	else
+		result = (int64) (dbentry->n_checksum_failures);
+
+	PG_RETURN_INT64(result);
+}
+
+Datum
 pg_stat_get_db_blk_read_time(PG_FUNCTION_ARGS)
 {
 	Oid			dbid = PG_GETARG_OID(0);
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index e9638660ff..562c5408f8 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5227,6 +5227,10 @@
   proname => 'pg_stat_get_db_deadlocks', provolatile => 's', proparallel => 'r',
   prorettype => 'int8', proargtypes => 'oid',
   prosrc => 'pg_stat_get_db_deadlocks' },
+{ oid => '3426', descr => 'statistics: checksum failures detected in database',
+  proname => 'pg_stat_get_db_checksum_failures', provolatile => 's', proparallel => 'r',
+  prorettype => 'int8', proargtypes => 'oid',
+  prosrc => 'pg_stat_get_db_checksum_failures' },
 { oid => '3074', descr => 'statistics: last reset for a database',
   proname => 'pg_stat_get_db_stat_reset_time', provolatile => 's',
   proparallel => 'r', prorettype => 'timestamptz', proargtypes => 'oid',
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 88a75fb798..725c8b0d64 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -64,7 +64,8 @@ typedef enum StatMsgType
 	PGSTAT_MTYPE_FUNCPURGE,
 	PGSTAT_MTYPE_RECOVERYCONFLICT,
 	PGSTAT_MTYPE_TEMPFILE,
-	PGSTAT_MTYPE_DEADLOCK
+	PGSTAT_MTYPE_DEADLOCK,
+	PGSTAT_MTYPE_CHECKSUMFAILURE
 } StatMsgType;
 
 /* ----------
@@ -530,6 +531,18 @@ typedef struct PgStat_MsgDeadlock
 	Oid			m_databaseid;
 } PgStat_MsgDeadlock;
 
+/* ----------
+ * PgStat_MsgChecksumFailure	Sent by the backend to tell the collector
+ *								about checksum failures noticed.
+ * ----------
+ */
+typedef struct PgStat_MsgChecksumFailure
+{
+	PgStat_MsgHdr m_hdr;
+	Oid			m_databaseid;
+	int			m_failurecount;
+} PgStat_MsgChecksumFailure;
+
 
 /* ----------
  * PgStat_Msg					Union over all possible messages.
@@ -593,6 +606,7 @@ typedef struct PgStat_StatDBEntry
 	PgStat_Counter n_temp_files;
 	PgStat_Counter n_temp_bytes;
 	PgStat_Counter n_deadlocks;
+	PgStat_Counter n_checksum_failures;
 	PgStat_Counter n_block_read_time;	/* times in microseconds */
 	PgStat_Counter n_block_write_time;
 
@@ -1200,6 +1214,8 @@ extern void pgstat_report_analyze(Relation rel,
 
 extern void pgstat_report_recovery_conflict(int reason);
 extern void pgstat_report_deadlock(void);
+extern void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount);
+extern void pgstat_report_checksum_failure(void);
 
 extern void pgstat_initialize(void);
 extern void pgstat_bestart(void);
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 98f417cb57..e0f2c543ef 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1817,6 +1817,7 @@ pg_stat_database| SELECT d.oid AS datid,
     pg_stat_get_db_temp_files(d.oid) AS temp_files,
     pg_stat_get_db_temp_bytes(d.oid) AS temp_bytes,
     pg_stat_get_db_deadlocks(d.oid) AS deadlocks,
+    pg_stat_get_db_checksum_failures(d.oid) AS checksum_failures,
     pg_stat_get_db_blk_read_time(d.oid) AS blk_read_time,
     pg_stat_get_db_blk_write_time(d.oid) AS blk_write_time,
     pg_stat_get_db_stat_reset_time(d.oid) AS stats_reset

Reply via email to