On 17.2.2013 06:46, Alvaro Herrera wrote:
> Tomas Vondra wrote:
> 
>> I've been thinking about this (actually I had a really weird dream about
>> it this night) and I think it might work like this:
>>
>> (1) check the timestamp of the global file -> if it's too old, we need
>>     to send an inquiry or wait a bit longer
>>
>> (2) if it's new enough, we need to read it a look for that particular
>>     database - if it's not found, we have no info about it yet (this is
>>     the case handled by the dummy files)
>>
>> (3) if there's a database stat entry, we need to check the timestamp
>>     when it was written for the last time -> if it's too old, send an
>>     inquiry and wait a bit longer
>>
>> (4) well, we have a recent global file, it contains the database stat
>>     entry and it's fresh enough -> tadaaaaaa, we're done
> 
> Hmm, yes, I think this is what I was imagining.  I had even considered
> that the timestamp would be removed from the per-db file as you suggest
> here.

So, here's v10 of the patch (based on the v9+v9a), that implements the
approach described above.

It turned out to be much easier than I expected (basically just a
rewrite of the pgstat_read_db_statsfile_timestamp() function.

I've done a fair amount of testing (and will do some more next week) but
it seems to work just fine - no errors, no measurable decrease of
performance etc.

regards
Tomas Vondra
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 9b92ebb..36c0d8b 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -38,6 +38,7 @@
 #include "access/xact.h"
 #include "catalog/pg_database.h"
 #include "catalog/pg_proc.h"
+#include "lib/ilist.h"
 #include "libpq/ip.h"
 #include "libpq/libpq.h"
 #include "libpq/pqsignal.h"
@@ -66,8 +67,9 @@
  * Paths for the statistics files (relative to installation's $PGDATA).
  * ----------
  */
-#define PGSTAT_STAT_PERMANENT_FILENAME		"global/pgstat.stat"
-#define PGSTAT_STAT_PERMANENT_TMPFILE		"global/pgstat.tmp"
+#define PGSTAT_STAT_PERMANENT_DIRECTORY		"pg_stat"
+#define PGSTAT_STAT_PERMANENT_FILENAME		"pg_stat/global.stat"
+#define PGSTAT_STAT_PERMANENT_TMPFILE		"pg_stat/global.tmp"
 
 /* ----------
  * Timer definitions.
@@ -115,6 +117,8 @@ int			pgstat_track_activity_query_size = 1024;
  * Built from GUC parameter
  * ----------
  */
+char	   *pgstat_stat_directory = NULL;
+int			pgstat_stat_dbfile_maxlen = 0;
 char	   *pgstat_stat_filename = NULL;
 char	   *pgstat_stat_tmpname = NULL;
 
@@ -219,11 +223,16 @@ static int	localNumBackends = 0;
  */
 static PgStat_GlobalStats globalStats;
 
-/* Last time the collector successfully wrote the stats file */
-static TimestampTz last_statwrite;
+/* Write request info for each database */
+typedef struct DBWriteRequest
+{
+	Oid			databaseid;		/* OID of the database to write */
+	TimestampTz request_time;	/* timestamp of the last write request */
+	slist_node	next;
+} DBWriteRequest;
 
-/* Latest statistics request time from backends */
-static TimestampTz last_statrequest;
+/* Latest statistics request times from backends */
+static slist_head	last_statrequests = SLIST_STATIC_INIT(last_statrequests);
 
 static volatile bool need_exit = false;
 static volatile bool got_SIGHUP = false;
@@ -252,11 +261,16 @@ static void pgstat_sighup_handler(SIGNAL_ARGS);
 static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create);
 static PgStat_StatTabEntry *pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry,
 					 Oid tableoid, bool create);
-static void pgstat_write_statsfile(bool permanent);
-static HTAB *pgstat_read_statsfile(Oid onlydb, bool permanent);
+static void pgstat_write_statsfiles(bool permanent, bool allDbs);
+static void pgstat_write_db_statsfile(PgStat_StatDBEntry * dbentry, bool permanent);
+static HTAB *pgstat_read_statsfile(Oid onlydb, bool permanent, bool deep);
+static void pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, bool permanent);
 static void backend_read_statsfile(void);
 static void pgstat_read_current_status(void);
 
+static bool pgstat_write_statsfile_needed(void);
+static bool pgstat_db_requested(Oid databaseid);
+
 static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg);
 static void pgstat_send_funcstats(void);
 static HTAB *pgstat_collect_oids(Oid catalogid);
@@ -285,7 +299,6 @@ static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int le
 static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len);
 static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len);
 
-
 /* ------------------------------------------------------------
  * Public functions called from postmaster follow
  * ------------------------------------------------------------
@@ -541,16 +554,40 @@ startup_failed:
 }
 
 /*
+ * subroutine for pgstat_reset_all
+ */
+static void
+pgstat_reset_remove_files(const char *directory)
+{
+	DIR * dir;
+	struct dirent * entry;
+	char	fname[MAXPGPATH];
+
+	dir = AllocateDir(pgstat_stat_directory);
+	while ((entry = ReadDir(dir, pgstat_stat_directory)) != NULL)
+	{
+		if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
+			continue;
+
+		snprintf(fname, MAXPGPATH, "%s/%s", pgstat_stat_directory,
+				 entry->d_name);
+		unlink(fname);
+	}
+	FreeDir(dir);
+}
+
+/*
  * pgstat_reset_all() -
  *
- * Remove the stats file.  This is currently used only if WAL
+ * Remove the stats files.  This is currently used only if WAL
  * recovery is needed after a crash.
  */
 void
 pgstat_reset_all(void)
 {
-	unlink(pgstat_stat_filename);
-	unlink(PGSTAT_STAT_PERMANENT_FILENAME);
+
+	pgstat_reset_remove_files(pgstat_stat_directory);
+	pgstat_reset_remove_files(PGSTAT_STAT_PERMANENT_DIRECTORY);
 }
 
 #ifdef EXEC_BACKEND
@@ -1408,13 +1445,14 @@ pgstat_ping(void)
  * ----------
  */
 static void
-pgstat_send_inquiry(TimestampTz clock_time, TimestampTz cutoff_time)
+pgstat_send_inquiry(TimestampTz clock_time, TimestampTz cutoff_time, Oid databaseid)
 {
 	PgStat_MsgInquiry msg;
 
 	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_INQUIRY);
 	msg.clock_time = clock_time;
 	msg.cutoff_time = cutoff_time;
+	msg.databaseid = databaseid;
 	pgstat_send(&msg, sizeof(msg));
 }
 
@@ -3004,6 +3042,7 @@ PgstatCollectorMain(int argc, char *argv[])
 	int			len;
 	PgStat_Msg	msg;
 	int			wr;
+	bool		first_write = true;
 
 	IsUnderPostmaster = true;	/* we are a postmaster subprocess now */
 
@@ -3053,17 +3092,11 @@ PgstatCollectorMain(int argc, char *argv[])
 	init_ps_display("stats collector process", "", "", "");
 
 	/*
-	 * Arrange to write the initial status file right away
-	 */
-	last_statrequest = GetCurrentTimestamp();
-	last_statwrite = last_statrequest - 1;
-
-	/*
 	 * Read in an existing statistics stats file or initialize the stats to
 	 * zero.
 	 */
 	pgStatRunningInCollector = true;
-	pgStatDBHash = pgstat_read_statsfile(InvalidOid, true);
+	pgStatDBHash = pgstat_read_statsfile(InvalidOid, true, true);
 
 	/*
 	 * Loop to process messages until we get SIGQUIT or detect ungraceful
@@ -3107,10 +3140,14 @@ PgstatCollectorMain(int argc, char *argv[])
 
 			/*
 			 * Write the stats file if a new request has arrived that is not
-			 * satisfied by existing file.
+			 * satisfied by existing file (force writing all files if it's
+			 * the first write after startup).
 			 */
-			if (last_statwrite < last_statrequest)
-				pgstat_write_statsfile(false);
+			if (first_write || pgstat_write_statsfile_needed())
+			{
+				pgstat_write_statsfiles(false, first_write);
+				first_write = false;
+			}
 
 			/*
 			 * Try to receive and process a message.  This will not block,
@@ -3269,7 +3306,7 @@ PgstatCollectorMain(int argc, char *argv[])
 	/*
 	 * Save the final stats to reuse at next startup.
 	 */
-	pgstat_write_statsfile(true);
+	pgstat_write_statsfiles(true, true);
 
 	exit(0);
 }
@@ -3349,6 +3386,7 @@ pgstat_get_db_entry(Oid databaseid, bool create)
 		result->n_block_write_time = 0;
 
 		result->stat_reset_timestamp = GetCurrentTimestamp();
+		result->stats_timestamp = 0;
 
 		memset(&hash_ctl, 0, sizeof(hash_ctl));
 		hash_ctl.keysize = sizeof(Oid);
@@ -3422,30 +3460,32 @@ pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create)
 
 
 /* ----------
- * pgstat_write_statsfile() -
+ * pgstat_write_statsfiles() -
  *
  *	Tell the news.
- *	If writing to the permanent file (happens when the collector is
- *	shutting down only), remove the temporary file so that backends
+ *	If writing to the permanent files (happens when the collector is
+ *	shutting down only), remove the temporary files so that backends
  *	starting up under a new postmaster can't read the old data before
  *	the new collector is ready.
+ *
+ *	When 'allDbs' is false, only the requested databases (listed in
+ *	last_statrequests) will be written; otherwise, all databases will be
+ *	written.
  * ----------
  */
 static void
-pgstat_write_statsfile(bool permanent)
+pgstat_write_statsfiles(bool permanent, bool allDbs)
 {
 	HASH_SEQ_STATUS hstat;
-	HASH_SEQ_STATUS tstat;
-	HASH_SEQ_STATUS fstat;
 	PgStat_StatDBEntry *dbentry;
-	PgStat_StatTabEntry *tabentry;
-	PgStat_StatFuncEntry *funcentry;
 	FILE	   *fpout;
 	int32		format_id;
 	const char *tmpfile = permanent ? PGSTAT_STAT_PERMANENT_TMPFILE : pgstat_stat_tmpname;
 	const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
 	int			rc;
 
+	elog(DEBUG1, "writing statsfile '%s'", statfile);
+
 	/*
 	 * Open the statistics temp file to write out the current values.
 	 */
@@ -3484,40 +3524,26 @@ pgstat_write_statsfile(bool permanent)
 	while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
 	{
 		/*
-		 * Write out the DB entry including the number of live backends. We
-		 * don't write the tables or functions pointers, since they're of no
-		 * use to any other process.
+		 * Write out the tables and functions into a separate file, if
+		 * required.
+		 *
+		 * We need to do this before the dbentry write, to ensure the
+		 * timestamps written to both are consistent.
 		 */
-		fputc('D', fpout);
-		rc = fwrite(dbentry, offsetof(PgStat_StatDBEntry, tables), 1, fpout);
-		(void) rc;				/* we'll check for error with ferror */
-
-		/*
-		 * Walk through the database's access stats per table.
-		 */
-		hash_seq_init(&tstat, dbentry->tables);
-		while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&tstat)) != NULL)
-		{
-			fputc('T', fpout);
-			rc = fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout);
-			(void) rc;			/* we'll check for error with ferror */
-		}
-
-		/*
-		 * Walk through the database's function stats table.
-		 */
-		hash_seq_init(&fstat, dbentry->functions);
-		while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&fstat)) != NULL)
+		if (allDbs || pgstat_db_requested(dbentry->databaseid))
 		{
-			fputc('F', fpout);
-			rc = fwrite(funcentry, sizeof(PgStat_StatFuncEntry), 1, fpout);
-			(void) rc;			/* we'll check for error with ferror */
+			elog(DEBUG1, "writing statsfile for DB %d", dbentry->databaseid);
+			dbentry->stats_timestamp = globalStats.stats_timestamp;
+			pgstat_write_db_statsfile(dbentry, permanent);
 		}
 
 		/*
-		 * Mark the end of this DB
+		 * Write out the DB entry. We don't write the tables or functions
+		 * pointers, since they're of no use to any other process.
 		 */
-		fputc('d', fpout);
+		fputc('D', fpout);
+		rc = fwrite(dbentry, offsetof(PgStat_StatDBEntry, tables), 1, fpout);
+		(void) rc;				/* we'll check for error with ferror */
 	}
 
 	/*
@@ -3527,6 +3553,25 @@ pgstat_write_statsfile(bool permanent)
 	 */
 	fputc('E', fpout);
 
+	/*
+	 * Now throw away the list of requests.  Note that requests sent after we
+	 * started the write are still waiting on the network socket.
+	 */
+	if (!slist_is_empty(&last_statrequests))
+	{
+		slist_mutable_iter	iter;
+
+		slist_foreach_modify(iter, &last_statrequests)
+		{
+			DBWriteRequest *req = slist_container(DBWriteRequest, next,
+												  iter.cur);
+
+			pfree(req);
+		}
+
+		slist_init(&last_statrequests);
+	}
+
 	if (ferror(fpout))
 	{
 		ereport(LOG,
@@ -3552,61 +3597,161 @@ pgstat_write_statsfile(bool permanent)
 						tmpfile, statfile)));
 		unlink(tmpfile);
 	}
-	else
+
+	if (permanent)
+		unlink(pgstat_stat_filename);
+}
+
+/*
+ * return the filename for a DB stat file; filename is the output buffer,
+ * of length len.
+ */
+static void
+get_dbstat_filename(bool permanent, bool tempname, Oid databaseid,
+					char *filename, int len)
+{
+	int		printed;
+
+	printed = snprintf(filename, len, "%s/db_%u.%s",
+					   permanent ? "pg_stat" : pgstat_stat_directory,
+					   databaseid,
+					   tempname ? "tmp" : "stat");
+	if (printed > len)
+		elog(ERROR, "overlength pgstat path");
+}
+
+/* ----------
+ * pgstat_write_db_statsfile() -
+ *
+ *	Tell the news. This writes stats file for a single database.
+ *
+ *	If writing to the permanent file (happens when the collector is
+ *	shutting down only), remove the temporary file so that backends
+ *	starting up under a new postmaster can't read the old data before
+ *	the new collector is ready.
+ * ----------
+ */
+static void
+pgstat_write_db_statsfile(PgStat_StatDBEntry * dbentry, bool permanent)
+{
+	HASH_SEQ_STATUS tstat;
+	HASH_SEQ_STATUS fstat;
+	PgStat_StatTabEntry *tabentry;
+	PgStat_StatFuncEntry *funcentry;
+	FILE	   *fpout;
+	int32		format_id;
+	Oid			dbid = dbentry->databaseid;
+	int			rc;
+	char		tmpfile[MAXPGPATH];
+	char		statfile[MAXPGPATH];
+
+	get_dbstat_filename(permanent, true, dbid, tmpfile, MAXPGPATH);
+	get_dbstat_filename(permanent, false, dbid, statfile, MAXPGPATH);
+
+	elog(DEBUG1, "writing statsfile '%s'", statfile);
+
+	/*
+	 * Open the statistics temp file to write out the current values.
+	 */
+	fpout = AllocateFile(tmpfile, PG_BINARY_W);
+	if (fpout == NULL)
 	{
-		/*
-		 * Successful write, so update last_statwrite.
-		 */
-		last_statwrite = globalStats.stats_timestamp;
+		ereport(LOG,
+				(errcode_for_file_access(),
+				 errmsg("could not open temporary statistics file \"%s\": %m",
+						tmpfile)));
+		return;
+	}
 
-		/*
-		 * If there is clock skew between backends and the collector, we could
-		 * receive a stats request time that's in the future.  If so, complain
-		 * and reset last_statrequest.	Resetting ensures that no inquiry
-		 * message can cause more than one stats file write to occur.
-		 */
-		if (last_statrequest > last_statwrite)
-		{
-			char	   *reqtime;
-			char	   *mytime;
+	/*
+	 * Write the file header --- currently just a format ID.
+	 */
+	format_id = PGSTAT_FILE_FORMAT_ID;
+	rc = fwrite(&format_id, sizeof(format_id), 1, fpout);
+	(void) rc;					/* we'll check for error with ferror */
 
-			/* Copy because timestamptz_to_str returns a static buffer */
-			reqtime = pstrdup(timestamptz_to_str(last_statrequest));
-			mytime = pstrdup(timestamptz_to_str(last_statwrite));
-			elog(LOG, "last_statrequest %s is later than collector's time %s",
-				 reqtime, mytime);
-			pfree(reqtime);
-			pfree(mytime);
+	/*
+	 * Walk through the database's access stats per table.
+	 */
+	hash_seq_init(&tstat, dbentry->tables);
+	while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&tstat)) != NULL)
+	{
+		fputc('T', fpout);
+		rc = fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout);
+		(void) rc;			/* we'll check for error with ferror */
+	}
 
-			last_statrequest = last_statwrite;
-		}
+	/*
+	 * Walk through the database's function stats table.
+	 */
+	hash_seq_init(&fstat, dbentry->functions);
+	while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&fstat)) != NULL)
+	{
+		fputc('F', fpout);
+		rc = fwrite(funcentry, sizeof(PgStat_StatFuncEntry), 1, fpout);
+		(void) rc;			/* we'll check for error with ferror */
+	}
+
+	/*
+	 * No more output to be done. Close the temp file and replace the old
+	 * pgstat.stat with it.  The ferror() check replaces testing for error
+	 * after each individual fputc or fwrite above.
+	 */
+	fputc('E', fpout);
+
+	if (ferror(fpout))
+	{
+		ereport(LOG,
+				(errcode_for_file_access(),
+			   errmsg("could not write temporary statistics file \"%s\": %m",
+					  tmpfile)));
+		FreeFile(fpout);
+		unlink(tmpfile);
+	}
+	else if (FreeFile(fpout) < 0)
+	{
+		ereport(LOG,
+				(errcode_for_file_access(),
+			   errmsg("could not close temporary statistics file \"%s\": %m",
+					  tmpfile)));
+		unlink(tmpfile);
+	}
+	else if (rename(tmpfile, statfile) < 0)
+	{
+		ereport(LOG,
+				(errcode_for_file_access(),
+				 errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m",
+						tmpfile, statfile)));
+		unlink(tmpfile);
 	}
 
 	if (permanent)
-		unlink(pgstat_stat_filename);
-}
+	{
+		get_dbstat_filename(false, false, dbid, tmpfile, MAXPGPATH);
 
+		elog(DEBUG1, "removing temporary stat file '%s'", tmpfile);
+		unlink(tmpfile);
+	}
+}
 
 /* ----------
  * pgstat_read_statsfile() -
  *
  *	Reads in an existing statistics collector file and initializes the
- *	databases' hash table (whose entries point to the tables' hash tables).
+ *	databases' hash table.  If the permanent file name is requested, also
+ *	remove it after reading.
+ *
+ *  If a deep read is requested, table/function stats are read also, otherwise
+ *  the table/function hash tables remain empty.
  * ----------
  */
 static HTAB *
-pgstat_read_statsfile(Oid onlydb, bool permanent)
+pgstat_read_statsfile(Oid onlydb, bool permanent, bool deep)
 {
 	PgStat_StatDBEntry *dbentry;
 	PgStat_StatDBEntry dbbuf;
-	PgStat_StatTabEntry *tabentry;
-	PgStat_StatTabEntry tabbuf;
-	PgStat_StatFuncEntry funcbuf;
-	PgStat_StatFuncEntry *funcentry;
 	HASHCTL		hash_ctl;
 	HTAB	   *dbhash;
-	HTAB	   *tabhash = NULL;
-	HTAB	   *funchash = NULL;
 	FILE	   *fpin;
 	int32		format_id;
 	bool		found;
@@ -3662,8 +3807,8 @@ pgstat_read_statsfile(Oid onlydb, bool permanent)
 	/*
 	 * Verify it's of the expected format.
 	 */
-	if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id)
-		|| format_id != PGSTAT_FILE_FORMAT_ID)
+	if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
+		format_id != PGSTAT_FILE_FORMAT_ID)
 	{
 		ereport(pgStatRunningInCollector ? LOG : WARNING,
 				(errmsg("corrupted statistics file \"%s\"", statfile)));
@@ -3690,8 +3835,7 @@ pgstat_read_statsfile(Oid onlydb, bool permanent)
 		{
 				/*
 				 * 'D'	A PgStat_StatDBEntry struct describing a database
-				 * follows. Subsequently, zero to many 'T' and 'F' entries
-				 * will follow until a 'd' is encountered.
+				 * follows.
 				 */
 			case 'D':
 				if (fread(&dbbuf, 1, offsetof(PgStat_StatDBEntry, tables),
@@ -3753,21 +3897,106 @@ pgstat_read_statsfile(Oid onlydb, bool permanent)
 								   HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
 
 				/*
-				 * Arrange that following records add entries to this
-				 * database's hash tables.
+				 * If requested, read the data from the database-specific file.
+				 * If there was onlydb specified (!= InvalidOid), we would not
+				 * get here because of a break above. So we don't need to
+				 * recheck.
 				 */
-				tabhash = dbentry->tables;
-				funchash = dbentry->functions;
-				break;
+				if (deep)
+					pgstat_read_db_statsfile(dbentry->databaseid,
+											 dbentry->tables,
+											 dbentry->functions,
+											 permanent);
 
-				/*
-				 * 'd'	End of this database.
-				 */
-			case 'd':
-				tabhash = NULL;
-				funchash = NULL;
 				break;
 
+			case 'E':
+				goto done;
+
+			default:
+				ereport(pgStatRunningInCollector ? LOG : WARNING,
+						(errmsg("corrupted statistics file \"%s\"",
+								statfile)));
+				goto done;
+		}
+	}
+
+done:
+	FreeFile(fpin);
+
+	if (permanent)
+	{
+		/*
+		 * If requested to read the permanent file, also get rid of it; the
+		 * in-memory status is now authoritative, and the permanent file would
+		 * be out of date in case somebody else reads it.
+		 */
+		unlink(PGSTAT_STAT_PERMANENT_FILENAME);
+	}
+
+	return dbhash;
+}
+
+
+/* ----------
+ * pgstat_read_db_statsfile() -
+ *
+ *	Reads in an existing statistics collector db file and initializes the
+ *	tables and functions hash tables (for the database identified by Oid).
+ * ----------
+ */
+static void
+pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, bool permanent)
+{
+	PgStat_StatTabEntry *tabentry;
+	PgStat_StatTabEntry tabbuf;
+	PgStat_StatFuncEntry funcbuf;
+	PgStat_StatFuncEntry *funcentry;
+	FILE	   *fpin;
+	int32		format_id;
+	bool		found;
+	char		statfile[MAXPGPATH];
+
+	get_dbstat_filename(permanent, false, databaseid, statfile, MAXPGPATH);
+
+	/*
+	 * Try to open the status file. If it doesn't exist, the backends simply
+	 * return zero for anything and the collector simply starts from scratch
+	 * with empty counters.
+	 *
+	 * ENOENT is a possibility if the stats collector is not running or has
+	 * not yet written the stats file the first time.  Any other failure
+	 * condition is suspicious.
+	 */
+	if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
+	{
+		if (errno != ENOENT)
+			ereport(pgStatRunningInCollector ? LOG : WARNING,
+					(errcode_for_file_access(),
+					 errmsg("could not open statistics file \"%s\": %m",
+							statfile)));
+		return;
+	}
+
+	/*
+	 * Verify it's of the expected format.
+	 */
+	if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id)
+		|| format_id != PGSTAT_FILE_FORMAT_ID)
+	{
+		ereport(pgStatRunningInCollector ? LOG : WARNING,
+				(errmsg("corrupted statistics file \"%s\"", statfile)));
+		goto done;
+	}
+
+	/*
+	 * We found an existing collector stats file. Read it and put all the
+	 * hashtable entries into place.
+	 */
+	for (;;)
+	{
+		switch (fgetc(fpin))
+		{
 				/*
 				 * 'T'	A PgStat_StatTabEntry follows.
 				 */
@@ -3854,24 +4083,41 @@ done:
 	FreeFile(fpin);
 
 	if (permanent)
-		unlink(PGSTAT_STAT_PERMANENT_FILENAME);
+	{
+		get_dbstat_filename(permanent, false, databaseid, statfile, MAXPGPATH);
 
-	return dbhash;
+		elog(DEBUG1, "removing permanent stats file '%s'", statfile);
+		unlink(statfile);
+	}
+
+	return;
 }
 
 /* ----------
- * pgstat_read_statsfile_timestamp() -
+ * pgstat_read_db_statsfile_timestamp() -
  *
- *	Attempt to fetch the timestamp of an existing stats file.
+ *	Attempt to determine the timestamp of the last db statfile write.
  *	Returns TRUE if successful (timestamp is stored at *ts).
+ * 
+ *	This needs to be careful about handling databases without statfiles,
+ *	i.e. databases without stat entry or not yet written. The 
+ * 
+ *	- if there's a db stat entry, return the corresponding stats_timestamp
+ *	(which may be 0 if it was not yet written, which results in writing it)
+ *
+ *	- if there's no db stat entry (e.g. for a new or inactive database), there's
+ * 	no stat_timestamp but also nothing to write so we return timestamp of the
+ * 	global statfile
  * ----------
  */
 static bool
-pgstat_read_statsfile_timestamp(bool permanent, TimestampTz *ts)
+pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent, TimestampTz *ts)
 {
+	PgStat_StatDBEntry dbentry;
 	PgStat_GlobalStats myGlobalStats;
 	FILE	   *fpin;
 	int32		format_id;
+
 	const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
 
 	/*
@@ -3911,12 +4157,58 @@ pgstat_read_statsfile_timestamp(bool permanent, TimestampTz *ts)
 		return false;
 	}
 
+	/* By default, we're going to return the timestamp of the global file. */
 	*ts = myGlobalStats.stats_timestamp;
 
+	/*
+	 * We found an existing collector stats file. Read it and look for a record
+	 * for database with (OID = databaseid) - if found, use it's timestamp.
+	 */
+	for (;;)
+	{
+		switch (fgetc(fpin))
+		{
+				/*
+				 * 'D'	A PgStat_StatDBEntry struct describing a database
+				 * follows.
+				 */
+			case 'D':
+				
+				if (fread(&dbentry, 1, offsetof(PgStat_StatDBEntry, tables),
+						  fpin) != offsetof(PgStat_StatDBEntry, tables))
+				{
+					ereport(pgStatRunningInCollector ? LOG : WARNING,
+							(errmsg("corrupted statistics file \"%s\"",
+									statfile)));
+					goto done;
+				}
+
+				/* Is this the DB we're looking for? */
+				if (dbentry.databaseid == databaseid) {
+					*ts = dbentry.stats_timestamp;
+					goto done;
+				}
+
+				break;
+
+			case 'E':
+				goto done;
+
+			default:
+				ereport(pgStatRunningInCollector ? LOG : WARNING,
+						(errmsg("corrupted statistics file \"%s\"",
+								statfile)));
+				goto done;
+		}
+	}
+
+
+done:
 	FreeFile(fpin);
 	return true;
 }
 
+
 /*
  * If not already done, read the statistics collector stats file into
  * some hash tables.  The results will be kept until pgstat_clear_snapshot()
@@ -3947,7 +4239,19 @@ backend_read_statsfile(void)
 
 		CHECK_FOR_INTERRUPTS();
 
-		ok = pgstat_read_statsfile_timestamp(false, &file_ts);
+		ok = pgstat_read_db_statsfile_timestamp(MyDatabaseId, false, &file_ts);
+
+		if (!ok)
+		{
+			/*
+			 * see if the global file exists; if it does, then failure to read
+			 * the db-specific file only means that there's no entry in the
+			 * collector for it.  If so, break out of here, because the file is
+			 * not going to magically show up.
+			 */
+
+
+		}
 
 		cur_ts = GetCurrentTimestamp();
 		/* Calculate min acceptable timestamp, if we didn't already */
@@ -4006,7 +4310,7 @@ backend_read_statsfile(void)
 				pfree(mytime);
 			}
 
-			pgstat_send_inquiry(cur_ts, min_ts);
+			pgstat_send_inquiry(cur_ts, min_ts, MyDatabaseId);
 			break;
 		}
 
@@ -4016,7 +4320,7 @@ backend_read_statsfile(void)
 
 		/* Not there or too old, so kick the collector and wait a bit */
 		if ((count % PGSTAT_INQ_LOOP_COUNT) == 0)
-			pgstat_send_inquiry(cur_ts, min_ts);
+			pgstat_send_inquiry(cur_ts, min_ts, MyDatabaseId);
 
 		pg_usleep(PGSTAT_RETRY_DELAY * 1000L);
 	}
@@ -4024,11 +4328,14 @@ backend_read_statsfile(void)
 	if (count >= PGSTAT_POLL_LOOP_COUNT)
 		elog(WARNING, "pgstat wait timeout");
 
-	/* Autovacuum launcher wants stats about all databases */
+	/*
+	 * Autovacuum launcher wants stats about all databases, but a shallow
+	 * read is sufficient.
+	 */
 	if (IsAutoVacuumLauncherProcess())
-		pgStatDBHash = pgstat_read_statsfile(InvalidOid, false);
+		pgStatDBHash = pgstat_read_statsfile(InvalidOid, false, false);
 	else
-		pgStatDBHash = pgstat_read_statsfile(MyDatabaseId, false);
+		pgStatDBHash = pgstat_read_statsfile(MyDatabaseId, false, true);
 }
 
 
@@ -4084,26 +4391,53 @@ pgstat_clear_snapshot(void)
 static void
 pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len)
 {
+	slist_iter	iter;
+	bool		found = false;
+	DBWriteRequest *newreq;
+	PgStat_StatDBEntry *dbentry;
+
+	elog(DEBUG1, "received inquiry for %d", msg->databaseid);
+
 	/*
-	 * Advance last_statrequest if this requestor has a newer cutoff time
-	 * than any previous request.
+	 * Find the last write request for this DB (found=true in that case). Plain
+	 * linear search, not really worth doing any magic here (probably).
 	 */
-	if (msg->cutoff_time > last_statrequest)
-		last_statrequest = msg->cutoff_time;
+	slist_foreach(iter, &last_statrequests)
+	{
+		DBWriteRequest *req = slist_container(DBWriteRequest, next, iter.cur);
+
+		if (req->databaseid != msg->databaseid)
+			continue;
+
+		if (msg->cutoff_time > req->request_time)
+			req->request_time = msg->cutoff_time;
+		found = true;
+		return;
+	}
 
 	/*
-	 * If the requestor's local clock time is older than last_statwrite, we
+	 * There's no request for this DB yet, so create one.
+	 */
+	newreq = palloc(sizeof(DBWriteRequest));
+
+	newreq->databaseid = msg->databaseid;
+	newreq->request_time = msg->clock_time;
+	slist_push_head(&last_statrequests, &newreq->next);
+
+	/*
+	 * If the requestor's local clock time is older than stats_timestamp, we
 	 * should suspect a clock glitch, ie system time going backwards; though
 	 * the more likely explanation is just delayed message receipt.  It is
 	 * worth expending a GetCurrentTimestamp call to be sure, since a large
 	 * retreat in the system clock reading could otherwise cause us to neglect
 	 * to update the stats file for a long time.
 	 */
-	if (msg->clock_time < last_statwrite)
+	dbentry = pgstat_get_db_entry(msg->databaseid, false);
+	if ((dbentry != NULL) && (msg->clock_time < dbentry->stats_timestamp))
 	{
 		TimestampTz cur_ts = GetCurrentTimestamp();
 
-		if (cur_ts < last_statwrite)
+		if (cur_ts < dbentry->stats_timestamp)
 		{
 			/*
 			 * Sure enough, time went backwards.  Force a new stats file write
@@ -4113,15 +4447,16 @@ pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len)
 			char	   *mytime;
 
 			/* Copy because timestamptz_to_str returns a static buffer */
-			writetime = pstrdup(timestamptz_to_str(last_statwrite));
+			writetime = pstrdup(timestamptz_to_str(dbentry->stats_timestamp));
 			mytime = pstrdup(timestamptz_to_str(cur_ts));
-			elog(LOG, "last_statwrite %s is later than collector's time %s",
-				 writetime, mytime);
+			elog(LOG,
+				 "stats_timestamp %s is later than collector's time %s for db %d",
+				 writetime, mytime, dbentry->databaseid);
 			pfree(writetime);
 			pfree(mytime);
 
-			last_statrequest = cur_ts;
-			last_statwrite = last_statrequest - 1;
+			newreq->request_time = cur_ts;
+			dbentry->stats_timestamp = cur_ts - 1;
 		}
 	}
 }
@@ -4270,29 +4605,36 @@ pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len)
 static void
 pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len)
 {
+	Oid			dbid = msg->m_databaseid;
 	PgStat_StatDBEntry *dbentry;
 
 	/*
 	 * Lookup the database in the hashtable.
 	 */
-	dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
+	dbentry = pgstat_get_db_entry(dbid, false);
 
 	/*
-	 * If found, remove it.
+	 * If found, remove it (along with the db statfile).
 	 */
 	if (dbentry)
 	{
+		char		statfile[MAXPGPATH];
+
+		get_dbstat_filename(true, false, dbid, statfile, MAXPGPATH);
+
+		elog(DEBUG1, "removing %s", statfile);
+		unlink(statfile);
+
 		if (dbentry->tables != NULL)
 			hash_destroy(dbentry->tables);
 		if (dbentry->functions != NULL)
 			hash_destroy(dbentry->functions);
 
 		if (hash_search(pgStatDBHash,
-						(void *) &(dbentry->databaseid),
+						(void *) &dbid,
 						HASH_REMOVE, NULL) == NULL)
 			ereport(ERROR,
-					(errmsg("database hash table corrupted "
-							"during cleanup --- abort")));
+					(errmsg("database hash table corrupted during cleanup --- abort")));
 	}
 }
 
@@ -4687,3 +5029,43 @@ pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len)
 						   HASH_REMOVE, NULL);
 	}
 }
+
+/* ----------
+ * pgstat_write_statsfile_needed() -
+ *
+ *	Do we need to write out the files?
+ * ----------
+ */
+static bool
+pgstat_write_statsfile_needed(void)
+{
+	if (!slist_is_empty(&last_statrequests))
+		return true;
+
+	/* Everything was written recently */
+	return false;
+}
+
+/* ----------
+ * pgstat_db_requested() -
+ *
+ *	Checks whether stats for a particular DB need to be written to a file.
+ * ----------
+ */
+
+static bool
+pgstat_db_requested(Oid databaseid)
+{
+	slist_iter	iter;
+
+	/* Check the databases if they need to refresh the stats. */
+	slist_foreach(iter, &last_statrequests)
+	{
+		DBWriteRequest	*req = slist_container(DBWriteRequest, next, iter.cur);
+
+		if (req->databaseid == databaseid)
+			return true;
+	}
+
+	return false;
+}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 6128694..0a53bb7 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -8704,14 +8704,25 @@ static void
 assign_pgstat_temp_directory(const char *newval, void *extra)
 {
 	/* check_canonical_path already canonicalized newval for us */
+	char	   *dname;
 	char	   *tname;
 	char	   *fname;
 
-	tname = guc_malloc(ERROR, strlen(newval) + 12);		/* /pgstat.tmp */
-	sprintf(tname, "%s/pgstat.tmp", newval);
-	fname = guc_malloc(ERROR, strlen(newval) + 13);		/* /pgstat.stat */
-	sprintf(fname, "%s/pgstat.stat", newval);
-
+	/* directory */
+	dname = guc_malloc(ERROR, strlen(newval) + 1);		/* runtime dir */
+	sprintf(dname, "%s", newval);
+
+	/* global stats */
+	tname = guc_malloc(ERROR, strlen(newval) + 12);		/* /global.tmp */
+	sprintf(tname, "%s/global.tmp", newval);
+	fname = guc_malloc(ERROR, strlen(newval) + 13);		/* /global.stat */
+	sprintf(fname, "%s/global.stat", newval);
+
+	if (pgstat_stat_directory)
+		free(pgstat_stat_directory);
+	pgstat_stat_directory = dname;
+	/* invalidate cached length in pgstat.c */
+	pgstat_stat_dbfile_maxlen = 0;
 	if (pgstat_stat_tmpname)
 		free(pgstat_stat_tmpname);
 	pgstat_stat_tmpname = tname;
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index b8faf9c..b501132 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -192,6 +192,7 @@ const char *subdirs[] = {
 	"base",
 	"base/1",
 	"pg_tblspc",
+	"pg_stat",
 	"pg_stat_tmp"
 };
 
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 03c0174..1248f47 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -205,6 +205,7 @@ typedef struct PgStat_MsgInquiry
 	PgStat_MsgHdr m_hdr;
 	TimestampTz clock_time;		/* observed local clock time */
 	TimestampTz cutoff_time;	/* minimum acceptable file timestamp */
+	Oid			databaseid;		/* requested DB (InvalidOid => all DBs) */
 } PgStat_MsgInquiry;
 
 
@@ -514,7 +515,7 @@ typedef union PgStat_Msg
  * ------------------------------------------------------------
  */
 
-#define PGSTAT_FILE_FORMAT_ID	0x01A5BC9A
+#define PGSTAT_FILE_FORMAT_ID	0xA240CA47
 
 /* ----------
  * PgStat_StatDBEntry			The collector's data per database
@@ -545,6 +546,7 @@ typedef struct PgStat_StatDBEntry
 	PgStat_Counter n_block_write_time;
 
 	TimestampTz stat_reset_timestamp;
+	TimestampTz stats_timestamp;		/* time of db stats file update */
 
 	/*
 	 * tables and functions must be last in the struct, because we don't write
@@ -722,6 +724,8 @@ extern bool pgstat_track_activities;
 extern bool pgstat_track_counts;
 extern int	pgstat_track_functions;
 extern PGDLLIMPORT int pgstat_track_activity_query_size;
+extern char *pgstat_stat_directory;
+extern int	pgstat_stat_dbfile_maxlen;
 extern char *pgstat_stat_tmpname;
 extern char *pgstat_stat_filename;
 
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to