Hi,

I have a bug pending that autovacuum fails to give priority to
for-wraparound tables.  When xid consumption rate is high and dead tuple
creation is also high, it is possible that some tables are waiting for
for-wraparound vacuums that don't complete in time because the workers
are busy processing other tables that have accumulated dead tuples; the
system is then down because it's too near the Xid wraparound horizon.
Apparently this is particularly notorious in connection with TOAST
tables, because those are always put in the tables-to-process list after
regular tables.

(As far as I recall, this was already reported elsewhere, but so far I
have been unable to find the discussion in the archives.  Pointers
appreciated.)

So here's a small, backpatchable patch that sorts the list of tables to
process (not all that much tested yet).  Tables which have the
wraparound flag set are processed before those that are not.  Other
than this criterion, the order is not defined.

Now we could implement this differently, and maybe more simply (say by
keeping two lists of tables to process, one with for-wraparound tables
and one with the rest) but this way it is simpler to add additional
sorting criteria later: say within each category we could first process
smaller tables that have more dead tuples.

My intention is to clean this up and backpatch to all live branches.
Comments?

-- 
Álvaro Herrera                http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
*** a/src/backend/postmaster/autovacuum.c
--- b/src/backend/postmaster/autovacuum.c
***************
*** 1890,1895 **** get_database_list(void)
--- 1890,1919 ----
  	return dblist;
  }
  
+ typedef struct vacuum_table
+ {
+ 	Oid		reloid;
+ 	bool	wraparound;
+ } vacuum_table;
+ 
+ /*
+  * qsort comparator.  Tables marked for wraparound sort first.
+  */
+ static int
+ vacpriority_comparator(const void *a, const void *b)
+ {
+ 	const vacuum_table *taba = *(vacuum_table *const *) a;
+ 	const vacuum_table *tabb = *(vacuum_table *const *) b;
+ 
+ 	if (taba->wraparound == tabb->wraparound)
+ 		return 0;
+ 
+ 	if (taba->wraparound)
+ 		return -1;
+ 	else
+ 		return 1;
+ }
+ 
  /*
   * Process a database table-by-table
   *
***************
*** 1903,1917 **** do_autovacuum(void)
  	HeapTuple	tuple;
  	HeapScanDesc relScan;
  	Form_pg_database dbForm;
- 	List	   *table_oids = NIL;
  	HASHCTL		ctl;
  	HTAB	   *table_toast_map;
- 	ListCell   *volatile cell;
  	PgStat_StatDBEntry *shared;
  	PgStat_StatDBEntry *dbentry;
  	BufferAccessStrategy bstrategy;
  	ScanKeyData key;
  	TupleDesc	pg_class_desc;
  
  	/*
  	 * StartTransactionCommand and CommitTransactionCommand will automatically
--- 1927,1943 ----
  	HeapTuple	tuple;
  	HeapScanDesc relScan;
  	Form_pg_database dbForm;
  	HASHCTL		ctl;
  	HTAB	   *table_toast_map;
  	PgStat_StatDBEntry *shared;
  	PgStat_StatDBEntry *dbentry;
  	BufferAccessStrategy bstrategy;
  	ScanKeyData key;
  	TupleDesc	pg_class_desc;
+ 	vacuum_table **tables;
+ 	int			ntables;
+ 	int			maxtables;
+ 	int			i;
  
  	/*
  	 * StartTransactionCommand and CommitTransactionCommand will automatically
***************
*** 1986,1991 **** do_autovacuum(void)
--- 2012,2022 ----
  								  &ctl,
  								  HASH_ELEM | HASH_FUNCTION);
  
+ 	/* initialize our tasklist */
+ 	ntables = 0;
+ 	maxtables = 32;
+ 	tables = palloc(maxtables * sizeof(vacuum_table *));
+ 
  	/*
  	 * Scan pg_class to determine which tables to vacuum.
  	 *
***************
*** 2077,2085 **** do_autovacuum(void)
  		}
  		else
  		{
! 			/* relations that need work are added to table_oids */
  			if (dovacuum || doanalyze)
! 				table_oids = lappend_oid(table_oids, relid);
  
  			/*
  			 * Remember the association for the second pass.  Note: we must do
--- 2108,2133 ----
  		}
  		else
  		{
! 			/* relations that need work are added to our tasklist */
  			if (dovacuum || doanalyze)
! 			{
! 				vacuum_table *tab;
! 
! 				/* enlarge the array if necessary */
! 				if (ntables >= maxtables)
! 				{
! 					maxtables *= 2;
! 					tables = repalloc(tables,
! 									  maxtables * sizeof(vacuum_table *));
! 				}
! 
! 				tab = palloc(sizeof(vacuum_table));
! 
! 				tab->reloid = relid;
! 				tab->wraparound = wraparound;
! 
! 				tables[ntables++] = tab;
! 			}
  
  			/*
  			 * Remember the association for the second pass.  Note: we must do
***************
*** 2162,2168 **** do_autovacuum(void)
  
  		/* ignore analyze for toast tables */
  		if (dovacuum)
! 			table_oids = lappend_oid(table_oids, relid);
  	}
  
  	heap_endscan(relScan);
--- 2210,2233 ----
  
  		/* ignore analyze for toast tables */
  		if (dovacuum)
! 		{
! 			vacuum_table *tab;
! 
! 			/* enlarge the array if necessary */
! 			if (ntables >= maxtables)
! 			{
! 				maxtables *= 2;
! 				tables = repalloc(tables,
! 								  maxtables * sizeof(vacuum_table *));
! 			}
! 
! 			tab = palloc(sizeof(vacuum_table));
! 
! 			tab->reloid = relid;
! 			tab->wraparound = wraparound;
! 
! 			tables[ntables++] = tab;
! 		}
  	}
  
  	heap_endscan(relScan);
***************
*** 2185,2196 **** do_autovacuum(void)
  										  ALLOCSET_DEFAULT_MINSIZE,
  										  ALLOCSET_DEFAULT_MAXSIZE);
  
  	/*
  	 * Perform operations on collected tables.
  	 */
! 	foreach(cell, table_oids)
  	{
- 		Oid			relid = lfirst_oid(cell);
  		autovac_table *tab;
  		bool		skipit;
  		int			stdVacuumCostDelay;
--- 2250,2264 ----
  										  ALLOCSET_DEFAULT_MINSIZE,
  										  ALLOCSET_DEFAULT_MAXSIZE);
  
+ 	/* sort our task list */
+ 	qsort(tables, ntables, sizeof(vacuum_table *),
+ 		  vacpriority_comparator);
+ 
  	/*
  	 * Perform operations on collected tables.
  	 */
! 	for (i = 0; i < ntables; i++)
  	{
  		autovac_table *tab;
  		bool		skipit;
  		int			stdVacuumCostDelay;
***************
*** 2224,2230 **** do_autovacuum(void)
  			if (worker->wi_dboid != MyDatabaseId)
  				continue;
  
! 			if (worker->wi_tableoid == relid)
  			{
  				skipit = true;
  				break;
--- 2292,2298 ----
  			if (worker->wi_dboid != MyDatabaseId)
  				continue;
  
! 			if (worker->wi_tableoid == tables[i]->reloid)
  			{
  				skipit = true;
  				break;
***************
*** 2248,2254 **** do_autovacuum(void)
  		 * the race condition is not closed but it is very small.
  		 */
  		MemoryContextSwitchTo(AutovacMemCxt);
! 		tab = table_recheck_autovac(relid, table_toast_map, pg_class_desc);
  		if (tab == NULL)
  		{
  			/* someone else vacuumed the table, or it went away */
--- 2316,2323 ----
  		 * the race condition is not closed but it is very small.
  		 */
  		MemoryContextSwitchTo(AutovacMemCxt);
! 		tab = table_recheck_autovac(tables[i]->reloid, table_toast_map,
! 									pg_class_desc);
  		if (tab == NULL)
  		{
  			/* someone else vacuumed the table, or it went away */
***************
*** 2260,2266 **** do_autovacuum(void)
  		 * Ok, good to go.	Store the table in shared memory before releasing
  		 * the lock so that other workers don't vacuum it concurrently.
  		 */
! 		MyWorkerInfo->wi_tableoid = relid;
  		LWLockRelease(AutovacuumScheduleLock);
  
  		/*
--- 2329,2335 ----
  		 * Ok, good to go.	Store the table in shared memory before releasing
  		 * the lock so that other workers don't vacuum it concurrently.
  		 */
! 		MyWorkerInfo->wi_tableoid = tables[i]->reloid;
  		LWLockRelease(AutovacuumScheduleLock);
  
  		/*
***************
*** 2361,2366 **** deleted:
--- 2430,2436 ----
  		if (tab->at_relname != NULL)
  			pfree(tab->at_relname);
  		pfree(tab);
+ 		pfree(tables[i]);
  
  		/*
  		 * Remove my info from shared memory.  We could, but intentionally
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to