Hannu Krosing wrote:
> On Wed, 2009-09-16 at 21:23 +0300, Heikki Linnakangas wrote:
>> 2) Another utility that does something like UPDATE ... WHERE ctid > ? to
>> move tuples to lower pages. It will be different from current VACUUM
>> FULL in some ways. It won't require a table lock, for example, but it
>> won't be able to move update chains as nicely. But it would be trivial
>> to write one, so I think we should offer that as a contrib module.
>
> I have not checked, but I suspect pg_reorg may already be doing
> something similar http://pgfoundry.org/forum/forum.php?forum_id=1561
Hmm, AFAICT pg_reorg is much more complex, writing stuff to a temp table
and swapping relfilenodes afterwards. More like the VACUUM REWRITE
that's been discussed.
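(As a rough illustration of the quoted UPDATE ... WHERE ctid > ? idea, not
part of the patch below, and assuming the tid type has comparison operators
in your server version, something like
UPDATE foo SET id = id WHERE ctid >= '(1000,1)';
would rewrite the rows on pages 1000 and above; the cutoff '(1000,1)' is
just a made-up block number. Without the heap_update() change in this
patch, though, the new row versions tend to land on the same page as the
old ones, so an UPDATE like that doesn't necessarily move anything on its
own.)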
For kicks, I looked at what it would take to write a utility like
that. It turns out to be quite trivial, patch attached. It uses the same
principle as VACUUM FULL, scans from the end, moving tuples to
lower-numbered pages until it can't do it anymore. It requires a small
change to heap_update(), to override the preference to store the new
tuple on the same page as the old one, but other than that, it's all in
the external module.
To test:
-- Create and populate test table
CREATE TABLE foo (id int4 PRIMARY KEY);
INSERT INTO foo SELECT a FROM generate_series(1,100000) a;
-- Delete a lot of tuples from the beginning. This creates the hole that
-- we want to compact out.
DELETE FROM foo WHERE id < 90000;
-- Vacuum to remove the dead tuples
VACUUM VERBOSE foo;
-- Run the utility to "move" the tuples
SELECT vacuumfull('foo');
-- Vacuum the table again to remove the old tuple versions of the moved
-- rows and truncate the file.
VACUUM VERBOSE foo;
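As an optional sanity check (not part of the original recipe), the on-disk
size can be compared before the DELETE and after the final VACUUM:
-- Should shrink to a small fraction of the original size after the last VACUUM
SELECT pg_size_pretty(pg_relation_size('foo'));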
--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com
diff --git a/contrib/Makefile b/contrib/Makefile
index 0afa149..59c9279 100644
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -40,6 +40,7 @@ SUBDIRS = \
test_parser \
tsearch2 \
unaccent \
+ vacuumfull \
vacuumlo
ifeq ($(with_openssl),yes)
diff --git a/contrib/vacuumfull/Makefile b/contrib/vacuumfull/Makefile
new file mode 100644
index 0000000..925d2c4
--- /dev/null
+++ b/contrib/vacuumfull/Makefile
@@ -0,0 +1,24 @@
+#-------------------------------------------------------------------------
+#
+# vacuumfull Makefile
+#
+# $PostgreSQL$
+#
+#-------------------------------------------------------------------------
+
+MODULE_big = vacuumfull
+OBJS = vacuumfull.o
+DATA_built = vacuumfull.sql
+DATA = uninstall_vacuumfull.sql
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/vacuumfull
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
diff --git a/contrib/vacuumfull/uninstall_vacuumfull.sql b/contrib/vacuumfull/uninstall_vacuumfull.sql
new file mode 100644
index 0000000..9ecab84
--- /dev/null
+++ b/contrib/vacuumfull/uninstall_vacuumfull.sql
@@ -0,0 +1,6 @@
+/* $PostgreSQL$ */
+
+-- Adjust this setting to control where the objects get dropped.
+SET search_path = public;
+
+DROP FUNCTION vacuumfull(regclass);
diff --git a/contrib/vacuumfull/vacuumfull.c b/contrib/vacuumfull/vacuumfull.c
new file mode 100644
index 0000000..07139ba
--- /dev/null
+++ b/contrib/vacuumfull/vacuumfull.c
@@ -0,0 +1,286 @@
+/*-------------------------------------------------------------------------
+ *
+ * vacuumfull.c
+ * A utility to replace the old VACUUM FULL
+ *
+ * XXX
+ *
+ * Copyright (c) 2007-2009, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "access/xact.h"
+#include "executor/executor.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "storage/procarray.h"
+#include "utils/acl.h"
+#include "utils/tqual.h"
+#include "utils/inval.h"
+#include "utils/memutils.h"
+
+PG_MODULE_MAGIC;
+
+Datum vacuumfull(PG_FUNCTION_ARGS);
+
+
+/*----------------------------------------------------------------------
+ * ExecContext:
+ *
+ * As these variables always appear together, we put them into one struct
+ * and pull initialization and cleanup into separate routines.
+ * In this module the struct is initialized just once in vacuumfull()
+ * and then reused for every tuple that is moved, so that the executor
+ * index-entry-making machinery is not set up and torn down again for
+ * each individual tuple.
+ */
+typedef struct ExecContextData
+{
+ ResultRelInfo *resultRelInfo;
+ EState *estate;
+ TupleTableSlot *slot;
+} ExecContextData;
+
+typedef ExecContextData *ExecContext;
+
+static void
+ExecContext_Init(ExecContext ec, Relation rel)
+{
+ TupleDesc tupdesc = RelationGetDescr(rel);
+
+ /*
+ * We need a ResultRelInfo and an EState so we can use the regular
+ * executor's index-entry-making machinery.
+ */
+ ec->estate = CreateExecutorState();
+
+ ec->resultRelInfo = makeNode(ResultRelInfo);
+ ec->resultRelInfo->ri_RangeTableIndex = 1; /* dummy */
+ ec->resultRelInfo->ri_RelationDesc = rel;
+ ec->resultRelInfo->ri_TrigDesc = NULL; /* we don't fire triggers */
+
+ ExecOpenIndices(ec->resultRelInfo);
+
+ ec->estate->es_result_relations = ec->resultRelInfo;
+ ec->estate->es_num_result_relations = 1;
+ ec->estate->es_result_relation_info = ec->resultRelInfo;
+
+ /* Set up a tuple slot too */
+ ec->slot = MakeSingleTupleTableSlot(tupdesc);
+}
+
+static void
+ExecContext_Finish(ExecContext ec)
+{
+ ExecDropSingleTupleTableSlot(ec->slot);
+ ExecCloseIndices(ec->resultRelInfo);
+ FreeExecutorState(ec->estate);
+}
+
+/*
+ * End of ExecContext Implementation
+ *----------------------------------------------------------------------
+ */
+
+
+/*
+ * vacuumfull
+ */
+PG_FUNCTION_INFO_V1(vacuumfull);
+
+Datum
+vacuumfull(PG_FUNCTION_ARGS)
+{
+ Oid relid = PG_GETARG_OID(0);
+ Relation rel;
+ BlockNumber blk;
+ BlockNumber numblks;
+ BlockNumber minblk;
+ Buffer buf = InvalidBuffer;
+ TransactionId oldestxmin;
+ bool found_unmovable = false;
+ ExecContextData ec;
+ int tuplesmoved = 0;
+ BlockNumber pagescleared = 0;
+
+ rel = heap_open(relid, AccessShareLock);
+
+ /*
+ * Check permissions.
+ *
+ * We allow the user to vacuum a table if he is superuser, the table
+ * owner, or the database owner (but in the latter case, only if it's not
+ * a shared relation). pg_class_ownercheck includes the superuser case.
+ */
+ if (!(pg_class_ownercheck(RelationGetRelid(rel), GetUserId()) ||
+ (pg_database_ownercheck(MyDatabaseId, GetUserId()) && !rel->rd_rel->relisshared)))
+ {
+ aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS,
+ RelationGetRelationName(rel));
+
+ }
+
+ /* Check that this relation has storage */
+ if (rel->rd_rel->relkind == RELKIND_VIEW)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("\"%s\" is a view",
+ RelationGetRelationName(rel))));
+
+ /*
+ * Reject attempts to read non-local temporary relations; we would be
+ * likely to get wrong data since we have no visibility into the owning
+ * session's local buffers.
+ */
+ if (RELATION_IS_OTHER_TEMP(rel))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot access temporary tables of other sessions")));
+
+ oldestxmin = GetOldestXmin(false, true);
+
+ ExecContext_Init(&ec, rel);
+
+ /*
+ * Read the relation backwards from the end, updating tuples. Hopefully
+ * the new tuple versions will be stored on lower-numbered pages.
+ * Stop when we reach the last page where we stored a new tuple version
+ * (minblk).
+ */
+ numblks = RelationGetNumberOfBlocks(rel);
+ minblk = 0;
+ for (blk = numblks ? numblks - 1 : 0;
+ blk > minblk && !found_unmovable;
+ blk--)
+ {
+ Page page;
+ OffsetNumber off;
+ OffsetNumber maxoff;
+
+ buf = ReleaseAndReadBuffer(buf, rel, blk);
+
+ page = BufferGetPage(buf);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ for (off = FirstOffsetNumber; off <= maxoff && !found_unmovable; off++)
+ {
+ ItemId itemid;
+ HeapTupleData oldtuple;
+ bool move = false;
+
+ LockBuffer(buf, BUFFER_LOCK_SHARE);
+
+ itemid = PageGetItemId(page, off);
+ if (ItemIdIsNormal(itemid))
+ {
+ ItemPointerSet(&oldtuple.t_self, blk, off);
+ oldtuple.t_len = ItemIdGetLength(itemid);
+ oldtuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
+
+ switch (HeapTupleSatisfiesVacuum(oldtuple.t_data, oldestxmin, buf))
+ {
+ case HEAPTUPLE_LIVE:
+ /* let's move it to a lower-numbered page */
+ move = true;
+ break;
+ case HEAPTUPLE_DEAD:
+ /* dead tuples will go away with a regular vacuum */
+ break;
+ case HEAPTUPLE_RECENTLY_DEAD:
+ /* can't remove this yet, but it will eventually be removable
+ * by regular VACUUM
+ */
+ break;
+ case HEAPTUPLE_INSERT_IN_PROGRESS:
+ /* we could try to update this once the inserter commits,
+ * but we would have to wait for it. */
+ found_unmovable = true;
+ break;
+ case HEAPTUPLE_DELETE_IN_PROGRESS:
+ /* assuming the deleter commits, this will become removable
+ * by regular VACUUM.
+ */
+ break;
+ }
+ }
+
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+ if (move)
+ {
+ HeapTuple newtuple = heap_copytuple(&oldtuple);
+ ItemPointerData errctid;
+ TransactionId update_xmax;
+ HTSU_Result result;
+
+ result = heap_update(rel, &oldtuple.t_self, newtuple, &errctid, &update_xmax,
+ GetCurrentCommandId(true), NULL, true, true);
+
+ if (result == HeapTupleMayBeUpdated)
+ {
+ BlockNumber newblk;
+
+ tuplesmoved++;
+
+ CacheInvalidateHeapTuple(rel, &oldtuple);
+
+ /* Create index entries for the moved tuple */
+ if (ec.resultRelInfo->ri_NumIndices > 0)
+ {
+ ExecStoreTuple(newtuple, ec.slot, InvalidBuffer, false);
+ ExecInsertIndexTuples(ec.slot, &(newtuple->t_self), ec.estate, true);
+ ResetPerTupleExprContext(ec.estate);
+ }
+
+ newblk = ItemPointerGetBlockNumber(&newtuple->t_self);
+
+ elog(DEBUG4, "moved tuple %u/%u to %u/%u", blk, off, newblk,
+ ItemPointerGetOffsetNumber(&newtuple->t_self));
+
+ if (newblk >= blk)
+ {
+ /*
+ * Oops, the update moved the tuple higher in the
+ * relation, not lower as we wanted. Let's stop now
+ * before we do any more of that.
+ */
+ elog(WARNING, "moved tuple %u/%u to a higher page, stopping",
+ blk, off);
+ found_unmovable = true;
+ }
+
+ if (newblk > minblk)
+ minblk = newblk;
+ }
+ else
+ {
+ elog(DEBUG1, "failed to move tuple %u/%u to a lower page, stopping",
+ blk, off);
+ found_unmovable = true;
+ }
+ }
+ }
+ if (!found_unmovable)
+ pagescleared++;
+ }
+
+ elog(NOTICE, "%d tuples moved; there are now %u pages at the end of the relation with only dead tuples. Please wait for any in-progress transactions to finish and run VACUUM to reclaim the space",
+ tuplesmoved, pagescleared);
+
+ if (BufferIsValid(buf))
+ ReleaseBuffer(buf);
+
+ ExecContext_Finish(&ec);
+
+ heap_close(rel, AccessShareLock);
+
+ PG_RETURN_VOID();
+}
+
diff --git a/contrib/vacuumfull/vacuumfull.sql.in b/contrib/vacuumfull/vacuumfull.sql.in
new file mode 100644
index 0000000..6017cc9
--- /dev/null
+++ b/contrib/vacuumfull/vacuumfull.sql.in
@@ -0,0 +1,12 @@
+/* $PostgreSQL$ */
+
+-- Adjust this setting to control where the objects get created.
+SET search_path = public;
+
+--
+-- vacuumfull()
+--
+CREATE OR REPLACE FUNCTION vacuumfull(regclass)
+RETURNS void
+AS '$libdir/vacuumfull', 'vacuumfull'
+LANGUAGE C STRICT;
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index b0a911e..56c42cf 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -2349,12 +2349,17 @@ simple_heap_delete(Relation relation, ItemPointer tid)
* cmax/cmin if successful)
* crosscheck - if not InvalidSnapshot, also check old tuple against this
* wait - true if should wait for any conflicting update to commit/abort
+ * forcefsm - use FSM even if there's space on the same page.
*
* Normal, successful return value is HeapTupleMayBeUpdated, which
* actually means we *did* update it. Failure return codes are
* HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
* (the last only possible if wait == false).
*
+ * If 'forcefsm' is TRUE, and the new tuple couldn't be stored on a page
+ * with lower block number than the old page, the tuple is not updated and
+ * HeapTupleNoSpace is returned.
+ *
* On success, the header fields of *newtup are updated to match the new
* stored tuple; in particular, newtup->t_self is set to the TID where the
* new tuple was inserted, and its HEAP_ONLY_TUPLE flag is set iff a HOT
@@ -2369,7 +2374,7 @@ simple_heap_delete(Relation relation, ItemPointer tid)
HTSU_Result
heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
ItemPointer ctid, TransactionId *update_xmax,
- CommandId cid, Snapshot crosscheck, bool wait)
+ CommandId cid, Snapshot crosscheck, bool wait, bool forcefsm)
{
HTSU_Result result;
TransactionId xid = GetCurrentTransactionId();
@@ -2599,7 +2604,7 @@ l2:
newtupsize = MAXALIGN(newtup->t_len);
- if (need_toast || newtupsize > pagefree)
+ if (need_toast || newtupsize > pagefree || forcefsm)
{
/* Clear obsolete visibility flags ... */
oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
@@ -2649,11 +2654,34 @@ l2:
* while not holding the lock on the old page, and we must rely on it
* to get the locks on both pages in the correct order.
*/
- if (newtupsize > pagefree)
+ if (newtupsize > pagefree || forcefsm)
{
/* Assume there's no chance to put heaptup on same page. */
newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
buffer, 0, NULL);
+ if (forcefsm && BufferGetBlockNumber(newbuf) >= ItemPointerGetBlockNumber(otid))
+ {
+ /*
+ * We couldn't put the new tuple version on any page before
+ * the old version. The purpose of 'forcefsm' is to update
+ * tuples so that they are stored on lower-numbered pages,
+ * to allow the heap to be truncated later, so there's no
+ * point in continuing if the new page is higher than the old
+ * one.
+ */
+
+ if (newbuf != buffer)
+ UnlockReleaseBuffer(newbuf);
+
+ /* undo the xmax change. */
+ HeapTupleHeaderSetXmax(oldtup.t_data, InvalidTransactionId);
+
+ UnlockReleaseBuffer(buffer);
+ if (have_tuple_lock)
+ UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock);
+ bms_free(hot_attrs);
+ return HeapTupleNoSpace;
+ }
}
else
{
@@ -2977,7 +3005,7 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
result = heap_update(relation, otid, tup,
&update_ctid, &update_xmax,
GetCurrentCommandId(true), InvalidSnapshot,
- true /* wait for commit */ );
+ true /* wait for commit */, false);
switch (result)
{
case HeapTupleSelfUpdated:
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index 55df5d0..7118dbb 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -2076,7 +2076,7 @@ lreplace:;
&update_ctid, &update_xmax,
estate->es_output_cid,
estate->es_crosscheck_snapshot,
- true /* wait for commit */ );
+ true /* wait for commit */, false);
switch (result)
{
case HeapTupleSelfUpdated:
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index f8395fe..f88a029 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -102,7 +102,7 @@ extern HTSU_Result heap_delete(Relation relation, ItemPointer tid,
extern HTSU_Result heap_update(Relation relation, ItemPointer otid,
HeapTuple newtup,
ItemPointer ctid, TransactionId *update_xmax,
- CommandId cid, Snapshot crosscheck, bool wait);
+ CommandId cid, Snapshot crosscheck, bool wait, bool forcefsm);
extern HTSU_Result heap_lock_tuple(Relation relation, HeapTuple tuple,
Buffer *buffer, ItemPointer ctid,
TransactionId *update_xmax, CommandId cid,
diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h
index df79e3a..1aab53b 100644
--- a/src/include/utils/snapshot.h
+++ b/src/include/utils/snapshot.h
@@ -72,7 +72,8 @@ typedef enum
HeapTupleInvisible,
HeapTupleSelfUpdated,
HeapTupleUpdated,
- HeapTupleBeingUpdated
+ HeapTupleBeingUpdated,
+ HeapTupleNoSpace /* only used by heap_update(), with 'forcefsm' */
} HTSU_Result;
#endif /* SNAPSHOT_H */