Hannu Krosing wrote:
> On Wed, 2009-09-16 at 21:23 +0300, Heikki Linnakangas wrote:
>> 2) Another utility that does something like UPDATE ... WHERE ctid > ? to
>> move tuples to lower pages. It will be different from current VACUUM
>> FULL in some ways. It won't require a table lock, for example, but it
>> won't be able to move update chains as nicely. But it would be trivial
>> to write one, so I think we should offer that as a contrib module.
>
> I have not checked, but I suspect pg_reorg may already be doing
> something similar http://pgfoundry.org/forum/forum.php?forum_id=1561
Hmm, AFAICT pg_reorg is much more complex, writing stuff to a temp table
and swapping relfilenodes afterwards. More like the VACUUM REWRITE
that's been discussed.
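(As a rough illustration of the quoted UPDATE ... WHERE ctid > ? idea, not
part of the patch below, and assuming the tid type has comparison operators
in your server version, something like
UPDATE foo SET id = id WHERE ctid >= '(1000,1)';
would rewrite the rows on pages 1000 and above; the cutoff '(1000,1)' is
just a made-up block number. Without the heap_update() change in this
patch, though, the new row versions tend to land on the same page as the
old ones, so an UPDATE like that doesn't necessarily move anything on its
own.)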
For kicks, I looked at what it would take to write a utility like
that. It turns out to be quite trivial, patch attached. It uses the same
principle as VACUUM FULL, scans from the end, moving tuples to
lower-numbered pages until it can't do it anymore. It requires a small
change to heap_update(), to override the preference to store the new
tuple on the same page as the old one, but other than that, it's all in
the external module.
To test:
-- Create and populate test table
CREATE TABLE foo (id int4 PRIMARY KEY);
INSERT INTO foo SELECT a FROM generate_series(1,100000) a;
-- Delete a lot of tuples from the beginning. This creates the hole that
-- we want to compact out.
DELETE FROM foo WHERE id < 90000;
-- Vacuum to remove the dead tuples
VACUUM VERBOSE foo;
-- Run the utility to "move" the tuples
SELECT vacuumfull('foo');
-- Vacuum the table again to remove the old tuple versions of the moved
-- rows and truncate the file.
VACUUM VERBOSE foo;
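As an optional sanity check (not part of the original recipe), the on-disk
size can be compared before the DELETE and after the final VACUUM:
-- Should shrink to a small fraction of the original size after the last VACUUM
SELECT pg_size_pretty(pg_relation_size('foo'));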
--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com
diff --git a/contrib/Makefile b/contrib/Makefile
index 0afa149..59c9279 100644
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -40,6 +40,7 @@ SUBDIRS = \
test_parser \
tsearch2 \
unaccent \
+ vacuumfull \
vacuumlo
ifeq ($(with_openssl),yes)
diff --git a/contrib/vacuumfull/Makefile b/contrib/vacuumfull/Makefile
new file mode 100644
index 0000000..925d2c4
--- /dev/null
+++ b/contrib/vacuumfull/Makefile
@@ -0,0 +1,24 @@
+#-------------------------------------------------------------------------
+#
+# vacuumfull Makefile
+#
+# $PostgreSQL$
+#
+#-------------------------------------------------------------------------
+
+MODULE_big = vacuumfull
+OBJS = vacuumfull.o
+DATA_built = vacuumfull.sql
+DATA = uninstall_vacuumfull.sql
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/vacuumfull
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
diff --git a/contrib/vacuumfull/uninstall_vacuumfull.sql b/contrib/vacuumfull/uninstall_vacuumfull.sql
new file mode 100644
index 0000000..9ecab84
--- /dev/null
+++ b/contrib/vacuumfull/uninstall_vacuumfull.sql
@@ -0,0 +1,6 @@
+/* $PostgreSQL$ */
+
+-- Adjust this setting to control where the objects get dropped.
+SET search_path = public;
+
+DROP FUNCTION vacuumfull(regclass);
diff --git a/contrib/vacuumfull/vacuumfull.c b/contrib/vacuumfull/vacuumfull.c
new file mode 100644
index 0000000..07139ba
--- /dev/null
+++ b/contrib/vacuumfull/vacuumfull.c
@@ -0,0 +1,286 @@
+/*-------------------------------------------------------------------------
+ *
+ * vacuumfull.c
+ * A utility to replace the old VACUUM FULL
+ *
+ * XXX
+ *
+ * Copyright (c) 2007-2009, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "access/xact.h"
+#include "executor/executor.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "storage/procarray.h"
+#include "utils/acl.h"
+#include "utils/tqual.h"
+#include "utils/inval.h"
+#include "utils/memutils.h"
+
+PG_MODULE_MAGIC;
+
+Datum vacuumfull(PG_FUNCTION_ARGS);
+
+
+/*----------------------------------------------------------------------
+ * ExecContext:
+ *
+ * As these variables always appear together, we put them into one struct
+ * and pull initialization and cleanup into separate routines.
+ * In this module the struct is initialized just once in vacuumfull()
+ * and then reused for every tuple that is moved, so that the executor
+ * index-entry-making machinery is not set up and torn down again for
+ * each individual tuple.
+ */
+typedef struct ExecContextData
+{
+ ResultRelInfo *resultRelInfo;
+ EState *estate;
+ TupleTableSlot *slot;
+} ExecContextData;
+
+typedef ExecContextData *ExecContext;
+
+static void
+ExecContext_Init(ExecContext ec, Relation rel)
+{
+ TupleDesc tupdesc = RelationGetDescr(rel);
+
+ /*
+ * We need a ResultRelInfo and an EState so we can use the regular
+ * executor's index-entry-making machinery.
+ */
+ ec->estate = CreateExecutorState();
+
+ ec->resultRelInfo = makeNode(ResultRelInfo);
+ ec->resultRelInfo->ri_RangeTableIndex = 1; /* dummy */
+ ec->resultRelInfo->ri_RelationDesc = rel;
+ ec->resultRelInfo->ri_TrigDesc = NULL; /* we don't fire triggers */
+
+ ExecOpenIndices(ec->resultRelInfo);
+
+ ec->estate->es_result_relations = ec->resultRelInfo;
+ ec->estate->es_num_result_relations = 1;
+ ec->estate->es_result_relation_info = ec->resultRelInfo;
+
+ /* Set up a tuple slot too */
+ ec->slot = MakeSingleTupleTableSlot(tupdesc);
+}
+
+static void
+ExecContext_Finish(ExecContext ec)
+{
+ ExecDropSingleTupleTableSlot(ec->slot);
+ ExecCloseIndices(ec->resultRelInfo);
+ FreeExecutorState(ec->estate);
+}
+
+/*
+ * End of ExecContext Implementation
+ *----------------------------------------------------------------------
+ */
+
+
+/*
+ * vacuumfull
+ */
+PG_FUNCTION_INFO_V1(vacuumfull);
+
+Datum
+vacuumfull(PG_FUNCTION_ARGS)
+{
+ Oid relid = PG_GETARG_OID(0);
+ Relation rel;
+ BlockNumber blk;
+ BlockNumber numblks;
+ BlockNumber minblk;
+ Buffer buf = InvalidBuffer;
+ TransactionId oldestxmin;
+ bool found_unmovable = false;
+ ExecContextData ec;
+ int tuplesmoved = 0;
+ BlockNumber pagescleared = 0;
+
+ rel = heap_open(relid, AccessShareLock);
+
+ /*
+ * Check permissions.
+ *
+ * We allow the user to vacuum a table if he is superuser, the table
+ * owner, or the database owner (but in the latter case, only if it's not
+ * a shared relation). pg_class_ownercheck includes the superuser case.
+ */
+ if (!(pg_class_ownercheck(RelationGetRelid(rel), GetUserId()) ||
+ (pg_database_ownercheck(MyDatabaseId, GetUserId()) && !rel->rd_rel->relisshared)))
+ {
+ aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS,
+ RelationGetRelationName(rel));
+
+ }
+
+ /* Check that this relation has storage */
+ if (rel->rd_rel->relkind == RELKIND_VIEW)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("\"%s\" is a view",
+ RelationGetRelationName(rel))));
+
+ /*
+ * Reject attempts to read non-local temporary relations; we would be
+ * likely to get wrong data since we have no visibility into the owning
+ * session's local buffers.
+ */
+ if (RELATION_IS_OTHER_TEMP(rel))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot access temporary tables of other sessions")));
+
+ oldestxmin = GetOldestXmin(false, true);
+
+ ExecContext_Init(&ec, rel);
+
+ /*
+ * Read the relation backwards from the end, updating tuples. Hopefully
+ * the new tuple versions will be stored on lower-numbered pages.
+ * Stop when we reach the last page where we stored a new tuple version
+ * (minblk).
+ */
+ numblks = RelationGetNumberOfBlocks(rel);
+ minblk = 0;
+ for (blk = numblks ? numblks - 1 : 0;
+ blk > minblk && !found_unmovable;
+ blk--)
+ {
+ Page page;
+ OffsetNumber off;
+ OffsetNumber maxoff;
+
+ buf = ReleaseAndReadBuffer(buf, rel, blk);
+
+ page = BufferGetPage(buf);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ for (off = FirstOffsetNumber; off <= maxoff && !found_unmovable; off++)
+ {
+ ItemId itemid;
+ HeapTupleData oldtuple;
+ bool move = false;
+
+ LockBuffer(buf, BUFFER_LOCK_SHARE);
+
+ itemid = PageGetItemId(page, off);
+ if (ItemIdIsNormal(itemid))
+ {
+ ItemPointerSet(&oldtuple.t_self, blk, off);
+ oldtuple.t_len = ItemIdGetLength(itemid);
+ oldtuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
+
+ switch (HeapTupleSatisfiesVacuum(oldtuple.t_data, oldestxmin, buf))
+ {
+ case HEAPTUPLE_LIVE:
+ /* let's move it to a lower-numbered page */
+ move = true;
+ break;
+ case HEAPTUPLE_DEAD:
+ /* dead tuples will go away with a regular vacuum */
+ break;
+ case HEAPTUPLE_RECENTLY_DEAD:
+ /* can't remove this yet, but it will eventually be removable
+ * by regular VACUUM
+ */
+ break;
+ case HEAPTUPLE_INSERT_IN_PROGRESS:
+ /* we could try to update this once the inserter commits,
+ * but we would have to wait for it. */
+ found_unmovable = true;
+ break;
+ case HEAPTUPLE_DELETE_IN_PROGRESS:
+ /* assuming the deleter commits, this will become removable
+ * by regular VACUUM.
+ */
+ break;
+ }
+ }
+
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+ if (move)
+ {
+ HeapTuple newtuple = heap_copytuple(&oldtuple);
+ ItemPointerData errctid;
+ TransactionId update_xmax;
+ HTSU_Result result;
+
+ result = heap_update(rel, &oldtuple.t_self, newtuple, &errctid, &update_xmax,
+ GetCurrentCommandId(true), NULL, true, true);
+
+ if (result == HeapTupleMayBeUpdated)
+ {
+ BlockNumber newblk;
+
+ tuplesmoved++;
+
+ CacheInvalidateHeapTuple(rel, &oldtuple);
+
+ /* Create index entries for the moved tuple */
+ if (ec.resultRelInfo->ri_NumIndices > 0)
+ {
+ ExecStoreTuple(newtuple, ec.slot, InvalidBuffer, false);
+ ExecInsertIndexTuples(ec.slot, &(newtuple->t_self), ec.estate, true);
+ ResetPerTupleExprContext(ec.estate);
+ }
+
+ newblk = ItemPointerGetBlockNumber(&newtuple->t_self);
+
+ elog(DEBUG4, "moved tuple %u/%u to %u/%u", blk, off, newblk,
+ ItemPointerGetOffsetNumber(&newtuple->t_self));
+
+ if (newblk >= blk)
+ {
+ /*
+ * Oops, the update moved the tuple higher in the
+ * relation, not lower as we wanted. Let's stop now
+ * before we do any more of that.
+ */
+ elog(WARNING, "moved tuple %u/%u to a higher page, stopping",
+ blk, off);
+ found_unmovable = true;
+ }
+
+ if (newblk > minblk)
+ minblk = newblk;
+ }
+ else
+ {
+ elog(DEBUG1, "failed to move tuple %u/%u to a lower page, stopping",
+ blk, off);
+ found_unmovable = true;
+ }
+ }
+ }
+ if (!found_unmovable)
+ pagescleared++;
+ }
+
+ elog(NOTICE, "%d tuples moved; there are now %u pages at the end of the relation with only dead tuples. Please wait for any in-progress transactions to finish and run VACUUM to reclaim the space",
+ tuplesmoved, pagescleared);
+
+ if (BufferIsValid(buf))
+ ReleaseBuffer(buf);
+
+ ExecContext_Finish(&ec);
+
+ heap_close(rel, AccessShareLock);
+
+ PG_RETURN_VOID();
+}
+
diff --git a/contrib/vacuumfull/vacuumfull.sql.in b/contrib/vacuumfull/vacuumfull.sql.in
new file mode 100644
index 0000000..6017cc9
--- /dev/null
+++ b/contrib/vacuumfull/vacuumfull.sql.in
@@ -0,0 +1,12 @@
+/* $PostgreSQL$ */
+
+-- Adjust this setting to control where the objects get created.
+SET search_path = public;
+
+--
+-- vacuumfull()
+--
+CREATE OR REPLACE FUNCTION vacuumfull(regclass)
+RETURNS void
+AS '$libdir/vacuumfull', 'vacuumfull'
+LANGUAGE C STRICT;
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index b0a911e..56c42cf 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -2349,12 +2349,17 @@ simple_heap_delete(Relation relation, ItemPointer tid)
* cmax/cmin if successful)
* crosscheck - if not InvalidSnapshot, also check old tuple against this
* wait - true if should wait for any conflicting update to commit/abort
+ * forcefsm - use FSM even if there's space on the same page.
*
* Normal, successful return value is HeapTupleMayBeUpdated, which
* actually means we *did* update it. Failure return codes are
* HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
* (the last only possible if wait == false).
*
+ * If 'forcefsm' is TRUE, and the new tuple couldn't be stored on a page
+ * with lower block number than the old page, the tuple is not updated and
+ * HeapTupleNoSpace is returned.
+ *
* On success, the header fields of *newtup are updated to match the new
* stored tuple; in particular, newtup->t_self is set to the TID where the
* new tuple was inserted, and its HEAP_ONLY_TUPLE flag is set iff a HOT
@@ -2369,7 +2374,7 @@ simple_heap_delete(Relation relation, ItemPointer tid)
HTSU_Result
heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
ItemPointer ctid, TransactionId *update_xmax,
- CommandId cid, Snapshot crosscheck, bool wait)
+ CommandId cid, Snapshot crosscheck, bool wait, bool forcefsm)
{
HTSU_Result result;
TransactionId xid = GetCurrentTransactionId();
@@ -2599,7 +2604,7 @@ l2:
newtupsize = MAXALIGN(newtup->t_len);
- if (need_toast || newtupsize > pagefree)
+ if (need_toast || newtupsize > pagefree || forcefsm)
{
/* Clear obsolete visibility flags ... */
oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
@@ -2649,11 +2654,34 @@ l2:
* while not holding the lock on the old page, and we must rely on it
* to get the locks on both pages in the correct order.
*/
- if (newtupsize > pagefree)
+ if (newtupsize > pagefree || forcefsm)
{
/* Assume there's no chance to put heaptup on same page. */
newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
buffer, 0, NULL);
+ if (forcefsm && BufferGetBlockNumber(newbuf) >= ItemPointerGetBlockNumber(otid))
+ {
+ /*
+ * We couldn't put the new tuple version on any page before
+ * the old version. The purpose of 'forcefsm' is to update
+ * tuples so that they are stored on lower-numbered pages,
+ * to allow the heap to be truncated later, so there's no
+ * point in continuing if the new page is higher than the old
+ * one.
+ */
+
+ if (newbuf != buffer)
+ UnlockReleaseBuffer(newbuf);
+
+ /* undo the xmax change. */
+ HeapTupleHeaderSetXmax(oldtup.t_data, InvalidTransactionId);
+
+ UnlockReleaseBuffer(buffer);
+ if (have_tuple_lock)
+ UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock);
+ bms_free(hot_attrs);
+ return HeapTupleNoSpace;
+ }
}
else
{
@@ -2977,7 +3005,7 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
result = heap_update(relation, otid, tup,
&update_ctid, &update_xmax,
GetCurrentCommandId(true), InvalidSnapshot,
- true /* wait for commit */ );
+ true /* wait for commit */, false);
switch (result)
{
case HeapTupleSelfUpdated:
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index 55df5d0..7118dbb 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -2076,7 +2076,7 @@ lreplace:;
&update_ctid, &update_xmax,
estate->es_output_cid,
estate->es_crosscheck_snapshot,
- true /* wait for commit */ );
+ true /* wait for commit */, false);
switch (result)
{
case HeapTupleSelfUpdated:
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index f8395fe..f88a029 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -102,7 +102,7 @@ extern HTSU_Result heap_delete(Relation relation, ItemPointer tid,
extern HTSU_Result heap_update(Relation relation, ItemPointer otid,
HeapTuple newtup,
ItemPointer ctid, TransactionId *update_xmax,
- CommandId cid, Snapshot crosscheck, bool wait);
+ CommandId cid, Snapshot crosscheck, bool wait, bool forcefsm);
extern HTSU_Result heap_lock_tuple(Relation relation, HeapTuple tuple,
Buffer *buffer, ItemPointer ctid,
TransactionId *update_xmax, CommandId cid,
diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h
index df79e3a..1aab53b 100644
--- a/src/include/utils/snapshot.h
+++ b/src/include/utils/snapshot.h
@@ -72,7 +72,8 @@ typedef enum
HeapTupleInvisible,
HeapTupleSelfUpdated,
HeapTupleUpdated,
- HeapTupleBeingUpdated
+ HeapTupleBeingUpdated,
+ HeapTupleNoSpace /* only used by heap_update(), with 'forcefsm' */
} HTSU_Result;
#endif /* SNAPSHOT_H */