Hi!

I've always been a little wary of using the TRUNCATE command due to
the warning in the documentation about it not being "MVCC-safe":
queries may silently give wrong results and it's hard to tell when
they are affected.

That got me thinking, why can't we handle this like a standby server
does -- if some query has data removed from underneath it, it aborts
with a serialization failure.

Does this solution sound like a good idea?

The attached patch is a lame attempt at implementing this. I added a
new pg_class.relvalidxmin attribute which tracks the Xid of the last
TRUNCATE (maybe it should be called reltruncatexid?). Whenever
starting a relation scan with a snapshot older than relvalidxmin, an
error is thrown. This seems to work out well since TRUNCATE updates
pg_class anyway, and anyone who sees the new relfilenode automatically
knows when it was truncated.

Am I on the right track? Are there any better ways to attach this
information to a relation?
Should I also add another counter to pg_stat_database_conflicts?
Currently this table is only used on standby servers.

Since I wrote it just this afternoon, there are a few things still
wrong with the patch (it doesn't handle xid wraparound for one), so
don't be too picky about the code yet. :)

Example:
  CREATE TABLE foo (i int);
Session A:
  BEGIN ISOLATION LEVEL REPEATABLE READ;
  SELECT txid_current(); -- Force snapshot open
Session B:
  TRUNCATE TABLE foo;
Session A:
  SELECT * FROM foo;
ERROR:  canceling statement due to conflict with TRUNCATE TABLE on foo
DETAIL:  Rows visible to this transaction have been removed.


Patch also available in my github 'truncate' branch:
https://github.com/intgr/postgres/commits/truncate

Regards,
Marti
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
new file mode 100644
index 99a431a..e909b17
*** a/src/backend/access/heap/heapam.c
--- b/src/backend/access/heap/heapam.c
*************** heap_beginscan_internal(Relation relatio
*** 1175,1180 ****
--- 1175,1196 ----
  	HeapScanDesc scan;
  
  	/*
+ 	 * If the snapshot is older than relvalidxmin then that means TRUNCATE has
+ 	 * removed data that we should still see. Abort rather than giving
+ 	 * potentially bogus results.
+ 	 */
+ 	if (relation->rd_rel->relvalidxmin != InvalidTransactionId &&
+ 		snapshot->xmax != InvalidTransactionId &&
+ 		NormalTransactionIdPrecedes(snapshot->xmax, relation->rd_rel->relvalidxmin))
+ 	{
+ 		ereport(ERROR,
+ 				(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ 				 errmsg("canceling statement due to conflict with TRUNCATE TABLE on %s",
+ 						 NameStr(relation->rd_rel->relname)),
+ 				 errdetail("Rows visible to this transaction have been removed.")));
+ 	}
+ 
+ 	/*
  	 * increment relation ref count while scanning relation
  	 *
  	 * This is just to make really sure the relcache entry won't go away while
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
new file mode 100644
index aef410a..3f9bd5d
*** a/src/backend/catalog/heap.c
--- b/src/backend/catalog/heap.c
*************** InsertPgClassTuple(Relation pg_class_des
*** 787,792 ****
--- 787,793 ----
  	values[Anum_pg_class_relhastriggers - 1] = BoolGetDatum(rd_rel->relhastriggers);
  	values[Anum_pg_class_relhassubclass - 1] = BoolGetDatum(rd_rel->relhassubclass);
  	values[Anum_pg_class_relfrozenxid - 1] = TransactionIdGetDatum(rd_rel->relfrozenxid);
+ 	values[Anum_pg_class_relvalidxmin - 1] = TransactionIdGetDatum(rd_rel->relvalidxmin);
  	if (relacl != (Datum) 0)
  		values[Anum_pg_class_relacl - 1] = relacl;
  	else
*************** AddNewRelationTuple(Relation pg_class_de
*** 884,889 ****
--- 885,891 ----
  		new_rel_reltup->relfrozenxid = InvalidTransactionId;
  	}
  
+ 	new_rel_reltup->relvalidxmin = InvalidTransactionId;
  	new_rel_reltup->relowner = relowner;
  	new_rel_reltup->reltype = new_type_oid;
  	new_rel_reltup->reloftype = reloftype;
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
new file mode 100644
index bfbe642..0d96a6a
*** a/src/backend/catalog/index.c
--- b/src/backend/catalog/index.c
*************** reindex_index(Oid indexId, bool skip_con
*** 2854,2860 ****
  		}
  
  		/* We'll build a new physical relation for the index */
! 		RelationSetNewRelfilenode(iRel, InvalidTransactionId);
  
  		/* Initialize the index and rebuild */
  		/* Note: we do not need to re-establish pkey setting */
--- 2854,2860 ----
  		}
  
  		/* We'll build a new physical relation for the index */
! 		RelationSetNewRelfilenode(iRel, InvalidTransactionId, InvalidTransactionId);
  
  		/* Initialize the index and rebuild */
  		/* Note: we do not need to re-establish pkey setting */
diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c
new file mode 100644
index 1f95abc..ab4aec2
*** a/src/backend/commands/sequence.c
--- b/src/backend/commands/sequence.c
*************** ResetSequence(Oid seq_relid)
*** 287,293 ****
  	 * Create a new storage file for the sequence.	We want to keep the
  	 * sequence's relfrozenxid at 0, since it won't contain any unfrozen XIDs.
  	 */
! 	RelationSetNewRelfilenode(seq_rel, InvalidTransactionId);
  
  	/*
  	 * Insert the modified tuple into the new storage file.
--- 287,293 ----
  	 * Create a new storage file for the sequence.	We want to keep the
  	 * sequence's relfrozenxid at 0, since it won't contain any unfrozen XIDs.
  	 */
! 	RelationSetNewRelfilenode(seq_rel, InvalidTransactionId, InvalidTransactionId);
  
  	/*
  	 * Insert the modified tuple into the new storage file.
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
new file mode 100644
index 07dc326..993dc41
*** a/src/backend/commands/tablecmds.c
--- b/src/backend/commands/tablecmds.c
***************
*** 19,24 ****
--- 19,25 ----
  #include "access/reloptions.h"
  #include "access/relscan.h"
  #include "access/sysattr.h"
+ #include "access/transam.h"
  #include "access/xact.h"
  #include "catalog/catalog.h"
  #include "catalog/dependency.h"
*************** ExecuteTruncate(TruncateStmt *stmt)
*** 1118,1124 ****
  			 * as the relfilenode value. The old storage file is scheduled for
  			 * deletion at commit.
  			 */
! 			RelationSetNewRelfilenode(rel, RecentXmin);
  
  			heap_relid = RelationGetRelid(rel);
  			toast_relid = rel->rd_rel->reltoastrelid;
--- 1119,1125 ----
  			 * as the relfilenode value. The old storage file is scheduled for
  			 * deletion at commit.
  			 */
! 			RelationSetNewRelfilenode(rel, RecentXmin, GetCurrentTransactionId());
  
  			heap_relid = RelationGetRelid(rel);
  			toast_relid = rel->rd_rel->reltoastrelid;
*************** ExecuteTruncate(TruncateStmt *stmt)
*** 1129,1135 ****
  			if (OidIsValid(toast_relid))
  			{
  				rel = relation_open(toast_relid, AccessExclusiveLock);
! 				RelationSetNewRelfilenode(rel, RecentXmin);
  				heap_close(rel, NoLock);
  			}
  
--- 1130,1141 ----
  			if (OidIsValid(toast_relid))
  			{
  				rel = relation_open(toast_relid, AccessExclusiveLock);
! 				RelationSetNewRelfilenode(rel, RecentXmin, InvalidTransactionId);
! 
! 				/*
! 				 * We don't need a cmin here because there can't be any open
! 				 * cursors in the same session as the TRUNCATE command.
! 				 */
  				heap_close(rel, NoLock);
  			}
  
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
new file mode 100644
index a59950e..17303e8
*** a/src/backend/utils/cache/relcache.c
--- b/src/backend/utils/cache/relcache.c
*************** RelationBuildLocalRelation(const char *r
*** 2605,2611 ****
   * the XIDs that will be put into the new relation contents.
   */
  void
! RelationSetNewRelfilenode(Relation relation, TransactionId freezeXid)
  {
  	Oid			newrelfilenode;
  	RelFileNodeBackend newrnode;
--- 2605,2612 ----
   * the XIDs that will be put into the new relation contents.
   */
  void
! RelationSetNewRelfilenode(Relation relation, TransactionId freezeXid,
! 						  TransactionId validxmin)
  {
  	Oid			newrelfilenode;
  	RelFileNodeBackend newrnode;
*************** RelationSetNewRelfilenode(Relation relat
*** 2674,2679 ****
--- 2675,2684 ----
  	}
  	classform->relfrozenxid = freezeXid;
  
+ 	if (validxmin != InvalidTransactionId &&
+ 		TransactionIdPrecedes(classform->relvalidxmin, validxmin))
+ 		classform->relvalidxmin = validxmin;
+ 
  	simple_heap_update(pg_class, &tuple->t_self, tuple);
  	CatalogUpdateIndexes(pg_class, tuple);
  
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
new file mode 100644
index 78c3c9e..acd2558
*** a/src/include/catalog/catversion.h
--- b/src/include/catalog/catversion.h
***************
*** 53,58 ****
   */
  
  /*							yyyymmddN */
! #define CATALOG_VERSION_NO	201202083
  
  #endif
--- 53,58 ----
   */
  
  /*							yyyymmddN */
! #define CATALOG_VERSION_NO	201202091
  
  #endif
diff --git a/src/include/catalog/pg_class.h b/src/include/catalog/pg_class.h
new file mode 100644
index 3b01bb4..8f1ed9b
*** a/src/include/catalog/pg_class.h
--- b/src/include/catalog/pg_class.h
*************** CATALOG(pg_class,1259) BKI_BOOTSTRAP BKI
*** 67,72 ****
--- 67,73 ----
  	bool		relhastriggers; /* has (or has had) any TRIGGERs */
  	bool		relhassubclass; /* has (or has had) derived classes */
  	TransactionId relfrozenxid; /* all Xids < this are frozen in this rel */
+ 	TransactionId relvalidxmin;	/* data is only valid for snapshots > this Xid */
  
  #ifdef CATALOG_VARLEN			/* variable-length fields start here */
  	/* NOTE: These fields are not present in a relcache entry's rd_rel field. */
*************** CATALOG(pg_class,1259) BKI_BOOTSTRAP BKI
*** 77,83 ****
  
  /* Size of fixed part of pg_class tuples, not counting var-length fields */
  #define CLASS_TUPLE_SIZE \
! 	 (offsetof(FormData_pg_class,relfrozenxid) + sizeof(TransactionId))
  
  /* ----------------
   *		Form_pg_class corresponds to a pointer to a tuple with
--- 78,84 ----
  
  /* Size of fixed part of pg_class tuples, not counting var-length fields */
  #define CLASS_TUPLE_SIZE \
! 	 (offsetof(FormData_pg_class,relvalidxmin) + sizeof(TransactionId))
  
  /* ----------------
   *		Form_pg_class corresponds to a pointer to a tuple with
*************** typedef FormData_pg_class *Form_pg_class
*** 91,97 ****
   * ----------------
   */
  
! #define Natts_pg_class					27
  #define Anum_pg_class_relname			1
  #define Anum_pg_class_relnamespace		2
  #define Anum_pg_class_reltype			3
--- 92,98 ----
   * ----------------
   */
  
! #define Natts_pg_class					28
  #define Anum_pg_class_relname			1
  #define Anum_pg_class_relnamespace		2
  #define Anum_pg_class_reltype			3
*************** typedef FormData_pg_class *Form_pg_class
*** 117,124 ****
  #define Anum_pg_class_relhastriggers	23
  #define Anum_pg_class_relhassubclass	24
  #define Anum_pg_class_relfrozenxid		25
! #define Anum_pg_class_relacl			26
! #define Anum_pg_class_reloptions		27
  
  /* ----------------
   *		initial contents of pg_class
--- 118,126 ----
  #define Anum_pg_class_relhastriggers	23
  #define Anum_pg_class_relhassubclass	24
  #define Anum_pg_class_relfrozenxid		25
! #define Anum_pg_class_relvalidxmin		26
! #define Anum_pg_class_relacl			27
! #define Anum_pg_class_reloptions		28
  
  /* ----------------
   *		initial contents of pg_class
*************** typedef FormData_pg_class *Form_pg_class
*** 130,142 ****
   */
  
  /* Note: "3" in the relfrozenxid column stands for FirstNormalTransactionId */
! DATA(insert OID = 1247 (  pg_type		PGNSP 71 0 PGUID 0 0 0 0 0 0 0 0 f f p r 30 0 t f f f f 3 _null_ _null_ ));
  DESCR("");
! DATA(insert OID = 1249 (  pg_attribute	PGNSP 75 0 PGUID 0 0 0 0 0 0 0 0 f f p r 21 0 f f f f f 3 _null_ _null_ ));
  DESCR("");
! DATA(insert OID = 1255 (  pg_proc		PGNSP 81 0 PGUID 0 0 0 0 0 0 0 0 f f p r 26 0 t f f f f 3 _null_ _null_ ));
  DESCR("");
! DATA(insert OID = 1259 (  pg_class		PGNSP 83 0 PGUID 0 0 0 0 0 0 0 0 f f p r 27 0 t f f f f 3 _null_ _null_ ));
  DESCR("");
  
  
--- 132,144 ----
   */
  
  /* Note: "3" in the relfrozenxid column stands for FirstNormalTransactionId */
! DATA(insert OID = 1247 (  pg_type		PGNSP 71 0 PGUID 0 0 0 0 0 0 0 0 f f p r 30 0 t f f f f 3 0 _null_ _null_ ));
  DESCR("");
! DATA(insert OID = 1249 (  pg_attribute	PGNSP 75 0 PGUID 0 0 0 0 0 0 0 0 f f p r 21 0 f f f f f 3 0 _null_ _null_ ));
  DESCR("");
! DATA(insert OID = 1255 (  pg_proc		PGNSP 81 0 PGUID 0 0 0 0 0 0 0 0 f f p r 26 0 t f f f f 3 0 _null_ _null_ ));
  DESCR("");
! DATA(insert OID = 1259 (  pg_class		PGNSP 83 0 PGUID 0 0 0 0 0 0 0 0 f f p r 28 0 t f f f f 3 0 _null_ _null_ ));
  DESCR("");
  
  
diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h
new file mode 100644
index 2f39486..60c2eb6
*** a/src/include/utils/relcache.h
--- b/src/include/utils/relcache.h
*************** extern Relation RelationBuildLocalRelati
*** 76,82 ****
   * Routine to manage assignment of new relfilenode to a relation
   */
  extern void RelationSetNewRelfilenode(Relation relation,
! 						  TransactionId freezeXid);
  
  /*
   * Routines for flushing/rebuilding relcache entries in various scenarios
--- 76,83 ----
   * Routine to manage assignment of new relfilenode to a relation
   */
  extern void RelationSetNewRelfilenode(Relation relation,
! 						  TransactionId freezeXid,
! 						  TransactionId validxmin);
  
  /*
   * Routines for flushing/rebuilding relcache entries in various scenarios
diff --git a/src/test/isolation/expected/truncate.out b/src/test/isolation/expected/truncate.out
new file mode 100644
index ...2b9f161
*** a/src/test/isolation/expected/truncate.out
--- b/src/test/isolation/expected/truncate.out
***************
*** 0 ****
--- 1,63 ----
+ Parsed test spec with 2 sessions
+ 
+ starting permutation: begin select roll trunc
+ step begin: 
+   BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+   -- Force snapshot open
+   SELECT 1 AS foo FROM txid_current();
+ 
+ foo            
+ 
+ 1              
+ step select: SELECT * FROM foo;
+ i              
+ 
+ 1              
+ step roll: ROLLBACK;
+ step trunc: TRUNCATE TABLE foo;
+ 
+ starting permutation: begin select trunc roll
+ step begin: 
+   BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+   -- Force snapshot open
+   SELECT 1 AS foo FROM txid_current();
+ 
+ foo            
+ 
+ 1              
+ step select: SELECT * FROM foo;
+ i              
+ 
+ 1              
+ step trunc: TRUNCATE TABLE foo; <waiting ...>
+ step roll: ROLLBACK;
+ step trunc: <... completed>
+ 
+ starting permutation: begin trunc select roll
+ step begin: 
+   BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+   -- Force snapshot open
+   SELECT 1 AS foo FROM txid_current();
+ 
+ foo            
+ 
+ 1              
+ step trunc: TRUNCATE TABLE foo;
+ step select: SELECT * FROM foo;
+ ERROR:  canceling statement due to conflict with TRUNCATE TABLE on foo
+ step roll: ROLLBACK;
+ 
+ starting permutation: trunc begin select roll
+ step trunc: TRUNCATE TABLE foo;
+ step begin: 
+   BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+   -- Force snapshot open
+   SELECT 1 AS foo FROM txid_current();
+ 
+ foo            
+ 
+ 1              
+ step select: SELECT * FROM foo;
+ i              
+ 
+ step roll: ROLLBACK;
diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule
new file mode 100644
index 669c0f2..bba722a
*** a/src/test/isolation/isolation_schedule
--- b/src/test/isolation/isolation_schedule
*************** test: fk-contention
*** 14,16 ****
--- 14,17 ----
  test: fk-deadlock
  test: fk-deadlock2
  test: eval-plan-qual
+ test: truncate
diff --git a/src/test/isolation/specs/truncate.spec b/src/test/isolation/specs/truncate.spec
new file mode 100644
index ...2c7b707
*** a/src/test/isolation/specs/truncate.spec
--- b/src/test/isolation/specs/truncate.spec
***************
*** 0 ****
--- 1,23 ----
+ setup
+ {
+   CREATE TABLE foo (i int);
+   INSERT INTO foo VALUES (1);
+ }
+ 
+ teardown
+ {
+   DROP TABLE foo;
+ }
+ 
+ session "s1"
+ step "begin"
+ {
+   BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+   -- Force snapshot open
+   SELECT 1 AS foo FROM txid_current();
+ }
+ step "select"	{ SELECT * FROM foo; }
+ step "roll"		{ ROLLBACK; }
+ 
+ session "s2"
+ step "trunc"	{ TRUNCATE TABLE foo; }
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to