Index: src/backend/optimizer/path/costsize.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v
retrieving revision 1.214
diff -c -r1.214 costsize.c
*** src/backend/optimizer/path/costsize.c	5 Jan 2010 21:53:58 -0000	1.214
--- src/backend/optimizer/path/costsize.c	10 Feb 2010 17:03:51 -0000
***************
*** 78,83 ****
--- 78,84 ----
  #include "optimizer/placeholder.h"
  #include "optimizer/planmain.h"
  #include "optimizer/restrictinfo.h"
+ #include "optimizer/plancat.h"
  #include "parser/parsetree.h"
  #include "utils/lsyscache.h"
  #include "utils/selfuncs.h"
***************
*** 2623,2628 ****
--- 2624,2700 ----
  								  (void *) context);
  }
  
+ /*
+  * cost_index_scan_vs_seqscansort
+  *		Estimate the total cost of reading a table through the given index
+  *		and of reading it with a sequential scan followed by a sort.
+  *		CLUSTER needs both to decide which path is faster.
+  *
+  * The two estimates are returned in *indexScanTotCost and *seqAndSortCost.
+  */
+ void
+ cost_index_scan_vs_seqscansort(Oid tableOid, Oid indexOid,
+ 									Cost 	*indexScanTotCost,
+ 									Cost	*seqAndSortCost)
+ {
+ 	RelOptInfo 	*rel;
+ 	PlannerInfo *root;
+ 	Query 		*query;
+ 	PlannerGlobal *glob;
+ 	RangeTblEntry 	*rte;
+ 	ListCell   		*index;
+ 	IndexPath		*indexScanPath = NULL;
+ 	Path 			*seqAndSortPath;
+ 
+ 	rel = makeNode(RelOptInfo);
+ 	rel->reloptkind = RELOPT_BASEREL;
+ 	rel->relid = 1;
+ 	rel->rtekind = RTE_RELATION;
+ 
+ 	/* a PlannerGlobal and a Query are needed by get_relation_info */
+ 	glob = makeNode(PlannerGlobal);
+ 	query = makeNode(Query);
+ 	query->resultRelation = 0;
+ 
+ 	root = makeNode(PlannerInfo);
+ 	root->parse = query;
+ 	root->glob = glob;
+ 
+ 	get_relation_info(root, tableOid, false, rel);
+ 	seqAndSortPath = create_seqscan_path(NULL, rel);
+ 
+ 	rel->rows = rel->tuples;
+ 
+ 	rte = makeNode(RangeTblEntry);
+ 	rte->rtekind = RTE_RELATION;
+ 	rte->relid = tableOid;
+ 
+ 	root->simple_rel_array_size = 2;
+ 	root->simple_rte_array = (RangeTblEntry **)
+ 	palloc0(root->simple_rel_array_size * sizeof(RangeTblEntry *));
+ 	root->simple_rte_array[1] = rte;
+ 
+ 	root->total_table_pages = rel->pages;
+ 	foreach(index, rel->indexlist)
+ 	{
+ 		IndexOptInfo *indexInfo = (IndexOptInfo *) lfirst(index);
+ 		if (indexInfo->indexoid == indexOid)
+ 		{
+ 			indexScanPath = create_index_path(root, indexInfo, NULL, NULL, ForwardScanDirection, NULL);
+ 			break;
+ 		}
+ 	}
+ 
+ 	/* Fail cleanly, in every build, if the index was not in the list */
+ 	if (indexScanPath == NULL)
+ 		elog(ERROR, "index %u not found for relation %u", indexOid, tableOid);
+ 
+ 	cost_sort(seqAndSortPath, root, NULL, seqAndSortPath->total_cost, rel->tuples, rel->width, -1);
+ 
+ 	*seqAndSortCost = seqAndSortPath->total_cost;
+ 	*indexScanTotCost = indexScanPath->path.total_cost;
+ }
+ 
  
  /*
   * adjust_semi_join
Index: src/backend/utils/sort/tuplesort.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/sort/tuplesort.c,v
retrieving revision 1.94
diff -c -r1.94 tuplesort.c
*** src/backend/utils/sort/tuplesort.c	2 Jan 2010 16:57:58 -0000	1.94
--- src/backend/utils/sort/tuplesort.c	10 Feb 2010 17:03:52 -0000
***************
*** 104,109 ****
--- 104,110 ----
  #include "access/nbtree.h"
  #include "catalog/pg_amop.h"
  #include "catalog/pg_operator.h"
+ #include "catalog/index.h"
  #include "commands/tablespace.h"
  #include "miscadmin.h"
  #include "pg_trace.h"
***************
*** 115,126 ****
--- 116,129 ----
  #include "utils/rel.h"
  #include "utils/syscache.h"
  #include "utils/tuplesort.h"
+ #include "executor/executor.h"
  
  
  /* sort-type codes for sort__start probes */
  #define HEAP_SORT	0
  #define INDEX_SORT	1
  #define DATUM_SORT	2
+ #define RAWHEAP_SORT	3
  
  /* GUC variables */
  #ifdef TRACE_SORT
***************
*** 366,371 ****
--- 369,378 ----
  	int			datumTypeLen;
  	bool		datumTypeByVal;
  
+ 	/* These are specific to the rawheap subcase: */
+ 	EState 	   *estate;
+ 	IndexInfo  *indexInfo;
+ 
  	/*
  	 * Resource snapshot for time of sort start.
  	 */
***************
*** 450,455 ****
--- 457,468 ----
  static void readtup_heap(Tuplesortstate *state, SortTuple *stup,
  			 int tapenum, unsigned int len);
  static void reversedirection_heap(Tuplesortstate *state);
+ static int comparetup_rawheap(const SortTuple *a, const SortTuple *b,
+                              Tuplesortstate *state);
+ static void copytup_rawheap(Tuplesortstate *state, SortTuple *stup, void *tup);
+ static void writetup_rawheap(Tuplesortstate *state, int tapenum, SortTuple *stup);
+ static void readtup_rawheap(Tuplesortstate *state, SortTuple *stup, int tapenum,
+                             unsigned int len);
  static int comparetup_index_btree(const SortTuple *a, const SortTuple *b,
  					   Tuplesortstate *state);
  static int comparetup_index_hash(const SortTuple *a, const SortTuple *b,
***************
*** 549,554 ****
--- 562,570 ----
  
  	state->result_tape = -1;	/* flag that result tape has not been formed */
  
+ 	/* set estate to NULL, so we don't try to free it later if not used */
+ 	state->estate = NULL;
+ 
  	MemoryContextSwitchTo(oldcontext);
  
  	return state;
***************
*** 762,767 ****
--- 778,844 ----
  	return state;
  }
  
+ /*
+  * tuplesort_begin_rawheap - start a sort of HeapTuples ordered by btree indexRel
+  */
+ Tuplesortstate *
+ tuplesort_begin_rawheap(Relation indexRel,
+ 						TupleDesc tupDesc,
+                         int workMem, bool randomAccess)
+ {
+     Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
+     MemoryContext oldcontext;
+ 
+     TupleTableSlot *existing_slot;
+     ExprContext	   *econtext;
+ 
+     Assert(indexRel->rd_rel->relam == BTREE_AM_OID);
+ 
+     oldcontext = MemoryContextSwitchTo(state->sortcontext);
+ 
+ #ifdef TRACE_SORT
+     if (trace_sort)
+         elog(LOG,
+              "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",
+              RelationGetNumberOfAttributes(indexRel), workMem, randomAccess ? 't' : 'f');
+ #endif
+ 
+ 	state->nKeys = RelationGetNumberOfAttributes(indexRel);
+ 
+ 	TRACE_POSTGRESQL_SORT_START(RAWHEAP_SORT, false, state->nKeys, workMem, randomAccess);
+ 
+     state->comparetup = comparetup_rawheap;
+     state->copytup = copytup_rawheap;
+     state->writetup = writetup_rawheap;
+     state->readtup = readtup_rawheap;
+     state->reversedirection = reversedirection_heap;
+ 
+ 	state->indexInfo = BuildIndexInfo(indexRel);
+ 	state->indexScanKey = _bt_mkscankey_nodata(indexRel);
+ 	state->enforceUnique = false;
+ 
+     state->tupDesc = tupDesc;    /* assume we need not copy tupDesc */
+ 
+     if (state->indexInfo->ii_Expressions != NULL)
+     {
+     	/* expression index: set up the executor state used by FormIndexDatum */
+ 		state->estate = CreateExecutorState();
+ 
+ 		/*
+ 		 * The rawheap routines pass each tuple to FormIndexDatum through the
+ 		 * econtext's scantuple slot, so create that slot and install it here.
+ 		 */
+ 		existing_slot = MakeSingleTupleTableSlot(tupDesc);
+ 
+ 		econtext = GetPerTupleExprContext(state->estate);
+ 		econtext->ecxt_scantuple = existing_slot;
+     }
+ 
+     MemoryContextSwitchTo(oldcontext);
+ 
+     return state;
+ }
+ 
  /*
   * tuplesort_set_bound
   *
***************
*** 849,854 ****
--- 926,934 ----
  	 */
  	TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L);
  #endif
+ 	if (state->estate != NULL)
+ 		ExecDropSingleTupleTableSlot(GetPerTupleExprContext(state->estate)->ecxt_scantuple);
+ 
  
  	MemoryContextSwitchTo(oldcontext);
  
***************
*** 980,985 ****
--- 1060,1087 ----
  }
  
  /*
+ * Accept one tuple while collecting input data for sort.
+ *
+ * Note that the input data is always copied; the caller need not save it.
+ */
+ void
+ tuplesort_putrawtuple(Tuplesortstate *state, HeapTuple tup)
+ {
+ 	SortTuple	sorttup;
+ 	MemoryContext callercxt;
+ 
+ 	/* Do all the work inside the sort's own memory context */
+ 	callercxt = MemoryContextSwitchTo(state->sortcontext);
+ 
+ 	/* Copy the tuple into memory we control, decreasing availMem ... */
+ 	COPYTUP(state, &sorttup, (void *) tup);
+ 
+ 	/* ... then let the common code take over the copy */
+ 	puttuple_common(state, &sorttup);
+ 	MemoryContextSwitchTo(callercxt);
+ }
+ 
+ /*
   * Shared code for tuple and datum cases.
   */
  static void
***************
*** 1482,1487 ****
--- 1584,1609 ----
  }
  
  /*
+ * Fetch the next tuple in either forward or back direction.
+ * Returns NULL if no more tuples.    If *should_free is set, the
+ * caller must pfree the returned tuple when done with it.
+ */
+ HeapTuple
+ tuplesort_getrawtuple(Tuplesortstate *state, bool forward,
+ 					  bool *should_free)
+ {
+ 	MemoryContext callercxt = MemoryContextSwitchTo(state->sortcontext);
+ 	SortTuple	fetched;
+ 	HeapTuple	result = NULL;
+ 
+ 	/* result stays NULL when the common code reports no more tuples */
+ 	if (tuplesort_gettuple_common(state, forward, &fetched, should_free))
+ 		result = (HeapTuple) fetched.tuple;
+ 	MemoryContextSwitchTo(callercxt);
+ 	return result;
+ }
+ 
+ /*
   * tuplesort_merge_order - report merge order we'll use for given memory
   * (note: "merge order" just means the number of input tapes in the merge).
   *
***************
*** 3079,3084 ****
--- 3201,3412 ----
  }
  
  /*
+ * Routines specialized for the raw on-disk HeapTuple case.  These are only
+ * used when we need the visibility info, for things like CLUSTER.  Otherwise we
+ * use the regular *tup_heap methods, which actually manipulate MinimalTuples.
+ */
+ static int
+ comparetup_rawheap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
+ {
+ 	ScanKey		scanKey = state->indexScanKey;
+ 	HeapTuple	ltup;
+ 	HeapTuple	rtup;
+ 	TupleDesc	tupDesc;
+ 	int			nkey;
+ 	int32		compare;
+ 
+ 	/* Allow interrupting long sorts */
+ 	CHECK_FOR_INTERRUPTS();
+ 
+ 	/* Compare the leading sort key */
+ 	compare = inlineApplySortFunction(&scanKey->sk_func, scanKey->sk_flags,
+ 									  a->datum1, a->isnull1,
+ 									  b->datum1, b->isnull1);
+ 	if (compare != 0)
+ 		return compare;
+ 
+ 	/*
+ 	 * Compare additional sort keys.  Step scanKey past the leading key first,
+ 	 * so that both branches below pair key number nkey with its own comparator.
+ 	 */
+ 	ltup = (HeapTuple) a->tuple;
+ 	rtup = (HeapTuple) b->tuple;
+ 	scanKey++;
+ 
+ 	if (state->indexInfo->ii_Expressions == NULL)
+ 	{
+ 		/* if not expression index, just get the proper heap_getattr */
+ 		tupDesc = state->tupDesc;
+ 
+ 		for (nkey = 1; nkey < state->nKeys; nkey++, scanKey++)
+ 		{
+ 			Datum		datum1,
+ 						datum2;
+ 			bool		isnull1,
+ 						isnull2;
+ 
+ 			datum1 = heap_getattr(ltup, state->indexInfo->ii_KeyAttrNumbers[nkey],
+ 								  tupDesc, &isnull1);
+ 			datum2 = heap_getattr(rtup, state->indexInfo->ii_KeyAttrNumbers[nkey],
+ 								  tupDesc, &isnull2);
+ 
+ 			compare = inlineApplySortFunction(&scanKey->sk_func, scanKey->sk_flags,
+ 											  datum1, isnull1,
+ 											  datum2, isnull2);
+ 			if (compare != 0)
+ 				return compare;
+ 		}
+ 	}
+ 	else
+ 	{
+ 		/*
+ 		 * In the expression index case, we get all the values/nulls: it
+ 		 * would be faster to get only the required ones, but it would mean
+ 		 * copy&paste from FormIndexDatum.
+ 		 */
+ 		Datum		l_existing_values[INDEX_MAX_KEYS];
+ 		bool		l_existing_isnull[INDEX_MAX_KEYS];
+ 		Datum		r_existing_values[INDEX_MAX_KEYS];
+ 		bool		r_existing_isnull[INDEX_MAX_KEYS];
+ 		TupleTableSlot *ecxt_scantuple;
+ 
+ 		ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple;
+ 
+ 		ExecStoreTuple(ltup, ecxt_scantuple, InvalidBuffer, false);
+ 		FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
+ 					   l_existing_values, l_existing_isnull);
+ 
+ 		ExecStoreTuple(rtup, ecxt_scantuple, InvalidBuffer, false);
+ 		FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
+ 					   r_existing_values, r_existing_isnull);
+ 
+ 		for (nkey = 1; nkey < state->nKeys; nkey++, scanKey++)
+ 		{
+ 			compare = inlineApplySortFunction(&scanKey->sk_func,
+ 											  scanKey->sk_flags,
+ 											  l_existing_values[nkey],
+ 											  l_existing_isnull[nkey],
+ 											  r_existing_values[nkey],
+ 											  r_existing_isnull[nkey]);
+ 			if (compare != 0)
+ 				return compare;
+ 		}
+ 	}
+ 
+ 	return 0;
+ }
+ 
+ /*
+  * copytup_rawheap - copy a HeapTuple into sort memory and cache its first key
+  */
+ static void
+ copytup_rawheap(Tuplesortstate *state, SortTuple *stup, void *tup)
+ {
+     HeapTuple    tuple = (HeapTuple) tup;
+ 
+     /* copy the tuple into sort storage, and account for the space it uses */
+     stup->tuple = (void *) heap_copytuple(tuple);
+     USEMEM(state, GetMemoryChunkSpace(stup->tuple));
+     /* set up first-column key value (datum1/isnull1) */
+     if (state->indexInfo->ii_Expressions == NULL)
+     {
+     	/* no expression index, just fetch the first key column from the copy */
+ 		stup->datum1 = heap_getattr((HeapTuple) stup->tuple,
+ 									state->indexInfo->ii_KeyAttrNumbers[0],
+ 									state->tupDesc,
+ 									&stup->isnull1);
+     }
+     else
+     {
+     	/*
+     	 * Expression index: we only want the first index column, but to avoid
+     	 * copy&paste from FormIndexDatum we compute all of them (slower).
+ 		*/
+ 		Datum		existing_values[INDEX_MAX_KEYS];
+ 		bool		existing_isnull[INDEX_MAX_KEYS];
+ 		TupleTableSlot *ecxt_scantuple;
+ 
+ 		ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple;
+     	ExecStoreTuple(tuple,	ecxt_scantuple, InvalidBuffer, false);
+     	FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
+     				   existing_values, existing_isnull);
+ 
+     	stup->datum1 = existing_values[0];
+     	stup->isnull1 = existing_isnull[0];
+     }
+ }
+ 
+ /* writetup_rawheap - write a HeapTuple, header and body, to tape and free it */
+ static void
+ writetup_rawheap(Tuplesortstate *state, int tapenum, SortTuple *stup)
+ {
+     HeapTuple    tuple = (HeapTuple) stup->tuple;
+     tuple->t_len += HEAPTUPLESIZE; /* write out the header as well */
+ 	/* header first, then the tuple body (t_len less the header size) */
+ 	LogicalTapeWrite(state->tapeset, tapenum,
+ 					 tuple, HEAPTUPLESIZE);
+ 	LogicalTapeWrite(state->tapeset, tapenum, tuple->t_data, tuple->t_len-HEAPTUPLESIZE);
+     if (state->randomAccess)    /* need trailing length word? */
+         LogicalTapeWrite(state->tapeset, tapenum,
+                          tuple, sizeof(tuple->t_len));
+     FREEMEM(state, GetMemoryChunkSpace(tuple));
+     heap_freetuple(tuple);
+ }
+ 
+ /*
+  * readtup_rawheap - read back a HeapTuple of total length tuplen from tape
+  */
+ static void
+ readtup_rawheap(Tuplesortstate *state, SortTuple *stup,
+              int tapenum, unsigned int tuplen)
+ {
+ 	HeapTuple    tuple = (HeapTuple) palloc(tuplen);
+ 
+     USEMEM(state, GetMemoryChunkSpace(tuple));
+     /* tuplen covers the HEAPTUPLESIZE header plus the tuple body */
+     tuple->t_len = tuplen - HEAPTUPLESIZE;
+     if (LogicalTapeRead(state->tapeset, tapenum, &tuple->t_self, HEAPTUPLESIZE-sizeof(tuplen)) != HEAPTUPLESIZE-sizeof(tuplen))
+         elog(ERROR, "unexpected end of data");
+     tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);	/* body follows header */
+     if (LogicalTapeRead(state->tapeset, tapenum, tuple->t_data, tuple->t_len) != tuple->t_len)
+         elog(ERROR, "unexpected end of data");
+     if (state->randomAccess)    /* need trailing length word? */
+         if (LogicalTapeRead(state->tapeset, tapenum, &tuplen,
+                             sizeof(tuplen)) != sizeof(tuplen))
+             elog(ERROR, "unexpected end of data");
+ 
+     stup->tuple = tuple;
+ 
+     /* set up first-column key value (datum1/isnull1) */
+     if (state->indexInfo->ii_Expressions == NULL)
+     {
+     	/* no expression index, just get the key datum value */
+ 		stup->datum1 = heap_getattr((HeapTuple) stup->tuple,
+ 									state->indexInfo->ii_KeyAttrNumbers[0],
+ 									state->tupDesc,
+ 									&stup->isnull1);
+     }
+     else
+     {
+     	/*
+     	 * Expression index: we only want the first index column, but to avoid
+     	 * copy&paste from FormIndexDatum we compute all of them (slower).
+ 		*/
+ 		Datum		existing_values[INDEX_MAX_KEYS];
+ 		bool		existing_isnull[INDEX_MAX_KEYS];
+ 		TupleTableSlot *ecxt_scantuple;
+ 
+ 		ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple;
+     	ExecStoreTuple(tuple,	ecxt_scantuple, InvalidBuffer, false);
+     	FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
+     				   existing_values, existing_isnull);
+ 
+     	stup->datum1 = existing_values[0];
+     	stup->isnull1 = existing_isnull[0];
+     }
+ }
+ 
+ /*
   * Convenience routine to free a tuple previously loaded into sort memory
   */
  static void
Index: src/include/optimizer/cost.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/optimizer/cost.h,v
retrieving revision 1.100
diff -c -r1.100 cost.h
*** src/include/optimizer/cost.h	2 Jan 2010 16:58:07 -0000	1.100
--- src/include/optimizer/cost.h	10 Feb 2010 17:03:52 -0000
***************
*** 109,114 ****
--- 109,118 ----
  extern void cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan);
  extern void cost_qual_eval(QualCost *cost, List *quals, PlannerInfo *root);
  extern void cost_qual_eval_node(QualCost *cost, Node *qual, PlannerInfo *root);
+ extern void cost_index_scan_vs_seqscansort(Oid tableOid, Oid indexOid,
+ 											Cost 	*indexScanTotCost,
+ 											Cost	*seqAndSortCost);
+ 
  extern void set_baserel_size_estimates(PlannerInfo *root, RelOptInfo *rel);
  extern void set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel,
  						   RelOptInfo *outer_rel,
Index: src/include/utils/tuplesort.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/utils/tuplesort.h,v
retrieving revision 1.35
diff -c -r1.35 tuplesort.h
*** src/include/utils/tuplesort.h	2 Jan 2010 16:58:10 -0000	1.35
--- src/include/utils/tuplesort.h	10 Feb 2010 17:03:52 -0000
***************
*** 64,69 ****
--- 64,72 ----
  extern Tuplesortstate *tuplesort_begin_datum(Oid datumType,
  					  Oid sortOperator, bool nullsFirstFlag,
  					  int workMem, bool randomAccess);
+ extern Tuplesortstate *tuplesort_begin_rawheap(Relation indexRel,
+ 												TupleDesc tupDesc,
+ 												int workMem, bool randomAccess);
  
  extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound);
  
***************
*** 72,77 ****
--- 75,81 ----
  extern void tuplesort_putindextuple(Tuplesortstate *state, IndexTuple tuple);
  extern void tuplesort_putdatum(Tuplesortstate *state, Datum val,
  				   bool isNull);
+ extern void tuplesort_putrawtuple(Tuplesortstate *state, HeapTuple tup);
  
  extern void tuplesort_performsort(Tuplesortstate *state);
  
***************
*** 81,86 ****
--- 85,92 ----
  						bool *should_free);
  extern bool tuplesort_getdatum(Tuplesortstate *state, bool forward,
  				   Datum *val, bool *isNull);
+ extern HeapTuple tuplesort_getrawtuple(Tuplesortstate *state, bool forward,
+                                      bool *should_free);
  
  extern void tuplesort_end(Tuplesortstate *state);
  
Index: src/backend/commands/cluster.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/commands/cluster.c,v
retrieving revision 1.200
diff -c -r1.200 cluster.c
*** src/backend/commands/cluster.c	9 Feb 2010 21:43:30 -0000	1.200
--- src/backend/commands/cluster.c	10 Feb 2010 17:03:49 -0000
***************
*** 49,55 ****
  #include "utils/snapmgr.h"
  #include "utils/syscache.h"
  #include "utils/tqual.h"
! 
  
  /*
   * This struct is used to pass around the information on tables to be
--- 49,59 ----
  #include "utils/snapmgr.h"
  #include "utils/syscache.h"
  #include "utils/tqual.h"
! #include "utils/tuplesort.h"
! #include "optimizer/plancat.h"
! #include "optimizer/pathnode.h"
! #include "optimizer/cost.h"
! #include "executor/spi_priv.h"
  
  /*
   * This struct is used to pass around the information on tables to be
***************
*** 69,76 ****
  			   int freeze_min_age, int freeze_table_age,
  			   bool *pSwapToastByContent, TransactionId *pFreezeXid);
  static List *get_tables_to_cluster(MemoryContext cluster_context);
! 
! 
  
  /*---------------------------------------------------------------------------
   * This cluster code allows for clustering multiple tables at once. Because
--- 73,81 ----
  			   int freeze_min_age, int freeze_table_age,
  			   bool *pSwapToastByContent, TransactionId *pFreezeXid);
  static List *get_tables_to_cluster(MemoryContext cluster_context);
! static void deform_and_rewrite_tuple(HeapTuple tuple, TupleDesc oldTupDesc, TupleDesc newTupDesc,
! 										Datum *values, bool *isnull,
! 										bool newRelHasOids, RewriteState rwstate);
  
  /*---------------------------------------------------------------------------
   * This cluster code allows for clustering multiple tables at once. Because
***************
*** 769,774 ****
--- 774,781 ----
  	TransactionId OldestXmin;
  	TransactionId FreezeXid;
  	RewriteState rwstate;
+ 	bool 		 use_sort = false;
+ 	Tuplesortstate *tuplesort;
  
  	/*
  	 * Open the relations we need.
***************
*** 877,883 ****
  	 * tuples that still need to be copied, we scan with SnapshotAny and use
  	 * HeapTupleSatisfiesVacuum for the visibility test.
  	 */
! 	if (OldIndex != NULL)
  	{
  		heapScan = NULL;
  		indexScan = index_beginscan(OldHeap, OldIndex,
--- 884,900 ----
  	 * tuples that still need to be copied, we scan with SnapshotAny and use
  	 * HeapTupleSatisfiesVacuum for the visibility test.
  	 */
! 	if (OldIndex != NULL && (OldIndex->rd_am->amcanorder) && (OldIndex->rd_rel->relam == BTREE_AM_OID))
! 	{
! 		Cost 	indexScanTotCost,
! 				seqAndSortCost;
! 		cost_index_scan_vs_seqscansort(OIDOldHeap, OIDOldIndex,
! 													&indexScanTotCost,
! 													&seqAndSortCost);
! 		use_sort = seqAndSortCost < indexScanTotCost;
! 	}
! 
! 	if (OldIndex != NULL && !use_sort)
  	{
  		heapScan = NULL;
  		indexScan = index_beginscan(OldHeap, OldIndex,
***************
*** 886,905 ****
  	else
  	{
  		heapScan = heap_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL);
  		indexScan = NULL;
  	}
  
  	for (;;)
  	{
  		HeapTuple	tuple;
- 		HeapTuple	copiedTuple;
  		Buffer		buf;
! 		bool		isdead;
! 		int			i;
  
  		CHECK_FOR_INTERRUPTS();
  
! 		if (OldIndex != NULL)
  		{
  			tuple = index_getnext(indexScan, ForwardScanDirection);
  			if (tuple == NULL)
--- 903,926 ----
  	else
  	{
  		heapScan = heap_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL);
+ 		if (use_sort)
+ 		{
+ 			tuplesort = tuplesort_begin_rawheap(OldIndex, oldTupDesc,
+ 		                                        maintenance_work_mem, false);
+ 		}
  		indexScan = NULL;
  	}
  
  	for (;;)
  	{
+ 
  		HeapTuple	tuple;
  		Buffer		buf;
! 		bool isdead;
  
  		CHECK_FOR_INTERRUPTS();
  
! 		if (OldIndex != NULL && !use_sort)
  		{
  			tuple = index_getnext(indexScan, ForwardScanDirection);
  			if (tuple == NULL)
***************
*** 919,925 ****
  
  			buf = heapScan->rs_cbuf;
  		}
- 
  		LockBuffer(buf, BUFFER_LOCK_SHARE);
  
  		switch (HeapTupleSatisfiesVacuum(tuple->t_data, OldestXmin, buf))
--- 940,945 ----
***************
*** 978,1022 ****
  			continue;
  		}
  
! 		/*
! 		 * We cannot simply copy the tuple as-is, for several reasons:
! 		 *
! 		 * 1. We'd like to squeeze out the values of any dropped columns, both
! 		 * to save space and to ensure we have no corner-case failures. (It's
! 		 * possible for example that the new table hasn't got a TOAST table
! 		 * and so is unable to store any large values of dropped cols.)
! 		 *
! 		 * 2. The tuple might not even be legal for the new table; this is
! 		 * currently only known to happen as an after-effect of ALTER TABLE
! 		 * SET WITHOUT OIDS.
! 		 *
! 		 * So, we must reconstruct the tuple from component Datums.
! 		 */
! 		heap_deform_tuple(tuple, oldTupDesc, values, isnull);
! 
! 		/* Be sure to null out any dropped columns */
! 		for (i = 0; i < natts; i++)
  		{
! 			if (newTupDesc->attrs[i]->attisdropped)
! 				isnull[i] = true;
  		}
  
- 		copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
  
! 		/* Preserve OID, if any */
! 		if (NewHeap->rd_rel->relhasoids)
! 			HeapTupleSetOid(copiedTuple, HeapTupleGetOid(tuple));
  
! 		/* The heap rewrite module does the rest */
! 		rewrite_heap_tuple(rwstate, tuple, copiedTuple);
  
! 		heap_freetuple(copiedTuple);
  	}
  
! 	if (OldIndex != NULL)
  		index_endscan(indexScan);
  	else
  		heap_endscan(heapScan);
  
  	/* Write out any remaining tuples, and fsync if needed */
  	end_heap_rewrite(rwstate);
--- 998,1051 ----
  			continue;
  		}
  
! 		if (!use_sort)
! 		{
! 			deform_and_rewrite_tuple(tuple, oldTupDesc, newTupDesc, values, isnull,
! 										NewHeap->rd_rel->relhasoids, rwstate);
! 		}
! 		else
  		{
! 			/* pass tuple to the tuplesort */
! 			tuplesort_putrawtuple(tuplesort, tuple);
  		}
  
  
! 	}
! 
! 	if (use_sort)
! 	{
! 		tuplesort_performsort(tuplesort);
! 
! 		/* read the sorted tuples back from the tuplesort */
! 		for (;;)
! 		{
! 			HeapTuple	tuple;
! 			bool        shouldfree;
! 
! 			CHECK_FOR_INTERRUPTS();
  
! 			tuple = tuplesort_getrawtuple(tuplesort, true, &shouldfree);
! 			if (tuple == NULL)
! 				break;
  
! 			deform_and_rewrite_tuple(tuple, oldTupDesc, newTupDesc, values, isnull,
! 										NewHeap->rd_rel->relhasoids, rwstate);
! 			if (shouldfree)
! 				heap_freetuple(tuple);
! 		}
  	}
  
! 
! 	if (OldIndex != NULL && !use_sort)
! 	{
  		index_endscan(indexScan);
+ 	}
  	else
+ 	{
  		heap_endscan(heapScan);
+ 		if (use_sort)
+ 			tuplesort_end(tuplesort);
+ 	}
  
  	/* Write out any remaining tuples, and fsync if needed */
  	end_heap_rewrite(rwstate);
***************
*** 1519,1521 ****
--- 1548,1594 ----
  
  	return rvs;
  }
+ 
+ static void deform_and_rewrite_tuple(HeapTuple tuple, TupleDesc oldTupDesc, TupleDesc newTupDesc,
+ 										Datum *values, bool *isnull,
+ 										bool newRelHasOids, RewriteState rwstate)
+ {
+ 	HeapTuple	rebuilt;
+ 	int			natts = newTupDesc->natts;
+ 	int			attno;
+ 
+ 	/*
+ 	 * The old tuple is never copied verbatim; it is always rebuilt from its
+ 	 * component Datums, for these reasons:
+ 	 *
+ 	 * 1. Values of dropped columns should be squeezed out, both to save
+ 	 * space and to avoid corner-case failures.  (For example, the new table
+ 	 * might have no TOAST table and so be unable to store a large value
+ 	 * belonging to a dropped column.)
+ 	 *
+ 	 * 2. The tuple might not even be legal for the new table; the only known
+ 	 * cause of that at present is ALTER TABLE SET WITHOUT OIDS.
+ 	 */
+ 
+ 	heap_deform_tuple(tuple, oldTupDesc, values, isnull);
+ 
+ 	/* Every dropped column is forced to NULL in the rebuilt tuple */
+ 	for (attno = 0; attno < natts; attno++)
+ 	{
+ 		if (!newTupDesc->attrs[attno]->attisdropped)
+ 			continue;
+ 		isnull[attno] = true;
+ 	}
+ 
+ 	rebuilt = heap_form_tuple(newTupDesc, values, isnull);
+ 
+ 	/* Carry the OID over when the new relation has OIDs */
+ 	if (newRelHasOids)
+ 		HeapTupleSetOid(rebuilt, HeapTupleGetOid(tuple));
+ 
+ 	/* The heap rewrite module does the rest; afterwards drop our copy */
+ 	rewrite_heap_tuple(rwstate, tuple, rebuilt);
+ 
+ 	heap_freetuple(rebuilt);
+ }
+ 
