From 2032eb6a4cfa4b3edd1435280628d6e69fe8395f Mon Sep 17 00:00:00 2001
From: amitlan <amitlangote09@gmail.com>
Date: Sat, 23 Jan 2021 16:35:21 +0900
Subject: [PATCH v1] Allow batching of inserts during cross-partition updates

Currently, the insert that runs internally as part of
cross-partition update of partitioned tables can't use batching
because ExecInitRoutingInfo() which initializes the insert target
partition allows batching only if the query's original operation is
CMD_INSERT.  This commit loosens that restriction to allow the
internal inserts of cross-partition use batching too, essentially
reverting a check that 927f453a94106 put in place.

There are two places that need fixing to correctly handle batching
in such cases:

* postgresGetForeignModifyBatchSize() must look at the appropriate
PgFdwModifyState, that is, one that belongs to the insert, not to the
original update operation, because only the former would be set up to
contain the information necessary to perform batching.

* ExecModifyTable(), when inserting any remaining batched tuples,
must look at the correct set of ResultRelInfos that would've been
used by such inserts, because failing to do so would result in those
tuples not actually getting inserted.  To fix, ExecModifyTable() is
now made to get the ResultRelInfos from the PartitionTupleRouting
data structure which contains the ResultRelInfo that would be used by
those internal inserts. To allow nodeModifyTable.c look inside
PartitionTupleRouting, its definition is now exposed whereas it was
previously local to execPartition.c.
---
 contrib/postgres_fdw/postgres_fdw.c    | 11 ++--
 src/backend/executor/execPartition.c   | 72 +-------------------------
 src/backend/executor/nodeModifyTable.c | 50 +++++++++++++-----
 src/include/executor/execPartition.h   | 72 +++++++++++++++++++++++++-
 4 files changed, 112 insertions(+), 93 deletions(-)

diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c
index 35b48575c5..53472cf73c 100644
--- a/contrib/postgres_fdw/postgres_fdw.c
+++ b/contrib/postgres_fdw/postgres_fdw.c
@@ -1934,18 +1934,17 @@ static int
 postgresGetForeignModifyBatchSize(ResultRelInfo *resultRelInfo)
 {
 	int	batch_size;
-	PgFdwModifyState *fmstate = resultRelInfo->ri_FdwState ?
-							(PgFdwModifyState *) resultRelInfo->ri_FdwState :
-							NULL;
+	PgFdwModifyState *fmstate = (PgFdwModifyState *) resultRelInfo->ri_FdwState;
 
 	/* should be called only once */
 	Assert(resultRelInfo->ri_BatchSize == 0);
 
 	/*
-	 * Should never get called when the insert is being performed as part of
-	 * a row movement operation.
+	 * Use the auxiliary state if any; see postgresBeginForeignInsert() for
+	 * details on what it represents.
 	 */
-	Assert(fmstate == NULL || fmstate->aux_fmstate == NULL);
+	if (fmstate && fmstate->aux_fmstate != NULL)
+		fmstate = fmstate->aux_fmstate;
 
 	/*
 	 * In EXPLAIN without ANALYZE, ri_fdwstate is NULL, so we have to lookup
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index b8da4c5967..811997b4c8 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -35,75 +35,6 @@
 #include "utils/ruleutils.h"
 
 
-/*-----------------------
- * PartitionTupleRouting - Encapsulates all information required to
- * route a tuple inserted into a partitioned table to one of its leaf
- * partitions.
- *
- * partition_root
- *		The partitioned table that's the target of the command.
- *
- * partition_dispatch_info
- *		Array of 'max_dispatch' elements containing a pointer to a
- *		PartitionDispatch object for every partitioned table touched by tuple
- *		routing.  The entry for the target partitioned table is *always*
- *		present in the 0th element of this array.  See comment for
- *		PartitionDispatchData->indexes for details on how this array is
- *		indexed.
- *
- * nonleaf_partitions
- *		Array of 'max_dispatch' elements containing pointers to fake
- *		ResultRelInfo objects for nonleaf partitions, useful for checking
- *		the partition constraint.
- *
- * num_dispatch
- *		The current number of items stored in the 'partition_dispatch_info'
- *		array.  Also serves as the index of the next free array element for
- *		new PartitionDispatch objects that need to be stored.
- *
- * max_dispatch
- *		The current allocated size of the 'partition_dispatch_info' array.
- *
- * partitions
- *		Array of 'max_partitions' elements containing a pointer to a
- *		ResultRelInfo for every leaf partitions touched by tuple routing.
- *		Some of these are pointers to ResultRelInfos which are borrowed out of
- *		'subplan_resultrel_htab'.  The remainder have been built especially
- *		for tuple routing.  See comment for PartitionDispatchData->indexes for
- *		details on how this array is indexed.
- *
- * num_partitions
- *		The current number of items stored in the 'partitions' array.  Also
- *		serves as the index of the next free array element for new
- *		ResultRelInfo objects that need to be stored.
- *
- * max_partitions
- *		The current allocated size of the 'partitions' array.
- *
- * subplan_resultrel_htab
- *		Hash table to store subplan ResultRelInfos by Oid.  This is used to
- *		cache ResultRelInfos from subplans of an UPDATE ModifyTable node;
- *		NULL in other cases.  Some of these may be useful for tuple routing
- *		to save having to build duplicates.
- *
- * memcxt
- *		Memory context used to allocate subsidiary structs.
- *-----------------------
- */
-struct PartitionTupleRouting
-{
-	Relation	partition_root;
-	PartitionDispatch *partition_dispatch_info;
-	ResultRelInfo **nonleaf_partitions;
-	int			num_dispatch;
-	int			max_dispatch;
-	ResultRelInfo **partitions;
-	int			num_partitions;
-	int			max_partitions;
-	HTAB	   *subplan_resultrel_htab;
-	MemoryContext memcxt;
-};
-
 /*-----------------------
  * PartitionDispatch - information about one partitioned table in a partition
  * hierarchy required to route a tuple to any of its partitions.  A
@@ -1000,8 +931,7 @@ ExecInitRoutingInfo(ModifyTableState *mtstate,
 	 *
 	 * If the FDW does not support batching, we set the batch size to 1.
 	 */
-	if (mtstate->operation == CMD_INSERT &&
-		partRelInfo->ri_FdwRoutine != NULL &&
+	if (partRelInfo->ri_FdwRoutine != NULL &&
 		partRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize &&
 		partRelInfo->ri_FdwRoutine->ExecForeignBatchInsert)
 		partRelInfo->ri_BatchSize =
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 2993ba43e3..5f92854c94 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -2059,8 +2059,6 @@ ExecModifyTable(PlanState *pstate)
 	HeapTupleData oldtupdata;
 	HeapTuple	oldtuple;
 	PartitionTupleRouting *proute = node->mt_partition_tuple_routing;
-	List				  *relinfos = NIL;
-	ListCell			  *lc;
 
 	CHECK_FOR_INTERRUPTS();
 
@@ -2277,22 +2275,46 @@ ExecModifyTable(PlanState *pstate)
 	}
 
 	/*
-	 * Insert remaining tuples for batch insert.
+	 * Insert any remaining batched tuples.
+	 *
+	 * If the query's main target relation is a partitioned table, any inserts
+	 * would have been performed using tuple routing, so use the appropriate
+	 * set of target relations.  Note that this also covers any inserts
+	 * performed during cross-partition UPDATEs that would have occurred
+	 * through tuple routing.
 	 */
 	if (proute)
-		relinfos = estate->es_tuple_routing_result_relations;
-	else
-		relinfos = estate->es_opened_result_relations;
+	{
+		int			i;
 
-	foreach(lc, relinfos)
+		Assert(node->rootResultRelInfo->ri_RelationDesc->rd_rel->relkind ==
+			   RELKIND_PARTITIONED_TABLE);
+		for (i = 0; i < proute->num_partitions; i++)
+		{
+			resultRelInfo = proute->partitions[i];
+			if (resultRelInfo && resultRelInfo->ri_NumSlots > 0)
+				ExecBatchInsert(node, resultRelInfo,
+								resultRelInfo->ri_Slots,
+								resultRelInfo->ri_PlanSlots,
+								resultRelInfo->ri_NumSlots,
+								estate, node->canSetTag);
+		}
+	}
+	else
 	{
-		resultRelInfo = lfirst(lc);
-		if (resultRelInfo->ri_NumSlots > 0)
-			ExecBatchInsert(node, resultRelInfo,
-						   resultRelInfo->ri_Slots,
-						   resultRelInfo->ri_PlanSlots,
-						   resultRelInfo->ri_NumSlots,
-						   estate, node->canSetTag);
+		ListCell *lc;
+
+		/* Otherwise, use result relations from the range table. */
+		foreach(lc, estate->es_opened_result_relations)
+		{
+			resultRelInfo = lfirst(lc);
+			if (resultRelInfo && resultRelInfo->ri_NumSlots > 0)
+				ExecBatchInsert(node, resultRelInfo,
+								resultRelInfo->ri_Slots,
+								resultRelInfo->ri_PlanSlots,
+								resultRelInfo->ri_NumSlots,
+								estate, node->canSetTag);
+		}
 	}
 
 	/*
diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h
index d30ffde7d9..846261ef88 100644
--- a/src/include/executor/execPartition.h
+++ b/src/include/executor/execPartition.h
@@ -18,9 +18,77 @@
 #include "nodes/plannodes.h"
 #include "partitioning/partprune.h"
 
-/* See execPartition.c for the definitions. */
+/* See execPartition.c for the definition. */
 typedef struct PartitionDispatchData *PartitionDispatch;
-typedef struct PartitionTupleRouting PartitionTupleRouting;
+
+/*-----------------------
+ * PartitionTupleRouting - Encapsulates all information required to
+ * route a tuple inserted into a partitioned table to one of its leaf
+ * partitions.
+ *
+ * partition_root
+ *		The partitioned table that's the target of the command.
+ *
+ * partition_dispatch_info
+ *		Array of 'max_dispatch' elements containing a pointer to a
+ *		PartitionDispatch object for every partitioned table touched by tuple
+ *		routing.  The entry for the target partitioned table is *always*
+ *		present in the 0th element of this array.  See comment for
+ *		PartitionDispatchData->indexes for details on how this array is
+ *		indexed.
+ *
+ * nonleaf_partitions
+ *		Array of 'max_dispatch' elements containing pointers to fake
+ *		ResultRelInfo objects for nonleaf partitions, useful for checking
+ *		the partition constraint.
+ *
+ * num_dispatch
+ *		The current number of items stored in the 'partition_dispatch_info'
+ *		array.  Also serves as the index of the next free array element for
+ *		new PartitionDispatch objects that need to be stored.
+ *
+ * max_dispatch
+ *		The current allocated size of the 'partition_dispatch_info' array.
+ *
+ * partitions
+ *		Array of 'max_partitions' elements containing a pointer to a
+ *		ResultRelInfo for every leaf partitions touched by tuple routing.
+ *		Some of these are pointers to ResultRelInfos which are borrowed out of
+ *		'subplan_resultrel_htab'.  The remainder have been built especially
+ *		for tuple routing.  See comment for PartitionDispatchData->indexes for
+ *		details on how this array is indexed.
+ *
+ * num_partitions
+ *		The current number of items stored in the 'partitions' array.  Also
+ *		serves as the index of the next free array element for new
+ *		ResultRelInfo objects that need to be stored.
+ *
+ * max_partitions
+ *		The current allocated size of the 'partitions' array.
+ *
+ * subplan_resultrel_htab
+ *		Hash table to store subplan ResultRelInfos by Oid.  This is used to
+ *		cache ResultRelInfos from subplans of an UPDATE ModifyTable node;
+ *		NULL in other cases.  Some of these may be useful for tuple routing
+ *		to save having to build duplicates.
+ *
+ * memcxt
+ *		Memory context used to allocate subsidiary structs.
+ *-----------------------
+ */
+typedef struct PartitionTupleRouting
+{
+	Relation	partition_root;
+	PartitionDispatch *partition_dispatch_info;
+	ResultRelInfo **nonleaf_partitions;
+	int			num_dispatch;
+	int			max_dispatch;
+	ResultRelInfo **partitions;
+	int			num_partitions;
+	int			max_partitions;
+	HTAB	   *subplan_resultrel_htab;
+	MemoryContext memcxt;
+} PartitionTupleRouting;
 
 /*
  * PartitionedRelPruningData - Per-partitioned-table data for run-time pruning
-- 
2.24.1

