From d90ea2d0178e1bd6e4e0c3c92ecc932c92f423b3 Mon Sep 17 00:00:00 2001
From: amitlan <amitlangote09@gmail.com>
Date: Tue, 16 Mar 2021 14:03:05 +0900
Subject: [PATCH v4] Allow batching of inserts during cross-partition updates

Currently, the insert that runs internally as part of
cross-partition update of partitioned tables can't use batching
because ExecInitRoutingInfo(), which initializes the insert target
partition, allows batching only if the query's original operation is
CMD_INSERT.  This commit loosens that restriction to allow the
internal inserts of cross-partition use batching too, essentially
reverting a check that 927f453a94106 put in place.

There are two places that need fixing to correctly handle batching
in such cases:

* postgresGetForeignModifyBatchSize() must look at the appropriate
PgFdwModifyState, that is, one that belongs to the insert, not to the
original update operation, because only the former would be set up to
contain the information necessary to perform batching.

* ExecModifyTable(), when inserting any remaining batched tuples,
must look at the correct set of ResultRelInfos that would've been
used by such inserts, because failing to do so would result in those
tuples not actually getting inserted.  To fix, ExecModifyTable() is
now made to get the ResultRelInfos from the PartitionTupleRouting
data structure which contains the ResultRelInfo that would be used by
those internal inserts. To allow nodeModifyTable.c look inside
PartitionTupleRouting, its definition is now exposed whereas it was
previously local to execPartition.c.

This also adjusts the test case added by 927f453a94106 to confirm
that the insert performed when moving the rows into the foreign
partition indeed uses batched mode.
---
 .../postgres_fdw/expected/postgres_fdw.out    | 60 +++++++++++++---
 contrib/postgres_fdw/postgres_fdw.c           | 11 ++-
 contrib/postgres_fdw/sql/postgres_fdw.sql     | 33 +++++++--
 src/backend/executor/execPartition.c          | 72 +------------------
 src/backend/executor/nodeModifyTable.c        | 51 +++++++++----
 src/include/executor/execPartition.h          | 72 ++++++++++++++++++-
 6 files changed, 191 insertions(+), 108 deletions(-)

diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out
index 59e4e27ffb..30712f6b26 100644
--- a/contrib/postgres_fdw/expected/postgres_fdw.out
+++ b/contrib/postgres_fdw/expected/postgres_fdw.out
@@ -9402,8 +9402,8 @@ SELECT COUNT(*) FROM batch_table;
     66
 (1 row)
 
--- Check that enabling batched inserts doesn't interfere with cross-partition
--- updates
+-- Check that batched inserts also works for inserts made during
+-- cross-partition updates
 CREATE TABLE batch_cp_upd_test (a int) PARTITION BY LIST (a);
 CREATE TABLE batch_cp_upd_test1 (LIKE batch_cp_upd_test);
 CREATE FOREIGN TABLE batch_cp_upd_test1_f
@@ -9411,18 +9411,60 @@ CREATE FOREIGN TABLE batch_cp_upd_test1_f
 	FOR VALUES IN (1)
 	SERVER loopback
 	OPTIONS (table_name 'batch_cp_upd_test1', batch_size '10');
-CREATE TABLE batch_cp_up_test1 PARTITION OF batch_cp_upd_test
+CREATE TABLE batch_cp_upd_test2 PARTITION OF batch_cp_upd_test
 	FOR VALUES IN (2);
-INSERT INTO batch_cp_upd_test VALUES (1), (2);
--- The following moves a row from the local partition to the foreign one
-UPDATE batch_cp_upd_test t SET a = 1 FROM (VALUES (1), (2)) s(a) WHERE t.a = s.a;
-ERROR:  cannot route tuples into foreign table to be updated "batch_cp_upd_test1_f"
+CREATE TABLE batch_cp_upd_test3 (LIKE batch_cp_upd_test);
+CREATE FOREIGN TABLE batch_cp_upd_test3_f
+	PARTITION OF batch_cp_upd_test
+	FOR VALUES IN (3)
+	SERVER loopback
+	OPTIONS (table_name 'batch_cp_upd_test3', batch_size '1');
+INSERT INTO batch_cp_upd_test VALUES (2), (2), (2);
+-- The following moves rows from the local partition to the foreign one that
+-- has insert batching enabled
+UPDATE batch_cp_upd_test t SET a = 1 FROM (VALUES (1), (2)) s(a) WHERE t.a = s.a AND s.a = 2;
 SELECT tableoid::regclass, * FROM batch_cp_upd_test;
        tableoid       | a 
 ----------------------+---
  batch_cp_upd_test1_f | 1
- batch_cp_up_test1    | 2
-(2 rows)
+ batch_cp_upd_test1_f | 1
+ batch_cp_upd_test1_f | 1
+(3 rows)
+
+-- cmin should be the same in all the rows, because they would've been
+-- inserted by one command
+SELECT cmin, * FROM batch_cp_upd_test1;
+ cmin | a 
+------+---
+    0 | 1
+    0 | 1
+    0 | 1
+(3 rows)
+
+INSERT INTO batch_cp_upd_test VALUES (2), (2), (2);
+-- This update moves rows from the local partition to the foreign one that
+-- has insert batching disabled
+UPDATE batch_cp_upd_test t SET a = 3 FROM (VALUES (1), (2)) s(a) WHERE t.a = s.a AND s.a = 2;
+SELECT tableoid::regclass, * FROM batch_cp_upd_test;
+       tableoid       | a 
+----------------------+---
+ batch_cp_upd_test1_f | 1
+ batch_cp_upd_test1_f | 1
+ batch_cp_upd_test1_f | 1
+ batch_cp_upd_test3_f | 3
+ batch_cp_upd_test3_f | 3
+ batch_cp_upd_test3_f | 3
+(6 rows)
+
+-- cmin shoud be different across rows, because each one would be inserted
+-- by its own command
+SELECT cmin, * FROM batch_cp_upd_test3;
+ cmin | a 
+------+---
+    0 | 3
+    1 | 3
+    2 | 3
+(3 rows)
 
 -- Clean up
 DROP TABLE batch_table, batch_cp_upd_test CASCADE;
diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c
index 16c2979f2d..eeb52ca4d9 100644
--- a/contrib/postgres_fdw/postgres_fdw.c
+++ b/contrib/postgres_fdw/postgres_fdw.c
@@ -1959,18 +1959,17 @@ static int
 postgresGetForeignModifyBatchSize(ResultRelInfo *resultRelInfo)
 {
 	int	batch_size;
-	PgFdwModifyState *fmstate = resultRelInfo->ri_FdwState ?
-							(PgFdwModifyState *) resultRelInfo->ri_FdwState :
-							NULL;
+	PgFdwModifyState *fmstate = (PgFdwModifyState *) resultRelInfo->ri_FdwState;
 
 	/* should be called only once */
 	Assert(resultRelInfo->ri_BatchSize == 0);
 
 	/*
-	 * Should never get called when the insert is being performed as part of
-	 * a row movement operation.
+	 * Use the auxiliary state if any; see postgresBeginForeignInsert() for
+	 * details on what it represents.
 	 */
-	Assert(fmstate == NULL || fmstate->aux_fmstate == NULL);
+	if (fmstate && fmstate->aux_fmstate != NULL)
+		fmstate = fmstate->aux_fmstate;
 
 	/*
 	 * In EXPLAIN without ANALYZE, ri_fdwstate is NULL, so we have to lookup
diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql
index 107d1c0e03..22362ec52c 100644
--- a/contrib/postgres_fdw/sql/postgres_fdw.sql
+++ b/contrib/postgres_fdw/sql/postgres_fdw.sql
@@ -2924,8 +2924,8 @@ CREATE TABLE batch_table_p2
 INSERT INTO batch_table SELECT * FROM generate_series(1, 66) i;
 SELECT COUNT(*) FROM batch_table;
 
--- Check that enabling batched inserts doesn't interfere with cross-partition
--- updates
+-- Check that batched inserts also works for inserts made during
+-- cross-partition updates
 CREATE TABLE batch_cp_upd_test (a int) PARTITION BY LIST (a);
 CREATE TABLE batch_cp_upd_test1 (LIKE batch_cp_upd_test);
 CREATE FOREIGN TABLE batch_cp_upd_test1_f
@@ -2933,13 +2933,34 @@ CREATE FOREIGN TABLE batch_cp_upd_test1_f
 	FOR VALUES IN (1)
 	SERVER loopback
 	OPTIONS (table_name 'batch_cp_upd_test1', batch_size '10');
-CREATE TABLE batch_cp_up_test1 PARTITION OF batch_cp_upd_test
+CREATE TABLE batch_cp_upd_test2 PARTITION OF batch_cp_upd_test
 	FOR VALUES IN (2);
-INSERT INTO batch_cp_upd_test VALUES (1), (2);
+CREATE TABLE batch_cp_upd_test3 (LIKE batch_cp_upd_test);
+CREATE FOREIGN TABLE batch_cp_upd_test3_f
+	PARTITION OF batch_cp_upd_test
+	FOR VALUES IN (3)
+	SERVER loopback
+	OPTIONS (table_name 'batch_cp_upd_test3', batch_size '1');
+INSERT INTO batch_cp_upd_test VALUES (2), (2), (2);
+
+-- The following moves rows from the local partition to the foreign one that
+-- has insert batching enabled
+UPDATE batch_cp_upd_test t SET a = 1 FROM (VALUES (1), (2)) s(a) WHERE t.a = s.a AND s.a = 2;
+SELECT tableoid::regclass, * FROM batch_cp_upd_test;
+
+-- cmin should be the same in all the rows, because they would've been
+-- inserted by one command
+SELECT cmin, * FROM batch_cp_upd_test1;
+
+INSERT INTO batch_cp_upd_test VALUES (2), (2), (2);
 
--- The following moves a row from the local partition to the foreign one
-UPDATE batch_cp_upd_test t SET a = 1 FROM (VALUES (1), (2)) s(a) WHERE t.a = s.a;
+-- This update moves rows from the local partition to the foreign one that
+-- has insert batching disabled
+UPDATE batch_cp_upd_test t SET a = 3 FROM (VALUES (1), (2)) s(a) WHERE t.a = s.a AND s.a = 2;
 SELECT tableoid::regclass, * FROM batch_cp_upd_test;
+-- cmin shoud be different across rows, because each one would be inserted
+-- by its own command
+SELECT cmin, * FROM batch_cp_upd_test3;
 
 -- Clean up
 DROP TABLE batch_table, batch_cp_upd_test CASCADE;
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index 558060e080..75a01ba7e0 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -35,75 +35,6 @@
 #include "utils/ruleutils.h"
 
 
-/*-----------------------
- * PartitionTupleRouting - Encapsulates all information required to
- * route a tuple inserted into a partitioned table to one of its leaf
- * partitions.
- *
- * partition_root
- *		The partitioned table that's the target of the command.
- *
- * partition_dispatch_info
- *		Array of 'max_dispatch' elements containing a pointer to a
- *		PartitionDispatch object for every partitioned table touched by tuple
- *		routing.  The entry for the target partitioned table is *always*
- *		present in the 0th element of this array.  See comment for
- *		PartitionDispatchData->indexes for details on how this array is
- *		indexed.
- *
- * nonleaf_partitions
- *		Array of 'max_dispatch' elements containing pointers to fake
- *		ResultRelInfo objects for nonleaf partitions, useful for checking
- *		the partition constraint.
- *
- * num_dispatch
- *		The current number of items stored in the 'partition_dispatch_info'
- *		array.  Also serves as the index of the next free array element for
- *		new PartitionDispatch objects that need to be stored.
- *
- * max_dispatch
- *		The current allocated size of the 'partition_dispatch_info' array.
- *
- * partitions
- *		Array of 'max_partitions' elements containing a pointer to a
- *		ResultRelInfo for every leaf partitions touched by tuple routing.
- *		Some of these are pointers to ResultRelInfos which are borrowed out of
- *		'subplan_resultrel_htab'.  The remainder have been built especially
- *		for tuple routing.  See comment for PartitionDispatchData->indexes for
- *		details on how this array is indexed.
- *
- * num_partitions
- *		The current number of items stored in the 'partitions' array.  Also
- *		serves as the index of the next free array element for new
- *		ResultRelInfo objects that need to be stored.
- *
- * max_partitions
- *		The current allocated size of the 'partitions' array.
- *
- * subplan_resultrel_htab
- *		Hash table to store subplan ResultRelInfos by Oid.  This is used to
- *		cache ResultRelInfos from targets of an UPDATE ModifyTable node;
- *		NULL in other cases.  Some of these may be useful for tuple routing
- *		to save having to build duplicates.
- *
- * memcxt
- *		Memory context used to allocate subsidiary structs.
- *-----------------------
- */
-struct PartitionTupleRouting
-{
-	Relation	partition_root;
-	PartitionDispatch *partition_dispatch_info;
-	ResultRelInfo **nonleaf_partitions;
-	int			num_dispatch;
-	int			max_dispatch;
-	ResultRelInfo **partitions;
-	int			num_partitions;
-	int			max_partitions;
-	HTAB	   *subplan_resultrel_htab;
-	MemoryContext memcxt;
-};
-
 /*-----------------------
  * PartitionDispatch - information about one partitioned table in a partition
  * hierarchy required to route a tuple to any of its partitions.  A
@@ -1001,8 +932,7 @@ ExecInitRoutingInfo(ModifyTableState *mtstate,
 	 *
 	 * If the FDW does not support batching, we set the batch size to 1.
 	 */
-	if (mtstate->operation == CMD_INSERT &&
-		partRelInfo->ri_FdwRoutine != NULL &&
+	if (partRelInfo->ri_FdwRoutine != NULL &&
 		partRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize &&
 		partRelInfo->ri_FdwRoutine->ExecForeignBatchInsert)
 		partRelInfo->ri_BatchSize =
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index bf65785e64..0460e2704a 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -2158,8 +2158,6 @@ ExecModifyTable(PlanState *pstate)
 	HeapTupleData oldtupdata;
 	HeapTuple	oldtuple;
 	PartitionTupleRouting *proute = node->mt_partition_tuple_routing;
-	List	   *relinfos = NIL;
-	ListCell   *lc;
 
 	CHECK_FOR_INTERRUPTS();
 
@@ -2438,22 +2436,47 @@ ExecModifyTable(PlanState *pstate)
 	}
 
 	/*
-	 * Insert remaining tuples for batch insert.
+	 * Insert any remaining batched tuples.
+	 *
+	 * If the query's main target relation is a partitioned table, any inserts
+	 * would have been performed using tuple routing, so use the appropriate
+	 * set of target relations.  Note that this also covers any inserts
+	 * performed during cross-partition UPDATEs that would have occurred
+	 * through tuple routing.
 	 */
 	if (proute)
-		relinfos = estate->es_tuple_routing_result_relations;
-	else
-		relinfos = estate->es_opened_result_relations;
+	{
+		int			i;
 
-	foreach(lc, relinfos)
+		Assert(node->rootResultRelInfo);
+		Assert(node->rootResultRelInfo->ri_RelationDesc->rd_rel->relkind ==
+			   RELKIND_PARTITIONED_TABLE);
+		for (i = 0; i < proute->num_partitions; i++)
+		{
+			resultRelInfo = proute->partitions[i];
+			if (resultRelInfo && resultRelInfo->ri_NumSlots > 0)
+				ExecBatchInsert(node, resultRelInfo,
+								resultRelInfo->ri_Slots,
+								resultRelInfo->ri_PlanSlots,
+								resultRelInfo->ri_NumSlots,
+								estate, node->canSetTag);
+		}
+	}
+	else
 	{
-		resultRelInfo = lfirst(lc);
-		if (resultRelInfo->ri_NumSlots > 0)
-			ExecBatchInsert(node, resultRelInfo,
-						   resultRelInfo->ri_Slots,
-						   resultRelInfo->ri_PlanSlots,
-						   resultRelInfo->ri_NumSlots,
-						   estate, node->canSetTag);
+		ListCell *lc;
+
+		/* Otherwise, use result relations from the range table. */
+		foreach(lc, estate->es_opened_result_relations)
+		{
+			resultRelInfo = lfirst(lc);
+			if (resultRelInfo && resultRelInfo->ri_NumSlots > 0)
+				ExecBatchInsert(node, resultRelInfo,
+								resultRelInfo->ri_Slots,
+								resultRelInfo->ri_PlanSlots,
+								resultRelInfo->ri_NumSlots,
+								estate, node->canSetTag);
+		}
 	}
 
 	/*
diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h
index d30ffde7d9..846261ef88 100644
--- a/src/include/executor/execPartition.h
+++ b/src/include/executor/execPartition.h
@@ -18,9 +18,77 @@
 #include "nodes/plannodes.h"
 #include "partitioning/partprune.h"
 
-/* See execPartition.c for the definitions. */
+/* See execPartition.c for the definition. */
 typedef struct PartitionDispatchData *PartitionDispatch;
-typedef struct PartitionTupleRouting PartitionTupleRouting;
+
+/*-----------------------
+ * PartitionTupleRouting - Encapsulates all information required to
+ * route a tuple inserted into a partitioned table to one of its leaf
+ * partitions.
+ *
+ * partition_root
+ *		The partitioned table that's the target of the command.
+ *
+ * partition_dispatch_info
+ *		Array of 'max_dispatch' elements containing a pointer to a
+ *		PartitionDispatch object for every partitioned table touched by tuple
+ *		routing.  The entry for the target partitioned table is *always*
+ *		present in the 0th element of this array.  See comment for
+ *		PartitionDispatchData->indexes for details on how this array is
+ *		indexed.
+ *
+ * nonleaf_partitions
+ *		Array of 'max_dispatch' elements containing pointers to fake
+ *		ResultRelInfo objects for nonleaf partitions, useful for checking
+ *		the partition constraint.
+ *
+ * num_dispatch
+ *		The current number of items stored in the 'partition_dispatch_info'
+ *		array.  Also serves as the index of the next free array element for
+ *		new PartitionDispatch objects that need to be stored.
+ *
+ * max_dispatch
+ *		The current allocated size of the 'partition_dispatch_info' array.
+ *
+ * partitions
+ *		Array of 'max_partitions' elements containing a pointer to a
+ *		ResultRelInfo for every leaf partitions touched by tuple routing.
+ *		Some of these are pointers to ResultRelInfos which are borrowed out of
+ *		'subplan_resultrel_htab'.  The remainder have been built especially
+ *		for tuple routing.  See comment for PartitionDispatchData->indexes for
+ *		details on how this array is indexed.
+ *
+ * num_partitions
+ *		The current number of items stored in the 'partitions' array.  Also
+ *		serves as the index of the next free array element for new
+ *		ResultRelInfo objects that need to be stored.
+ *
+ * max_partitions
+ *		The current allocated size of the 'partitions' array.
+ *
+ * subplan_resultrel_htab
+ *		Hash table to store subplan ResultRelInfos by Oid.  This is used to
+ *		cache ResultRelInfos from subplans of an UPDATE ModifyTable node;
+ *		NULL in other cases.  Some of these may be useful for tuple routing
+ *		to save having to build duplicates.
+ *
+ * memcxt
+ *		Memory context used to allocate subsidiary structs.
+ *-----------------------
+ */
+typedef struct PartitionTupleRouting
+{
+	Relation	partition_root;
+	PartitionDispatch *partition_dispatch_info;
+	ResultRelInfo **nonleaf_partitions;
+	int			num_dispatch;
+	int			max_dispatch;
+	ResultRelInfo **partitions;
+	int			num_partitions;
+	int			max_partitions;
+	HTAB	   *subplan_resultrel_htab;
+	MemoryContext memcxt;
+} PartitionTupleRouting;
 
 /*
  * PartitionedRelPruningData - Per-partitioned-table data for run-time pruning
-- 
2.24.1

